1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
41#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <memory>
99#include <optional>
100#include <set>
101#include <string>
102#include <tuple>
103#include <utility>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107using namespace slpvectorizer;
108using namespace std::placeholders;
109
110#define SV_NAME "slp-vectorizer"
111#define DEBUG_TYPE "SLP"
112
113STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
114
115DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
116 "Controls which SLP graphs should be vectorized.");
117
118static cl::opt<bool>
119 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
120 cl::desc("Run the SLP vectorization passes"));
121
122static cl::opt<bool>
123 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
124 cl::desc("Enable vectorization for wider vector utilization"));
125
126static cl::opt<int>
127 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
128 cl::desc("Only vectorize if you gain more than this "
129 "number "));
130
132 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
133 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
135
136static cl::opt<bool>
137ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
138 cl::desc("Attempt to vectorize horizontal reductions"));
139
141 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
142 cl::desc(
143 "Attempt to vectorize horizontal reductions feeding into a store"));
144
146 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
147 cl::desc("Improve the code quality by splitting alternate instructions"));
148
149static cl::opt<int>
150MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
151 cl::desc("Attempt to vectorize for this register size in bits"));
152
155 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
156
157/// Limits the size of scheduling regions in a block.
158/// It avoids long compile times for _very_ large blocks where vector
159/// instructions are spread over a wide range.
160/// This limit is way higher than needed by real-world functions.
161static cl::opt<int>
162ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
163 cl::desc("Limit the size of the SLP scheduling region per block"));
164
166 "slp-min-reg-size", cl::init(128), cl::Hidden,
167 cl::desc("Attempt to vectorize for this register size in bits"));
168
170 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
171 cl::desc("Limit the recursion depth when building a vectorizable tree"));
172
174 "slp-min-tree-size", cl::init(3), cl::Hidden,
175 cl::desc("Only vectorize small trees if they are fully vectorizable"));
176
177// The maximum depth that the look-ahead score heuristic will explore.
178// The higher this value, the higher the compilation time overhead.
180 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
181 cl::desc("The maximum look-ahead depth for operand reordering scores"));
182
183// The maximum depth that the look-ahead score heuristic will explore
184// when probing among candidates for vectorization tree roots.
185// The higher this value, the higher the compilation time overhead, but unlike
186// the similar limit for operand reordering this is used less frequently, so
187// the impact of a higher value is less noticeable.
188static cl::opt<int> RootLookAheadMaxDepth(
189 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
190 cl::desc("The maximum look-ahead depth for searching best rooting option"));
191
193 "slp-min-strided-loads", cl::init(2), cl::Hidden,
194 cl::desc("The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
196
198 "slp-max-stride", cl::init(8), cl::Hidden,
199 cl::desc("The maximum stride, considered to be profitable."));
200
201static cl::opt<bool>
202 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
203 cl::desc("Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
205
206static cl::opt<bool>
207 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
208 cl::desc("Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
210
211static cl::opt<bool>
212 ViewSLPTree("view-slp-tree", cl::Hidden,
213 cl::desc("Display the SLP trees with Graphviz"));
214
216 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
217 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
218
219/// Enables vectorization of copyable elements.
221 "slp-copyable-elements", cl::init(true), cl::Hidden,
222 cl::desc("Try to replace values with the idempotent instructions for "
223 "better vectorization."));
224
225// Limit the number of alias checks. The limit is chosen so that
226// it has no negative effect on the llvm benchmarks.
227static const unsigned AliasedCheckLimit = 10;
228
229// Limit of the number of uses for potentially transformed instructions/values,
230// used in checks to avoid compile-time explosion.
231static constexpr int UsesLimit = 64;
232
233// Another limit for the alias checks: The maximum distance between load/store
234// instructions where alias checks are done.
235// This limit is useful for very large basic blocks.
236static const unsigned MaxMemDepDistance = 160;
237
238/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
239/// regions to be handled.
240static const int MinScheduleRegionSize = 16;
241
242/// Maximum allowed number of operands in the PHI nodes.
243static const unsigned MaxPHINumOperands = 128;
244
245/// Predicate for the element types that the SLP vectorizer supports.
246///
247/// The most important thing to filter here are types which are invalid in LLVM
248/// vectors. We also filter target specific types which have absolutely no
249/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
250/// avoids spending time checking the cost model and realizing that they will
251/// be inevitably scalarized.
252static bool isValidElementType(Type *Ty) {
253 // TODO: Support ScalableVectorType.
254 if (SLPReVec && isa<FixedVectorType>(Ty))
255 Ty = Ty->getScalarType();
256 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
257 !Ty->isPPC_FP128Ty();
258}
259
260/// Returns the type of the given value/instruction \p V. If it is a store,
261/// returns the type of its value operand; for Cmp - the type of its compare
262/// operands; and for insertelement - the type of the inserted operand.
263/// Otherwise, just the type of the value is returned.
264static Type *getValueType(Value *V) {
265 if (auto *SI = dyn_cast<StoreInst>(V))
266 return SI->getValueOperand()->getType();
267 if (auto *CI = dyn_cast<CmpInst>(V))
268 return CI->getOperand(0)->getType();
269 if (auto *IE = dyn_cast<InsertElementInst>(V))
270 return IE->getOperand(1)->getType();
271 return V->getType();
272}
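// For example, for "store i32 %x, ptr %p" this returns i32 (the stored value's
// type), for "icmp eq i64 %a, %b" it returns i64 (the compared operands' type),
// and for "insertelement <4 x float> %v, float %s, i32 0" it returns float (the
// inserted scalar's type).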
273
274/// \returns the number of elements for Ty.
275static unsigned getNumElements(Type *Ty) {
277 "ScalableVectorType is not supported.");
278 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
279 return VecTy->getNumElements();
280 return 1;
281}
282
283/// \returns the vector type of ScalarTy based on vectorization factor.
284static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
285 return FixedVectorType::get(ScalarTy->getScalarType(),
286 VF * getNumElements(ScalarTy));
287}
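// For example, getWidenedType(i32, 8) is <8 x i32>. Under REVEC, where ScalarTy
// may itself be a fixed vector, getWidenedType(<4 x i8>, 2) is <8 x i8>, since
// the element count is VF * getNumElements(ScalarTy) == 2 * 4.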
288
289/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
290/// which forms a type that \p TTI splits into whole vector types during
291/// legalization.
292static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
293 Type *Ty, unsigned Sz) {
294 if (!isValidElementType(Ty))
295 return bit_ceil(Sz);
296 // Find the number of elements, which forms full vectors.
297 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
298 if (NumParts == 0 || NumParts >= Sz)
299 return bit_ceil(Sz);
300 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
301}
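// Worked example, assuming a target with 128-bit vector registers: for Ty ==
// i32 and Sz == 6, getWidenedType(i32, 6) splits into NumParts == 2 registers,
// so the result is bit_ceil(divideCeil(6, 2)) * 2 == 4 * 2 == 8, i.e. two
// whole <4 x i32> registers.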
302
303/// Returns the number of elements of the given type \p Ty, not greater than \p
304/// Sz, which forms a type that \p TTI splits into whole vector types during
305/// legalization.
306static unsigned
307getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
308 unsigned Sz) {
309 if (!isValidElementType(Ty))
310 return bit_floor(Sz);
311 // Find the number of elements, which forms full vectors.
312 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
313 if (NumParts == 0 || NumParts >= Sz)
314 return bit_floor(Sz);
315 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
316 if (RegVF > Sz)
317 return bit_floor(Sz);
318 return (Sz / RegVF) * RegVF;
319}
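// Worked example, assuming the same 128-bit registers: for Ty == i32 and
// Sz == 6, NumParts == 2 and RegVF == bit_ceil(divideCeil(6, 2)) == 4, so the
// result is (6 / 4) * 4 == 4, the largest whole-register multiple not
// exceeding Sz.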
320
321static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
322 SmallVectorImpl<int> &Mask) {
323 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
324 // But an element has a different meaning for SLP (scalar) and REVEC
325 // (vector). We need to expand Mask into masks which shufflevector can use
326 // directly.
327 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
328 for (unsigned I : seq<unsigned>(Mask.size()))
329 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
330 I * VecTyNumElements, VecTyNumElements)))
331 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
332 : Mask[I] * VecTyNumElements + J;
333 Mask.swap(NewMask);
334}
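// For example, with VecTyNumElements == 2 the scalar mask <1, 0> becomes the
// per-element mask <2, 3, 0, 1>: scalar lane 1 expands to vector lanes 2..3 and
// scalar lane 0 expands to vector lanes 0..1.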
335
336/// \returns the number of groups of shufflevectors
337/// A group has the following features:
338/// 1. All values in a group are shufflevectors.
339/// 2. The mask of each shufflevector is an extract-subvector mask.
340/// 3. Together, the masks of the shufflevectors use all elements of the source.
341/// e.g., it is 1 group (%0)
342/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
343/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
344/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
345/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
346/// it is 2 groups (%3 and %4)
347/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
348/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
349/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
350/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
351/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
352/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
353/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
354/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
355/// it is 0 groups
356/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
357/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
358/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
359/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
360static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
361 if (VL.empty())
362 return 0;
363 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
364 return 0;
365 auto *SV = cast<ShuffleVectorInst>(VL.front());
366 unsigned SVNumElements =
367 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
368 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
369 if (SVNumElements % ShuffleMaskSize != 0)
370 return 0;
371 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
372 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
373 return 0;
374 unsigned NumGroup = 0;
375 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
376 auto *SV = cast<ShuffleVectorInst>(VL[I]);
377 Value *Src = SV->getOperand(0);
378 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
379 SmallBitVector ExpectedIndex(GroupSize);
380 if (!all_of(Group, [&](Value *V) {
381 auto *SV = cast<ShuffleVectorInst>(V);
382 // From the same source.
383 if (SV->getOperand(0) != Src)
384 return false;
385 int Index;
386 if (!SV->isExtractSubvectorMask(Index))
387 return false;
388 ExpectedIndex.set(Index / ShuffleMaskSize);
389 return true;
390 }))
391 return 0;
392 if (!ExpectedIndex.all())
393 return 0;
394 ++NumGroup;
395 }
396 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
397 return NumGroup;
398}
399
400/// \returns a shufflevector mask which is used to vectorize shufflevectors
401/// e.g.,
402/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
403/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
404/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
405/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
406/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
407/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
408/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
409/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
410/// the result is
411/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
412static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
413 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
414 auto *SV = cast<ShuffleVectorInst>(VL.front());
415 unsigned SVNumElements =
416 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
417 SmallVector<int> Mask;
418 unsigned AccumulateLength = 0;
419 for (Value *V : VL) {
420 auto *SV = cast<ShuffleVectorInst>(V);
421 for (int M : SV->getShuffleMask())
422 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
423 : AccumulateLength + M);
424 AccumulateLength += SVNumElements;
425 }
426 return Mask;
427}
428
429/// \returns True if the value is a constant (but not globals/constant
430/// expressions).
431static bool isConstant(Value *V) {
432 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
433}
434
435/// Checks if \p V is one of vector-like instructions, i.e. undef,
436/// insertelement/extractelement with constant indices for fixed vector type or
437/// extractvalue instruction.
438static bool isVectorLikeInstWithConstOps(Value *V) {
439 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
440 !isa<ExtractValueInst, UndefValue>(V))
441 return false;
442 auto *I = dyn_cast<Instruction>(V);
443 if (!I || isa<ExtractValueInst>(I))
444 return true;
445 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
446 return false;
447 if (isa<ExtractElementInst>(I))
448 return isConstant(I->getOperand(1));
449 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
450 return isConstant(I->getOperand(2));
451}
452
453/// Returns power-of-2 number of elements in a single register (part), given the
454/// total number of elements \p Size and number of registers (parts) \p
455/// NumParts.
456static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
457 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
458}
459
460/// Returns correct remaining number of elements, considering total amount \p
461/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
462/// and current register (part) \p Part.
463static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
464 unsigned Part) {
465 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
466}
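// Worked example: for Size == 10 and NumParts == 3, getPartNumElems returns
// min(10, bit_ceil(divideCeil(10, 3))) == 4, and getNumElems(10, 4, Part)
// yields 4, 4 and 2 elements for parts 0, 1 and 2 respectively.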
467
468#if !defined(NDEBUG)
469/// Print a short descriptor of the instruction bundle suitable for debug output.
470static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
471 std::string Result;
472 raw_string_ostream OS(Result);
473 if (Idx >= 0)
474 OS << "Idx: " << Idx << ", ";
475 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
476 return Result;
477}
478#endif
479
480/// \returns true if all of the instructions in \p VL are in the same block or
481/// false otherwise.
482static bool allSameBlock(ArrayRef<Value *> VL) {
483 auto *It = find_if(VL, IsaPred<Instruction>);
484 if (It == VL.end())
485 return false;
486 Instruction *I0 = cast<Instruction>(*It);
487 if (all_of(VL, isVectorLikeInstWithConstOps))
488 return true;
489
490 BasicBlock *BB = I0->getParent();
491 for (Value *V : iterator_range(It, VL.end())) {
492 if (isa<PoisonValue>(V))
493 continue;
494 auto *II = dyn_cast<Instruction>(V);
495 if (!II)
496 return false;
497
498 if (BB != II->getParent())
499 return false;
500 }
501 return true;
502}
503
504/// \returns True if all of the values in \p VL are constants (but not
505/// globals/constant expressions).
506static bool allConstant(ArrayRef<Value *> VL) {
507 // Constant expressions and globals can't be vectorized like normal integer/FP
508 // constants.
509 return all_of(VL, isConstant);
510}
511
512/// \returns True if all of the values in \p VL are identical or some of them
513/// are UndefValue.
514static bool isSplat(ArrayRef<Value *> VL) {
515 Value *FirstNonUndef = nullptr;
516 for (Value *V : VL) {
517 if (isa<UndefValue>(V))
518 continue;
519 if (!FirstNonUndef) {
520 FirstNonUndef = V;
521 continue;
522 }
523 if (V != FirstNonUndef)
524 return false;
525 }
526 return FirstNonUndef != nullptr;
527}
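// For example, {%a, undef, %a} is a splat of %a, while {undef, undef} is not
// considered a splat because it contains no non-undef value.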
528
529/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
530/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
531/// patterns that make it effectively commutative (like equality comparisons
532/// with zero).
533/// In most cases, users should not call this function directly (since \p I and
534/// \p ValWithUses are the same). However, when analyzing interchangeable
535/// instructions, we need to use the converted opcode along with the original
536/// uses.
537/// \param I The instruction to check for commutativity
538/// \param ValWithUses The value whose uses are analyzed for special
539/// patterns
540static bool isCommutative(Instruction *I, Value *ValWithUses) {
541 if (auto *Cmp = dyn_cast<CmpInst>(I))
542 return Cmp->isCommutative();
543 if (auto *BO = dyn_cast<BinaryOperator>(I))
544 return BO->isCommutative() ||
545 (BO->getOpcode() == Instruction::Sub &&
546 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
547 all_of(
548 ValWithUses->uses(),
549 [](const Use &U) {
550 // Commutative, if icmp eq/ne sub, 0
551 CmpPredicate Pred;
552 if (match(U.getUser(),
553 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
554 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
555 return true;
556 // Commutative, if abs(sub nsw, true) or abs(sub, false).
557 ConstantInt *Flag;
558 return match(U.getUser(),
559 m_Intrinsic<Intrinsic::abs>(
560 m_Specific(U.get()), m_ConstantInt(Flag))) &&
561 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
562 Flag->isOne());
563 })) ||
564 (BO->getOpcode() == Instruction::FSub &&
565 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
566 all_of(ValWithUses->uses(), [](const Use &U) {
567 return match(U.getUser(),
568 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
569 }));
570 return I->isCommutative();
571}
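// For example, "%d = sub i32 %a, %b" is treated as commutative when all its
// uses look like "icmp eq i32 %d, 0" or "call i32 @llvm.abs.i32(i32 %d, i1
// true)": swapping the sub operands only negates %d, which those users cannot
// observe.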
572
573/// This is a helper function to check whether \p I is commutative.
574/// This is a convenience wrapper that calls the two-parameter version of
575/// isCommutative with the same instruction for both parameters. This is
576/// the common case where the instruction being checked for commutativity
577/// is the same as the instruction whose uses are analyzed for special
578/// patterns (see the two-parameter version above for details).
579/// \param I The instruction to check for commutativity
580/// \returns true if the instruction is commutative, false otherwise
581static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
582
583/// \returns number of operands of \p I, considering commutativity. Returns 2
584/// for commutative intrinsics.
585/// \param I The instruction to check for commutativity
586static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I) {
587 if (isa<IntrinsicInst>(I) && isCommutative(I)) {
588 // IntrinsicInst::isCommutative returns true if swapping the first "two"
589 // arguments to the intrinsic produces the same result.
590 constexpr unsigned IntrinsicNumOperands = 2;
591 return IntrinsicNumOperands;
592 }
593 return I->getNumOperands();
594}
595
596template <typename T>
597static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
598 unsigned Offset) {
599 static_assert(std::is_same_v<T, InsertElementInst> ||
600 std::is_same_v<T, ExtractElementInst>,
601 "unsupported T");
602 int Index = Offset;
603 if (const auto *IE = dyn_cast<T>(Inst)) {
604 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
605 if (!VT)
606 return std::nullopt;
607 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
608 if (!CI)
609 return std::nullopt;
610 if (CI->getValue().uge(VT->getNumElements()))
611 return std::nullopt;
612 Index *= VT->getNumElements();
613 Index += CI->getZExtValue();
614 return Index;
615 }
616 return std::nullopt;
617}
618
619/// \returns inserting or extracting index of InsertElement, ExtractElement or
620/// InsertValue instruction, using Offset as base offset for index.
621/// \returns std::nullopt if the index is not an immediate.
622static std::optional<unsigned> getElementIndex(const Value *Inst,
623 unsigned Offset = 0) {
624 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
625 return Index;
626 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
627 return Index;
628
629 int Index = Offset;
630
631 const auto *IV = dyn_cast<InsertValueInst>(Inst);
632 if (!IV)
633 return std::nullopt;
634
635 Type *CurrentType = IV->getType();
636 for (unsigned I : IV->indices()) {
637 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
638 Index *= ST->getNumElements();
639 CurrentType = ST->getElementType(I);
640 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
641 Index *= AT->getNumElements();
642 CurrentType = AT->getElementType();
643 } else {
644 return std::nullopt;
645 }
646 Index += I;
647 }
648 return Index;
649}
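// Worked example: for "insertvalue [2 x [2 x i32]] %agg, i32 %v, 1, 0" the
// aggregate flattens to 4 scalar slots and the returned index is
// (0 * 2 + 1) * 2 + 0 == 2, i.e. element [1][0].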
650
651/// \returns true if all of the values in \p VL use the same opcode.
652/// For comparison instructions, also checks if predicates match.
653/// PoisonValues are considered matching.
654/// Interchangeable instructions are not considered.
656 auto *It = find_if(VL, IsaPred<Instruction>);
657 if (It == VL.end())
658 return true;
659 Instruction *MainOp = cast<Instruction>(*It);
660 unsigned Opcode = MainOp->getOpcode();
661 bool IsCmpOp = isa<CmpInst>(MainOp);
662 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
663 : CmpInst::BAD_ICMP_PREDICATE;
664 return std::all_of(It, VL.end(), [&](Value *V) {
665 if (auto *CI = dyn_cast<CmpInst>(V))
666 return BasePred == CI->getPredicate();
667 if (auto *I = dyn_cast<Instruction>(V))
668 return I->getOpcode() == Opcode;
669 return isa<PoisonValue>(V);
670 });
671}
672
673namespace {
674/// Specifies the way the mask should be analyzed for undefs/poisonous elements
675/// in the shuffle mask.
676enum class UseMask {
677 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
678 ///< check for the mask elements for the first argument (mask
679 ///< indices are in range [0:VF)).
680 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
681 ///< for the mask elements for the second argument (mask indices
682 ///< are in range [VF:2*VF))
683 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
684 ///< future shuffle elements and mark them as ones as being used
685 ///< in future. Non-undef elements are considered as unused since
686 ///< they're already marked as used in the mask.
687};
688} // namespace
689
690/// Prepares a use bitset for the given mask either for the first argument or
691/// for the second.
692static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
693 UseMask MaskArg) {
694 SmallBitVector UseMask(VF, true);
695 for (auto [Idx, Value] : enumerate(Mask)) {
696 if (Value == PoisonMaskElem) {
697 if (MaskArg == UseMask::UndefsAsMask)
698 UseMask.reset(Idx);
699 continue;
700 }
701 if (MaskArg == UseMask::FirstArg && Value < VF)
702 UseMask.reset(Value);
703 else if (MaskArg == UseMask::SecondArg && Value >= VF)
704 UseMask.reset(Value - VF);
705 }
706 return UseMask;
707}
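// For example, with VF == 4 and Mask == <0, 5, poison, 2>:
// - UseMask::FirstArg clears bits 0 and 2 (first-operand lanes read by the
//   mask), leaving bits 1 and 3 set.
// - UseMask::SecondArg clears bit 1 (mask element 5 maps to lane 5 - VF == 1),
//   leaving bits 0, 2 and 3 set.
// - UseMask::UndefsAsMask clears only bit 2, where the mask element is poison.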
708
709/// Checks if the given value is actually an undefined constant vector.
710/// Also, if the \p UseMask is not empty, tries to check if the non-masked
711/// elements actually mask the insertelement buildvector, if any.
712template <bool IsPoisonOnly = false>
713static SmallBitVector isUndefVector(const Value *V,
714 const SmallBitVector &UseMask = {}) {
715 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
716 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
717 if (isa<T>(V))
718 return Res;
719 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
720 if (!VecTy)
721 return Res.reset();
722 auto *C = dyn_cast<Constant>(V);
723 if (!C) {
724 if (!UseMask.empty()) {
725 const Value *Base = V;
726 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
727 Base = II->getOperand(0);
728 if (isa<T>(II->getOperand(1)))
729 continue;
730 std::optional<unsigned> Idx = getElementIndex(II);
731 if (!Idx) {
732 Res.reset();
733 return Res;
734 }
735 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
736 Res.reset(*Idx);
737 }
738 // TODO: Add analysis for shuffles here too.
739 if (V == Base) {
740 Res.reset();
741 } else {
742 SmallBitVector SubMask(UseMask.size(), false);
743 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
744 }
745 } else {
746 Res.reset();
747 }
748 return Res;
749 }
750 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
751 if (Constant *Elem = C->getAggregateElement(I))
752 if (!isa<T>(Elem) &&
753 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
754 Res.reset(I);
755 }
756 return Res;
757}
758
759/// Checks if the vector of instructions can be represented as a shuffle, like:
760/// %x0 = extractelement <4 x i8> %x, i32 0
761/// %x3 = extractelement <4 x i8> %x, i32 3
762/// %y1 = extractelement <4 x i8> %y, i32 1
763/// %y2 = extractelement <4 x i8> %y, i32 2
764/// %x0x0 = mul i8 %x0, %x0
765/// %x3x3 = mul i8 %x3, %x3
766/// %y1y1 = mul i8 %y1, %y1
767/// %y2y2 = mul i8 %y2, %y2
768/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
769/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
770/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
771/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
772/// ret <4 x i8> %ins4
773/// can be transformed into:
774/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
775/// i32 6>
776/// %2 = mul <4 x i8> %1, %1
777/// ret <4 x i8> %2
778/// Mask will return the Shuffle Mask equivalent to the extracted elements.
779/// TODO: Can we split off and reuse the shuffle mask detection from
780/// ShuffleVectorInst/getShuffleCost?
781static std::optional<TargetTransformInfo::ShuffleKind>
782isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
783 AssumptionCache *AC) {
784 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
785 if (It == VL.end())
786 return std::nullopt;
787 unsigned Size =
788 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
789 auto *EI = dyn_cast<ExtractElementInst>(V);
790 if (!EI)
791 return S;
792 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
793 if (!VTy)
794 return S;
795 return std::max(S, VTy->getNumElements());
796 });
797
798 Value *Vec1 = nullptr;
799 Value *Vec2 = nullptr;
800 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
801 auto *EE = dyn_cast<ExtractElementInst>(V);
802 if (!EE)
803 return false;
804 Value *Vec = EE->getVectorOperand();
805 if (isa<UndefValue>(Vec))
806 return false;
807 return isGuaranteedNotToBePoison(Vec, AC);
808 });
809 enum ShuffleMode { Unknown, Select, Permute };
810 ShuffleMode CommonShuffleMode = Unknown;
811 Mask.assign(VL.size(), PoisonMaskElem);
812 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
813 // Undef can be represented as an undef element in a vector.
814 if (isa<UndefValue>(VL[I]))
815 continue;
816 auto *EI = cast<ExtractElementInst>(VL[I]);
817 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
818 return std::nullopt;
819 auto *Vec = EI->getVectorOperand();
820 // We can extractelement from undef or poison vector.
821 if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
822 continue;
823 // All vector operands must have the same number of vector elements.
824 if (isa<UndefValue>(Vec)) {
825 Mask[I] = I;
826 } else {
827 if (isa<UndefValue>(EI->getIndexOperand()))
828 continue;
829 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
830 if (!Idx)
831 return std::nullopt;
832 // Undefined behavior if Idx is negative or >= Size.
833 if (Idx->getValue().uge(Size))
834 continue;
835 unsigned IntIdx = Idx->getValue().getZExtValue();
836 Mask[I] = IntIdx;
837 }
838 if (isUndefVector(Vec).all() && HasNonUndefVec)
839 continue;
840 // For correct shuffling we have to have at most 2 different vector operands
841 // in all extractelement instructions.
842 if (!Vec1 || Vec1 == Vec) {
843 Vec1 = Vec;
844 } else if (!Vec2 || Vec2 == Vec) {
845 Vec2 = Vec;
846 Mask[I] += Size;
847 } else {
848 return std::nullopt;
849 }
850 if (CommonShuffleMode == Permute)
851 continue;
852 // If the extract index is not the same as the operation number, it is a
853 // permutation.
854 if (Mask[I] % Size != I) {
855 CommonShuffleMode = Permute;
856 continue;
857 }
858 CommonShuffleMode = Select;
859 }
860 // If we're not crossing lanes in different vectors, consider it as blending.
861 if (CommonShuffleMode == Select && Vec2)
862 return TargetTransformInfo::SK_Select;
863 // If Vec2 was never used, we have a permutation of a single vector, otherwise
864 // we have permutation of 2 vectors.
865 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
866 : TargetTransformInfo::SK_PermuteSingleSrc;
867}
868
869/// \returns True if Extract{Value,Element} instruction extracts element Idx.
870static std::optional<unsigned> getExtractIndex(const Instruction *E) {
871 unsigned Opcode = E->getOpcode();
872 assert((Opcode == Instruction::ExtractElement ||
873 Opcode == Instruction::ExtractValue) &&
874 "Expected extractelement or extractvalue instruction.");
875 if (Opcode == Instruction::ExtractElement) {
876 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
877 if (!CI)
878 return std::nullopt;
879 return CI->getZExtValue();
880 }
881 auto *EI = cast<ExtractValueInst>(E);
882 if (EI->getNumIndices() != 1)
883 return std::nullopt;
884 return *EI->idx_begin();
885}
886
887namespace llvm {
888/// Checks if the provided value does not require scheduling. It does not
889/// require scheduling if this is not an instruction or it is an instruction
890/// that does not read/write memory and all operands are either not instructions
891/// or phi nodes or instructions from different blocks.
892static bool areAllOperandsNonInsts(Value *V);
893/// Checks if the provided value does not require scheduling. It does not
894/// require scheduling if this is not an instruction or it is an instruction
895/// that does not read/write memory and all users are phi nodes or instructions
896/// from the different blocks.
897static bool isUsedOutsideBlock(Value *V);
898/// Checks if the specified value does not require scheduling. It does not
899/// require scheduling if all operands and all users do not need to be scheduled
900/// in the current basic block.
901static bool doesNotNeedToBeScheduled(Value *V);
902} // namespace llvm
903
904namespace {
905/// \returns true if \p Opcode is allowed as part of the main/alternate
906/// instruction for SLP vectorization.
907///
908/// Example of unsupported opcode is SDIV that can potentially cause UB if the
909/// "shuffled out" lane would result in division by zero.
910bool isValidForAlternation(unsigned Opcode) {
911 return !Instruction::isIntDivRem(Opcode);
912}
913
914/// Helper class that determines whether VL can use the same opcode.
915/// Alternate instructions are supported. In addition, it supports
916/// interchangeable instructions. An interchangeable instruction is one that can
917/// be converted to another instruction with the same semantics. For example,
918/// x << 1 is equal to x * 2, and x * 1 is equal to x | 0.
919class BinOpSameOpcodeHelper {
920 using MaskType = std::uint_fast16_t;
921 /// Sort SupportedOp because it is used by binary_search.
922 constexpr static std::initializer_list<unsigned> SupportedOp = {
923 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
924 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
925 enum : MaskType {
926 ShlBIT = 0b1,
927 AShrBIT = 0b10,
928 MulBIT = 0b100,
929 AddBIT = 0b1000,
930 SubBIT = 0b10000,
931 AndBIT = 0b100000,
932 OrBIT = 0b1000000,
933 XorBIT = 0b10000000,
934 MainOpBIT = 0b100000000,
936 };
937 /// Return a non-nullptr if either operand of I is a ConstantInt.
938 /// The second return value represents the operand position. We check the
939 /// right-hand side first (1). If the right-hand side is not a ConstantInt and
940 /// the instruction is neither Sub, Shl, nor AShr, we then check the left-hand
941 /// side (0).
942 static std::pair<ConstantInt *, unsigned>
943 isBinOpWithConstantInt(const Instruction *I) {
944 unsigned Opcode = I->getOpcode();
945 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
946 (void)SupportedOp;
947 auto *BinOp = cast<BinaryOperator>(I);
948 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
949 return {CI, 1};
950 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
951 Opcode == Instruction::AShr)
952 return {nullptr, 0};
953 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
954 return {CI, 0};
955 return {nullptr, 0};
956 }
957 struct InterchangeableInfo {
958 const Instruction *I = nullptr;
959 /// Each set bit represents an opcode that MainOp can be converted to.
960 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
961 MulBIT | AShrBIT | ShlBIT;
962 /// We cannot create an interchangeable instruction that does not exist in
963 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
964 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
965 /// 1]. SeenBefore is used to know what operations have been seen before.
966 MaskType SeenBefore = 0;
967 InterchangeableInfo(const Instruction *I) : I(I) {}
968 /// Returning false allows BinOpSameOpcodeHelper to find an alternate
969 /// instruction. Directly setting the mask will destroy the mask state,
970 /// preventing us from determining which instruction it should convert to.
971 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
972 if (Mask & InterchangeableMask) {
973 SeenBefore |= OpcodeInMaskForm;
974 Mask &= InterchangeableMask;
975 return true;
976 }
977 return false;
978 }
979 bool equal(unsigned Opcode) {
980 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
981 }
982 unsigned getOpcode() const {
983 MaskType Candidate = Mask & SeenBefore;
984 if (Candidate & MainOpBIT)
985 return I->getOpcode();
986 if (Candidate & ShlBIT)
987 return Instruction::Shl;
988 if (Candidate & AShrBIT)
989 return Instruction::AShr;
990 if (Candidate & MulBIT)
991 return Instruction::Mul;
992 if (Candidate & AddBIT)
993 return Instruction::Add;
994 if (Candidate & SubBIT)
995 return Instruction::Sub;
996 if (Candidate & AndBIT)
997 return Instruction::And;
998 if (Candidate & OrBIT)
999 return Instruction::Or;
1000 if (Candidate & XorBIT)
1001 return Instruction::Xor;
1002 llvm_unreachable("Cannot find interchangeable instruction.");
1003 }
1004
1005 /// Return true if the instruction can be converted to \p Opcode.
1006 bool hasCandidateOpcode(unsigned Opcode) const {
1007 MaskType Candidate = Mask & SeenBefore;
1008 switch (Opcode) {
1009 case Instruction::Shl:
1010 return Candidate & ShlBIT;
1011 case Instruction::AShr:
1012 return Candidate & AShrBIT;
1013 case Instruction::Mul:
1014 return Candidate & MulBIT;
1015 case Instruction::Add:
1016 return Candidate & AddBIT;
1017 case Instruction::Sub:
1018 return Candidate & SubBIT;
1019 case Instruction::And:
1020 return Candidate & AndBIT;
1021 case Instruction::Or:
1022 return Candidate & OrBIT;
1023 case Instruction::Xor:
1024 return Candidate & XorBIT;
1025 case Instruction::LShr:
1026 case Instruction::FAdd:
1027 case Instruction::FSub:
1028 case Instruction::FMul:
1029 case Instruction::SDiv:
1030 case Instruction::UDiv:
1031 case Instruction::FDiv:
1032 case Instruction::SRem:
1033 case Instruction::URem:
1034 case Instruction::FRem:
1035 return false;
1036 default:
1037 break;
1038 }
1039 llvm_unreachable("Cannot find interchangeable instruction.");
1040 }
1041
1042 SmallVector<Value *> getOperand(const Instruction *To) const {
1043 unsigned ToOpcode = To->getOpcode();
1044 unsigned FromOpcode = I->getOpcode();
1045 if (FromOpcode == ToOpcode)
1046 return SmallVector<Value *>(I->operands());
1047 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1048 auto [CI, Pos] = isBinOpWithConstantInt(I);
1049 const APInt &FromCIValue = CI->getValue();
1050 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1051 APInt ToCIValue;
1052 switch (FromOpcode) {
1053 case Instruction::Shl:
1054 if (ToOpcode == Instruction::Mul) {
1055 ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
1056 FromCIValue.getZExtValue());
1057 } else {
1058 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1059 ToCIValue = ToOpcode == Instruction::And
1060 ? APInt::getAllOnes(FromCIValueBitWidth)
1061 : APInt::getZero(FromCIValueBitWidth);
1062 }
1063 break;
1064 case Instruction::Mul:
1065 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1066 if (ToOpcode == Instruction::Shl) {
1067 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1068 } else {
1069 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1070 ToCIValue = ToOpcode == Instruction::And
1071 ? APInt::getAllOnes(FromCIValueBitWidth)
1072 : APInt::getZero(FromCIValueBitWidth);
1073 }
1074 break;
1075 case Instruction::Add:
1076 case Instruction::Sub:
1077 if (FromCIValue.isZero()) {
1078 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1079 } else {
1080 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1081 "Cannot convert the instruction.");
1082 ToCIValue = FromCIValue;
1083 ToCIValue.negate();
1084 }
1085 break;
1086 case Instruction::And:
1087 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1088 ToCIValue = ToOpcode == Instruction::Mul
1089 ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
1090 : APInt::getZero(FromCIValueBitWidth);
1091 break;
1092 default:
1093 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1094 ToCIValue = APInt::getZero(FromCIValueBitWidth);
1095 break;
1096 }
1097 Value *LHS = I->getOperand(1 - Pos);
1098 Constant *RHS =
1099 ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
1100 // constant + x cannot be -constant - x
1101 // instead, it should be x - -constant
1102 if (Pos == 1 ||
1103 ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
1104 FromOpcode == Instruction::Xor) &&
1105 ToOpcode == Instruction::Sub))
1106 return SmallVector<Value *>({LHS, RHS});
1107 return SmallVector<Value *>({RHS, LHS});
1108 }
1109 };
1110 InterchangeableInfo MainOp;
1111 InterchangeableInfo AltOp;
1112 bool isValidForAlternation(const Instruction *I) const {
1113 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1114 ::isValidForAlternation(I->getOpcode());
1115 }
1116 bool initializeAltOp(const Instruction *I) {
1117 if (AltOp.I)
1118 return true;
1119 if (!isValidForAlternation(I))
1120 return false;
1121 AltOp.I = I;
1122 return true;
1123 }
1124
1125public:
1126 BinOpSameOpcodeHelper(const Instruction *MainOp,
1127 const Instruction *AltOp = nullptr)
1128 : MainOp(MainOp), AltOp(AltOp) {
1129 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1130 }
1131 bool add(const Instruction *I) {
1133 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1134 unsigned Opcode = I->getOpcode();
1135 MaskType OpcodeInMaskForm;
1136 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1137 switch (Opcode) {
1138 case Instruction::Shl:
1139 OpcodeInMaskForm = ShlBIT;
1140 break;
1141 case Instruction::AShr:
1142 OpcodeInMaskForm = AShrBIT;
1143 break;
1144 case Instruction::Mul:
1145 OpcodeInMaskForm = MulBIT;
1146 break;
1147 case Instruction::Add:
1148 OpcodeInMaskForm = AddBIT;
1149 break;
1150 case Instruction::Sub:
1151 OpcodeInMaskForm = SubBIT;
1152 break;
1153 case Instruction::And:
1154 OpcodeInMaskForm = AndBIT;
1155 break;
1156 case Instruction::Or:
1157 OpcodeInMaskForm = OrBIT;
1158 break;
1159 case Instruction::Xor:
1160 OpcodeInMaskForm = XorBIT;
1161 break;
1162 default:
1163 return MainOp.equal(Opcode) ||
1164 (initializeAltOp(I) && AltOp.equal(Opcode));
1165 }
1166 MaskType InterchangeableMask = OpcodeInMaskForm;
1167 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1168 if (CI) {
1169 constexpr MaskType CanBeAll =
1170 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1171 const APInt &CIValue = CI->getValue();
1172 switch (Opcode) {
1173 case Instruction::Shl:
1174 if (CIValue.ult(CIValue.getBitWidth()))
1175 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1176 break;
1177 case Instruction::Mul:
1178 if (CIValue.isOne()) {
1179 InterchangeableMask = CanBeAll;
1180 break;
1181 }
1182 if (CIValue.isPowerOf2())
1183 InterchangeableMask = MulBIT | ShlBIT;
1184 break;
1185 case Instruction::Add:
1186 case Instruction::Sub:
1187 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1188 break;
1189 case Instruction::And:
1190 if (CIValue.isAllOnes())
1191 InterchangeableMask = CanBeAll;
1192 break;
1193 case Instruction::Xor:
1194 if (CIValue.isZero())
1195 InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
1196 break;
1197 default:
1198 if (CIValue.isZero())
1199 InterchangeableMask = CanBeAll;
1200 break;
1201 }
1202 }
1203 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1204 (initializeAltOp(I) &&
1205 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1206 }
1207 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1208 /// Checks if the list of potential opcodes includes \p Opcode.
1209 bool hasCandidateOpcode(unsigned Opcode) const {
1210 return MainOp.hasCandidateOpcode(Opcode);
1211 }
1212 bool hasAltOp() const { return AltOp.I; }
1213 unsigned getAltOpcode() const {
1214 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1215 }
1216 SmallVector<Value *> getOperand(const Instruction *I) const {
1217 return MainOp.getOperand(I);
1218 }
1219};
1220
1221/// Main data required for vectorization of instructions.
1222class InstructionsState {
1223 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1224 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1225 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1226 /// isAltShuffle).
1227 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1228 /// from getMainAltOpsNoStateVL.
1229 /// For those InstructionsState that use alternate instructions, the resulting
1230 /// vectorized output ultimately comes from a shufflevector. For example,
1231 /// given a vector list (VL):
1232 /// VL[0] = add i32 a, e
1233 /// VL[1] = sub i32 b, f
1234 /// VL[2] = add i32 c, g
1235 /// VL[3] = sub i32 d, h
1236 /// The vectorized result would be:
1237 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1238 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1239 /// result = shufflevector <4 x i32> intermediated_0,
1240 /// <4 x i32> intermediated_1,
1241 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1242 /// Since shufflevector is used in the final result, when calculating the cost
1243 /// (getEntryCost), we must account for the usage of shufflevector in
1244 /// GetVectorCost.
1245 Instruction *MainOp = nullptr;
1246 Instruction *AltOp = nullptr;
1247 /// Whether the instruction state represents copyable instructions.
1248 bool HasCopyables = false;
1249
1250public:
1251 Instruction *getMainOp() const {
1252 assert(valid() && "InstructionsState is invalid.");
1253 return MainOp;
1254 }
1255
1256 Instruction *getAltOp() const {
1257 assert(valid() && "InstructionsState is invalid.");
1258 return AltOp;
1259 }
1260
1261 /// The main/alternate opcodes for the list of instructions.
1262 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1263
1264 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1265
1266 /// Some of the instructions in the list have alternate opcodes.
1267 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1268
1269 /// Checks if the instruction matches either the main or alternate opcode.
1270 /// \returns
1271 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1272 /// to it
1273 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1274 /// it
1275 /// - nullptr if \param I cannot be matched or converted to either opcode
1276 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1277 assert(MainOp && "MainOp cannot be nullptr.");
1278 if (I->getOpcode() == MainOp->getOpcode())
1279 return MainOp;
1280 // Prefer AltOp instead of interchangeable instruction of MainOp.
1281 assert(AltOp && "AltOp cannot be nullptr.");
1282 if (I->getOpcode() == AltOp->getOpcode())
1283 return AltOp;
1284 if (!I->isBinaryOp())
1285 return nullptr;
1286 BinOpSameOpcodeHelper Converter(MainOp);
1287 if (!Converter.add(I) || !Converter.add(MainOp))
1288 return nullptr;
1289 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1290 BinOpSameOpcodeHelper AltConverter(AltOp);
1291 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1292 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1293 return AltOp;
1294 }
1295 if (Converter.hasAltOp() && !isAltShuffle())
1296 return nullptr;
1297 return Converter.hasAltOp() ? AltOp : MainOp;
1298 }
1299
1300 /// Checks if main/alt instructions are shift operations.
1301 bool isShiftOp() const {
1302 return getMainOp()->isShift() && getAltOp()->isShift();
1303 }
1304
1305 /// Checks if main/alt instructions are bitwise logic operations.
1306 bool isBitwiseLogicOp() const {
1307 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1308 }
1309
1310 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1311 bool isMulDivLikeOp() const {
1312 constexpr std::array<unsigned, 8> MulDiv = {
1313 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1314 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1315 Instruction::URem, Instruction::FRem};
1316 return is_contained(MulDiv, getOpcode()) &&
1317 is_contained(MulDiv, getAltOpcode());
1318 }
1319
1320 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1321 bool isAddSubLikeOp() const {
1322 constexpr std::array<unsigned, 4> AddSub = {
1323 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1324 Instruction::FSub};
1325 return is_contained(AddSub, getOpcode()) &&
1326 is_contained(AddSub, getAltOpcode());
1327 }
1328
1329 /// Checks if main/alt instructions are cmp operations.
1330 bool isCmpOp() const {
1331 return (getOpcode() == Instruction::ICmp ||
1332 getOpcode() == Instruction::FCmp) &&
1333 getAltOpcode() == getOpcode();
1334 }
1335
1336 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1337 bool valid() const { return MainOp && AltOp; }
1338
1339 explicit operator bool() const { return valid(); }
1340
1341 InstructionsState() = delete;
1342 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1343 bool HasCopyables = false)
1344 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1345 static InstructionsState invalid() { return {nullptr, nullptr}; }
1346
1347 /// Checks if the value is a copyable element.
1348 bool isCopyableElement(Value *V) const {
1349 assert(valid() && "InstructionsState is invalid.");
1350 if (!HasCopyables)
1351 return false;
1352 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1353 return false;
1354 auto *I = dyn_cast<Instruction>(V);
1355 if (!I)
1356 return !isa<PoisonValue>(V);
1357 if (I->getParent() != MainOp->getParent() &&
1360 return true;
1361 if (I->getOpcode() == MainOp->getOpcode())
1362 return false;
1363 if (!I->isBinaryOp())
1364 return true;
1365 BinOpSameOpcodeHelper Converter(MainOp);
1366 return !Converter.add(I) || !Converter.add(MainOp) ||
1367 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1368 }
1369
1370 /// Checks if the value is non-schedulable.
1371 bool isNonSchedulable(Value *V) const {
1372 assert(valid() && "InstructionsState is invalid.");
1373 auto *I = dyn_cast<Instruction>(V);
1374 if (!HasCopyables)
1375 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1376 doesNotNeedToBeScheduled(I);
1377 // MainOp for copyables is always schedulable to correctly identify
1378 // non-schedulable copyables.
1379 if (getMainOp() == V)
1380 return false;
1381 if (isCopyableElement(V)) {
1382 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1383 auto *I = dyn_cast<Instruction>(V);
1384 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
1386 // If the copyable instruction comes after MainOp
1387 // (non-schedulable, but used in the block) - cannot vectorize
1388 // it, will possibly generate use before def.
1389 !MainOp->comesBefore(I));
1390 };
1391
1392 return IsNonSchedulableCopyableElement(V);
1393 }
1394 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
1395 doesNotNeedToBeScheduled(I);
1396 }
1397
1398 /// Checks if the state represents copyable instructions.
1399 bool areInstructionsWithCopyableElements() const {
1400 assert(valid() && "InstructionsState is invalid.");
1401 return HasCopyables;
1402 }
1403};
1404
1405std::pair<Instruction *, SmallVector<Value *>>
1406convertTo(Instruction *I, const InstructionsState &S) {
1407 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1408 assert(SelectedOp && "Cannot convert the instruction.");
1409 if (I->isBinaryOp()) {
1410 BinOpSameOpcodeHelper Converter(I);
1411 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1412 }
1413 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1414}
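// For example, assuming S was built for a list whose MainOp is a mul,
// convertTo applied to "%x = shl i32 %a, 1" returns that mul as the selected
// instruction together with the operands {%a, i32 2}, i.e. x << 1 is rewritten
// as x * 2.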
1415
1416} // end anonymous namespace
1417
1418static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1419 const TargetLibraryInfo &TLI);
1420
1421/// Find an instruction with a specific opcode in VL.
1422/// \param VL Array of values to search through. Must contain only Instructions
1423/// and PoisonValues.
1424/// \param Opcode The instruction opcode to search for
1425/// \returns
1426/// - The first instruction found with matching opcode
1427/// - nullptr if no matching instruction is found
1428static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1429 unsigned Opcode) {
1430 for (Value *V : VL) {
1431 if (isa<PoisonValue>(V))
1432 continue;
1433 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1434 auto *Inst = cast<Instruction>(V);
1435 if (Inst->getOpcode() == Opcode)
1436 return Inst;
1437 }
1438 return nullptr;
1439}
1440
1441/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1442/// compatible instructions or constants, or just some other regular values.
1443static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1444 Value *Op1, const TargetLibraryInfo &TLI) {
1445 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1446 (isConstant(BaseOp1) && isConstant(Op1)) ||
1447 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1448 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1449 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1450 getSameOpcode({BaseOp0, Op0}, TLI) ||
1451 getSameOpcode({BaseOp1, Op1}, TLI);
1452}
1453
1454/// \returns true if a compare instruction \p CI has similar "look" and
1455/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1456/// swapped, false otherwise.
1457static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1458 const TargetLibraryInfo &TLI) {
1459 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1460 "Assessing comparisons of different types?");
1461 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1462 CmpInst::Predicate Pred = CI->getPredicate();
1463 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
1464
1465 Value *BaseOp0 = BaseCI->getOperand(0);
1466 Value *BaseOp1 = BaseCI->getOperand(1);
1467 Value *Op0 = CI->getOperand(0);
1468 Value *Op1 = CI->getOperand(1);
1469
1470 return (BasePred == Pred &&
1471 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1472 (BasePred == SwappedPred &&
1473 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1474}
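// For example, "icmp sgt i32 %a, %b" is considered the same as the base
// compare "icmp slt i32 %b, %a": the predicate matches after swapping, and the
// swapped operands are compatible per areCompatibleCmpOps.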
1475
1476/// \returns analysis of the Instructions in \p VL described in
1477/// InstructionsState, i.e. the Opcode with which we suppose the whole list
1478/// could be vectorized even if its structure is diverse.
1479static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1480 const TargetLibraryInfo &TLI) {
1481 // Make sure these are all Instructions.
1482 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
1483 return InstructionsState::invalid();
1484
1485 auto *It = find_if(VL, IsaPred<Instruction>);
1486 if (It == VL.end())
1487 return InstructionsState::invalid();
1488
1489 Instruction *MainOp = cast<Instruction>(*It);
1490 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1491 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1492 (VL.size() == 2 && InstCnt < 2))
1493 return InstructionsState::invalid();
1494
1495 bool IsCastOp = isa<CastInst>(MainOp);
1496 bool IsBinOp = isa<BinaryOperator>(MainOp);
1497 bool IsCmpOp = isa<CmpInst>(MainOp);
1498 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1499 : CmpInst::BAD_ICMP_PREDICATE;
1500 Instruction *AltOp = MainOp;
1501 unsigned Opcode = MainOp->getOpcode();
1502 unsigned AltOpcode = Opcode;
1503
1504 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1505 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1506 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1507 UniquePreds.insert(BasePred);
1508 UniqueNonSwappedPreds.insert(BasePred);
1509 for (Value *V : VL) {
1510 auto *I = dyn_cast<CmpInst>(V);
1511 if (!I)
1512 return false;
1513 CmpInst::Predicate CurrentPred = I->getPredicate();
1514 CmpInst::Predicate SwappedCurrentPred =
1515 CmpInst::getSwappedPredicate(CurrentPred);
1516 UniqueNonSwappedPreds.insert(CurrentPred);
1517 if (!UniquePreds.contains(CurrentPred) &&
1518 !UniquePreds.contains(SwappedCurrentPred))
1519 UniquePreds.insert(CurrentPred);
1520 }
1521 // If the total number of predicates is > 2, but only 2 remain once swapped
1522 // predicates are treated as equal, consider the swappable predicates as
1523 // compatible opcodes, not alternate ones.
1524 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1525 }();
1526 // Check for one alternate opcode from another BinaryOperator.
1527 // TODO - generalize to support all operators (types, calls etc.).
1528 Intrinsic::ID BaseID = 0;
1529 SmallVector<VFInfo> BaseMappings;
1530 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1531 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1532 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1533 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1534 return InstructionsState::invalid();
1535 }
1536 bool AnyPoison = InstCnt != VL.size();
1537 // Check MainOp too to be sure that it matches the requirements for the
1538 // instructions.
1539 for (Value *V : iterator_range(It, VL.end())) {
1540 auto *I = dyn_cast<Instruction>(V);
1541 if (!I)
1542 continue;
1543
1544 // Cannot combine poison and divisions.
1545 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1546 // intrinsics/functions only.
1547 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1548 return InstructionsState::invalid();
1549 unsigned InstOpcode = I->getOpcode();
1550 if (IsBinOp && isa<BinaryOperator>(I)) {
1551 if (BinOpHelper.add(I))
1552 continue;
1553 } else if (IsCastOp && isa<CastInst>(I)) {
1554 Value *Op0 = MainOp->getOperand(0);
1555 Type *Ty0 = Op0->getType();
1556 Value *Op1 = I->getOperand(0);
1557 Type *Ty1 = Op1->getType();
1558 if (Ty0 == Ty1) {
1559 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1560 continue;
1561 if (Opcode == AltOpcode) {
1562 assert(isValidForAlternation(Opcode) &&
1563 isValidForAlternation(InstOpcode) &&
1564 "Cast isn't safe for alternation, logic needs to be updated!");
1565 AltOpcode = InstOpcode;
1566 AltOp = I;
1567 continue;
1568 }
1569 }
1570 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1571 auto *BaseInst = cast<CmpInst>(MainOp);
1572 Type *Ty0 = BaseInst->getOperand(0)->getType();
1573 Type *Ty1 = Inst->getOperand(0)->getType();
1574 if (Ty0 == Ty1) {
1575 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1576 assert(InstOpcode == AltOpcode &&
1577 "Alternate instructions are only supported by BinaryOperator "
1578 "and CastInst.");
1579 // Check for compatible operands. If the corresponding operands are not
1580 // compatible - need to perform alternate vectorization.
1581 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1582 CmpInst::Predicate SwappedCurrentPred =
1583 CmpInst::getSwappedPredicate(CurrentPred);
1584
1585 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1586 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1587 continue;
1588
1589 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1590 continue;
1591 auto *AltInst = cast<CmpInst>(AltOp);
1592 if (MainOp != AltOp) {
1593 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1594 continue;
1595 } else if (BasePred != CurrentPred) {
1596 assert(
1597 isValidForAlternation(InstOpcode) &&
1598 "CmpInst isn't safe for alternation, logic needs to be updated!");
1599 AltOp = I;
1600 continue;
1601 }
1602 CmpInst::Predicate AltPred = AltInst->getPredicate();
1603 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1604 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1605 continue;
1606 }
1607 } else if (InstOpcode == Opcode) {
1608 assert(InstOpcode == AltOpcode &&
1609 "Alternate instructions are only supported by BinaryOperator and "
1610 "CastInst.");
1611 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1612 if (Gep->getNumOperands() != 2 ||
1613 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1614 return InstructionsState::invalid();
1615 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1616 if (!isVectorLikeInstWithConstOps(EI))
1617 return InstructionsState::invalid();
1618 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1619 auto *BaseLI = cast<LoadInst>(MainOp);
1620 if (!LI->isSimple() || !BaseLI->isSimple())
1621 return InstructionsState::invalid();
1622 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1623 auto *CallBase = cast<CallInst>(MainOp);
1624 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1625 return InstructionsState::invalid();
1626 if (Call->hasOperandBundles() &&
1627 (!CallBase->hasOperandBundles() ||
1628 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1629 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1630 CallBase->op_begin() +
1631 CallBase->getBundleOperandsStartIndex())))
1632 return InstructionsState::invalid();
1633 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
1634 if (ID != BaseID)
1635 return InstructionsState::invalid();
1636 if (!ID) {
1637 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1638 if (Mappings.size() != BaseMappings.size() ||
1639 Mappings.front().ISA != BaseMappings.front().ISA ||
1640 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1641 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1642 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1643 Mappings.front().Shape.Parameters !=
1644 BaseMappings.front().Shape.Parameters)
1645 return InstructionsState::invalid();
1646 }
1647 }
1648 continue;
1649 }
1650 return InstructionsState::invalid();
1651 }
1652
1653 if (IsBinOp) {
1654 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1655 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1656 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1657 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1658 }
1659 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1660 "Incorrect implementation of allSameOpcode.");
1661 InstructionsState S(MainOp, AltOp);
1662 assert(all_of(VL,
1663 [&](Value *V) {
1664 return isa<PoisonValue>(V) ||
1665 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1666 }) &&
1667 "Invalid InstructionsState.");
1668 return S;
1669}
1670
1671/// \returns true if all of the values in \p VL have the same type or false
1672/// otherwise.
1673static bool allSameType(ArrayRef<Value *> VL) {
1674 Type *Ty = VL.consume_front()->getType();
1675 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1676}
1677
1678/// \returns True if in-tree use also needs extract. This refers to
1679/// possible scalar operand in vectorized instruction.
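/// For example, if a vectorized load uses \p Scalar as its pointer operand,
/// the pointer is still consumed as a scalar, so an extract is required.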
1680static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1681 TargetLibraryInfo *TLI,
1682 const TargetTransformInfo *TTI) {
1683 if (!UserInst)
1684 return false;
1685 unsigned Opcode = UserInst->getOpcode();
1686 switch (Opcode) {
1687 case Instruction::Load: {
1688 LoadInst *LI = cast<LoadInst>(UserInst);
1689 return (LI->getPointerOperand() == Scalar);
1690 }
1691 case Instruction::Store: {
1692 StoreInst *SI = cast<StoreInst>(UserInst);
1693 return (SI->getPointerOperand() == Scalar);
1694 }
1695 case Instruction::Call: {
1696 CallInst *CI = cast<CallInst>(UserInst);
1697 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1698 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1699 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1700 Arg.value().get() == Scalar;
1701 });
1702 }
1703 default:
1704 return false;
1705 }
1706}
1707
1708/// \returns the AA location that is being accessed by the instruction.
1709static MemoryLocation getLocation(Instruction *I) {
1710 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1711 return MemoryLocation::get(SI);
1712 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1713 return MemoryLocation::get(LI);
1714 return MemoryLocation();
1715}
1716
1717/// \returns True if the instruction is not a volatile or atomic load/store.
1718static bool isSimple(Instruction *I) {
1719 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1720 return LI->isSimple();
1721 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1722 return SI->isSimple();
1723 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1724 return !MI->isVolatile();
1725 return true;
1726}
1727
1728/// Shuffles \p Mask in accordance with the given \p SubMask.
1729/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1730/// one but two input vectors.
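/// For example, composing Mask = [3, 2, 1, 0] with SubMask = [1, 0, 3, 2]
/// (without \p ExtendingManyInputs) gives
/// [Mask[1], Mask[0], Mask[3], Mask[2]] = [2, 3, 0, 1].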
1731static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1732 bool ExtendingManyInputs = false) {
1733 if (SubMask.empty())
1734 return;
1735 assert(
1736 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1737 // Check if input scalars were extended to match the size of other node.
1738 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1739 "SubMask with many inputs support must be larger than the mask.");
1740 if (Mask.empty()) {
1741 Mask.append(SubMask.begin(), SubMask.end());
1742 return;
1743 }
1744 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1745 int TermValue = std::min(Mask.size(), SubMask.size());
1746 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1747 if (SubMask[I] == PoisonMaskElem ||
1748 (!ExtendingManyInputs &&
1749 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1750 continue;
1751 NewMask[I] = Mask[SubMask[I]];
1752 }
1753 Mask.swap(NewMask);
1754}
1755
1756/// Order may have elements assigned special value (size) which is out of
1757/// bounds. Such indices only appear on places which correspond to undef values
1758/// (see canReuseExtract for details) and are used to prevent undef values
1759/// from affecting the operand ordering.
1760/// The first loop below simply finds all unused indices and then the next loop
1761/// nest assigns these indices for undef values positions.
1762/// As an example below Order has two undef positions and they have assigned
1763/// values 3 and 7 respectively:
1764/// before: 6 9 5 4 9 2 1 0
1765/// after: 6 3 5 4 7 2 1 0
1766static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1767 const size_t Sz = Order.size();
1768 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1769 SmallBitVector MaskedIndices(Sz);
1770 for (unsigned I = 0; I < Sz; ++I) {
1771 if (Order[I] < Sz)
1772 UnusedIndices.reset(Order[I]);
1773 else
1774 MaskedIndices.set(I);
1775 }
1776 if (MaskedIndices.none())
1777 return;
1778 assert(UnusedIndices.count() == MaskedIndices.count() &&
1779 "Non-synced masked/available indices.");
1780 int Idx = UnusedIndices.find_first();
1781 int MIdx = MaskedIndices.find_first();
1782 while (MIdx >= 0) {
1783 assert(Idx >= 0 && "Indices must be synced.");
1784 Order[MIdx] = Idx;
1785 Idx = UnusedIndices.find_next(Idx);
1786 MIdx = MaskedIndices.find_next(MIdx);
1787 }
1788}
1789
1790/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1791/// Opcode1.
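/// For example, for a scalar \p ScalarTy and VL = [add, sub, add, sub] with
/// Opcode0 = Add and Opcode1 = Sub, only the bits for lanes 1 and 3 (the sub
/// lanes) are set.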
1792static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1793 unsigned Opcode0, unsigned Opcode1) {
1794 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1795 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1796 for (unsigned Lane : seq<unsigned>(VL.size())) {
1797 if (isa<PoisonValue>(VL[Lane]))
1798 continue;
1799 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1800 OpcodeMask.set(Lane * ScalarTyNumElements,
1801 Lane * ScalarTyNumElements + ScalarTyNumElements);
1802 }
1803 return OpcodeMask;
1804}
1805
1806/// Replicates the given \p Val \p VF times.
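/// For example, Val = [C0, C1] with VF = 3 produces [C0, C0, C0, C1, C1, C1].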
1808 unsigned VF) {
1809 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1810 "Expected scalar constants.");
1811 SmallVector<Constant *> NewVal(Val.size() * VF);
1812 for (auto [I, V] : enumerate(Val))
1813 std::fill_n(NewVal.begin() + I * VF, VF, V);
1814 return NewVal;
1815}
1816
1817namespace llvm {
1818
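/// Computes the mask that is the inverse of the permutation \p Indices, i.e.
/// Mask[Indices[I]] = I. For example, Indices = {2, 0, 1} gives
/// Mask = [1, 2, 0].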
1819static void inversePermutation(ArrayRef<unsigned> Indices,
1820 SmallVectorImpl<int> &Mask) {
1821 Mask.clear();
1822 const unsigned E = Indices.size();
1823 Mask.resize(E, PoisonMaskElem);
1824 for (unsigned I = 0; I < E; ++I)
1825 Mask[Indices[I]] = I;
1826}
1827
1828/// Reorders the list of scalars in accordance with the given \p Mask.
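/// For example, Scalars = [a, b, c, d] with Mask = [2, 0, 3, 1] becomes
/// [b, d, a, c], since element I of the input moves to position Mask[I].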
1829static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1830 ArrayRef<int> Mask) {
1831 assert(!Mask.empty() && "Expected non-empty mask.");
1832 SmallVector<Value *> Prev(Scalars.size(),
1833 PoisonValue::get(Scalars.front()->getType()));
1834 Prev.swap(Scalars);
1835 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1836 if (Mask[I] != PoisonMaskElem)
1837 Scalars[Mask[I]] = Prev[I];
1838}
1839
1840/// Checks if the provided value does not require scheduling. It does not
1841/// require scheduling if this is not an instruction or it is an instruction
1842/// that does not read/write memory and all operands are either not instructions
1843/// or phi nodes or instructions from different blocks.
1844static bool areAllOperandsNonInsts(Value *V) {
1845 auto *I = dyn_cast<Instruction>(V);
1846 if (!I)
1847 return true;
1848 return !mayHaveNonDefUseDependency(*I) &&
1849 all_of(I->operands(), [I](Value *V) {
1850 auto *IO = dyn_cast<Instruction>(V);
1851 if (!IO)
1852 return true;
1853 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1854 });
1855}
1856
1857/// Checks if the provided value does not require scheduling. It does not
1858/// require scheduling if this is not an instruction or it is an instruction
1859/// that does not read/write memory and all users are phi nodes or instructions
1860/// from different blocks.
1861static bool isUsedOutsideBlock(Value *V) {
1862 auto *I = dyn_cast<Instruction>(V);
1863 if (!I)
1864 return true;
1865 // Limits the number of uses to save compile time.
1866 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1867 all_of(I->users(), [I](User *U) {
1868 auto *IU = dyn_cast<Instruction>(U);
1869 if (!IU)
1870 return true;
1871 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1872 });
1873}
1874
1875/// Checks if the specified value does not require scheduling. It does not
1876/// require scheduling if all operands and all users do not need to be scheduled
1877/// in the current basic block.
1878static bool doesNotNeedToBeScheduled(Value *V) {
1879 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1880}
1881
1882/// Checks if the specified array of instructions does not require scheduling.
1883/// This is the case if, for every instruction, either its operands do not
1884/// require scheduling or its users do not require scheduling because they are
1885/// phis or live in other basic blocks.
1886static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1887 return !VL.empty() &&
1888 all_of(VL, [](Value *V) { return doesNotNeedToBeScheduled(V); });
1889}
1890
1891/// Returns true if widened type of \p Ty elements with size \p Sz represents
1892/// full vector type, i.e. adding extra element results in extra parts upon type
1893/// legalization.
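/// For example, any power-of-2 \p Sz qualifies; on a typical target with
/// 128-bit vector registers, 12 x i32 also qualifies (three full 4-element
/// parts), while 6 x i32 does not.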
1894static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1895 unsigned Sz) {
1896 if (Sz <= 1)
1897 return false;
1898 if (!isValidElementType(Ty))
1899 return false;
1900 if (has_single_bit(Sz))
1901 return true;
1902 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1903 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1904 Sz % NumParts == 0;
1905}
1906
1907/// Returns number of parts, the type \p VecTy will be split at the codegen
1908/// phase. If the type is going to be scalarized or does not use whole
1909/// registers, returns 1.
1910static unsigned
1911getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1912 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1913 unsigned NumParts = TTI.getNumberOfParts(VecTy);
1914 if (NumParts == 0 || NumParts >= Limit)
1915 return 1;
1916 unsigned Sz = getNumElements(VecTy);
1917 if (NumParts >= Sz || Sz % NumParts != 0 ||
1918 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1919 return 1;
1920 return NumParts;
1921}
1922
1923namespace slpvectorizer {
1924
1925/// Bottom Up SLP Vectorizer.
1926class BoUpSLP {
1927 class TreeEntry;
1928 class ScheduleEntity;
1929 class ScheduleData;
1930 class ScheduleCopyableData;
1931 class ScheduleBundle;
1934
1935 /// If we decide to generate strided load / store, this struct contains all
1936 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1937 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
1938 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1939 /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1940 /// the element size of the FixedVectorType.
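/// For example, with Ty = <4 x i32> and StrideVal = 2, the stride in bytes is
/// 2 * 4 = 8.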
1941 struct StridedPtrInfo {
1942 Value *StrideVal = nullptr;
1943 const SCEV *StrideSCEV = nullptr;
1944 FixedVectorType *Ty = nullptr;
1945 };
1946 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1947
1948public:
1949 /// Tracks the state we can represent the loads in the given sequence.
1957
1964
1965 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1966 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1967 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1968 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1969 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1970 AC(AC), DB(DB), DL(DL), ORE(ORE),
1971 Builder(Se->getContext(), TargetFolder(*DL)) {
1972 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1973 // Use the vector register size specified by the target unless overridden
1974 // by a command-line option.
1975 // TODO: It would be better to limit the vectorization factor based on
1976 // data type rather than just register size. For example, x86 AVX has
1977 // 256-bit registers, but it does not support integer operations
1978 // at that width (that requires AVX2).
1979 if (MaxVectorRegSizeOption.getNumOccurrences())
1980 MaxVecRegSize = MaxVectorRegSizeOption;
1981 else
1982 MaxVecRegSize =
1983 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1984 .getFixedValue();
1985
1986 if (MinVectorRegSizeOption.getNumOccurrences())
1987 MinVecRegSize = MinVectorRegSizeOption;
1988 else
1989 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1990 }
1991
1992 /// Vectorize the tree that starts with the elements in \p VL.
1993 /// Returns the vectorized root.
1994 Value *vectorizeTree();
1995
1996 /// Vectorize the tree but with the list of externally used values \p
1997 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1998 /// generated extractvalue instructions.
1999 Value *vectorizeTree(
2000 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2001 Instruction *ReductionRoot = nullptr,
2002 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2003
2004 /// \returns the cost incurred by unwanted spills and fills, caused by
2005 /// holding live values over call sites.
2006 InstructionCost getSpillCost();
2007
2008 /// \returns the vectorization cost of the subtree that starts at \p VL.
2009 /// A negative number means that this is profitable.
2010 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
2011 InstructionCost ReductionCost = TTI::TCC_Free);
2012
2013 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2014 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2015 void buildTree(ArrayRef<Value *> Roots,
2016 const SmallDenseSet<Value *> &UserIgnoreLst);
2017
2018 /// Construct a vectorizable tree that starts at \p Roots.
2019 void buildTree(ArrayRef<Value *> Roots);
2020
2021 /// Return the scalars of the root node.
2023 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2024 return VectorizableTree.front()->Scalars;
2025 }
2026
2027 /// Returns the type/is-signed info for the root node in the graph without
2028 /// casting.
2029 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2030 const TreeEntry &Root = *VectorizableTree.front();
2031 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2032 !Root.Scalars.front()->getType()->isIntegerTy())
2033 return std::nullopt;
2034 auto It = MinBWs.find(&Root);
2035 if (It != MinBWs.end())
2036 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2037 It->second.first),
2038 It->second.second);
2039 if (Root.getOpcode() == Instruction::ZExt ||
2040 Root.getOpcode() == Instruction::SExt)
2041 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2042 Root.getOpcode() == Instruction::SExt);
2043 return std::nullopt;
2044 }
2045
2046 /// Checks if the root graph node can be emitted with narrower bitwidth at
2047 /// codegen and returns it signedness, if so.
2049 return MinBWs.at(VectorizableTree.front().get()).second;
2050 }
2051
2052 /// Returns reduction type after minbitdth analysis.
2054 if (ReductionBitWidth == 0 ||
2055 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2056 ReductionBitWidth >=
2057 DL->getTypeSizeInBits(
2058 VectorizableTree.front()->Scalars.front()->getType()))
2059 return getWidenedType(
2060 VectorizableTree.front()->Scalars.front()->getType(),
2061 VectorizableTree.front()->getVectorFactor());
2062 return getWidenedType(
2064 VectorizableTree.front()->Scalars.front()->getContext(),
2065 ReductionBitWidth),
2066 VectorizableTree.front()->getVectorFactor());
2067 }
2068
2069 /// Builds external uses of the vectorized scalars, i.e. the list of
2070 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2071 /// ExternallyUsedValues contains additional list of external uses to handle
2072 /// vectorization of reductions.
2073 void
2074 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2075
2076 /// Transforms graph nodes to target specific representations, if profitable.
2077 void transformNodes();
2078
2079 /// Clear the internal data structures that are created by 'buildTree'.
2080 void deleteTree() {
2081 VectorizableTree.clear();
2082 ScalarToTreeEntries.clear();
2083 OperandsToTreeEntry.clear();
2084 ScalarsInSplitNodes.clear();
2085 MustGather.clear();
2086 NonScheduledFirst.clear();
2087 EntryToLastInstruction.clear();
2088 LoadEntriesToVectorize.clear();
2089 IsGraphTransformMode = false;
2090 GatheredLoadsEntriesFirst.reset();
2091 CompressEntryToData.clear();
2092 ExternalUses.clear();
2093 ExternalUsesAsOriginalScalar.clear();
2094 ExternalUsesWithNonUsers.clear();
2095 for (auto &Iter : BlocksSchedules) {
2096 BlockScheduling *BS = Iter.second.get();
2097 BS->clear();
2098 }
2099 MinBWs.clear();
2100 ReductionBitWidth = 0;
2101 BaseGraphSize = 1;
2102 CastMaxMinBWSizes.reset();
2103 ExtraBitWidthNodes.clear();
2104 InstrElementSize.clear();
2105 UserIgnoreList = nullptr;
2106 PostponedGathers.clear();
2107 ValueToGatherNodes.clear();
2108 TreeEntryToStridedPtrInfoMap.clear();
2109 }
2110
2111 unsigned getTreeSize() const { return VectorizableTree.size(); }
2112
2113 /// Returns the base graph size, before any transformations.
2114 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2115
2116 /// Perform LICM and CSE on the newly generated gather sequences.
2117 void optimizeGatherSequence();
2118
2119 /// Does this non-empty order represent an identity order? Identity
2120 /// should be represented as an empty order, so this is used to
2121 /// decide if we can canonicalize a computed order. Undef elements
2122 /// (represented as size) are ignored.
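/// For example, with Sz = 4 the order {0, 4, 2, 3} is still treated as an
/// identity, because the out-of-bounds value 4 marks an ignored element.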
2123 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2124 assert(!Order.empty() && "expected non-empty order");
2125 const unsigned Sz = Order.size();
2126 return all_of(enumerate(Order), [&](const auto &P) {
2127 return P.value() == P.index() || P.value() == Sz;
2128 });
2129 }
2130
2131 /// Checks if the specified gather tree entry \p TE can be represented as a
2132 /// shuffled vector entry + (possibly) permutation with other gathers. It
2133 /// implements the checks only for possibly ordered scalars (Loads,
2134 /// ExtractElement, ExtractValue), which can be part of the graph.
2135 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2136 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2137 /// node might be ignored.
2138 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2139 bool TopToBottom,
2140 bool IgnoreReorder);
2141
2142 /// Sort loads into increasing pointers offsets to allow greater clustering.
2143 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2144
2145 /// Gets reordering data for the given tree entry. If the entry is vectorized
2146 /// - just return ReorderIndices, otherwise check if the scalars can be
2147 /// reordered and return the most optimal order.
2148 /// \return std::nullopt if ordering is not important, empty order, if
2149 /// identity order is important, or the actual order.
2150 /// \param TopToBottom If true, include the order of vectorized stores and
2151 /// insertelement nodes, otherwise skip them.
2152 /// \param IgnoreReorder true, if the root node order can be ignored.
2153 std::optional<OrdersType>
2154 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2155
2156 /// Checks if it is profitable to reorder the current tree.
2157 /// If the tree does not contain many profitable reordable nodes, better to
2158 /// skip it to save compile time.
2159 bool isProfitableToReorder() const;
2160
2161 /// Reorders the current graph to the most profitable order starting from the
2162 /// root node to the leaf nodes. The best order is chosen only from the nodes
2163 /// of the same size (vectorization factor). Smaller nodes are considered
2164 /// parts of subgraph with smaller VF and they are reordered independently. We
2165 /// can make it because we still need to extend smaller nodes to the wider VF
2166 /// and we can merge reordering shuffles with the widening shuffles.
2167 void reorderTopToBottom();
2168
2169 /// Reorders the current graph to the most profitable order starting from
2170 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2171 /// number of reshuffles if the leaf nodes use the same order. In this case we
2172 /// can merge the orders and just shuffle user node instead of shuffling its
2173 /// operands. Plus, even the leaf nodes have different orders, it allows to
2174 /// sink reordering in the graph closer to the root node and merge it later
2175 /// during analysis.
2176 void reorderBottomToTop(bool IgnoreReorder = false);
2177
2178 /// \return The vector element size in bits to use when vectorizing the
2179 /// expression tree ending at \p V. If V is a store, the size is the width of
2180 /// the stored value. Otherwise, the size is the width of the largest loaded
2181 /// value reaching V. This method is used by the vectorizer to calculate
2182 /// vectorization factors.
2183 unsigned getVectorElementSize(Value *V);
2184
2185 /// Compute the minimum type sizes required to represent the entries in a
2186 /// vectorizable tree.
2187 void computeMinimumValueSizes();
2188
2189 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2190 unsigned getMaxVecRegSize() const {
2191 return MaxVecRegSize;
2192 }
2193
2194 // \returns minimum vector register size as set by cl::opt.
2195 unsigned getMinVecRegSize() const {
2196 return MinVecRegSize;
2197 }
2198
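/// \returns the minimum vectorization factor for scalars of \p Sz bits; for
/// example, with a minimum vector register size of 128 bits and Sz = 32 this
/// yields max(2, 128 / 32) = 4 lanes.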
2199 unsigned getMinVF(unsigned Sz) const {
2200 return std::max(2U, getMinVecRegSize() / Sz);
2201 }
2202
2203 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2204 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2205 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2206 return MaxVF ? MaxVF : UINT_MAX;
2207 }
2208
2209 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2210 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2211 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2212 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2213 ///
2214 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2215 unsigned canMapToVector(Type *T) const;
2216
2217 /// \returns True if the VectorizableTree is both tiny and not fully
2218 /// vectorizable. We do not vectorize such trees.
2219 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2220
2221 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2222 /// It may happen, if all gather nodes are loads and they cannot be
2223 /// "clusterized". In this case even subgraphs cannot be vectorized more
2224 /// effectively than the base graph.
2225 bool isTreeNotExtendable() const;
2226
2227 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2228 /// can be load combined in the backend. Load combining may not be allowed in
2229 /// the IR optimizer, so we do not want to alter the pattern. For example,
2230 /// partially transforming a scalar bswap() pattern into vector code is
2231 /// effectively impossible for the backend to undo.
2232 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2233 /// may not be necessary.
2234 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2235
2236 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2237 /// can be load combined in the backend. Load combining may not be allowed in
2238 /// the IR optimizer, so we do not want to alter the pattern. For example,
2239 /// partially transforming a scalar bswap() pattern into vector code is
2240 /// effectively impossible for the backend to undo.
2241 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2242 /// may not be necessary.
2243 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2244 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2245 Align Alignment, const int64_t Diff,
2246 const size_t Sz) const;
2247
2248 /// Return true if an array of scalar loads can be replaced with a strided
2249 /// load (with constant stride).
2250 ///
2251 /// TODO:
2252 /// It is possible that the load gets "widened". Suppose that originally each
2253 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2254 /// constant):
/// %b + 0 * %s + 0
/// %b + 0 * %s + 1
/// %b + 0 * %s + 2
2255 /// ...
2256 /// %b + 0 * %s + (w - 1)
2257 ///
2258 /// %b + 1 * %s + 0
2259 /// %b + 1 * %s + 1
2260 /// %b + 1 * %s + 2
2261 /// ...
2262 /// %b + 1 * %s + (w - 1)
2263 /// ...
2264 ///
2265 /// %b + (n - 1) * %s + 0
2266 /// %b + (n - 1) * %s + 1
2267 /// %b + (n - 1) * %s + 2
2268 /// ...
2269 /// %b + (n - 1) * %s + (w - 1)
2270 ///
2271 /// In this case we will generate a strided load of type `<n x (k * w)>`.
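/// For example, with k = 4 (i32 loads), w = 2 and n = 4, the result is a
/// strided load of 4 elements of 8 bytes each.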
2272 ///
2273 /// \param PointerOps list of pointer arguments of loads.
2274 /// \param ElemTy original scalar type of loads.
2275 /// \param Alignment alignment of the first load.
2276 /// \param SortedIndices is the order of PointerOps as returned by
2277 /// `sortPtrAccesses`
2278 /// \param Diff Pointer difference between the lowest and the highest pointer
2279 /// in `PointerOps` as returned by `getPointersDiff`.
2280 /// \param Ptr0 first pointer in `PointersOps`.
2281 /// \param PtrN last pointer in `PointersOps`.
2282 /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2283 /// of `SPtrInfo` necessary to generate the strided load later.
2285 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2286 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2287 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2288
2289 /// Return true if an array of scalar loads can be replaced with a strided
2290 /// load (with run-time stride).
2291 /// \param PointerOps list of pointer arguments of loads.
2292 /// \param ScalarTy type of loads.
2293 /// \param CommonAlignment common alignment of loads as computed by
2294 /// `computeCommonAlignment<LoadInst>`.
2295 /// \param SortedIndices is a list of indices computed by this function such
2296 /// that the sequence `PointerOps[SortedIndices[0]],
2297 /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2298 /// ordered by the coefficient of the stride. For example, if PointerOps is
2299 /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
2300 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to be
2301 /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
2302 /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2303 /// of `SPtrInfo` necessary to generate the strided load later.
2304 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2305 Align CommonAlignment,
2306 SmallVectorImpl<unsigned> &SortedIndices,
2307 StridedPtrInfo &SPtrInfo) const;
2308
2309 /// Checks if the given array of loads can be represented as a vectorized,
2310 /// scatter or just simple gather.
2311 /// \param VL list of loads.
2312 /// \param VL0 main load value.
2313 /// \param Order returned order of load instructions.
2314 /// \param PointerOps returned list of pointer operands.
2315 /// \param BestVF return best vector factor, if recursive check found better
2316 /// vectorization sequences rather than masked gather.
2317 /// \param TryRecursiveCheck used to check if long masked gather can be
2318 /// represented as a series of loads/insert subvector, if profitable.
2319 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2320 SmallVectorImpl<unsigned> &Order,
2321 SmallVectorImpl<Value *> &PointerOps,
2322 StridedPtrInfo &SPtrInfo,
2323 unsigned *BestVF = nullptr,
2324 bool TryRecursiveCheck = true) const;
2325
2326 /// Registers non-vectorizable sequence of loads
2327 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2328 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2329 }
2330
2331 /// Checks if the given loads sequence is known as not vectorizable
2332 template <typename T>
2334 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2335 }
2336
2338
2339 /// This structure holds any data we need about the edges being traversed
2340 /// during buildTreeRec(). We keep track of:
2341 /// (i) the user TreeEntry index, and
2342 /// (ii) the index of the edge.
2343 struct EdgeInfo {
2344 EdgeInfo() = default;
2345 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2346 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2347 /// The user TreeEntry.
2348 TreeEntry *UserTE = nullptr;
2349 /// The operand index of the use.
2350 unsigned EdgeIdx = UINT_MAX;
2351#ifndef NDEBUG
2352 friend inline raw_ostream &operator<<(raw_ostream &OS,
2353 const BoUpSLP::EdgeInfo &EI) {
2354 EI.dump(OS);
2355 return OS;
2356 }
2357 /// Debug print.
2358 void dump(raw_ostream &OS) const {
2359 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2360 << " EdgeIdx:" << EdgeIdx << "}";
2361 }
2362 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2363#endif
2364 bool operator == (const EdgeInfo &Other) const {
2365 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2366 }
2367
2368 operator bool() const { return UserTE != nullptr; }
2369 };
2370 friend struct DenseMapInfo<EdgeInfo>;
2371
2372 /// A helper class used for scoring candidates for two consecutive lanes.
2373 class LookAheadHeuristics {
2374 const TargetLibraryInfo &TLI;
2375 const DataLayout &DL;
2376 ScalarEvolution &SE;
2377 const BoUpSLP &R;
2378 int NumLanes; // Total number of lanes (aka vectorization factor).
2379 int MaxLevel; // The maximum recursion depth for accumulating score.
2380
2381 public:
2382 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2383 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2384 int MaxLevel)
2385 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2386 MaxLevel(MaxLevel) {}
2387
2388 // The hard-coded scores listed here are not very important, though it shall
2389 // be higher for better matches to improve the resulting cost. When
2390 // computing the scores of matching one sub-tree with another, we are
2391 // basically counting the number of values that are matching. So even if all
2392 // scores are set to 1, we would still get a decent matching result.
2393 // However, sometimes we have to break ties. For example we may have to
2394 // choose between matching loads vs matching opcodes. This is what these
2395 // scores are helping us with: they provide the order of preference. Also,
2396 // this is important if the scalar is externally used or used in another
2397 // tree entry node in the different lane.
2398
2399 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2400 static const int ScoreConsecutiveLoads = 4;
2401 /// The same load multiple times. This should have a better score than
2402 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2403 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
2404 /// a vector load plus 1.0 for a broadcast.
2405 static const int ScoreSplatLoads = 3;
2406 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2407 static const int ScoreReversedLoads = 3;
2408 /// A load candidate for masked gather.
2409 static const int ScoreMaskedGatherCandidate = 1;
2410 /// ExtractElementInst from same vector and consecutive indexes.
2411 static const int ScoreConsecutiveExtracts = 4;
2412 /// ExtractElementInst from same vector and reversed indices.
2413 static const int ScoreReversedExtracts = 3;
2414 /// Constants.
2415 static const int ScoreConstants = 2;
2416 /// Instructions with the same opcode.
2417 static const int ScoreSameOpcode = 2;
2418 /// Instructions with alt opcodes (e.g, add + sub).
2419 static const int ScoreAltOpcodes = 1;
2420 /// Identical instructions (a.k.a. splat or broadcast).
2421 static const int ScoreSplat = 1;
2422 /// Matching with an undef is preferable to failing.
2423 static const int ScoreUndef = 1;
2424 /// Score for failing to find a decent match.
2425 static const int ScoreFail = 0;
2426 /// Score if all users are vectorized.
2427 static const int ScoreAllUserVectorized = 1;
2428
2429 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2430 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2431 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2432 /// MainAltOps.
2433 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2434 ArrayRef<Value *> MainAltOps) const {
2435 if (!isValidElementType(V1->getType()) ||
2436 !isValidElementType(V2->getType()))
2437 return LookAheadHeuristics::ScoreFail;
2438
2439 if (V1 == V2) {
2440 if (isa<LoadInst>(V1)) {
2441 // Returns true if the users of V1 and V2 won't need to be extracted.
2442 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2443 // Bail out if we have too many uses to save compilation time.
2444 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2445 return false;
2446
2447 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2448 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2449 return U == U1 || U == U2 || R.isVectorized(U);
2450 });
2451 };
2452 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2453 };
2454 // A broadcast of a load can be cheaper on some targets.
2455 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2456 ElementCount::getFixed(NumLanes)) &&
2457 ((int)V1->getNumUses() == NumLanes ||
2458 AllUsersAreInternal(V1, V2)))
2459 return LookAheadHeuristics::ScoreSplatLoads;
2460 }
2461 return LookAheadHeuristics::ScoreSplat;
2462 }
2463
2464 auto CheckSameEntryOrFail = [&]() {
2465 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2466 SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
2467 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2468 !TEs2.empty() &&
2469 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2470 return LookAheadHeuristics::ScoreSplatLoads;
2471 }
2472 return LookAheadHeuristics::ScoreFail;
2473 };
2474
2475 auto *LI1 = dyn_cast<LoadInst>(V1);
2476 auto *LI2 = dyn_cast<LoadInst>(V2);
2477 if (LI1 && LI2) {
2478 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2479 !LI2->isSimple())
2480 return CheckSameEntryOrFail();
2481
2482 std::optional<int64_t> Dist = getPointersDiff(
2483 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2484 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2485 if (!Dist || *Dist == 0) {
2486 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2487 getUnderlyingObject(LI2->getPointerOperand()) &&
2488 R.TTI->isLegalMaskedGather(
2489 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2490 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2491 return CheckSameEntryOrFail();
2492 }
2493 // The distance is too large - still may be profitable to use masked
2494 // loads/gathers.
2495 if (std::abs(*Dist) > NumLanes / 2)
2496 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2497 // This still will detect consecutive loads, but we might have "holes"
2498 // in some cases. It is ok for non-power-2 vectorization and may produce
2499 // better results. It should not affect current vectorization.
2500 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2501 : LookAheadHeuristics::ScoreReversedLoads;
2502 }
2503
2504 auto *C1 = dyn_cast<Constant>(V1);
2505 auto *C2 = dyn_cast<Constant>(V2);
2506 if (C1 && C2)
2507 return LookAheadHeuristics::ScoreConstants;
2508
2509 // Consider constants and buildvector compatible.
2510 if ((C1 && isa<InsertElementInst>(V2)) ||
2511 (C2 && isa<InsertElementInst>(V1)))
2512 return LookAheadHeuristics::ScoreConstants;
2513
2514 // Extracts from consecutive indexes of the same vector better score as
2515 // the extracts could be optimized away.
2516 Value *EV1;
2517 ConstantInt *Ex1Idx;
2518 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2519 // Undefs are always profitable for extractelements.
2520 // Compiler can easily combine poison and extractelement <non-poison> or
2521 // undef and extractelement <poison>. But combining undef +
2522 // extractelement <non-poison-but-may-produce-poison> requires some
2523 // extra operations.
2524 if (isa<UndefValue>(V2))
2525 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2526 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2527 : LookAheadHeuristics::ScoreSameOpcode;
2528 Value *EV2 = nullptr;
2529 ConstantInt *Ex2Idx = nullptr;
2530 if (match(V2,
2531 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
2532 m_Undef())))) {
2533 // Undefs are always profitable for extractelements.
2534 if (!Ex2Idx)
2535 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2536 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2537 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2538 if (EV2 == EV1) {
2539 int Idx1 = Ex1Idx->getZExtValue();
2540 int Idx2 = Ex2Idx->getZExtValue();
2541 int Dist = Idx2 - Idx1;
2542 // The distance is too large - still may be profitable to use
2543 // shuffles.
2544 if (std::abs(Dist) == 0)
2545 return LookAheadHeuristics::ScoreSplat;
2546 if (std::abs(Dist) > NumLanes / 2)
2547 return LookAheadHeuristics::ScoreSameOpcode;
2548 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2549 : LookAheadHeuristics::ScoreReversedExtracts;
2550 }
2551 return LookAheadHeuristics::ScoreAltOpcodes;
2552 }
2553 return CheckSameEntryOrFail();
2554 }
2555
2556 auto *I1 = dyn_cast<Instruction>(V1);
2557 auto *I2 = dyn_cast<Instruction>(V2);
2558 if (I1 && I2) {
2559 if (I1->getParent() != I2->getParent())
2560 return CheckSameEntryOrFail();
2561 SmallVector<Value *, 4> Ops(MainAltOps);
2562 Ops.push_back(I1);
2563 Ops.push_back(I2);
2564 InstructionsState S = getSameOpcode(Ops, TLI);
2565 // Note: Only consider instructions with <= 2 operands to avoid
2566 // complexity explosion.
2567 if (S &&
2568 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2569 !S.isAltShuffle()) &&
2570 all_of(Ops, [&S](Value *V) {
2571 return isa<PoisonValue>(V) ||
2572 cast<Instruction>(V)->getNumOperands() ==
2573 S.getMainOp()->getNumOperands();
2574 }))
2575 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2576 : LookAheadHeuristics::ScoreSameOpcode;
2577 }
2578
2579 if (I1 && isa<PoisonValue>(V2))
2580 return LookAheadHeuristics::ScoreSameOpcode;
2581
2582 if (isa<UndefValue>(V2))
2583 return LookAheadHeuristics::ScoreUndef;
2584
2585 return CheckSameEntryOrFail();
2586 }
2587
2588 /// Go through the operands of \p LHS and \p RHS recursively until
2589 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2590 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2591 /// of \p U1 and \p U2), except at the beginning of the recursion where
2592 /// these are set to nullptr.
2593 ///
2594 /// For example:
2595 /// \verbatim
2596 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2597 /// \ / \ / \ / \ /
2598 /// + + + +
2599 /// G1 G2 G3 G4
2600 /// \endverbatim
2601 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2602 /// each level recursively, accumulating the score. It starts from matching
2603 /// the additions at level 0, then moves on to the loads (level 1). The
2604 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2605 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2606 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2607 /// Please note that the order of the operands does not matter, as we
2608 /// evaluate the score of all profitable combinations of operands. In
2609 /// other words the score of G1 and G4 is the same as G1 and G2. This
2610 /// heuristic is based on ideas described in:
2611 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2612 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2613 /// Luís F. W. Góes
2615 Instruction *U2, int CurrLevel,
2616 ArrayRef<Value *> MainAltOps) const {
2617
2618 // Get the shallow score of V1 and V2.
2619 int ShallowScoreAtThisLevel =
2620 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2621
2622 // If reached MaxLevel,
2623 // or if V1 and V2 are not instructions,
2624 // or if they are SPLAT,
2625 // or if they are not consecutive,
2626 // or if profitable to vectorize loads or extractelements, early return
2627 // the current cost.
2628 auto *I1 = dyn_cast<Instruction>(LHS);
2629 auto *I2 = dyn_cast<Instruction>(RHS);
2630 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2631 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2632 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2633 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2634 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
2635 ShallowScoreAtThisLevel))
2636 return ShallowScoreAtThisLevel;
2637 assert(I1 && I2 && "Should have early exited.");
2638
2639 // Contains the I2 operand indexes that got matched with I1 operands.
2640 SmallSet<unsigned, 4> Op2Used;
2641
2642 // Recursion towards the operands of I1 and I2. We are trying all possible
2643 // operand pairs, and keeping track of the best score.
2644 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2645 OpIdx1 != NumOperands1; ++OpIdx1) {
2646 // Try to pair op1I with the best operand of I2.
2647 int MaxTmpScore = 0;
2648 unsigned MaxOpIdx2 = 0;
2649 bool FoundBest = false;
2650 // If I2 is commutative try all combinations.
2651 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2652 unsigned ToIdx = isCommutative(I2)
2653 ? I2->getNumOperands()
2654 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2655 assert(FromIdx <= ToIdx && "Bad index");
2656 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2657 // Skip operands already paired with OpIdx1.
2658 if (Op2Used.count(OpIdx2))
2659 continue;
2660 // Recursively calculate the cost at each level
2661 int TmpScore =
2662 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2663 I1, I2, CurrLevel + 1, {});
2664 // Look for the best score.
2665 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2666 TmpScore > MaxTmpScore) {
2667 MaxTmpScore = TmpScore;
2668 MaxOpIdx2 = OpIdx2;
2669 FoundBest = true;
2670 }
2671 }
2672 if (FoundBest) {
2673 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2674 Op2Used.insert(MaxOpIdx2);
2675 ShallowScoreAtThisLevel += MaxTmpScore;
2676 }
2677 }
2678 return ShallowScoreAtThisLevel;
2679 }
2680 };
2681 /// A helper data structure to hold the operands of a vector of instructions.
2682 /// This supports a fixed vector length for all operand vectors.
2684 /// For each operand we need (i) the value, and (ii) the opcode that it
2685 /// would be attached to if the expression was in a left-linearized form.
2686 /// This is required to avoid illegal operand reordering.
2687 /// For example:
2688 /// \verbatim
2689 /// 0 Op1
2690 /// |/
2691 /// Op1 Op2 Linearized + Op2
2692 /// \ / ----------> |/
2693 /// - -
2694 ///
2695 /// Op1 - Op2 (0 + Op1) - Op2
2696 /// \endverbatim
2697 ///
2698 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2699 ///
2700 /// Another way to think of this is to track all the operations across the
2701 /// path from the operand all the way to the root of the tree and to
2702 /// calculate the operation that corresponds to this path. For example, the
2703 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2704 /// corresponding operation is a '-' (which matches the one in the
2705 /// linearized tree, as shown above).
2706 ///
2707 /// For lack of a better term, we refer to this operation as Accumulated
2708 /// Path Operation (APO).
2709 struct OperandData {
2710 OperandData() = default;
2711 OperandData(Value *V, bool APO, bool IsUsed)
2712 : V(V), APO(APO), IsUsed(IsUsed) {}
2713 /// The operand value.
2714 Value *V = nullptr;
2715 /// TreeEntries only allow a single opcode, or an alternate sequence of
2716 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2717 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2718 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2719 /// (e.g., Add/Mul)
2720 bool APO = false;
2721 /// Helper data for the reordering function.
2722 bool IsUsed = false;
2723 };
2724
2725 /// During operand reordering, we are trying to select the operand at lane
2726 /// that matches best with the operand at the neighboring lane. Our
2727 /// selection is based on the type of value we are looking for. For example,
2728 /// if the neighboring lane has a load, we need to look for a load that is
2729 /// accessing a consecutive address. These strategies are summarized in the
2730 /// 'ReorderingMode' enumerator.
2731 enum class ReorderingMode {
2732 Load, ///< Matching loads to consecutive memory addresses
2733 Opcode, ///< Matching instructions based on opcode (same or alternate)
2734 Constant, ///< Matching constants
2735 Splat, ///< Matching the same instruction multiple times (broadcast)
2736 Failed, ///< We failed to create a vectorizable group
2737 };
2738
2739 using OperandDataVec = SmallVector<OperandData, 2>;
2740
2741 /// A vector of operand vectors.
2743 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2744 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2745 unsigned ArgSize = 0;
2746
2747 const TargetLibraryInfo &TLI;
2748 const DataLayout &DL;
2749 ScalarEvolution &SE;
2750 const BoUpSLP &R;
2751 const Loop *L = nullptr;
2752
2753 /// \returns the operand data at \p OpIdx and \p Lane.
2754 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2755 return OpsVec[OpIdx][Lane];
2756 }
2757
2758 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2759 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2760 return OpsVec[OpIdx][Lane];
2761 }
2762
2763 /// Clears the used flag for all entries.
2764 void clearUsed() {
2765 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2766 OpIdx != NumOperands; ++OpIdx)
2767 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2768 ++Lane)
2769 OpsVec[OpIdx][Lane].IsUsed = false;
2770 }
2771
2772 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2773 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2774 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2775 }
2776
2777 /// \param Lane lane of the operands under analysis.
2778 /// \param OpIdx operand index in \p Lane lane we're looking the best
2779 /// candidate for.
2780 /// \param Idx operand index of the current candidate value.
2781 /// \returns The additional score due to possible broadcasting of the
2782 /// elements in the lane. It is more profitable to have power-of-2 unique
2783 /// elements in the lane, it will be vectorized with higher probability
2784 /// after removing duplicates. Currently the SLP vectorizer supports only
2785 /// vectorization of the power-of-2 number of unique scalars.
2786 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2787 const SmallBitVector &UsedLanes) const {
2788 Value *IdxLaneV = getData(Idx, Lane).V;
2789 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2790 isa<ExtractElementInst>(IdxLaneV))
2791 return 0;
2792 SmallDenseMap<Value *, unsigned, 4> Uniques;
2793 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2794 if (Ln == Lane)
2795 continue;
2796 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2797 if (!isa<Instruction>(OpIdxLnV))
2798 return 0;
2799 Uniques.try_emplace(OpIdxLnV, Ln);
2800 }
2801 unsigned UniquesCount = Uniques.size();
2802 auto IdxIt = Uniques.find(IdxLaneV);
2803 unsigned UniquesCntWithIdxLaneV =
2804 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2805 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2806 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2807 unsigned UniquesCntWithOpIdxLaneV =
2808 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2809 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2810 return 0;
2811 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2812 UniquesCntWithOpIdxLaneV,
2813 UniquesCntWithOpIdxLaneV -
2814 bit_floor(UniquesCntWithOpIdxLaneV)) -
2815 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2816 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2817 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2818 }
2819
2820 /// \param Lane lane of the operands under analysis.
2821 /// \param OpIdx operand index in \p Lane lane we're looking the best
2822 /// candidate for.
2823 /// \param Idx operand index of the current candidate value.
2824 /// \returns The additional score for the scalar which users are all
2825 /// vectorized.
2826 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2827 Value *IdxLaneV = getData(Idx, Lane).V;
2828 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2829 // Do not care about number of uses for vector-like instructions
2830 // (extractelement/extractvalue with constant indices), they are extracts
2831 // themselves and already externally used. Vectorization of such
2832 // instructions does not add extra extractelement instruction, just may
2833 // remove it.
2834 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2835 isVectorLikeInstWithConstOps(OpIdxLaneV))
2836 return LookAheadHeuristics::ScoreAllUserVectorized;
2837 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2838 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2839 return 0;
2840 return R.areAllUsersVectorized(IdxLaneI)
2841 ? LookAheadHeuristics::ScoreAllUserVectorized
2842 : 0;
2843 }
2844
2845 /// Score scaling factor for fully compatible instructions but with
2846 /// different number of external uses. Allows better selection of the
2847 /// instructions with less external uses.
2848 static const int ScoreScaleFactor = 10;
2849
2850 /// \Returns the look-ahead score, which tells us how much the sub-trees
2851 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2852 /// score. This helps break ties in an informed way when we cannot decide on
2853 /// the order of the operands by just considering the immediate
2854 /// predecessors.
2855 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2856 int Lane, unsigned OpIdx, unsigned Idx,
2857 bool &IsUsed, const SmallBitVector &UsedLanes) {
2858 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2859 LookAheadMaxDepth);
2860 // Keep track of the instruction stack as we recurse into the operands
2861 // during the look-ahead score exploration.
2862 int Score =
2863 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2864 /*CurrLevel=*/1, MainAltOps);
2865 if (Score) {
2866 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2867 if (Score <= -SplatScore) {
2868 // Failed score.
2869 Score = 0;
2870 } else {
2871 Score += SplatScore;
2872 // Scale score to see the difference between different operands
2873 // and similar operands but all vectorized/not all vectorized
2874 // uses. It does not affect actual selection of the best
2875 // compatible operand in general, just allows to select the
2876 // operand with all vectorized uses.
2877 Score *= ScoreScaleFactor;
2878 Score += getExternalUseScore(Lane, OpIdx, Idx);
2879 IsUsed = true;
2880 }
2881 }
2882 return Score;
2883 }
2884
2885 /// Best defined scores per lanes between the passes. Used to choose the
2886 /// best operand (with the highest score) between the passes.
2887 /// The key - {Operand Index, Lane}.
2888 /// The value - the best score between the passes for the lane and the
2889 /// operand.
2890 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2891 BestScoresPerLanes;
2892
2893 // Search all operands in Ops[*][Lane] for the one that matches best
2894 // Ops[OpIdx][LastLane] and return its operand index.
2895 // If no good match can be found, return std::nullopt.
2896 std::optional<unsigned>
2897 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2898 ArrayRef<ReorderingMode> ReorderingModes,
2899 ArrayRef<Value *> MainAltOps,
2900 const SmallBitVector &UsedLanes) {
2901 unsigned NumOperands = getNumOperands();
2902
2903 // The operand of the previous lane at OpIdx.
2904 Value *OpLastLane = getData(OpIdx, LastLane).V;
2905
2906 // Our strategy mode for OpIdx.
2907 ReorderingMode RMode = ReorderingModes[OpIdx];
2908 if (RMode == ReorderingMode::Failed)
2909 return std::nullopt;
2910
2911 // The linearized opcode of the operand at OpIdx, Lane.
2912 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2913
2914 // The best operand index and its score.
2915 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2916 // are using the score to differentiate between the two.
2917 struct BestOpData {
2918 std::optional<unsigned> Idx;
2919 unsigned Score = 0;
2920 } BestOp;
2921 BestOp.Score =
2922 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2923 .first->second;
2924
2925 // Track if the operand must be marked as used. If the operand is set to
2926 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
2927 // want to reestimate the operands again on the following iterations).
2928 bool IsUsed = RMode == ReorderingMode::Splat ||
2929 RMode == ReorderingMode::Constant ||
2930 RMode == ReorderingMode::Load;
2931 // Iterate through all unused operands and look for the best.
2932 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2933 // Get the operand at Idx and Lane.
2934 OperandData &OpData = getData(Idx, Lane);
2935 Value *Op = OpData.V;
2936 bool OpAPO = OpData.APO;
2937
2938 // Skip already selected operands.
2939 if (OpData.IsUsed)
2940 continue;
2941
2942 // Skip if we are trying to move the operand to a position with a
2943 // different opcode in the linearized tree form. This would break the
2944 // semantics.
2945 if (OpAPO != OpIdxAPO)
2946 continue;
2947
2948 // Look for an operand that matches the current mode.
2949 switch (RMode) {
2950 case ReorderingMode::Load:
2951 case ReorderingMode::Opcode: {
2952 bool LeftToRight = Lane > LastLane;
2953 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2954 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2955 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2956 OpIdx, Idx, IsUsed, UsedLanes);
2957 if (Score > static_cast<int>(BestOp.Score) ||
2958 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2959 Idx == OpIdx)) {
2960 BestOp.Idx = Idx;
2961 BestOp.Score = Score;
2962 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2963 }
2964 break;
2965 }
2966 case ReorderingMode::Constant:
2967 if (isa<Constant>(Op) ||
2968 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2969 BestOp.Idx = Idx;
2970 if (isa<Constant>(Op)) {
2971 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2972 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2973 LookAheadHeuristics::ScoreConstants;
2974 }
2975 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2976 IsUsed = false;
2977 }
2978 break;
2979 case ReorderingMode::Splat:
2980 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2981 IsUsed = Op == OpLastLane;
2982 if (Op == OpLastLane) {
2983 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2984 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2985 LookAheadHeuristics::ScoreSplat;
2986 }
2987 BestOp.Idx = Idx;
2988 }
2989 break;
2990 case ReorderingMode::Failed:
2991 llvm_unreachable("Not expected Failed reordering mode.");
2992 }
2993 }
2994
2995 if (BestOp.Idx) {
2996 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2997 return BestOp.Idx;
2998 }
2999 // If we could not find a good match return std::nullopt.
3000 return std::nullopt;
3001 }
3002
3003 /// Helper for reorderOperandVecs.
3004 /// \returns the lane that we should start reordering from. This is the one
3005 /// which has the least number of operands that can freely move about or
3006 /// less profitable because it already has the most optimal set of operands.
3007 unsigned getBestLaneToStartReordering() const {
3008 unsigned Min = UINT_MAX;
3009 unsigned SameOpNumber = 0;
3010 // std::pair<unsigned, unsigned> is used to implement a simple voting
3011 // algorithm and choose the lane with the least number of operands that
3012 // can freely move about or less profitable because it already has the
3013 // most optimal set of operands. The first unsigned is a counter for
3014 // voting, the second unsigned is the counter of lanes with instructions
3015 // with same/alternate opcodes and same parent basic block.
3016 SmallDenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
3017 // Try to be closer to the original results, if we have multiple lanes
3018 // with same cost. If 2 lanes have the same cost, use the one with the
3019 // highest index.
3020 for (int I = getNumLanes(); I > 0; --I) {
3021 unsigned Lane = I - 1;
3022 OperandsOrderData NumFreeOpsHash =
3023 getMaxNumOperandsThatCanBeReordered(Lane);
3024 // Compare the number of operands that can move and choose the one with
3025 // the least number.
3026 if (NumFreeOpsHash.NumOfAPOs < Min) {
3027 Min = NumFreeOpsHash.NumOfAPOs;
3028 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3029 HashMap.clear();
3030 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3031 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3032 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3033 // Select the most optimal lane in terms of number of operands that
3034 // should be moved around.
3035 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3036 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3037 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3038 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3039 auto [It, Inserted] =
3040 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3041 if (!Inserted)
3042 ++It->second.first;
3043 }
3044 }
3045 // Select the lane with the minimum counter.
3046 unsigned BestLane = 0;
3047 unsigned CntMin = UINT_MAX;
3048 for (const auto &Data : reverse(HashMap)) {
3049 if (Data.second.first < CntMin) {
3050 CntMin = Data.second.first;
3051 BestLane = Data.second.second;
3052 }
3053 }
3054 return BestLane;
3055 }
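// Worked example (hypothetical numbers, for illustration only): with four
// lanes whose NumOfAPOs come out as {2, 2, 1, 2} and equal
// NumOpsWithSameOpcodeParent, lane 2 alone reaches the minimum, so reordering
// starts there. If every lane reported the same numbers, the per-hash
// counters collected in HashMap act as votes and the lane recorded for the
// hash with the smallest count is returned, which keeps the result close to
// the original operand order.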
3056
3057 /// Data structure that helps to reorder operands.
3058 struct OperandsOrderData {
3059 /// The best number of operands with the same APOs, which can be
3060 /// reordered.
3061 unsigned NumOfAPOs = UINT_MAX;
3062 /// Number of operands with the same/alternate instruction opcode and
3063 /// parent.
3064 unsigned NumOpsWithSameOpcodeParent = 0;
3065 /// Hash for the actual operands ordering.
3066 /// Used to count operands, actually their position id and opcode
3067 /// value. It is used in the voting mechanism to find the lane with the
3068 /// least number of operands that can freely move about or less profitable
3069 /// because it already has the most optimal set of operands. Can be
3070 /// replaced with SmallVector<unsigned> instead but hash code is faster
3071 /// and requires less memory.
3072 unsigned Hash = 0;
3073 };
3074 /// \returns the maximum number of operands that are allowed to be reordered
3075 /// for \p Lane and the number of compatible instructions(with the same
3076 /// parent/opcode). This is used as a heuristic for selecting the first lane
3077 /// to start operand reordering.
3078 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3079 unsigned CntTrue = 0;
3080 unsigned NumOperands = getNumOperands();
3081 // Operands with the same APO can be reordered. We therefore need to count
3082 // how many of them we have for each APO, like this: Cnt[APO] = x.
3083 // Since we only have two APOs, namely true and false, we can avoid using
3084 // a map. Instead we can simply count the number of operands that
3085 // correspond to one of them (in this case the 'true' APO), and calculate
3086 // the other by subtracting it from the total number of operands.
3087 // Operands with the same instruction opcode and parent are more
3088 // profitable since we don't need to move them in many cases, with a high
3089 // probability such lane already can be vectorized effectively.
3090 bool AllUndefs = true;
3091 unsigned NumOpsWithSameOpcodeParent = 0;
3092 Instruction *OpcodeI = nullptr;
3093 BasicBlock *Parent = nullptr;
3094 unsigned Hash = 0;
3095 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3096 const OperandData &OpData = getData(OpIdx, Lane);
3097 if (OpData.APO)
3098 ++CntTrue;
3099 // Use Boyer-Moore majority voting for finding the majority opcode and
3100 // the number of times it occurs.
3101 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3102 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3103 I->getParent() != Parent) {
3104 if (NumOpsWithSameOpcodeParent == 0) {
3105 NumOpsWithSameOpcodeParent = 1;
3106 OpcodeI = I;
3107 Parent = I->getParent();
3108 } else {
3109 --NumOpsWithSameOpcodeParent;
3110 }
3111 } else {
3112 ++NumOpsWithSameOpcodeParent;
3113 }
3114 }
3115 Hash = hash_combine(
3116 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3117 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3118 }
3119 if (AllUndefs)
3120 return {};
3121 OperandsOrderData Data;
3122 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3123 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3124 Data.Hash = Hash;
3125 return Data;
3126 }
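// For illustration, a minimal standalone sketch of the Boyer-Moore majority
// vote used above (the container `OperandOpcodes` is a placeholder, not part
// of this pass):
//
//   unsigned Candidate = 0, Votes = 0;
//   for (unsigned Opc : OperandOpcodes) {
//     if (Votes == 0) {
//       Candidate = Opc; // adopt a new candidate opcode
//       Votes = 1;
//     } else if (Opc == Candidate) {
//       ++Votes;         // same opcode strengthens the candidate
//     } else {
//       --Votes;         // a different opcode cancels one vote
//     }
//   }
//   // If some opcode occurs in more than half of the operands, Candidate
//   // now holds it; otherwise it is just an arbitrary survivor.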
3127
3128 /// Go through the instructions in VL and append their operands.
3129 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3130 const InstructionsState &S) {
3131 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3132 assert((empty() || all_of(Operands,
3133 [this](const ValueList &VL) {
3134 return VL.size() == getNumLanes();
3135 })) &&
3136 "Expected same number of lanes");
3137 assert(S.valid() && "InstructionsState is invalid.");
3138 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3139 // arguments to the intrinsic produces the same result.
3140 Instruction *MainOp = S.getMainOp();
3141 unsigned NumOperands = MainOp->getNumOperands();
3142 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
3143 OpsVec.resize(ArgSize);
3144 unsigned NumLanes = VL.size();
3145 for (OperandDataVec &Ops : OpsVec)
3146 Ops.resize(NumLanes);
3147 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3148 // Our tree has just 3 nodes: the root and two operands.
3149 // It is therefore trivial to get the APO. We only need to check the
3150 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3151 // operand. The LHS operand of both add and sub is never attached to an
3152 // inverse operation in the linearized form, therefore its APO is
3153 // false. The RHS is true only if V is an inverse operation.
3154
3155 // Since operand reordering is performed on groups of commutative
3156 // operations or alternating sequences (e.g., +, -), we can safely tell
3157 // the inverse operations by checking commutativity.
3158 auto *I = dyn_cast<Instruction>(VL[Lane]);
3159 if (!I && isa<PoisonValue>(VL[Lane])) {
3160 for (unsigned OpIdx : seq<unsigned>(NumOperands))
3161 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3162 continue;
3163 }
3164 bool IsInverseOperation = false;
3165 if (S.isCopyableElement(VL[Lane])) {
3166 // The value is a copyable element.
3167 IsInverseOperation = !isCommutative(MainOp, VL[Lane]);
3168 } else {
3169 assert(I && "Expected instruction");
3170 auto [SelectedOp, Ops] = convertTo(I, S);
3171 // We cannot check commutativity by the converted instruction
3172 // (SelectedOp) because isCommutative also examines def-use
3173 // relationships.
3174 IsInverseOperation = !isCommutative(SelectedOp, I);
3175 }
3176 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3177 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3178 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3179 }
3180 }
3181 }
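// Worked example (illustrative): for the lanes {a0 + b0, a1 - b1, a2 + b2,
// a3 - b3} every LHS operand gets APO == false, the RHS operands of the adds
// get APO == false, and the RHS operands of the subs get APO == true, because
// only the RHS of a subtraction is attached to an inverse operation in the
// linearized (+, -) form. Later reordering may only exchange operands within
// the same APO class.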
3182
3183 /// \returns the number of operands.
3184 unsigned getNumOperands() const { return ArgSize; }
3185
3186 /// \returns the number of lanes.
3187 unsigned getNumLanes() const { return OpsVec[0].size(); }
3188
3189 /// \returns the operand value at \p OpIdx and \p Lane.
3190 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3191 return getData(OpIdx, Lane).V;
3192 }
3193
3194 /// \returns true if the data structure is empty.
3195 bool empty() const { return OpsVec.empty(); }
3196
3197 /// Clears the data.
3198 void clear() { OpsVec.clear(); }
3199
3200 /// \returns true if there are enough operands identical to \p Op to fill
3201 /// the whole vector (possibly mixed with constants or loop-invariant values).
3202 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
3203 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3204 assert(Op == getValue(OpIdx, Lane) &&
3205 "Op is expected to be getValue(OpIdx, Lane).");
3206 // Small number of loads - try load matching.
3207 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3208 return false;
3209 bool OpAPO = getData(OpIdx, Lane).APO;
3210 bool IsInvariant = L && L->isLoopInvariant(Op);
3211 unsigned Cnt = 0;
3212 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3213 if (Ln == Lane)
3214 continue;
3215 // This is set to true if we found a candidate for broadcast at Lane.
3216 bool FoundCandidate = false;
3217 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3218 OperandData &Data = getData(OpI, Ln);
3219 if (Data.APO != OpAPO || Data.IsUsed)
3220 continue;
3221 Value *OpILane = getValue(OpI, Lane);
3222 bool IsConstantOp = isa<Constant>(OpILane);
3223 // Consider the broadcast candidate if:
3224 // 1. Same value is found in one of the operands.
3225 if (Data.V == Op ||
3226 // 2. The operand in the given lane is not constant but there is a
3227 // constant operand in another lane (which can be moved to the
3228 // given lane). In this case we can represent it as a simple
3229 // permutation of constant and broadcast.
3230 (!IsConstantOp &&
3231 ((Lns > 2 && isa<Constant>(Data.V)) ||
3232 // 2.1. If we have only 2 lanes, need to check that value in the
3233 // next lane does not build same opcode sequence.
3234 (Lns == 2 &&
3235 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3236 isa<Constant>(Data.V)))) ||
3237 // 3. The operand in the current lane is loop invariant (can be
3238 // hoisted out) and another operand is also a loop invariant
3239 // (though not a constant). In this case the whole vector can be
3240 // hoisted out.
3241 // FIXME: need to teach the cost model about this case for better
3242 // estimation.
3243 (IsInvariant && !isa<Constant>(Data.V) &&
3244 !getSameOpcode({Op, Data.V}, TLI) &&
3245 L->isLoopInvariant(Data.V))) {
3246 FoundCandidate = true;
3247 Data.IsUsed = Data.V == Op;
3248 if (Data.V == Op)
3249 ++Cnt;
3250 break;
3251 }
3252 }
3253 if (!FoundCandidate)
3254 return false;
3255 }
3256 return getNumLanes() == 2 || Cnt > 1;
3257 }
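// Worked example (illustrative): for four lanes whose first operands are
// {x, x, x, c3} (c3 a constant) and whose second operands are
// {c0, c1, c2, x}, shouldBroadcast(x, /*OpIdx=*/0, /*Lane=*/0) returns true:
// x (or a movable constant) is found in every other lane, and x itself is
// matched in more than one of them, so the whole operand vector can be built
// as a broadcast of x blended with constants.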
3258
3259 /// Checks if there is at least one operand in a lane other than \p Lane
3260 /// that is compatible with the operand \p Op.
3261 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3262 assert(Op == getValue(OpIdx, Lane) &&
3263 "Op is expected to be getValue(OpIdx, Lane).");
3264 bool OpAPO = getData(OpIdx, Lane).APO;
3265 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3266 if (Ln == Lane)
3267 continue;
3268 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3269 const OperandData &Data = getData(OpI, Ln);
3270 if (Data.APO != OpAPO || Data.IsUsed)
3271 return true;
3272 Value *OpILn = getValue(OpI, Ln);
3273 return (L && L->isLoopInvariant(OpILn)) ||
3274 (getSameOpcode({Op, OpILn}, TLI) &&
3275 allSameBlock({Op, OpILn}));
3276 }))
3277 return true;
3278 }
3279 return false;
3280 }
3281
3282 public:
3283 /// Initialize with all the operands of the instruction vector \p RootVL.
3284 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3285 const InstructionsState &S, const BoUpSLP &R)
3286 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3287 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3288 // Append all the operands of RootVL.
3289 appendOperands(RootVL, Operands, S);
3290 }
3291
3292 /// \returns a value vector with the operands across all lanes for the
3293 /// operand at \p OpIdx.
3294 ValueList getVL(unsigned OpIdx) const {
3295 ValueList OpVL(OpsVec[OpIdx].size());
3296 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3297 "Expected same num of lanes across all operands");
3298 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3299 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3300 return OpVL;
3301 }
3302
3303 // Performs operand reordering for 2 or more operands.
3304 // The original operands are in OpsVec[OpIdx][Lane]; the reordered
3305 // operands are written back into the same structure in place.
3306 void reorder() {
3307 unsigned NumOperands = getNumOperands();
3308 unsigned NumLanes = getNumLanes();
3309 // Each operand has its own mode. We are using this mode to help us select
3310 // the instructions for each lane, so that they match best with the ones
3311 // we have selected so far.
3312 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3313
3314 // This is a greedy single-pass algorithm. We are going over each lane
3315 // once and deciding on the best order right away with no back-tracking.
3316 // However, in order to increase its effectiveness, we start with the lane
3317 // that has operands that can move the least. For example, given the
3318 // following lanes:
3319 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3320 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3321 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3322 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3323 // we will start at Lane 1, since the operands of the subtraction cannot
3324 // be reordered. Then we will visit the rest of the lanes in a circular
3325 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3326
3327 // Find the first lane that we will start our search from.
3328 unsigned FirstLane = getBestLaneToStartReordering();
3329
3330 // Initialize the modes.
3331 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3332 Value *OpLane0 = getValue(OpIdx, FirstLane);
3333 // Keep track if we have instructions with all the same opcode on one
3334 // side.
3335 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3336 // Check if OpLane0 should be broadcast.
3337 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3338 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3339 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3340 else if (isa<LoadInst>(OpILane0))
3341 ReorderingModes[OpIdx] = ReorderingMode::Load;
3342 else
3343 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3344 } else if (isa<Constant>(OpLane0)) {
3345 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3346 } else if (isa<Argument>(OpLane0)) {
3347 // Our best hope is a Splat. It may save some cost in some cases.
3348 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3349 } else {
3350 llvm_unreachable("Unexpected value kind.");
3351 }
3352 }
3353
3354 // Check that we don't have the same operands. No need to reorder if the
3355 // operands are just a perfect or shuffled diamond match. Do not skip the
3356 // reordering for possible broadcasts or a non-power-of-2 number of scalars
3357 // (just for now).
3358 auto &&SkipReordering = [this]() {
3359 SmallPtrSet<Value *, 4> UniqueValues;
3360 ArrayRef<OperandData> Op0 = OpsVec.front();
3361 for (const OperandData &Data : Op0)
3362 UniqueValues.insert(Data.V);
3363 for (ArrayRef<OperandData> Op :
3364 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3365 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3366 return !UniqueValues.contains(Data.V);
3367 }))
3368 return false;
3369 }
3370 // TODO: Check if we can remove a check for non-power-2 number of
3371 // scalars after full support of non-power-2 vectorization.
3372 return UniqueValues.size() != 2 &&
3373 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
3374 UniqueValues.size());
3375 };
3376
3377 // If the initial strategy fails for any of the operand indexes, then we
3378 // perform reordering again in a second pass. This helps avoid assigning
3379 // high priority to the failed strategy, and should improve reordering for
3380 // the non-failed operand indexes.
3381 for (int Pass = 0; Pass != 2; ++Pass) {
3382 // Check if there is no need to reorder operands since they are a perfect or
3383 // shuffled diamond match.
3384 // Need to do it to avoid extra external use cost counting for
3385 // shuffled matches, which may cause regressions.
3386 if (SkipReordering())
3387 break;
3388 // Skip the second pass if the first pass did not fail.
3389 bool StrategyFailed = false;
3390 // Mark all operand data as free to use.
3391 clearUsed();
3392 // We keep the original operand order for the FirstLane, so reorder the
3393 // rest of the lanes. We are visiting the nodes in a circular fashion,
3394 // using FirstLane as the center point and increasing the radius
3395 // distance.
3396 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3397 for (unsigned I = 0; I < NumOperands; ++I)
3398 MainAltOps[I].push_back(getData(I, FirstLane).V);
3399
3400 SmallBitVector UsedLanes(NumLanes);
3401 UsedLanes.set(FirstLane);
3402 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3403 // Visit the lane on the right and then the lane on the left.
3404 for (int Direction : {+1, -1}) {
3405 int Lane = FirstLane + Direction * Distance;
3406 if (Lane < 0 || Lane >= (int)NumLanes)
3407 continue;
3408 UsedLanes.set(Lane);
3409 int LastLane = Lane - Direction;
3410 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3411 "Out of bounds");
3412 // Look for a good match for each operand.
3413 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3414 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3415 std::optional<unsigned> BestIdx =
3416 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3417 MainAltOps[OpIdx], UsedLanes);
3418 // By not selecting a value, we allow the operands that follow to
3419 // select a better matching value. We will get a non-null value in
3420 // the next run of getBestOperand().
3421 if (BestIdx) {
3422 // Swap the current operand with the one returned by
3423 // getBestOperand().
3424 swap(OpIdx, *BestIdx, Lane);
3425 } else {
3426 // Enable the second pass.
3427 StrategyFailed = true;
3428 }
3429 // Try to get the alternate opcode and follow it during analysis.
3430 if (MainAltOps[OpIdx].size() != 2) {
3431 OperandData &AltOp = getData(OpIdx, Lane);
3432 InstructionsState OpS =
3433 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3434 if (OpS && OpS.isAltShuffle())
3435 MainAltOps[OpIdx].push_back(AltOp.V);
3436 }
3437 }
3438 }
3439 }
3440 // Skip second pass if the strategy did not fail.
3441 if (!StrategyFailed)
3442 break;
3443 }
3444 }
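// For illustration, a minimal standalone sketch of the circular visiting
// order used above (FirstLane and NumLanes stand for the values computed in
// reorder()):
//
//   SmallVector<unsigned> Order;
//   Order.push_back(FirstLane);
//   for (unsigned Distance = 1; Distance != NumLanes; ++Distance)
//     for (int Direction : {+1, -1}) {
//       int Lane = FirstLane + Direction * Distance;
//       if (Lane >= 0 && Lane < (int)NumLanes)
//         Order.push_back(Lane);
//     }
//   // E.g. FirstLane == 1 and NumLanes == 4 give Order == {1, 2, 0, 3},
//   // matching the example in the comment at the top of reorder().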
3445
3446#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3447 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3448 switch (RMode) {
3449 case ReorderingMode::Load:
3450 return "Load";
3451 case ReorderingMode::Opcode:
3452 return "Opcode";
3453 case ReorderingMode::Constant:
3454 return "Constant";
3455 case ReorderingMode::Splat:
3456 return "Splat";
3457 case ReorderingMode::Failed:
3458 return "Failed";
3459 }
3460 llvm_unreachable("Unimplemented Reordering Type");
3461 }
3462
3463 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3464 raw_ostream &OS) {
3465 return OS << getModeStr(RMode);
3466 }
3467
3468 /// Debug print.
3469 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3470 printMode(RMode, dbgs());
3471 }
3472
3473 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3474 return printMode(RMode, OS);
3475 }
3476
3477 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3478 const unsigned Indent = 2;
3479 unsigned Cnt = 0;
3480 for (const OperandDataVec &OpDataVec : OpsVec) {
3481 OS << "Operand " << Cnt++ << "\n";
3482 for (const OperandData &OpData : OpDataVec) {
3483 OS.indent(Indent) << "{";
3484 if (Value *V = OpData.V)
3485 OS << *V;
3486 else
3487 OS << "null";
3488 OS << ", APO:" << OpData.APO << "}\n";
3489 }
3490 OS << "\n";
3491 }
3492 return OS;
3493 }
3494
3495 /// Debug print.
3496 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3497#endif
3498 };
3499
3500 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
3501 /// of the pair with the highest score, which is deemed to have the best chance
3502 /// of forming the root of a profitable tree to vectorize. Return std::nullopt
3503 /// if no candidate scored above LookAheadHeuristics::ScoreFail.
3504 /// \param Limit Lower limit of the cost, considered to be a good enough score.
3505 std::optional<int>
3506 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3507 int Limit = LookAheadHeuristics::ScoreFail) const {
3508 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3509 RootLookAheadMaxDepth);
3510 int BestScore = Limit;
3511 std::optional<int> Index;
3512 for (int I : seq<int>(0, Candidates.size())) {
3513 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3514 Candidates[I].second,
3515 /*U1=*/nullptr, /*U2=*/nullptr,
3516 /*CurrLevel=*/1, {});
3517 if (Score > BestScore) {
3518 BestScore = Score;
3519 Index = I;
3520 }
3521 }
3522 return Index;
3523 }
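// For illustration, a hypothetical call site (R, S0 and S1 are placeholders,
// not taken from this file): given a BoUpSLP instance R and two stores S0 and
// S1, their stored values can be tested as a potential SLP root as follows.
//
//   SmallVector<std::pair<Value *, Value *>> Candidates;
//   Candidates.emplace_back(S0->getValueOperand(), S1->getValueOperand());
//   if (std::optional<int> Best = R.findBestRootPair(Candidates))
//     ; // Candidates[*Best] scored above ScoreFail and is worth building on.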
3524
3525 /// Checks if the instruction is marked for deletion.
3526 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3527
3528 /// Removes an instruction from its block and eventually deletes it.
3529 /// It's like Instruction::eraseFromParent() except that the actual deletion
3530 /// is delayed until BoUpSLP is destructed.
3531 void eraseInstruction(Instruction *I) {
3532 DeletedInstructions.insert(I);
3533 }
3534
3535 /// Remove instructions from the parent function and clear the operands of \p
3536 /// DeadVals instructions, marking for deletion trivially dead operands.
3537 template <typename T>
3538 void removeInstructionsAndOperands(
3539 ArrayRef<T *> DeadVals,
3540 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3541 SmallVector<WeakTrackingVH> DeadInsts;
3542 for (T *V : DeadVals) {
3543 auto *I = cast<Instruction>(V);
3544 DeletedInstructions.insert(I);
3545 }
3546 DenseSet<Value *> Processed;
3547 for (T *V : DeadVals) {
3548 if (!V || !Processed.insert(V).second)
3549 continue;
3550 auto *I = cast<Instruction>(V);
3551 salvageDebugInfo(*I);
3552 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3553 for (Use &U : I->operands()) {
3554 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3555 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3556 wouldInstructionBeTriviallyDead(OpI, TLI) &&
3557 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3558 return Entry->VectorizedValue == OpI;
3559 })))
3560 DeadInsts.push_back(OpI);
3561 }
3562 I->dropAllReferences();
3563 }
3564 for (T *V : DeadVals) {
3565 auto *I = cast<Instruction>(V);
3566 if (!I->getParent())
3567 continue;
3568 assert((I->use_empty() || all_of(I->uses(),
3569 [&](Use &U) {
3570 return isDeleted(
3571 cast<Instruction>(U.getUser()));
3572 })) &&
3573 "trying to erase instruction with users.");
3574 I->removeFromParent();
3575 SE->forgetValue(I);
3576 }
3577 // Process the dead instruction list until empty.
3578 while (!DeadInsts.empty()) {
3579 Value *V = DeadInsts.pop_back_val();
3580 Instruction *VI = cast_or_null<Instruction>(V);
3581 if (!VI || !VI->getParent())
3582 continue;
3583 assert(isInstructionTriviallyDead(VI, TLI) &&
3584 "Live instruction found in dead worklist!");
3585 assert(VI->use_empty() && "Instructions with uses are not dead.");
3586
3587 // Don't lose the debug info while deleting the instructions.
3588 salvageDebugInfo(*VI);
3589
3590 // Null out all of the instruction's operands to see if any operand
3591 // becomes dead as we go.
3592 for (Use &OpU : VI->operands()) {
3593 Value *OpV = OpU.get();
3594 if (!OpV)
3595 continue;
3596 OpU.set(nullptr);
3597
3598 if (!OpV->use_empty())
3599 continue;
3600
3601 // If the operand is an instruction that became dead as we nulled out
3602 // the operand, and if it is 'trivially' dead, delete it in a future
3603 // loop iteration.
3604 if (auto *OpI = dyn_cast<Instruction>(OpV))
3605 if (!DeletedInstructions.contains(OpI) &&
3606 (!OpI->getType()->isVectorTy() ||
3607 none_of(VectorValuesAndScales,
3608 [&](const std::tuple<Value *, unsigned, bool> &V) {
3609 return std::get<0>(V) == OpI;
3610 })) &&
3611 wouldInstructionBeTriviallyDead(OpI, TLI))
3612 DeadInsts.push_back(OpI);
3613 }
3614
3615 VI->removeFromParent();
3616 eraseInstruction(VI);
3617 SE->forgetValue(VI);
3618 }
3619 }
3620
3621 /// Checks if the instruction was already analyzed for being possible
3622 /// reduction root.
3623 bool isAnalyzedReductionRoot(Instruction *I) const {
3624 return AnalyzedReductionsRoots.count(I);
3625 }
3626 /// Register given instruction as already analyzed for being possible
3627 /// reduction root.
3628 void analyzedReductionRoot(Instruction *I) {
3629 AnalyzedReductionsRoots.insert(I);
3630 }
3631 /// Checks if the provided list of reduced values was checked already for
3632 /// vectorization.
3633 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3634 return AnalyzedReductionVals.contains(hash_value(VL));
3635 }
3636 /// Adds the list of reduced values to list of already checked values for the
3637 /// vectorization.
3638 void analyzedReductionVals(ArrayRef<Value *> VL) {
3639 AnalyzedReductionVals.insert(hash_value(VL));
3640 }
3641 /// Clear the list of the analyzed reduction root instructions.
3642 void clearReductionData() {
3643 AnalyzedReductionsRoots.clear();
3644 AnalyzedReductionVals.clear();
3645 AnalyzedMinBWVals.clear();
3646 }
3647 /// Checks if the given value is gathered in one of the nodes.
3648 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3649 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3650 }
3651 /// Checks if the given value is gathered in one of the nodes.
3652 bool isGathered(const Value *V) const {
3653 return MustGather.contains(V);
3654 }
3655 /// Checks if the specified value was not scheduled.
3656 bool isNotScheduled(const Value *V) const {
3657 return NonScheduledFirst.contains(V);
3658 }
3659
3660 /// Check if the value is vectorized in the tree.
3661 bool isVectorized(const Value *V) const {
3662 assert(V && "V cannot be nullptr.");
3663 return ScalarToTreeEntries.contains(V);
3664 }
3665
3666 ~BoUpSLP();
3667
3668private:
3669 /// Determine if a node \p E can be demoted to a smaller type with a
3670 /// truncation. We collect the entries that will be demoted in ToDemote.
3671 /// \param E Node for analysis
3672 /// \param ToDemote indices of the nodes to be demoted.
3673 bool collectValuesToDemote(
3674 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3675 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3676 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3677 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3678
3679 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3680 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3681 /// they have only one user and are reorderable).
3682 /// \param ReorderableGathers List of all gather nodes that require reordering
3683 /// (e.g., gather of extractelements or partially vectorizable loads).
3684 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3685 /// reordering, subset of \p NonVectorized.
3686 void buildReorderableOperands(
3687 TreeEntry *UserTE,
3688 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3689 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3690 SmallVectorImpl<TreeEntry *> &GatherOps);
3691
3692 /// Checks if the given \p TE is a gather node with clustered reused scalars
3693 /// and reorders it per given \p Mask.
3694 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3695
3696 /// Checks if all users of \p I are part of the vectorization tree.
3697 bool areAllUsersVectorized(
3698 Instruction *I,
3699 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3700
3701 /// Return information about the vector formed for the specified index
3702 /// of a vector of (the same) instruction.
3703 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3704
3705 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3706 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3707 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3708 return const_cast<TreeEntry *>(
3709 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3710 }
3711
3712 /// Gets the root instruction for the given node. If the node is a strided
3713 /// load/store node with the reverse order, the root instruction is the last
3714 /// one.
3715 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3716
3717 /// \returns Cast context for the given graph node.
3718 TargetTransformInfo::CastContextHint
3719 getCastContextHint(const TreeEntry &TE) const;
3720
3721 /// \returns the cost of the vectorizable entry.
3722 InstructionCost getEntryCost(const TreeEntry *E,
3723 ArrayRef<Value *> VectorizedVals,
3724 SmallPtrSetImpl<Value *> &CheckedExtracts);
3725
3726 /// Checks if it is legal and profitable to build SplitVectorize node for the
3727 /// given \p VL.
3728 /// \param Op1 first homogeneous scalars.
3729 /// \param Op2 second homogeneous scalars.
3730 /// \param ReorderIndices indices to reorder the scalars.
3731 /// \returns true if the node was successfully built.
3732 bool canBuildSplitNode(ArrayRef<Value *> VL,
3733 const InstructionsState &LocalState,
3734 SmallVectorImpl<Value *> &Op1,
3735 SmallVectorImpl<Value *> &Op2,
3736 OrdersType &ReorderIndices) const;
3737
3738 /// This is the recursive part of buildTree.
3739 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3740 unsigned InterleaveFactor = 0);
3741
3742 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3743 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3744 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3745 /// returns false, setting \p CurrentOrder to either an empty vector or a
3746 /// non-identity permutation that allows reusing extract instructions.
3747 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3748 /// extract order.
3749 bool canReuseExtract(ArrayRef<Value *> VL,
3750 SmallVectorImpl<unsigned> &CurrentOrder,
3751 bool ResizeAllowed = false) const;
3752
3753 /// Vectorize a single entry in the tree.
3754 Value *vectorizeTree(TreeEntry *E);
3755
3756 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3757 /// \p E.
3758 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3759
3760 /// Create a new vector from a list of scalar values. Produces a sequence
3761 /// which exploits values reused across lanes, and arranges the inserts
3762 /// for ease of later optimization.
3763 template <typename BVTy, typename ResTy, typename... Args>
3764 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3765
3766 /// Create a new vector from a list of scalar values. Produces a sequence
3767 /// which exploits values reused across lanes, and arranges the inserts
3768 /// for ease of later optimization.
3769 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3770
3771 /// Returns the instruction in the bundle, which can be used as a base point
3772 /// for scheduling. Usually it is the last instruction in the bundle, except
3773 /// for the case when all operands are external (in this case, it is the first
3774 /// instruction in the list).
3775 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3776
3777 /// Tries to find extractelement instructions with constant indices from fixed
3778 /// vector type and gather such instructions into a bunch, which highly likely
3779 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3780 /// was successful, the matched scalars are replaced by poison values in \p VL
3781 /// for future analysis.
3782 std::optional<TargetTransformInfo::ShuffleKind>
3783 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3784 SmallVectorImpl<int> &Mask) const;
3785
3786 /// Tries to find extractelement instructions with constant indices from fixed
3787 /// vector type and gather such instructions into a bunch, which highly likely
3788 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3789 /// was successful, the matched scalars are replaced by poison values in \p VL
3790 /// for future analysis.
3791 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3792 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3793 SmallVectorImpl<int> &Mask,
3794 unsigned NumParts) const;
3795
3796 /// Checks if the gathered \p VL can be represented as a single register
3797 /// shuffle(s) of previous tree entries.
3798 /// \param TE Tree entry checked for permutation.
3799 /// \param VL List of scalars (a subset of the TE scalar), checked for
3800 /// permutations. Must form single-register vector.
3801 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3802 /// commands to build the mask using the original vector value, without
3803 /// relying on the potential reordering.
3804 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3805 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3806 std::optional<TargetTransformInfo::ShuffleKind>
3807 isGatherShuffledSingleRegisterEntry(
3808 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3809 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3810 bool ForOrder);
3811
3812 /// Checks if the gathered \p VL can be represented as multi-register
3813 /// shuffle(s) of previous tree entries.
3814 /// \param TE Tree entry checked for permutation.
3815 /// \param VL List of scalars (a subset of the TE scalar), checked for
3816 /// permutations.
3817 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3818 /// commands to build the mask using the original vector value, without
3819 /// relying on the potential reordering.
3820 /// \returns per-register series of ShuffleKind, if gathered values can be
3821 /// represented as shuffles of previous tree entries. \p Mask is filled with
3822 /// the shuffle mask (also on per-register base).
3823 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3824 isGatherShuffledEntry(
3825 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3826 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3827 unsigned NumParts, bool ForOrder = false);
3828
3829 /// \returns the cost of gathering (inserting) the values in \p VL into a
3830 /// vector.
3831 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3832 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3833 Type *ScalarTy) const;
3834
3835 /// Set the Builder insert point to one after the last instruction in
3836 /// the bundle
3837 void setInsertPointAfterBundle(const TreeEntry *E);
3838
3839 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3840 /// specified, the starting vector value is poison.
3841 Value *
3842 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3843 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3844
3845 /// \returns whether the VectorizableTree is fully vectorizable and will
3846 /// be beneficial even if the tree height is tiny.
3847 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3848
3849 /// Run through the list of all gathered loads in the graph and try to find
3850 /// vector loads/masked gathers instead of regular gathers. Later these loads
3851 /// are reshuffled to build final gathered nodes.
3852 void tryToVectorizeGatheredLoads(
3853 const SmallMapVector<
3854 std::tuple<BasicBlock *, Value *, Type *>,
3855 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3856 &GatheredLoads);
3857
3858 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3859 /// users of \p TE and collects the stores. It returns the map from the store
3860 /// pointers to the collected stores.
3862 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3863
3864 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3865 /// stores in \p StoresVec can form a vector instruction. If so it returns
3866 /// true and populates \p ReorderIndices with the shuffle indices of the
3867 /// stores when compared to the sorted vector.
3868 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3869 OrdersType &ReorderIndices) const;
3870
3871 /// Iterates through the users of \p TE, looking for scalar stores that can be
3872 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3873 /// their order and builds an order index vector for each store bundle. It
3874 /// returns all these order vectors found.
3875 /// We run this after the tree has formed, otherwise we may come across user
3876 /// instructions that are not yet in the tree.
3877 SmallVector<OrdersType, 1>
3878 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3879
3880 /// Tries to reorder the gathering node for better vectorization
3881 /// opportunities.
3882 void reorderGatherNode(TreeEntry &TE);
3883
3884 class TreeEntry {
3885 public:
3886 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3887 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3888
3889 /// \returns Common mask for reorder indices and reused scalars.
3890 SmallVector<int> getCommonMask() const {
3891 if (State == TreeEntry::SplitVectorize)
3892 return {};
3893 SmallVector<int> Mask;
3894 inversePermutation(ReorderIndices, Mask);
3895 ::addMask(Mask, ReuseShuffleIndices);
3896 return Mask;
3897 }
3898
3899 /// \returns The mask for split nodes.
3900 SmallVector<int> getSplitMask() const {
3901 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3902 "Expected only split vectorize node.");
3903 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3904 unsigned CommonVF = std::max<unsigned>(
3905 CombinedEntriesWithIndices.back().second,
3906 Scalars.size() - CombinedEntriesWithIndices.back().second);
3907 for (auto [Idx, I] : enumerate(ReorderIndices))
3908 Mask[I] =
3909 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3910 ? CommonVF - CombinedEntriesWithIndices.back().second
3911 : 0);
3912 return Mask;
3913 }
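// Worked example (illustrative): with 6 scalars split at offset 2 (so
// CombinedEntriesWithIndices.back().second == 2), CommonVF == max(2, 4) == 4,
// and with an identity ReorderIndices the resulting mask is
// {0, 1, 4, 5, 6, 7}: indices of the second subnode are shifted by
// CommonVF - 2 so that they address the second CommonVF-wide vector of the
// concatenated pair.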
3914
3915 /// Updates (reorders) SplitVectorize node according to the given mask \p
3916 /// Mask and order \p MaskOrder.
3917 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3918 ArrayRef<int> MaskOrder);
3919
3920 /// \returns true if the scalars in VL are equal to this entry.
3921 bool isSame(ArrayRef<Value *> VL) const {
3922 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3923 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3924 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3925 return VL.size() == Mask.size() &&
3926 std::equal(VL.begin(), VL.end(), Mask.begin(),
3927 [Scalars](Value *V, int Idx) {
3928 return (isa<UndefValue>(V) &&
3929 Idx == PoisonMaskElem) ||
3930 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3931 });
3932 };
3933 if (!ReorderIndices.empty()) {
3934 // TODO: implement matching if the nodes are just reordered, still can
3935 // treat the vector as the same if the list of scalars matches VL
3936 // directly, without reordering.
3937 SmallVector<int> Mask;
3938 inversePermutation(ReorderIndices, Mask);
3939 if (VL.size() == Scalars.size())
3940 return IsSame(Scalars, Mask);
3941 if (VL.size() == ReuseShuffleIndices.size()) {
3942 ::addMask(Mask, ReuseShuffleIndices);
3943 return IsSame(Scalars, Mask);
3944 }
3945 return false;
3946 }
3947 return IsSame(Scalars, ReuseShuffleIndices);
3948 }
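// Worked example (illustrative): for Scalars == {a, b} with ReorderIndices ==
// {1, 0}, the inverse permutation yields Mask == {1, 0}, so isSame({b, a})
// returns true: VL matches this entry once the stored reordering is applied.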
3949
3950 /// \returns true if current entry has same operands as \p TE.
3951 bool hasEqualOperands(const TreeEntry &TE) const {
3952 if (TE.getNumOperands() != getNumOperands())
3953 return false;
3954 SmallBitVector Used(getNumOperands());
3955 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3956 unsigned PrevCount = Used.count();
3957 for (unsigned K = 0; K < E; ++K) {
3958 if (Used.test(K))
3959 continue;
3960 if (getOperand(K) == TE.getOperand(I)) {
3961 Used.set(K);
3962 break;
3963 }
3964 }
3965 // Check if we actually found the matching operand.
3966 if (PrevCount == Used.count())
3967 return false;
3968 }
3969 return true;
3970 }
3971
3972 /// \return Final vectorization factor for the node. Defined by the total
3973 /// number of vectorized scalars, including those used several times in the
3974 /// entry and counted in the \a ReuseShuffleIndices, if any.
3975 unsigned getVectorFactor() const {
3976 if (!ReuseShuffleIndices.empty())
3977 return ReuseShuffleIndices.size();
3978 return Scalars.size();
3979 };
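// E.g. (illustrative) an entry with 4 scalars and ReuseShuffleIndices ==
// {0, 1, 2, 3, 0, 1, 2, 3} has a vectorization factor of 8, not 4.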
3980
3981 /// Checks if the current node is a gather node.
3982 bool isGather() const { return State == NeedToGather; }
3983
3984 /// A vector of scalars.
3985 ValueList Scalars;
3986
3987 /// The Scalars are vectorized into this value. It is initialized to Null.
3988 WeakTrackingVH VectorizedValue = nullptr;
3989
3990 /// Do we need to gather this sequence or vectorize it
3991 /// (either with vector instruction or with scatter/gather
3992 /// intrinsics for store/load)?
3993 enum EntryState {
3994 Vectorize, ///< The node is regularly vectorized.
3995 ScatterVectorize, ///< Masked scatter/gather node.
3996 StridedVectorize, ///< Strided loads (and stores)
3997 CompressVectorize, ///< (Masked) load with compress.
3998 NeedToGather, ///< Gather/buildvector node.
3999 CombinedVectorize, ///< Vectorized node, combined with its user into more
4000 ///< complex node like select/cmp to minmax, mul/add to
4001 ///< fma, etc. Must be used for the following nodes in
4002 ///< the pattern, not the very first one.
4003 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4004 ///< independently and then combines back.
4005 };
4006 EntryState State;
4007
4008 /// List of combined opcodes supported by the vectorizer.
4009 enum CombinedOpcode {
4010 NotCombinedOp = -1,
4011 MinMax = Instruction::OtherOpsEnd + 1,
4012 FMulAdd,
4013 };
4014 CombinedOpcode CombinedOp = NotCombinedOp;
4015
4016 /// Does this sequence require some shuffling?
4017 SmallVector<int, 4> ReuseShuffleIndices;
4018
4019 /// Does this entry require reordering?
4020 SmallVector<unsigned, 4> ReorderIndices;
4021
4022 /// Points back to the VectorizableTree.
4023 ///
4024 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4025 /// to be a pointer and needs to be able to initialize the child iterator.
4026 /// Thus we need a reference back to the container to translate the indices
4027 /// to entries.
4028 VecTreeTy &Container;
4029
4030 /// The TreeEntry index containing the user of this entry.
4031 EdgeInfo UserTreeIndex;
4032
4033 /// The index of this treeEntry in VectorizableTree.
4034 unsigned Idx = 0;
4035
4036 /// For gather/buildvector/alt opcode nodes, which are combined from
4037 /// other nodes as a series of insertvector instructions.
4038 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4039
4040 private:
4041 /// The operands of each instruction in each lane Operands[op_index][lane].
4042 /// Note: This helps avoid the replication of the code that performs the
4043 /// reordering of operands during buildTreeRec() and vectorizeTree().
4044 SmallVector<ValueList, 2> Operands;
4045
4046 /// Copyable elements of the entry node.
4047 SmallPtrSet<const Value *, 4> CopyableElements;
4048
4049 /// MainOp and AltOp are recorded inside. S should be obtained from
4050 /// newTreeEntry.
4051 InstructionsState S = InstructionsState::invalid();
4052
4053 /// Interleaving factor for interleaved loads Vectorize nodes.
4054 unsigned InterleaveFactor = 0;
4055
4056 /// True if the node does not require scheduling.
4057 bool DoesNotNeedToSchedule = false;
4058
4059 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4060 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4061 if (Operands.size() < OpIdx + 1)
4062 Operands.resize(OpIdx + 1);
4063 assert(Operands[OpIdx].empty() && "Already resized?");
4064 assert(OpVL.size() <= Scalars.size() &&
4065 "Number of operands is greater than the number of scalars.");
4066 Operands[OpIdx].resize(OpVL.size());
4067 copy(OpVL, Operands[OpIdx].begin());
4068 }
4069
4070 public:
4071 /// Returns interleave factor for interleave nodes.
4072 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4073 /// Sets interleaving factor for the interleaving nodes.
4074 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4075
4076 /// Marks the node as one that does not require scheduling.
4077 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4078 /// Returns true if the node is marked as one that does not require
4079 /// scheduling.
4080 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4081
4082 /// Set this bundle's operands from \p Operands.
4083 void setOperands(ArrayRef<ValueList> Operands) {
4084 for (unsigned I : seq<unsigned>(Operands.size()))
4085 setOperand(I, Operands[I]);
4086 }
4087
4088 /// Reorders operands of the node to the given mask \p Mask.
4089 void reorderOperands(ArrayRef<int> Mask) {
4090 for (ValueList &Operand : Operands)
4091 reorderScalars(Operand, Mask);
4092 }
4093
4094 /// \returns the \p OpIdx operand of this TreeEntry.
4095 ValueList &getOperand(unsigned OpIdx) {
4096 assert(OpIdx < Operands.size() && "Off bounds");
4097 return Operands[OpIdx];
4098 }
4099
4100 /// \returns the \p OpIdx operand of this TreeEntry.
4101 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4102 assert(OpIdx < Operands.size() && "Off bounds");
4103 return Operands[OpIdx];
4104 }
4105
4106 /// \returns the number of operands.
4107 unsigned getNumOperands() const { return Operands.size(); }
4108
4109 /// \return the single \p OpIdx operand.
4110 Value *getSingleOperand(unsigned OpIdx) const {
4111 assert(OpIdx < Operands.size() && "Off bounds");
4112 assert(!Operands[OpIdx].empty() && "No operand available");
4113 return Operands[OpIdx][0];
4114 }
4115
4116 /// Some of the instructions in the list have alternate opcodes.
4117 bool isAltShuffle() const { return S.isAltShuffle(); }
4118
4119 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4120 return S.getMatchingMainOpOrAltOp(I);
4121 }
4122
4123 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4124 /// alternate) opcode as the main operation, the key is \p Op. Otherwise the
4125 /// key is the main operation.
4126 Value *isOneOf(Value *Op) const {
4127 auto *I = dyn_cast<Instruction>(Op);
4128 if (I && getMatchingMainOpOrAltOp(I))
4129 return Op;
4130 return S.getMainOp();
4131 }
4132
4133 void setOperations(const InstructionsState &S) {
4134 assert(S && "InstructionsState is invalid.");
4135 this->S = S;
4136 }
4137
4138 Instruction *getMainOp() const { return S.getMainOp(); }
4139
4140 Instruction *getAltOp() const { return S.getAltOp(); }
4141
4142 /// The main/alternate opcodes for the list of instructions.
4143 unsigned getOpcode() const { return S.getOpcode(); }
4144
4145 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4146
4147 bool hasState() const { return S.valid(); }
4148
4149 /// Add \p V to the list of copyable elements.
4150 void addCopyableElement(Value *V) {
4151 assert(S.isCopyableElement(V) && "Not a copyable element.");
4152 CopyableElements.insert(V);
4153 }
4154
4155 /// Returns true if \p V is a copyable element.
4156 bool isCopyableElement(Value *V) const {
4157 return CopyableElements.contains(V);
4158 }
4159
4160 /// Returns true if any scalar in the list is a copyable element.
4161 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4162
4163 /// Returns the state of the operations.
4164 const InstructionsState &getOperations() const { return S; }
4165
4166 /// When ReuseShuffleIndices is empty it just returns the position of \p V
4167 /// within the vector of Scalars. Otherwise, try to remap it via its reuse index.
4168 unsigned findLaneForValue(Value *V) const {
4169 unsigned FoundLane = getVectorFactor();
4170 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4171 std::advance(It, 1)) {
4172 if (*It != V)
4173 continue;
4174 FoundLane = std::distance(Scalars.begin(), It);
4175 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4176 if (!ReorderIndices.empty())
4177 FoundLane = ReorderIndices[FoundLane];
4178 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4179 if (ReuseShuffleIndices.empty())
4180 break;
4181 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4182 RIt != ReuseShuffleIndices.end()) {
4183 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4184 break;
4185 }
4186 }
4187 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4188 return FoundLane;
4189 }
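// Worked example (illustrative): for Scalars == {a, b}, ReorderIndices ==
// {1, 0} and ReuseShuffleIndices == {0, 1, 0, 1}, findLaneForValue(b) finds b
// at position 1, remaps it through ReorderIndices to lane 0, and returns 0,
// the first position at which 0 occurs in ReuseShuffleIndices.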
4190
4191 /// Build a shuffle mask for graph entry which represents a merge of main
4192 /// and alternate operations.
4193 void
4194 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4195 SmallVectorImpl<int> &Mask,
4196 SmallVectorImpl<Value *> *OpScalars = nullptr,
4197 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4198
4199 /// Return true if this is a non-power-of-2 node.
4200 bool isNonPowOf2Vec() const {
4201 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4202 return IsNonPowerOf2;
4203 }
4204
4205 /// Return true if this is a node, which tries to vectorize number of
4206 /// elements, forming whole vectors.
4207 bool
4208 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4209 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4210 TTI, getValueType(Scalars.front()), Scalars.size());
4211 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4212 "Reshuffling not supported with non-power-of-2 vectors yet.");
4213 return IsNonPowerOf2;
4214 }
4215
4216 Value *getOrdered(unsigned Idx) const {
4217 assert(isGather() && "Must be used only for buildvectors/gathers.");
4218 if (ReorderIndices.empty())
4219 return Scalars[Idx];
4220 SmallVector<int> Mask;
4221 inversePermutation(ReorderIndices, Mask);
4222 return Scalars[Mask[Idx]];
4223 }
4224
4225#ifndef NDEBUG
4226 /// Debug printer.
4227 LLVM_DUMP_METHOD void dump() const {
4228 dbgs() << Idx << ".\n";
4229 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4230 dbgs() << "Operand " << OpI << ":\n";
4231 for (const Value *V : Operands[OpI])
4232 dbgs().indent(2) << *V << "\n";
4233 }
4234 dbgs() << "Scalars: \n";
4235 for (Value *V : Scalars)
4236 dbgs().indent(2) << *V << "\n";
4237 dbgs() << "State: ";
4238 if (S && hasCopyableElements())
4239 dbgs() << "[[Copyable]] ";
4240 switch (State) {
4241 case Vectorize:
4242 if (InterleaveFactor > 0) {
4243 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4244 << "\n";
4245 } else {
4246 dbgs() << "Vectorize\n";
4247 }
4248 break;
4249 case ScatterVectorize:
4250 dbgs() << "ScatterVectorize\n";
4251 break;
4252 case StridedVectorize:
4253 dbgs() << "StridedVectorize\n";
4254 break;
4255 case CompressVectorize:
4256 dbgs() << "CompressVectorize\n";
4257 break;
4258 case NeedToGather:
4259 dbgs() << "NeedToGather\n";
4260 break;
4261 case CombinedVectorize:
4262 dbgs() << "CombinedVectorize\n";
4263 break;
4264 case SplitVectorize:
4265 dbgs() << "SplitVectorize\n";
4266 break;
4267 }
4268 if (S) {
4269 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4270 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4271 } else {
4272 dbgs() << "MainOp: NULL\n";
4273 dbgs() << "AltOp: NULL\n";
4274 }
4275 dbgs() << "VectorizedValue: ";
4276 if (VectorizedValue)
4277 dbgs() << *VectorizedValue << "\n";
4278 else
4279 dbgs() << "NULL\n";
4280 dbgs() << "ReuseShuffleIndices: ";
4281 if (ReuseShuffleIndices.empty())
4282 dbgs() << "Empty";
4283 else
4284 for (int ReuseIdx : ReuseShuffleIndices)
4285 dbgs() << ReuseIdx << ", ";
4286 dbgs() << "\n";
4287 dbgs() << "ReorderIndices: ";
4288 for (unsigned ReorderIdx : ReorderIndices)
4289 dbgs() << ReorderIdx << ", ";
4290 dbgs() << "\n";
4291 dbgs() << "UserTreeIndex: ";
4292 if (UserTreeIndex)
4293 dbgs() << UserTreeIndex;
4294 else
4295 dbgs() << "<invalid>";
4296 dbgs() << "\n";
4297 if (!CombinedEntriesWithIndices.empty()) {
4298 dbgs() << "Combined entries: ";
4299 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4300 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4301 });
4302 dbgs() << "\n";
4303 }
4304 }
4305#endif
4306 };
4307
4308#ifndef NDEBUG
4309 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4310 InstructionCost VecCost, InstructionCost ScalarCost,
4311 StringRef Banner) const {
4312 dbgs() << "SLP: " << Banner << ":\n";
4313 E->dump();
4314 dbgs() << "SLP: Costs:\n";
4315 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4316 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4317 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4318 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4319 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4320 }
4321#endif
4322
4323 /// Create a new gather TreeEntry
4324 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4325 const InstructionsState &S,
4326 const EdgeInfo &UserTreeIdx,
4327 ArrayRef<int> ReuseShuffleIndices = {}) {
4328 auto Invalid = ScheduleBundle::invalid();
4329 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4330 }
4331
4332 /// Create a new VectorizableTree entry.
4333 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4334 const InstructionsState &S,
4335 const EdgeInfo &UserTreeIdx,
4336 ArrayRef<int> ReuseShuffleIndices = {},
4337 ArrayRef<unsigned> ReorderIndices = {},
4338 unsigned InterleaveFactor = 0) {
4339 TreeEntry::EntryState EntryState =
4340 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4341 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4342 ReuseShuffleIndices, ReorderIndices);
4343 if (E && InterleaveFactor > 0)
4344 E->setInterleave(InterleaveFactor);
4345 return E;
4346 }
4347
4348 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4349 TreeEntry::EntryState EntryState,
4350 ScheduleBundle &Bundle, const InstructionsState &S,
4351 const EdgeInfo &UserTreeIdx,
4352 ArrayRef<int> ReuseShuffleIndices = {},
4353 ArrayRef<unsigned> ReorderIndices = {}) {
4354 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4355 EntryState == TreeEntry::SplitVectorize)) ||
4356 (Bundle && EntryState != TreeEntry::NeedToGather &&
4357 EntryState != TreeEntry::SplitVectorize)) &&
4358 "Need to vectorize gather entry?");
4359 // Gathered loads still gathered? Do not create entry, use the original one.
4360 if (GatheredLoadsEntriesFirst.has_value() &&
4361 EntryState == TreeEntry::NeedToGather && S &&
4362 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4363 !UserTreeIdx.UserTE)
4364 return nullptr;
4365 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4366 TreeEntry *Last = VectorizableTree.back().get();
4367 Last->Idx = VectorizableTree.size() - 1;
4368 Last->State = EntryState;
4369 if (UserTreeIdx.UserTE)
4370 OperandsToTreeEntry.try_emplace(
4371 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4372 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4373 // for non-power-of-two vectors.
4374 assert(
4375 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4376 ReuseShuffleIndices.empty()) &&
4377 "Reshuffling scalars not yet supported for nodes with padding");
4378 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4379 ReuseShuffleIndices.end());
4380 if (ReorderIndices.empty()) {
4381 Last->Scalars.assign(VL.begin(), VL.end());
4382 if (S)
4383 Last->setOperations(S);
4384 } else {
4385 // Reorder scalars and build final mask.
4386 Last->Scalars.assign(VL.size(), nullptr);
4387 transform(ReorderIndices, Last->Scalars.begin(),
4388 [VL](unsigned Idx) -> Value * {
4389 if (Idx >= VL.size())
4390 return UndefValue::get(VL.front()->getType());
4391 return VL[Idx];
4392 });
4393 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4394 if (S)
4395 Last->setOperations(S);
4396 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4397 }
4398 if (EntryState == TreeEntry::SplitVectorize) {
4399 assert(S && "Split nodes must have operations.");
4400 Last->setOperations(S);
4401 SmallPtrSet<Value *, 4> Processed;
4402 for (Value *V : VL) {
4403 auto *I = dyn_cast<Instruction>(V);
4404 if (!I)
4405 continue;
4406 auto It = ScalarsInSplitNodes.find(V);
4407 if (It == ScalarsInSplitNodes.end()) {
4408 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4409 (void)Processed.insert(V);
4410 } else if (Processed.insert(V).second) {
4411 assert(!is_contained(It->getSecond(), Last) &&
4412 "Value already associated with the node.");
4413 It->getSecond().push_back(Last);
4414 }
4415 }
4416 } else if (!Last->isGather()) {
4417 if (isa<PHINode>(S.getMainOp()) ||
4418 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4419 (!S.areInstructionsWithCopyableElements() &&
4420 doesNotNeedToSchedule(VL)) ||
4421 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4422 Last->setDoesNotNeedToSchedule();
4423 SmallPtrSet<Value *, 4> Processed;
4424 for (Value *V : VL) {
4425 if (isa<PoisonValue>(V))
4426 continue;
4427 if (S.isCopyableElement(V)) {
4428 Last->addCopyableElement(V);
4429 continue;
4430 }
4431 auto It = ScalarToTreeEntries.find(V);
4432 if (It == ScalarToTreeEntries.end()) {
4433 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4434 (void)Processed.insert(V);
4435 } else if (Processed.insert(V).second) {
4436 assert(!is_contained(It->getSecond(), Last) &&
4437 "Value already associated with the node.");
4438 It->getSecond().push_back(Last);
4439 }
4440 }
4441 // Update the scheduler bundle to point to this TreeEntry.
4442 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4443 "Bundle and VL out of sync");
4444 if (!Bundle.getBundle().empty()) {
4445#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4446 auto *BundleMember = Bundle.getBundle().begin();
4447 SmallPtrSet<Value *, 4> Processed;
4448 for (Value *V : VL) {
4449 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4450 continue;
4451 ++BundleMember;
4452 }
4453 assert(BundleMember == Bundle.getBundle().end() &&
4454 "Bundle and VL out of sync");
4455#endif
4456 Bundle.setTreeEntry(Last);
4457 }
4458 } else {
4459 // Build a map for gathered scalars to the nodes where they are used.
4460 bool AllConstsOrCasts = true;
4461 for (Value *V : VL) {
4462 if (S && S.areInstructionsWithCopyableElements() &&
4463 S.isCopyableElement(V))
4464 Last->addCopyableElement(V);
4465 if (!isConstant(V)) {
4466 auto *I = dyn_cast<CastInst>(V);
4467 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4468 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4469 !UserTreeIdx.UserTE->isGather())
4470 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4471 }
4472 }
4473 if (AllConstsOrCasts)
4474 CastMaxMinBWSizes =
4475 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4476 MustGather.insert_range(VL);
4477 }
4478
4479 if (UserTreeIdx.UserTE)
4480 Last->UserTreeIndex = UserTreeIdx;
4481 return Last;
4482 }
4483
4484 /// -- Vectorization State --
4485 /// Holds all of the tree entries.
4486 TreeEntry::VecTreeTy VectorizableTree;
4487
4488#ifndef NDEBUG
4489 /// Debug printer.
4490 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4491 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4492 VectorizableTree[Id]->dump();
4493 dbgs() << "\n";
4494 }
4495 }
4496#endif
4497
4498 /// Get list of vector entries, associated with the value \p V.
4499 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4500 assert(V && "V cannot be nullptr.");
4501 auto It = ScalarToTreeEntries.find(V);
4502 if (It == ScalarToTreeEntries.end())
4503 return {};
4504 return It->getSecond();
4505 }
4506
4507 /// Get list of split vector entries, associated with the value \p V.
4508 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4509 assert(V && "V cannot be nullptr.");
4510 auto It = ScalarsInSplitNodes.find(V);
4511 if (It == ScalarsInSplitNodes.end())
4512 return {};
4513 return It->getSecond();
4514 }
4515
4516 /// Returns first vector node for value \p V, matching values \p VL.
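  /// A minimal usage sketch (illustrative only; \c VL is a hypothetical list
  /// of scalars and the reuse action is an assumption, not the actual caller):
  /// \code
  ///   if (TreeEntry *TE =
  ///           getSameValuesTreeEntry(VL.front(), VL, /*SameVF=*/true))
  ///     dbgs() << "SLP: reusing tree entry #" << TE->Idx << "\n";
  /// \endcode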
4517 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4518 bool SameVF = false) const {
4519 assert(V && "V cannot be nullptr.");
4520 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4521 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4522 return TE;
4523 return nullptr;
4524 }
4525
 4526 /// Checks that the operand node of the alternate node does not generate a
 4527 /// buildvector sequence. If it does, it is likely not worth building the
 4528 /// alternate shuffle, since the number of buildvector operands plus the
 4529 /// alternate instruction would exceed the number of buildvector instructions.
4530 /// \param S the instructions state of the analyzed values.
4531 /// \param VL list of the instructions with alternate opcodes.
4532 bool areAltOperandsProfitable(const InstructionsState &S,
4533 ArrayRef<Value *> VL) const;
4534
4535 /// Contains all the outputs of legality analysis for a list of values to
4536 /// vectorize.
4537 class ScalarsVectorizationLegality {
4538 InstructionsState S;
4539 bool IsLegal;
4540 bool TryToFindDuplicates;
4541 bool TrySplitVectorize;
4542
4543 public:
4544 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4545 bool TryToFindDuplicates = true,
4546 bool TrySplitVectorize = false)
4547 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4548 TrySplitVectorize(TrySplitVectorize) {
4549 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4550 "Inconsistent state");
4551 }
4552 const InstructionsState &getInstructionsState() const { return S; };
4553 bool isLegal() const { return IsLegal; }
4554 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4555 bool trySplitVectorize() const { return TrySplitVectorize; }
4556 };
4557
4558 /// Checks if the specified list of the instructions/values can be vectorized
4559 /// in general.
4560 ScalarsVectorizationLegality
4561 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4562 const EdgeInfo &UserTreeIdx,
4563 bool TryCopyableElementsVectorization) const;
4564
4565 /// Checks if the specified list of the instructions/values can be vectorized
4566 /// and fills required data before actual scheduling of the instructions.
4567 TreeEntry::EntryState getScalarsVectorizationState(
4568 const InstructionsState &S, ArrayRef<Value *> VL,
4569 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4570 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4571
4572 /// Maps a specific scalar to its tree entry(ies).
4573 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4574
4575 /// Maps the operand index and entry to the corresponding tree entry.
4576 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4577 OperandsToTreeEntry;
4578
4579 /// Scalars, used in split vectorize nodes.
4580 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4581
4582 /// Maps a value to the proposed vectorizable size.
4583 SmallDenseMap<Value *, unsigned> InstrElementSize;
4584
4585 /// A list of scalars that we found that we need to keep as scalars.
4586 ValueSet MustGather;
4587
4588 /// A set of first non-schedulable values.
4589 ValueSet NonScheduledFirst;
4590
 4591 /// A map between the vectorized entries and the last instructions in the
 4592 /// bundles. The bundles are built in use order, not in the def order of the
 4593 /// instructions, so we cannot rely on the last instruction in the bundle
 4594 /// being the last instruction in program order during the vectorization
 4595 /// process, since the basic blocks are modified; the last instructions need
 4596 /// to be pre-gathered beforehand.
4597 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4598
4599 /// List of gather nodes, depending on other gather/vector nodes, which should
4600 /// be emitted after the vector instruction emission process to correctly
4601 /// handle order of the vector instructions and shuffles.
4602 SetVector<const TreeEntry *> PostponedGathers;
4603
4604 using ValueToGatherNodesMap =
4605 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4606 ValueToGatherNodesMap ValueToGatherNodes;
4607
 4608 /// A list of the load entries (node indices) which can be vectorized using
 4609 /// a strided or masked gather approach, but which we first attempt to
 4610 /// represent as contiguous loads.
4611 SetVector<unsigned> LoadEntriesToVectorize;
4612
 4613 /// True if the graph-node transformation mode is on.
4614 bool IsGraphTransformMode = false;
4615
 4616 /// The index of the first gathered load entry in the VectorizableTree.
4617 std::optional<unsigned> GatheredLoadsEntriesFirst;
4618
4619 /// Maps compress entries to their mask data for the final codegen.
4620 SmallDenseMap<const TreeEntry *,
4621 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4622 CompressEntryToData;
4623
4624 /// This POD struct describes one external user in the vectorized tree.
4625 struct ExternalUser {
4626 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4627 : Scalar(S), User(U), E(E), Lane(L) {}
4628
4629 /// Which scalar in our function.
4630 Value *Scalar = nullptr;
4631
 4633 /// The user that uses the scalar.
4633 llvm::User *User = nullptr;
4634
4635 /// Vector node, the value is part of.
4636 const TreeEntry &E;
4637
4638 /// Which lane does the scalar belong to.
4639 unsigned Lane;
4640 };
4641 using UserList = SmallVector<ExternalUser, 16>;
4642
4643 /// Checks if two instructions may access the same memory.
4644 ///
4645 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4646 /// is invariant in the calling loop.
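  /// A minimal call sketch (illustrative only; \c Store and \c Other are
  /// hypothetical instructions visited during a dependence scan):
  /// \code
  ///   MemoryLocation Loc = MemoryLocation::get(Store);
  ///   if (isAliased(Loc, Store, Other))
  ///     ; // conservatively record a memory dependency
  /// \endcode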
4647 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4648 Instruction *Inst2) {
4649 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4650 // First check if the result is already in the cache.
4651 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4652 auto Res = AliasCache.try_emplace(Key);
4653 if (!Res.second)
4654 return Res.first->second;
4655 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4656 // Store the result in the cache.
4657 Res.first->getSecond() = Aliased;
4658 return Aliased;
4659 }
4660
4661 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4662
4663 /// Cache for alias results.
4664 /// TODO: consider moving this to the AliasAnalysis itself.
4665 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4666
4667 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4668 // globally through SLP because we don't perform any action which
4669 // invalidates capture results.
4670 BatchAAResults BatchAA;
4671
4672 /// Temporary store for deleted instructions. Instructions will be deleted
4673 /// eventually when the BoUpSLP is destructed. The deferral is required to
4674 /// ensure that there are no incorrect collisions in the AliasCache, which
4675 /// can happen if a new instruction is allocated at the same address as a
4676 /// previously deleted instruction.
4677 DenseSet<Instruction *> DeletedInstructions;
4678
 4680 /// Set of the instructions already analyzed for reductions.
4680 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4681
4682 /// Set of hashes for the list of reduction values already being analyzed.
4683 DenseSet<size_t> AnalyzedReductionVals;
4684
 4685 /// Values already analyzed for minimal bitwidth and found to be
 4686 /// non-profitable.
4687 DenseSet<Value *> AnalyzedMinBWVals;
4688
 4689 /// A list of values that need to be extracted out of the tree.
4690 /// This list holds pairs of (Internal Scalar : External User). External User
4691 /// can be nullptr, it means that this Internal Scalar will be used later,
4692 /// after vectorization.
4693 UserList ExternalUses;
4694
 4695 /// A list of GEPs which can be replaced by scalar GEPs instead of
4696 /// extractelement instructions.
4697 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4698
 4699 /// A list of scalars to be extracted without a specific user because of too
 4700 /// many uses.
4701 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4702
4703 /// Values used only by @llvm.assume calls.
4704 SmallPtrSet<const Value *, 32> EphValues;
4705
4706 /// Holds all of the instructions that we gathered, shuffle instructions and
4707 /// extractelements.
4708 SetVector<Instruction *> GatherShuffleExtractSeq;
4709
4710 /// A list of blocks that we are going to CSE.
4711 DenseSet<BasicBlock *> CSEBlocks;
4712
 4713 /// List of hashes of load vectors which are known to be non-vectorizable.
4714 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4715
 4716 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
 4717 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
 4718 /// single instruction, while ScheduleBundle represents a batch of
 4719 /// instructions which are going to be grouped together. ScheduleCopyableData
 4720 /// models an extra user for "copyable" instructions.
4721 class ScheduleEntity {
4722 friend class ScheduleBundle;
4723 friend class ScheduleData;
4724 friend class ScheduleCopyableData;
4725
4726 protected:
4727 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4728 Kind getKind() const { return K; }
4729 ScheduleEntity(Kind K) : K(K) {}
4730
4731 private:
4732 /// Used for getting a "good" final ordering of instructions.
4733 int SchedulingPriority = 0;
4734 /// True if this instruction (or bundle) is scheduled (or considered as
4735 /// scheduled in the dry-run).
4736 bool IsScheduled = false;
4737 /// The kind of the ScheduleEntity.
4738 const Kind K = Kind::ScheduleData;
4739
4740 public:
4741 ScheduleEntity() = delete;
4742 /// Gets/sets the scheduling priority.
4743 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4744 int getSchedulingPriority() const { return SchedulingPriority; }
4745 bool isReady() const {
4746 if (const auto *SD = dyn_cast<ScheduleData>(this))
4747 return SD->isReady();
4748 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4749 return CD->isReady();
4750 return cast<ScheduleBundle>(this)->isReady();
4751 }
4752 /// Returns true if the dependency information has been calculated.
 4753 /// Note that dependency validity can vary between instructions within
4754 /// a single bundle.
4755 bool hasValidDependencies() const {
4756 if (const auto *SD = dyn_cast<ScheduleData>(this))
4757 return SD->hasValidDependencies();
4758 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4759 return CD->hasValidDependencies();
4760 return cast<ScheduleBundle>(this)->hasValidDependencies();
4761 }
4762 /// Gets the number of unscheduled dependencies.
4763 int getUnscheduledDeps() const {
4764 if (const auto *SD = dyn_cast<ScheduleData>(this))
4765 return SD->getUnscheduledDeps();
4766 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4767 return CD->getUnscheduledDeps();
4768 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
4769 }
4770 /// Increments the number of unscheduled dependencies.
4771 int incrementUnscheduledDeps(int Incr) {
4772 if (auto *SD = dyn_cast<ScheduleData>(this))
4773 return SD->incrementUnscheduledDeps(Incr);
4774 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
4775 }
4776 /// Gets the number of dependencies.
4777 int getDependencies() const {
4778 if (const auto *SD = dyn_cast<ScheduleData>(this))
4779 return SD->getDependencies();
4780 return cast<ScheduleCopyableData>(this)->getDependencies();
4781 }
4782 /// Gets the instruction.
4783 Instruction *getInst() const {
4784 if (const auto *SD = dyn_cast<ScheduleData>(this))
4785 return SD->getInst();
4786 return cast<ScheduleCopyableData>(this)->getInst();
4787 }
4788
4789 /// Gets/sets if the bundle is scheduled.
4790 bool isScheduled() const { return IsScheduled; }
4791 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4792
4793 static bool classof(const ScheduleEntity *) { return true; }
4794
4795#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4796 void dump(raw_ostream &OS) const {
4797 if (const auto *SD = dyn_cast<ScheduleData>(this))
4798 return SD->dump(OS);
4799 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4800 return CD->dump(OS);
4801 return cast<ScheduleBundle>(this)->dump(OS);
4802 }
4803
4804 LLVM_DUMP_METHOD void dump() const {
4805 dump(dbgs());
4806 dbgs() << '\n';
4807 }
4808#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4809 };
4810
4811#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 4812 friend inline raw_ostream &operator<<(raw_ostream &OS,
 4813 const BoUpSLP::ScheduleEntity &SE) {
4814 SE.dump(OS);
4815 return OS;
4816 }
4817#endif
4818
4819 /// Contains all scheduling relevant data for an instruction.
4820 /// A ScheduleData either represents a single instruction or a member of an
4821 /// instruction bundle (= a group of instructions which is combined into a
4822 /// vector instruction).
4823 class ScheduleData final : public ScheduleEntity {
4824 public:
4825 // The initial value for the dependency counters. It means that the
4826 // dependencies are not calculated yet.
4827 enum { InvalidDeps = -1 };
4828
4829 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4830 static bool classof(const ScheduleEntity *Entity) {
4831 return Entity->getKind() == Kind::ScheduleData;
4832 }
4833
4834 void init(int BlockSchedulingRegionID, Instruction *I) {
4835 NextLoadStore = nullptr;
4836 IsScheduled = false;
4837 SchedulingRegionID = BlockSchedulingRegionID;
4838 clearDependencies();
4839 Inst = I;
4840 }
4841
4842 /// Verify basic self consistency properties
4843 void verify() {
4844 if (hasValidDependencies()) {
4845 assert(UnscheduledDeps <= Dependencies && "invariant");
4846 } else {
4847 assert(UnscheduledDeps == Dependencies && "invariant");
4848 }
4849
4850 if (IsScheduled) {
4851 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4852 "unexpected scheduled state");
4853 }
4854 }
4855
4856 /// Returns true if the dependency information has been calculated.
 4857 /// Note that dependency validity can vary between instructions within
4858 /// a single bundle.
4859 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4860
4861 /// Returns true if it is ready for scheduling, i.e. it has no more
4862 /// unscheduled depending instructions/bundles.
4863 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4864
4865 /// Modifies the number of unscheduled dependencies for this instruction,
4866 /// and returns the number of remaining dependencies for the containing
4867 /// bundle.
4868 int incrementUnscheduledDeps(int Incr) {
4869 assert(hasValidDependencies() &&
4870 "increment of unscheduled deps would be meaningless");
4871 UnscheduledDeps += Incr;
4872 return UnscheduledDeps;
4873 }
4874
4875 /// Sets the number of unscheduled dependencies to the number of
4876 /// dependencies.
4877 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4878
4879 /// Clears all dependency information.
4880 void clearDependencies() {
4881 clearDirectDependencies();
4882 MemoryDependencies.clear();
4883 ControlDependencies.clear();
4884 }
4885
 4886 /// Clears only the direct dependencies, leaving control and memory
 4887 /// dependencies intact.
 4888 /// Required for copyable elements to correctly handle control/memory deps
 4889 /// and avoid extra recalculation of such deps.
4890 void clearDirectDependencies() {
4891 Dependencies = InvalidDeps;
4892 resetUnscheduledDeps();
4893 IsScheduled = false;
4894 }
4895
4896 /// Gets the number of unscheduled dependencies.
4897 int getUnscheduledDeps() const { return UnscheduledDeps; }
4898 /// Gets the number of dependencies.
4899 int getDependencies() const { return Dependencies; }
4900 /// Initializes the number of dependencies.
4901 void initDependencies() { Dependencies = 0; }
4902 /// Increments the number of dependencies.
4903 void incDependencies() { Dependencies++; }
4904
4905 /// Gets scheduling region ID.
4906 int getSchedulingRegionID() const { return SchedulingRegionID; }
4907
4908 /// Gets the instruction.
4909 Instruction *getInst() const { return Inst; }
4910
4911 /// Gets the list of memory dependencies.
4912 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4913 return MemoryDependencies;
4914 }
4915 /// Adds a memory dependency.
4916 void addMemoryDependency(ScheduleData *Dep) {
4917 MemoryDependencies.push_back(Dep);
4918 }
4919 /// Gets the list of control dependencies.
4920 ArrayRef<ScheduleData *> getControlDependencies() const {
4921 return ControlDependencies;
4922 }
4923 /// Adds a control dependency.
4924 void addControlDependency(ScheduleData *Dep) {
4925 ControlDependencies.push_back(Dep);
4926 }
4927 /// Gets/sets the next load/store instruction in the block.
4928 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4929 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4930
4931 void dump(raw_ostream &OS) const { OS << *Inst; }
4932
4933 LLVM_DUMP_METHOD void dump() const {
4934 dump(dbgs());
4935 dbgs() << '\n';
4936 }
4937
4938 private:
4939 Instruction *Inst = nullptr;
4940
4941 /// Single linked list of all memory instructions (e.g. load, store, call)
4942 /// in the block - until the end of the scheduling region.
4943 ScheduleData *NextLoadStore = nullptr;
4944
4945 /// The dependent memory instructions.
4946 /// This list is derived on demand in calculateDependencies().
4947 SmallVector<ScheduleData *> MemoryDependencies;
4948
4949 /// List of instructions which this instruction could be control dependent
4950 /// on. Allowing such nodes to be scheduled below this one could introduce
4951 /// a runtime fault which didn't exist in the original program.
4952 /// ex: this is a load or udiv following a readonly call which inf loops
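    /// A small IR sketch of such a control dependence (names and the
    /// non-terminating callee are assumptions for illustration):
    /// \code
    ///   call void @may_loop_forever() readonly
    ///   %q = udiv i32 %a, %b ; must not be scheduled above the call
    /// \endcode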
4953 SmallVector<ScheduleData *> ControlDependencies;
4954
4955 /// This ScheduleData is in the current scheduling region if this matches
4956 /// the current SchedulingRegionID of BlockScheduling.
4957 int SchedulingRegionID = 0;
4958
 4959 /// The number of dependencies. Consists of the number of users of the
4960 /// instruction plus the number of dependent memory instructions (if any).
4961 /// This value is calculated on demand.
4962 /// If InvalidDeps, the number of dependencies is not calculated yet.
4963 int Dependencies = InvalidDeps;
4964
4965 /// The number of dependencies minus the number of dependencies of scheduled
4966 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4967 /// for scheduling.
4968 /// Note that this is negative as long as Dependencies is not calculated.
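    /// A sketch of the intended counter lifecycle (illustrative only; \c SD is
    /// a hypothetical ScheduleData with a single user):
    /// \code
    ///   SD.initDependencies();            // Dependencies = 0
    ///   SD.incDependencies();             // one dependency discovered
    ///   SD.resetUnscheduledDeps();        // UnscheduledDeps = Dependencies
    ///   if (SD.incrementUnscheduledDeps(-1) == 0)
    ///     ; // SD is now ready for scheduling
    /// \endcode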
4969 int UnscheduledDeps = InvalidDeps;
4970 };
4971
4972#ifndef NDEBUG
 4973 friend inline raw_ostream &operator<<(raw_ostream &OS,
 4974 const BoUpSLP::ScheduleData &SD) {
4975 SD.dump(OS);
4976 return OS;
4977 }
4978#endif
4979
4980 class ScheduleBundle final : public ScheduleEntity {
4981 /// The schedule data for the instructions in the bundle.
 4982 SmallVector<ScheduleEntity *> Bundle;
 4983 /// True if this bundle is valid.
4984 bool IsValid = true;
4985 /// The TreeEntry that this instruction corresponds to.
4986 TreeEntry *TE = nullptr;
4987 ScheduleBundle(bool IsValid)
4988 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4989
4990 public:
4991 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4992 static bool classof(const ScheduleEntity *Entity) {
4993 return Entity->getKind() == Kind::ScheduleBundle;
4994 }
4995
4996 /// Verify basic self consistency properties
4997 void verify() const {
4998 for (const ScheduleEntity *SD : Bundle) {
4999 if (SD->hasValidDependencies()) {
5000 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5001 "invariant");
5002 } else {
5003 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5004 "invariant");
5005 }
5006
5007 if (isScheduled()) {
5008 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5009 "unexpected scheduled state");
5010 }
5011 }
5012 }
5013
5014 /// Returns the number of unscheduled dependencies in the bundle.
5015 int unscheduledDepsInBundle() const {
5016 assert(*this && "bundle must not be empty");
5017 int Sum = 0;
5018 for (const ScheduleEntity *BundleMember : Bundle) {
5019 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5020 return ScheduleData::InvalidDeps;
5021 Sum += BundleMember->getUnscheduledDeps();
5022 }
5023 return Sum;
5024 }
5025
5026 /// Returns true if the dependency information has been calculated.
 5027 /// Note that dependency validity can vary between instructions within
5028 /// a single bundle.
5029 bool hasValidDependencies() const {
5030 return all_of(Bundle, [](const ScheduleEntity *SD) {
5031 return SD->hasValidDependencies();
5032 });
5033 }
5034
5035 /// Returns true if it is ready for scheduling, i.e. it has no more
5036 /// unscheduled depending instructions/bundles.
5037 bool isReady() const {
5038 assert(*this && "bundle must not be empty");
5039 return unscheduledDepsInBundle() == 0 && !isScheduled();
5040 }
5041
5042 /// Returns the bundle of scheduling data, associated with the current
5043 /// instruction.
5044 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5045 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5046 /// Adds an instruction to the bundle.
5047 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5048
5049 /// Gets/sets the associated tree entry.
5050 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5051 TreeEntry *getTreeEntry() const { return TE; }
5052
5053 static ScheduleBundle invalid() { return {false}; }
5054
5055 operator bool() const { return IsValid; }
5056
5057#ifndef NDEBUG
5058 void dump(raw_ostream &OS) const {
5059 if (!*this) {
5060 OS << "[]";
5061 return;
5062 }
5063 OS << '[';
5064 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5066 OS << "<Copyable>";
5067 OS << *SD->getInst();
5068 });
5069 OS << ']';
5070 }
5071
5072 LLVM_DUMP_METHOD void dump() const {
5073 dump(dbgs());
5074 dbgs() << '\n';
5075 }
5076#endif // NDEBUG
5077 };
5078
5079#ifndef NDEBUG
 5080 friend inline raw_ostream &operator<<(raw_ostream &OS,
 5081 const BoUpSLP::ScheduleBundle &Bundle) {
5082 Bundle.dump(OS);
5083 return OS;
5084 }
5085#endif
5086
5087 /// Contains all scheduling relevant data for the copyable instruction.
5088 /// It models the virtual instructions, supposed to replace the original
5089 /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
5090 /// %1], where %1 = add, then the ScheduleCopyableData models virtual
5091 /// instruction %virt = add %0, 0.
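 /// The same example written as IR (types and operands are illustrative
 /// assumptions):
 /// \code
 ///   ; bundle [%0, %1], where %0 is the copyable element
 ///   %0 = load i32, ptr %p
 ///   %1 = add i32 %x, 1
 ///   ; modeled virtual instruction for the copyable lane:
 ///   %virt = add i32 %0, 0
 /// \endcode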
5092 class ScheduleCopyableData final : public ScheduleEntity {
5093 /// The source schedule data for the instruction.
5094 Instruction *Inst = nullptr;
5095 /// The edge information for the instruction.
5096 const EdgeInfo EI;
5097 /// This ScheduleData is in the current scheduling region if this matches
5098 /// the current SchedulingRegionID of BlockScheduling.
5099 int SchedulingRegionID = 0;
5100 /// Bundle, this data is part of.
5101 ScheduleBundle &Bundle;
5102
5103 public:
5104 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5105 const EdgeInfo &EI, ScheduleBundle &Bundle)
5106 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5107 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5108 static bool classof(const ScheduleEntity *Entity) {
5109 return Entity->getKind() == Kind::ScheduleCopyableData;
5110 }
5111
5112 /// Verify basic self consistency properties
5113 void verify() {
5114 if (hasValidDependencies()) {
5115 assert(UnscheduledDeps <= Dependencies && "invariant");
5116 } else {
5117 assert(UnscheduledDeps == Dependencies && "invariant");
5118 }
5119
5120 if (IsScheduled) {
5121 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5122 "unexpected scheduled state");
5123 }
5124 }
5125
5126 /// Returns true if the dependency information has been calculated.
 5127 /// Note that dependency validity can vary between instructions within
5128 /// a single bundle.
5129 bool hasValidDependencies() const {
5130 return Dependencies != ScheduleData::InvalidDeps;
5131 }
5132
5133 /// Returns true if it is ready for scheduling, i.e. it has no more
5134 /// unscheduled depending instructions/bundles.
5135 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5136
5137 /// Modifies the number of unscheduled dependencies for this instruction,
5138 /// and returns the number of remaining dependencies for the containing
5139 /// bundle.
5140 int incrementUnscheduledDeps(int Incr) {
5141 assert(hasValidDependencies() &&
5142 "increment of unscheduled deps would be meaningless");
5143 UnscheduledDeps += Incr;
5144 assert(UnscheduledDeps >= 0 && "invariant");
5145 return UnscheduledDeps;
5146 }
5147
5148 /// Sets the number of unscheduled dependencies to the number of
5149 /// dependencies.
5150 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5151
5152 /// Gets the number of unscheduled dependencies.
5153 int getUnscheduledDeps() const { return UnscheduledDeps; }
5154 /// Gets the number of dependencies.
5155 int getDependencies() const { return Dependencies; }
5156 /// Initializes the number of dependencies.
5157 void initDependencies() { Dependencies = 0; }
5158 /// Increments the number of dependencies.
5159 void incDependencies() { Dependencies++; }
5160
5161 /// Gets scheduling region ID.
5162 int getSchedulingRegionID() const { return SchedulingRegionID; }
5163
5164 /// Gets the instruction.
5165 Instruction *getInst() const { return Inst; }
5166
5167 /// Clears all dependency information.
5168 void clearDependencies() {
5169 Dependencies = ScheduleData::InvalidDeps;
5170 UnscheduledDeps = ScheduleData::InvalidDeps;
5171 IsScheduled = false;
5172 }
5173
5174 /// Gets the edge information.
5175 const EdgeInfo &getEdgeInfo() const { return EI; }
5176
5177 /// Gets the bundle.
5178 ScheduleBundle &getBundle() { return Bundle; }
5179 const ScheduleBundle &getBundle() const { return Bundle; }
5180
5181#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5182 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5183
5184 LLVM_DUMP_METHOD void dump() const {
5185 dump(dbgs());
5186 dbgs() << '\n';
5187 }
5188#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5189
5190 private:
 5191 /// The number of dependencies. These nodes always have only a single
 5192 /// dependency.
5193 int Dependencies = ScheduleData::InvalidDeps;
5194
5195 /// The number of dependencies minus the number of dependencies of scheduled
5196 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5197 /// for scheduling.
5198 /// Note that this is negative as long as Dependencies is not calculated.
5199 int UnscheduledDeps = ScheduleData::InvalidDeps;
5200 };
5201
5202#ifndef NDEBUG
5203 friend inline raw_ostream &
5204 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5205 SD.dump(OS);
5206 return OS;
5207 }
5208#endif
5209
5210 friend struct GraphTraits<BoUpSLP *>;
5211 friend struct DOTGraphTraits<BoUpSLP *>;
5212
5213 /// Contains all scheduling data for a basic block.
 5214 /// It does not schedule instructions which are not memory read/write
 5215 /// instructions and whose operands are either constants, arguments, phis, or
 5216 /// instructions from other blocks, or whose users are phis or live in other
 5217 /// blocks. The resulting vector instructions can be placed at the
 5218 /// beginning of the basic block without scheduling (if the operands do not
 5219 /// need to be scheduled) or at the end of the block (if the users are
 5220 /// outside of the block). This saves some compile time and memory used by
 5221 /// the compiler.
 5222 /// ScheduleData is assigned for each instruction in between the boundaries
 5223 /// of the tree entry, even for those which are not part of the graph. It is
 5224 /// required to correctly follow the dependencies between the instructions
 5225 /// and to schedule them correctly. The ScheduleData is not allocated for
 5226 /// instructions which do not require scheduling, like phis, nodes with
 5227 /// extractelements/insertelements only, or nodes whose instructions have
 5228 /// uses/operands outside of the block.
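 /// A small IR sketch of a scheduling region (illustrative only; assumes the
 /// two loads form one bundle):
 /// \code
 ///   %a = load i32, ptr %p   ; ScheduleStart
 ///   call void @foo()        ; gets ScheduleData even if not in the graph
 ///   %b = load i32, ptr %q
 ///   store i32 %a, ptr %r
 ///   ; ScheduleEnd is the first instruction after the region
 /// \endcode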
5229 struct BlockScheduling {
5230 BlockScheduling(BasicBlock *BB)
5231 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5232
5233 void clear() {
5234 ScheduledBundles.clear();
5235 ScheduledBundlesList.clear();
5236 ScheduleCopyableDataMap.clear();
5237 ScheduleCopyableDataMapByInst.clear();
5238 ScheduleCopyableDataMapByInstUser.clear();
5239 ScheduleCopyableDataMapByUsers.clear();
5240 ReadyInsts.clear();
5241 ScheduleStart = nullptr;
5242 ScheduleEnd = nullptr;
5243 FirstLoadStoreInRegion = nullptr;
5244 LastLoadStoreInRegion = nullptr;
5245 RegionHasStackSave = false;
5246
5247 // Reduce the maximum schedule region size by the size of the
5248 // previous scheduling run.
5249 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5250 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5251 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5252 ScheduleRegionSize = 0;
5253
5254 // Make a new scheduling region, i.e. all existing ScheduleData is not
5255 // in the new region yet.
5256 ++SchedulingRegionID;
5257 }
5258
5259 ScheduleData *getScheduleData(Instruction *I) {
5260 if (!I)
5261 return nullptr;
5262 if (BB != I->getParent())
5263 // Avoid lookup if can't possibly be in map.
5264 return nullptr;
5265 ScheduleData *SD = ScheduleDataMap.lookup(I);
5266 if (SD && isInSchedulingRegion(*SD))
5267 return SD;
5268 return nullptr;
5269 }
5270
5271 ScheduleData *getScheduleData(Value *V) {
5272 return getScheduleData(dyn_cast<Instruction>(V));
5273 }
5274
5275 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5276 /// operand number) and value.
5277 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5278 const Value *V) const {
5279 if (ScheduleCopyableDataMap.empty())
5280 return nullptr;
5281 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5282 if (It == ScheduleCopyableDataMap.end())
5283 return nullptr;
5284 ScheduleCopyableData *SD = It->getSecond().get();
5285 if (!isInSchedulingRegion(*SD))
5286 return nullptr;
5287 return SD;
5288 }
5289
5290 /// Returns the ScheduleCopyableData for the given user \p User, operand
5291 /// number and operand \p V.
 5292 SmallVector<ScheduleCopyableData *>
 5293 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5294 const Value *V) {
5295 if (ScheduleCopyableDataMapByInstUser.empty())
5296 return {};
5297 const auto It = ScheduleCopyableDataMapByInstUser.find(
5298 std::make_pair(std::make_pair(User, OperandIdx), V));
5299 if (It == ScheduleCopyableDataMapByInstUser.end())
5300 return {};
 5301 SmallVector<ScheduleCopyableData *> Res;
 5302 for (ScheduleCopyableData *SD : It->getSecond()) {
5303 if (isInSchedulingRegion(*SD))
5304 Res.push_back(SD);
5305 }
5306 return Res;
5307 }
5308
5309 /// Returns true if all operands of the given instruction \p User are
5310 /// replaced by copyable data.
5311 /// \param User The user instruction.
5312 /// \param Op The operand, which might be replaced by the copyable data.
5313 /// \param SLP The SLP tree.
5314 /// \param NumOps The number of operands used. If the instruction uses the
5315 /// same operand several times, check for the first use, then the second,
5316 /// etc.
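    /// A sketch of how \p NumOps is interpreted (IR names are hypothetical):
    /// \code
    ///   ; %u uses %op twice, so a query for (%u, %op) would use NumOps == 2
    ///   %u = mul i32 %op, %op
    /// \endcode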
5317 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5318 Instruction *Op, BoUpSLP &SLP,
5319 unsigned NumOps) const {
5320 assert(NumOps > 0 && "No operands");
5321 if (ScheduleCopyableDataMap.empty())
5322 return false;
5323 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5324 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5325 for (const Use &U : User->operands()) {
5326 if (U.get() != Op)
5327 continue;
5328 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5329 if (Entries.empty())
5330 return false;
 5331 // Check all tree entries to see if they have operands replaced by
 5332 // copyable data.
5333 for (TreeEntry *TE : Entries) {
5334 // Check if the user is commutative.
5335 // The commutatives are handled later, as their operands can be
5336 // reordered.
5337 // Same applies even for non-commutative cmps, because we can invert
5338 // their predicate potentially and, thus, reorder the operands.
5339 bool IsCommutativeUser =
5340 ::isCommutative(User) ||
5341 ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
5342 if (!IsCommutativeUser && !isa<CmpInst>(User)) {
5343 unsigned &OpCnt =
5344 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5345 EdgeInfo EI(TE, U.getOperandNo());
5346 if (!getScheduleCopyableData(EI, Op))
5347 continue;
5348 // Found copyable operand - continue.
5349 ++OpCnt;
5350 continue;
5351 }
5352 ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5353 .first->getSecond();
5354 }
5355 }
5356 if (PotentiallyReorderedEntriesCount.empty())
5357 return all_of(OrderedEntriesCount,
5358 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5359 return P.second == NumOps;
5360 });
5361 // Check the commutative/cmp entries.
5362 for (auto &P : PotentiallyReorderedEntriesCount) {
5363 auto *It = find(P.first->Scalars, User);
5364 assert(It != P.first->Scalars.end() && "User is not in the tree entry");
5365 int Lane = std::distance(P.first->Scalars.begin(), It);
5366 assert(Lane >= 0 && "Lane is not found");
5367 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5368 Lane = P.first->ReorderIndices[Lane];
5369 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5370 "Couldn't find extract lane");
5371 for (unsigned OpIdx :
5373 P.first->getMainOp()))) {
5374 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5375 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5376 --P.getSecond();
5377 }
5378 }
5379 return all_of(PotentiallyReorderedEntriesCount,
5380 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5381 return P.second == NumOps - 1;
5382 }) &&
5383 all_of(OrderedEntriesCount,
5384 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5385 return P.second == NumOps;
5386 });
5387 }
5388
 5389 SmallVector<ScheduleCopyableData *>
 5390 getScheduleCopyableData(const Instruction *I) const {
5391 if (ScheduleCopyableDataMapByInst.empty())
5392 return {};
5393 const auto It = ScheduleCopyableDataMapByInst.find(I);
5394 if (It == ScheduleCopyableDataMapByInst.end())
5395 return {};
 5396 SmallVector<ScheduleCopyableData *> Res;
 5397 for (ScheduleCopyableData *SD : It->getSecond()) {
5398 if (isInSchedulingRegion(*SD))
5399 Res.push_back(SD);
5400 }
5401 return Res;
5402 }
5403
 5404 SmallVector<ScheduleCopyableData *>
 5405 getScheduleCopyableDataUsers(const Instruction *User) const {
5406 if (ScheduleCopyableDataMapByUsers.empty())
5407 return {};
5408 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5409 if (It == ScheduleCopyableDataMapByUsers.end())
5410 return {};
 5411 SmallVector<ScheduleCopyableData *> Res;
 5412 for (ScheduleCopyableData *SD : It->getSecond()) {
5413 if (isInSchedulingRegion(*SD))
5414 Res.push_back(SD);
5415 }
5416 return Res;
5417 }
5418
5419 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5420 Instruction *I,
5421 int SchedulingRegionID,
5422 ScheduleBundle &Bundle) {
5423 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5424 ScheduleCopyableData *CD =
5425 ScheduleCopyableDataMap
5426 .try_emplace(std::make_pair(EI, I),
5427 std::make_unique<ScheduleCopyableData>(
5428 SchedulingRegionID, I, EI, Bundle))
5429 .first->getSecond()
5430 .get();
5431 ScheduleCopyableDataMapByInst[I].push_back(CD);
5432 if (EI.UserTE) {
5433 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5434 const auto *It = find(Op, I);
5435 assert(It != Op.end() && "Lane not set");
5436 SmallPtrSet<Instruction *, 4> Visited;
5437 do {
5438 int Lane = std::distance(Op.begin(), It);
5439 assert(Lane >= 0 && "Lane not set");
5440 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5441 !EI.UserTE->ReorderIndices.empty())
5442 Lane = EI.UserTE->ReorderIndices[Lane];
5443 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5444 "Couldn't find extract lane");
5445 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5446 if (!Visited.insert(In).second) {
5447 It = find(make_range(std::next(It), Op.end()), I);
5448 continue;
5449 }
5450 ScheduleCopyableDataMapByInstUser
5451 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5452 .first->getSecond()
5453 .push_back(CD);
5454 ScheduleCopyableDataMapByUsers.try_emplace(I)
5455 .first->getSecond()
5456 .insert(CD);
 5457 // Remove extra deps for users that become non-immediate users of the
 5458 // instruction. This may happen if a chain of the same copyable elements
 5459 // appears in the tree.
5460 if (In == I) {
5461 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5462 if (ScheduleCopyableData *UserCD =
5463 getScheduleCopyableData(UserEI, In))
5464 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5465 }
5466 It = find(make_range(std::next(It), Op.end()), I);
5467 } while (It != Op.end());
5468 } else {
5469 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5470 CD);
5471 }
5472 return *CD;
5473 }
5474
5475 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5476 auto *I = dyn_cast<Instruction>(V);
5477 if (!I)
5478 return {};
5479 auto It = ScheduledBundles.find(I);
5480 if (It == ScheduledBundles.end())
5481 return {};
5482 return It->getSecond();
5483 }
5484
5485 /// Returns true if the entity is in the scheduling region.
5486 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5487 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5488 return Data->getSchedulingRegionID() == SchedulingRegionID;
5489 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5490 return CD->getSchedulingRegionID() == SchedulingRegionID;
5491 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5492 [&](const ScheduleEntity *BundleMember) {
5493 return isInSchedulingRegion(*BundleMember);
5494 });
5495 }
5496
5497 /// Marks an instruction as scheduled and puts all dependent ready
5498 /// instructions into the ready-list.
5499 template <typename ReadyListType>
5500 void schedule(const BoUpSLP &R, const InstructionsState &S,
5501 const EdgeInfo &EI, ScheduleEntity *Data,
5502 ReadyListType &ReadyList) {
5503 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
 5504 ArrayRef<ScheduleBundle *> Bundles) {
 5505 // Handle the def-use chain dependencies.
5506
5507 // Decrement the unscheduled counter and insert to ready list if ready.
5508 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5509 if ((IsControl || Data->hasValidDependencies()) &&
5510 Data->incrementUnscheduledDeps(-1) == 0) {
5511 // There are no more unscheduled dependencies after
5512 // decrementing, so we can put the dependent instruction
5513 // into the ready list.
5514 SmallVector<ScheduleBundle *, 1> CopyableBundle;
 5515 ArrayRef<ScheduleBundle *> Bundles;
 5516 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5517 CopyableBundle.push_back(&CD->getBundle());
5518 Bundles = CopyableBundle;
5519 } else {
5520 Bundles = getScheduleBundles(Data->getInst());
5521 }
5522 if (!Bundles.empty()) {
5523 for (ScheduleBundle *Bundle : Bundles) {
5524 if (Bundle->unscheduledDepsInBundle() == 0) {
5525 assert(!Bundle->isScheduled() &&
5526 "already scheduled bundle gets ready");
5527 ReadyList.insert(Bundle);
5529 << "SLP: gets ready: " << *Bundle << "\n");
5530 }
5531 }
5532 return;
5533 }
5534 assert(!Data->isScheduled() &&
5535 "already scheduled bundle gets ready");
5537 "Expected non-copyable data");
5538 ReadyList.insert(Data);
5539 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5540 }
5541 };
5542
5543 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5544 Instruction *I) {
5545 if (!ScheduleCopyableDataMap.empty()) {
 5546 SmallVector<ScheduleCopyableData *> CopyableData =
 5547 getScheduleCopyableData(User, OpIdx, I);
5548 for (ScheduleCopyableData *CD : CopyableData)
5549 DecrUnsched(CD, /*IsControl=*/false);
5550 if (!CopyableData.empty())
5551 return;
5552 }
5553 if (ScheduleData *OpSD = getScheduleData(I))
5554 DecrUnsched(OpSD, /*IsControl=*/false);
5555 };
5556
5557 // If BundleMember is a vector bundle, its operands may have been
5558 // reordered during buildTree(). We therefore need to get its operands
5559 // through the TreeEntry.
5560 if (!Bundles.empty()) {
5561 auto *In = BundleMember->getInst();
5562 // Count uses of each instruction operand.
5563 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5564 unsigned TotalOpCount = 0;
5565 if (isa<ScheduleCopyableData>(BundleMember)) {
5566 // Copyable data is used only once (uses itself).
5567 TotalOpCount = OperandsUses[In] = 1;
5568 } else {
5569 for (const Use &U : In->operands()) {
5570 if (auto *I = dyn_cast<Instruction>(U.get())) {
5571 auto Res = OperandsUses.try_emplace(I, 0);
5572 ++Res.first->getSecond();
5573 ++TotalOpCount;
5574 }
5575 }
5576 }
5577 // Decrement the unscheduled counter and insert to ready list if
5578 // ready.
5579 auto DecrUnschedForInst =
5580 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5581 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5582 &Checked) {
5583 if (!ScheduleCopyableDataMap.empty()) {
5584 const EdgeInfo EI = {UserTE, OpIdx};
5585 if (ScheduleCopyableData *CD =
5586 getScheduleCopyableData(EI, I)) {
5587 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5588 return;
5589 DecrUnsched(CD, /*IsControl=*/false);
5590 return;
5591 }
5592 }
5593 auto It = OperandsUses.find(I);
5594 assert(It != OperandsUses.end() && "Operand not found");
5595 if (It->second > 0) {
5596 --It->getSecond();
5597 assert(TotalOpCount > 0 && "No more operands to decrement");
5598 --TotalOpCount;
5599 if (ScheduleData *OpSD = getScheduleData(I)) {
5600 if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5601 return;
5602 DecrUnsched(OpSD, /*IsControl=*/false);
5603 }
5604 }
5605 };
5606
5607 for (ScheduleBundle *Bundle : Bundles) {
5608 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5609 break;
5610 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5611 // Need to search for the lane since the tree entry can be
5612 // reordered.
5613 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
5614 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5615 do {
5616 int Lane =
5617 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5618 assert(Lane >= 0 && "Lane not set");
5619 if (isa<StoreInst>(In) &&
5620 !Bundle->getTreeEntry()->ReorderIndices.empty())
5621 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5622 assert(Lane < static_cast<int>(
5623 Bundle->getTreeEntry()->Scalars.size()) &&
5624 "Couldn't find extract lane");
5625
5626 // Since vectorization tree is being built recursively this
5627 // assertion ensures that the tree entry has all operands set
5628 // before reaching this code. Couple of exceptions known at the
5629 // moment are extracts where their second (immediate) operand is
5630 // not added. Since immediates do not affect scheduler behavior
5631 // this is considered okay.
5632 assert(In &&
 5633 (isa<ExtractValueInst, ExtractElementInst>(In) ||
 5634 In->getNumOperands() ==
5635 Bundle->getTreeEntry()->getNumOperands() ||
5636 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5637 "Missed TreeEntry operands?");
5638
5639 bool IsNonSchedulableWithParentPhiNode =
5640 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5641 Bundle->getTreeEntry()->UserTreeIndex &&
5642 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5643 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5644 Instruction::PHI;
 5645 // Count the number of unique phi nodes which are the parents of the
 5646 // parent entry, and exit if all the unique phis are processed.
5647 if (IsNonSchedulableWithParentPhiNode) {
5648 const TreeEntry *ParentTE =
5649 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5650 Value *User = ParentTE->Scalars[Lane];
5651 if (!ParentsUniqueUsers.insert(User).second)
5652 break;
5653 }
5654
5655 for (unsigned OpIdx :
5656 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
5657 if (auto *I = dyn_cast<Instruction>(
5658 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5659 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
5660 << *I << "\n");
5661 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5662 }
5663 // If parent node is schedulable, it will be handled correctly.
5664 if (!IsNonSchedulableWithParentPhiNode)
5665 break;
5666 It = std::find(std::next(It),
5667 Bundle->getTreeEntry()->Scalars.end(), In);
5668 } while (It != Bundle->getTreeEntry()->Scalars.end());
5669 }
5670 } else {
5671 // If BundleMember is a stand-alone instruction, no operand reordering
5672 // has taken place, so we directly access its operands.
5673 for (Use &U : BundleMember->getInst()->operands()) {
5674 if (auto *I = dyn_cast<Instruction>(U.get())) {
5676 << "SLP: check for readiness (def): " << *I << "\n");
5677 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5678 }
5679 }
5680 }
5681 // Handle the memory dependencies.
5682 auto *SD = dyn_cast<ScheduleData>(BundleMember);
5683 if (!SD)
5684 return;
5685 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5686 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5687 if (!VisitedMemory.insert(MemoryDep).second)
5688 continue;
5689 // There are no more unscheduled dependencies after decrementing,
5690 // so we can put the dependent instruction into the ready list.
5691 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5692 << *MemoryDep << "\n");
5693 DecrUnsched(MemoryDep);
5694 }
5695 // Handle the control dependencies.
5696 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5697 for (ScheduleData *Dep : SD->getControlDependencies()) {
5698 if (!VisitedControl.insert(Dep).second)
5699 continue;
5700 // There are no more unscheduled dependencies after decrementing,
5701 // so we can put the dependent instruction into the ready list.
5703 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5704 DecrUnsched(Dep, /*IsControl=*/true);
5705 }
5706 };
5707 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
5708 SD->setScheduled(/*Scheduled=*/true);
5709 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
 5710 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
 5711 SmallVector<ScheduleBundle *> Bundles;
 5712 Instruction *In = SD->getInst();
5713 if (R.isVectorized(In)) {
5714 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
5715 for (TreeEntry *TE : Entries) {
5717 In->getNumOperands() != TE->getNumOperands())
5718 continue;
5719 auto &BundlePtr =
5720 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5721 BundlePtr->setTreeEntry(TE);
5722 BundlePtr->add(SD);
5723 Bundles.push_back(BundlePtr.get());
5724 }
5725 }
5726 ProcessBundleMember(SD, Bundles);
5727 } else {
5728 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
5729 Bundle.setScheduled(/*Scheduled=*/true);
5730 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5731 auto AreAllBundlesScheduled =
5732 [&](const ScheduleEntity *SD,
5733 ArrayRef<ScheduleBundle *> SDBundles) {
 5734 if (isa<ScheduleCopyableData>(SD))
 5735 return true;
5736 return !SDBundles.empty() &&
5737 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5738 return SDBundle->isScheduled();
5739 });
5740 };
5741 for (ScheduleEntity *SD : Bundle.getBundle()) {
 5742 ArrayRef<ScheduleBundle *> SDBundles;
 5743 if (!isa<ScheduleCopyableData>(SD))
 5744 SDBundles = getScheduleBundles(SD->getInst());
5745 if (AreAllBundlesScheduled(SD, SDBundles)) {
5746 SD->setScheduled(/*Scheduled=*/true);
5747 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
5748 : SDBundles);
5749 }
5750 }
5751 }
5752 }
5753
5754 /// Verify basic self consistency properties of the data structure.
5755 void verify() {
5756 if (!ScheduleStart)
5757 return;
5758
5759 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5760 ScheduleStart->comesBefore(ScheduleEnd) &&
5761 "Not a valid scheduling region?");
5762
5763 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5764 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5765 if (!Bundles.empty()) {
5766 for (ScheduleBundle *Bundle : Bundles) {
5767 assert(isInSchedulingRegion(*Bundle) &&
5768 "primary schedule data not in window?");
5769 Bundle->verify();
5770 }
5771 continue;
5772 }
5773 auto *SD = getScheduleData(I);
5774 if (!SD)
5775 continue;
5776 assert(isInSchedulingRegion(*SD) &&
5777 "primary schedule data not in window?");
5778 SD->verify();
5779 }
5780
5781 assert(all_of(ReadyInsts,
5782 [](const ScheduleEntity *Bundle) {
5783 return Bundle->isReady();
5784 }) &&
5785 "item in ready list not ready?");
5786 }
5787
5788 /// Put all instructions into the ReadyList which are ready for scheduling.
5789 template <typename ReadyListType>
5790 void initialFillReadyList(ReadyListType &ReadyList) {
5791 SmallPtrSet<ScheduleBundle *, 16> Visited;
5792 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5793 ScheduleData *SD = getScheduleData(I);
5794 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5795 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
5796 !Bundles.empty()) {
5797 for (ScheduleBundle *Bundle : Bundles) {
5798 if (!Visited.insert(Bundle).second)
5799 continue;
5800 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5801 ReadyList.insert(Bundle);
5802 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5803 << *Bundle << "\n");
5804 }
5805 }
5806 continue;
5807 }
5808 ReadyList.insert(SD);
5810 << "SLP: initially in ready list: " << *SD << "\n");
5811 }
5812 }
5813 }
5814
5815 /// Build a bundle from the ScheduleData nodes corresponding to the
5816 /// scalar instruction for each lane.
5817 /// \param VL The list of scalar instructions.
5818 /// \param S The state of the instructions.
5819 /// \param EI The edge in the SLP graph or the user node/operand number.
5820 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5821 const InstructionsState &S, const EdgeInfo &EI);
5822
5823 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5824 /// cyclic dependencies. This is only a dry-run, no instructions are
5825 /// actually moved at this stage.
5826 /// \returns the scheduling bundle. The returned Optional value is not
5827 /// std::nullopt if \p VL is allowed to be scheduled.
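    /// A minimal call-site sketch (illustrative only; \c S and \c EI are
    /// assumed to be computed for \p VL beforehand):
    /// \code
    ///   std::optional<ScheduleBundle *> B = tryScheduleBundle(VL, SLP, S, EI);
    ///   if (!B)
    ///     ; // cyclic dependency: this bundle cannot be scheduled
    /// \endcode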
5828 std::optional<ScheduleBundle *>
5829 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5830 const InstructionsState &S, const EdgeInfo &EI);
5831
5832 /// Allocates schedule data chunk.
5833 ScheduleData *allocateScheduleDataChunks();
5834
5835 /// Extends the scheduling region so that V is inside the region.
5836 /// \returns true if the region size is within the limit.
5837 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5838
5839 /// Initialize the ScheduleData structures for new instructions in the
5840 /// scheduling region.
5841 void initScheduleData(Instruction *FromI, Instruction *ToI,
5842 ScheduleData *PrevLoadStore,
5843 ScheduleData *NextLoadStore);
5844
5845 /// Updates the dependency information of a bundle and of all instructions/
5846 /// bundles which depend on the original bundle.
5847 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5848 BoUpSLP *SLP,
5849 ArrayRef<ScheduleData *> ControlDeps = {});
5850
 5851 /// Sets all instructions in the scheduling region to un-scheduled.
5852 void resetSchedule();
5853
5854 BasicBlock *BB;
5855
5856 /// Simple memory allocation for ScheduleData.
 5857 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
 5858
5859 /// The size of a ScheduleData array in ScheduleDataChunks.
5860 int ChunkSize;
5861
5862 /// The allocator position in the current chunk, which is the last entry
5863 /// of ScheduleDataChunks.
5864 int ChunkPos;
5865
5866 /// Attaches ScheduleData to Instruction.
5867 /// Note that the mapping survives during all vectorization iterations, i.e.
5868 /// ScheduleData structures are recycled.
5869 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5870
5871 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
5872 /// number) and the operand instruction, represented as copyable element.
5873 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5874 std::unique_ptr<ScheduleCopyableData>>
5875 ScheduleCopyableDataMap;
5876
5877 /// Represents mapping between instruction and all related
 5878 /// ScheduleCopyableData (for all uses in the tree, represented as copyable
5879 /// element). The SLP tree may contain several representations of the same
5880 /// instruction.
5881 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5882 ScheduleCopyableDataMapByInst;
5883
5884 /// Represents mapping between user value and operand number, the operand
5885 /// value and all related ScheduleCopyableData. The relation is 1:n, because
 5886 /// the same user may reference the same operand in different tree entries
5887 /// and the operand may be modelled by the different copyable data element.
5888 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
 5889 SmallVector<ScheduleCopyableData *>>
 5890 ScheduleCopyableDataMapByInstUser;
5891
5892 /// Represents mapping between instruction and all related
5893 /// ScheduleCopyableData. It represents the mapping between the actual
5894 /// instruction and the last copyable data element in the chain. E.g., if
5895 /// the graph models the following instructions:
5896 /// %0 = non-add instruction ...
5897 /// ...
5898 /// %4 = add %3, 1
5899 /// %5 = add %4, 1
5900 /// %6 = insertelement poison, %0, 0
5901 /// %7 = insertelement %6, %5, 1
5902 /// And the graph is modeled as:
5903 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
5904 /// -> [1, 0] -> [%1, 0]
5905 ///
5906 /// this map will map %0 only to the copyable element <1>, which is the last
5907 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
5908 /// keep the map to <0>, not the %0.
5909 SmallDenseMap<const Instruction *,
5910 SmallSetVector<ScheduleCopyableData *, 4>>
5911 ScheduleCopyableDataMapByUsers;
5912
5913 /// Attaches ScheduleBundle to Instruction.
5914 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5915 ScheduledBundles;
5916 /// The list of ScheduleBundles.
5917 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5918
5919 /// The ready-list for scheduling (only used for the dry-run).
5920 SetVector<ScheduleEntity *> ReadyInsts;
5921
5922 /// The first instruction of the scheduling region.
5923 Instruction *ScheduleStart = nullptr;
5924
5925 /// The first instruction _after_ the scheduling region.
5926 Instruction *ScheduleEnd = nullptr;
5927
5928 /// The first memory accessing instruction in the scheduling region
5929 /// (can be null).
5930 ScheduleData *FirstLoadStoreInRegion = nullptr;
5931
5932 /// The last memory accessing instruction in the scheduling region
5933 /// (can be null).
5934 ScheduleData *LastLoadStoreInRegion = nullptr;
5935
5936 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5937 /// region? Used to optimize the dependence calculation for the
5938 /// common case where there isn't.
5939 bool RegionHasStackSave = false;
5940
5941 /// The current size of the scheduling region.
5942 int ScheduleRegionSize = 0;
5943
5944 /// The maximum size allowed for the scheduling region.
5945 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
5946
 5947 /// The ID of the scheduling region. For a new vectorization iteration this
 5948 /// is incremented, which "removes" all ScheduleData from the region.
5949 /// Make sure that the initial SchedulingRegionID is greater than the
5950 /// initial SchedulingRegionID in ScheduleData (which is 0).
5951 int SchedulingRegionID = 1;
5952 };
5953
5954 /// Attaches the BlockScheduling structures to basic blocks.
5955 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5956
5957 /// Performs the "real" scheduling. Done before vectorization is actually
5958 /// performed in a basic block.
5959 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
5960
5961 /// List of users to ignore during scheduling and that don't need extracting.
5962 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
5963
5964 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
5965 /// sorted SmallVectors of unsigned.
5966 struct OrdersTypeDenseMapInfo {
5967 static OrdersType getEmptyKey() {
5968 OrdersType V;
5969 V.push_back(~1U);
5970 return V;
5971 }
5972
5973 static OrdersType getTombstoneKey() {
5974 OrdersType V;
5975 V.push_back(~2U);
5976 return V;
5977 }
5978
5979 static unsigned getHashValue(const OrdersType &V) {
5980 return static_cast<unsigned>(hash_combine_range(V));
5981 }
5982
5983 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
5984 return LHS == RHS;
5985 }
5986 };
5987
5988 // Analysis and block reference.
5989 Function *F;
5990 ScalarEvolution *SE;
5991 TargetTransformInfo *TTI;
5992 TargetLibraryInfo *TLI;
5993 LoopInfo *LI;
5994 DominatorTree *DT;
5995 AssumptionCache *AC;
5996 DemandedBits *DB;
5997 const DataLayout *DL;
5998 OptimizationRemarkEmitter *ORE;
5999
6000 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6001 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6002
6003 /// Instruction builder to construct the vectorized tree.
6004 IRBuilder<TargetFolder> Builder;
6005
6006 /// A map of scalar integer values to the smallest bit width with which they
6007 /// can legally be represented. The values map to (width, signed) pairs,
6008 /// where "width" indicates the minimum bit width and "signed" is True if the
 6009 /// value must be sign-extended, rather than zero-extended, back to its
6010 /// original width.
6011 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6012
6013 /// Final size of the reduced vector, if the current graph represents the
6014 /// input for the reduction and it was possible to narrow the size of the
6015 /// reduction.
6016 unsigned ReductionBitWidth = 0;
6017
6018 /// Canonical graph size before the transformations.
6019 unsigned BaseGraphSize = 1;
6020
6021 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6022 /// type sizes, used in the tree.
6023 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6024
 6025 /// Indices of the vectorized nodes, which are supposed to be the roots of a new
 6026 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
6027 DenseSet<unsigned> ExtraBitWidthNodes;
6028};
6029
6030} // end namespace slpvectorizer
6031
6032template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
6036 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6037 SecondInfo::getEmptyKey());
6038 }
6039
6041 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6042 SecondInfo::getTombstoneKey());
6043 }
6044
6045 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6046 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
6047 SecondInfo::getHashValue(Val.EdgeIdx));
6048 }
6049
6050 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6051 const BoUpSLP::EdgeInfo &RHS) {
6052 return LHS == RHS;
6053 }
6054};
6055
6056template <> struct GraphTraits<BoUpSLP *> {
6057 using TreeEntry = BoUpSLP::TreeEntry;
6058
6059 /// NodeRef has to be a pointer per the GraphWriter.
6061
6062 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6063
6064 /// Add the VectorizableTree to the index iterator to be able to return
6065 /// TreeEntry pointers.
6067 : public iterator_adaptor_base<
6068 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6070
6074
6075 NodeRef operator*() { return I->UserTE; }
6076 };
6077
6079 return R.VectorizableTree[0].get();
6080 }
6081
6083 return {&N->UserTreeIndex, N->Container};
6084 }
6085
6087 return {&N->UserTreeIndex + 1, N->Container};
6088 }
6089
6090 /// For the node iterator we just need to turn the TreeEntry iterator into a
6091 /// TreeEntry* iterator so that it dereferences to NodeRef.
6093 using ItTy = ContainerTy::iterator;
6094 ItTy It;
6095
6096 public:
6097 nodes_iterator(const ItTy &It2) : It(It2) {}
6098 NodeRef operator*() { return It->get(); }
6100 ++It;
6101 return *this;
6102 }
6103 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6104 };
6105
6107 return nodes_iterator(R->VectorizableTree.begin());
6108 }
6109
6111 return nodes_iterator(R->VectorizableTree.end());
6112 }
6113
6114 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6115};
6116
6117template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6118 using TreeEntry = BoUpSLP::TreeEntry;
6119
6120 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6121
6122 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6123 std::string Str;
6124 raw_string_ostream OS(Str);
6125 OS << Entry->Idx << ".\n";
6126 if (isSplat(Entry->Scalars))
6127 OS << "<splat> ";
6128 for (auto *V : Entry->Scalars) {
6129 OS << *V;
6130 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6131 return EU.Scalar == V;
6132 }))
6133 OS << " <extract>";
6134 OS << "\n";
6135 }
6136 return Str;
6137 }
6138
6139 static std::string getNodeAttributes(const TreeEntry *Entry,
6140 const BoUpSLP *) {
6141 if (Entry->isGather())
6142 return "color=red";
6143 if (Entry->State == TreeEntry::ScatterVectorize ||
6144 Entry->State == TreeEntry::StridedVectorize ||
6145 Entry->State == TreeEntry::CompressVectorize)
6146 return "color=blue";
6147 return "";
6148 }
6149};
6150
6151} // end namespace llvm
6152
6155 for (auto *I : DeletedInstructions) {
6156 if (!I->getParent()) {
 6157 // Temporarily insert the instruction back to erase it from its parent
 6158 // and free the memory later.
6159 if (isa<PHINode>(I))
6160 // Phi nodes must be the very first instructions in the block.
6161 I->insertBefore(F->getEntryBlock(),
6162 F->getEntryBlock().getFirstNonPHIIt());
6163 else
6164 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6165 continue;
6166 }
6167 for (Use &U : I->operands()) {
6168 auto *Op = dyn_cast<Instruction>(U.get());
6169 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6171 DeadInsts.emplace_back(Op);
6172 }
6173 I->dropAllReferences();
6174 }
6175 for (auto *I : DeletedInstructions) {
6176 assert(I->use_empty() &&
6177 "trying to erase instruction with users.");
6178 I->eraseFromParent();
6179 }
6180
 6181 // Clean up any dead scalar code feeding the vectorized instructions.
6183
6184#ifdef EXPENSIVE_CHECKS
6185 // If we could guarantee that this call is not extremely slow, we could
6186 // remove the ifdef limitation (see PR47712).
6187 assert(!verifyFunction(*F, &dbgs()));
6188#endif
6189}
6190
6191/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6192/// contains the original mask for the scalars reused in the node. The procedure
6193/// transforms this mask in accordance with the given \p Mask.
6195 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6196 "Expected non-empty mask.");
6197 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6198 Prev.swap(Reuses);
6199 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6200 if (Mask[I] != PoisonMaskElem)
6201 Reuses[Mask[I]] = Prev[I];
6202}
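//
// A small worked example of the transform above (illustrative values only),
// assuming a 4-element node:
//   Reuses = [3, 2, 1, 0], Mask = [1, 0, 3, 2]
//   Reuses[Mask[0]] = Prev[0]  ->  Reuses[1] = 3
//   Reuses[Mask[1]] = Prev[1]  ->  Reuses[0] = 2
//   Reuses[Mask[2]] = Prev[2]  ->  Reuses[3] = 1
//   Reuses[Mask[3]] = Prev[3]  ->  Reuses[2] = 0
// giving Reuses = [2, 3, 0, 1]; lanes that are PoisonMaskElem in Mask keep
// their previous value.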
6203
6204/// Reorders the given \p Order according to the given \p Mask. \p Order is
6205/// the original order of the scalars. The procedure transforms the provided order
6206/// in accordance with the given \p Mask. If the resulting \p Order is just an
6207/// identity order, \p Order is cleared.
6209 bool BottomOrder = false) {
6210 assert(!Mask.empty() && "Expected non-empty mask.");
6211 unsigned Sz = Mask.size();
6212 if (BottomOrder) {
6213 SmallVector<unsigned> PrevOrder;
6214 if (Order.empty()) {
6215 PrevOrder.resize(Sz);
6216 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6217 } else {
6218 PrevOrder.swap(Order);
6219 }
6220 Order.assign(Sz, Sz);
6221 for (unsigned I = 0; I < Sz; ++I)
6222 if (Mask[I] != PoisonMaskElem)
6223 Order[I] = PrevOrder[Mask[I]];
6224 if (all_of(enumerate(Order), [&](const auto &Data) {
6225 return Data.value() == Sz || Data.index() == Data.value();
6226 })) {
6227 Order.clear();
6228 return;
6229 }
6230 fixupOrderingIndices(Order);
6231 return;
6232 }
6233 SmallVector<int> MaskOrder;
6234 if (Order.empty()) {
6235 MaskOrder.resize(Sz);
6236 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6237 } else {
6238 inversePermutation(Order, MaskOrder);
6239 }
6240 reorderReuses(MaskOrder, Mask);
6241 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6242 Order.clear();
6243 return;
6244 }
6245 Order.assign(Sz, Sz);
6246 for (unsigned I = 0; I < Sz; ++I)
6247 if (MaskOrder[I] != PoisonMaskElem)
6248 Order[MaskOrder[I]] = I;
6249 fixupOrderingIndices(Order);
6250}
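//
// For instance (illustrative values), with Order = [1, 0, 3, 2] and
// Mask = [1, 0, 3, 2]: the inverse permutation of Order is [1, 0, 3, 2], and
// reordering it by Mask yields the identity mask [0, 1, 2, 3], so Order is
// cleared - after applying the mask no extra reordering is required.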
6251
6252std::optional<BoUpSLP::OrdersType>
6253BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6254 bool TopToBottom, bool IgnoreReorder) {
6255 assert(TE.isGather() && "Expected gather node only.");
6256 // Try to find subvector extract/insert patterns and reorder only such
6257 // patterns.
6258 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6259 Type *ScalarTy = GatheredScalars.front()->getType();
6260 size_t NumScalars = GatheredScalars.size();
6261 if (!isValidElementType(ScalarTy))
6262 return std::nullopt;
6263 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6264 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
6265 SmallVector<int> ExtractMask;
6266 SmallVector<int> Mask;
6269 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6271 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6272 /*ForOrder=*/true);
6273 // No shuffled operands - ignore.
6274 if (GatherShuffles.empty() && ExtractShuffles.empty())
6275 return std::nullopt;
6276 OrdersType CurrentOrder(NumScalars, NumScalars);
6277 if (GatherShuffles.size() == 1 &&
6278 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6279 Entries.front().front()->isSame(TE.Scalars)) {
 6280 // If the node fully matches during a whole-tree rotation - no need to
 6281 // consider the matching order, the whole tree is rotated instead.
6282 if (TopToBottom)
6283 return std::nullopt;
6284 // No need to keep the order for the same user node.
6285 if (Entries.front().front()->UserTreeIndex.UserTE ==
6286 TE.UserTreeIndex.UserTE)
6287 return std::nullopt;
6288 // No need to keep the order for the matched root node, if it can be freely
6289 // reordered.
6290 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6291 return std::nullopt;
 6292 // If only 2 elements are shuffled and the matching node has reversed
 6293 // reuses - no need to compute an order, both orders work equally well.
6294 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6295 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6296 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6297 [](const auto &P) {
6298 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6299 }))
6300 return std::nullopt;
6301
6302 // Perfect match in the graph, will reuse the previously vectorized
6303 // node. Cost is 0.
6304 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6305 return CurrentOrder;
6306 }
6307 auto IsSplatMask = [](ArrayRef<int> Mask) {
6308 int SingleElt = PoisonMaskElem;
6309 return all_of(Mask, [&](int I) {
6310 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6311 SingleElt = I;
6312 return I == PoisonMaskElem || I == SingleElt;
6313 });
6314 };
6315 // Exclusive broadcast mask - ignore.
6316 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6317 (Entries.size() != 1 ||
6318 Entries.front().front()->ReorderIndices.empty())) ||
6319 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6320 return std::nullopt;
6321 SmallBitVector ShuffledSubMasks(NumParts);
6322 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6323 ArrayRef<int> Mask, int PartSz, int NumParts,
6324 function_ref<unsigned(unsigned)> GetVF) {
6325 for (int I : seq<int>(0, NumParts)) {
6326 if (ShuffledSubMasks.test(I))
6327 continue;
6328 const int VF = GetVF(I);
6329 if (VF == 0)
6330 continue;
6331 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6332 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6333 // Shuffle of at least 2 vectors - ignore.
6334 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6335 llvm::fill(Slice, NumScalars);
6336 ShuffledSubMasks.set(I);
6337 continue;
6338 }
 6339 // Try to include as many elements from the mask as possible.
6340 int FirstMin = INT_MAX;
 6341 bool SecondVecFound = false;
6342 for (int K : seq<int>(Limit)) {
6343 int Idx = Mask[I * PartSz + K];
6344 if (Idx == PoisonMaskElem) {
6345 Value *V = GatheredScalars[I * PartSz + K];
6346 if (isConstant(V) && !isa<PoisonValue>(V)) {
6347 SecondVecFound = true;
6348 break;
6349 }
6350 continue;
6351 }
6352 if (Idx < VF) {
6353 if (FirstMin > Idx)
6354 FirstMin = Idx;
6355 } else {
6356 SecondVecFound = true;
6357 break;
6358 }
6359 }
6360 FirstMin = (FirstMin / PartSz) * PartSz;
6361 // Shuffle of at least 2 vectors - ignore.
6362 if (SecondVecFound) {
6363 llvm::fill(Slice, NumScalars);
6364 ShuffledSubMasks.set(I);
6365 continue;
6366 }
6367 for (int K : seq<int>(Limit)) {
6368 int Idx = Mask[I * PartSz + K];
6369 if (Idx == PoisonMaskElem)
6370 continue;
6371 Idx -= FirstMin;
6372 if (Idx >= PartSz) {
6373 SecondVecFound = true;
6374 break;
6375 }
6376 if (CurrentOrder[I * PartSz + Idx] >
6377 static_cast<unsigned>(I * PartSz + K) &&
6378 CurrentOrder[I * PartSz + Idx] !=
6379 static_cast<unsigned>(I * PartSz + Idx))
6380 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6381 }
6382 // Shuffle of at least 2 vectors - ignore.
6383 if (SecondVecFound) {
6384 llvm::fill(Slice, NumScalars);
6385 ShuffledSubMasks.set(I);
6386 continue;
6387 }
6388 }
6389 };
6390 int PartSz = getPartNumElems(NumScalars, NumParts);
6391 if (!ExtractShuffles.empty())
6392 TransformMaskToOrder(
6393 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6394 if (!ExtractShuffles[I])
6395 return 0U;
6396 unsigned VF = 0;
6397 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6398 for (unsigned Idx : seq<unsigned>(Sz)) {
6399 int K = I * PartSz + Idx;
6400 if (ExtractMask[K] == PoisonMaskElem)
6401 continue;
6402 if (!TE.ReuseShuffleIndices.empty())
6403 K = TE.ReuseShuffleIndices[K];
6404 if (K == PoisonMaskElem)
6405 continue;
6406 if (!TE.ReorderIndices.empty())
6407 K = std::distance(TE.ReorderIndices.begin(),
6408 find(TE.ReorderIndices, K));
6409 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6410 if (!EI)
6411 continue;
6412 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
6413 ->getElementCount()
6414 .getKnownMinValue());
6415 }
6416 return VF;
6417 });
6418 // Check special corner case - single shuffle of the same entry.
6419 if (GatherShuffles.size() == 1 && NumParts != 1) {
6420 if (ShuffledSubMasks.any())
6421 return std::nullopt;
6422 PartSz = NumScalars;
6423 NumParts = 1;
6424 }
6425 if (!Entries.empty())
6426 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6427 if (!GatherShuffles[I])
6428 return 0U;
6429 return std::max(Entries[I].front()->getVectorFactor(),
6430 Entries[I].back()->getVectorFactor());
6431 });
6432 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6433 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6434 return std::nullopt;
6435 return std::move(CurrentOrder);
6436}
6437
6438static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6439 const TargetLibraryInfo &TLI,
6440 bool CompareOpcodes = true) {
6443 return false;
6444 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6445 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6446 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6447 (!GEP2 || GEP2->getNumOperands() == 2) &&
6448 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6449 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6450 !CompareOpcodes ||
6451 (GEP1 && GEP2 &&
6452 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6453}
6454
6455/// Calculates minimal alignment as a common alignment.
6456template <typename T>
6458 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6459 for (Value *V : VL)
6460 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6461 return CommonAlignment;
6462}
6463
6464/// Check if \p Order represents reverse order.
6466 assert(!Order.empty() &&
6467 "Order is empty. Please check it before using isReverseOrder.");
6468 unsigned Sz = Order.size();
6469 return all_of(enumerate(Order), [&](const auto &Pair) {
6470 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6471 });
6472}
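//
// E.g. (illustrative), for a 4-element node, Order = [3, 2, 1, 0] (or
// [3, 2, 4, 0], where 4 is the "unused" sentinel) is treated as reversed,
// while [1, 0, 3, 2] is not.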
6473
6474/// Checks if the provided list of pointers \p PointerOps represents the strided
6475/// pointers for type \p ElemTy. If they do not, nullptr is returned.
6476/// Otherwise, the SCEV of the stride value is returned.
6477static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6478 const DataLayout &DL, ScalarEvolution &SE,
6479 SmallVectorImpl<unsigned> &SortedIndices) {
6481 const SCEV *PtrSCEVLowest = nullptr;
6482 const SCEV *PtrSCEVHighest = nullptr;
6483 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6484 // addresses).
6485 for (Value *Ptr : PointerOps) {
6486 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6487 if (!PtrSCEV)
6488 return nullptr;
6489 SCEVs.push_back(PtrSCEV);
6490 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6491 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6492 continue;
6493 }
6494 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6495 if (isa<SCEVCouldNotCompute>(Diff))
6496 return nullptr;
6497 if (Diff->isNonConstantNegative()) {
6498 PtrSCEVLowest = PtrSCEV;
6499 continue;
6500 }
6501 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
6502 if (isa<SCEVCouldNotCompute>(Diff1))
6503 return nullptr;
6504 if (Diff1->isNonConstantNegative()) {
6505 PtrSCEVHighest = PtrSCEV;
6506 continue;
6507 }
6508 }
6509 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6510 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
6511 if (isa<SCEVCouldNotCompute>(Dist))
6512 return nullptr;
6513 int Size = DL.getTypeStoreSize(ElemTy);
6514 auto TryGetStride = [&](const SCEV *Dist,
6515 const SCEV *Multiplier) -> const SCEV * {
6516 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
6517 if (M->getOperand(0) == Multiplier)
6518 return M->getOperand(1);
6519 if (M->getOperand(1) == Multiplier)
6520 return M->getOperand(0);
6521 return nullptr;
6522 }
6523 if (Multiplier == Dist)
6524 return SE.getConstant(Dist->getType(), 1);
6525 return SE.getUDivExactExpr(Dist, Multiplier);
6526 };
 6527 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6528 const SCEV *Stride = nullptr;
6529 if (Size != 1 || SCEVs.size() > 2) {
6530 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
6531 Stride = TryGetStride(Dist, Sz);
6532 if (!Stride)
6533 return nullptr;
6534 }
6535 if (!Stride || isa<SCEVConstant>(Stride))
6536 return nullptr;
 6537 // Iterate through all pointers and check that all distances are
 6538 // unique multiples of Stride.
6539 using DistOrdPair = std::pair<int64_t, int>;
6540 auto Compare = llvm::less_first();
6541 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6542 int Cnt = 0;
6543 bool IsConsecutive = true;
6544 for (const SCEV *PtrSCEV : SCEVs) {
6545 unsigned Dist = 0;
6546 if (PtrSCEV != PtrSCEVLowest) {
6547 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
6548 const SCEV *Coeff = TryGetStride(Diff, Stride);
6549 if (!Coeff)
6550 return nullptr;
6551 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
6552 if (!SC || isa<SCEVCouldNotCompute>(SC))
6553 return nullptr;
6554 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
6555 SE.getMulExpr(Stride, SC)))
6556 ->isZero())
6557 return nullptr;
6558 Dist = SC->getAPInt().getZExtValue();
6559 }
6560 // If the strides are not the same or repeated, we can't vectorize.
6561 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6562 return nullptr;
6563 auto Res = Offsets.emplace(Dist, Cnt);
6564 if (!Res.second)
6565 return nullptr;
6566 // Consecutive order if the inserted element is the last one.
6567 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6568 ++Cnt;
6569 }
6570 if (Offsets.size() != SCEVs.size())
6571 return nullptr;
6572 SortedIndices.clear();
6573 if (!IsConsecutive) {
6574 // Fill SortedIndices array only if it is non-consecutive.
6575 SortedIndices.resize(PointerOps.size());
6576 Cnt = 0;
6577 for (const std::pair<int64_t, int> &Pair : Offsets) {
6578 SortedIndices[Cnt] = Pair.second;
6579 ++Cnt;
6580 }
6581 }
6582 return Stride;
6583}
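//
// A rough illustration (hypothetical IR names): for i32 elements and pointer
// operands
//   %off2 = mul i64 %s, 2
//   %off3 = mul i64 %s, 3
//   %p0 = getelementptr i32, ptr %p, i64 0
//   %p1 = getelementptr i32, ptr %p, i64 %s
//   %p2 = getelementptr i32, ptr %p, i64 %off2
//   %p3 = getelementptr i32, ptr %p, i64 %off3
// the byte distance between the lowest and highest pointer is 12 * %s, so
// dividing by element_size * (num_elems - 1) == 12 recovers the runtime
// stride SCEV %s (in elements). SortedIndices stays empty because the
// pointers already appear in stride order; a constant stride is rejected
// here and left to the constant-stride analysis instead.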
6584
6585static std::pair<InstructionCost, InstructionCost>
6586getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6587 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6588 Type *ScalarTy, VectorType *VecTy);
6589
6590/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6591/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
6592/// insert-subvector pattern.
6593static InstructionCost
6595 VectorType *Tp, ArrayRef<int> Mask = {},
6597 int Index = 0, VectorType *SubTp = nullptr,
6599 VectorType *DstTy = Tp;
6600 if (!Mask.empty())
6601 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
6602
6603 if (Kind != TTI::SK_PermuteTwoSrc)
6604 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6605 Args);
6606 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6607 int NumSubElts;
6609 Mask, NumSrcElts, NumSubElts, Index)) {
6610 if (Index + NumSubElts > NumSrcElts &&
6611 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6612 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
6613 TTI::TCK_RecipThroughput, Index, Tp);
6614 }
6615 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6616 Args);
6617}
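//
// For example (a sketch with assumed values): with Tp = <4 x i32> and a
// two-source mask <0, 1, 4, 5, 6, 7, u, u>, the mask is recognized as
// inserting the low 4 elements of the second source at index 2, which runs
// past the 4-element source, so the shuffle is costed as SK_InsertSubvector
// into the widened <8 x i32> destination rather than as a generic
// SK_PermuteTwoSrc.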
6618
6619/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6620/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6621/// instead of a scalar.
6622static InstructionCost
6624 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6625 bool Extract, TTI::TargetCostKind CostKind,
6626 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6628 "ScalableVectorType is not supported.");
6629 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6630 getNumElements(Ty) &&
6631 "Incorrect usage.");
6632 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6633 assert(SLPReVec && "Only supported by REVEC.");
6634 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6635 // of CreateInsertElement.
6636 unsigned ScalarTyNumElements = VecTy->getNumElements();
6638 for (unsigned I : seq(DemandedElts.getBitWidth())) {
6639 if (!DemandedElts[I])
6640 continue;
6641 if (Insert)
6643 I * ScalarTyNumElements, VecTy);
6644 if (Extract)
6646 I * ScalarTyNumElements, VecTy);
6647 }
6648 return Cost;
6649 }
6650 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6651 CostKind, ForPoisonSrc, VL);
6652}
6653
6654/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6655/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6657 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6658 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6659 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6660 if (Opcode == Instruction::ExtractElement) {
6661 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
6662 assert(SLPReVec && "Only supported by REVEC.");
6663 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6665 cast<VectorType>(Val), {}, CostKind,
6666 Index * VecTy->getNumElements(), VecTy);
6667 }
6668 }
6669 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6670 ScalarUserAndIdx);
6671}
6672
6673/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6674/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6676 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6677 VectorType *VecTy, unsigned Index,
6679 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
6680 assert(SLPReVec && "Only supported by REVEC.");
6681 auto *SubTp =
6682 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6684 Index * ScalarTy->getNumElements(), SubTp) +
6685 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
6686 CostKind);
6687 }
6688 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6689}
6690
6691/// Creates a subvector insert. Generates the shuffle using \p Generator or
6692/// a default shuffle.
6694 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6695 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6696 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
6697 return Vec;
6698 const unsigned SubVecVF = getNumElements(V->getType());
 6699 // Create a shuffle; insertvector requires that the index be a multiple of
 6700 // the subvector length.
6701 const unsigned VecVF = getNumElements(Vec->getType());
6702 SmallVector<int> Mask(VecVF, PoisonMaskElem);
6703 if (isa<PoisonValue>(Vec)) {
6704 auto *Begin = std::next(Mask.begin(), Index);
6705 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6706 Vec = Builder.CreateShuffleVector(V, Mask);
6707 return Vec;
6708 }
6709 std::iota(Mask.begin(), Mask.end(), 0);
6710 std::iota(std::next(Mask.begin(), Index),
6711 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6712 if (Generator)
6713 return Generator(Vec, V, Mask);
6714 // 1. Resize V to the size of Vec.
6715 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6716 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6717 V = Builder.CreateShuffleVector(V, ResizeMask);
6718 // 2. Insert V into Vec.
6719 return Builder.CreateShuffleVector(Vec, V, Mask);
6720}
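//
// A small worked illustration (assumed types): with Vec : <8 x i32>,
// V : <2 x i32> and Index == 4,
//  * if Vec is poison, a single shuffle of V with mask
//    <u, u, u, u, 0, 1, u, u> places the subvector in lanes 4-5;
//  * otherwise V is first widened with mask <0, 1, u, u, u, u, u, u> and then
//    blended into Vec with mask <0, 1, 2, 3, 8, 9, 6, 7>.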
6721
6722/// Generates a subvector extract using a default shuffle.
6724 unsigned SubVecVF, unsigned Index) {
6725 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6726 std::iota(Mask.begin(), Mask.end(), Index);
6727 return Builder.CreateShuffleVector(Vec, Mask);
6728}
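//
// E.g. (assumed types), extracting a <2 x i32> subvector at Index == 4 from
// an <8 x i32> vector uses the shuffle mask <4, 5>.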
6729
6730/// Builds a compress-like mask for shuffles for the given \p PointerOps, ordered
6731/// with \p Order.
6732/// \return true if the mask represents strided access, false otherwise.
6734 ArrayRef<unsigned> Order, Type *ScalarTy,
6735 const DataLayout &DL, ScalarEvolution &SE,
6736 SmallVectorImpl<int> &CompressMask) {
6737 const unsigned Sz = PointerOps.size();
6738 CompressMask.assign(Sz, PoisonMaskElem);
 6739 // The first element is always set.
6740 CompressMask[0] = 0;
6741 // Check if the mask represents strided access.
6742 std::optional<unsigned> Stride = 0;
6743 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6744 for (unsigned I : seq<unsigned>(1, Sz)) {
6745 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6746 std::optional<int64_t> OptPos =
6747 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
6748 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6749 return false;
6750 unsigned Pos = static_cast<unsigned>(*OptPos);
6751 CompressMask[I] = Pos;
6752 if (!Stride)
6753 continue;
6754 if (*Stride == 0) {
6755 *Stride = Pos;
6756 continue;
6757 }
6758 if (Pos != *Stride * I)
6759 Stride.reset();
6760 }
6761 return Stride.has_value();
6762}
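//
// Two small examples (element offsets are illustrative):
//   offsets {0, 2, 4, 6} -> CompressMask = [0, 2, 4, 6], detected stride 2,
//                           the function returns true;
//   offsets {0, 1, 3, 7} -> CompressMask = [0, 1, 3, 7], no common stride,
//                           the function returns false.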
6763
6764/// Checks if the \p VL can be transformed to a (masked)load + compress or
6765/// (masked) interleaved load.
6767 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6770 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6771 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6772 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6773 VectorType *&LoadVecTy) {
6774 InterleaveFactor = 0;
6775 Type *ScalarTy = VL.front()->getType();
6776 const size_t Sz = VL.size();
6777 auto *VecTy = getWidenedType(ScalarTy, Sz);
6779 SmallVector<int> Mask;
6780 if (!Order.empty())
6781 inversePermutation(Order, Mask);
6782 // Check external uses.
6783 for (const auto [I, V] : enumerate(VL)) {
6784 if (AreAllUsersVectorized(V))
6785 continue;
6786 InstructionCost ExtractCost =
6787 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6788 Mask.empty() ? I : Mask[I]);
6789 InstructionCost ScalarCost =
6790 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
6791 if (ExtractCost <= ScalarCost)
6792 return false;
6793 }
6794 Value *Ptr0;
6795 Value *PtrN;
6796 if (Order.empty()) {
6797 Ptr0 = PointerOps.front();
6798 PtrN = PointerOps.back();
6799 } else {
6800 Ptr0 = PointerOps[Order.front()];
6801 PtrN = PointerOps[Order.back()];
6802 }
6803 std::optional<int64_t> Diff =
6804 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
6805 if (!Diff)
6806 return false;
6807 const size_t MaxRegSize =
6809 .getFixedValue();
6810 // Check for very large distances between elements.
6811 if (*Diff / Sz >= MaxRegSize / 8)
6812 return false;
6813 LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
6814 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
6815 Align CommonAlignment = LI->getAlign();
6816 IsMasked = !isSafeToLoadUnconditionally(
6817 Ptr0, LoadVecTy, CommonAlignment, DL,
6818 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
6819 &TLI);
6820 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6821 LI->getPointerAddressSpace()))
6822 return false;
6823 // TODO: perform the analysis of each scalar load for better
6824 // safe-load-unconditionally analysis.
6825 bool IsStrided =
6826 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6827 assert(CompressMask.size() >= 2 && "At least two elements are required");
6828 SmallVector<Value *> OrderedPointerOps(PointerOps);
6829 if (!Order.empty())
6830 reorderScalars(OrderedPointerOps, Mask);
6831 auto [ScalarGEPCost, VectorGEPCost] =
6832 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
6833 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6834 // The cost of scalar loads.
6835 InstructionCost ScalarLoadsCost =
6836 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
6837 [&](InstructionCost C, Value *V) {
6838 return C + TTI.getInstructionCost(cast<Instruction>(V),
6839 CostKind);
6840 }) +
6841 ScalarGEPCost;
6842 APInt DemandedElts = APInt::getAllOnes(Sz);
6843 InstructionCost GatherCost =
6844 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
6845 /*Insert=*/true,
6846 /*Extract=*/false, CostKind) +
6847 ScalarLoadsCost;
6848 InstructionCost LoadCost = 0;
6849 if (IsMasked) {
6850 LoadCost =
6851 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6852 LI->getPointerAddressSpace(), CostKind);
6853 } else {
6854 LoadCost =
6855 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6856 LI->getPointerAddressSpace(), CostKind);
6857 }
6858 if (IsStrided && !IsMasked && Order.empty()) {
 6859 // Check for potential segmented (interleaved) loads.
6860 VectorType *AlignedLoadVecTy = getWidenedType(
6861 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
6862 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
6863 DL, cast<LoadInst>(VL.back()), &AC, &DT,
6864 &TLI))
6865 AlignedLoadVecTy = LoadVecTy;
6866 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6867 CommonAlignment,
6868 LI->getPointerAddressSpace())) {
6869 InstructionCost InterleavedCost =
6870 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6871 Instruction::Load, AlignedLoadVecTy,
6872 CompressMask[1], {}, CommonAlignment,
6873 LI->getPointerAddressSpace(), CostKind, IsMasked);
6874 if (InterleavedCost < GatherCost) {
6875 InterleaveFactor = CompressMask[1];
6876 LoadVecTy = AlignedLoadVecTy;
6877 return true;
6878 }
6879 }
6880 }
6881 InstructionCost CompressCost = ::getShuffleCost(
6882 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
6883 if (!Order.empty()) {
6884 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6885 for (unsigned I : seq<unsigned>(Sz)) {
6886 NewMask[I] = CompressMask[Mask[I]];
6887 }
6888 CompressMask.swap(NewMask);
6889 }
6890 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6891 return TotalVecCost < GatherCost;
6892}
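//
// A hedged sketch of the shape this cost comparison guards (hypothetical IR
// names): four i32 loads at element offsets 0, 1, 3 and 4 from %p may be
// emitted as
//   %wide = load <5 x i32>, ptr %p      ; or a masked load if not known safe
//   %v    = shufflevector <5 x i32> %wide, <5 x i32> poison,
//                         <4 x i32> <i32 0, i32 1, i32 3, i32 4>
// instead of four scalar loads plus insertelements, when the vector + compress
// cost wins.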
6893
6894/// Checks if the \p VL can be transformed to a (masked)load + compress or
6895/// (masked) interleaved load.
6896static bool
6899 const DataLayout &DL, ScalarEvolution &SE,
6900 AssumptionCache &AC, const DominatorTree &DT,
6901 const TargetLibraryInfo &TLI,
6902 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6903 bool IsMasked;
6904 unsigned InterleaveFactor;
6905 SmallVector<int> CompressMask;
6906 VectorType *LoadVecTy;
6907 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6908 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6909 CompressMask, LoadVecTy);
6910}
6911
6912/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6913/// PointerOps:
6914/// 1. Target with strided load support is detected.
6915/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6916/// potential stride <= MaxProfitableLoadStride and the potential stride is
6917/// power-of-2 (to avoid perf regressions for the very small number of loads)
6918/// and max distance > number of loads, or potential stride is -1.
6919/// 3. The loads are ordered, or number of unordered loads <=
6920/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6921/// to avoid extra costs for very expensive shuffles).
6922/// 4. Any pointer operand is an instruction with users outside of the
6923/// current graph (for masked gathers extra extractelement instructions
6924/// might be required).
6926 Align Alignment, const int64_t Diff,
6927 const size_t Sz) const {
6928 if (Diff % (Sz - 1) != 0)
6929 return false;
6930
6931 // Try to generate strided load node.
6932 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
6933 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
6934 return !isVectorized(U) && !MustGather.contains(U);
6935 });
6936 });
6937
6938 const uint64_t AbsoluteDiff = std::abs(Diff);
6939 auto *VecTy = getWidenedType(ScalarTy, Sz);
6940 if (IsAnyPointerUsedOutGraph ||
6941 (AbsoluteDiff > Sz &&
6943 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6944 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
6945 Diff == -(static_cast<int64_t>(Sz) - 1)) {
6946 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6947 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
6948 return false;
6949 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
6950 return false;
6951 return true;
6952 }
6953 return false;
6954}
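//
// A small numeric illustration: for Sz == 4 i32 loads whose pointer distance
// Diff is -3 elements (i.e. the loads appear in exactly reversed order), the
// candidate stride is Diff / (Sz - 1) == -1 element and the bundle may be
// emitted as a single strided load, provided isLegalStridedLoadStore()
// accepts the <4 x i32> type on the target.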
6955
6957 const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
6958 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
6959 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
6960 const size_t Sz = PointerOps.size();
6961 if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz))
6962 return false;
6963
6964 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6965
 6966 // Iterate through all pointers and check that all distances are
 6967 // unique multiples of Stride.
6969 for (Value *Ptr : PointerOps) {
6970 int64_t Dist = 0;
6971 if (Ptr == PtrN)
6972 Dist = Diff;
6973 else if (Ptr != Ptr0)
6974 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
6975 // If the strides are not the same or repeated, we can't
6976 // vectorize.
6977 if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second)
6978 break;
6979 }
6980 if (Dists.size() == Sz) {
6981 Type *StrideTy = DL->getIndexType(Ptr0->getType());
6982 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6983 SPtrInfo.Ty = getWidenedType(ScalarTy, Sz);
6984 return true;
6985 }
6986 return false;
6987}
6988
6990 Type *ScalarTy, Align CommonAlignment,
6991 SmallVectorImpl<unsigned> &SortedIndices,
6992 StridedPtrInfo &SPtrInfo) const {
6993 const unsigned Sz = PointerOps.size();
6994 FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, Sz);
6995 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
6996 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
6997 return false;
6998 if (const SCEV *Stride =
6999 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) {
7000 SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size());
7001 SPtrInfo.StrideSCEV = Stride;
7002 return true;
7003 }
7004 return false;
7005}
7006
7008 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7009 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7010 unsigned *BestVF, bool TryRecursiveCheck) const {
7011 // Check that a vectorized load would load the same memory as a scalar
7012 // load. For example, we don't want to vectorize loads that are smaller
7013 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7014 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7015 // from such a struct, we read/write packed bits disagreeing with the
7016 // unvectorized version.
7017 if (BestVF)
7018 *BestVF = 0;
7020 return LoadsState::Gather;
7021 Type *ScalarTy = VL0->getType();
7022
7023 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7024 return LoadsState::Gather;
7025
7026 // Make sure all loads in the bundle are simple - we can't vectorize
7027 // atomic or volatile loads.
7028 PointerOps.clear();
7029 const size_t Sz = VL.size();
7030 PointerOps.resize(Sz);
7031 auto *POIter = PointerOps.begin();
7032 for (Value *V : VL) {
7033 auto *L = dyn_cast<LoadInst>(V);
7034 if (!L || !L->isSimple())
7035 return LoadsState::Gather;
7036 *POIter = L->getPointerOperand();
7037 ++POIter;
7038 }
7039
7040 Order.clear();
7041 // Check the order of pointer operands or that all pointers are the same.
7042 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7043
7044 auto *VecTy = getWidenedType(ScalarTy, Sz);
7045 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7046 if (!IsSorted) {
7047 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7048 SPtrInfo))
7050
7051 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7052 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7053 return LoadsState::Gather;
7054
7055 if (!all_of(PointerOps, [&](Value *P) {
7056 return arePointersCompatible(P, PointerOps.front(), *TLI);
7057 }))
7058 return LoadsState::Gather;
7059
7060 } else {
7061 Value *Ptr0;
7062 Value *PtrN;
7063 if (Order.empty()) {
7064 Ptr0 = PointerOps.front();
7065 PtrN = PointerOps.back();
7066 } else {
7067 Ptr0 = PointerOps[Order.front()];
7068 PtrN = PointerOps[Order.back()];
7069 }
7070 std::optional<int64_t> Diff =
7071 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7072 // Check that the sorted loads are consecutive.
7073 if (static_cast<uint64_t>(*Diff) == Sz - 1)
7074 return LoadsState::Vectorize;
7075 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7076 *TLI, [&](Value *V) {
7077 return areAllUsersVectorized(
7078 cast<Instruction>(V), UserIgnoreList);
7079 }))
7081 Align Alignment =
7082 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7083 ->getAlign();
7084 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7085 *Diff, Ptr0, PtrN, SPtrInfo))
7087 }
7088 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7089 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7090 return LoadsState::Gather;
 7091 // Correctly compare the cost of loads + shuffles against
 7092 // strided/masked gather loads. Returns true if the vectorized + shuffles
 7093 // representation is better than just gathering.
7094 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7095 unsigned *BestVF,
7096 bool ProfitableGatherPointers) {
7097 if (BestVF)
7098 *BestVF = 0;
7099 // Compare masked gather cost and loads + insert subvector costs.
7101 auto [ScalarGEPCost, VectorGEPCost] =
7102 getGEPCosts(TTI, PointerOps, PointerOps.front(),
7103 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7104 // Estimate the cost of masked gather GEP. If not a splat, roughly
7105 // estimate as a buildvector, otherwise estimate as splat.
7106 APInt DemandedElts = APInt::getAllOnes(Sz);
7107 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7108 VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
7109 if (static_cast<unsigned>(count_if(
7110 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7111 any_of(PointerOps, [&](Value *V) {
7112 return getUnderlyingObject(V) !=
7113 getUnderlyingObject(PointerOps.front());
7114 }))
7115 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7116 DemandedElts, /*Insert=*/true,
7117 /*Extract=*/false, CostKind);
7118 else
7119 VectorGEPCost +=
7121 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7122 /*Insert=*/true, /*Extract=*/false, CostKind) +
7123 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7124 // The cost of scalar loads.
7125 InstructionCost ScalarLoadsCost =
7126 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
7127 [&](InstructionCost C, Value *V) {
7128 return C + TTI.getInstructionCost(
7130 }) +
7131 ScalarGEPCost;
7132 // The cost of masked gather.
7133 InstructionCost MaskedGatherCost =
7134 TTI.getGatherScatterOpCost(
7135 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
7136 /*VariableMask=*/false, CommonAlignment, CostKind) +
7137 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7138 InstructionCost GatherCost =
7139 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7140 /*Insert=*/true,
7141 /*Extract=*/false, CostKind) +
7142 ScalarLoadsCost;
 7143 // The list of loads is small, or the partial check was already performed -
 7144 // directly compare the masked gather cost and the gather cost.
7145 constexpr unsigned ListLimit = 4;
7146 if (!TryRecursiveCheck || VL.size() < ListLimit)
7147 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7148
7149 // FIXME: The following code has not been updated for non-power-of-2
7150 // vectors (and not whole registers). The splitting logic here does not
7151 // cover the original vector if the vector factor is not a power of two.
7152 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
7153 return false;
7154
7155 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7156 unsigned MinVF = getMinVF(2 * Sz);
7157 DemandedElts.clearAllBits();
7158 // Iterate through possible vectorization factors and check if vectorized +
7159 // shuffles is better than just gather.
7160 for (unsigned VF =
7161 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7162 VF >= MinVF;
7163 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7165 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7166 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7168 SmallVector<Value *> PointerOps;
7169 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7170 PointerOps, SPtrInfo, BestVF,
7171 /*TryRecursiveCheck=*/false);
7172 // Check that the sorted loads are consecutive.
7173 if (LS == LoadsState::Gather) {
7174 if (BestVF) {
7175 DemandedElts.setAllBits();
7176 break;
7177 }
7178 DemandedElts.setBits(Cnt, Cnt + VF);
7179 continue;
7180 }
 7181 // If reordering is needed - consider it a high-cost masked gather for now.
7182 if ((LS == LoadsState::Vectorize ||
7185 !Order.empty() && !isReverseOrder(Order))
7187 States.push_back(LS);
7188 }
7189 if (DemandedElts.isAllOnes())
7190 // All loads gathered - try smaller VF.
7191 continue;
 7192 // Can be vectorized later as a series of loads/insertelements.
7193 InstructionCost VecLdCost = 0;
7194 if (!DemandedElts.isZero()) {
7195 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7196 /*Insert=*/true,
7197 /*Extract=*/false, CostKind) +
7198 ScalarGEPCost;
7199 for (unsigned Idx : seq<unsigned>(VL.size()))
7200 if (DemandedElts[Idx])
7201 VecLdCost +=
7202 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7203 }
7204 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7205 for (auto [I, LS] : enumerate(States)) {
7206 auto *LI0 = cast<LoadInst>(VL[I * VF]);
7207 InstructionCost VectorGEPCost =
7208 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7209 ? 0
7210 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
7211 LI0->getPointerOperand(),
7212 Instruction::GetElementPtr, CostKind, ScalarTy,
7213 SubVecTy)
7214 .second;
7215 if (LS == LoadsState::ScatterVectorize) {
7216 if (static_cast<unsigned>(
7217 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
7218 PointerOps.size() - 1 ||
7219 any_of(PointerOps, [&](Value *V) {
7220 return getUnderlyingObject(V) !=
7221 getUnderlyingObject(PointerOps.front());
7222 }))
7223 VectorGEPCost += getScalarizationOverhead(
7224 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
7225 /*Insert=*/true, /*Extract=*/false, CostKind);
7226 else
7227 VectorGEPCost +=
7229 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
7230 /*Insert=*/true, /*Extract=*/false, CostKind) +
7231 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
7232 CostKind);
7233 }
7234 switch (LS) {
7236 VecLdCost +=
7237 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7238 LI0->getPointerAddressSpace(), CostKind,
7240 VectorGEPCost;
7241 break;
7243 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7244 LI0->getPointerOperand(),
7245 /*VariableMask=*/false,
7246 CommonAlignment, CostKind) +
7247 VectorGEPCost;
7248 break;
7250 VecLdCost += TTI.getMaskedMemoryOpCost(
7251 Instruction::Load, SubVecTy, CommonAlignment,
7252 LI0->getPointerAddressSpace(), CostKind) +
7253 VectorGEPCost +
7255 {}, CostKind);
7256 break;
7258 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7259 LI0->getPointerOperand(),
7260 /*VariableMask=*/false,
7261 CommonAlignment, CostKind) +
7262 VectorGEPCost;
7263 break;
7264 case LoadsState::Gather:
7265 // Gathers are already calculated - ignore.
7266 continue;
7267 }
7268 SmallVector<int> ShuffleMask(VL.size());
7269 for (int Idx : seq<int>(0, VL.size()))
7270 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7271 if (I > 0)
7272 VecLdCost +=
7273 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
7274 CostKind, I * VF, SubVecTy);
7275 }
7276 // If masked gather cost is higher - better to vectorize, so
7277 // consider it as a gather node. It will be better estimated
7278 // later.
7279 if (MaskedGatherCost >= VecLdCost &&
7280 VecLdCost - GatherCost < -SLPCostThreshold) {
7281 if (BestVF)
7282 *BestVF = VF;
7283 return true;
7284 }
7285 }
7286 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7287 };
7288 // TODO: need to improve analysis of the pointers, if not all of them are
7289 // GEPs or have > 2 operands, we end up with a gather node, which just
7290 // increases the cost.
7291 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
7292 bool ProfitableGatherPointers =
7293 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7294 return L->isLoopInvariant(V);
7295 })) <= Sz / 2;
7296 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7298 return (!GEP && doesNotNeedToBeScheduled(P)) ||
7299 (GEP && GEP->getNumOperands() == 2 &&
7300 isa<Constant, Instruction>(GEP->getOperand(1)));
7301 })) {
7302 // Check if potential masked gather can be represented as series
7303 // of loads + insertsubvectors.
7304 // If masked gather cost is higher - better to vectorize, so
7305 // consider it as a gather node. It will be better estimated
7306 // later.
7307 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7308 ProfitableGatherPointers))
7310 }
7311
7312 return LoadsState::Gather;
7313}
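//
// For instance (illustrative): four simple i32 loads at element offsets
// 0, 1, 2, 3 from the same base sort with Diff == Sz - 1 == 3 and are
// classified as LoadsState::Vectorize; the same loads at offsets 0, 2, 4, 6
// would instead be evaluated against the compress and strided paths above.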
7314
7316 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7317 const DataLayout &DL, ScalarEvolution &SE,
7318 SmallVectorImpl<unsigned> &SortedIndices) {
7319 assert(
7320 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7321 "Expected list of pointer operands.");
 7322 // Map from bases to vectors of (Ptr, Offset, OrigIdx). Each Ptr is inserted
 7323 // into one of these vectors; they are then sorted and the sorted indices are
 7324 // returned so that related values end up next to one another.
7326 std::pair<BasicBlock *, Value *>,
7328 Bases;
7329 Bases
7330 .try_emplace(std::make_pair(
7332 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
7333
7334 SortedIndices.clear();
7335 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
7336 auto Key = std::make_pair(BBs[Cnt + 1],
7338 bool Found = any_of(Bases.try_emplace(Key).first->second,
7339 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7340 std::optional<int64_t> Diff =
7341 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7342 ElemTy, Ptr, DL, SE,
7343 /*StrictCheck=*/true);
7344 if (!Diff)
7345 return false;
7346
7347 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7348 return true;
7349 });
7350
7351 if (!Found) {
7352 // If we haven't found enough to usefully cluster, return early.
7353 if (Bases.size() > VL.size() / 2 - 1)
7354 return false;
7355
7356 // Not found already - add a new Base
7357 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7358 }
7359 }
7360
7361 if (Bases.size() == VL.size())
7362 return false;
7363
7364 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7365 Bases.front().second.size() == VL.size()))
7366 return false;
7367
 7368 // For each of the bases sort the pointers by Offset and check if any of the
 7369 // bases become consecutive.
7370 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7371 SmallPtrSet<Value *, 13> FirstPointers;
7372 SmallPtrSet<Value *, 13> SecondPointers;
7373 Value *P1 = Ptr1;
7374 Value *P2 = Ptr2;
7375 unsigned Depth = 0;
7376 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
7377 if (P1 == P2 || Depth > RecursionMaxDepth)
7378 return false;
7379 FirstPointers.insert(P1);
7380 SecondPointers.insert(P2);
7381 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
7382 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
7383 ++Depth;
7384 }
7385 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7386 "Unable to find matching root.");
7387 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
7388 };
7389 for (auto &Base : Bases) {
7390 for (auto &Vec : Base.second) {
7391 if (Vec.size() > 1) {
7393 int64_t InitialOffset = std::get<1>(Vec[0]);
7394 bool AnyConsecutive =
7395 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
7396 return std::get<1>(P.value()) ==
7397 int64_t(P.index()) + InitialOffset;
7398 });
 7399 // Fill the SortedIndices array only if it looks worthwhile to sort the
 7400 // pointers.
7401 if (!AnyConsecutive)
7402 return false;
7403 }
7404 }
7405 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
7406 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7407 });
7408 }
7409
7410 for (auto &T : Bases)
7411 for (const auto &Vec : T.second)
7412 for (const auto &P : Vec)
7413 SortedIndices.push_back(std::get<2>(P));
7414
7415 assert(SortedIndices.size() == VL.size() &&
7416 "Expected SortedIndices to be the size of VL");
7417 return true;
7418}
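//
// A rough example (hypothetical arrays a and b): for VL = {a+1, b, a, b+1}
// the pointers cluster into two bases; sorting each cluster by offset yields
// SortedIndices = {2, 0, 1, 3}, i.e. {a, a+1, b, b+1}, modulo the relative
// order of the two bases chosen by ComparePointers.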
7419
7420std::optional<BoUpSLP::OrdersType>
7421BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7422 assert(TE.isGather() && "Expected gather node only.");
7423 Type *ScalarTy = TE.Scalars[0]->getType();
7424
7426 Ptrs.reserve(TE.Scalars.size());
7428 BBs.reserve(TE.Scalars.size());
7429 for (Value *V : TE.Scalars) {
7430 auto *L = dyn_cast<LoadInst>(V);
7431 if (!L || !L->isSimple())
7432 return std::nullopt;
7433 Ptrs.push_back(L->getPointerOperand());
7434 BBs.push_back(L->getParent());
7435 }
7436
7437 BoUpSLP::OrdersType Order;
7438 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7439 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
7440 return std::move(Order);
7441 return std::nullopt;
7442}
7443
7444/// Check if two insertelement instructions are from the same buildvector.
7447 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
 7448 // Instructions must be from the same basic block.
7449 if (VU->getParent() != V->getParent())
7450 return false;
7451 // Checks if 2 insertelements are from the same buildvector.
7452 if (VU->getType() != V->getType())
7453 return false;
 7454 // Inserts with multiple uses are separate nodes.
7455 if (!VU->hasOneUse() && !V->hasOneUse())
7456 return false;
7457 auto *IE1 = VU;
7458 auto *IE2 = V;
7459 std::optional<unsigned> Idx1 = getElementIndex(IE1);
7460 std::optional<unsigned> Idx2 = getElementIndex(IE2);
7461 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7462 return false;
7463 // Go through the vector operand of insertelement instructions trying to find
7464 // either VU as the original vector for IE2 or V as the original vector for
7465 // IE1.
7466 SmallBitVector ReusedIdx(
7467 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
7468 bool IsReusedIdx = false;
7469 do {
7470 if (IE2 == VU && !IE1)
7471 return VU->hasOneUse();
7472 if (IE1 == V && !IE2)
7473 return V->hasOneUse();
7474 if (IE1 && IE1 != V) {
7475 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
7476 IsReusedIdx |= ReusedIdx.test(Idx1);
7477 ReusedIdx.set(Idx1);
7478 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7479 IE1 = nullptr;
7480 else
7481 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
7482 }
7483 if (IE2 && IE2 != VU) {
7484 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
7485 IsReusedIdx |= ReusedIdx.test(Idx2);
7486 ReusedIdx.set(Idx2);
7487 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7488 IE2 = nullptr;
7489 else
7490 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
7491 }
7492 } while (!IsReusedIdx && (IE1 || IE2));
7493 return false;
7494}
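//
// An illustrative buildvector chain (hypothetical values):
//   %v0 = insertelement <4 x float> poison, float %a, i32 0
//   %v1 = insertelement <4 x float> %v0,    float %b, i32 1
//   %v2 = insertelement <4 x float> %v1,    float %c, i32 2
// Walking the vector operands from %v2 reaches %v1, so %v1 and %v2 are
// recognized as parts of the same buildvector; an insert that reuses an
// already written lane or has extra uses terminates the walk.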
7495
7496/// Checks if the specified instruction \p I is an alternate operation for
7497/// the given \p MainOp and \p AltOp instructions.
7498static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7499 Instruction *AltOp,
7500 const TargetLibraryInfo &TLI);
7501
7502std::optional<BoUpSLP::OrdersType>
7503BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7504 bool IgnoreReorder) {
 7505 // No need to reorder if we need to shuffle reuses - the node still needs to
 7506 // be shuffled anyway.
7507 if (!TE.ReuseShuffleIndices.empty()) {
7508 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7509 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7510 "Reshuffling scalars not yet supported for nodes with padding");
7511
7512 if (isSplat(TE.Scalars))
7513 return std::nullopt;
7514 // Check if reuse shuffle indices can be improved by reordering.
7515 // For this, check that reuse mask is "clustered", i.e. each scalar values
7516 // is used once in each submask of size <number_of_scalars>.
7517 // Example: 4 scalar values.
7518 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7519 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7520 // element 3 is used twice in the second submask.
7521 unsigned Sz = TE.Scalars.size();
7522 if (TE.isGather()) {
7523 if (std::optional<OrdersType> CurrentOrder =
7524 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7525 SmallVector<int> Mask;
7526 fixupOrderingIndices(*CurrentOrder);
7527 inversePermutation(*CurrentOrder, Mask);
7528 ::addMask(Mask, TE.ReuseShuffleIndices);
7529 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7530 unsigned Sz = TE.Scalars.size();
7531 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7532 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
7533 if (Idx != PoisonMaskElem)
7534 Res[Idx + K * Sz] = I + K * Sz;
7535 }
7536 return std::move(Res);
7537 }
7538 }
7539 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7540 ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
7541 2 * TE.getVectorFactor())) == 1)
7542 return std::nullopt;
7543 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7544 return std::nullopt;
7545 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7546 Sz)) {
7547 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7548 if (TE.ReorderIndices.empty())
7549 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7550 else
7551 inversePermutation(TE.ReorderIndices, ReorderMask);
7552 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7553 unsigned VF = ReorderMask.size();
7554 OrdersType ResOrder(VF, VF);
7555 unsigned NumParts = divideCeil(VF, Sz);
7556 SmallBitVector UsedVals(NumParts);
7557 for (unsigned I = 0; I < VF; I += Sz) {
7558 int Val = PoisonMaskElem;
7559 unsigned UndefCnt = 0;
7560 unsigned Limit = std::min(Sz, VF - I);
7561 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
7562 [&](int Idx) {
7563 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7564 Val = Idx;
7565 if (Idx == PoisonMaskElem)
7566 ++UndefCnt;
7567 return Idx != PoisonMaskElem && Idx != Val;
7568 }) ||
7569 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7570 UndefCnt > Sz / 2)
7571 return std::nullopt;
7572 UsedVals.set(Val);
7573 for (unsigned K = 0; K < NumParts; ++K) {
7574 unsigned Idx = Val + Sz * K;
7575 if (Idx < VF && I + K < VF)
7576 ResOrder[Idx] = I + K;
7577 }
7578 }
7579 return std::move(ResOrder);
7580 }
7581 unsigned VF = TE.getVectorFactor();
7582 // Try to build the correct order for extractelement instructions.
7583 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
7584 TE.ReuseShuffleIndices.end());
7585 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7586 all_of(TE.Scalars, [Sz](Value *V) {
7587 if (isa<PoisonValue>(V))
7588 return true;
7589 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7590 return Idx && *Idx < Sz;
7591 })) {
7592 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7593 "by BinaryOperator and CastInst.");
7594 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7595 if (TE.ReorderIndices.empty())
7596 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7597 else
7598 inversePermutation(TE.ReorderIndices, ReorderMask);
7599 for (unsigned I = 0; I < VF; ++I) {
7600 int &Idx = ReusedMask[I];
7601 if (Idx == PoisonMaskElem)
7602 continue;
7603 Value *V = TE.Scalars[ReorderMask[Idx]];
7604 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
7605 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7606 }
7607 }
7608 // Build the order of the VF size; we need to reorder the reuse shuffles, as
7609 // they are always of VF size.
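 // E.g., with Sz == 4 scalars and VF == 8 reuses, the result is assembled from
 // two per-submask reorderings of the node order, shifted by K == 0 and K == 4.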
7610 OrdersType ResOrder(VF);
7611 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7612 auto *It = ResOrder.begin();
7613 for (unsigned K = 0; K < VF; K += Sz) {
7614 OrdersType CurrentOrder(TE.ReorderIndices);
7615 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
7616 if (SubMask.front() == PoisonMaskElem)
7617 std::iota(SubMask.begin(), SubMask.end(), 0);
7618 reorderOrder(CurrentOrder, SubMask);
7619 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7620 std::advance(It, Sz);
7621 }
7622 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
7623 return Data.index() == Data.value();
7624 }))
7625 return std::nullopt; // No need to reorder.
7626 return std::move(ResOrder);
7627 }
7628 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7629 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7630 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
7631 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7632 return std::nullopt;
7633 if (TE.State == TreeEntry::SplitVectorize ||
7634 ((TE.State == TreeEntry::Vectorize ||
7635 TE.State == TreeEntry::StridedVectorize ||
7636 TE.State == TreeEntry::CompressVectorize) &&
7637 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
7638 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
7639 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7640 "Alternate instructions are only supported by "
7641 "BinaryOperator and CastInst.");
7642 return TE.ReorderIndices;
7643 }
7644 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7645 TE.isAltShuffle()) {
7646 assert(TE.ReuseShuffleIndices.empty() &&
7647 "ReuseShuffleIndices should be "
7648 "empty for alternate instructions.");
7649 SmallVector<int> Mask;
7650 TE.buildAltOpShuffleMask(
7651 [&](Instruction *I) {
7652 assert(TE.getMatchingMainOpOrAltOp(I) &&
7653 "Unexpected main/alternate opcode");
7654 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
7655 },
7656 Mask);
7657 const int VF = TE.getVectorFactor();
7658 OrdersType ResOrder(VF, VF);
7659 for (unsigned I : seq<unsigned>(VF)) {
7660 if (Mask[I] == PoisonMaskElem)
7661 continue;
7662 ResOrder[Mask[I] % VF] = I;
7663 }
7664 return std::move(ResOrder);
7665 }
7666 if (!TE.ReorderIndices.empty())
7667 return TE.ReorderIndices;
7668 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7669 if (!TE.ReorderIndices.empty())
7670 return TE.ReorderIndices;
7671
7672 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
7673 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7674 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
7675 continue;
7676 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
7677 if (!II)
7678 continue;
7679 Instruction *BVHead = nullptr;
7680 BasicBlock *BB = II->getParent();
7681 while (II && II->hasOneUse() && II->getParent() == BB) {
7682 BVHead = II;
7683 II = dyn_cast<InsertElementInst>(II->getOperand(0));
7684 }
7685 I = BVHead;
7686 }
7687
7688 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
7689 assert(BB1 != BB2 && "Expected different basic blocks.");
7690 if (!DT->isReachableFromEntry(BB1))
7691 return false;
7692 if (!DT->isReachableFromEntry(BB2))
7693 return true;
7694 auto *NodeA = DT->getNode(BB1);
7695 auto *NodeB = DT->getNode(BB2);
7696 assert(NodeA && "Should only process reachable instructions");
7697 assert(NodeB && "Should only process reachable instructions");
7698 assert((NodeA == NodeB) ==
7699 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7700 "Different nodes should have different DFS numbers");
7701 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7702 };
7703 auto PHICompare = [&](unsigned I1, unsigned I2) {
7704 Value *V1 = TE.Scalars[I1];
7705 Value *V2 = TE.Scalars[I2];
7706 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
7707 return false;
7708 if (isa<PoisonValue>(V1))
7709 return true;
7710 if (isa<PoisonValue>(V2))
7711 return false;
7712 if (V1->getNumUses() < V2->getNumUses())
7713 return true;
7714 if (V1->getNumUses() > V2->getNumUses())
7715 return false;
7716 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
7717 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
7718 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7719 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7720 FirstUserOfPhi2->getParent());
7721 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
7722 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
7723 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
7724 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
7725 if (IE1 && !IE2)
7726 return true;
7727 if (!IE1 && IE2)
7728 return false;
7729 if (IE1 && IE2) {
7730 if (UserBVHead[I1] && !UserBVHead[I2])
7731 return true;
7732 if (!UserBVHead[I1])
7733 return false;
7734 if (UserBVHead[I1] == UserBVHead[I2])
7735 return getElementIndex(IE1) < getElementIndex(IE2);
7736 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
7737 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7738 UserBVHead[I2]->getParent());
7739 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7740 }
7741 if (EE1 && !EE2)
7742 return true;
7743 if (!EE1 && EE2)
7744 return false;
7745 if (EE1 && EE2) {
7746 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
7747 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
7748 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
7749 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
7750 if (!Inst2 && !P2)
7751 return Inst1 || P1;
7752 if (EE1->getOperand(0) == EE2->getOperand(0))
7753 return getElementIndex(EE1) < getElementIndex(EE2);
7754 if (!Inst1 && Inst2)
7755 return false;
7756 if (Inst1 && Inst2) {
7757 if (Inst1->getParent() != Inst2->getParent())
7758 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
7759 return Inst1->comesBefore(Inst2);
7760 }
7761 if (!P1 && P2)
7762 return false;
7763 assert(P1 && P2 &&
7764 "Expected either instructions or arguments vector operands.");
7765 return P1->getArgNo() < P2->getArgNo();
7766 }
7767 return false;
7768 };
7769 OrdersType Phis(TE.Scalars.size());
7770 std::iota(Phis.begin(), Phis.end(), 0);
7771 stable_sort(Phis, PHICompare);
7772 if (isIdentityOrder(Phis))
7773 return std::nullopt; // No need to reorder.
7774 return std::move(Phis);
7775 }
7776 if (TE.isGather() &&
7777 (!TE.hasState() || !TE.isAltShuffle() ||
7778 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7779 allSameType(TE.Scalars)) {
7780 // TODO: add analysis of other gather nodes with extractelement
7781 // instructions and other values/instructions, not only undefs.
7782 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7783 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
7784 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
7785 all_of(TE.Scalars, [](Value *V) {
7786 auto *EE = dyn_cast<ExtractElementInst>(V);
7787 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7788 })) {
7789 // Check that gather of extractelements can be represented as
7790 // just a shuffle of a single vector.
7791 OrdersType CurrentOrder;
7792 bool Reuse =
7793 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
7794 if (Reuse || !CurrentOrder.empty())
7795 return std::move(CurrentOrder);
7796 }
7797 // If the gather node is <undef, v, .., poison> and
7798 // insertelement poison, v, 0 [+ permute]
7799 // is cheaper than
7800 // insertelement poison, v, n - try to reorder.
7801 // If rotating the whole graph, exclude the permute cost, the whole graph
7802 // might be transformed.
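 // E.g., for the gather <poison, v, poison, poison> the candidate order maps v
 // to lane 0 (Order[1] = 0), and the cost of insert-at-0 plus an optional
 // permute is compared below against a single insert at lane 1.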
7803 int Sz = TE.Scalars.size();
7804 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
7805 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
7806 const auto *It = find_if_not(TE.Scalars, isConstant);
7807 if (It == TE.Scalars.begin())
7808 return OrdersType();
7809 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
7810 if (It != TE.Scalars.end()) {
7811 OrdersType Order(Sz, Sz);
7812 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7813 Order[Idx] = 0;
7814 fixupOrderingIndices(Order);
7815 SmallVector<int> Mask;
7816 inversePermutation(Order, Mask);
7817 InstructionCost PermuteCost =
7818 TopToBottom
7819 ? 0
7820 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
7821 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
7822 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
7823 PoisonValue::get(Ty), *It);
7824 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
7825 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
7826 PoisonValue::get(Ty), *It);
7827 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7828 OrdersType Order(Sz, Sz);
7829 Order[Idx] = 0;
7830 return std::move(Order);
7831 }
7832 }
7833 }
7834 if (isSplat(TE.Scalars))
7835 return std::nullopt;
7836 if (TE.Scalars.size() >= 3)
7837 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
7838 return Order;
7839 // Check if we can include the order of vectorized loads. For masked gathers,
7840 // do extra analysis later, so include such nodes in a special list.
7841 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7842 SmallVector<Value *> PointerOps;
7843 StridedPtrInfo SPtrInfo;
7844 OrdersType CurrentOrder;
7845 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
7846 CurrentOrder, PointerOps, SPtrInfo);
7849 return std::move(CurrentOrder);
7850 }
7851 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
7852 // has been audited for correctness with non-power-of-two vectors.
7853 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
7854 if (std::optional<OrdersType> CurrentOrder =
7855 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
7856 return CurrentOrder;
7857 }
7858 return std::nullopt;
7859}
7860
7861/// Checks if the given mask is a "clustered" mask with the same clusters of
7862/// size \p Sz, which are not identity submasks.
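/// E.g., with \p Sz == 2 the mask {1, 0, 1, 0} is a repeated non-identity
/// clustered mask, while {0, 1, 0, 1} is not, because its first cluster is an
/// identity submask.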
7863static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
7864 unsigned Sz) {
7865 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
7866 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
7867 return false;
7868 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7869 ArrayRef<int> Cluster = Mask.slice(I, Sz);
7870 if (Cluster != FirstCluster)
7871 return false;
7872 }
7873 return true;
7874}
7875
7876void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
7877 // Reorder reuses mask.
7878 reorderReuses(TE.ReuseShuffleIndices, Mask);
7879 const unsigned Sz = TE.Scalars.size();
7880 // For vectorized and non-clustered reuses, no need to do anything else.
7881 if (!TE.isGather() ||
7882 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
7883 Sz) ||
7884 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
7885 return;
7886 SmallVector<int> NewMask;
7887 inversePermutation(TE.ReorderIndices, NewMask);
7888 addMask(NewMask, TE.ReuseShuffleIndices);
7889 // Clear reorder since it is going to be applied to the new mask.
7890 TE.ReorderIndices.clear();
7891 // Try to improve gathered nodes with clustered reuses, if possible.
7892 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
7893 SmallVector<unsigned> NewOrder(Slice);
7894 inversePermutation(NewOrder, NewMask);
7895 reorderScalars(TE.Scalars, NewMask);
7896 // Fill the reuses mask with the identity submasks.
7897 for (auto *It = TE.ReuseShuffleIndices.begin(),
7898 *End = TE.ReuseShuffleIndices.end();
7899 It != End; std::advance(It, Sz))
7900 std::iota(It, std::next(It, Sz), 0);
7901}
7902
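/// Merges \p SecondaryOrder into \p Order: positions of \p Order left unset
/// (equal to its size) are filled either with their own index or with the
/// corresponding entry of \p SecondaryOrder, if that value is still unused.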
7903static void combineOrders(MutableArrayRef<unsigned> Order,
7904 ArrayRef<unsigned> SecondaryOrder) {
7905 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
7906 "Expected same size of orders");
7907 size_t Sz = Order.size();
7908 SmallBitVector UsedIndices(Sz);
7909 for (unsigned Idx : seq<unsigned>(0, Sz)) {
7910 if (Order[Idx] != Sz)
7911 UsedIndices.set(Order[Idx]);
7912 }
7913 if (SecondaryOrder.empty()) {
7914 for (unsigned Idx : seq<unsigned>(0, Sz))
7915 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
7916 Order[Idx] = Idx;
7917 } else {
7918 for (unsigned Idx : seq<unsigned>(0, Sz))
7919 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7920 !UsedIndices.test(SecondaryOrder[Idx]))
7921 Order[Idx] = SecondaryOrder[Idx];
7922 }
7923}
7924
7927 return false;
7928
7929 constexpr unsigned TinyVF = 2;
7930 constexpr unsigned TinyTree = 10;
7931 constexpr unsigned PhiOpsLimit = 12;
7932 constexpr unsigned GatherLoadsLimit = 2;
7933 if (VectorizableTree.size() <= TinyTree)
7934 return true;
7935 if (VectorizableTree.front()->hasState() &&
7936 !VectorizableTree.front()->isGather() &&
7937 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7938 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7939 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7940 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7941 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7942 VectorizableTree.front()->ReorderIndices.empty()) {
7943 // Check if the tree has only a single store and a single (unordered) load
7944 // node; other nodes are phis or geps/binops combined with phis, and/or a
7945 // single gather load node.
7946 if (VectorizableTree.front()->hasState() &&
7947 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7948 VectorizableTree.front()->Scalars.size() == TinyVF &&
7949 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7950 return false;
7951 // Single node which requires reordering - skip.
7952 if (VectorizableTree.front()->hasState() &&
7953 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7954 VectorizableTree.front()->ReorderIndices.empty()) {
7955 const unsigned ReorderedSplitsCnt =
7956 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7957 return TE->State == TreeEntry::SplitVectorize &&
7958 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7959 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7960 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
7961 });
7962 if (ReorderedSplitsCnt <= 1 &&
7963 static_cast<unsigned>(count_if(
7964 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
7965 return ((!TE->isGather() &&
7966 (TE->ReorderIndices.empty() ||
7967 (TE->UserTreeIndex.UserTE &&
7968 TE->UserTreeIndex.UserTE->State ==
7969 TreeEntry::Vectorize &&
7970 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7971 .empty()))) ||
7972 (TE->isGather() && TE->ReorderIndices.empty() &&
7973 (!TE->hasState() || TE->isAltShuffle() ||
7974 TE->getOpcode() == Instruction::Load ||
7975 TE->getOpcode() == Instruction::ZExt ||
7976 TE->getOpcode() == Instruction::SExt))) &&
7977 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7978 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
7979 return !isConstant(V) && isVectorized(V);
7980 }));
7981 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7982 return false;
7983 }
7984 bool HasPhis = false;
7985 bool HasLoad = true;
7986 unsigned GatherLoads = 0;
7987 for (const std::unique_ptr<TreeEntry> &TE :
7988 ArrayRef(VectorizableTree).drop_front()) {
7989 if (TE->State == TreeEntry::SplitVectorize)
7990 continue;
7991 if (!TE->hasState()) {
7992 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
7994 continue;
7995 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7997 continue;
7998 return true;
7999 }
8000 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8001 if (!TE->isGather()) {
8002 HasLoad = false;
8003 continue;
8004 }
8005 if (HasLoad)
8006 return true;
8007 ++GatherLoads;
8008 if (GatherLoads >= GatherLoadsLimit)
8009 return true;
8010 }
8011 if (TE->getOpcode() == Instruction::GetElementPtr ||
8012 Instruction::isBinaryOp(TE->getOpcode()))
8013 continue;
8014 if (TE->getOpcode() != Instruction::PHI &&
8015 (!TE->hasCopyableElements() ||
8016 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8017 TE->Scalars.size() / 2))
8018 return true;
8019 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8020 TE->getNumOperands() > PhiOpsLimit)
8021 return false;
8022 HasPhis = true;
8023 }
8024 return !HasPhis;
8025 }
8026 return true;
8027}
8028
8029void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8030 ArrayRef<int> MaskOrder) {
8031 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8032 SmallVector<int> NewMask(getVectorFactor());
8033 SmallVector<int> NewMaskOrder(getVectorFactor());
8034 std::iota(NewMask.begin(), NewMask.end(), 0);
8035 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8036 if (Idx == 0) {
8037 copy(Mask, NewMask.begin());
8038 copy(MaskOrder, NewMaskOrder.begin());
8039 } else {
8040 assert(Idx == 1 && "Expected either 0 or 1 index.");
8041 unsigned Offset = CombinedEntriesWithIndices.back().second;
8042 for (unsigned I : seq<unsigned>(Mask.size())) {
8043 NewMask[I + Offset] = Mask[I] + Offset;
8044 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8045 }
8046 }
8047 reorderScalars(Scalars, NewMask);
8048 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8049 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8050 ReorderIndices.clear();
8051}
8052
8053void BoUpSLP::reorderTopToBottom() {
8054 // Maps VF to the graph nodes.
8055 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
8056 // ExtractElement gather nodes which can be vectorized and need to handle
8057 // their ordering.
8058 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
8059
8060 // Phi nodes can have preferred ordering based on their result users
8061 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
8062
8063 // AltShuffles can also have a preferred ordering that leads to fewer
8064 // instructions, e.g., the addsub instruction in x86.
8065 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8066
8067 // Maps a TreeEntry to the reorder indices of external users.
8068 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
8069 ExternalUserReorderMap;
8070 // Find all reorderable nodes with the given VF.
8071 // Currently these are vectorized stores, loads, extracts + some gathering
8072 // of extracts.
8073 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8074 const std::unique_ptr<TreeEntry> &TE) {
8075 // Look for external users that will probably be vectorized.
8076 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8077 findExternalStoreUsersReorderIndices(TE.get());
8078 if (!ExternalUserReorderIndices.empty()) {
8079 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8080 ExternalUserReorderMap.try_emplace(TE.get(),
8081 std::move(ExternalUserReorderIndices));
8082 }
8083
8084 // Patterns like [fadd,fsub] can be combined into a single instruction in
8085 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8086 // to take into account their order when looking for the most used order.
8087 if (TE->hasState() && TE->isAltShuffle() &&
8088 TE->State != TreeEntry::SplitVectorize) {
8089 Type *ScalarTy = TE->Scalars[0]->getType();
8090 VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
8091 unsigned Opcode0 = TE->getOpcode();
8092 unsigned Opcode1 = TE->getAltOpcode();
8093 SmallBitVector OpcodeMask(
8094 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8095 // If this pattern is supported by the target then we consider the order.
8096 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8097 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8098 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8099 }
8100 // TODO: Check the reverse order too.
8101 }
8102
8103 bool IgnoreReorder =
8104 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8105 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8106 VectorizableTree.front()->getOpcode() == Instruction::Store);
8107 if (std::optional<OrdersType> CurrentOrder =
8108 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
8109 // Do not include ordering for nodes used in the alt opcode vectorization;
8110 // better to reorder them during the bottom-to-top stage. If we follow the
8111 // order here, it causes reordering of the whole graph, though actually it
8112 // is profitable just to reorder the subgraph that starts from the alternate
8113 // opcode vectorization node. Such nodes already end up with a shuffle
8114 // instruction, and it is enough to change this shuffle rather than rotate
8115 // the scalars for the whole graph.
8116 unsigned Cnt = 0;
8117 const TreeEntry *UserTE = TE.get();
8118 while (UserTE && Cnt < RecursionMaxDepth) {
8119 if (!UserTE->UserTreeIndex)
8120 break;
8121 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8122 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8123 UserTE->UserTreeIndex.UserTE->Idx != 0)
8124 return;
8125 UserTE = UserTE->UserTreeIndex.UserTE;
8126 ++Cnt;
8127 }
8128 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8129 if (!(TE->State == TreeEntry::Vectorize ||
8130 TE->State == TreeEntry::StridedVectorize ||
8131 TE->State == TreeEntry::SplitVectorize ||
8132 TE->State == TreeEntry::CompressVectorize) ||
8133 !TE->ReuseShuffleIndices.empty())
8134 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8135 if (TE->State == TreeEntry::Vectorize &&
8136 TE->getOpcode() == Instruction::PHI)
8137 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8138 }
8139 });
8140
8141 // Reorder the graph nodes according to their vectorization factor.
8142 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8143 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8144 auto It = VFToOrderedEntries.find(VF);
8145 if (It == VFToOrderedEntries.end())
8146 continue;
8147 // Try to find the most profitable order. We are just looking for the most
8148 // used order and reorder scalar elements in the nodes according to this
8149 // most used order.
8150 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8151 // Delete VF entry upon exit.
8152 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
8153
8154 // All operands are reordered and used only in this node - propagate the
8155 // most used order to the user node.
8156 MapVector<OrdersType, unsigned,
8157 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8158 OrdersUses;
8159 for (const TreeEntry *OpTE : OrderedEntries) {
8160 // No need to reorder these nodes; still need to extend and use a shuffle,
8161 // just need to merge the reordering shuffle and the reuse shuffle.
8162 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8163 OpTE->State != TreeEntry::SplitVectorize)
8164 continue;
8165 // Count number of orders uses.
8166 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8167 &PhisToOrders]() -> const OrdersType & {
8168 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8169 auto It = GathersToOrders.find(OpTE);
8170 if (It != GathersToOrders.end())
8171 return It->second;
8172 }
8173 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8174 auto It = AltShufflesToOrders.find(OpTE);
8175 if (It != AltShufflesToOrders.end())
8176 return It->second;
8177 }
8178 if (OpTE->State == TreeEntry::Vectorize &&
8179 OpTE->getOpcode() == Instruction::PHI) {
8180 auto It = PhisToOrders.find(OpTE);
8181 if (It != PhisToOrders.end())
8182 return It->second;
8183 }
8184 return OpTE->ReorderIndices;
8185 }();
8186 // First consider the order of the external scalar users.
8187 auto It = ExternalUserReorderMap.find(OpTE);
8188 if (It != ExternalUserReorderMap.end()) {
8189 const auto &ExternalUserReorderIndices = It->second;
8190 // If the OpTE vector factor != number of scalars - use natural order,
8191 // it is an attempt to reorder node with reused scalars but with
8192 // external uses.
8193 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8194 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8195 ExternalUserReorderIndices.size();
8196 } else {
8197 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8198 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8199 }
8200 // No other useful reorder data in this entry.
8201 if (Order.empty())
8202 continue;
8203 }
8204 // Stores actually store the mask, not the order, need to invert.
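 // E.g., if a store node's ReorderIndices are the mask {2, 0, 1, 3}, the order
 // counted here is its inverse {1, 2, 0, 3}.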
8205 if (OpTE->State == TreeEntry::Vectorize &&
8206 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8207 assert(!OpTE->isAltShuffle() &&
8208 "Alternate instructions are only supported by BinaryOperator "
8209 "and CastInst.");
8210 SmallVector<int> Mask;
8211 inversePermutation(Order, Mask);
8212 unsigned E = Order.size();
8213 OrdersType CurrentOrder(E, E);
8214 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8215 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8216 });
8217 fixupOrderingIndices(CurrentOrder);
8218 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8219 } else {
8220 ++OrdersUses.try_emplace(Order, 0).first->second;
8221 }
8222 }
8223 if (OrdersUses.empty())
8224 continue;
8225 // Choose the most used order.
8226 unsigned IdentityCnt = 0;
8227 unsigned FilledIdentityCnt = 0;
8228 OrdersType IdentityOrder(VF, VF);
8229 for (auto &Pair : OrdersUses) {
8230 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8231 if (!Pair.first.empty())
8232 FilledIdentityCnt += Pair.second;
8233 IdentityCnt += Pair.second;
8234 combineOrders(IdentityOrder, Pair.first);
8235 }
8236 }
8237 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8238 unsigned Cnt = IdentityCnt;
8239 for (auto &Pair : OrdersUses) {
8240 // Prefer the identity order. But if a filled identity (non-empty order) is
8241 // found with the same number of uses as the new candidate order, we can
8242 // choose that candidate order.
8243 if (Cnt < Pair.second ||
8244 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8245 Cnt == Pair.second && !BestOrder.empty() &&
8246 isIdentityOrder(BestOrder))) {
8247 combineOrders(Pair.first, BestOrder);
8248 BestOrder = Pair.first;
8249 Cnt = Pair.second;
8250 } else {
8251 combineOrders(BestOrder, Pair.first);
8252 }
8253 }
8254 // Set order of the user node.
8255 if (isIdentityOrder(BestOrder))
8256 continue;
8257 fixupOrderingIndices(BestOrder);
8258 SmallVector<int> Mask;
8259 inversePermutation(BestOrder, Mask);
8260 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8261 unsigned E = BestOrder.size();
8262 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8263 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8264 });
8265 // Do an actual reordering, if profitable.
8266 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8267 // Just do the reordering for the nodes with the given VF.
8268 if (TE->Scalars.size() != VF) {
8269 if (TE->ReuseShuffleIndices.size() == VF) {
8270 assert(TE->State != TreeEntry::SplitVectorize &&
8271 "Split vectorized not expected.");
8272 // Need to reorder the reuses masks of the operands with smaller VF to
8273 // be able to find the match between the graph nodes and scalar
8274 // operands of the given node during vectorization/cost estimation.
8275 assert(
8276 (!TE->UserTreeIndex ||
8277 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8278 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8279 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8280 "All users must be of VF size.");
8281 if (SLPReVec) {
8282 assert(SLPReVec && "Only supported by REVEC.");
8283 // ShuffleVectorInst does not do reorderOperands (and it should not
8284 // because ShuffleVectorInst supports only a limited set of
8285 // patterns). Only do reorderNodeWithReuses if the user is not
8286 // ShuffleVectorInst.
8287 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8288 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
8289 continue;
8290 }
8291 // Update ordering of the operands with the smaller VF than the given
8292 // one.
8293 reorderNodeWithReuses(*TE, Mask);
8294 // Update orders in user split vectorize nodes.
8295 if (TE->UserTreeIndex &&
8296 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8297 TE->UserTreeIndex.UserTE->reorderSplitNode(
8298 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8299 }
8300 continue;
8301 }
8302 if ((TE->State == TreeEntry::SplitVectorize &&
8303 TE->ReuseShuffleIndices.empty()) ||
8304 ((TE->State == TreeEntry::Vectorize ||
8305 TE->State == TreeEntry::StridedVectorize ||
8306 TE->State == TreeEntry::CompressVectorize) &&
8307 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
8308 InsertElementInst>(TE->getMainOp()) ||
8309 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
8310 assert(
8311 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8312 TE->ReuseShuffleIndices.empty())) &&
8313 "Alternate instructions are only supported by BinaryOperator "
8314 "and CastInst.");
8315 // Build correct orders for extract{element,value}, loads,
8316 // stores and alternate (split) nodes.
8317 reorderOrder(TE->ReorderIndices, Mask);
8318 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
8319 TE->reorderOperands(Mask);
8320 } else {
8321 // Reorder the node and its operands.
8322 TE->reorderOperands(Mask);
8323 assert(TE->ReorderIndices.empty() &&
8324 "Expected empty reorder sequence.");
8325 reorderScalars(TE->Scalars, Mask);
8326 }
8327 if (!TE->ReuseShuffleIndices.empty()) {
8328 // Apply reversed order to keep the original ordering of the reused
8329 // elements to avoid extra reorder indices shuffling.
8330 OrdersType CurrentOrder;
8331 reorderOrder(CurrentOrder, MaskOrder);
8332 SmallVector<int> NewReuses;
8333 inversePermutation(CurrentOrder, NewReuses);
8334 addMask(NewReuses, TE->ReuseShuffleIndices);
8335 TE->ReuseShuffleIndices.swap(NewReuses);
8336 } else if (TE->UserTreeIndex &&
8337 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8338 // Update orders in user split vectorize nodes.
8339 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8340 Mask, MaskOrder);
8341 }
8342 }
8343}
8344
8345void BoUpSLP::buildReorderableOperands(
8346 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8347 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8348 SmallVectorImpl<TreeEntry *> &GatherOps) {
8349 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
8350 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8351 return OpData.first == I &&
8352 (OpData.second->State == TreeEntry::Vectorize ||
8353 OpData.second->State == TreeEntry::StridedVectorize ||
8354 OpData.second->State == TreeEntry::CompressVectorize ||
8355 OpData.second->State == TreeEntry::SplitVectorize);
8356 }))
8357 continue;
8358 // Do not request operands, if they do not exist.
8359 if (UserTE->hasState()) {
8360 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8361 UserTE->getOpcode() == Instruction::ExtractValue)
8362 continue;
8363 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8364 continue;
8365 if (UserTE->getOpcode() == Instruction::Store &&
8366 UserTE->State == TreeEntry::Vectorize && I == 1)
8367 continue;
8368 if (UserTE->getOpcode() == Instruction::Load &&
8369 (UserTE->State == TreeEntry::Vectorize ||
8370 UserTE->State == TreeEntry::StridedVectorize ||
8371 UserTE->State == TreeEntry::CompressVectorize))
8372 continue;
8373 }
8374 TreeEntry *TE = getOperandEntry(UserTE, I);
8375 assert(TE && "Expected operand entry.");
8376 if (!TE->isGather()) {
8377 // Add the node to the list of the ordered nodes with the identity
8378 // order.
8379 Edges.emplace_back(I, TE);
8380 // Add ScatterVectorize nodes to the list of operands, where just
8381 // reordering of the scalars is required. Similar to the gathers, so
8382 // simply add to the list of gathered ops.
8383 // If there are reused scalars, process this node as a regular vectorize
8384 // node, just reorder reuses mask.
8385 if (TE->State == TreeEntry::ScatterVectorize &&
8386 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8387 GatherOps.push_back(TE);
8388 continue;
8389 }
8390 if (ReorderableGathers.contains(TE))
8391 GatherOps.push_back(TE);
8392 }
8393}
8394
8395void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8396 struct TreeEntryCompare {
8397 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8398 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8399 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8400 return LHS->Idx < RHS->Idx;
8401 }
8402 };
8403 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8404 DenseSet<const TreeEntry *> GathersToOrders;
8405 // Find all reorderable leaf nodes with the given VF.
8406 // Currently these are vectorized loads, extracts without alternate operands +
8407 // some gathering of extracts.
8408 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8409 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8410 if (TE->State != TreeEntry::Vectorize &&
8411 TE->State != TreeEntry::StridedVectorize &&
8412 TE->State != TreeEntry::CompressVectorize &&
8413 TE->State != TreeEntry::SplitVectorize)
8414 NonVectorized.insert(TE.get());
8415 if (std::optional<OrdersType> CurrentOrder =
8416 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
8417 Queue.push(TE.get());
8418 if (!(TE->State == TreeEntry::Vectorize ||
8419 TE->State == TreeEntry::StridedVectorize ||
8420 TE->State == TreeEntry::CompressVectorize ||
8421 TE->State == TreeEntry::SplitVectorize) ||
8422 !TE->ReuseShuffleIndices.empty())
8423 GathersToOrders.insert(TE.get());
8424 }
8425 }
8426
8427 // 1. Propagate the order to the graph nodes which use only reordered nodes.
8428 // I.e., if the node has operands that are reordered, try to keep at least
8429 // one operand in the natural order and reorder the others + reorder the
8430 // user node itself.
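 // E.g., if both reordered operands of a vectorized binary operator prefer the
 // same non-identity order, that order is chosen as the most used one below and
 // moved up into the user node, while the matching operands return to the
 // natural order.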
8431 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8432 while (!Queue.empty()) {
8433 // 1. Filter out only reordered nodes.
8434 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8435 TreeEntry *TE = Queue.top();
8436 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8437 Queue.pop();
8438 SmallVector<TreeEntry *> OrderedOps(1, TE);
8439 while (!Queue.empty()) {
8440 TE = Queue.top();
8441 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8442 break;
8443 Queue.pop();
8444 OrderedOps.push_back(TE);
8445 }
8446 for (TreeEntry *TE : OrderedOps) {
8447 if (!(TE->State == TreeEntry::Vectorize ||
8448 TE->State == TreeEntry::StridedVectorize ||
8449 TE->State == TreeEntry::CompressVectorize ||
8450 TE->State == TreeEntry::SplitVectorize ||
8451 (TE->isGather() && GathersToOrders.contains(TE))) ||
8452 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8453 !Visited.insert(TE).second)
8454 continue;
8455 // Build a map between user nodes and their operand order to speed up the
8456 // search. The graph currently does not provide this dependency directly.
8457 Users.first = TE->UserTreeIndex.UserTE;
8458 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8459 }
8460 if (Users.first) {
8461 auto &Data = Users;
8462 if (Data.first->State == TreeEntry::SplitVectorize) {
8463 assert(
8464 Data.second.size() <= 2 &&
8465 "Expected not greater than 2 operands for split vectorize node.");
8466 if (any_of(Data.second,
8467 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8468 continue;
8469 // Update orders in user split vectorize nodes.
8470 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8471 "Expected exactly 2 entries.");
8472 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8473 TreeEntry &OpTE = *VectorizableTree[P.first];
8474 OrdersType Order = OpTE.ReorderIndices;
8475 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8476 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8477 continue;
8478 const auto BestOrder =
8479 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
8480 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
8481 continue;
8482 Order = *BestOrder;
8483 }
8484 fixupOrderingIndices(Order);
8485 SmallVector<int> Mask;
8486 inversePermutation(Order, Mask);
8487 const unsigned E = Order.size();
8488 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8489 transform(Order, MaskOrder.begin(), [E](unsigned I) {
8490 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8491 });
8492 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8493 // Clear ordering of the operand.
8494 if (!OpTE.ReorderIndices.empty()) {
8495 OpTE.ReorderIndices.clear();
8496 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8497 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
8498 } else {
8499 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8500 reorderScalars(OpTE.Scalars, Mask);
8501 }
8502 }
8503 if (Data.first->ReuseShuffleIndices.empty() &&
8504 !Data.first->ReorderIndices.empty()) {
8505 // Insert user node to the list to try to sink reordering deeper in
8506 // the graph.
8507 Queue.push(Data.first);
8508 }
8509 continue;
8510 }
8511 // Check that operands are used only in the User node.
8512 SmallVector<TreeEntry *> GatherOps;
8513 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8514 GatherOps);
8515 // All operands are reordered and used only in this node - propagate the
8516 // most used order to the user node.
8517 MapVector<OrdersType, unsigned,
8518 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8519 OrdersUses;
8520 // Do the analysis for each tree entry only once, otherwise the order of
8521 // the same node may be considered several times, though it might not be
8522 // profitable.
8525 for (const auto &Op : Data.second) {
8526 TreeEntry *OpTE = Op.second;
8527 if (!VisitedOps.insert(OpTE).second)
8528 continue;
8529 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8530 continue;
8531 const auto Order = [&]() -> const OrdersType {
8532 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8533 return getReorderingData(*OpTE, /*TopToBottom=*/false,
8534 IgnoreReorder)
8535 .value_or(OrdersType(1));
8536 return OpTE->ReorderIndices;
8537 }();
8538 // The order is partially ordered, skip it in favor of fully non-ordered
8539 // orders.
8540 if (Order.size() == 1)
8541 continue;
8542
8543 // Check that the reordering does not increase the number of shuffles, i.e.
8544 // same-values nodes have the same parents or their parents have the same parents.
8545 if (!Order.empty() && !isIdentityOrder(Order)) {
8546 Value *Root = OpTE->hasState()
8547 ? OpTE->getMainOp()
8548 : *find_if_not(OpTE->Scalars, isConstant);
8549 auto GetSameNodesUsers = [&](Value *Root) {
8550 SmallSetVector<TreeEntry *, 4> Res;
8551 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8552 if (TE != OpTE && TE->UserTreeIndex &&
8553 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8554 TE->Scalars.size() == OpTE->Scalars.size() &&
8555 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8556 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8557 Res.insert(TE->UserTreeIndex.UserTE);
8558 }
8559 for (const TreeEntry *TE : getTreeEntries(Root)) {
8560 if (TE != OpTE && TE->UserTreeIndex &&
8561 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8562 TE->Scalars.size() == OpTE->Scalars.size() &&
8563 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8564 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8565 Res.insert(TE->UserTreeIndex.UserTE);
8566 }
8567 return Res.takeVector();
8568 };
8569 auto GetNumOperands = [](const TreeEntry *TE) {
8570 if (TE->State == TreeEntry::SplitVectorize)
8571 return TE->getNumOperands();
8572 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8573 return CI->arg_size();
8574 return TE->getNumOperands();
8575 };
8576 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8577 const TreeEntry *TE) {
8579 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
8581 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
8584 continue;
8585 const TreeEntry *Op = getOperandEntry(TE, Idx);
8586 if (Op->isGather() && Op->hasState()) {
8587 const TreeEntry *VecOp =
8588 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8589 if (VecOp)
8590 Op = VecOp;
8591 }
8592 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8593 return false;
8594 }
8595 return true;
8596 };
8597 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
8598 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
8599 if (!RevisitedOps.insert(UTE).second)
8600 return false;
8601 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8602 !UTE->ReuseShuffleIndices.empty() ||
8603 (UTE->UserTreeIndex &&
8604 UTE->UserTreeIndex.UserTE == Data.first) ||
8605 (Data.first->UserTreeIndex &&
8606 Data.first->UserTreeIndex.UserTE == UTE) ||
8607 (IgnoreReorder && UTE->UserTreeIndex &&
8608 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8609 NodeShouldBeReorderedWithOperands(UTE);
8610 }))
8611 continue;
8612 for (TreeEntry *UTE : Users) {
8614 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
8616 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
8619 continue;
8620 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8621 Visited.erase(Op);
8622 Queue.push(const_cast<TreeEntry *>(Op));
8623 }
8624 }
8625 }
8626 unsigned NumOps = count_if(
8627 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8628 return P.second == OpTE;
8629 });
8630 // Stores actually store the mask, not the order, need to invert.
8631 if (OpTE->State == TreeEntry::Vectorize &&
8632 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8633 assert(!OpTE->isAltShuffle() &&
8634 "Alternate instructions are only supported by BinaryOperator "
8635 "and CastInst.");
8636 SmallVector<int> Mask;
8637 inversePermutation(Order, Mask);
8638 unsigned E = Order.size();
8639 OrdersType CurrentOrder(E, E);
8640 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
8641 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8642 });
8643 fixupOrderingIndices(CurrentOrder);
8644 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8645 } else {
8646 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8647 }
8648 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8649 const auto AllowsReordering = [&](const TreeEntry *TE) {
8650 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8651 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8652 (IgnoreReorder && TE->Idx == 0))
8653 return true;
8654 if (TE->isGather()) {
8655 if (GathersToOrders.contains(TE))
8656 return !getReorderingData(*TE, /*TopToBottom=*/false,
8657 IgnoreReorder)
8658 .value_or(OrdersType(1))
8659 .empty();
8660 return true;
8661 }
8662 return false;
8663 };
8664 if (OpTE->UserTreeIndex) {
8665 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8666 if (!VisitedUsers.insert(UserTE).second)
8667 continue;
8668 // May reorder user node if it requires reordering, has reused
8669 // scalars, is an alternate op vectorize node or its op nodes require
8670 // reordering.
8671 if (AllowsReordering(UserTE))
8672 continue;
8673 // Check if users allow reordering.
8674 // Currently look up just 1 level of operands to avoid an increase in
8675 // compile time.
8676 // It is profitable to reorder if definitely more operands allow
8677 // reordering than operands with the natural order.
8679 if (static_cast<unsigned>(count_if(
8680 Ops, [UserTE, &AllowsReordering](
8681 const std::pair<unsigned, TreeEntry *> &Op) {
8682 return AllowsReordering(Op.second) &&
8683 Op.second->UserTreeIndex.UserTE == UserTE;
8684 })) <= Ops.size() / 2)
8685 ++Res.first->second;
8686 }
8687 }
8688 if (OrdersUses.empty()) {
8689 Visited.insert_range(llvm::make_second_range(Data.second));
8690 continue;
8691 }
8692 // Choose the most used order.
8693 unsigned IdentityCnt = 0;
8694 unsigned VF = Data.second.front().second->getVectorFactor();
8695 OrdersType IdentityOrder(VF, VF);
8696 for (auto &Pair : OrdersUses) {
8697 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
8698 IdentityCnt += Pair.second;
8699 combineOrders(IdentityOrder, Pair.first);
8700 }
8701 }
8702 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8703 unsigned Cnt = IdentityCnt;
8704 for (auto &Pair : OrdersUses) {
8705 // Prefer the identity order. But if a filled identity (non-empty
8706 // order) is found with the same number of uses as the new candidate
8707 // order, we can choose that candidate order.
8708 if (Cnt < Pair.second) {
8709 combineOrders(Pair.first, BestOrder);
8710 BestOrder = Pair.first;
8711 Cnt = Pair.second;
8712 } else {
8713 combineOrders(BestOrder, Pair.first);
8714 }
8715 }
8716 // Set order of the user node.
8717 if (isIdentityOrder(BestOrder)) {
8718 Visited.insert_range(llvm::make_second_range(Data.second));
8719 continue;
8720 }
8721 fixupOrderingIndices(BestOrder);
8722 // Erase operands from OrderedEntries list and adjust their orders.
8723 VisitedOps.clear();
8724 SmallVector<int> Mask;
8725 inversePermutation(BestOrder, Mask);
8726 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8727 unsigned E = BestOrder.size();
8728 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
8729 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8730 });
8731 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8732 TreeEntry *TE = Op.second;
8733 if (!VisitedOps.insert(TE).second)
8734 continue;
8735 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8736 reorderNodeWithReuses(*TE, Mask);
8737 continue;
8738 }
8739 // Gathers are processed separately.
8740 if (TE->State != TreeEntry::Vectorize &&
8741 TE->State != TreeEntry::StridedVectorize &&
8742 TE->State != TreeEntry::CompressVectorize &&
8743 TE->State != TreeEntry::SplitVectorize &&
8744 (TE->State != TreeEntry::ScatterVectorize ||
8745 TE->ReorderIndices.empty()))
8746 continue;
8747 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8748 TE->ReorderIndices.empty()) &&
8749 "Non-matching sizes of user/operand entries.");
8750 reorderOrder(TE->ReorderIndices, Mask);
8751 if (IgnoreReorder && TE == VectorizableTree.front().get())
8752 IgnoreReorder = false;
8753 }
8754 // For gathers, just reorder their scalars.
8755 for (TreeEntry *Gather : GatherOps) {
8756 assert(Gather->ReorderIndices.empty() &&
8757 "Unexpected reordering of gathers.");
8758 if (!Gather->ReuseShuffleIndices.empty()) {
8759 // Just reorder reuses indices.
8760 reorderReuses(Gather->ReuseShuffleIndices, Mask);
8761 continue;
8762 }
8763 reorderScalars(Gather->Scalars, Mask);
8764 Visited.insert(Gather);
8765 }
8766 // Reorder operands of the user node and set the ordering for the user
8767 // node itself.
8768 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8769 return TE.isAltShuffle() &&
8770 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8771 TE.ReorderIndices.empty());
8772 };
8773 if (Data.first->State != TreeEntry::Vectorize ||
8774 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
8775 Data.first->getMainOp()) ||
8776 IsNotProfitableAltCodeNode(*Data.first))
8777 Data.first->reorderOperands(Mask);
8778 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
8779 IsNotProfitableAltCodeNode(*Data.first) ||
8780 Data.first->State == TreeEntry::StridedVectorize ||
8781 Data.first->State == TreeEntry::CompressVectorize) {
8782 reorderScalars(Data.first->Scalars, Mask);
8783 reorderOrder(Data.first->ReorderIndices, MaskOrder,
8784 /*BottomOrder=*/true);
8785 if (Data.first->ReuseShuffleIndices.empty() &&
8786 !Data.first->ReorderIndices.empty() &&
8787 !IsNotProfitableAltCodeNode(*Data.first)) {
8788 // Insert user node to the list to try to sink reordering deeper in
8789 // the graph.
8790 Queue.push(Data.first);
8791 }
8792 } else {
8793 reorderOrder(Data.first->ReorderIndices, Mask);
8794 }
8795 }
8796 }
8797 // If the reordering is unnecessary, just remove the reorder.
8798 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8799 VectorizableTree.front()->ReuseShuffleIndices.empty())
8800 VectorizableTree.front()->ReorderIndices.clear();
8801}
8802
8803Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8804 if (Entry.hasState() &&
8805 (Entry.getOpcode() == Instruction::Store ||
8806 Entry.getOpcode() == Instruction::Load) &&
8807 Entry.State == TreeEntry::StridedVectorize &&
8808 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8809 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
8810 return dyn_cast<Instruction>(Entry.Scalars.front());
8811}
8812
8813void BoUpSLP::buildExternalUses(
8814 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
8815 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8816 DenseMap<Value *, unsigned> ScalarToExtUses;
8817 // Collect the values that we need to extract from the tree.
8818 for (auto &TEPtr : VectorizableTree) {
8819 TreeEntry *Entry = TEPtr.get();
8820
8821 // No need to handle users of gathered values.
8822 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8823 continue;
8824
8825 // For each lane:
8826 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8827 Value *Scalar = Entry->Scalars[Lane];
8828 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
8829 continue;
8830
8831 // Have all uses been replaced already? No need to do it again.
8832 auto It = ScalarToExtUses.find(Scalar);
8833 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8834 continue;
8835
8836 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8837 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8838 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8839 << " from " << *Scalar << "for many users.\n");
8840 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8841 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8842 ExternalUsesWithNonUsers.insert(Scalar);
8843 continue;
8844 }
8845
8846 // Check if the scalar is externally used as an extra arg.
8847 const auto ExtI = ExternallyUsedValues.find(Scalar);
8848 if (ExtI != ExternallyUsedValues.end()) {
8849 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8850 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8851 << FoundLane << " from " << *Scalar << ".\n");
8852 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8853 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8854 continue;
8855 }
8856 for (User *U : Scalar->users()) {
8857 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
8858
8859 Instruction *UserInst = dyn_cast<Instruction>(U);
8860 if (!UserInst || isDeleted(UserInst))
8861 continue;
8862
8863 // Ignore users in the user ignore list.
8864 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8865 continue;
8866
8867 // Skip in-tree scalars that become vectors
8868 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
8869 !UseEntries.empty()) {
8870 // Some in-tree scalars will remain as scalar in vectorized
8871 // instructions. If that is the case, the one in FoundLane will
8872 // be used.
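 // E.g., the pointer operand of a vectorized load or store remains a scalar in
 // the vectorized code, so no extractelement is required for such a user.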
8873 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8874 isa<LoadInst, StoreInst>(UserInst)) ||
8875 isa<CallInst>(UserInst)) ||
8876 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8877 return UseEntry->State == TreeEntry::ScatterVectorize ||
8878 !doesInTreeUserNeedToExtract(
8879 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8880 TTI);
8881 })) {
8882 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8883 << ".\n");
8884 assert(none_of(UseEntries,
8885 [](TreeEntry *UseEntry) {
8886 return UseEntry->isGather();
8887 }) &&
8888 "Bad state");
8889 continue;
8890 }
8891 U = nullptr;
8892 if (It != ScalarToExtUses.end()) {
8893 ExternalUses[It->second].User = nullptr;
8894 break;
8895 }
8896 }
8897
8898 if (U && Scalar->hasNUsesOrMore(UsesLimit))
8899 U = nullptr;
8900 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8901 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
8902 << " from lane " << FoundLane << " from " << *Scalar
8903 << ".\n");
8904 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8905 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8906 ExternalUsesWithNonUsers.insert(Scalar);
8907 if (!U)
8908 break;
8909 }
8910 }
8911 }
8912}
8913
8914SmallVector<SmallVector<StoreInst *>>
8915BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
8918 PtrToStoresMap;
8919 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
8920 Value *V = TE->Scalars[Lane];
8921 // Don't iterate over the users of constant data.
8922 if (!isa<Instruction>(V))
8923 continue;
8924 // To save compilation time we don't visit if we have too many users.
8925 if (V->hasNUsesOrMore(UsesLimit))
8926 break;
8927
8928 // Collect stores per pointer object.
8929 for (User *U : V->users()) {
8930 auto *SI = dyn_cast<StoreInst>(U);
8931 // Test whether we can handle the store. V might be a global, which could
8932 // be used in a different function.
8933 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
8934 !isValidElementType(SI->getValueOperand()->getType()))
8935 continue;
8936 // Skip entry if already vectorized.
8937 if (isVectorized(U))
8938 continue;
8939
8940 Value *Ptr =
8941 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
8942 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
8943 SI->getValueOperand()->getType(), Ptr}];
8944 // For now just keep one store per pointer object per lane.
8945 // TODO: Extend this to support multiple stores per pointer per lane
8946 if (StoresVec.size() > Lane)
8947 continue;
8948 if (!StoresVec.empty()) {
8949 std::optional<int64_t> Diff = getPointersDiff(
8950 SI->getValueOperand()->getType(), SI->getPointerOperand(),
8951 SI->getValueOperand()->getType(),
8952 StoresVec.front()->getPointerOperand(), *DL, *SE,
8953 /*StrictCheck=*/true);
8954 // We failed to compare the pointers so just abandon this store.
8955 if (!Diff)
8956 continue;
8957 }
8958 StoresVec.push_back(SI);
8959 }
8960 }
8961 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
8962 unsigned I = 0;
8963 for (auto &P : PtrToStoresMap) {
8964 Res[I].swap(P.second);
8965 ++I;
8966 }
8967 return Res;
8968}
8969
8970bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
8971 OrdersType &ReorderIndices) const {
8972 // We check whether the stores in StoresVec can form a vector by sorting them
8973 // and checking whether they are consecutive.
8974
8975 // To avoid calling getPointersDiff() while sorting we create a vector of
8976 // pairs {store, offset from first} and sort this instead.
8977 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
8978 StoreInst *S0 = StoresVec[0];
8979 StoreOffsetVec.emplace_back(0, 0);
8980 Type *S0Ty = S0->getValueOperand()->getType();
8981 Value *S0Ptr = S0->getPointerOperand();
8982 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
8983 StoreInst *SI = StoresVec[Idx];
8984 std::optional<int64_t> Diff =
8985 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
8986 SI->getPointerOperand(), *DL, *SE,
8987 /*StrictCheck=*/true);
8988 StoreOffsetVec.emplace_back(*Diff, Idx);
8989 }
8990
8991 // Check that the stores are consecutive: adjacent sorted offsets must differ by exactly 1.
8992 if (StoreOffsetVec.size() != StoresVec.size())
8993 return false;
8994 sort(StoreOffsetVec, llvm::less_first());
8995 unsigned Idx = 0;
8996 int64_t PrevDist = 0;
8997 for (const auto &P : StoreOffsetVec) {
8998 if (Idx > 0 && P.first != PrevDist + 1)
8999 return false;
9000 PrevDist = P.first;
9001 ++Idx;
9002 }
9003
9004 // Calculate the shuffle indices according to their offset against the sorted
9005 // StoreOffsetVec.
9006 ReorderIndices.assign(StoresVec.size(), 0);
9007 bool IsIdentity = true;
9008 for (auto [I, P] : enumerate(StoreOffsetVec)) {
9009 ReorderIndices[P.second] = I;
9010 IsIdentity &= P.second == I;
9011 }
9012 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9013 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9014 // same convention here.
9015 if (IsIdentity)
9016 ReorderIndices.clear();
9017
9018 return true;
9019}
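// --- Illustrative sketch (editor's addition; not part of the upstream pass).
// canFormVector() reduces the question "can these stores form one vector
// store?" to: sort the per-store offsets (in elements, relative to the first
// store) and require that adjacent offsets differ by exactly 1; the sorted
// positions then give the reorder indices. For example, offsets {0, 2, 1, 3}
// are consecutive and yield ReorderIndices = {0, 2, 1, 3}. A stand-alone
// model of just that arithmetic (the helper name is hypothetical):
static bool sketchConsecutiveAndOrder(ArrayRef<int64_t> Offsets,
                                      SmallVectorImpl<unsigned> &Reorder) {
  SmallVector<std::pair<int64_t, unsigned>> OffIdx;
  for (auto [Idx, Off] : enumerate(Offsets))
    OffIdx.emplace_back(Off, Idx);
  sort(OffIdx, llvm::less_first());
  for (unsigned I = 1, E = OffIdx.size(); I < E; ++I)
    if (OffIdx[I].first != OffIdx[I - 1].first + 1)
      return false; // a gap or a duplicate offset - not consecutive
  Reorder.assign(Offsets.size(), 0);
  for (auto [SortedPos, P] : enumerate(OffIdx))
    Reorder[P.second] = static_cast<unsigned>(SortedPos); // lane -> position
  return true;
}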
9020
9021#ifndef NDEBUG
9023 for (unsigned Idx : Order)
9024 dbgs() << Idx << ", ";
9025 dbgs() << "\n";
9026}
9027#endif
9028
9030BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9031 unsigned NumLanes = TE->Scalars.size();
9032
9033 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9034
9035 // Holds the reorder indices for each candidate store vector that is a user of
9036 // the current TreeEntry.
9037 SmallVector<OrdersType, 1> ExternalReorderIndices;
9038
9039 // Now inspect the stores collected per pointer and look for vectorization
9040 // candidates. For each candidate calculate the reorder index vector and push
9041 // it into `ExternalReorderIndices`
9042 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9043 // If we have fewer than NumLanes stores, then we can't form a vector.
9044 if (StoresVec.size() != NumLanes)
9045 continue;
9046
9047 // If the stores are not consecutive then abandon this StoresVec.
9048 OrdersType ReorderIndices;
9049 if (!canFormVector(StoresVec, ReorderIndices))
9050 continue;
9051
9052 // We now know that the scalars in StoresVec can form a vector instruction,
9053 // so set the reorder indices.
9054 ExternalReorderIndices.push_back(ReorderIndices);
9055 }
9056 return ExternalReorderIndices;
9057}
9058
9060 const SmallDenseSet<Value *> &UserIgnoreLst) {
9061 deleteTree();
9062 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9063 "TreeEntryToStridedPtrInfoMap is not cleared");
9064 UserIgnoreList = &UserIgnoreLst;
9065 if (!allSameType(Roots))
9066 return;
9067 buildTreeRec(Roots, 0, EdgeInfo());
9068}
9069
9071 deleteTree();
9072 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9073 "TreeEntryToStridedPtrInfoMap is not cleared");
9074 if (!allSameType(Roots))
9075 return;
9076 buildTreeRec(Roots, 0, EdgeInfo());
9077}
9078
9079/// Tries to find a subvector of loads and builds a new vector of only loads
9080/// if that can be profitable.
9082 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9084 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9085 bool AddNew = true) {
9086 if (VL.empty())
9087 return;
9088 Type *ScalarTy = getValueType(VL.front());
9089 if (!isValidElementType(ScalarTy))
9090 return;
9092 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9093 for (Value *V : VL) {
9094 auto *LI = dyn_cast<LoadInst>(V);
9095 if (!LI)
9096 continue;
9097 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9098 continue;
9099 bool IsFound = false;
9100 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9101 assert(LI->getParent() == Data.front().first->getParent() &&
9102 LI->getType() == Data.front().first->getType() &&
9103 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9104 getUnderlyingObject(Data.front().first->getPointerOperand(),
9106 "Expected loads with the same type, same parent and same "
9107 "underlying pointer.");
9108 std::optional<int64_t> Dist = getPointersDiff(
9109 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9110 Data.front().first->getPointerOperand(), DL, SE,
9111 /*StrictCheck=*/true);
9112 if (!Dist)
9113 continue;
9114 auto It = Map.find(*Dist);
9115 if (It != Map.end() && It->second != LI)
9116 continue;
9117 if (It == Map.end()) {
9118 Data.emplace_back(LI, *Dist);
9119 Map.try_emplace(*Dist, LI);
9120 }
9121 IsFound = true;
9122 break;
9123 }
9124 if (!IsFound) {
9125 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9126 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9127 }
9128 }
9129 auto FindMatchingLoads =
9132 &GatheredLoads,
9133 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9134 int64_t &Offset, unsigned &Start) {
9135 if (Loads.empty())
9136 return GatheredLoads.end();
9137 LoadInst *LI = Loads.front().first;
9138 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9139 if (Idx < Start)
9140 continue;
9141 ToAdd.clear();
9142 if (LI->getParent() != Data.front().first->getParent() ||
9143 LI->getType() != Data.front().first->getType())
9144 continue;
9145 std::optional<int64_t> Dist =
9147 Data.front().first->getType(),
9148 Data.front().first->getPointerOperand(), DL, SE,
9149 /*StrictCheck=*/true);
9150 if (!Dist)
9151 continue;
9152 SmallSet<int64_t, 4> DataDists;
9154 for (std::pair<LoadInst *, int64_t> P : Data) {
9155 DataDists.insert(P.second);
9156 DataLoads.insert(P.first);
9157 }
9158 // Found matching gathered loads - check if all loads are unique or
9159 // can be effectively vectorized.
9160 unsigned NumUniques = 0;
9161 for (auto [Cnt, Pair] : enumerate(Loads)) {
9162 bool Used = DataLoads.contains(Pair.first);
9163 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9164 ++NumUniques;
9165 ToAdd.insert(Cnt);
9166 } else if (Used) {
9167 Repeated.insert(Cnt);
9168 }
9169 }
9170 if (NumUniques > 0 &&
9171 (Loads.size() == NumUniques ||
9172 (Loads.size() - NumUniques >= 2 &&
9173 Loads.size() - NumUniques >= Loads.size() / 2 &&
9174 (has_single_bit(Data.size() + NumUniques) ||
9175 bit_ceil(Data.size()) <
9176 bit_ceil(Data.size() + NumUniques))))) {
9177 Offset = *Dist;
9178 Start = Idx + 1;
9179 return std::next(GatheredLoads.begin(), Idx);
9180 }
9181 }
9182 ToAdd.clear();
9183 return GatheredLoads.end();
9184 };
9185 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9186 unsigned Start = 0;
9187 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9188 int64_t Offset = 0;
9189 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9190 Offset, Start);
9191 while (It != GatheredLoads.end()) {
9192 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9193 for (unsigned Idx : LocalToAdd)
9194 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
9195 ToAdd.insert_range(LocalToAdd);
9196 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9197 Start);
9198 }
9199 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
9200 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9201 })) {
9202 auto AddNewLoads =
9204 for (unsigned Idx : seq<unsigned>(Data.size())) {
9205 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
9206 continue;
9207 Loads.push_back(Data[Idx]);
9208 }
9209 };
9210 if (!AddNew) {
9211 LoadInst *LI = Data.front().first;
9212 It = find_if(
9213 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9214 return PD.front().first->getParent() == LI->getParent() &&
9215 PD.front().first->getType() == LI->getType();
9216 });
9217 while (It != GatheredLoads.end()) {
9218 AddNewLoads(*It);
9219 It = std::find_if(
9220 std::next(It), GatheredLoads.end(),
9221 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9222 return PD.front().first->getParent() == LI->getParent() &&
9223 PD.front().first->getType() == LI->getType();
9224 });
9225 }
9226 }
9227 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9228 AddNewLoads(GatheredLoads.emplace_back());
9229 }
9230 }
9231}
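// --- Illustrative sketch (editor's addition; not part of the upstream pass).
// gatherPossiblyVectorizableLoads() clusters gathered loads around a base
// load and tags each member with its pointer distance (in elements) from
// that base, so later phases can look for consecutive or strided runs. A
// trimmed stand-alone model of the clustering step; the helper name is
// hypothetical, and the real code also merges clusters, deduplicates by
// distance and checks the underlying pointer object.
static void
sketchClusterLoads(ArrayRef<LoadInst *> Loads, const DataLayout &DL,
                   ScalarEvolution &SE,
                   SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
                       &Clusters) {
  for (LoadInst *LI : Loads) {
    bool Placed = false;
    for (auto &Cluster : Clusters) {
      LoadInst *Base = Cluster.front().first;
      if (Base->getParent() != LI->getParent() ||
          Base->getType() != LI->getType())
        continue;
      std::optional<int64_t> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(), Base->getType(),
          Base->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
      if (!Dist)
        continue; // pointers are not comparable - try the next cluster
      Cluster.emplace_back(LI, *Dist);
      Placed = true;
      break;
    }
    if (!Placed)
      Clusters.emplace_back().emplace_back(LI, 0); // start a new cluster
  }
}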
9232
9233void BoUpSLP::tryToVectorizeGatheredLoads(
9234 const SmallMapVector<
9235 std::tuple<BasicBlock *, Value *, Type *>,
9236 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9237 &GatheredLoads) {
9238 GatheredLoadsEntriesFirst = VectorizableTree.size();
9239
9240 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9241 LoadEntriesToVectorize.size());
9242 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9243 Set.insert_range(VectorizableTree[Idx]->Scalars);
9244
9245 // Sort loads by distance.
9246 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9247 const std::pair<LoadInst *, int64_t> &L2) {
9248 return L1.second > L2.second;
9249 };
9250
9251 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9252 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9253 Loads.size());
9254 Align Alignment = computeCommonAlignment<LoadInst>(Values);
9255 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9256 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9257 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9258 };
9259
9260 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9261 BoUpSLP::ValueSet &VectorizedLoads,
9262 SmallVectorImpl<LoadInst *> &NonVectorized,
9263 bool Final, unsigned MaxVF) {
9265 unsigned StartIdx = 0;
9266 SmallVector<int> CandidateVFs;
9267 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
9268 CandidateVFs.push_back(MaxVF);
9269 for (int NumElts = getFloorFullVectorNumberOfElements(
9270 *TTI, Loads.front()->getType(), MaxVF);
9271 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9272 *TTI, Loads.front()->getType(), NumElts - 1)) {
9273 CandidateVFs.push_back(NumElts);
9274 if (VectorizeNonPowerOf2 && NumElts > 2)
9275 CandidateVFs.push_back(NumElts - 1);
9276 }
9277
9278 if (Final && CandidateVFs.empty())
9279 return Results;
9280
9281 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9282 for (unsigned NumElts : CandidateVFs) {
9283 if (Final && NumElts > BestVF)
9284 continue;
9285 SmallVector<unsigned> MaskedGatherVectorized;
9286 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9287 ++Cnt) {
9288 ArrayRef<LoadInst *> Slice =
9289 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
9290 if (VectorizedLoads.count(Slice.front()) ||
9291 VectorizedLoads.count(Slice.back()) ||
9293 continue;
9294 // Check if it is profitable to try vectorizing gathered loads. It is
9295 // profitable if we have more than 3 consecutive loads or if we have
9296 // fewer, but all users are vectorized or deleted.
9297 bool AllowToVectorize = false;
9298 // Check if it is profitable to vectorize 2-elements loads.
9299 if (NumElts == 2) {
9300 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9301 Slice.front()->getType(), ElementCount::getFixed(NumElts));
9302 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9303 for (LoadInst *LI : Slice) {
9304 // If single use/user - allow to vectorize.
9305 if (LI->hasOneUse())
9306 continue;
9307 // 1. Check if number of uses equals number of users.
9308 // 2. All users are deleted.
9309 // 3. The load broadcasts are not allowed or the load is not
9310 // broadcasted.
9311 if (static_cast<unsigned int>(std::distance(
9312 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9313 return false;
9314 if (!IsLegalBroadcastLoad)
9315 continue;
9316 if (LI->hasNUsesOrMore(UsesLimit))
9317 return false;
9318 for (User *U : LI->users()) {
9319 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
9320 continue;
9321 for (const TreeEntry *UTE : getTreeEntries(U)) {
9322 for (int I : seq<int>(UTE->getNumOperands())) {
9323 if (all_of(UTE->getOperand(I), [LI](Value *V) {
9324 return V == LI || isa<PoisonValue>(V);
9325 }))
9326 // Found legal broadcast - do not vectorize.
9327 return false;
9328 }
9329 }
9330 }
9331 }
9332 return true;
9333 };
9334 AllowToVectorize = CheckIfAllowed(Slice);
9335 } else {
9336 AllowToVectorize =
9337 (NumElts >= 3 ||
9338 any_of(ValueToGatherNodes.at(Slice.front()),
9339 [=](const TreeEntry *TE) {
9340 return TE->Scalars.size() == 2 &&
9341 ((TE->Scalars.front() == Slice.front() &&
9342 TE->Scalars.back() == Slice.back()) ||
9343 (TE->Scalars.front() == Slice.back() &&
9344 TE->Scalars.back() == Slice.front()));
9345 })) &&
9346 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
9347 Slice.size());
9348 }
9349 if (AllowToVectorize) {
9350 SmallVector<Value *> PointerOps;
9351 OrdersType CurrentOrder;
9352 // Try to build vector load.
9353 ArrayRef<Value *> Values(
9354 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9355 StridedPtrInfo SPtrInfo;
9356 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
9357 PointerOps, SPtrInfo, &BestVF);
9358 if (LS != LoadsState::Gather ||
9359 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9360 if (LS == LoadsState::ScatterVectorize) {
9361 if (MaskedGatherVectorized.empty() ||
9362 Cnt >= MaskedGatherVectorized.back() + NumElts)
9363 MaskedGatherVectorized.push_back(Cnt);
9364 continue;
9365 }
9366 if (LS != LoadsState::Gather) {
9367 Results.emplace_back(Values, LS);
9368 VectorizedLoads.insert_range(Slice);
9369 // If we vectorized initial block, no need to try to vectorize it
9370 // again.
9371 if (Cnt == StartIdx)
9372 StartIdx += NumElts;
9373 }
9374 // Check if the whole array was vectorized already - exit.
9375 if (StartIdx >= Loads.size())
9376 break;
9377 // Erase last masked gather candidate, if another candidate within
9378 // the range is found to be better.
9379 if (!MaskedGatherVectorized.empty() &&
9380 Cnt < MaskedGatherVectorized.back() + NumElts)
9381 MaskedGatherVectorized.pop_back();
9382 Cnt += NumElts - 1;
9383 continue;
9384 }
9385 }
9386 if (!AllowToVectorize || BestVF == 0)
9388 }
9389 // Mark masked gathers candidates as vectorized, if any.
9390 for (unsigned Cnt : MaskedGatherVectorized) {
9391 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9392 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9393 ArrayRef<Value *> Values(
9394 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9395 Results.emplace_back(Values, LoadsState::ScatterVectorize);
9396 VectorizedLoads.insert_range(Slice);
9397 // If we vectorized initial block, no need to try to vectorize it again.
9398 if (Cnt == StartIdx)
9399 StartIdx += NumElts;
9400 }
9401 }
9402 for (LoadInst *LI : Loads) {
9403 if (!VectorizedLoads.contains(LI))
9404 NonVectorized.push_back(LI);
9405 }
9406 return Results;
9407 };
9408 auto ProcessGatheredLoads =
9409 [&, &TTI = *TTI](
9411 bool Final = false) {
9412 SmallVector<LoadInst *> NonVectorized;
9413 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9414 GatheredLoads) {
9415 if (LoadsDists.size() <= 1) {
9416 NonVectorized.push_back(LoadsDists.back().first);
9417 continue;
9418 }
9420 LoadsDists);
9421 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
9422 stable_sort(LocalLoadsDists, LoadSorter);
9424 unsigned MaxConsecutiveDistance = 0;
9425 unsigned CurrentConsecutiveDist = 1;
9426 int64_t LastDist = LocalLoadsDists.front().second;
9427 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9428 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9429 if (isVectorized(L.first))
9430 continue;
9431 assert(LastDist >= L.second &&
9432 "Expected first distance always not less than second");
9433 if (static_cast<uint64_t>(LastDist - L.second) ==
9434 CurrentConsecutiveDist) {
9435 ++CurrentConsecutiveDist;
9436 MaxConsecutiveDistance =
9437 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9438 Loads.push_back(L.first);
9439 continue;
9440 }
9441 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9442 !Loads.empty())
9443 Loads.pop_back();
9444 CurrentConsecutiveDist = 1;
9445 LastDist = L.second;
9446 Loads.push_back(L.first);
9447 }
9448 if (Loads.size() <= 1)
9449 continue;
9450 if (AllowMaskedGather)
9451 MaxConsecutiveDistance = Loads.size();
9452 else if (MaxConsecutiveDistance < 2)
9453 continue;
9454 BoUpSLP::ValueSet VectorizedLoads;
9455 SmallVector<LoadInst *> SortedNonVectorized;
9457 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9458 Final, MaxConsecutiveDistance);
9459 if (!Results.empty() && !SortedNonVectorized.empty() &&
9460 OriginalLoads.size() == Loads.size() &&
9461 MaxConsecutiveDistance == Loads.size() &&
9463 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9464 return P.second == LoadsState::ScatterVectorize;
9465 })) {
9466 VectorizedLoads.clear();
9467 SmallVector<LoadInst *> UnsortedNonVectorized;
9469 UnsortedResults =
9470 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9471 UnsortedNonVectorized, Final,
9472 OriginalLoads.size());
9473 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9474 SortedNonVectorized.swap(UnsortedNonVectorized);
9475 Results.swap(UnsortedResults);
9476 }
9477 }
9478 for (auto [Slice, _] : Results) {
9479 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9480 << Slice.size() << ")\n");
9481 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
9482 for (Value *L : Slice)
9483 if (!isVectorized(L))
9484 SortedNonVectorized.push_back(cast<LoadInst>(L));
9485 continue;
9486 }
9487
9488 // Select the maximum VF as the maximum of the user gathered nodes' sizes
9489 // and the distance between scalar loads in these nodes.
9490 unsigned MaxVF = Slice.size();
9491 unsigned UserMaxVF = 0;
9492 unsigned InterleaveFactor = 0;
9493 if (MaxVF == 2) {
9494 UserMaxVF = MaxVF;
9495 } else {
9496 // Find the distance between segments of the interleaved loads.
9497 std::optional<unsigned> InterleavedLoadsDistance = 0;
9498 unsigned Order = 0;
9499 std::optional<unsigned> CommonVF = 0;
9500 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9501 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9502 for (auto [Idx, V] : enumerate(Slice)) {
9503 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9504 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9505 unsigned Pos =
9506 EntryToPosition.try_emplace(E, Idx).first->second;
9507 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9508 if (CommonVF) {
9509 if (*CommonVF == 0) {
9510 CommonVF = E->Scalars.size();
9511 continue;
9512 }
9513 if (*CommonVF != E->Scalars.size())
9514 CommonVF.reset();
9515 }
9516 // Check if the load is part of an interleaved load group.
9517 if (Pos != Idx && InterleavedLoadsDistance) {
9518 if (!DeinterleavedNodes.contains(E) &&
9519 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
9520 if (isa<Constant>(V))
9521 return false;
9522 if (isVectorized(V))
9523 return true;
9524 const auto &Nodes = ValueToGatherNodes.at(V);
9525 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9526 !is_contained(Slice, V);
9527 })) {
9528 InterleavedLoadsDistance.reset();
9529 continue;
9530 }
9531 DeinterleavedNodes.insert(E);
9532 if (*InterleavedLoadsDistance == 0) {
9533 InterleavedLoadsDistance = Idx - Pos;
9534 continue;
9535 }
9536 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9537 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9538 InterleavedLoadsDistance.reset();
9539 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9540 }
9541 }
9542 }
9543 DeinterleavedNodes.clear();
9544 // Check if the large load represents an interleaved load operation.
9545 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9546 CommonVF.value_or(0) != 0) {
9547 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9548 unsigned VF = *CommonVF;
9549 OrdersType Order;
9550 SmallVector<Value *> PointerOps;
9551 StridedPtrInfo SPtrInfo;
9552 // Segmented load detected - vectorize at maximum vector factor.
9553 if (InterleaveFactor <= Slice.size() &&
9554 TTI.isLegalInterleavedAccessType(
9555 getWidenedType(Slice.front()->getType(), VF),
9556 InterleaveFactor,
9557 cast<LoadInst>(Slice.front())->getAlign(),
9558 cast<LoadInst>(Slice.front())
9560 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
9561 SPtrInfo) == LoadsState::Vectorize) {
9562 UserMaxVF = InterleaveFactor * VF;
9563 } else {
9564 InterleaveFactor = 0;
9565 }
9566 }
9567 // Cannot represent the loads as consecutive vectorizable nodes -
9568 // just exit.
9569 unsigned ConsecutiveNodesSize = 0;
9570 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9571 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9572 [&, Slice = Slice](const auto &P) {
9573 const auto *It = find_if(Slice, [&](Value *V) {
9574 return std::get<1>(P).contains(V);
9575 });
9576 if (It == Slice.end())
9577 return false;
9578 const TreeEntry &TE =
9579 *VectorizableTree[std::get<0>(P)];
9580 ArrayRef<Value *> VL = TE.Scalars;
9581 OrdersType Order;
9582 SmallVector<Value *> PointerOps;
9583 StridedPtrInfo SPtrInfo;
9585 VL, VL.front(), Order, PointerOps, SPtrInfo);
9586 if (State == LoadsState::ScatterVectorize ||
9588 return false;
9589 ConsecutiveNodesSize += VL.size();
9590 size_t Start = std::distance(Slice.begin(), It);
9591 size_t Sz = Slice.size() - Start;
9592 return Sz < VL.size() ||
9593 Slice.slice(Start, VL.size()) != VL;
9594 }))
9595 continue;
9596 // Try to build long masked gather loads.
9597 UserMaxVF = bit_ceil(UserMaxVF);
9598 if (InterleaveFactor == 0 &&
9599 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
9600 [&, Slice = Slice](unsigned Idx) {
9601 OrdersType Order;
9602 SmallVector<Value *> PointerOps;
9603 StridedPtrInfo SPtrInfo;
9604 return canVectorizeLoads(
9605 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9606 Slice[Idx * UserMaxVF], Order, PointerOps,
9607 SPtrInfo) == LoadsState::ScatterVectorize;
9608 }))
9609 UserMaxVF = MaxVF;
9610 if (Slice.size() != ConsecutiveNodesSize)
9611 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9612 }
9613 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9614 bool IsVectorized = true;
9615 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9616 ArrayRef<Value *> SubSlice =
9617 Slice.slice(I, std::min(VF, E - I));
9618 if (isVectorized(SubSlice.front()))
9619 continue;
9620 // Check if the subslice is a to-be-vectorized entry which is not
9621 // equal to this entry.
9622 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9623 [&](const auto &P) {
9624 return !SubSlice.equals(
9625 VectorizableTree[std::get<0>(P)]
9626 ->Scalars) &&
9627 set_is_subset(SubSlice, std::get<1>(P));
9628 }))
9629 continue;
9630 unsigned Sz = VectorizableTree.size();
9631 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9632 if (Sz == VectorizableTree.size()) {
9633 IsVectorized = false;
9634 // Try non-interleaved vectorization with smaller vector
9635 // factor.
9636 if (InterleaveFactor > 0) {
9637 VF = 2 * (MaxVF / InterleaveFactor);
9638 InterleaveFactor = 0;
9639 }
9640 continue;
9641 }
9642 }
9643 if (IsVectorized)
9644 break;
9645 }
9646 }
9647 NonVectorized.append(SortedNonVectorized);
9648 }
9649 return NonVectorized;
9650 };
9651 for (const auto &GLs : GatheredLoads) {
9652 const auto &Ref = GLs.second;
9653 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
9654 if (!Ref.empty() && !NonVectorized.empty() &&
9655 std::accumulate(
9656 Ref.begin(), Ref.end(), 0u,
9657 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9658 -> unsigned { return S + LoadsDists.size(); }) !=
9659 NonVectorized.size() &&
9660 IsMaskedGatherSupported(NonVectorized)) {
9662 FinalGatheredLoads;
9663 for (LoadInst *LI : NonVectorized) {
9664 // Reinsert non-vectorized loads into another list of loads with the
9665 // same base pointers.
9666 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
9667 FinalGatheredLoads,
9668 /*AddNew=*/false);
9669 }
9670 // Final attempt to vectorize non-vectorized loads.
9671 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
9672 }
9673 }
9674 // Try to vectorize postponed load entries, previously marked as gathered.
9675 for (unsigned Idx : LoadEntriesToVectorize) {
9676 const TreeEntry &E = *VectorizableTree[Idx];
9677 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
9678 // Avoid reordering, if possible.
9679 if (!E.ReorderIndices.empty()) {
9680 // Build a mask out of the reorder indices and reorder scalars per this
9681 // mask.
9682 SmallVector<int> ReorderMask;
9683 inversePermutation(E.ReorderIndices, ReorderMask);
9684 reorderScalars(GatheredScalars, ReorderMask);
9685 }
9686 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9687 }
9688 // If no new entries were created, there are no gathered-load entries to
9689 // handle.
9690 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9691 VectorizableTree.size())
9692 GatheredLoadsEntriesFirst.reset();
9693}
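// --- Illustrative sketch (editor's addition; not part of the upstream pass).
// The final loop over the candidate slices above retries each slice with
// halving vector factors: first the whole slice, then VF/2-sized sub-slices,
// and so on down to VF == 2, stopping at the first factor for which every
// sub-slice produces a new tree entry. A stand-alone model of that
// VF-halving schedule (simplified: the interleave-factor fallback and the
// already-vectorized checks are omitted; names are hypothetical, and
// TryBuildSubtree stands in for buildTreeRec()):
static bool
sketchTryHalvingVFs(ArrayRef<Value *> Slice, unsigned MaxVF,
                    function_ref<bool(ArrayRef<Value *>)> TryBuildSubtree) {
  for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
    bool AllVectorized = true;
    for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
      ArrayRef<Value *> Sub = Slice.slice(I, std::min(VF, E - I));
      if (!TryBuildSubtree(Sub))
        AllVectorized = false; // this sub-slice failed; try a smaller VF
    }
    if (AllVectorized)
      return true;
  }
  return false;
}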
9694
9695/// Generates a key/subkey pair for the given value to provide effective
9696/// sorting of the values and better detection of vectorizable value
9697/// sequences. The keys can be used for sorting the values themselves and the
9698/// subkeys for sorting within value subgroups.
9699static std::pair<size_t, size_t> generateKeySubkey(
9700 Value *V, const TargetLibraryInfo *TLI,
9701 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
9702 bool AllowAlternate) {
9703 hash_code Key = hash_value(V->getValueID() + 2);
9704 hash_code SubKey = hash_value(0);
9705 // Sort the loads by the distance between the pointers.
9706 if (auto *LI = dyn_cast<LoadInst>(V)) {
9707 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
9708 if (LI->isSimple())
9709 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
9710 else
9711 Key = SubKey = hash_value(LI);
9712 } else if (isVectorLikeInstWithConstOps(V)) {
9713 // Sort extracts by the vector operands.
9715 Key = hash_value(Value::UndefValueVal + 1);
9716 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
9717 if (!isUndefVector(EI->getVectorOperand()).all() &&
9718 !isa<UndefValue>(EI->getIndexOperand()))
9719 SubKey = hash_value(EI->getVectorOperand());
9720 }
9721 } else if (auto *I = dyn_cast<Instruction>(V)) {
9722 // Sort other instructions just by the opcodes except for CMPInst.
9723 // For CMP also sort by the predicate kind.
9725 isValidForAlternation(I->getOpcode())) {
9726 if (AllowAlternate)
9727 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
9728 else
9729 Key = hash_combine(hash_value(I->getOpcode()), Key);
9730 SubKey = hash_combine(
9731 hash_value(I->getOpcode()), hash_value(I->getType()),
9733 ? I->getType()
9734 : cast<CastInst>(I)->getOperand(0)->getType()));
9735 // For casts, look through the only operand to improve compile time.
9736 if (isa<CastInst>(I)) {
9737 std::pair<size_t, size_t> OpVals =
9738 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
9739 /*AllowAlternate=*/true);
9740 Key = hash_combine(OpVals.first, Key);
9741 SubKey = hash_combine(OpVals.first, SubKey);
9742 }
9743 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
9744 CmpInst::Predicate Pred = CI->getPredicate();
9745 if (CI->isCommutative())
9746 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
9748 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
9749 hash_value(SwapPred),
9750 hash_value(CI->getOperand(0)->getType()));
9751 } else if (auto *Call = dyn_cast<CallInst>(I)) {
9754 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
9755 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
9756 SubKey = hash_combine(hash_value(I->getOpcode()),
9757 hash_value(Call->getCalledFunction()));
9758 } else {
9760 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
9761 }
9762 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
9763 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
9764 hash_value(Op.Tag), SubKey);
9765 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
9766 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
9767 SubKey = hash_value(Gep->getPointerOperand());
9768 else
9769 SubKey = hash_value(Gep);
9770 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
9771 !isa<ConstantInt>(I->getOperand(1))) {
9772 // Do not try to vectorize instructions with potentially high cost.
9773 SubKey = hash_value(I);
9774 } else {
9775 SubKey = hash_value(I->getOpcode());
9776 }
9777 Key = hash_combine(hash_value(I->getParent()), Key);
9778 }
9779 return std::make_pair(Key, SubKey);
9780}
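// --- Illustrative sketch (editor's addition; not part of the upstream pass).
// generateKeySubkey() lends itself to two-level grouping: values are first
// bucketed by the coarse key (roughly "what kind of value, in which block")
// and then, inside each bucket, by the finer subkey (opcode, operand types,
// predicate, ...). A minimal model of that grouping; the nested map layout
// is hypothetical, and the loads subkey generator is stubbed out here.
static void sketchGroupByKeySubkey(
    ArrayRef<Value *> VL, const TargetLibraryInfo *TLI,
    MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> &Groups) {
  auto NoLoadSubkey = [](size_t Key, LoadInst *) { return hash_code(Key); };
  for (Value *V : VL) {
    auto [Key, SubKey] =
        generateKeySubkey(V, TLI, NoLoadSubkey, /*AllowAlternate=*/false);
    Groups[Key][SubKey].push_back(V); // same key+subkey => likely same bundle
  }
}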
9781
9782/// Checks if the specified instruction \p I is a main operation for the given
9783/// \p MainOp and \p AltOp instructions.
9784static bool isMainInstruction(Instruction *I, Instruction *MainOp,
9785 Instruction *AltOp, const TargetLibraryInfo &TLI);
9786
9787bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9788 ArrayRef<Value *> VL) const {
9789 Type *ScalarTy = S.getMainOp()->getType();
9790 unsigned Opcode0 = S.getOpcode();
9791 unsigned Opcode1 = S.getAltOpcode();
9792 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9793 // If this pattern is supported by the target then consider it profitable.
9794 if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
9795 Opcode1, OpcodeMask))
9796 return true;
9797 SmallVector<ValueList> Operands;
9798 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9799 Operands.emplace_back();
9800 // Prepare the operand vector.
9801 for (Value *V : VL) {
9802 if (isa<PoisonValue>(V)) {
9803 Operands.back().push_back(
9804 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
9805 continue;
9806 }
9807 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
9808 }
9809 }
9810 if (Operands.size() == 2) {
9811 // Try to find the best operand candidates.
9812 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
9814 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9815 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9816 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9817 std::optional<int> Res = findBestRootPair(Candidates);
9818 switch (Res.value_or(0)) {
9819 case 0:
9820 break;
9821 case 1:
9822 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9823 break;
9824 case 2:
9825 std::swap(Operands[0][I], Operands[1][I]);
9826 break;
9827 default:
9828 llvm_unreachable("Unexpected index.");
9829 }
9830 }
9831 }
9832 DenseSet<unsigned> UniqueOpcodes;
9833 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
9834 unsigned NonInstCnt = 0;
9835 // Estimate number of instructions, required for the vectorized node and for
9836 // the buildvector node.
9837 unsigned UndefCnt = 0;
9838 // Count the number of extra shuffles, required for vector nodes.
9839 unsigned ExtraShuffleInsts = 0;
9840 // Check that the operands do not contain the same values and create either
9841 // a perfect diamond match or a shuffled match.
9842 if (Operands.size() == 2) {
9843 // Do not count same operands twice.
9844 if (Operands.front() == Operands.back()) {
9845 Operands.erase(Operands.begin());
9846 } else if (!allConstant(Operands.front()) &&
9847 all_of(Operands.front(), [&](Value *V) {
9848 return is_contained(Operands.back(), V);
9849 })) {
9850 Operands.erase(Operands.begin());
9851 ++ExtraShuffleInsts;
9852 }
9853 }
9854 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9855 // Vectorize the node if:
9856 // 1. At least a single operand is constant or splat.
9857 // 2. Operands have many loop invariants (the instructions are not loop
9858 // invariants).
9859 // 3. At least a single unique operand is supposed to be vectorized.
9860 return none_of(Operands,
9861 [&](ArrayRef<Value *> Op) {
9862 if (allConstant(Op) ||
9863 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
9864 getSameOpcode(Op, *TLI)))
9865 return false;
9866 DenseMap<Value *, unsigned> Uniques;
9867 for (Value *V : Op) {
9869 isVectorized(V) || (L && L->isLoopInvariant(V))) {
9870 if (isa<UndefValue>(V))
9871 ++UndefCnt;
9872 continue;
9873 }
9874 auto Res = Uniques.try_emplace(V, 0);
9875 // Found first duplicate - need to add shuffle.
9876 if (!Res.second && Res.first->second == 1)
9877 ++ExtraShuffleInsts;
9878 ++Res.first->getSecond();
9879 if (auto *I = dyn_cast<Instruction>(V))
9880 UniqueOpcodes.insert(I->getOpcode());
9881 else if (Res.second)
9882 ++NonInstCnt;
9883 }
9884 return none_of(Uniques, [&](const auto &P) {
9885 return P.first->hasNUsesOrMore(P.second + 1) &&
9886 none_of(P.first->users(), [&](User *U) {
9887 return isVectorized(U) || Uniques.contains(U);
9888 });
9889 });
9890 }) ||
9891 // Do not vectorize the node if the estimated number of vector instructions
9892 // exceeds the estimated number of buildvector instructions. The number of
9893 // vector operands is the number of vector instructions plus the number of
9894 // vector instructions for the operands (buildvectors). The number of
9895 // buildvector instructions is just number_of_operands * number_of_scalars.
9896 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9897 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
9898 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
9899}
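// --- Illustrative sketch (editor's addition; not part of the upstream pass).
// The final comparison above is a rough instruction-count budget: the
// alternate-opcode node is kept only when
//   UniqueOpcodes + NonInstCnt + ExtraShuffleInsts + NumAltInsts
// stays below number_of_operands * number_of_scalars, i.e. the cost of simply
// building the operand vectors element by element. A tiny worked model of
// that inequality under assumed counts (the UndefCnt guard is omitted, and
// the helper name is hypothetical):
static bool sketchAltNodeBudget(unsigned UniqueOpcodes, unsigned NonInstCnt,
                                unsigned ExtraShuffleInsts,
                                unsigned NumOperands, unsigned NumScalars) {
  constexpr unsigned NumAltInsts = 3; // main + alt + blending shuffle
  unsigned VectorSideEstimate =
      UniqueOpcodes + NonInstCnt + ExtraShuffleInsts + NumAltInsts;
  unsigned BuildVectorEstimate = NumOperands * NumScalars;
  // E.g. 2 unique operand opcodes, no extra shuffles, 2 operands, 4 scalars:
  // 2 + 0 + 0 + 3 = 5 < 8, so the alternate node looks profitable.
  return VectorSideEstimate < BuildVectorEstimate;
}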
9900
9901/// Builds the argument types vector for the given call instruction with the
9902/// given \p ID for the specified vector factor.
9905 const unsigned VF, unsigned MinBW,
9906 const TargetTransformInfo *TTI) {
9907 SmallVector<Type *> ArgTys;
9908 for (auto [Idx, Arg] : enumerate(CI->args())) {
9911 ArgTys.push_back(Arg->getType());
9912 continue;
9913 }
9914 if (MinBW > 0) {
9915 ArgTys.push_back(
9916 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9917 continue;
9918 }
9919 }
9920 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9921 }
9922 return ArgTys;
9923}
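// --- Illustrative sketch (editor's addition; not part of the upstream pass).
// buildIntrinsicArgTypes() widens every vectorizable argument type to
// <VF x Ty>, applies the MinBW narrowing when requested, and leaves intrinsic
// arguments that must stay scalar untouched. For example, for
// llvm.powi.f32.i32 with VF = 4 the result would be roughly
// { <4 x float>, i32 }. A reduced model of the widening rule; it assumes the
// caller has already decided per argument whether it stays scalar (the real
// code consults isVectorIntrinsicWithScalarOpAtArg for that).
static SmallVector<Type *> sketchWidenArgTypes(ArrayRef<Type *> ScalarArgTys,
                                               ArrayRef<bool> KeepScalar,
                                               unsigned VF) {
  SmallVector<Type *> ArgTys;
  for (auto [Idx, Ty] : enumerate(ScalarArgTys)) {
    if (KeepScalar[Idx])
      ArgTys.push_back(Ty); // e.g. the i32 exponent of llvm.powi
    else
      ArgTys.push_back(FixedVectorType::get(Ty, VF));
  }
  return ArgTys;
}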
9924
9925/// Calculates the costs of the vectorized intrinsic (if possible) and of the
9926/// vectorized library function (if possible) calls. Returns an invalid cost
9927/// for the corresponding call if it cannot be vectorized or will be scalarized.
9928static std::pair<InstructionCost, InstructionCost>
9931 ArrayRef<Type *> ArgTys) {
9932 auto Shape = VFShape::get(CI->getFunctionType(),
9934 false /*HasGlobalPred*/);
9935 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9936 auto LibCost = InstructionCost::getInvalid();
9937 if (!CI->isNoBuiltin() && VecFunc) {
9938 // Calculate the cost of the vector library call.
9939 // If the corresponding vector call is cheaper, return its cost.
9940 LibCost =
9941 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9942 }
9943 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9944
9945 // Calculate the cost of the vector intrinsic call.
9946 FastMathFlags FMF;
9947 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9948 FMF = FPCI->getFastMathFlags();
9949 const InstructionCost ScalarLimit = 10000;
9950 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
9951 LibCost.isValid() ? LibCost : ScalarLimit);
9952 auto IntrinsicCost =
9953 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9954 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
9955 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
9956 IntrinsicCost = InstructionCost::getInvalid();
9957
9958 return {IntrinsicCost, LibCost};
9959}
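// --- Illustrative sketch (editor's addition; not part of the upstream pass).
// A caller of getVectorCallCosts() typically treats the returned pair as "use
// the cheapest valid lowering": an invalid intrinsic cost means "scalarize or
// use the library call", an invalid library cost means "no vector library
// mapping exists". A hedged model of that selection logic (the enum and
// helper names are hypothetical):
enum class SketchCallLowering { Intrinsic, LibraryCall, Scalarize };
static SketchCallLowering
sketchPickCallLowering(const std::pair<InstructionCost, InstructionCost> &C) {
  const auto &[IntrinsicCost, LibCost] = C;
  if (IntrinsicCost.isValid() &&
      (!LibCost.isValid() || IntrinsicCost <= LibCost))
    return SketchCallLowering::Intrinsic;
  if (LibCost.isValid())
    return SketchCallLowering::LibraryCall;
  return SketchCallLowering::Scalarize;
}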
9960
9961BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9962 const InstructionsState &S, ArrayRef<Value *> VL,
9963 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9964 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
9965 assert(S.getMainOp() &&
9966 "Expected instructions with same/alternate opcodes only.");
9967
9968 unsigned ShuffleOrOp =
9969 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9970 Instruction *VL0 = S.getMainOp();
9971 switch (ShuffleOrOp) {
9972 case Instruction::PHI: {
9973 // Too many operands - gather, most probably won't be vectorized.
9974 if (VL0->getNumOperands() > MaxPHINumOperands)
9975 return TreeEntry::NeedToGather;
9976 // Check for terminator values (e.g. invoke).
9977 for (Value *V : VL) {
9978 auto *PHI = dyn_cast<PHINode>(V);
9979 if (!PHI)
9980 continue;
9981 for (Value *Incoming : PHI->incoming_values()) {
9983 if (Term && Term->isTerminator()) {
9985 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9986 return TreeEntry::NeedToGather;
9987 }
9988 }
9989 }
9990
9991 return TreeEntry::Vectorize;
9992 }
9993 case Instruction::ExtractElement:
9994 if (any_of(VL, [&](Value *V) {
9995 auto *EI = dyn_cast<ExtractElementInst>(V);
9996 if (!EI)
9997 return true;
9998 return isVectorized(EI->getOperand(0));
9999 }))
10000 return TreeEntry::NeedToGather;
10001 [[fallthrough]];
10002 case Instruction::ExtractValue: {
10003 bool Reuse = canReuseExtract(VL, CurrentOrder);
10004 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
10005 // non-full registers).
10006 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
10007 return TreeEntry::NeedToGather;
10008 if (Reuse || !CurrentOrder.empty())
10009 return TreeEntry::Vectorize;
10010 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10011 return TreeEntry::NeedToGather;
10012 }
10013 case Instruction::InsertElement: {
10014 // Check that we have a buildvector and not a shuffle of 2 or more
10015 // different vectors.
10016 ValueSet SourceVectors;
10017 for (Value *V : VL) {
10018 if (isa<PoisonValue>(V)) {
10019 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10020 return TreeEntry::NeedToGather;
10021 }
10022 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10023 assert(getElementIndex(V) != std::nullopt &&
10024 "Non-constant or undef index?");
10025 }
10026
10027 if (count_if(VL, [&SourceVectors](Value *V) {
10028 return !SourceVectors.contains(V);
10029 }) >= 2) {
10030 // Found 2nd source vector - cancel.
10031 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10032 "different source vectors.\n");
10033 return TreeEntry::NeedToGather;
10034 }
10035
10036 if (any_of(VL, [&SourceVectors](Value *V) {
10037 // The last InsertElement can have multiple uses.
10038 return SourceVectors.contains(V) && !V->hasOneUse();
10039 })) {
10040 assert(SLPReVec && "Only supported by REVEC.");
10041 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10042 "multiple uses.\n");
10043 return TreeEntry::NeedToGather;
10044 }
10045
10046 return TreeEntry::Vectorize;
10047 }
10048 case Instruction::Load: {
10049 // Check that a vectorized load would load the same memory as a scalar
10050 // load. For example, we don't want to vectorize loads that are smaller
10051 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10052 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10053 // from such a struct, we read/write packed bits disagreeing with the
10054 // unvectorized version.
10055 auto IsGatheredNode = [&]() {
10056 if (!GatheredLoadsEntriesFirst)
10057 return false;
10058 return all_of(VL, [&](Value *V) {
10059 if (isa<PoisonValue>(V))
10060 return true;
10061 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10062 return TE->Idx >= *GatheredLoadsEntriesFirst;
10063 });
10064 });
10065 };
10066 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10067 case LoadsState::Vectorize:
10068 return TreeEntry::Vectorize;
10069 case LoadsState::CompressVectorize:
10070 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10071 // Delay slow vectorized nodes for better vectorization attempts.
10072 LoadEntriesToVectorize.insert(VectorizableTree.size());
10073 return TreeEntry::NeedToGather;
10074 }
10075 return IsGatheredNode() ? TreeEntry::NeedToGather
10076 : TreeEntry::CompressVectorize;
10077 case LoadsState::ScatterVectorize:
10078 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10079 // Delay slow vectorized nodes for better vectorization attempts.
10080 LoadEntriesToVectorize.insert(VectorizableTree.size());
10081 return TreeEntry::NeedToGather;
10082 }
10083 return IsGatheredNode() ? TreeEntry::NeedToGather
10084 : TreeEntry::ScatterVectorize;
10085 case LoadsState::StridedVectorize:
10086 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10087 // Delay slow vectorized nodes for better vectorization attempts.
10088 LoadEntriesToVectorize.insert(VectorizableTree.size());
10089 return TreeEntry::NeedToGather;
10090 }
10091 return IsGatheredNode() ? TreeEntry::NeedToGather
10092 : TreeEntry::StridedVectorize;
10093 case LoadsState::Gather:
10094#ifndef NDEBUG
10095 Type *ScalarTy = VL0->getType();
10096 if (DL->getTypeSizeInBits(ScalarTy) !=
10097 DL->getTypeAllocSizeInBits(ScalarTy))
10098 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10099 else if (any_of(VL, [](Value *V) {
10100 auto *LI = dyn_cast<LoadInst>(V);
10101 return !LI || !LI->isSimple();
10102 }))
10103 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10104 else
10105 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10106#endif // NDEBUG
10108 return TreeEntry::NeedToGather;
10109 }
10110 llvm_unreachable("Unexpected state of loads");
10111 }
10112 case Instruction::ZExt:
10113 case Instruction::SExt:
10114 case Instruction::FPToUI:
10115 case Instruction::FPToSI:
10116 case Instruction::FPExt:
10117 case Instruction::PtrToInt:
10118 case Instruction::IntToPtr:
10119 case Instruction::SIToFP:
10120 case Instruction::UIToFP:
10121 case Instruction::Trunc:
10122 case Instruction::FPTrunc:
10123 case Instruction::BitCast: {
10124 Type *SrcTy = VL0->getOperand(0)->getType();
10125 for (Value *V : VL) {
10126 if (isa<PoisonValue>(V))
10127 continue;
10128 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10129 if (Ty != SrcTy || !isValidElementType(Ty)) {
10130 LLVM_DEBUG(
10131 dbgs() << "SLP: Gathering casts with different src types.\n");
10132 return TreeEntry::NeedToGather;
10133 }
10134 }
10135 return TreeEntry::Vectorize;
10136 }
10137 case Instruction::ICmp:
10138 case Instruction::FCmp: {
10139 // Check that all of the compares have the same predicate.
10140 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10142 Type *ComparedTy = VL0->getOperand(0)->getType();
10143 for (Value *V : VL) {
10144 if (isa<PoisonValue>(V))
10145 continue;
10146 auto *Cmp = cast<CmpInst>(V);
10147 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10148 Cmp->getOperand(0)->getType() != ComparedTy) {
10149 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10150 return TreeEntry::NeedToGather;
10151 }
10152 }
10153 return TreeEntry::Vectorize;
10154 }
10155 case Instruction::Select:
10156 case Instruction::FNeg:
10157 case Instruction::Add:
10158 case Instruction::FAdd:
10159 case Instruction::Sub:
10160 case Instruction::FSub:
10161 case Instruction::Mul:
10162 case Instruction::FMul:
10163 case Instruction::UDiv:
10164 case Instruction::SDiv:
10165 case Instruction::FDiv:
10166 case Instruction::URem:
10167 case Instruction::SRem:
10168 case Instruction::FRem:
10169 case Instruction::Shl:
10170 case Instruction::LShr:
10171 case Instruction::AShr:
10172 case Instruction::And:
10173 case Instruction::Or:
10174 case Instruction::Xor:
10175 case Instruction::Freeze:
10176 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10177 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10178 auto *I = dyn_cast<Instruction>(V);
10179 return I && I->isBinaryOp() && !I->isFast();
10180 }))
10181 return TreeEntry::NeedToGather;
10182 return TreeEntry::Vectorize;
10183 case Instruction::GetElementPtr: {
10184 // We don't combine GEPs with complicated (nested) indexing.
10185 for (Value *V : VL) {
10186 auto *I = dyn_cast<GetElementPtrInst>(V);
10187 if (!I)
10188 continue;
10189 if (I->getNumOperands() != 2) {
10190 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10191 return TreeEntry::NeedToGather;
10192 }
10193 }
10194
10195 // We can't combine several GEPs into one vector if they operate on
10196 // different types.
10197 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
10198 for (Value *V : VL) {
10199 auto *GEP = dyn_cast<GEPOperator>(V);
10200 if (!GEP)
10201 continue;
10202 Type *CurTy = GEP->getSourceElementType();
10203 if (Ty0 != CurTy) {
10204 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10205 return TreeEntry::NeedToGather;
10206 }
10207 }
10208
10209 // We don't combine GEPs with non-constant indexes.
10210 Type *Ty1 = VL0->getOperand(1)->getType();
10211 for (Value *V : VL) {
10212 auto *I = dyn_cast<GetElementPtrInst>(V);
10213 if (!I)
10214 continue;
10215 auto *Op = I->getOperand(1);
10216 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10217 (Op->getType() != Ty1 &&
10218 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
10219 Op->getType()->getScalarSizeInBits() >
10220 DL->getIndexSizeInBits(
10221 V->getType()->getPointerAddressSpace())))) {
10222 LLVM_DEBUG(
10223 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10224 return TreeEntry::NeedToGather;
10225 }
10226 }
10227
10228 return TreeEntry::Vectorize;
10229 }
10230 case Instruction::Store: {
10231 // Check if the stores are consecutive or if we need to swizzle them.
10232 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10233 // Avoid types that are padded when being allocated as scalars, while
10234 // being packed together in a vector (such as i1).
10235 if (DL->getTypeSizeInBits(ScalarTy) !=
10236 DL->getTypeAllocSizeInBits(ScalarTy)) {
10237 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10238 return TreeEntry::NeedToGather;
10239 }
10240 // Make sure all stores in the bundle are simple - we can't vectorize
10241 // atomic or volatile stores.
10242 for (Value *V : VL) {
10243 auto *SI = cast<StoreInst>(V);
10244 if (!SI->isSimple()) {
10245 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10246 return TreeEntry::NeedToGather;
10247 }
10248 PointerOps.push_back(SI->getPointerOperand());
10249 }
10250
10251 // Check the order of pointer operands.
10252 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
10253 Value *Ptr0;
10254 Value *PtrN;
10255 if (CurrentOrder.empty()) {
10256 Ptr0 = PointerOps.front();
10257 PtrN = PointerOps.back();
10258 } else {
10259 Ptr0 = PointerOps[CurrentOrder.front()];
10260 PtrN = PointerOps[CurrentOrder.back()];
10261 }
10262 std::optional<int64_t> Dist =
10263 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
10264 // Check that the sorted pointer operands are consecutive.
10265 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10266 return TreeEntry::Vectorize;
10267 }
10268
10269 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10270 return TreeEntry::NeedToGather;
10271 }
10272 case Instruction::Call: {
10273 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10274 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10275 auto *I = dyn_cast<Instruction>(V);
10276 return I && !I->isFast();
10277 }))
10278 return TreeEntry::NeedToGather;
10279 // Check if the calls are all to the same vectorizable intrinsic or
10280 // library function.
10281 CallInst *CI = cast<CallInst>(VL0);
10283
10284 VFShape Shape = VFShape::get(
10285 CI->getFunctionType(),
10286 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
10287 false /*HasGlobalPred*/);
10288 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10289
10290 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10291 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10292 return TreeEntry::NeedToGather;
10293 }
10294 Function *F = CI->getCalledFunction();
10295 unsigned NumArgs = CI->arg_size();
10296 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10297 for (unsigned J = 0; J != NumArgs; ++J)
10299 ScalarArgs[J] = CI->getArgOperand(J);
10300 for (Value *V : VL) {
10301 CallInst *CI2 = dyn_cast<CallInst>(V);
10302 if (!CI2 || CI2->getCalledFunction() != F ||
10303 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
10304 (VecFunc &&
10305 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10307 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10308 << "\n");
10309 return TreeEntry::NeedToGather;
10310 }
10311 // Some intrinsics have scalar arguments, and those arguments must be the
10312 // same for the calls to be vectorized.
10313 for (unsigned J = 0; J != NumArgs; ++J) {
10315 Value *A1J = CI2->getArgOperand(J);
10316 if (ScalarArgs[J] != A1J) {
10318 << "SLP: mismatched arguments in call:" << *CI
10319 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10320 return TreeEntry::NeedToGather;
10321 }
10322 }
10323 }
10324 // Verify that the bundle operands are identical between the two calls.
10325 if (CI->hasOperandBundles() &&
10326 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
10327 CI->op_begin() + CI->getBundleOperandsEndIndex(),
10328 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10329 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10330 << "!=" << *V << '\n');
10331 return TreeEntry::NeedToGather;
10332 }
10333 }
10334 SmallVector<Type *> ArgTys =
10335 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
10336 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10337 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10338 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10339 return TreeEntry::NeedToGather;
10340
10341 return TreeEntry::Vectorize;
10342 }
10343 case Instruction::ShuffleVector: {
10344 if (!S.isAltShuffle()) {
10345 // REVEC can support non-alternate shuffles.
10347 return TreeEntry::Vectorize;
10348 // If this is not an alternate sequence of opcode like add-sub
10349 // then do not vectorize this instruction.
10350 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10351 return TreeEntry::NeedToGather;
10352 }
10353 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10354 LLVM_DEBUG(
10355 dbgs()
10356 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10357 "the whole alt sequence is not profitable.\n");
10358 return TreeEntry::NeedToGather;
10359 }
10360
10361 return TreeEntry::Vectorize;
10362 }
10363 default:
10364 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10365 return TreeEntry::NeedToGather;
10366 }
10367}
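// --- Illustrative sketch (editor's addition; not part of the upstream pass).
// For stores, the state check above boils down to: sort the pointer operands
// and accept the bundle only when the distance (in elements) between the
// first and the last sorted pointer equals VL.size() - 1, i.e. the stores
// cover a dense, gap-free range. A reduced model of that test (the helper
// name is hypothetical; the packed-type and simplicity checks are omitted):
static bool sketchStoresAreConsecutive(ArrayRef<Value *> PointerOps,
                                       Type *ScalarTy, const DataLayout &DL,
                                       ScalarEvolution &SE) {
  SmallVector<unsigned> SortedIndices;
  if (!sortPtrAccesses(PointerOps, ScalarTy, DL, SE, SortedIndices))
    return false;
  Value *Ptr0 = SortedIndices.empty() ? PointerOps.front()
                                      : PointerOps[SortedIndices.front()];
  Value *PtrN = SortedIndices.empty() ? PointerOps.back()
                                      : PointerOps[SortedIndices.back()];
  std::optional<int64_t> Dist =
      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
  return Dist && static_cast<uint64_t>(*Dist) == PointerOps.size() - 1;
}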
10368
10369namespace {
10370/// Allows correct handling of the operands of the phi nodes, based on the
10371/// \p Main PHINode's order of incoming basic blocks/values.
10372class PHIHandler {
10373 DominatorTree &DT;
10374 PHINode *Main = nullptr;
10377
10378public:
10379 PHIHandler() = delete;
10380 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10381 : DT(DT), Main(Main), Phis(Phis),
10382 Operands(Main->getNumIncomingValues(),
10383 SmallVector<Value *>(Phis.size(), nullptr)) {}
10384 void buildOperands() {
10385 constexpr unsigned FastLimit = 4;
10386 if (Main->getNumIncomingValues() <= FastLimit) {
10387 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
10388 BasicBlock *InBB = Main->getIncomingBlock(I);
10389 if (!DT.isReachableFromEntry(InBB)) {
10390 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10391 continue;
10392 }
10393 // Prepare the operand vector.
10394 for (auto [Idx, V] : enumerate(Phis)) {
10395 auto *P = dyn_cast<PHINode>(V);
10396 if (!P) {
10398 "Expected isa instruction or poison value.");
10399 Operands[I][Idx] = V;
10400 continue;
10401 }
10402 if (P->getIncomingBlock(I) == InBB)
10403 Operands[I][Idx] = P->getIncomingValue(I);
10404 else
10405 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10406 }
10407 }
10408 return;
10409 }
10410 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10411 Blocks;
10412 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
10413 BasicBlock *InBB = Main->getIncomingBlock(I);
10414 if (!DT.isReachableFromEntry(InBB)) {
10415 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
10416 continue;
10417 }
10418 Blocks.try_emplace(InBB).first->second.push_back(I);
10419 }
10420 for (auto [Idx, V] : enumerate(Phis)) {
10421 if (isa<PoisonValue>(V)) {
10422 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
10423 Operands[I][Idx] = V;
10424 continue;
10425 }
10426 auto *P = cast<PHINode>(V);
10427 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
10428 BasicBlock *InBB = P->getIncomingBlock(I);
10429 if (InBB == Main->getIncomingBlock(I)) {
10430 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
10431 continue;
10432 Operands[I][Idx] = P->getIncomingValue(I);
10433 continue;
10434 }
10435 auto *It = Blocks.find(InBB);
10436 if (It == Blocks.end())
10437 continue;
10438 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10439 }
10440 }
10441 for (const auto &P : Blocks) {
10442 ArrayRef<unsigned> IncomingValues = P.second;
10443 if (IncomingValues.size() <= 1)
10444 continue;
10445 unsigned BasicI = IncomingValues.consume_front();
10446 for (unsigned I : IncomingValues) {
10447 assert(all_of(enumerate(Operands[I]),
10448 [&](const auto &Data) {
10449 return !Data.value() ||
10450 Data.value() == Operands[BasicI][Data.index()];
10451 }) &&
10452 "Expected empty operands list.");
10453 Operands[I] = Operands[BasicI];
10454 }
10455 }
10456 }
10457 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10458};
10459} // namespace
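// --- Illustrative sketch (editor's addition; not part of the upstream pass).
// Typical use of PHIHandler: construct it with the "main" PHI and the whole
// bundle of PHIs, let buildOperands() regroup the incoming values so that
// operand I holds, for every lane, the value flowing in from the I-th
// incoming block of the main PHI, then hand each ArrayRef to an operand
// builder. A hedged usage sketch; ProcessOperandBundle and the helper name
// are hypothetical.
static void
sketchUsePHIHandler(DominatorTree &DT, PHINode *MainPhi, ArrayRef<Value *> VL,
                    function_ref<void(unsigned, ArrayRef<Value *>)>
                        ProcessOperandBundle) {
  PHIHandler Handler(DT, MainPhi, VL);
  Handler.buildOperands();
  for (unsigned I = 0, E = MainPhi->getNumIncomingValues(); I < E; ++I)
    ProcessOperandBundle(I, Handler.getOperands(I));
}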
10460
10461/// Returns the main/alternate instructions for the given \p VL. Unlike
10462/// getSameOpcode, it supports non-compatible instructions for better
10463/// SplitVectorize node support.
10464/// \returns the first main/alt instructions if the list contains only poisons
10465/// and instructions with only 2 opcodes. Returns a pair of nullptrs otherwise.
10466static std::pair<Instruction *, Instruction *>
10468 Instruction *MainOp = nullptr;
10469 Instruction *AltOp = nullptr;
10470 for (Value *V : VL) {
10471 if (isa<PoisonValue>(V))
10472 continue;
10473 auto *I = dyn_cast<Instruction>(V);
10474 if (!I)
10475 return {};
10476 if (!MainOp) {
10477 MainOp = I;
10478 continue;
10479 }
10480 if (MainOp->getOpcode() == I->getOpcode()) {
10481 if (I->getParent() != MainOp->getParent())
10482 return {};
10483 continue;
10484 }
10485 if (!AltOp) {
10486 AltOp = I;
10487 continue;
10488 }
10489 if (AltOp->getOpcode() == I->getOpcode()) {
10490 if (I->getParent() != AltOp->getParent())
10491 return {};
10492 continue;
10493 }
10494 return {};
10495 }
10496 if (!AltOp)
10497 return {};
10498 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10499 "Expected different main and alt instructions.");
10500 return std::make_pair(MainOp, AltOp);
10501}
10502
10503/// Checks that every instruction appears once in the list and if not, packs
10504/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10505/// unique scalars is extended by poison values to the whole register size.
10506///
10507/// \returns false if \p VL could not be uniquified, in which case \p VL is
10508/// unchanged and \p ReuseShuffleIndices is empty.
10510 SmallVectorImpl<int> &ReuseShuffleIndices,
10511 const TargetTransformInfo &TTI,
10512 const TargetLibraryInfo &TLI,
10513 const InstructionsState &S,
10514 const BoUpSLP::EdgeInfo &UserTreeIdx,
10515 bool TryPad = false) {
10516 // Check that every instruction appears once in this bundle.
10517 SmallVector<Value *> UniqueValues;
10518 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10519 for (Value *V : VL) {
10520 if (isConstant(V)) {
10521 // Constants are always considered distinct, even if the same constant
10522 // appears multiple times in VL.
10523 ReuseShuffleIndices.emplace_back(
10524 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
10525 UniqueValues.emplace_back(V);
10526 continue;
10527 }
10528 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
10529 ReuseShuffleIndices.emplace_back(Res.first->second);
10530 if (Res.second)
10531 UniqueValues.emplace_back(V);
10532 }
10533
10534 // Easy case: VL has unique values and a "natural" size
10535 size_t NumUniqueScalarValues = UniqueValues.size();
10536 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10537 TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
10538 if (NumUniqueScalarValues == VL.size() &&
10539 (VectorizeNonPowerOf2 || IsFullVectors)) {
10540 ReuseShuffleIndices.clear();
10541 return true;
10542 }
10543
10544 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10545 if ((UserTreeIdx.UserTE &&
10546 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10548 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10549 "for nodes with padding.\n");
10550 ReuseShuffleIndices.clear();
10551 return false;
10552 }
10553
10554 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10555 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10556 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
10557 return isa<UndefValue>(V) || !isConstant(V);
10558 }))) {
10559 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10560 S.getMainOp()->isSafeToRemove() &&
10561 (S.areInstructionsWithCopyableElements() ||
10562 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
10563 // Find the number of elements, which forms full vectors.
10564 unsigned PWSz = getFullVectorNumberOfElements(
10565 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10566 PWSz = std::min<unsigned>(PWSz, VL.size());
10567 if (PWSz == VL.size()) {
10568 // We ended up with the same size after removing duplicates and
10569 // upgrading the resulting vector size to a "nice size". Just keep
10570 // the initial VL then.
10571 ReuseShuffleIndices.clear();
10572 } else {
10573 // Pad unique values with poison to grow the vector to a "nice" size
10574 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
10575 UniqueValues.end());
10576 PaddedUniqueValues.append(
10577 PWSz - UniqueValues.size(),
10578 PoisonValue::get(UniqueValues.front()->getType()));
10579 // Check that the values extended with poisons/copyable operations are still
10580 // valid for vectorization (div/rem are not allowed).
10581 if ((!S.areInstructionsWithCopyableElements() &&
10582 !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
10583 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
10584 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
10585 isa<CallInst>(S.getMainOp())))) {
10586 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10587 ReuseShuffleIndices.clear();
10588 return false;
10589 }
10590 VL = std::move(PaddedUniqueValues);
10591 }
10592 return true;
10593 }
10594 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
10595 ReuseShuffleIndices.clear();
10596 return false;
10597 }
10598 VL = std::move(UniqueValues);
10599 return true;
10600}
10601
10602bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
10603 const InstructionsState &LocalState,
10604 SmallVectorImpl<Value *> &Op1,
10605 SmallVectorImpl<Value *> &Op2,
10606 OrdersType &ReorderIndices) const {
10607 constexpr unsigned SmallNodeSize = 4;
10608 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10609 !SplitAlternateInstructions)
10610 return false;
10611
10612 // Check if this is a duplicate of another split entry.
10613 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10614 << ".\n");
10615 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10616 if (E->isSame(VL)) {
10617 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
10618 << *LocalState.getMainOp() << ".\n");
10619 return false;
10620 }
10621 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
10622 if (all_of(VL, [&](Value *V) {
10623 return isa<PoisonValue>(V) || Values.contains(V);
10624 })) {
10625 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
10626 return false;
10627 }
10628 }
10629
10630 ReorderIndices.assign(VL.size(), VL.size());
10631 SmallBitVector Op1Indices(VL.size());
10632 for (auto [Idx, V] : enumerate(VL)) {
10633 auto *I = dyn_cast<Instruction>(V);
10634 if (!I) {
10635 Op1.push_back(V);
10636 Op1Indices.set(Idx);
10637 continue;
10638 }
10639 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10640 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
10641 *TLI)) ||
10642 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10643 !isAlternateInstruction(I, LocalState.getMainOp(),
10644 LocalState.getAltOp(), *TLI))) {
10645 Op1.push_back(V);
10646 Op1Indices.set(Idx);
10647 continue;
10648 }
10649 Op2.push_back(V);
10650 }
10651 Type *ScalarTy = getValueType(VL.front());
10652 VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
10653 unsigned Opcode0 = LocalState.getOpcode();
10654 unsigned Opcode1 = LocalState.getAltOpcode();
10655 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10656 // Enable split node only if the scalars do not form a legal alternate
10657 // instruction (like X86 addsub).
10658 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
10659 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
10660 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10661 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10662 !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
10663 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
10664 return false;
10665 // Enable split node, only if all nodes are power-of-2/full registers.
10666 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10667 for (unsigned Idx : seq<unsigned>(VL.size())) {
10668 if (Op1Indices.test(Idx)) {
10669 ReorderIndices[Op1Cnt] = Idx;
10670 ++Op1Cnt;
10671 } else {
10672 ReorderIndices[Op2Cnt] = Idx;
10673 ++Op2Cnt;
10674 }
10675 }
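// Illustration (hypothetical layout, not from the source): if the main-opcode
// scalars sit at indices {0, 2} and the alternate-opcode scalars at {1, 3},
// then Op1 = {VL[0], VL[2]}, Op2 = {VL[1], VL[3]} and ReorderIndices becomes
// {0, 2, 1, 3}, i.e. all Op1 lanes first, followed by all Op2 lanes.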
10676 if (isIdentityOrder(ReorderIndices))
10677 ReorderIndices.clear();
10678 SmallVector<int> Mask;
10679 if (!ReorderIndices.empty())
10680 inversePermutation(ReorderIndices, Mask);
10681 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10682 VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
10683 VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
10684 // Check for non-profitable single-register ops, which are better represented
10685 // as alternate ops.
10686 if (NumParts >= VL.size())
10687 return false;
10688 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10689 InstructionCost InsertCost = ::getShuffleCost(
10690 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
10691 FixedVectorType *SubVecTy =
10692 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
10693 InstructionCost NewShuffleCost =
10694 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
10695 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10696 (Mask.empty() || InsertCost >= NewShuffleCost))
10697 return false;
10698 if ((LocalState.getMainOp()->isBinaryOp() &&
10699 LocalState.getAltOp()->isBinaryOp() &&
10700 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10701 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10702 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10703 (LocalState.getMainOp()->isUnaryOp() &&
10704 LocalState.getAltOp()->isUnaryOp())) {
10705 InstructionCost OriginalVecOpsCost =
10706 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10707 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10708 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
10709 for (unsigned Idx : seq<unsigned>(VL.size())) {
10710 if (isa<PoisonValue>(VL[Idx]))
10711 continue;
10712 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10713 }
10714 InstructionCost OriginalCost =
10715 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
10716 VecTy, OriginalMask, Kind);
10717 InstructionCost NewVecOpsCost =
10718 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10719 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10720 InstructionCost NewCost =
10721 NewVecOpsCost + InsertCost +
10722 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10723 VectorizableTree.front()->getOpcode() == Instruction::Store
10724 ? NewShuffleCost
10725 : 0);
10726 // If not profitable to split - exit.
10727 if (NewCost >= OriginalCost)
10728 return false;
10729 }
10730 return true;
10731}
10732
10733namespace {
10734 /// Class accepts incoming list of values, checks if it is able to model
10735 /// "copyable" values as compatible operations, and generates the list of values
10736 /// for scheduling and the list of operands for the new nodes.
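/// Illustration (hypothetical, not from the source): for VL = {add %x, %y; %z}
/// where %z is not an add, %z may be treated as a "copyable" element and
/// modeled as "add %z, 0", so the whole bundle can still be handled with a
/// single Add main opcode.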
10737class InstructionsCompatibilityAnalysis {
10738 DominatorTree &DT;
10739 const DataLayout &DL;
10740 const TargetTransformInfo &TTI;
10741 const TargetLibraryInfo &TLI;
10742 unsigned MainOpcode = 0;
10743 Instruction *MainOp = nullptr;
10744
10745 /// Checks if the opcode is supported as the main opcode for copyable
10746 /// elements.
10747 static bool isSupportedOpcode(const unsigned Opcode) {
10748 return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
10749 Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
10750 Opcode == Instruction::UDiv || Opcode == Instruction::And ||
10751 Opcode == Instruction::Or || Opcode == Instruction::Xor;
10752 }
10753
10754 /// Identifies the best candidate value, which represents the main opcode
10755 /// operation.
10756 /// Currently the best candidate is the Add instruction whose parent block has
10757 /// the highest DFS incoming number (the block latest in the dominator tree DFS).
10758 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
10759 BasicBlock *Parent = nullptr;
10760 // Checks if the instruction has supported opcode.
10761 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
10762 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
10763 return false;
10764 return I && isSupportedOpcode(I->getOpcode()) &&
10765 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
10766 };
10767 // Exclude operand instructions immediately to improve compile time; they
10768 // cannot be scheduled anyway.
10769 SmallDenseSet<Value *, 8> Operands;
10770 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10771 bool AnyUndef = false;
10772 for (Value *V : VL) {
10773 auto *I = dyn_cast<Instruction>(V);
10774 if (!I) {
10775 AnyUndef |= isa<UndefValue>(V);
10776 continue;
10777 }
10778 if (!DT.isReachableFromEntry(I->getParent()))
10779 continue;
10780 if (Candidates.empty()) {
10781 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10782 Parent = I->getParent();
10783 Operands.insert(I->op_begin(), I->op_end());
10784 continue;
10785 }
10786 if (Parent == I->getParent()) {
10787 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10788 Operands.insert(I->op_begin(), I->op_end());
10789 continue;
10790 }
10791 auto *NodeA = DT.getNode(Parent);
10792 auto *NodeB = DT.getNode(I->getParent());
10793 assert(NodeA && "Should only process reachable instructions");
10794 assert(NodeB && "Should only process reachable instructions");
10795 assert((NodeA == NodeB) ==
10796 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10797 "Different nodes should have different DFS numbers");
10798 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10799 Candidates.clear();
10800 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10801 Parent = I->getParent();
10802 Operands.clear();
10803 Operands.insert(I->op_begin(), I->op_end());
10804 }
10805 }
10806 unsigned BestOpcodeNum = 0;
10807 MainOp = nullptr;
10808 for (const auto &P : Candidates) {
10809 if (P.second.size() < BestOpcodeNum)
10810 continue;
10811 for (Instruction *I : P.second) {
10812 if (IsSupportedInstruction(I, AnyUndef) && !Operands.contains(I)) {
10813 MainOp = I;
10814 BestOpcodeNum = P.second.size();
10815 break;
10816 }
10817 }
10818 }
10819 if (MainOp) {
10820 // Do not match, if any copyable is a terminator from the same block as
10821 // the main operation.
10822 if (any_of(VL, [&](Value *V) {
10823 auto *I = dyn_cast<Instruction>(V);
10824 return I && I->getParent() == MainOp->getParent() &&
10825 I->isTerminator();
10826 })) {
10827 MainOp = nullptr;
10828 return;
10829 }
10830 MainOpcode = MainOp->getOpcode();
10831 }
10832 }
10833
10834 /// Returns the idempotent value for the \p MainOp with the detected \p
10835 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
10836 /// the operand itself, since V or V == V.
10837 Value *selectBestIdempotentValue() const {
10838 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10839 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
10840 !MainOp->isCommutative());
10841 }
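// For illustration (assumptions about getBinOpIdentity's results): a copyable
// element V under an Add main opcode is modeled as "add V, 0", under And as
// "and V, -1", and under Shl/LShr as a shift by 0, so the original value is
// preserved.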
10842
10843 /// Returns the value and operands for \p V, considering whether it is an
10844 /// original instruction, whose actual operands should be returned, or a
10845 /// copyable element, which should be represented as an idempotent instruction.
10846 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
10847 if (isa<PoisonValue>(V))
10848 return {V, V};
10849 if (!S.isCopyableElement(V))
10850 return convertTo(cast<Instruction>(V), S).second;
10851 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10852 return {V, selectBestIdempotentValue()};
10853 }
10854
10855 /// Builds operands for the original instructions.
10856 void
10857 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
10858 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10859
10860 unsigned ShuffleOrOp =
10861 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10862 Instruction *VL0 = S.getMainOp();
10863
10864 switch (ShuffleOrOp) {
10865 case Instruction::PHI: {
10866 auto *PH = cast<PHINode>(VL0);
10867
10868 // Keeps the reordered operands to avoid code duplication.
10869 PHIHandler Handler(DT, PH, VL);
10870 Handler.buildOperands();
10871 Operands.assign(PH->getNumOperands(), {});
10872 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
10873 Operands[I].assign(Handler.getOperands(I).begin(),
10874 Handler.getOperands(I).end());
10875 return;
10876 }
10877 case Instruction::ExtractValue:
10878 case Instruction::ExtractElement:
10879 // This is a special case, as it does not gather, but at the same time
10880 // we are not extending buildTreeRec() towards the operands.
10881 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
10882 return;
10883 case Instruction::InsertElement:
10884 Operands.assign(2, {VL.size(), nullptr});
10885 for (auto [Idx, V] : enumerate(VL)) {
10886 auto *IE = cast<InsertElementInst>(V);
10887 for (auto [OpIdx, Ops] : enumerate(Operands))
10888 Ops[Idx] = IE->getOperand(OpIdx);
10889 }
10890 return;
10891 case Instruction::Load:
10892 Operands.assign(
10893 1, {VL.size(),
10894 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
10895 for (auto [V, Op] : zip(VL, Operands.back())) {
10896 auto *LI = dyn_cast<LoadInst>(V);
10897 if (!LI)
10898 continue;
10899 Op = LI->getPointerOperand();
10900 }
10901 return;
10902 case Instruction::ZExt:
10903 case Instruction::SExt:
10904 case Instruction::FPToUI:
10905 case Instruction::FPToSI:
10906 case Instruction::FPExt:
10907 case Instruction::PtrToInt:
10908 case Instruction::IntToPtr:
10909 case Instruction::SIToFP:
10910 case Instruction::UIToFP:
10911 case Instruction::Trunc:
10912 case Instruction::FPTrunc:
10913 case Instruction::BitCast:
10914 case Instruction::ICmp:
10915 case Instruction::FCmp:
10916 case Instruction::Select:
10917 case Instruction::FNeg:
10918 case Instruction::Add:
10919 case Instruction::FAdd:
10920 case Instruction::Sub:
10921 case Instruction::FSub:
10922 case Instruction::Mul:
10923 case Instruction::FMul:
10924 case Instruction::UDiv:
10925 case Instruction::SDiv:
10926 case Instruction::FDiv:
10927 case Instruction::URem:
10928 case Instruction::SRem:
10929 case Instruction::FRem:
10930 case Instruction::Shl:
10931 case Instruction::LShr:
10932 case Instruction::AShr:
10933 case Instruction::And:
10934 case Instruction::Or:
10935 case Instruction::Xor:
10936 case Instruction::Freeze:
10937 case Instruction::Store:
10938 case Instruction::ShuffleVector:
10939 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
10940 for (auto [Idx, V] : enumerate(VL)) {
10941 auto *I = dyn_cast<Instruction>(V);
10942 if (!I) {
10943 for (auto [OpIdx, Ops] : enumerate(Operands))
10944 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
10945 continue;
10946 }
10947 auto [Op, ConvertedOps] = convertTo(I, S);
10948 for (auto [OpIdx, Ops] : enumerate(Operands))
10949 Ops[Idx] = ConvertedOps[OpIdx];
10950 }
10951 return;
10952 case Instruction::GetElementPtr: {
10953 Operands.assign(2, {VL.size(), nullptr});
10954 // Need to cast all indices to the same type before vectorization to
10955 // avoid crash.
10956 // Required to be able to find correct matches between different gather
10957 // nodes and reuse the vectorized values rather than trying to gather them
10958 // again.
10959 const unsigned IndexIdx = 1;
10960 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
10961 Type *Ty =
10962 all_of(VL,
10963 [&](Value *V) {
10964 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10965 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
10966 })
10967 ? VL0Ty
10968 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
10969 ->getPointerOperandType()
10970 ->getScalarType());
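// Illustration (hypothetical): if one GEP indexes with i32 and another with
// i64, a common index type is chosen from the data layout and the constant
// indices are folded/cast to it, so the index operands can form one vector.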
10971 for (auto [Idx, V] : enumerate(VL)) {
10972 auto *GEP = dyn_cast<GetElementPtrInst>(V);
10973 if (!GEP) {
10974 Operands[0][Idx] = V;
10975 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10976 continue;
10977 }
10978 Operands[0][Idx] = GEP->getPointerOperand();
10979 auto *Op = GEP->getOperand(IndexIdx);
10980 auto *CI = dyn_cast<ConstantInt>(Op);
10981 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
10982 CI, Ty, CI->getValue().isSignBitSet(), DL)
10983 : Op;
10984 }
10985 return;
10986 }
10987 case Instruction::Call: {
10988 auto *CI = cast<CallInst>(VL0);
10989 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
10990 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
10991 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
10992 continue;
10993 auto &Ops = Operands.emplace_back();
10994 for (Value *V : VL) {
10995 auto *I = dyn_cast<Instruction>(V);
10996 Ops.push_back(I ? I->getOperand(Idx)
10997 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
10998 }
10999 }
11000 return;
11001 }
11002 default:
11003 break;
11004 }
11005 llvm_unreachable("Unexpected vectorization of the instructions.");
11006 }
11007
11008public:
11009 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11010 const TargetTransformInfo &TTI,
11011 const TargetLibraryInfo &TLI)
11012 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11013
11014 InstructionsState
11015 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11016 bool TryCopyableElementsVectorization,
11017 bool WithProfitabilityCheck = false,
11018 bool SkipSameCodeCheck = false) {
11019 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11020 ? InstructionsState::invalid()
11021 : getSameOpcode(VL, TLI);
11022 if (S)
11023 return S;
11024 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11025 return S;
11026 findAndSetMainInstruction(VL, R);
11027 if (!MainOp)
11028 return InstructionsState::invalid();
11029 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11030 if (!WithProfitabilityCheck)
11031 return S;
11032 // Check if it is profitable to vectorize the instruction.
11033 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11034 auto BuildCandidates =
11035 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11036 Value *V2) {
11037 if (V1 != V2 && isa<PHINode>(V1))
11038 return;
11039 auto *I1 = dyn_cast<Instruction>(V1);
11040 auto *I2 = dyn_cast<Instruction>(V2);
11041 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11042 I1->getParent() != I2->getParent())
11043 return;
11044 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
11045 };
11046 if (VL.size() == 2) {
11047 // Check if the operands allow better vectorization.
11048 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11049 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11050 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11051 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11052 R.findBestRootPair(Candidates1) &&
11053 R.findBestRootPair(Candidates2);
11054 if (!Res && isCommutative(MainOp)) {
11055 Candidates1.clear();
11056 Candidates2.clear();
11057 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11058 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11059 Res = !Candidates1.empty() && !Candidates2.empty() &&
11060 R.findBestRootPair(Candidates1) &&
11061 R.findBestRootPair(Candidates2);
11062 }
11063 if (!Res)
11064 return InstructionsState::invalid();
11065 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11066 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
11067 InstructionCost VectorCost;
11068 FixedVectorType *VecTy =
11069 getWidenedType(S.getMainOp()->getType(), VL.size());
11070 switch (MainOpcode) {
11071 case Instruction::Add:
11072 case Instruction::LShr:
11073 case Instruction::Shl:
11074 case Instruction::SDiv:
11075 case Instruction::UDiv:
11076 case Instruction::And:
11077 case Instruction::Or:
11078 case Instruction::Xor:
11079 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
11080 break;
11081 default:
11082 llvm_unreachable("Unexpected instruction.");
11083 }
11084 if (VectorCost > ScalarCost)
11085 return InstructionsState::invalid();
11086 return S;
11087 }
11088 assert(Operands.size() == 2 && "Unexpected number of operands!");
11089 unsigned CopyableNum =
11090 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11091 if (CopyableNum < VL.size() / 2)
11092 return S;
11093 // Too many phi copyables - exit.
11094 const unsigned Limit = VL.size() / 24;
11095 if ((CopyableNum >= VL.size() - Limit ||
11096 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11097 CopyableNum >= MaxPHINumOperands) &&
11098 all_of(VL, [&](Value *V) {
11099 return isa<PHINode>(V) || !S.isCopyableElement(V);
11100 }))
11101 return InstructionsState::invalid();
11102 // Check profitability if number of copyables > VL.size() / 2.
11103 // 1. Reorder operands for better matching.
11104 if (isCommutative(MainOp)) {
11105 for (auto &Ops : Operands) {
11106 // Make instructions the first operands.
11107 if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
11108 std::swap(Ops.front(), Ops.back());
11109 continue;
11110 }
11111 // Make constants the second operands.
11112 if (isa<Constant>(Ops.front())) {
11113 std::swap(Ops.front(), Ops.back());
11114 continue;
11115 }
11116 }
11117 }
11118 // 2. Check, if operands can be vectorized.
11119 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
11120 return InstructionsState::invalid();
11121 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11122 if (allConstant(Ops) || isSplat(Ops))
11123 return true;
11124 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
11125 // single one is different.
11126 constexpr unsigned Limit = 4;
11127 if (Operands.front().size() >= Limit) {
11128 SmallDenseMap<const Value *, unsigned> Counters;
11129 for (Value *V : Ops) {
11130 if (isa<UndefValue>(V))
11131 continue;
11132 ++Counters[V];
11133 }
11134 if (Counters.size() == 2 &&
11135 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
11136 return C.second == 1;
11137 }))
11138 return true;
11139 }
11140 // First operand not a constant or splat? Last attempt - check for
11141 // potential vectorization.
11142 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11143 InstructionsState OpS = Analysis.buildInstructionsState(
11144 Ops, R, /*TryCopyableElementsVectorization=*/true);
11145 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11146 return false;
11147 unsigned CopyableNum =
11148 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
11149 return CopyableNum <= VL.size() / 2;
11150 };
11151 if (!CheckOperand(Operands.front()))
11152 return InstructionsState::invalid();
11153
11154 return S;
11155 }
11156
11157 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11158 ArrayRef<Value *> VL) {
11159 assert(S && "Invalid state!");
11160 SmallVector<BoUpSLP::ValueList> Operands;
11161 if (S.areInstructionsWithCopyableElements()) {
11162 MainOp = S.getMainOp();
11163 MainOpcode = S.getOpcode();
11164 Operands.assign(MainOp->getNumOperands(),
11165 BoUpSLP::ValueList(VL.size(), nullptr));
11166 for (auto [Idx, V] : enumerate(VL)) {
11167 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11168 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11169 Operands[OperandIdx][Idx] = Operand;
11170 }
11171 } else {
11172 buildOriginalOperands(S, VL, Operands);
11173 }
11174 return Operands;
11175 }
11176};
11177} // namespace
11178
11179BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11180 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11181 bool TryCopyableElementsVectorization) const {
11182 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11183
11184 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11185 InstructionsState S = Analysis.buildInstructionsState(
11186 VL, *this, TryCopyableElementsVectorization,
11187 /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
11188
11189 // Don't go into catchswitch blocks, which can happen with PHIs.
11190 // Such blocks can only have PHIs and the catchswitch. There is no
11191 // place to insert a shuffle if we need to, so just avoid that issue.
11192 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
11193 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11194 // Do not try to pack to avoid extra instructions here.
11195 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11196 /*TryToFindDuplicates=*/false);
11197 }
11198
11199 // Check if this is a duplicate of another entry.
11200 if (S) {
11201 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11202 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11203 if (E->isSame(VL)) {
11204 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11205 << ".\n");
11206 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11207 }
11208 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11209 if (all_of(VL, [&](Value *V) {
11210 return isa<PoisonValue>(V) || Values.contains(V) ||
11211 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11212 LI->getLoopFor(S.getMainOp()->getParent()) &&
11213 isVectorized(V));
11214 })) {
11215 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11216 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11217 }
11218 }
11219 }
11220
11221 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11222 // a load), in which case peek through to include it in the tree, without
11223 // ballooning over-budget.
11224 if (Depth >= RecursionMaxDepth &&
11225 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11226 (match(S.getMainOp(), m_Load(m_Value())) ||
11227 all_of(VL, [&S](const Value *I) {
11228 return match(I,
11229 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
11230 cast<Instruction>(I)->getOpcode() == S.getOpcode();
11231 })))) {
11232 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11233 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11234 }
11235
11236 // Don't handle scalable vectors
11237 if (S && S.getOpcode() == Instruction::ExtractElement &&
11238 isa<ScalableVectorType>(
11239 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
11240 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11241 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11242 }
11243
11244 // Don't handle vectors.
11245 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
11246 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11247 // Do not try to pack to avoid extra instructions here.
11248 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11249 /*TryToFindDuplicates=*/false);
11250 }
11251
11252 // If all of the operands are identical or constant we have a simple solution.
11253 // If we deal with insert/extract instructions, they all must have constant
11254 // indices, otherwise we should gather them, not try to vectorize.
11255 // If alternate op node with 2 elements with gathered operands - do not
11256 // vectorize.
11257 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11258 if (!S || !S.isAltShuffle() || VL.size() > 2)
11259 return false;
11260 if (VectorizableTree.size() < MinTreeSize)
11261 return false;
11262 if (Depth >= RecursionMaxDepth - 1)
11263 return true;
11264 // Check if all operands are extracts, part of vector node or can build a
11265 // regular vectorize node.
11266 SmallVector<unsigned, 8> InstsCount;
11267 for (Value *V : VL) {
11268 auto *I = cast<Instruction>(V);
11269 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
11270 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11271 }));
11272 }
11273 bool IsCommutative =
11274 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
11275 if ((IsCommutative &&
11276 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11277 (!IsCommutative &&
11278 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11279 return true;
11280 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11281 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11282 auto *I1 = cast<Instruction>(VL.front());
11283 auto *I2 = cast<Instruction>(VL.back());
11284 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11285 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11286 I2->getOperand(Op));
11287 if (static_cast<unsigned>(count_if(
11288 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11289 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11290 })) >= S.getMainOp()->getNumOperands() / 2)
11291 return false;
11292 if (S.getMainOp()->getNumOperands() > 2)
11293 return true;
11294 if (IsCommutative) {
11295 // Check permuted operands.
11296 Candidates.clear();
11297 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11298 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
11299 I2->getOperand((Op + 1) % E));
11300 if (any_of(
11301 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11302 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
11303 }))
11304 return false;
11305 }
11306 return true;
11307 };
11308 SmallVector<unsigned> SortedIndices;
11309 BasicBlock *BB = nullptr;
11310 bool IsScatterVectorizeUserTE =
11311 UserTreeIdx.UserTE &&
11312 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11313 bool AreAllSameBlock = S.valid();
11314 bool AreScatterAllGEPSameBlock =
11315 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11316 VL.size() > 2 &&
11317 all_of(VL,
11318 [&BB](Value *V) {
11319 auto *I = dyn_cast<GetElementPtrInst>(V);
11320 if (!I)
11321 return doesNotNeedToBeScheduled(V);
11322 if (!BB)
11323 BB = I->getParent();
11324 return BB == I->getParent() && I->getNumOperands() == 2;
11325 }) &&
11326 BB &&
11327 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11328 SortedIndices));
11329 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11330 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
11331 (S &&
11332 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11333 S.getMainOp()) &&
11334 !all_of(VL, isVectorLikeInstWithConstOps)) ||
11335 NotProfitableForVectorization(VL)) {
11336 if (!S) {
11337 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11338 "C,S,B,O, small shuffle. \n";
11339 dbgs() << "[";
11340 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11341 dbgs() << "]\n");
11342 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11343 /*TryToFindDuplicates=*/true,
11344 /*TrySplitVectorize=*/true);
11345 }
11346 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11347 dbgs() << "[";
11348 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11349 dbgs() << "]\n");
11350 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11351 }
11352
11353 // Don't vectorize ephemeral values.
11354 if (S && !EphValues.empty()) {
11355 for (Value *V : VL) {
11356 if (EphValues.count(V)) {
11357 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11358 << ") is ephemeral.\n");
11359 // Do not try to pack to avoid extra instructions here.
11360 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11361 /*TryToFindDuplicates=*/false);
11362 }
11363 }
11364 }
11365
11366 // We now know that this is a vector of instructions of the same type from
11367 // the same block.
11368
11369 // Check that none of the instructions in the bundle are already in the tree
11370 // and the node may be not profitable for the vectorization as the small
11371 // alternate node.
11372 if (S && S.isAltShuffle()) {
11373 auto GetNumVectorizedExtracted = [&]() {
11374 APInt Extracted = APInt::getZero(VL.size());
11375 APInt Vectorized = APInt::getAllOnes(VL.size());
11376 for (auto [Idx, V] : enumerate(VL)) {
11377 auto *I = dyn_cast<Instruction>(V);
11378 if (!I || doesNotNeedToBeScheduled(I) ||
11379 all_of(I->operands(), [&](const Use &U) {
11380 return isa<ExtractElementInst>(U.get());
11381 }))
11382 continue;
11383 if (isVectorized(I))
11384 Vectorized.clearBit(Idx);
11385 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11386 Extracted.setBit(Idx);
11387 }
11388 return std::make_pair(Vectorized, Extracted);
11389 };
11390 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11391 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11392 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11393 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11394 // Rough cost estimation, if the vector code (+ potential extracts) is
11395 // more profitable than the scalar + buildvector.
11396 Type *ScalarTy = VL.front()->getType();
11397 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11398 InstructionCost VectorizeCostEstimate =
11399 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
11400 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
11401 /*Insert=*/false, /*Extract=*/true, Kind);
11402 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11403 *TTI, ScalarTy, VecTy, Vectorized,
11404 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
11405 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11406 }
11407 if (PreferScalarize) {
11408 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11409 "node is not profitable.\n");
11410 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11411 }
11412 }
11413
11414 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11415 if (UserIgnoreList && !UserIgnoreList->empty()) {
11416 for (Value *V : VL) {
11417 if (UserIgnoreList->contains(V)) {
11418 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11419 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11420 }
11421 }
11422 }
11423
11424 // Special processing for sorted pointers for ScatterVectorize node with
11425 // constant indices only.
11426 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11427 assert(VL.front()->getType()->isPointerTy() &&
11429 "Expected pointers only.");
11430 // Reset S to make it GetElementPtr kind of node.
11431 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11432 assert(It != VL.end() && "Expected at least one GEP.");
11433 S = getSameOpcode(*It, *TLI);
11434 }
11435
11436 // Check that all of the users of the scalars that we want to vectorize are
11437 // schedulable.
11438 Instruction *VL0 = S.getMainOp();
11439 BB = VL0->getParent();
11440
11441 if (S &&
11442 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
11443 !DT->isReachableFromEntry(BB))) {
11444 // Don't go into unreachable blocks. They may contain instructions with
11445 // dependency cycles which confuse the final scheduling.
11446 // Do not vectorize EH and non-returning blocks, not profitable in most
11447 // cases.
11448 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11449 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11450 }
11451 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11452}
11453
11454void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11455 const EdgeInfo &UserTreeIdx,
11456 unsigned InterleaveFactor) {
11457 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11458
11459 SmallVector<int> ReuseShuffleIndices;
11460 SmallVector<Value *> VL(VLRef);
11461
11462 // Tries to build split node.
11463 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11464 SmallVector<Value *> Op1, Op2;
11465 OrdersType ReorderIndices;
11466 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11467 return false;
11468
11469 auto Invalid = ScheduleBundle::invalid();
11470 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11471 UserTreeIdx, {}, ReorderIndices);
11472 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11473 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11474 InstructionsState S = getSameOpcode(Op, *TLI);
11475 if (S && (isa<LoadInst>(S.getMainOp()) ||
11476 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
11477 // Build gather node for loads, they will be gathered later.
11478 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11479 Idx == 0 ? 0 : Op1.size());
11480 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11481 } else {
11482 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11483 Idx == 0 ? 0 : Op1.size());
11484 buildTreeRec(Op, Depth, {TE, Idx});
11485 }
11486 };
11487 AddNode(Op1, 0);
11488 AddNode(Op2, 1);
11489 return true;
11490 };
11491
11492 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11493 bool AreConsts = false;
11494 for (Value *V : VL) {
11495 if (isa<PoisonValue>(V))
11496 continue;
11497 if (isa<Constant>(V)) {
11498 AreConsts = true;
11499 continue;
11500 }
11501 if (!isa<PHINode>(V))
11502 return false;
11503 }
11504 return AreConsts;
11505 };
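// Example (illustrative): VL = {phi %p, 7, phi %q, poison} contains only PHIs,
// constants and poisons, so it is gathered here instead of being vectorized.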
11506 if (AreOnlyConstsWithPHIs(VL)) {
11507 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11508 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11509 return;
11510 }
11511
11512 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11513 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11514 InstructionsState S = Legality.getInstructionsState();
11515 if (!Legality.isLegal()) {
11516 if (Legality.trySplitVectorize()) {
11517 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11518 // Last chance to try to vectorize alternate node.
11519 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11520 return;
11521 }
11522 if (!S)
11523 Legality = getScalarsVectorizationLegality(
11524 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11525 if (!Legality.isLegal()) {
11526 if (Legality.tryToFindDuplicates())
11527 tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
11528 UserTreeIdx);
11529
11530 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11531 return;
11532 }
11533 S = Legality.getInstructionsState();
11534 }
11535
11536 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11537 if (S.isAltShuffle() && TrySplitNode(S))
11538 return;
11539
11540 // Check that every instruction appears once in this bundle.
11541 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
11542 /*TryPad=*/true)) {
11543 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11544 return;
11545 }
11546
11547 // Perform specific checks for each particular instruction kind.
11548 bool IsScatterVectorizeUserTE =
11549 UserTreeIdx.UserTE &&
11550 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11551 OrdersType CurrentOrder;
11552 SmallVector<Value *> PointerOps;
11553 StridedPtrInfo SPtrInfo;
11554 TreeEntry::EntryState State = getScalarsVectorizationState(
11555 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11556 if (State == TreeEntry::NeedToGather) {
11557 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11558 return;
11559 }
11560
11561 Instruction *VL0 = S.getMainOp();
11562 BasicBlock *BB = VL0->getParent();
11563 auto &BSRef = BlocksSchedules[BB];
11564 if (!BSRef)
11565 BSRef = std::make_unique<BlockScheduling>(BB);
11566
11567 BlockScheduling &BS = *BSRef;
11568
11569 SetVector<Value *> UniqueValues(llvm::from_range, VL);
11570 std::optional<ScheduleBundle *> BundlePtr =
11571 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11572#ifdef EXPENSIVE_CHECKS
11573 // Make sure we didn't break any internal invariants
11574 BS.verify();
11575#endif
11576 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11577 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
11578 // Last chance to try to vectorize alternate node.
11579 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
11580 return;
11581 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11582 NonScheduledFirst.insert(VL.front());
11583 if (S.getOpcode() == Instruction::Load &&
11584 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11585 registerNonVectorizableLoads(VL);
11586 return;
11587 }
11588 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11589 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
11590 ScheduleBundle Empty;
11591 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
11592 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
11593
11594 unsigned ShuffleOrOp =
11595 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11596 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
11597 // Postpone PHI nodes creation
11598 SmallVector<unsigned> PHIOps;
11599 for (unsigned I : seq<unsigned>(Operands.size())) {
11600 ArrayRef<Value *> Op = Operands[I];
11601 if (Op.empty())
11602 continue;
11603 InstructionsState S = getSameOpcode(Op, *TLI);
11604 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11605 buildTreeRec(Op, Depth + 1, {TE, I});
11606 else
11607 PHIOps.push_back(I);
11608 }
11609 for (unsigned I : PHIOps)
11610 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11611 };
11612 switch (ShuffleOrOp) {
11613 case Instruction::PHI: {
11614 TreeEntry *TE =
11615 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11616 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
11617 TE->dump());
11618
11619 TE->setOperands(Operands);
11620 CreateOperandNodes(TE, Operands);
11621 return;
11622 }
11623 case Instruction::ExtractValue:
11624 case Instruction::ExtractElement: {
11625 if (CurrentOrder.empty()) {
11626 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
11627 } else {
11628 LLVM_DEBUG({
11629 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
11630 "with order";
11631 for (unsigned Idx : CurrentOrder)
11632 dbgs() << " " << Idx;
11633 dbgs() << "\n";
11634 });
11635 fixupOrderingIndices(CurrentOrder);
11636 }
11637 // Insert new order with initial value 0, if it does not exist,
11638 // otherwise return the iterator to the existing one.
11639 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11640 ReuseShuffleIndices, CurrentOrder);
11641 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
11642 "(ExtractValueInst/ExtractElementInst).\n";
11643 TE->dump());
11644 // This is a special case, as it does not gather, but at the same time
11645 // we are not extending buildTreeRec() towards the operands.
11646 TE->setOperands(Operands);
11647 return;
11648 }
11649 case Instruction::InsertElement: {
11650 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
11651
11652 auto OrdCompare = [](const std::pair<int, int> &P1,
11653 const std::pair<int, int> &P2) {
11654 return P1.first > P2.first;
11655 };
11656 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
11657 decltype(OrdCompare)>
11658 Indices(OrdCompare);
11659 for (int I = 0, E = VL.size(); I < E; ++I) {
11660 unsigned Idx = *getElementIndex(VL[I]);
11661 Indices.emplace(Idx, I);
11662 }
11663 OrdersType CurrentOrder(VL.size(), VL.size());
11664 bool IsIdentity = true;
11665 for (int I = 0, E = VL.size(); I < E; ++I) {
11666 CurrentOrder[Indices.top().second] = I;
11667 IsIdentity &= Indices.top().second == I;
11668 Indices.pop();
11669 }
11670 if (IsIdentity)
11671 CurrentOrder.clear();
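// Illustration (hypothetical lanes, not from the source): for inserts into
// lanes {5, 3, 4, 6}, taken in VL order, CurrentOrder becomes {2, 0, 1, 3};
// an identity order would be cleared so that no reordering is recorded.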
11672 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11673 {}, CurrentOrder);
11674 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
11675 TE->dump());
11676
11677 TE->setOperands(Operands);
11678 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
11679 return;
11680 }
11681 case Instruction::Load: {
11682 // Check that a vectorized load would load the same memory as a scalar
11683 // load. For example, we don't want to vectorize loads that are smaller
11684 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
11685 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
11686 // from such a struct, we read/write packed bits disagreeing with the
11687 // unvectorized version.
11688 TreeEntry *TE = nullptr;
11689 fixupOrderingIndices(CurrentOrder);
11690 switch (State) {
11691 case TreeEntry::Vectorize:
11692 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11693 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11694 if (CurrentOrder.empty())
11695 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
11696 TE->dump());
11697 else
11699 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
11700 TE->dump());
11701 break;
11702 case TreeEntry::CompressVectorize:
11703 // Vectorizing non-consecutive loads with (masked)load + compress.
11704 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11705 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11706 LLVM_DEBUG(
11707 dbgs()
11708 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11709 TE->dump());
11710 break;
11711 case TreeEntry::StridedVectorize:
11712 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11713 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11714 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11715 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
11716 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
11717 TE->dump());
11718 break;
11719 case TreeEntry::ScatterVectorize:
11720 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
11721 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11722 UserTreeIdx, ReuseShuffleIndices);
11723 LLVM_DEBUG(
11724 dbgs()
11725 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11726 TE->dump());
11727 break;
11728 case TreeEntry::CombinedVectorize:
11729 case TreeEntry::SplitVectorize:
11730 case TreeEntry::NeedToGather:
11731 llvm_unreachable("Unexpected loads state.");
11732 }
11733 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11734 assert(Operands.size() == 1 && "Expected a single operand only");
11735 SmallVector<int> Mask;
11736 inversePermutation(CurrentOrder, Mask);
11737 reorderScalars(Operands.front(), Mask);
11738 }
11739 TE->setOperands(Operands);
11740 if (State == TreeEntry::ScatterVectorize)
11741 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
11742 return;
11743 }
11744 case Instruction::ZExt:
11745 case Instruction::SExt:
11746 case Instruction::FPToUI:
11747 case Instruction::FPToSI:
11748 case Instruction::FPExt:
11749 case Instruction::PtrToInt:
11750 case Instruction::IntToPtr:
11751 case Instruction::SIToFP:
11752 case Instruction::UIToFP:
11753 case Instruction::Trunc:
11754 case Instruction::FPTrunc:
11755 case Instruction::BitCast: {
11756 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11757 std::make_pair(std::numeric_limits<unsigned>::min(),
11758 std::numeric_limits<unsigned>::max()));
11759 if (ShuffleOrOp == Instruction::ZExt ||
11760 ShuffleOrOp == Instruction::SExt) {
11761 CastMaxMinBWSizes = std::make_pair(
11762 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11763 PrevMaxBW),
11764 std::min<unsigned>(
11765 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11766 PrevMinBW));
11767 } else if (ShuffleOrOp == Instruction::Trunc) {
11768 CastMaxMinBWSizes = std::make_pair(
11769 std::max<unsigned>(
11770 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
11771 PrevMaxBW),
11772 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
11773 PrevMinBW));
11774 }
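// Illustration (hypothetical types): a zext from i8 to i32 records
// {max(32, PrevMaxBW), min(8, PrevMinBW)}; for a trunc the source width feeds
// the max and the destination width feeds the min.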
11775 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11776 ReuseShuffleIndices);
11777 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
11778 TE->dump());
11779
11780 TE->setOperands(Operands);
11781 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11782 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11783 if (ShuffleOrOp == Instruction::Trunc) {
11784 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11785 } else if (ShuffleOrOp == Instruction::SIToFP ||
11786 ShuffleOrOp == Instruction::UIToFP) {
11787 unsigned NumSignBits =
11788 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11789 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
11790 APInt Mask = DB->getDemandedBits(OpI);
11791 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
11792 }
11793 if (NumSignBits * 2 >=
11794 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11795 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11796 }
11797 return;
11798 }
11799 case Instruction::ICmp:
11800 case Instruction::FCmp: {
11801 // Check that all of the compares have the same predicate.
11802 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
11803 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11804 ReuseShuffleIndices);
11805 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
11806 TE->dump());
11807
11808 VLOperands Ops(VL, Operands, S, *this);
11809 if (cast<CmpInst>(VL0)->isCommutative()) {
11810 // Commutative predicate - collect + sort operands of the instructions
11811 // so that each side is more likely to have the same opcode.
11813 "Commutative Predicate mismatch");
11814 Ops.reorder();
11815 Operands.front() = Ops.getVL(0);
11816 Operands.back() = Ops.getVL(1);
11817 } else {
11818 // Collect operands - commute if it uses the swapped predicate.
11819 for (auto [Idx, V] : enumerate(VL)) {
11820 if (isa<PoisonValue>(V))
11821 continue;
11822 auto *Cmp = cast<CmpInst>(V);
11823 if (Cmp->getPredicate() != P0)
11824 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11825 }
11826 }
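// Illustration (hypothetical): for {icmp sgt %a, %b; icmp slt %c, %d} the
// second compare uses the swapped predicate, so its operands are commuted to
// (%d, %c) and both lanes then agree on the sgt predicate.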
11827 TE->setOperands(Operands);
11828 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11829 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11830 if (ShuffleOrOp == Instruction::ICmp) {
11831 unsigned NumSignBits0 =
11832 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
11833 if (NumSignBits0 * 2 >=
11834 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
11835 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11836 unsigned NumSignBits1 =
11837 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
11838 if (NumSignBits1 * 2 >=
11839 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
11840 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11841 }
11842 return;
11843 }
11844 case Instruction::Select:
11845 case Instruction::FNeg:
11846 case Instruction::Add:
11847 case Instruction::FAdd:
11848 case Instruction::Sub:
11849 case Instruction::FSub:
11850 case Instruction::Mul:
11851 case Instruction::FMul:
11852 case Instruction::UDiv:
11853 case Instruction::SDiv:
11854 case Instruction::FDiv:
11855 case Instruction::URem:
11856 case Instruction::SRem:
11857 case Instruction::FRem:
11858 case Instruction::Shl:
11859 case Instruction::LShr:
11860 case Instruction::AShr:
11861 case Instruction::And:
11862 case Instruction::Or:
11863 case Instruction::Xor:
11864 case Instruction::Freeze: {
11865 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11866 ReuseShuffleIndices);
11867 LLVM_DEBUG(
11868 dbgs() << "SLP: added a new TreeEntry "
11869 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
11870 TE->dump());
11871
11872 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
11873 VLOperands Ops(VL, Operands, S, *this);
11874 Ops.reorder();
11875 Operands[0] = Ops.getVL(0);
11876 Operands[1] = Ops.getVL(1);
11877 }
11878 TE->setOperands(Operands);
11879 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11880 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11881 return;
11882 }
11883 case Instruction::GetElementPtr: {
11884 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11885 ReuseShuffleIndices);
11886 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
11887 TE->dump());
11888 TE->setOperands(Operands);
11889
11890 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
11891 buildTreeRec(Operands[I], Depth + 1, {TE, I});
11892 return;
11893 }
11894 case Instruction::Store: {
11895 bool Consecutive = CurrentOrder.empty();
11896 if (!Consecutive)
11897 fixupOrderingIndices(CurrentOrder);
11898 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11899 ReuseShuffleIndices, CurrentOrder);
11900 if (Consecutive)
11901 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
11902 TE->dump());
11903 else
11904 LLVM_DEBUG(
11905 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
11906 TE->dump());
11907 TE->setOperands(Operands);
11908 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
11909 return;
11910 }
11911 case Instruction::Call: {
11912 // Check if the calls are all to the same vectorizable intrinsic or
11913 // library function.
11914 CallInst *CI = cast<CallInst>(VL0);
11915 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11916
11917 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11918 ReuseShuffleIndices);
11919 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
11920 TE->dump());
11921 if (isCommutative(VL0)) {
11922 VLOperands Ops(VL, Operands, S, *this);
11923 Ops.reorder();
11924 Operands[0] = Ops.getVL(0);
11925 Operands[1] = Ops.getVL(1);
11926 }
11927 TE->setOperands(Operands);
11928 for (unsigned I : seq<unsigned>(CI->arg_size())) {
11929 // For scalar operands there is no need to create an entry, since there is
11930 // no need to vectorize them.
11931 if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
11932 continue;
11933 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11934 }
11935 return;
11936 }
11937 case Instruction::ShuffleVector: {
11938 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
11939 ReuseShuffleIndices);
11940 if (S.isAltShuffle()) {
11941 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
11942 TE->dump());
11943 } else {
11944 assert(SLPReVec && "Only supported by REVEC.");
11945 LLVM_DEBUG(
11946 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
11947 TE->dump());
11948 }
11949
11950 // Reorder operands if reordering would enable vectorization.
11951 auto *CI = dyn_cast<CmpInst>(VL0);
11952 if (CI && any_of(VL, [](Value *V) {
11953 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
11954 })) {
11955 auto *MainCI = cast<CmpInst>(S.getMainOp());
11956 auto *AltCI = cast<CmpInst>(S.getAltOp());
11957 CmpInst::Predicate MainP = MainCI->getPredicate();
11958 CmpInst::Predicate AltP = AltCI->getPredicate();
11959 assert(MainP != AltP &&
11960 "Expected different main/alternate predicates.");
11961 // Collect operands - commute if it uses the swapped predicate or
11962 // alternate operation.
11963 for (auto [Idx, V] : enumerate(VL)) {
11964 if (isa<PoisonValue>(V))
11965 continue;
11966 auto *Cmp = cast<CmpInst>(V);
11967
11968 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
11969 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11970 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11971 } else {
11972 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
11973 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
11974 }
11975 }
11976 TE->setOperands(Operands);
11977 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
11978 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
11979 return;
11980 }
11981
11982 if (isa<BinaryOperator>(VL0) || CI) {
11983 VLOperands Ops(VL, Operands, S, *this);
11984 Ops.reorder();
11985 Operands[0] = Ops.getVL(0);
11986 Operands[1] = Ops.getVL(1);
11987 }
11988 TE->setOperands(Operands);
11989 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
11990 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
11991 return;
11992 }
11993 default:
11994 break;
11995 }
11996 llvm_unreachable("Unexpected vectorization of the instructions.");
11997}
11998
11998
11999 unsigned BoUpSLP::canMapToVector(Type *T) const {
12000 unsigned N = 1;
12001 Type *EltTy = T;
12002
12003 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
12004 if (EltTy->isEmptyTy())
12005 return 0;
12006 if (auto *ST = dyn_cast<StructType>(EltTy)) {
12007 // Check that struct is homogeneous.
12008 for (const auto *Ty : ST->elements())
12009 if (Ty != *ST->element_begin())
12010 return 0;
12011 N *= ST->getNumElements();
12012 EltTy = *ST->element_begin();
12013 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
12014 N *= AT->getNumElements();
12015 EltTy = AT->getElementType();
12016 } else {
12017 auto *VT = cast<FixedVectorType>(EltTy);
12018 N *= VT->getNumElements();
12019 EltTy = VT->getElementType();
12020 }
12021 }
12022
12023 if (!isValidElementType(EltTy))
12024 return 0;
12025 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
12026 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12027 VTSize != DL->getTypeStoreSizeInBits(T))
12028 return 0;
12029 return N;
12030}
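// Illustrative example: a homogeneous aggregate such as [4 x float] or
// {float, float, float, float} maps to N = 4 here, provided the widened type
// <4 x float> fits between MinVecRegSize and MaxVecRegSize and its store size
// matches the store size of the original type T.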
12031
12032bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
12033 SmallVectorImpl<unsigned> &CurrentOrder,
12034 bool ResizeAllowed) const {
12035 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
12036 assert(It != VL.end() && "Expected at least one extract instruction.");
12037 auto *E0 = cast<Instruction>(*It);
12038 assert(
12040 "Invalid opcode");
12041 // Check if all of the extracts come from the same vector and from the
12042 // correct offset.
12043 Value *Vec = E0->getOperand(0);
12044
12045 CurrentOrder.clear();
12046
12047 // We have to extract from a vector/aggregate with the same number of elements.
12048 unsigned NElts;
12049 if (E0->getOpcode() == Instruction::ExtractValue) {
12050 NElts = canMapToVector(Vec->getType());
12051 if (!NElts)
12052 return false;
12053 // Check if load can be rewritten as load of vector.
12054 LoadInst *LI = dyn_cast<LoadInst>(Vec);
12055 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
12056 return false;
12057 } else {
12058 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
12059 }
12060
12061 unsigned E = VL.size();
12062 if (!ResizeAllowed && NElts != E)
12063 return false;
12064 SmallVector<int> Indices(E, PoisonMaskElem);
12065 unsigned MinIdx = NElts, MaxIdx = 0;
12066 for (auto [I, V] : enumerate(VL)) {
12067 auto *Inst = dyn_cast<Instruction>(V);
12068 if (!Inst)
12069 continue;
12070 if (Inst->getOperand(0) != Vec)
12071 return false;
12072 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
12073 if (isa<UndefValue>(EE->getIndexOperand()))
12074 continue;
12075 std::optional<unsigned> Idx = getExtractIndex(Inst);
12076 if (!Idx)
12077 return false;
12078 const unsigned ExtIdx = *Idx;
12079 if (ExtIdx >= NElts)
12080 continue;
12081 Indices[I] = ExtIdx;
12082 if (MinIdx > ExtIdx)
12083 MinIdx = ExtIdx;
12084 if (MaxIdx < ExtIdx)
12085 MaxIdx = ExtIdx;
12086 }
12087 if (MaxIdx - MinIdx + 1 > E)
12088 return false;
12089 if (MaxIdx + 1 <= E)
12090 MinIdx = 0;
12091
12092 // Check that all of the indices extract from the correct offset.
12093 bool ShouldKeepOrder = true;
12094 // Assign to all items the initial value E so we can check if the extract
12095 // instruction index was used already.
12096 // Also, later we can check that all the indices are used and we have a
12097 // consecutive access in the extract instructions, by checking that no
12098 // element of CurrentOrder still has value E.
12099 CurrentOrder.assign(E, E);
12100 for (unsigned I = 0; I < E; ++I) {
12101 if (Indices[I] == PoisonMaskElem)
12102 continue;
12103 const unsigned ExtIdx = Indices[I] - MinIdx;
12104 if (CurrentOrder[ExtIdx] != E) {
12105 CurrentOrder.clear();
12106 return false;
12107 }
12108 ShouldKeepOrder &= ExtIdx == I;
12109 CurrentOrder[ExtIdx] = I;
12110 }
12111 if (ShouldKeepOrder)
12112 CurrentOrder.clear();
12113
12114 return ShouldKeepOrder;
12115}
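// Illustrative example: for extracts of lanes 1, 0, 3, 2 from the same
// 4-element vector, CurrentOrder becomes {1, 0, 3, 2} and the function returns
// false; if the extracts already appear in lane order 0..3, CurrentOrder is
// cleared and the function returns true.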
12116
12117bool BoUpSLP::areAllUsersVectorized(
12118 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12119 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
12120 all_of(I->users(), [this](User *U) {
12121 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12122 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12123 });
12124}
12125
12126void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12127 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12128 SmallVectorImpl<Value *> *OpScalars,
12129 SmallVectorImpl<Value *> *AltScalars) const {
12130 unsigned Sz = Scalars.size();
12131 Mask.assign(Sz, PoisonMaskElem);
12132 SmallVector<int> OrderMask;
12133 if (!ReorderIndices.empty())
12134 inversePermutation(ReorderIndices, OrderMask);
12135 for (unsigned I = 0; I < Sz; ++I) {
12136 unsigned Idx = I;
12137 if (!ReorderIndices.empty())
12138 Idx = OrderMask[I];
12139 if (isa<PoisonValue>(Scalars[Idx]))
12140 continue;
12141 auto *OpInst = cast<Instruction>(Scalars[Idx]);
12142 if (IsAltOp(OpInst)) {
12143 Mask[I] = Sz + Idx;
12144 if (AltScalars)
12145 AltScalars->push_back(OpInst);
12146 } else {
12147 Mask[I] = Idx;
12148 if (OpScalars)
12149 OpScalars->push_back(OpInst);
12150 }
12151 }
12152 if (!ReuseShuffleIndices.empty()) {
12153 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12154 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
12155 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12156 });
12157 Mask.swap(NewMask);
12158 }
12159}
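// Illustrative example: for a 4-wide alternating node where IsAltOp holds for
// the odd lanes and there is no reordering, the resulting mask is
// <0, Sz + 1, 2, Sz + 3>, i.e. even lanes select from the "main" vector and
// odd lanes from the "alternate" vector of the final shuffle.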
12160
12161 static bool isMainInstruction(Instruction *I, Instruction *MainOp,
12162 Instruction *AltOp,
12163 const TargetLibraryInfo &TLI) {
12164 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12165}
12166
12167 static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
12168 Instruction *AltOp,
12169 const TargetLibraryInfo &TLI) {
12170 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
12171 auto *AltCI = cast<CmpInst>(AltOp);
12172 CmpInst::Predicate MainP = MainCI->getPredicate();
12173 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12174 assert(MainP != AltP && "Expected different main/alternate predicates.");
12175 auto *CI = cast<CmpInst>(I);
12176 if (isCmpSameOrSwapped(MainCI, CI, TLI))
12177 return false;
12178 if (isCmpSameOrSwapped(AltCI, CI, TLI))
12179 return true;
12180 CmpInst::Predicate P = CI->getPredicate();
12181 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
12182
12183 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12184 "CmpInst expected to match either main or alternate predicate or "
12185 "their swap.");
12186 return MainP != P && MainP != SwappedP;
12187 }
12188 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12189}
12190
12191TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
12192 assert(!Ops.empty());
12193 const auto *Op0 = Ops.front();
12194
12195 const bool IsConstant = all_of(Ops, [](Value *V) {
12196 // TODO: We should allow undef elements here
12197 return isConstant(V) && !isa<UndefValue>(V);
12198 });
12199 const bool IsUniform = all_of(Ops, [=](Value *V) {
12200 // TODO: We should allow undef elements here
12201 return V == Op0;
12202 });
12203 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
12204 // TODO: We should allow undef elements here
12205 if (auto *CI = dyn_cast<ConstantInt>(V))
12206 return CI->getValue().isPowerOf2();
12207 return false;
12208 });
12209 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
12210 // TODO: We should allow undef elements here
12211 if (auto *CI = dyn_cast<ConstantInt>(V))
12212 return CI->getValue().isNegatedPowerOf2();
12213 return false;
12214 });
12215
12216 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12217 if (IsConstant && IsUniform)
12218 VK = TTI::OK_UniformConstantValue;
12219 else if (IsConstant)
12220 VK = TTI::OK_NonUniformConstantValue;
12221 else if (IsUniform)
12222 VK = TTI::OK_UniformValue;
12223
12224 TTI::OperandValueProperties VP = TTI::OP_None;
12225 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12226 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12227
12228 return {VK, VP};
12229}
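// Illustrative example: if every operand in Ops is the constant 8, the result
// is {OK_UniformConstantValue, OP_PowerOf2}; for distinct non-constant
// operands it degrades to {OK_AnyValue, OP_None}.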
12230
12231namespace {
12232/// The base class for shuffle instruction emission and shuffle cost estimation.
12233class BaseShuffleAnalysis {
12234protected:
12235 Type *ScalarTy = nullptr;
12236
12237 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12238
12239 /// V is expected to be a vectorized value.
12240 /// When REVEC is disabled, there is no difference between VF and
12241 /// VNumElements.
12242 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12243 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12244 /// of 8.
12245 unsigned getVF(Value *V) const {
12246 assert(V && "V cannot be nullptr");
12247 assert(isa<FixedVectorType>(V->getType()) &&
12248 "V does not have FixedVectorType");
12249 assert(ScalarTy && "ScalarTy cannot be nullptr");
12250 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12251 unsigned VNumElements =
12252 cast<FixedVectorType>(V->getType())->getNumElements();
12253 assert(VNumElements > ScalarTyNumElements &&
12254 "the number of elements of V is not large enough");
12255 assert(VNumElements % ScalarTyNumElements == 0 &&
12256 "the number of elements of V is not a vectorized value");
12257 return VNumElements / ScalarTyNumElements;
12258 }
12259
12260 /// Checks if the mask is an identity mask.
12261 /// \param IsStrict if is true the function returns false if mask size does
12262 /// not match vector size.
12263 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12264 bool IsStrict) {
12265 int Limit = Mask.size();
12266 int VF = VecTy->getNumElements();
12267 int Index = -1;
12268 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
12269 return true;
12270 if (!IsStrict) {
12271 // Consider extract subvector starting from index 0.
12272 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
12273 Index == 0)
12274 return true;
12275 // All VF-size submasks are identity (e.g.
12276 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12277 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
12278 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
12279 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
12280 ShuffleVectorInst::isIdentityMask(Slice, VF);
12281 }))
12282 return true;
12283 }
12284 return false;
12285 }
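// Illustrative example: for a 2-element source vector, the mask
// <poison, poison, 0, 1> is accepted as an identity when IsStrict is false,
// because every 2-wide submask is either all-poison or the identity <0, 1>.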
12286
12287 /// Tries to combine 2 different masks into single one.
12288 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12289 /// change the size of the vector, \p LocalVF is the original size of the
12290 /// shuffled vector.
12291 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12292 ArrayRef<int> ExtMask) {
12293 unsigned VF = Mask.size();
12294 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12295 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12296 if (ExtMask[I] == PoisonMaskElem)
12297 continue;
12298 int MaskedIdx = Mask[ExtMask[I] % VF];
12299 NewMask[I] =
12300 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12301 }
12302 Mask.swap(NewMask);
12303 }
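// Illustrative example: with LocalVF = 2, Mask = <1, 0> and
// ExtMask = <0, 1, 0, 1>, every result lane I selects Mask[ExtMask[I] % VF],
// so the combined mask becomes <1, 0, 1, 0>.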
12304
12305 /// Looks through shuffles trying to reduce final number of shuffles in the
12306 /// code. The function looks through the previously emitted shuffle
12307 /// instructions and properly marks indices in the mask as undef.
12308 /// For example, given the code
12309 /// \code
12310 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12311 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12312 /// \endcode
12313 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12314 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12315 /// <0, 1, 2, 3> for the shuffle.
12316 /// If 2 operands are of different size, the smallest one will be resized and
12317 /// the mask recalculated properly.
12318 /// For example, given the code
12319 /// \code
12320 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12321 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12322 /// \endcode
12323 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12324 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12325 /// <0, 1, 2, 3> for the shuffle.
12326 /// So, it tries to transform permutations to simple vector merge, if
12327 /// possible.
12328 /// \param V The input vector which must be shuffled using the given \p Mask.
12329 /// If the better candidate is found, \p V is set to this best candidate
12330 /// vector.
12331 /// \param Mask The input mask for the shuffle. If the best candidate is found
12332 /// during looking-through-shuffles attempt, it is updated accordingly.
12333 /// \param SinglePermute true if the shuffle operation is originally a
12334 /// single-value-permutation. In this case the look-through-shuffles procedure
12335 /// may look for resizing shuffles as the best candidates.
12336 /// \return true if the shuffle results in the non-resizing identity shuffle
12337 /// (and thus can be ignored), false - otherwise.
12338 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12339 bool SinglePermute) {
12340 Value *Op = V;
12341 ShuffleVectorInst *IdentityOp = nullptr;
12342 SmallVector<int> IdentityMask;
12343 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
12344 // Exit if not a fixed vector type or changing size shuffle.
12345 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
12346 if (!SVTy)
12347 break;
12348 // Remember the identity or broadcast mask, if it is not a resizing
12349 // shuffle. If no better candidates are found, this Op and Mask will be
12350 // used in the final shuffle.
12351 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
12352 if (!IdentityOp || !SinglePermute ||
12353 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
12354 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
12355 IdentityMask.size()))) {
12356 IdentityOp = SV;
12357 // Store the current mask in IdentityMask so that we do not lose
12358 // this info if IdentityOp is selected as the best candidate for the
12359 // permutation.
12360 IdentityMask.assign(Mask);
12361 }
12362 }
12363 // Remember the broadcast mask. If no better candidates are found, this Op
12364 // and Mask will be used in the final shuffle.
12365 // Zero splat can be used as identity too, since it might be used with
12366 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12367 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
12368 // expensive, and the analysis finds out that the source vector is just a
12369 // broadcast, the original mask can be transformed to the identity mask <0,
12370 // 1, 2, 3>.
12371 // \code
12372 // %0 = shuffle %v, poison, zeroinitalizer
12373 // %res = shuffle %0, poison, <3, 1, 2, 0>
12374 // \endcode
12375 // may be transformed to
12376 // \code
12377 // %0 = shuffle %v, poison, zeroinitalizer
12378 // %res = shuffle %0, poison, <0, 1, 2, 3>
12379 // \endcode
12380 if (SV->isZeroEltSplat()) {
12381 IdentityOp = SV;
12382 IdentityMask.assign(Mask);
12383 }
12384 int LocalVF = Mask.size();
12385 if (auto *SVOpTy =
12386 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
12387 LocalVF = SVOpTy->getNumElements();
12388 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12389 for (auto [Idx, I] : enumerate(Mask)) {
12390 if (I == PoisonMaskElem ||
12391 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12392 continue;
12393 ExtMask[Idx] = SV->getMaskValue(I);
12394 }
12395 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12396 SV->getOperand(0),
12397 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
12398 .all();
12399 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12400 SV->getOperand(1),
12401 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
12402 .all();
12403 if (!IsOp1Undef && !IsOp2Undef) {
12404 // Update mask and mark undef elems.
12405 for (int &I : Mask) {
12406 if (I == PoisonMaskElem)
12407 continue;
12408 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
12409 PoisonMaskElem)
12410 I = PoisonMaskElem;
12411 }
12412 break;
12413 }
12414 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12415 combineMasks(LocalVF, ShuffleMask, Mask);
12416 Mask.swap(ShuffleMask);
12417 if (IsOp2Undef)
12418 Op = SV->getOperand(0);
12419 else
12420 Op = SV->getOperand(1);
12421 }
12422 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
12423 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12424 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
12425 if (IdentityOp) {
12426 V = IdentityOp;
12427 assert(Mask.size() == IdentityMask.size() &&
12428 "Expected masks of same sizes.");
12429 // Clear known poison elements.
12430 for (auto [I, Idx] : enumerate(Mask))
12431 if (Idx == PoisonMaskElem)
12432 IdentityMask[I] = PoisonMaskElem;
12433 Mask.swap(IdentityMask);
12434 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
12435 return SinglePermute &&
12436 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
12437 /*IsStrict=*/true) ||
12438 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12439 Shuffle->isZeroEltSplat() &&
12440 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size()) &&
12441 all_of(enumerate(Mask), [&](const auto &P) {
12442 return P.value() == PoisonMaskElem ||
12443 Shuffle->getShuffleMask()[P.index()] == 0;
12444 })));
12445 }
12446 V = Op;
12447 return false;
12448 }
12449 V = Op;
12450 return true;
12451 }
12452
12453 /// Smart shuffle instruction emission, walks through shuffles trees and
12454 /// tries to find the best matching vector for the actual shuffle
12455 /// instruction.
12456 template <typename T, typename ShuffleBuilderTy>
12457 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12458 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12459 assert(V1 && "Expected at least one vector value.");
12460 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12461 SmallVector<int> NewMask(Mask);
12462 if (ScalarTyNumElements != 1) {
12463 assert(SLPReVec && "FixedVectorType is not expected.");
12464 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
12465 Mask = NewMask;
12466 }
12467 if (V2)
12468 Builder.resizeToMatch(V1, V2);
12469 int VF = Mask.size();
12470 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12471 VF = FTy->getNumElements();
12472 if (V2 && !isUndefVector</*isPoisonOnly=*/true>(
12473 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
12474 .all()) {
12475 // Peek through shuffles.
12476 Value *Op1 = V1;
12477 Value *Op2 = V2;
12478 int VF =
12479 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
12480 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12481 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12482 for (int I = 0, E = Mask.size(); I < E; ++I) {
12483 if (Mask[I] < VF)
12484 CombinedMask1[I] = Mask[I];
12485 else
12486 CombinedMask2[I] = Mask[I] - VF;
12487 }
12488 Value *PrevOp1;
12489 Value *PrevOp2;
12490 do {
12491 PrevOp1 = Op1;
12492 PrevOp2 = Op2;
12493 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
12494 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
12495 // Check if we have 2 resizing shuffles - need to peek through operands
12496 // again.
12497 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
12498 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
12499 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12500 for (auto [Idx, I] : enumerate(CombinedMask1)) {
12501 if (I == PoisonMaskElem)
12502 continue;
12503 ExtMask1[Idx] = SV1->getMaskValue(I);
12504 }
12505 SmallBitVector UseMask1 = buildUseMask(
12506 cast<FixedVectorType>(SV1->getOperand(1)->getType())
12507 ->getNumElements(),
12508 ExtMask1, UseMask::SecondArg);
12509 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12510 for (auto [Idx, I] : enumerate(CombinedMask2)) {
12511 if (I == PoisonMaskElem)
12512 continue;
12513 ExtMask2[Idx] = SV2->getMaskValue(I);
12514 }
12515 SmallBitVector UseMask2 = buildUseMask(
12516 cast<FixedVectorType>(SV2->getOperand(1)->getType())
12517 ->getNumElements(),
12518 ExtMask2, UseMask::SecondArg);
12519 if (SV1->getOperand(0)->getType() ==
12520 SV2->getOperand(0)->getType() &&
12521 SV1->getOperand(0)->getType() != SV1->getType() &&
12522 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
12523 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
12524 Op1 = SV1->getOperand(0);
12525 Op2 = SV2->getOperand(0);
12526 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12527 int LocalVF = ShuffleMask1.size();
12528 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
12529 LocalVF = FTy->getNumElements();
12530 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12531 CombinedMask1.swap(ShuffleMask1);
12532 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12533 LocalVF = ShuffleMask2.size();
12534 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
12535 LocalVF = FTy->getNumElements();
12536 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12537 CombinedMask2.swap(ShuffleMask2);
12538 }
12539 }
12540 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12541 Builder.resizeToMatch(Op1, Op2);
12542 VF = std::max(cast<VectorType>(Op1->getType())
12543 ->getElementCount()
12544 .getKnownMinValue(),
12545 cast<VectorType>(Op2->getType())
12546 ->getElementCount()
12547 .getKnownMinValue());
12548 for (int I = 0, E = Mask.size(); I < E; ++I) {
12549 if (CombinedMask2[I] != PoisonMaskElem) {
12550 assert(CombinedMask1[I] == PoisonMaskElem &&
12551 "Expected undefined mask element");
12552 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12553 }
12554 }
12555 if (Op1 == Op2 &&
12556 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
12557 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
12558 isa<ShuffleVectorInst>(Op1) &&
12559 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
12560 ArrayRef(CombinedMask1))))
12561 return Builder.createIdentity(Op1);
12562 return Builder.createShuffleVector(
12563 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
12564 CombinedMask1);
12565 }
12566 if (isa<PoisonValue>(V1))
12567 return Builder.createPoison(
12568 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
12569 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
12570 assert(V1 && "Expected non-null value after looking through shuffles.");
12571
12572 if (!IsIdentity)
12573 return Builder.createShuffleVector(V1, NewMask);
12574 return Builder.createIdentity(V1);
12575 }
12576
12577 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
12578 /// shuffle emission.
12579 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
12580 ArrayRef<int> Mask) {
12581 for (unsigned I : seq<unsigned>(CommonMask.size()))
12582 if (Mask[I] != PoisonMaskElem)
12583 CommonMask[I] = I;
12584 }
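// Illustrative example: with CommonMask = <5, 7, poison, 6> and
// Mask = <0, poison, 2, 3>, the lanes produced by the emitted shuffle are
// remapped to themselves, giving CommonMask = <0, 7, 2, 3>.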
12585};
12586} // namespace
12587
12588/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
12589static std::pair<InstructionCost, InstructionCost>
12590 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
12591 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
12592 Type *ScalarTy, VectorType *VecTy) {
12593 InstructionCost ScalarCost = 0;
12594 InstructionCost VecCost = 0;
12595 // Here we differentiate two cases: (1) when Ptrs represent a regular
12596 // vectorization tree node (as they are pointer arguments of scattered
12597 // loads) or (2) when Ptrs are the arguments of loads or stores being
12598 // vectorized as a plain wide unit-stride load/store since all the
12599 // loads/stores are known to be from/to adjacent locations.
12600 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12601 // Case 2: estimate costs for pointer related costs when vectorizing to
12602 // a wide load/store.
12603 // Scalar cost is estimated as a set of pointers with known relationship
12604 // between them.
12605 // For vector code we will use BasePtr as argument for the wide load/store
12606 // but we also need to account all the instructions which are going to
12607 // stay in vectorized code due to uses outside of these scalar
12608 // loads/stores.
12609 ScalarCost = TTI.getPointersChainCost(
12610 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12611 CostKind);
12612
12613 SmallVector<const Value *> PtrsRetainedInVecCode;
12614 for (Value *V : Ptrs) {
12615 if (V == BasePtr) {
12616 PtrsRetainedInVecCode.push_back(V);
12617 continue;
12618 }
12619 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12620 // For simplicity assume Ptr stays in vectorized code if it's not a
12621 // GEP instruction. We don't care since its cost is considered free.
12622 // TODO: We should check for any uses outside of vectorizable tree
12623 // rather than just single use.
12624 if (!Ptr || !Ptr->hasOneUse())
12625 PtrsRetainedInVecCode.push_back(V);
12626 }
12627
12628 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
12629 // If all pointers stay in vectorized code then we don't have
12630 // any savings on that.
12631 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
12632 }
12633 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12634 TTI::PointersChainInfo::getKnownStride(),
12635 VecTy, CostKind);
12636 } else {
12637 // Case 1: Ptrs are the arguments of loads that we are going to transform
12638 // into masked gather load intrinsic.
12639 // All the scalar GEPs will be removed as a result of vectorization.
12640 // For any external uses of some lanes extract element instructions will
12641 // be generated (which cost is estimated separately).
12642 TTI::PointersChainInfo PtrsInfo =
12643 all_of(Ptrs,
12644 [](const Value *V) {
12645 const auto *Ptr = dyn_cast<GetElementPtrInst>(V);
12646 return Ptr && !Ptr->hasAllConstantIndices();
12647 })
12648 ? TTI::PointersChainInfo::getUnknownStride()
12649 : TTI::PointersChainInfo::getKnownStride();
12650
12651 ScalarCost =
12652 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
12653 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
12654 if (!BaseGEP) {
12655 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
12656 if (It != Ptrs.end())
12657 BaseGEP = cast<GEPOperator>(*It);
12658 }
12659 if (BaseGEP) {
12660 SmallVector<const Value *> Indices(BaseGEP->indices());
12661 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
12662 BaseGEP->getPointerOperand(), Indices, VecTy,
12663 CostKind);
12664 }
12665 }
12666
12667 return std::make_pair(ScalarCost, VecCost);
12668}
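// Illustrative example: for a unit-stride store chain in which every pointer
// except BasePtr is a single-use GEP, only BasePtr is retained in the
// vectorized code, so the returned VecCost covers just that one pointer while
// ScalarCost covers the whole scalar pointer chain.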
12669
12670void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12671 assert(TE.isGather() && TE.ReorderIndices.empty() &&
12672 "Expected gather node without reordering.");
12673 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
12674 SmallSet<size_t, 2> LoadKeyUsed;
12675
12676 // Do not reorder the node if it is small (just 2 elements), all-constant,
12677 // or all instructions already have the same opcode.
12678 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
12679 all_of(TE.Scalars, isConstant))
12680 return;
12681
12682 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
12683 return VectorizableTree[Idx]->isSame(TE.Scalars);
12684 }))
12685 return;
12686
12687 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
12688 Key = hash_combine(hash_value(LI->getParent()), Key);
12689 Value *Ptr =
12690 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
12691 if (LoadKeyUsed.contains(Key)) {
12692 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
12693 if (LIt != LoadsMap.end()) {
12694 for (LoadInst *RLI : LIt->second) {
12695 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
12696 LI->getType(), LI->getPointerOperand(), *DL, *SE,
12697 /*StrictCheck=*/true))
12698 return hash_value(RLI->getPointerOperand());
12699 }
12700 for (LoadInst *RLI : LIt->second) {
12701 if (arePointersCompatible(RLI->getPointerOperand(),
12702 LI->getPointerOperand(), *TLI)) {
12703 hash_code SubKey = hash_value(RLI->getPointerOperand());
12704 return SubKey;
12705 }
12706 }
12707 if (LIt->second.size() > 2) {
12708 hash_code SubKey =
12709 hash_value(LIt->second.back()->getPointerOperand());
12710 return SubKey;
12711 }
12712 }
12713 }
12714 LoadKeyUsed.insert(Key);
12715 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
12716 return hash_value(LI->getPointerOperand());
12717 };
12718 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12719 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
12720 bool IsOrdered = true;
12721 unsigned NumInstructions = 0;
12722 // Try to "cluster" scalar instructions, to be able to build extra vectorized
12723 // nodes.
12724 for (auto [I, V] : enumerate(TE.Scalars)) {
12725 size_t Key = 1, Idx = 1;
12726 if (auto *Inst = dyn_cast<Instruction>(V);
12727 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
12728 !isDeleted(Inst) && !isVectorized(V)) {
12729 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
12730 /*AllowAlternate=*/false);
12731 ++NumInstructions;
12732 }
12733 auto &Container = SortedValues[Key];
12734 if (IsOrdered && !KeyToIndex.contains(V) &&
12737 ((Container.contains(Idx) &&
12738 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
12739 (!Container.empty() && !Container.contains(Idx) &&
12740 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
12741 IsOrdered = false;
12742 auto &KTI = KeyToIndex[V];
12743 if (KTI.empty())
12744 Container[Idx].push_back(V);
12745 KTI.push_back(I);
12746 }
12747 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
12748 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12749 if (!IsOrdered && NumInstructions > 1) {
12750 unsigned Cnt = 0;
12751 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
12752 for (const auto &D : SortedValues) {
12753 for (const auto &P : D.second) {
12754 unsigned Sz = 0;
12755 for (Value *V : P.second) {
12756 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
12757 for (auto [K, Idx] : enumerate(Indices)) {
12758 TE.ReorderIndices[Cnt + K] = Idx;
12759 TE.Scalars[Cnt + K] = V;
12760 }
12761 Sz += Indices.size();
12762 Cnt += Indices.size();
12763 }
12764 if (Sz > 1 && isa<Instruction>(P.second.front())) {
12765 const unsigned SubVF = getFloorFullVectorNumberOfElements(
12766 *TTI, TE.Scalars.front()->getType(), Sz);
12767 SubVectors.emplace_back(Cnt - Sz, SubVF);
12768 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
12769 DemandedElts.clearBit(I);
12770 } else if (!P.second.empty() && isConstant(P.second.front())) {
12771 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
12772 DemandedElts.clearBit(I);
12773 }
12774 }
12775 }
12776 }
12777 // Reuses always require shuffles, so consider it as profitable.
12778 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
12779 return;
12780 // Do simple cost estimation.
12781 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12782 InstructionCost Cost = 0;
12783 auto *ScalarTy = TE.Scalars.front()->getType();
12784 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
12785 for (auto [Idx, Sz] : SubVectors) {
12786 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
12787 Idx, getWidenedType(ScalarTy, Sz));
12788 }
12789 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12790 /*Insert=*/true,
12791 /*Extract=*/false, CostKind);
12792 int Sz = TE.Scalars.size();
12793 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
12794 TE.ReorderIndices.end());
12795 for (unsigned I : seq<unsigned>(Sz)) {
12796 Value *V = TE.getOrdered(I);
12797 if (isa<PoisonValue>(V)) {
12798 ReorderMask[I] = PoisonMaskElem;
12799 } else if (isConstant(V) || DemandedElts[I]) {
12800 ReorderMask[I] = I + TE.ReorderIndices.size();
12801 }
12802 }
12803 Cost += ::getShuffleCost(*TTI,
12804 any_of(ReorderMask, [&](int I) { return I >= Sz; })
12805 ? TTI::SK_PermuteTwoSrc
12806 : TTI::SK_PermuteSingleSrc,
12807 VecTy, ReorderMask);
12808 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
12809 ReorderMask.assign(Sz, PoisonMaskElem);
12810 for (unsigned I : seq<unsigned>(Sz)) {
12811 Value *V = TE.getOrdered(I);
12812 if (isConstant(V)) {
12813 DemandedElts.clearBit(I);
12814 if (!isa<PoisonValue>(V))
12815 ReorderMask[I] = I;
12816 } else {
12817 ReorderMask[I] = I + Sz;
12818 }
12819 }
12820 InstructionCost BVCost =
12821 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
12822 /*Insert=*/true, /*Extract=*/false, CostKind);
12823 if (!DemandedElts.isAllOnes())
12824 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
12825 if (Cost >= BVCost) {
12826 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
12827 reorderScalars(TE.Scalars, Mask);
12828 TE.ReorderIndices.clear();
12829 }
12830}
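// Rough illustrative sketch: a gather node such as {a[0], 5, a[1], 7} may be
// reordered to {a[0], a[1], 5, 7} so the two loads form a sub-vector that can
// be vectorized on its own while the constants remain a cheap build-vector,
// but only if the estimated reshuffle cost does not exceed the plain
// build-vector cost.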
12831
12832/// Check if we can convert fadd/fsub sequence to FMAD.
12833/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
12834 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
12835 const InstructionsState &S,
12836 DominatorTree &DT, const DataLayout &DL,
12837 const TargetTransformInfo &TTI,
12838 const TargetLibraryInfo &TLI) {
12839 assert(all_of(VL,
12840 [](Value *V) {
12841 return V->getType()->getScalarType()->isFloatingPointTy();
12842 }) &&
12843 "Can only convert to FMA for floating point types");
12844 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12845
12846 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12847 FastMathFlags FMF;
12848 FMF.set();
12849 for (Value *V : VL) {
12850 auto *I = dyn_cast<Instruction>(V);
12851 if (!I)
12852 continue;
12853 if (S.isCopyableElement(I))
12854 continue;
12855 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12856 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12857 continue;
12858 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12859 FMF &= FPCI->getFastMathFlags();
12860 }
12861 return FMF.allowContract();
12862 };
12863 if (!CheckForContractable(VL))
12864 return InstructionCost::getInvalid();
12865 // fmul should also be contractable.
12866 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12867 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12868
12869 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12870 if (!OpS.valid())
12871 return InstructionCost::getInvalid();
12872
12873 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12874 return InstructionCost::getInvalid();
12875 if (!CheckForContractable(Operands.front()))
12876 return InstructionCost::getInvalid();
12877 // Compare the costs.
12878 InstructionCost FMulPlusFAddCost = 0;
12879 InstructionCost FMACost = 0;
12880 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12881 FastMathFlags FMF;
12882 FMF.set();
12883 for (Value *V : VL) {
12884 auto *I = dyn_cast<Instruction>(V);
12885 if (!I)
12886 continue;
12887 if (!S.isCopyableElement(I))
12888 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12889 FMF &= FPCI->getFastMathFlags();
12890 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12891 }
12892 unsigned NumOps = 0;
12893 for (auto [V, Op] : zip(VL, Operands.front())) {
12894 if (S.isCopyableElement(V))
12895 continue;
12896 auto *I = dyn_cast<Instruction>(Op);
12897 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
12898 if (auto *OpI = dyn_cast<Instruction>(V))
12899 FMACost += TTI.getInstructionCost(OpI, CostKind);
12900 if (I)
12901 FMACost += TTI.getInstructionCost(I, CostKind);
12902 continue;
12903 }
12904 ++NumOps;
12905 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12906 FMF &= FPCI->getFastMathFlags();
12907 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12908 }
12909 Type *Ty = VL.front()->getType();
12910 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
12911 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
12912 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
12913}
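// Illustrative example: a bundle of contractable fadds such as
//   %m = fmul contract float %a, %b
//   %r = fadd contract float %m, %c
// where each fmul has a single use is costed as a call to llvm.fmuladd, and
// the conversion is reported only when that intrinsic cost beats the separate
// fmul + fadd costs.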
12914
12915 void BoUpSLP::transformNodes() {
12916 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12917 BaseGraphSize = VectorizableTree.size();
12918 // Turn graph transforming mode on and off, when done.
12919 class GraphTransformModeRAAI {
12920 bool &SavedIsGraphTransformMode;
12921
12922 public:
12923 GraphTransformModeRAAI(bool &IsGraphTransformMode)
12924 : SavedIsGraphTransformMode(IsGraphTransformMode) {
12925 IsGraphTransformMode = true;
12926 }
12927 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
12928 } TransformContext(IsGraphTransformMode);
12929 // Operands are profitable if they are:
12930 // 1. At least one constant
12931 // or
12932 // 2. Splats
12933 // or
12934 // 3. Results in good vectorization opportunity, i.e. may generate vector
12935 // nodes and reduce cost of the graph.
12936 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
12937 const InstructionsState &S) {
12938 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
12939 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
12940 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
12941 I2->getOperand(Op));
12942 return all_of(
12943 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
12944 return all_of(Cand,
12945 [](const std::pair<Value *, Value *> &P) {
12946 return isa<Constant>(P.first) ||
12947 isa<Constant>(P.second) || P.first == P.second;
12948 }) ||
12949 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
12950 });
12951 };
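// Illustrative example: operand pairs such as (%x, 4) and (%y, 8) are treated
// as profitable because one side of each pair is a constant; pairs of
// unrelated non-constant values must instead promise a real vectorization
// opportunity.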
12952
12953 // Try to reorder gather nodes for better vectorization opportunities.
12954 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
12955 TreeEntry &E = *VectorizableTree[Idx];
12956 if (E.isGather())
12957 reorderGatherNode(E);
12958 }
12959
12960 // Better to use the full gathered-loads analysis if there are only 2
12961 // gathered load nodes, each having fewer than 16 elements.
12962 constexpr unsigned VFLimit = 16;
12963 bool ForceLoadGather =
12964 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12965 return TE->isGather() && TE->hasState() &&
12966 TE->getOpcode() == Instruction::Load &&
12967 TE->getVectorFactor() < VFLimit;
12968 }) == 2;
12969
12970 // Checks if the scalars are used in other node.
12971 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
12972 function_ref<bool(Value *)> CheckContainer) {
12973 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
12974 if (isa<PoisonValue>(V))
12975 return true;
12976 auto *I = dyn_cast<Instruction>(V);
12977 if (!I)
12978 return false;
12979 return is_contained(TE->Scalars, I) || CheckContainer(I);
12980 });
12981 };
12982 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
12983 if (E.hasState()) {
12984 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
12985 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12986 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12987 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
12988 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12989 return is_contained(TEs, TE);
12990 });
12991 });
12992 }))
12993 return true;
12994 ;
12995 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
12996 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
12997 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
12998 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12999 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13000 return is_contained(TEs, TE);
13001 });
13002 });
13003 }))
13004 return true;
13005 } else {
13006 // Check if the gather node is a full copy of a split node.
13007 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
13008 if (It != E.Scalars.end()) {
13009 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
13010 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
13011 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13012 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13013 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13014 return is_contained(TEs, TE);
13015 });
13016 });
13017 }))
13018 return true;
13019 }
13020 }
13021 return false;
13022 };
13023 // The tree may grow here, so iterate over nodes, built before.
13024 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
13025 TreeEntry &E = *VectorizableTree[Idx];
13026 if (E.isGather()) {
13027 ArrayRef<Value *> VL = E.Scalars;
13028 const unsigned Sz = getVectorElementSize(VL.front());
13029 unsigned MinVF = getMinVF(2 * Sz);
13030 // Do not try partial vectorization for small nodes (<= 2), nodes with the
13031 // same opcode and same parent block or all constants.
13032 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
13033 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13034 // We use allSameOpcode instead of isAltShuffle because we don't
13035 // want to use interchangeable instruction here.
13036 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13037 allConstant(VL) || isSplat(VL))
13038 continue;
13039 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13040 continue;
13041 // Check if the node is a copy of other vector nodes.
13042 if (CheckForSameVectorNodes(E))
13043 continue;
13044 // Try to find vectorizable sequences and transform them into a series of
13045 // insertvector instructions.
13046 unsigned StartIdx = 0;
13047 unsigned End = VL.size();
13048 for (unsigned VF = getFloorFullVectorNumberOfElements(
13049 *TTI, VL.front()->getType(), VL.size() - 1);
13050 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13051 *TTI, VL.front()->getType(), VF - 1)) {
13052 if (StartIdx + VF > End)
13053 continue;
13054 SmallVector<std::pair<unsigned, unsigned>> Slices;
13055 bool AllStrided = true;
13056 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13057 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
13058 // If any instruction is vectorized already - do not try again.
13059 // Reuse the existing node, if it fully matches the slice.
13060 if (isVectorized(Slice.front()) &&
13061 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
13062 continue;
13063 // Constant already handled effectively - skip.
13064 if (allConstant(Slice))
13065 continue;
13066 // Do not try to vectorize small splats (less than vector register and
13067 // only with the single non-undef element).
13068 bool IsSplat = isSplat(Slice);
13069 bool IsTwoRegisterSplat = true;
13070 if (IsSplat && VF == 2) {
13071 unsigned NumRegs2VF = ::getNumberOfParts(
13072 *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
13073 IsTwoRegisterSplat = NumRegs2VF == 2;
13074 }
13075 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13076 count(Slice, Slice.front()) ==
13077 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
13078 : 1)) {
13079 if (IsSplat)
13080 continue;
13081 InstructionsState S = getSameOpcode(Slice, *TLI);
13082 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
13083 (S.getOpcode() == Instruction::Load &&
13084 areKnownNonVectorizableLoads(Slice)) ||
13085 (S.getOpcode() != Instruction::Load &&
13086 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
13087 continue;
13088 if (VF == 2) {
13089 // Try to vectorize reduced values or if all users are vectorized.
13090 // For expensive instructions extra extracts might be profitable.
13091 if ((!UserIgnoreList || E.Idx != 0) &&
13092 TTI->getInstructionCost(S.getMainOp(), CostKind) <
13093 TTI::TCC_Expensive &&
13094 !all_of(Slice, [&](Value *V) {
13095 if (isa<PoisonValue>(V))
13096 return true;
13097 return areAllUsersVectorized(cast<Instruction>(V),
13098 UserIgnoreList);
13099 }))
13100 continue;
13101 if (S.getOpcode() == Instruction::Load) {
13102 OrdersType Order;
13103 SmallVector<Value *> PointerOps;
13104 StridedPtrInfo SPtrInfo;
13105 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
13106 PointerOps, SPtrInfo);
13107 AllStrided &= Res == LoadsState::StridedVectorize ||
13108 Res == LoadsState::ScatterVectorize ||
13109 Res == LoadsState::Gather;
13110 // Do not vectorize gathers.
13111 if (Res == LoadsState::ScatterVectorize ||
13112 Res == LoadsState::Gather) {
13113 if (Res == LoadsState::Gather) {
13114 registerNonVectorizableLoads(Slice);
13115 // If reductions and the scalars from the root node are
13116 // analyzed - mark as non-vectorizable reduction.
13117 if (UserIgnoreList && E.Idx == 0)
13118 analyzedReductionVals(Slice);
13119 }
13120 continue;
13121 }
13122 } else if (S.getOpcode() == Instruction::ExtractElement ||
13123 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
13124 TTI::TCC_Expensive &&
13125 !CheckOperandsProfitability(
13126 S.getMainOp(),
13127 cast<Instruction>(*find_if(reverse(Slice),
13128 IsaPred<Instruction>)),
13129 S))) {
13130 // Do not vectorize extractelements (handled effectively
13131 // already). Do not vectorize non-profitable instructions (with
13132 // low cost and non-vectorizable operands).
13133 continue;
13134 }
13135 }
13136 }
13137 Slices.emplace_back(Cnt, Slice.size());
13138 }
13139 // Do not try to vectorize if all slices are strided or gathered with
13140 // vector factor 2 and there are more than 2 slices. Better to handle
13141 // them in gathered loads analysis, may result in better vectorization.
13142 if (VF == 2 && AllStrided && Slices.size() > 2)
13143 continue;
13144 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13145 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13146 if (StartIdx == Cnt)
13147 StartIdx = Cnt + Sz;
13148 if (End == Cnt + Sz)
13149 End = Cnt;
13150 };
13151 for (auto [Cnt, Sz] : Slices) {
13152 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
13153 const TreeEntry *SameTE = nullptr;
13154 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
13155 It != Slice.end()) {
13156 // If any instruction is vectorized already - do not try again.
13157 SameTE = getSameValuesTreeEntry(*It, Slice);
13158 }
13159 unsigned PrevSize = VectorizableTree.size();
13160 [[maybe_unused]] unsigned PrevEntriesSize =
13161 LoadEntriesToVectorize.size();
13162 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
13163 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13164 VectorizableTree[PrevSize]->isGather() &&
13165 VectorizableTree[PrevSize]->hasState() &&
13166 VectorizableTree[PrevSize]->getOpcode() !=
13167 Instruction::ExtractElement &&
13168 !isSplat(Slice)) {
13169 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13170 analyzedReductionVals(Slice);
13171 VectorizableTree.pop_back();
13172 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13173 "LoadEntriesToVectorize expected to remain the same");
13174 continue;
13175 }
13176 AddCombinedNode(PrevSize, Cnt, Sz);
13177 }
13178 }
13179 // Restore ordering, if no extra vectorization happened.
13180 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13181 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13182 reorderScalars(E.Scalars, Mask);
13183 E.ReorderIndices.clear();
13184 }
13185 }
13186 if (!E.hasState())
13187 continue;
13188 switch (E.getOpcode()) {
13189 case Instruction::Load: {
13190 // No need to reorder masked gather loads, just reorder the scalar
13191 // operands.
13192 if (E.State != TreeEntry::Vectorize)
13193 break;
13194 Type *ScalarTy = E.getMainOp()->getType();
13195 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13196 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
13197 // Check if profitable to represent consecutive load + reverse as strided
13198 // load with stride -1.
13199 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13200 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13201 SmallVector<int> Mask;
13202 inversePermutation(E.ReorderIndices, Mask);
13203 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
13204 InstructionCost OriginalVecCost =
13205 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13206 BaseLI->getPointerAddressSpace(), CostKind,
13207 TTI::OperandValueInfo()) +
13208 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13209 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13210 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
13211 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
13212 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13213 // Strided load is more profitable than consecutive load + reverse -
13214 // transform the node to strided load.
13215 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
13216 ->getPointerOperand()
13217 ->getType());
13218 StridedPtrInfo SPtrInfo;
13219 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13220 SPtrInfo.Ty = VecTy;
13221 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13222 E.State = TreeEntry::StridedVectorize;
13223 }
13224 }
13225 break;
13226 }
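// Illustrative example: loads of p[3], p[2], p[1], p[0] that would otherwise
// be emitted as a consecutive vector load plus an SK_Reverse shuffle can
// instead become a single strided load when the strided form is cheaper on
// the target.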
13227 case Instruction::Store: {
13228 Type *ScalarTy =
13229 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
13230 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
13231 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
13232 // Check if profitable to represent consecutive store + reverse as a
13233 // strided store with stride -1.
13234 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
13235 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13236 SmallVector<int> Mask;
13237 inversePermutation(E.ReorderIndices, Mask);
13238 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
13239 InstructionCost OriginalVecCost =
13240 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13241 BaseSI->getPointerAddressSpace(), CostKind,
13242 TTI::OperandValueInfo()) +
13243 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
13244 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
13245 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
13246 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
13247 if (StridedCost < OriginalVecCost)
13248 // Strided store is more profitable than reverse + consecutive store -
13249 // transform the node to strided store.
13250 E.State = TreeEntry::StridedVectorize;
13251 } else if (!E.ReorderIndices.empty()) {
13252 // Check for interleaved stores.
13253 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13254 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
13255 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13256 if (Mask.size() < 4)
13257 return 0u;
13258 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
13259 if (ShuffleVectorInst::isInterleaveMask(
13260 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13261 TTI.isLegalInterleavedAccessType(
13262 VecTy, Factor, BaseSI->getAlign(),
13263 BaseSI->getPointerAddressSpace()))
13264 return Factor;
13265 }
13266
13267 return 0u;
13268 };
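// Illustrative example: with 8 stores and the reorder mask
// <0, 4, 1, 5, 2, 6, 3, 7>, an interleave factor of 2 is detected, provided
// the target reports such an interleaved access type as legal.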
13269 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13270 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13271 if (InterleaveFactor != 0)
13272 E.setInterleave(InterleaveFactor);
13273 }
13274 break;
13275 }
13276 case Instruction::Select: {
13277 if (E.State != TreeEntry::Vectorize)
13278 break;
13279 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
13280 if (MinMaxID == Intrinsic::not_intrinsic)
13281 break;
13282 // This node is a minmax node.
13283 E.CombinedOp = TreeEntry::MinMax;
13284 TreeEntry *CondEntry = getOperandEntry(&E, 0);
13285 if (SelectOnly && CondEntry->UserTreeIndex &&
13286 CondEntry->State == TreeEntry::Vectorize) {
13287 // The condition node is part of the combined minmax node.
13288 CondEntry->State = TreeEntry::CombinedVectorize;
13289 }
13290 break;
13291 }
13292 case Instruction::FSub:
13293 case Instruction::FAdd: {
13294 // Check if possible to convert (a*b)+c to fma.
13295 if (E.State != TreeEntry::Vectorize ||
13296 !E.getOperations().isAddSubLikeOp())
13297 break;
13298 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
13299 .isValid())
13300 break;
13301 // This node is a fmuladd node.
13302 E.CombinedOp = TreeEntry::FMulAdd;
13303 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
13304 if (FMulEntry->UserTreeIndex &&
13305 FMulEntry->State == TreeEntry::Vectorize) {
13306 // The FMul node is part of the combined fmuladd node.
13307 FMulEntry->State = TreeEntry::CombinedVectorize;
13308 }
13309 break;
13310 }
13311 default:
13312 break;
13313 }
13314 }
13315
13316 if (LoadEntriesToVectorize.empty()) {
13317 // Single load node - exit.
13318 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13319 VectorizableTree.front()->getOpcode() == Instruction::Load)
13320 return;
13321 // Small graph with small VF - exit.
13322 constexpr unsigned SmallTree = 3;
13323 constexpr unsigned SmallVF = 2;
13324 if ((VectorizableTree.size() <= SmallTree &&
13325 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13326 (VectorizableTree.size() <= 2 && UserIgnoreList))
13327 return;
13328
13329 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13330 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13331 getCanonicalGraphSize() <= SmallTree &&
13332 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
13333 [](const std::unique_ptr<TreeEntry> &TE) {
13334 return TE->isGather() && TE->hasState() &&
13335 TE->getOpcode() == Instruction::Load &&
13336 !allSameBlock(TE->Scalars);
13337 }) == 1)
13338 return;
13339 }
13340
13341 // A list of loads to be gathered during the vectorization process. We can
13342 // try to vectorize them at the end, if profitable.
13343 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13344 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
13345 GatheredLoads;
13346
13347 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13348 TreeEntry &E = *TE;
13349 if (E.isGather() &&
13350 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13351 (!E.hasState() && any_of(E.Scalars,
13352 [&](Value *V) {
13353 return isa<LoadInst>(V) &&
13354 !isVectorized(V) &&
13355 !isDeleted(cast<Instruction>(V));
13356 }))) &&
13357 !isSplat(E.Scalars)) {
13358 for (Value *V : E.Scalars) {
13359 auto *LI = dyn_cast<LoadInst>(V);
13360 if (!LI)
13361 continue;
13362 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
13363 continue;
13364 gatherPossiblyVectorizableLoads(
13365 *this, V, *DL, *SE, *TTI,
13366 GatheredLoads[std::make_tuple(
13367 LI->getParent(),
13368 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
13369 LI->getType())]);
13370 }
13371 }
13372 }
13373 // Try to vectorize gathered loads if this is not just a gather of loads.
13374 if (!GatheredLoads.empty())
13375 tryToVectorizeGatheredLoads(GatheredLoads);
13376}
13377
13378/// Merges shuffle masks and emits final shuffle instruction, if required. It
13379/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
13380/// when the actual shuffle instruction is generated only if this is actually
13381/// required. Otherwise, the shuffle instruction emission is delayed till the
13382/// end of the process, to reduce the number of emitted instructions and further
13383/// analysis/transformations.
13384class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13385 bool IsFinalized = false;
13386 SmallVector<int> CommonMask;
13387 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13388 const TargetTransformInfo &TTI;
13389 InstructionCost Cost = 0;
13390 SmallDenseSet<Value *> VectorizedVals;
13391 BoUpSLP &R;
13392 SmallPtrSetImpl<Value *> &CheckedExtracts;
13393 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13394 /// While set, still trying to estimate the cost for the same nodes and we
13395 /// can delay actual cost estimation (virtual shuffle instruction emission).
13396 /// May help better estimate the cost if same nodes must be permuted + allows
13397 /// to move most of the long shuffles cost estimation to TTI.
13398 bool SameNodesEstimated = true;
13399
13400 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
13401 if (Ty->getScalarType()->isPointerTy()) {
13402 Constant *Res = ConstantExpr::getIntToPtr(
13403 Constant::getAllOnesValue(
13404 IntegerType::get(Ty->getContext(),
13405 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13406 Ty->getScalarType());
13407 if (auto *VTy = dyn_cast<VectorType>(Ty))
13408 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
13409 return Res;
13410 }
13411 return Constant::getAllOnesValue(Ty);
13412 }
13413
13414 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
13415 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
13416 return TTI::TCC_Free;
13417 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13418 InstructionCost GatherCost = 0;
13419 SmallVector<Value *> Gathers(VL);
13420 if (!Root && isSplat(VL)) {
13421 // Found the broadcasting of the single scalar, calculate the cost as
13422 // the broadcast.
13423 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
13424 assert(It != VL.end() && "Expected at least one non-undef value.");
13425 // Add broadcast for non-identity shuffle only.
13426 bool NeedShuffle =
13427 count(VL, *It) > 1 &&
13428 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
13429 if (!NeedShuffle) {
13430 if (isa<FixedVectorType>(ScalarTy)) {
13431 assert(SLPReVec && "FixedVectorType is not expected.");
13432 return TTI.getShuffleCost(
13433 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
13434 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
13435 cast<FixedVectorType>(ScalarTy));
13436 }
13437 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13438 CostKind, std::distance(VL.begin(), It),
13439 PoisonValue::get(VecTy), *It);
13440 }
13441
13442 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13443 transform(VL, ShuffleMask.begin(), [](Value *V) {
13444 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13445 });
13446 InstructionCost InsertCost =
13447 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13448 PoisonValue::get(VecTy), *It);
13449 return InsertCost + ::getShuffleCost(TTI,
13450 TTI::SK_Broadcast,
13451 VecTy, ShuffleMask, CostKind,
13452 /*Index=*/0, /*SubTp=*/nullptr,
13453 /*Args=*/*It);
13454 }
13455 return GatherCost +
13456 (all_of(Gathers, IsaPred<UndefValue>)
13457 ? TTI::TCC_Free
13458 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
13459 ScalarTy));
13460 };
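// Illustrative example: a gather of {%x, %x, %x, %x} is costed as a single
// insertelement plus a broadcast shuffle, whereas {%x, undef, undef, undef}
// only pays for the single insertelement.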
13461
13462 /// Compute the cost of creating a vector containing the extracted values from
13463 /// \p VL.
13464 InstructionCost
13465 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
13466 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13467 unsigned NumParts) {
13468 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
13469 unsigned NumElts =
13470 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
13471 auto *EE = dyn_cast<ExtractElementInst>(V);
13472 if (!EE)
13473 return Sz;
13474 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13475 if (!VecTy)
13476 return Sz;
13477 return std::max(Sz, VecTy->getNumElements());
13478 });
13479 // FIXME: this must be moved to TTI for better estimation.
13480 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
13481 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
13482 SmallVectorImpl<unsigned> &Indices,
13483 SmallVectorImpl<unsigned> &SubVecSizes)
13484 -> std::optional<TTI::ShuffleKind> {
13485 if (NumElts <= EltsPerVector)
13486 return std::nullopt;
13487 int OffsetReg0 =
13488 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13489 [](int S, int I) {
13490 if (I == PoisonMaskElem)
13491 return S;
13492 return std::min(S, I);
13493 }),
13494 EltsPerVector);
13495 int OffsetReg1 = OffsetReg0;
13496 DenseSet<int> RegIndices;
13497 // Check if we are trying to permute the same single or 2 input vectors.
13498 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
13499 int FirstRegId = -1;
13500 Indices.assign(1, OffsetReg0);
13501 for (auto [Pos, I] : enumerate(Mask)) {
13502 if (I == PoisonMaskElem)
13503 continue;
13504 int Idx = I - OffsetReg0;
13505 int RegId =
13506 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13507 if (FirstRegId < 0)
13508 FirstRegId = RegId;
13509 RegIndices.insert(RegId);
13510 if (RegIndices.size() > 2)
13511 return std::nullopt;
13512 if (RegIndices.size() == 2) {
13513 ShuffleKind = TTI::SK_PermuteTwoSrc;
13514 if (Indices.size() == 1) {
13515 OffsetReg1 = alignDown(
13516 std::accumulate(
13517 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13518 [&](int S, int I) {
13519 if (I == PoisonMaskElem)
13520 return S;
13521 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13522 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13523 if (RegId == FirstRegId)
13524 return S;
13525 return std::min(S, I);
13526 }),
13527 EltsPerVector);
13528 unsigned Index = OffsetReg1 % NumElts;
13529 Indices.push_back(Index);
13530 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13531 }
13532 Idx = I - OffsetReg1;
13533 }
13534 I = (Idx % NumElts) % EltsPerVector +
13535 (RegId == FirstRegId ? 0 : EltsPerVector);
13536 }
13537 return ShuffleKind;
13538 };
13539 InstructionCost Cost = 0;
13540
13541 // Process extracts in blocks of EltsPerVector to check if the source vector
13542 // operand can be re-used directly. If not, add the cost of creating a
13543 // shuffle to extract the values into a vector register.
13544 for (unsigned Part : seq<unsigned>(NumParts)) {
13545 if (!ShuffleKinds[Part])
13546 continue;
13547 ArrayRef<int> MaskSlice = Mask.slice(
13548 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
13549 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
13550 copy(MaskSlice, SubMask.begin());
13551 SmallVector<unsigned, 2> Indices;
13552 SmallVector<unsigned, 2> SubVecSizes;
13553 std::optional<TTI::ShuffleKind> RegShuffleKind =
13554 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13555 if (!RegShuffleKind) {
13556 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
13557 !ShuffleVectorInst::isIdentityMask(
13558 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
13559 Cost +=
13560 ::getShuffleCost(TTI, *ShuffleKinds[Part],
13561 getWidenedType(ScalarTy, NumElts), MaskSlice);
13562 continue;
13563 }
13564 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
13565 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
13566 Cost +=
13567 ::getShuffleCost(TTI, *RegShuffleKind,
13568 getWidenedType(ScalarTy, EltsPerVector), SubMask);
13569 }
13570 const unsigned BaseVF = getFullVectorNumberOfElements(
13571 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
13572 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
13573 assert((Idx + SubVecSize) <= BaseVF &&
13574 "SK_ExtractSubvector index out of range");
13575 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
13576 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
13577 Idx, getWidenedType(ScalarTy, SubVecSize));
13578 }
13579 // Second check: if a single permute of the whole slice is estimated as
13580 // cheaper than the per-register shuffles plus subvector extracts, use
13581 // that cost instead.
13581 SubMask.assign(NumElts, PoisonMaskElem);
13582 copy(MaskSlice, SubMask.begin());
13583 InstructionCost OriginalCost = ::getShuffleCost(
13584 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
13585 if (OriginalCost < Cost)
13586 Cost = OriginalCost;
13587 }
13588 return Cost;
13589 }
13590 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
13591 /// mask \p Mask, register number \p Part, that includes \p SliceSize
13592 /// elements.
13593 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
13594 ArrayRef<int> Mask, unsigned Part,
13595 unsigned SliceSize) {
13596 if (SameNodesEstimated) {
13597 // Delay the cost estimation if the same nodes are reshuffling.
13598 // If we already requested the cost of reshuffling of E1 and E2 before, no
13599 // need to estimate another cost with the sub-Mask, instead include this
13600 // sub-Mask into the CommonMask to estimate it later and avoid double cost
13601 // estimation.
13602 if ((InVectors.size() == 2 &&
13603 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
13604 cast<const TreeEntry *>(InVectors.back()) == E2) ||
13605 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
13606 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
13607 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
13608 [](int Idx) { return Idx == PoisonMaskElem; }) &&
13609 "Expected all poisoned elements.");
13610 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
13611 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13612 return;
13613 }
13614 // Found non-matching nodes - need to estimate the cost for the matched
13615 // and transform mask.
13616 Cost += createShuffle(InVectors.front(),
13617 InVectors.size() == 1 ? nullptr : InVectors.back(),
13618 CommonMask);
13619 transformMaskAfterShuffle(CommonMask, CommonMask);
13620 } else if (InVectors.size() == 2) {
13621 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13622 transformMaskAfterShuffle(CommonMask, CommonMask);
13623 }
13624 SameNodesEstimated = false;
13625 if (!E2 && InVectors.size() == 1) {
13626 unsigned VF = E1.getVectorFactor();
13627 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
13628 VF = std::max(VF, getVF(V1));
13629 } else {
13630 const auto *E = cast<const TreeEntry *>(InVectors.front());
13631 VF = std::max(VF, E->getVectorFactor());
13632 }
13633 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13634 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
13635 CommonMask[Idx] = Mask[Idx] + VF;
13636 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13637 transformMaskAfterShuffle(CommonMask, CommonMask);
13638 } else {
13639 auto P = InVectors.front();
13640 Cost += createShuffle(&E1, E2, Mask);
13641 unsigned VF = Mask.size();
13642 if (Value *V1 = dyn_cast<Value *>(P)) {
13643 VF = std::max(VF,
13644 getNumElements(V1->getType()));
13645 } else {
13646 const auto *E = cast<const TreeEntry *>(P);
13647 VF = std::max(VF, E->getVectorFactor());
13648 }
13649 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13650 if (Mask[Idx] != PoisonMaskElem)
13651 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13652 Cost += createShuffle(P, InVectors.front(), CommonMask);
13653 transformMaskAfterShuffle(CommonMask, CommonMask);
13654 }
13655 }
13656
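/// Helper that plugs into BaseShuffleAnalysis::createShuffle for cost
/// estimation: instead of emitting shufflevector instructions it returns
/// their TTI cost, with empty masks, identity masks and leading
/// extract-subvector masks treated as free.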
13657 class ShuffleCostBuilder {
13658 const TargetTransformInfo &TTI;
13659
13660 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
13661 int Index = -1;
13662 return Mask.empty() ||
13663 (VF == Mask.size() &&
13664 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
13665 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13666 Index == 0);
13667 }
13668
13669 public:
13670 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
13671 ~ShuffleCostBuilder() = default;
13672 InstructionCost createShuffleVector(Value *V1, Value *,
13673 ArrayRef<int> Mask) const {
13674 // Empty mask or identity mask are free.
13675 unsigned VF =
13676 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13677 if (isEmptyOrIdentity(Mask, VF))
13678 return TTI::TCC_Free;
13679 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
13680 cast<VectorType>(V1->getType()), Mask);
13681 }
13682 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
13683 // Empty mask or identity mask are free.
13684 unsigned VF =
13685 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
13686 if (isEmptyOrIdentity(Mask, VF))
13687 return TTI::TCC_Free;
13688 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
13689 cast<VectorType>(V1->getType()), Mask);
13690 }
13691 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
13692 InstructionCost createPoison(Type *Ty, unsigned VF) const {
13693 return TTI::TCC_Free;
13694 }
13695 void resizeToMatch(Value *&, Value *&) const {}
13696 };
13697
13698 /// Smart shuffle instruction emission, walks through shuffles trees and
13699 /// tries to find the best matching vector for the actual shuffle
13700 /// instruction.
13701 InstructionCost
13702 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
13703 const PointerUnion<Value *, const TreeEntry *> &P2,
13704 ArrayRef<int> Mask) {
13705 ShuffleCostBuilder Builder(TTI);
13706 SmallVector<int> CommonMask(Mask);
13707 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
13708 unsigned CommonVF = Mask.size();
13709 InstructionCost ExtraCost = 0;
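// The two helpers below account for the extra cast cost when the tree
// entry (or plain vector value) was computed in a narrower integer type
// than ScalarTy (MinBWs): a trunc when ScalarTy is narrower, otherwise a
// sext/zext depending on signedness.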
13710 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
13711 unsigned VF) -> InstructionCost {
13712 if (E.isGather() && allConstant(E.Scalars))
13713 return TTI::TCC_Free;
13714 Type *EScalarTy = E.Scalars.front()->getType();
13715 bool IsSigned = true;
13716 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13717 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
13718 IsSigned = It->second.second;
13719 }
13720 if (EScalarTy != ScalarTy) {
13721 unsigned CastOpcode = Instruction::Trunc;
13722 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13723 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13724 if (DstSz > SrcSz)
13725 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13726 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
13727 getWidenedType(EScalarTy, VF),
13728 TTI::CastContextHint::None, CostKind);
13729 }
13730 return TTI::TCC_Free;
13731 };
13732 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
13733 if (isa<Constant>(V))
13734 return TTI::TCC_Free;
13735 auto *VecTy = cast<VectorType>(V->getType());
13736 Type *EScalarTy = VecTy->getElementType();
13737 if (EScalarTy != ScalarTy) {
13738 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
13739 unsigned CastOpcode = Instruction::Trunc;
13740 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13741 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13742 if (DstSz > SrcSz)
13743 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13744 return TTI.getCastInstrCost(
13745 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
13746 VecTy, TTI::CastContextHint::None, CostKind);
13747 }
13748 return TTI::TCC_Free;
13749 };
13750 if (!V1 && !V2 && !P2.isNull()) {
13751 // Shuffle 2 entry nodes.
13752 const TreeEntry *E = cast<const TreeEntry *>(P1);
13753 unsigned VF = E->getVectorFactor();
13754 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13755 CommonVF = std::max(VF, E2->getVectorFactor());
13756 assert(all_of(Mask,
13757 [=](int Idx) {
13758 return Idx < 2 * static_cast<int>(CommonVF);
13759 }) &&
13760 "All elements in mask must be less than 2 * CommonVF.");
13761 if (E->Scalars.size() == E2->Scalars.size()) {
13762 SmallVector<int> EMask = E->getCommonMask();
13763 SmallVector<int> E2Mask = E2->getCommonMask();
13764 if (!EMask.empty() || !E2Mask.empty()) {
13765 for (int &Idx : CommonMask) {
13766 if (Idx == PoisonMaskElem)
13767 continue;
13768 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
13769 Idx = EMask[Idx];
13770 else if (Idx >= static_cast<int>(CommonVF))
13771 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13772 E->Scalars.size();
13773 }
13774 }
13775 CommonVF = E->Scalars.size();
13776 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13777 GetNodeMinBWAffectedCost(*E2, CommonVF);
13778 } else {
13779 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13780 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13781 }
13782 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13783 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13784 } else if (!V1 && P2.isNull()) {
13785 // Shuffle single entry node.
13786 const TreeEntry *E = cast<const TreeEntry *>(P1);
13787 unsigned VF = E->getVectorFactor();
13788 CommonVF = VF;
13789 assert(
13790 all_of(Mask,
13791 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13792 "All elements in mask must be less than CommonVF.");
13793 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13794 SmallVector<int> EMask = E->getCommonMask();
13795 assert(!EMask.empty() && "Expected non-empty common mask.");
13796 for (int &Idx : CommonMask) {
13797 if (Idx != PoisonMaskElem)
13798 Idx = EMask[Idx];
13799 }
13800 CommonVF = E->Scalars.size();
13801 } else if (unsigned Factor = E->getInterleaveFactor();
13802 Factor > 0 && E->Scalars.size() != Mask.size() &&
13803 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
13804 Factor)) {
13805 // Deinterleaved nodes are free.
13806 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13807 }
13808 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13809 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13810 // Not identity/broadcast? Try to see if the original vector is better.
13811 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13812 CommonVF == CommonMask.size() &&
13813 any_of(enumerate(CommonMask),
13814 [](const auto &&P) {
13815 return P.value() != PoisonMaskElem &&
13816 static_cast<unsigned>(P.value()) != P.index();
13817 }) &&
13818 any_of(CommonMask,
13819 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
13820 SmallVector<int> ReorderMask;
13821 inversePermutation(E->ReorderIndices, ReorderMask);
13822 ::addMask(CommonMask, ReorderMask);
13823 }
13824 } else if (V1 && P2.isNull()) {
13825 // Shuffle single vector.
13826 ExtraCost += GetValueMinBWAffectedCost(V1);
13827 CommonVF = getVF(V1);
13828 assert(
13829 all_of(Mask,
13830 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
13831 "All elements in mask must be less than CommonVF.");
13832 } else if (V1 && !V2) {
13833 // Shuffle vector and tree node.
13834 unsigned VF = getVF(V1);
13835 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
13836 CommonVF = std::max(VF, E2->getVectorFactor());
13837 assert(all_of(Mask,
13838 [=](int Idx) {
13839 return Idx < 2 * static_cast<int>(CommonVF);
13840 }) &&
13841 "All elements in mask must be less than 2 * CommonVF.");
13842 if (E2->Scalars.size() == VF && VF != CommonVF) {
13843 SmallVector<int> E2Mask = E2->getCommonMask();
13844 assert(!E2Mask.empty() && "Expected non-empty common mask.");
13845 for (int &Idx : CommonMask) {
13846 if (Idx == PoisonMaskElem)
13847 continue;
13848 if (Idx >= static_cast<int>(CommonVF))
13849 Idx = E2Mask[Idx - CommonVF] + VF;
13850 }
13851 CommonVF = VF;
13852 }
13853 ExtraCost += GetValueMinBWAffectedCost(V1);
13854 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13855 ExtraCost += GetNodeMinBWAffectedCost(
13856 *E2, std::min(CommonVF, E2->getVectorFactor()));
13857 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13858 } else if (!V1 && V2) {
13859 // Shuffle vector and tree node.
13860 unsigned VF = getVF(V2);
13861 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
13862 CommonVF = std::max(VF, E1->getVectorFactor());
13863 assert(all_of(Mask,
13864 [=](int Idx) {
13865 return Idx < 2 * static_cast<int>(CommonVF);
13866 }) &&
13867 "All elements in mask must be less than 2 * CommonVF.");
13868 if (E1->Scalars.size() == VF && VF != CommonVF) {
13869 SmallVector<int> E1Mask = E1->getCommonMask();
13870 assert(!E1Mask.empty() && "Expected non-empty common mask.");
13871 for (int &Idx : CommonMask) {
13872 if (Idx == PoisonMaskElem)
13873 continue;
13874 if (Idx >= static_cast<int>(CommonVF))
13875 Idx = E1Mask[Idx - CommonVF] + VF;
13876 else
13877 Idx = E1Mask[Idx];
13878 }
13879 CommonVF = VF;
13880 }
13881 ExtraCost += GetNodeMinBWAffectedCost(
13882 *E1, std::min(CommonVF, E1->getVectorFactor()));
13883 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13884 ExtraCost += GetValueMinBWAffectedCost(V2);
13885 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13886 } else {
13887 assert(V1 && V2 && "Expected both vectors.");
13888 unsigned VF = getVF(V1);
13889 CommonVF = std::max(VF, getVF(V2));
13890 assert(all_of(Mask,
13891 [=](int Idx) {
13892 return Idx < 2 * static_cast<int>(CommonVF);
13893 }) &&
13894 "All elements in mask must be less than 2 * CommonVF.");
13895 ExtraCost +=
13896 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
13897 if (V1->getType() != V2->getType()) {
13898 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13899 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13900 } else {
13901 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
13902 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
13903 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
13904 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
13905 }
13906 }
13907 InVectors.front() =
13908 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
13909 if (InVectors.size() == 2)
13910 InVectors.pop_back();
13911 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
13912 V1, V2, CommonMask, Builder, ScalarTy);
13913 }
13914
13915public:
13916 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
13917 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
13918 SmallPtrSetImpl<Value *> &CheckedExtracts)
13919 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
13920 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
13921 CheckedExtracts(CheckedExtracts) {}
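/// Adjusts the cost for a gather node built from extractelement
/// instructions: subtracts the cost of extracts (and extract+ext pairs)
/// that become dead once the tree is vectorized, adds the shuffle cost
/// needed to reassemble the extracted lanes from their source vectors, and
/// returns the vector operand to be used as the gather input (or a
/// placeholder when several distinct source vectors are involved).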
13922 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
13923 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13924 unsigned NumParts, bool &UseVecBaseAsInput) {
13925 UseVecBaseAsInput = false;
13926 if (Mask.empty())
13927 return nullptr;
13928 Value *VecBase = nullptr;
13929 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
13930 if (!E->ReorderIndices.empty()) {
13931 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
13932 E->ReorderIndices.end());
13933 reorderScalars(VL, ReorderMask);
13934 }
13935 // Check if it can be considered reused if same extractelements were
13936 // vectorized already.
13937 bool PrevNodeFound = any_of(
13938 ArrayRef(R.VectorizableTree).take_front(E->Idx),
13939 [&](const std::unique_ptr<TreeEntry> &TE) {
13940 return ((TE->hasState() && !TE->isAltShuffle() &&
13941 TE->getOpcode() == Instruction::ExtractElement) ||
13942 TE->isGather()) &&
13943 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
13944 return VL.size() > Data.index() &&
13945 (Mask[Data.index()] == PoisonMaskElem ||
13946 isa<UndefValue>(VL[Data.index()]) ||
13947 Data.value() == VL[Data.index()]);
13948 });
13949 });
13950 SmallPtrSet<Value *, 4> UniqueBases;
13951 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13952 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
13953 for (unsigned Part : seq<unsigned>(NumParts)) {
13954 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
13955 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
13956 for (auto [I, V] :
13957 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
13958 // Ignore non-extractelement scalars.
13959 if (isa<UndefValue>(V) ||
13960 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
13961 continue;
13962 // If all users of instruction are going to be vectorized and this
13963 // instruction itself is not going to be vectorized, consider this
13964 // instruction as dead and remove its cost from the final cost of the
13965 // vectorized tree.
13966 // Also, avoid adjusting the cost for extractelements with multiple uses
13967 // in different graph entries.
13968 auto *EE = cast<ExtractElementInst>(V);
13969 VecBase = EE->getVectorOperand();
13970 UniqueBases.insert(VecBase);
13971 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
13972 if (!CheckedExtracts.insert(V).second ||
13973 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
13974 any_of(EE->users(),
13975 [&](User *U) {
13976 return isa<GetElementPtrInst>(U) &&
13977 !R.areAllUsersVectorized(cast<Instruction>(U),
13978 &VectorizedVals);
13979 }) ||
13980 (!VEs.empty() && !is_contained(VEs, E)))
13981 continue;
13982 std::optional<unsigned> EEIdx = getExtractIndex(EE);
13983 if (!EEIdx)
13984 continue;
13985 unsigned Idx = *EEIdx;
13986 // Take credit for instruction that will become dead.
13987 if (EE->hasOneUse() || !PrevNodeFound) {
13988 Instruction *Ext = EE->user_back();
13989 if (isa<SExtInst, ZExtInst>(Ext) &&
13990 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
13991 // Use getExtractWithExtendCost() to calculate the cost of
13992 // extractelement/ext pair.
13993 Cost -= TTI.getExtractWithExtendCost(
13994 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
13995 Idx, CostKind);
13996 // Add back the cost of s|zext which is subtracted separately.
13997 Cost += TTI.getCastInstrCost(
13998 Ext->getOpcode(), Ext->getType(), EE->getType(),
13999 TTI::CastContextHint::None, CostKind);
14000 continue;
14001 }
14002 }
14003 APInt &DemandedElts =
14004 VectorOpsToExtracts
14005 .try_emplace(VecBase,
14006 APInt::getZero(getNumElements(VecBase->getType())))
14007 .first->getSecond();
14008 DemandedElts.setBit(Idx);
14009 }
14010 }
14011 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
14012 Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
14013 DemandedElts, /*Insert=*/false,
14014 /*Extract=*/true, CostKind);
14015 // Check that the gather of extractelements can be represented as just a
14016 // shuffle of the single vector or of the two vectors the scalars are
14017 // extracted from, i.e. that the bunch of extractelement instructions to
14018 // be gathered can be represented as a permutation of the elements of one
14019 // or two input vectors.
14020 // Skipped if the same extractelements were already vectorized earlier.
14021 if (!PrevNodeFound)
14022 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
14023 InVectors.assign(1, E);
14024 CommonMask.assign(Mask.begin(), Mask.end());
14025 transformMaskAfterShuffle(CommonMask, CommonMask);
14026 SameNodesEstimated = false;
14027 if (NumParts != 1 && UniqueBases.size() != 1) {
14028 UseVecBaseAsInput = true;
14029 VecBase =
14030 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
14031 }
14032 return VecBase;
14033 }
14034 /// Checks if the specified entry \p E needs to be delayed because of its
14035 /// dependency nodes.
14036 std::optional<InstructionCost>
14037 needToDelay(const TreeEntry *,
14038 ArrayRef<SmallVector<const TreeEntry *>>) const {
14039 // No need to delay the cost estimation during analysis.
14040 return std::nullopt;
14041 }
14042 /// Reset the builder to handle perfect diamond match.
14043 void resetForSameNode() {
14044 IsFinalized = false;
14045 CommonMask.clear();
14046 InVectors.clear();
14047 Cost = 0;
14048 VectorizedVals.clear();
14049 SameNodesEstimated = true;
14050 }
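/// Adds 2 input tree entries and the mask for their shuffling.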
14051 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14052 if (&E1 == &E2) {
14053 assert(all_of(Mask,
14054 [&](int Idx) {
14055 return Idx < static_cast<int>(E1.getVectorFactor());
14056 }) &&
14057 "Expected single vector shuffle mask.");
14058 add(E1, Mask);
14059 return;
14060 }
14061 if (InVectors.empty()) {
14062 CommonMask.assign(Mask.begin(), Mask.end());
14063 InVectors.assign({&E1, &E2});
14064 return;
14065 }
14066 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14067 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14068 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14069 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14070 const auto *It =
14071 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
14072 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14073 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
14074 }
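/// Adds a single input tree entry and the mask for its shuffling.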
14075 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14076 if (InVectors.empty()) {
14077 CommonMask.assign(Mask.begin(), Mask.end());
14078 InVectors.assign(1, &E1);
14079 return;
14080 }
14081 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14082 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
14083 unsigned NumParts = ::getNumberOfParts(TTI, MaskVecTy, Mask.size());
14084 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
14085 const auto *It =
14086 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
14087 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14088 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
14089 if (!SameNodesEstimated && InVectors.size() == 1)
14090 InVectors.emplace_back(&E1);
14091 }
14092 /// Adds 2 input vectors and the mask for their shuffling.
14093 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14094 // May come only for shuffling of 2 vectors with extractelements, already
14095 // handled in adjustExtracts.
14096 assert(InVectors.size() == 1 &&
14097 all_of(enumerate(CommonMask),
14098 [&](auto P) {
14099 if (P.value() == PoisonMaskElem)
14100 return Mask[P.index()] == PoisonMaskElem;
14101 auto *EI = cast<ExtractElementInst>(
14102 cast<const TreeEntry *>(InVectors.front())
14103 ->getOrdered(P.index()));
14104 return EI->getVectorOperand() == V1 ||
14105 EI->getVectorOperand() == V2;
14106 }) &&
14107 "Expected extractelement vectors.");
14108 }
14109 /// Adds another one input vector and the mask for the shuffling.
14110 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
14111 if (InVectors.empty()) {
14112 assert(CommonMask.empty() && !ForExtracts &&
14113 "Expected empty input mask/vectors.");
14114 CommonMask.assign(Mask.begin(), Mask.end());
14115 InVectors.assign(1, V1);
14116 return;
14117 }
14118 if (ForExtracts) {
14119 // No need to add vectors here, already handled them in adjustExtracts.
14120 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
14121 !CommonMask.empty() &&
14122 all_of(enumerate(CommonMask),
14123 [&](auto P) {
14124 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
14125 ->getOrdered(P.index());
14126 if (P.value() == PoisonMaskElem)
14127 return P.value() == Mask[P.index()] ||
14128 isa<UndefValue>(Scalar);
14129 if (isa<Constant>(V1))
14130 return true;
14131 auto *EI = cast<ExtractElementInst>(Scalar);
14132 return EI->getVectorOperand() == V1;
14133 }) &&
14134 "Expected only tree entry for extractelement vectors.");
14135 return;
14136 }
14137 assert(!InVectors.empty() && !CommonMask.empty() &&
14138 "Expected only tree entries from extracts/reused buildvectors.");
14139 unsigned VF = getVF(V1);
14140 if (InVectors.size() == 2) {
14141 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14142 transformMaskAfterShuffle(CommonMask, CommonMask);
14143 VF = std::max<unsigned>(VF, CommonMask.size());
14144 } else if (const auto *InTE =
14145 InVectors.front().dyn_cast<const TreeEntry *>()) {
14146 VF = std::max(VF, InTE->getVectorFactor());
14147 } else {
14148 VF = std::max(
14149 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
14150 ->getNumElements());
14151 }
14152 InVectors.push_back(V1);
14153 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14154 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14155 CommonMask[Idx] = Mask[Idx] + VF;
14156 }
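/// Accounts for the cost of gathering \p VL into a vector and returns a
/// constant stand-in value of the right shape (poison/undef/zero lanes, or
/// an all-ones splat when \p Root is given) that the caller can feed back
/// into the shuffle analysis.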
14157 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14158 Value *Root = nullptr) {
14159 Cost += getBuildVectorCost(VL, Root);
14160 if (!Root) {
14161 // FIXME: Need to find a way to avoid use of getNullValue here.
14162 SmallVector<Constant *> Vals;
14163 unsigned VF = VL.size();
14164 if (MaskVF != 0)
14165 VF = std::min(VF, MaskVF);
14166 Type *VLScalarTy = VL.front()->getType();
14167 for (Value *V : VL.take_front(VF)) {
14168 Type *ScalarTy = VLScalarTy->getScalarType();
14169 if (isa<PoisonValue>(V)) {
14170 Vals.push_back(PoisonValue::get(ScalarTy));
14171 continue;
14172 }
14173 if (isa<UndefValue>(V)) {
14174 Vals.push_back(UndefValue::get(ScalarTy));
14175 continue;
14176 }
14177 Vals.push_back(Constant::getNullValue(ScalarTy));
14178 }
14179 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
14180 assert(SLPReVec && "FixedVectorType is not expected.");
14181 // When REVEC is enabled, we need to expand vector types into scalar
14182 // types.
14183 Vals = replicateMask(Vals, VecTy->getNumElements());
14184 }
14185 return ConstantVector::get(Vals);
14186 return ConstantVector::get(Vals);
14187 }
14188 return ConstantVector::getSplat(ElementCount::getFixed(
14189 cast<FixedVectorType>(Root->getType())->getNumElements()),
14190 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14191 }
14192
14193 /// Finalize emission of the shuffles.
14194 InstructionCost finalize(
14195 ArrayRef<int> ExtMask,
14196 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14197 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14198 function_ref<void(Value *&, SmallVectorImpl<int> &,
14199 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
14200 Action = {}) {
14201 IsFinalized = true;
14202 if (Action) {
14203 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14204 if (InVectors.size() == 2)
14205 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14206 else
14207 Cost += createShuffle(Vec, nullptr, CommonMask);
14208 transformMaskAfterShuffle(CommonMask, CommonMask);
14209 assert(VF > 0 &&
14210 "Expected vector length for the final value before action.");
14211 Value *V = cast<Value *>(Vec);
14212 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14213 Cost += createShuffle(V1, V2, Mask);
14214 return V1;
14215 });
14216 InVectors.front() = V;
14217 }
14218 if (!SubVectors.empty()) {
14219 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14220 if (InVectors.size() == 2)
14221 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
14222 else
14223 Cost += createShuffle(Vec, nullptr, CommonMask);
14224 transformMaskAfterShuffle(CommonMask, CommonMask);
14225 // Add subvectors permutation cost.
14226 if (!SubVectorsMask.empty()) {
14227 assert(SubVectorsMask.size() <= CommonMask.size() &&
14228 "Expected same size of masks for subvectors and common mask.");
14229 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14230 copy(SubVectorsMask, SVMask.begin());
14231 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14232 if (I2 != PoisonMaskElem) {
14233 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14234 I1 = I2 + CommonMask.size();
14235 }
14236 }
14237 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
14238 getWidenedType(ScalarTy, CommonMask.size()),
14239 SVMask, CostKind);
14240 }
14241 for (auto [E, Idx] : SubVectors) {
14242 Type *EScalarTy = E->Scalars.front()->getType();
14243 bool IsSigned = true;
14244 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
14245 EScalarTy =
14246 IntegerType::get(EScalarTy->getContext(), It->second.first);
14247 IsSigned = It->second.second;
14248 }
14249 if (ScalarTy != EScalarTy) {
14250 unsigned CastOpcode = Instruction::Trunc;
14251 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14252 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14253 if (DstSz > SrcSz)
14254 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14255 Cost += TTI.getCastInstrCost(
14256 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
14257 getWidenedType(EScalarTy, E->getVectorFactor()),
14258 TTI::CastContextHint::None, CostKind);
14259 }
14260 Cost += ::getShuffleCost(
14261 TTI, TTI::SK_InsertSubvector,
14262 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
14263 getWidenedType(ScalarTy, E->getVectorFactor()));
14264 if (!CommonMask.empty()) {
14265 std::iota(std::next(CommonMask.begin(), Idx),
14266 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
14267 Idx);
14268 }
14269 }
14270 }
14271
14272 if (!ExtMask.empty()) {
14273 if (CommonMask.empty()) {
14274 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14275 } else {
14276 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14277 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14278 if (ExtMask[I] == PoisonMaskElem)
14279 continue;
14280 NewMask[I] = CommonMask[ExtMask[I]];
14281 }
14282 CommonMask.swap(NewMask);
14283 }
14284 }
14285 if (CommonMask.empty()) {
14286 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14287 return Cost;
14288 }
14289 return Cost +
14290 createShuffle(InVectors.front(),
14291 InVectors.size() == 2 ? InVectors.back() : nullptr,
14292 CommonMask);
14293 }
14294
14295 ~ShuffleCostEstimator() {
14296 assert((IsFinalized || CommonMask.empty()) &&
14297 "Shuffle construction must be finalized.");
14298 }
14299};
14300
14301const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14302 unsigned Idx) const {
14303 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
14304 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14305 return Op;
14306}
14307
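/// Returns the cast context hint to use for casts fed by the tree entry
/// \p TE, based on how its loads are emitted: gather/scatter for scattered
/// and strided loads, masked for compressed loads, reversed or normal for
/// plain vectorized loads depending on the reorder mask, and None otherwise.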
14308TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14309 if (TE.State == TreeEntry::ScatterVectorize ||
14310 TE.State == TreeEntry::StridedVectorize)
14311 return TTI::CastContextHint::GatherScatter;
14312 if (TE.State == TreeEntry::CompressVectorize)
14313 return TTI::CastContextHint::Masked;
14314 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14315 !TE.isAltShuffle()) {
14316 if (TE.ReorderIndices.empty())
14317 return TTI::CastContextHint::Normal;
14318 SmallVector<int> Mask;
14319 inversePermutation(TE.ReorderIndices, Mask);
14320 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
14321 return TTI::CastContextHint::Reversed;
14322 }
14323 return TTI::CastContextHint::None;
14324}
14325
14326 InstructionCost
14327BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14328 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14329 ArrayRef<Value *> VL = E->Scalars;
14330
14331 Type *ScalarTy = getValueType(VL[0]);
14332 if (!isValidElementType(ScalarTy))
14333 return InstructionCost::getInvalid();
14334
14335
14336 // If we have computed a smaller type for the expression, update VecTy so
14337 // that the costs will be accurate.
14338 auto It = MinBWs.find(E);
14339 Type *OrigScalarTy = ScalarTy;
14340 if (It != MinBWs.end()) {
14341 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14342 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14343 if (VecTy)
14344 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14345 }
14346 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14347 unsigned EntryVF = E->getVectorFactor();
14348 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
14349
14350 if (E->isGather()) {
14351 if (allConstant(VL))
14352 return 0;
14353 if (isa<InsertElementInst>(VL[0]))
14354 return InstructionCost::getInvalid();
14355 if (isa<CmpInst>(VL.front()))
14356 ScalarTy = VL.front()->getType();
14357 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14358 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
14359 }
14360 if (E->State == TreeEntry::SplitVectorize) {
14361 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14362 "Expected exactly 2 combined entries.");
14363 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14364 InstructionCost VectorCost = 0;
14365 if (E->ReorderIndices.empty()) {
14366 VectorCost = ::getShuffleCost(
14367 *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
14368 E->CombinedEntriesWithIndices.back().second,
14369 getWidenedType(
14370 ScalarTy,
14371 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14372 ->getVectorFactor()));
14373 } else {
14374 unsigned CommonVF =
14375 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14376 ->getVectorFactor(),
14377 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14378 ->getVectorFactor());
14379 VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
14380 getWidenedType(ScalarTy, CommonVF),
14381 E->getSplitMask(), CostKind);
14382 }
14383 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14384 return VectorCost;
14385 }
14386 InstructionCost CommonCost = 0;
14387 SmallVector<int> Mask;
14388 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
14389 (E->State != TreeEntry::StridedVectorize ||
14390 !isReverseOrder(E->ReorderIndices))) {
14391 SmallVector<int> NewMask;
14392 if (E->getOpcode() == Instruction::Store) {
14393 // For stores the order is actually a mask.
14394 NewMask.resize(E->ReorderIndices.size());
14395 copy(E->ReorderIndices, NewMask.begin());
14396 } else {
14397 inversePermutation(E->ReorderIndices, NewMask);
14398 }
14399 ::addMask(Mask, NewMask);
14400 }
14401 if (!E->ReuseShuffleIndices.empty())
14402 ::addMask(Mask, E->ReuseShuffleIndices);
14403 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14404 CommonCost =
14405 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
14406 assert((E->State == TreeEntry::Vectorize ||
14407 E->State == TreeEntry::ScatterVectorize ||
14408 E->State == TreeEntry::StridedVectorize ||
14409 E->State == TreeEntry::CompressVectorize) &&
14410 "Unhandled state");
14411 assert(E->getOpcode() &&
14412 ((allSameType(VL) && allSameBlock(VL)) ||
14413 (E->getOpcode() == Instruction::GetElementPtr &&
14414 E->getMainOp()->getType()->isPointerTy()) ||
14415 E->hasCopyableElements()) &&
14416 "Invalid VL");
14417 Instruction *VL0 = E->getMainOp();
14418 unsigned ShuffleOrOp =
14419 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14420 if (E->CombinedOp != TreeEntry::NotCombinedOp)
14421 ShuffleOrOp = E->CombinedOp;
14422 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
14423 const unsigned Sz = UniqueValues.size();
14424 SmallBitVector UsedScalars(Sz, false);
14425 for (unsigned I = 0; I < Sz; ++I) {
14426 if (isa<Instruction>(UniqueValues[I]) &&
14427 !E->isCopyableElement(UniqueValues[I]) &&
14428 getTreeEntries(UniqueValues[I]).front() == E)
14429 continue;
14430 UsedScalars.set(I);
14431 }
14432 auto GetCastContextHint = [&](Value *V) {
14433 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
14434 return getCastContextHint(*OpTEs.front());
14435 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
14436 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14437 !SrcState.isAltShuffle())
14438 return TTI::CastContextHint::GatherScatter;
14439 return TTI::CastContextHint::None;
14440 };
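// GetCostDiff returns (vector cost) - (scalar cost) for this node:
// ScalarEltCost is summed over the scalars that are not already covered by
// other tree entries, VectorCost receives the common reorder/reuse shuffle
// cost computed above, and an extra cast is charged when the node's
// minimized type differs from what its user expects.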
14441 auto GetCostDiff =
14442 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
14443 function_ref<InstructionCost(InstructionCost)> VectorCost) {
14444 // Calculate the cost of this instruction.
14445 InstructionCost ScalarCost = 0;
14446 if (isa<CastInst, CallInst>(VL0)) {
14447 // For some of the instructions no need to calculate cost for each
14448 // particular instruction, we can use the cost of the single
14449 // instruction x total number of scalar instructions.
14450 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14451 } else {
14452 for (unsigned I = 0; I < Sz; ++I) {
14453 if (UsedScalars.test(I))
14454 continue;
14455 ScalarCost += ScalarEltCost(I);
14456 }
14457 }
14458
14459 InstructionCost VecCost = VectorCost(CommonCost);
14460 // Check if the current node must be resized, if the parent node is not
14461 // resized.
14462 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
14463 E->Idx != 0 &&
14464 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
14465 const EdgeInfo &EI = E->UserTreeIndex;
14466 if (!EI.UserTE->hasState() ||
14467 EI.UserTE->getOpcode() != Instruction::Select ||
14468 EI.EdgeIdx != 0) {
14469 auto UserBWIt = MinBWs.find(EI.UserTE);
14470 Type *UserScalarTy =
14471 (EI.UserTE->isGather() ||
14472 EI.UserTE->State == TreeEntry::SplitVectorize)
14473 ? EI.UserTE->Scalars.front()->getType()
14474 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14475 if (UserBWIt != MinBWs.end())
14476 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
14477 UserBWIt->second.first);
14478 if (ScalarTy != UserScalarTy) {
14479 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14480 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14481 unsigned VecOpcode;
14482 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
14483 if (BWSz > SrcBWSz)
14484 VecOpcode = Instruction::Trunc;
14485 else
14486 VecOpcode =
14487 It->second.second ? Instruction::SExt : Instruction::ZExt;
14488 TTI::CastContextHint CCH = GetCastContextHint(VL0);
14489 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14490 CostKind);
14491 }
14492 }
14493 }
14494 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
14495 ScalarCost, "Calculated costs for Tree"));
14496 return VecCost - ScalarCost;
14497 };
14498 // Calculate cost difference from vectorizing set of GEPs.
14499 // Negative value means vectorizing is profitable.
14500 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
14501 assert((E->State == TreeEntry::Vectorize ||
14502 E->State == TreeEntry::StridedVectorize ||
14503 E->State == TreeEntry::CompressVectorize) &&
14504 "Entry state expected to be Vectorize, StridedVectorize or "
14505 "MaskedLoadCompressVectorize here.");
14506 InstructionCost ScalarCost = 0;
14507 InstructionCost VecCost = 0;
14508 std::tie(ScalarCost, VecCost) = getGEPCosts(
14509 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
14510 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
14511 "Calculated GEPs cost for Tree"));
14512
14513 return VecCost - ScalarCost;
14514 };
14515
14516 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
14517 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
14518 if (MinMaxID == Intrinsic::not_intrinsic)
14519 return InstructionCost::getInvalid();
14520 Type *CanonicalType = Ty;
14521 if (CanonicalType->isPtrOrPtrVectorTy())
14522 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
14523 CanonicalType->getContext(),
14524 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
14525
14526 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14527 {CanonicalType, CanonicalType});
14528 InstructionCost IntrinsicCost =
14529 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14530 // If the selects are the only uses of the compares, they will be
14531 // dead and we can adjust the cost by removing their cost.
14532 if (VI && SelectOnly) {
14533 assert((!Ty->isVectorTy() || SLPReVec) &&
14534 "Expected only for scalar type.");
14535 auto *CI = cast<CmpInst>(VI->getOperand(0));
14536 IntrinsicCost -= TTI->getCmpSelInstrCost(
14537 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14538 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14539 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14540 }
14541 return IntrinsicCost;
14542 };
14543 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
14544 Instruction *VI) {
14545 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
14546 return Cost;
14547 };
14548 switch (ShuffleOrOp) {
14549 case Instruction::PHI: {
14550 // Count reused scalars.
14551 InstructionCost ScalarCost = 0;
14552 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14553 for (Value *V : UniqueValues) {
14554 auto *PHI = dyn_cast<PHINode>(V);
14555 if (!PHI)
14556 continue;
14557
14558 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
14559 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
14560 Value *Op = PHI->getIncomingValue(I);
14561 Operands[I] = Op;
14562 }
14563 if (const TreeEntry *OpTE =
14564 getSameValuesTreeEntry(Operands.front(), Operands))
14565 if (CountedOps.insert(OpTE).second &&
14566 !OpTE->ReuseShuffleIndices.empty())
14567 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14568 OpTE->Scalars.size());
14569 }
14570
14571 return CommonCost - ScalarCost;
14572 }
14573 case Instruction::ExtractValue:
14574 case Instruction::ExtractElement: {
14575 APInt DemandedElts;
14576 VectorType *SrcVecTy = nullptr;
14577 auto GetScalarCost = [&](unsigned Idx) {
14578 if (isa<PoisonValue>(UniqueValues[Idx]))
14579 return TTI::TCC_Free;
14580
14581 auto *I = cast<Instruction>(UniqueValues[Idx]);
14582 if (!SrcVecTy) {
14583 if (ShuffleOrOp == Instruction::ExtractElement) {
14584 auto *EE = cast<ExtractElementInst>(I);
14585 SrcVecTy = EE->getVectorOperandType();
14586 } else {
14587 auto *EV = cast<ExtractValueInst>(I);
14588 Type *AggregateTy = EV->getAggregateOperand()->getType();
14589 unsigned NumElts;
14590 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
14591 NumElts = ATy->getNumElements();
14592 else
14593 NumElts = AggregateTy->getStructNumElements();
14594 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
14595 }
14596 }
14597 if (I->hasOneUse()) {
14598 Instruction *Ext = I->user_back();
14599 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
14600 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
14601 // Use getExtractWithExtendCost() to calculate the cost of
14602 // extractelement/ext pair.
14603 InstructionCost Cost = TTI->getExtractWithExtendCost(
14604 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
14605 CostKind);
14606 // Subtract the cost of s|zext which is subtracted separately.
14607 Cost -= TTI->getCastInstrCost(
14608 Ext->getOpcode(), Ext->getType(), I->getType(),
14609 TTI::CastContextHint::None, CostKind);
14610 return Cost;
14611 }
14612 }
14613 if (DemandedElts.isZero())
14614 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
14615 DemandedElts.setBit(*getExtractIndex(I));
14616 return 0;
14617 };
14618 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14619 return CommonCost - (DemandedElts.isZero()
14620 ? TTI::TCC_Free
14621 : TTI.getScalarizationOverhead(
14622 SrcVecTy, DemandedElts, /*Insert=*/false,
14623 /*Extract=*/true, CostKind));
14624 };
14625 return GetCostDiff(GetScalarCost, GetVectorCost);
14626 }
14627 case Instruction::InsertElement: {
14628 assert(E->ReuseShuffleIndices.empty() &&
14629 "Unique insertelements only are expected.");
14630 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
14631 unsigned const NumElts = SrcVecTy->getNumElements();
14632 unsigned const NumScalars = VL.size();
14633
14634 unsigned NumOfParts = ::getNumberOfParts(*TTI, SrcVecTy);
14635
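// Compute the range [OffsetBeg, OffsetEnd] of destination lanes written by
// the insertelements and record, per destination lane, which scalar lands
// there (InsertMask).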
14636 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14637 unsigned OffsetBeg = *getElementIndex(VL.front());
14638 unsigned OffsetEnd = OffsetBeg;
14639 InsertMask[OffsetBeg] = 0;
14640 for (auto [I, V] : enumerate(VL.drop_front())) {
14641 unsigned Idx = *getElementIndex(V);
14642 if (OffsetBeg > Idx)
14643 OffsetBeg = Idx;
14644 else if (OffsetEnd < Idx)
14645 OffsetEnd = Idx;
14646 InsertMask[Idx] = I + 1;
14647 }
14648 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
14649 if (NumOfParts > 0 && NumOfParts < NumElts)
14650 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14651 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14652 VecScalarsSz;
14653 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14654 unsigned InsertVecSz = std::min<unsigned>(
14655 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
14656 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14657 bool IsWholeSubvector =
14658 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14659 // Check if we can safely insert a subvector. If it is not possible, just
14660 // generate a whole-sized vector and shuffle the source vector and the new
14661 // subvector.
14662 if (OffsetBeg + InsertVecSz > VecSz) {
14663 // Align OffsetBeg to generate correct mask.
14664 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
14665 InsertVecSz = VecSz;
14666 }
14667
14668 APInt DemandedElts = APInt::getZero(NumElts);
14669 // TODO: Add support for Instruction::InsertValue.
14670 SmallVector<int> Mask;
14671 if (!E->ReorderIndices.empty()) {
14672 inversePermutation(E->ReorderIndices, Mask);
14673 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
14674 } else {
14675 Mask.assign(VecSz, PoisonMaskElem);
14676 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
14677 }
14678 bool IsIdentity = true;
14679 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
14680 Mask.swap(PrevMask);
14681 for (unsigned I = 0; I < NumScalars; ++I) {
14682 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
14683 DemandedElts.setBit(InsertIdx);
14684 IsIdentity &= InsertIdx - OffsetBeg == I;
14685 Mask[InsertIdx - OffsetBeg] = I;
14686 }
14687 assert(Offset < NumElts && "Failed to find vector index offset");
14688 assert(Offset < NumElts && "Failed to find vector index offset");
14689 InstructionCost Cost = 0;
14690 Cost -=
14691 getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
14692 /*Insert*/ true, /*Extract*/ false, CostKind);
14693
14694 // First cost - resize to actual vector size if not identity shuffle or
14695 // need to shift the vector.
14696 // Do not calculate the cost if the actual size is the register size and
14697 // we can merge this shuffle with the following SK_Select.
14698 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
14699 if (!IsIdentity)
14700 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
14701 InsertVecTy, Mask);
14702 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14703 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14704 }));
14705 // Second cost - permutation with subvector, if some elements are from the
14706 // initial vector or inserting a subvector.
14707 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
14708 // subvector of ActualVecTy.
14709 SmallBitVector InMask =
14710 isUndefVector(FirstInsert->getOperand(0),
14711 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14712 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
14713 if (InsertVecSz != VecSz) {
14714 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
14715 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
14716 CostKind, OffsetBeg - Offset, InsertVecTy);
14717 } else {
14718 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
14719 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
14720 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
14721 I <= End; ++I)
14722 if (Mask[I] != PoisonMaskElem)
14723 Mask[I] = I + VecSz;
14724 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
14725 Mask[I] =
14726 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
14727 Cost +=
14728 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
14729 }
14730 }
14731 return Cost;
14732 }
14733 case Instruction::ZExt:
14734 case Instruction::SExt:
14735 case Instruction::FPToUI:
14736 case Instruction::FPToSI:
14737 case Instruction::FPExt:
14738 case Instruction::PtrToInt:
14739 case Instruction::IntToPtr:
14740 case Instruction::SIToFP:
14741 case Instruction::UIToFP:
14742 case Instruction::Trunc:
14743 case Instruction::FPTrunc:
14744 case Instruction::BitCast: {
14745 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14746 Type *SrcScalarTy = VL0->getOperand(0)->getType();
14747 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
14748 unsigned Opcode = ShuffleOrOp;
14749 unsigned VecOpcode = Opcode;
14750 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14751 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14752 // Check if the values are candidates to demote.
14753 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
14754 if (SrcIt != MinBWs.end()) {
14755 SrcBWSz = SrcIt->second.first;
14756 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
14757 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
14758 SrcVecTy =
14759 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
14760 }
14761 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14762 if (BWSz == SrcBWSz) {
14763 VecOpcode = Instruction::BitCast;
14764 } else if (BWSz < SrcBWSz) {
14765 VecOpcode = Instruction::Trunc;
14766 } else if (It != MinBWs.end()) {
14767 assert(BWSz > SrcBWSz && "Invalid cast!");
14768 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14769 } else if (SrcIt != MinBWs.end()) {
14770 assert(BWSz > SrcBWSz && "Invalid cast!");
14771 VecOpcode =
14772 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14773 }
14774 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14775 !SrcIt->second.second) {
14776 VecOpcode = Instruction::UIToFP;
14777 }
14778 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
14779 assert(Idx == 0 && "Expected 0 index only");
14780 return TTI->getCastInstrCost(Opcode, VL0->getType(),
14781 VL0->getOperand(0)->getType(),
14782 TTI::CastContextHint::None, CostKind);
14783 };
14784 auto GetVectorCost = [=](InstructionCost CommonCost) {
14785 // Do not count cost here if minimum bitwidth is in effect and it is just
14786 // a bitcast (here it is just a noop).
14787 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14788 return CommonCost;
14789 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
14790 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
14791
14792 bool IsArithmeticExtendedReduction =
14793 E->Idx == 0 && UserIgnoreList &&
14794 all_of(*UserIgnoreList, [](Value *V) {
14795 auto *I = cast<Instruction>(V);
14796 return is_contained({Instruction::Add, Instruction::FAdd,
14797 Instruction::Mul, Instruction::FMul,
14798 Instruction::And, Instruction::Or,
14799 Instruction::Xor},
14800 I->getOpcode());
14801 });
14802 if (IsArithmeticExtendedReduction &&
14803 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14804 return CommonCost;
14805 return CommonCost +
14806 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
14807 VecOpcode == Opcode ? VI : nullptr);
14808 };
14809 return GetCostDiff(GetScalarCost, GetVectorCost);
14810 }
14811 case Instruction::FCmp:
14812 case Instruction::ICmp:
14813 case Instruction::Select: {
14814 CmpPredicate VecPred, SwappedVecPred;
14815 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
14816 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
14817 match(VL0, MatchCmp))
14818 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
14819 else
14820 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
14821 ? CmpInst::BAD_FCMP_PREDICATE
14822 : CmpInst::BAD_ICMP_PREDICATE;
14823 auto GetScalarCost = [&](unsigned Idx) {
14824 if (isa<PoisonValue>(UniqueValues[Idx]))
14825 return TTI::TCC_Free;
14826
14827 auto *VI = cast<Instruction>(UniqueValues[Idx]);
14828 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
14829 ? CmpInst::BAD_FCMP_PREDICATE
14830 : CmpInst::BAD_ICMP_PREDICATE;
14831 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
14832 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
14833 !match(VI, MatchCmp)) ||
14834 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
14835 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
14836 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
14837 ? CmpInst::BAD_FCMP_PREDICATE
14838 : CmpInst::BAD_ICMP_PREDICATE;
14839
14840 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
14841 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14842 CostKind, getOperandInfo(VI->getOperand(0)),
14843 getOperandInfo(VI->getOperand(1)), VI);
14844 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
14845 if (IntrinsicCost.isValid())
14846 ScalarCost = IntrinsicCost;
14847
14848 return ScalarCost;
14849 };
14850 auto GetVectorCost = [&](InstructionCost CommonCost) {
14851 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
14852
14853 InstructionCost VecCost =
14854 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
14855 CostKind, getOperandInfo(E->getOperand(0)),
14856 getOperandInfo(E->getOperand(1)), VL0);
14857 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
14858 auto *CondType =
14859 getWidenedType(SI->getCondition()->getType(), VL.size());
14860 unsigned CondNumElements = CondType->getNumElements();
14861 unsigned VecTyNumElements = getNumElements(VecTy);
14862 assert(VecTyNumElements >= CondNumElements &&
14863 VecTyNumElements % CondNumElements == 0 &&
14864 "Cannot vectorize Instruction::Select");
14865 if (CondNumElements != VecTyNumElements) {
14866 // When the return type is i1 but the source is fixed vector type, we
14867 // need to duplicate the condition value.
14868 VecCost += ::getShuffleCost(
14869 *TTI, TTI::SK_PermuteSingleSrc, CondType,
14870 createReplicatedMask(VecTyNumElements / CondNumElements,
14871 CondNumElements));
14872 }
14873 }
14874 return VecCost + CommonCost;
14875 };
14876 return GetCostDiff(GetScalarCost, GetVectorCost);
14877 }
14878 case TreeEntry::MinMax: {
14879 auto GetScalarCost = [&](unsigned Idx) {
14880 return GetMinMaxCost(OrigScalarTy);
14881 };
14882 auto GetVectorCost = [&](InstructionCost CommonCost) {
14883 InstructionCost VecCost = GetMinMaxCost(VecTy);
14884 return VecCost + CommonCost;
14885 };
14886 return GetCostDiff(GetScalarCost, GetVectorCost);
14887 }
14888 case TreeEntry::FMulAdd: {
14889 auto GetScalarCost = [&](unsigned Idx) {
14890 if (isa<PoisonValue>(UniqueValues[Idx]))
14891 return TTI::TCC_Free;
14892 return GetFMulAddCost(E->getOperations(),
14893 cast<Instruction>(UniqueValues[Idx]));
14894 };
14895 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14896 FastMathFlags FMF;
14897 FMF.set();
14898 for (Value *V : E->Scalars) {
14899 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
14900 FMF &= FPCI->getFastMathFlags();
14901 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
14902 FMF &= FPCIOp->getFastMathFlags();
14903 }
14904 }
14905 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14906 {VecTy, VecTy, VecTy}, FMF);
14907 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
14908 return VecCost + CommonCost;
14909 };
14910 return GetCostDiff(GetScalarCost, GetVectorCost);
14911 }
14912 case Instruction::FNeg:
14913 case Instruction::Add:
14914 case Instruction::FAdd:
14915 case Instruction::Sub:
14916 case Instruction::FSub:
14917 case Instruction::Mul:
14918 case Instruction::FMul:
14919 case Instruction::UDiv:
14920 case Instruction::SDiv:
14921 case Instruction::FDiv:
14922 case Instruction::URem:
14923 case Instruction::SRem:
14924 case Instruction::FRem:
14925 case Instruction::Shl:
14926 case Instruction::LShr:
14927 case Instruction::AShr:
14928 case Instruction::And:
14929 case Instruction::Or:
14930 case Instruction::Xor: {
14931 auto GetScalarCost = [&](unsigned Idx) {
14932 if (isa<PoisonValue>(UniqueValues[Idx]))
14933 return TTI::TCC_Free;
14934
14935 // We cannot retrieve the operand from UniqueValues[Idx] because an
14936 // interchangeable instruction may be used. The order and the actual
14937 // operand might differ from what is retrieved from UniqueValues[Idx].
14938 Value *Op1 = E->getOperand(0)[Idx];
14939 Value *Op2;
14940 SmallVector<const Value *, 2> Operands(1, Op1);
14941 if (isa<UnaryOperator>(UniqueValues[Idx])) {
14942 Op2 = Op1;
14943 } else {
14944 Op2 = E->getOperand(1)[Idx];
14945 Operands.push_back(Op2);
14946 }
14947 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
14948 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
14949 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
14950 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
14951 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
14952 I && (ShuffleOrOp == Instruction::FAdd ||
14953 ShuffleOrOp == Instruction::FSub)) {
14954 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
14955 if (IntrinsicCost.isValid())
14956 ScalarCost = IntrinsicCost;
14957 }
14958 return ScalarCost;
14959 };
14960 auto GetVectorCost = [=](InstructionCost CommonCost) {
14961 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14962 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
14963 ArrayRef<Value *> Ops = E->getOperand(I);
14964 if (all_of(Ops, [&](Value *Op) {
14965 auto *CI = dyn_cast<ConstantInt>(Op);
14966 return CI && CI->getValue().countr_one() >= It->second.first;
14967 }))
14968 return CommonCost;
14969 }
14970 }
14971 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
14972 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
14973 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
14974 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
14975 Op2Info, {}, nullptr, TLI) +
14976 CommonCost;
14977 };
14978 return GetCostDiff(GetScalarCost, GetVectorCost);
14979 }
14980 case Instruction::GetElementPtr: {
14981 return CommonCost + GetGEPCostDiff(VL, VL0);
14982 }
14983 case Instruction::Load: {
14984 auto GetScalarCost = [&](unsigned Idx) {
14985 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
14986 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
14987 VI->getAlign(), VI->getPointerAddressSpace(),
14988 CostKind, TTI::OperandValueInfo(), VI);
14989 };
14990 auto *LI0 = cast<LoadInst>(VL0);
14991 auto GetVectorCost = [&](InstructionCost CommonCost) {
14992 InstructionCost VecLdCost;
14993 switch (E->State) {
14994 case TreeEntry::Vectorize:
14995 if (unsigned Factor = E->getInterleaveFactor()) {
14996 VecLdCost = TTI->getInterleavedMemoryOpCost(
14997 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
14998 LI0->getPointerAddressSpace(), CostKind);
14999
15000 } else {
15001 VecLdCost = TTI->getMemoryOpCost(
15002 Instruction::Load, VecTy, LI0->getAlign(),
15003 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15004 }
15005 break;
15006 case TreeEntry::StridedVectorize: {
15007 Align CommonAlignment =
15008 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15009 VecLdCost = TTI->getStridedMemoryOpCost(
15010 Instruction::Load, VecTy, LI0->getPointerOperand(),
15011 /*VariableMask=*/false, CommonAlignment, CostKind);
15012 break;
15013 }
15014 case TreeEntry::CompressVectorize: {
15015 bool IsMasked;
15016 unsigned InterleaveFactor;
15017 SmallVector<int> CompressMask;
15018 VectorType *LoadVecTy;
15019 SmallVector<Value *> Scalars(VL);
15020 if (!E->ReorderIndices.empty()) {
15021 SmallVector<int> Mask(E->ReorderIndices.begin(),
15022 E->ReorderIndices.end());
15023 reorderScalars(Scalars, Mask);
15024 }
15025 SmallVector<Value *> PointerOps(Scalars.size());
15026 for (auto [I, V] : enumerate(Scalars))
15027 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15028 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
15029 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
15030 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
15031 CompressMask, LoadVecTy);
15032 assert(IsVectorized && "Failed to vectorize load");
15033 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
15034 InterleaveFactor, IsMasked);
15035 Align CommonAlignment = LI0->getAlign();
15036 if (InterleaveFactor) {
15037 VecLdCost = TTI->getInterleavedMemoryOpCost(
15038 Instruction::Load, LoadVecTy, InterleaveFactor, {},
15039 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
15040 } else if (IsMasked) {
15041 VecLdCost = TTI->getMaskedMemoryOpCost(
15042 Instruction::Load, LoadVecTy, CommonAlignment,
15043 LI0->getPointerAddressSpace(), CostKind);
15044 // TODO: include this cost into CommonCost.
15045 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15046 LoadVecTy, CompressMask, CostKind);
15047 } else {
15048 VecLdCost = TTI->getMemoryOpCost(
15049 Instruction::Load, LoadVecTy, CommonAlignment,
15050 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
15051 // TODO: include this cost into CommonCost.
15052 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
15053 LoadVecTy, CompressMask, CostKind);
15054 }
15055 break;
15056 }
15057 case TreeEntry::ScatterVectorize: {
15058 Align CommonAlignment =
15059 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
15060 VecLdCost = TTI->getGatherScatterOpCost(
15061 Instruction::Load, VecTy, LI0->getPointerOperand(),
15062 /*VariableMask=*/false, CommonAlignment, CostKind);
15063 break;
15064 }
15065 case TreeEntry::CombinedVectorize:
15066 case TreeEntry::SplitVectorize:
15067 case TreeEntry::NeedToGather:
15068 llvm_unreachable("Unexpected vectorization state.");
15069 }
15070 return VecLdCost + CommonCost;
15071 };
15072
15073 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
15074 // If this node generates a masked gather load then it is not a terminal node.
15075 // Hence the address operand cost is estimated separately.
15076 if (E->State == TreeEntry::ScatterVectorize)
15077 return Cost;
15078
15079 // Estimate the cost of GEPs since this tree node is a terminal node.
15080 SmallVector<Value *> PointerOps(VL.size());
15081 for (auto [I, V] : enumerate(VL))
15082 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
15083 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
15084 }
15085 case Instruction::Store: {
15086 bool IsReorder = !E->ReorderIndices.empty();
15087 auto GetScalarCost = [=](unsigned Idx) {
15088 auto *VI = cast<StoreInst>(VL[Idx]);
15089 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
15090 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
15091 VI->getAlign(), VI->getPointerAddressSpace(),
15092 CostKind, OpInfo, VI);
15093 };
15094 auto *BaseSI =
15095 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
15096 auto GetVectorCost = [=](InstructionCost CommonCost) {
15097 // We know that we can merge the stores. Calculate the cost.
15098 InstructionCost VecStCost;
15099 if (E->State == TreeEntry::StridedVectorize) {
15100 Align CommonAlignment =
15101 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
15102 VecStCost = TTI->getStridedMemoryOpCost(
15103 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
15104 /*VariableMask=*/false, CommonAlignment, CostKind);
15105 } else {
15106 assert(E->State == TreeEntry::Vectorize &&
15107 "Expected either strided or consecutive stores.");
15108 if (unsigned Factor = E->getInterleaveFactor()) {
15109 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
15110 "No reused shuffles expected");
15111 CommonCost = 0;
15112 VecStCost = TTI->getInterleavedMemoryOpCost(
15113 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15114 BaseSI->getPointerAddressSpace(), CostKind);
15115 } else {
15116 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
15117 VecStCost = TTI->getMemoryOpCost(
15118 Instruction::Store, VecTy, BaseSI->getAlign(),
15119 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
15120 }
15121 }
15122 return VecStCost + CommonCost;
15123 };
15124 SmallVector<Value *> PointerOps(VL.size());
15125 for (auto [I, V] : enumerate(VL)) {
15126 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
15127 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
15128 }
15129
15130 return GetCostDiff(GetScalarCost, GetVectorCost) +
15131 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15132 }
15133 case Instruction::Call: {
15134 auto GetScalarCost = [&](unsigned Idx) {
15135 auto *CI = cast<CallInst>(UniqueValues[Idx]);
15136 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15137 if (ID != Intrinsic::not_intrinsic) {
15138 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15139 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
15140 }
15141 return TTI->getCallInstrCost(CI->getCalledFunction(),
15142 CI->getFunctionType()->getReturnType(),
15143 CI->getFunctionType()->params(), CostKind);
15144 };
15145 auto GetVectorCost = [=](InstructionCost CommonCost) {
15146 auto *CI = cast<CallInst>(VL0);
15147 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15148 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15149 CI, ID, VecTy->getNumElements(),
15150 It != MinBWs.end() ? It->second.first : 0, TTI);
15151 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15152 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15153 };
15154 return GetCostDiff(GetScalarCost, GetVectorCost);
15155 }
15156 case Instruction::ShuffleVector: {
15157 if (!SLPReVec || E->isAltShuffle())
15158 assert(E->isAltShuffle() &&
15159 ((Instruction::isBinaryOp(E->getOpcode()) &&
15160 Instruction::isBinaryOp(E->getAltOpcode())) ||
15161 (Instruction::isCast(E->getOpcode()) &&
15162 Instruction::isCast(E->getAltOpcode())) ||
15163 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15164 "Invalid Shuffle Vector Operand");
15165 // Try to find the previous shuffle node with the same operands and same
15166 // main/alternate ops.
15167 auto TryFindNodeWithEqualOperands = [=]() {
15168 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15169 if (TE.get() == E)
15170 break;
15171 if (TE->hasState() && TE->isAltShuffle() &&
15172 ((TE->getOpcode() == E->getOpcode() &&
15173 TE->getAltOpcode() == E->getAltOpcode()) ||
15174 (TE->getOpcode() == E->getAltOpcode() &&
15175 TE->getAltOpcode() == E->getOpcode())) &&
15176 TE->hasEqualOperands(*E))
15177 return true;
15178 }
15179 return false;
15180 };
15181 auto GetScalarCost = [&](unsigned Idx) {
15182 if (isa<PoisonValue>(UniqueValues[Idx]))
15183 return InstructionCost(TTI::TCC_Free);
15184
15185 auto *VI = cast<Instruction>(UniqueValues[Idx]);
15186 assert(E->getMatchingMainOpOrAltOp(VI) &&
15187 "Unexpected main/alternate opcode");
15188 (void)E;
15189 return TTI->getInstructionCost(VI, CostKind);
15190 };
15191 // Need to clear CommonCost since the final shuffle cost is included into
15192 // vector cost.
15193 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15194 // VecCost is equal to sum of the cost of creating 2 vectors
15195 // and the cost of creating shuffle.
15196 InstructionCost VecCost = 0;
15197 if (TryFindNodeWithEqualOperands()) {
15198 LLVM_DEBUG({
15199 dbgs() << "SLP: diamond match for alternate node found.\n";
15200 E->dump();
15201 });
15202 // No need to add new vector costs here since we're going to reuse
15203 // same main/alternate vector ops, just do different shuffling.
15204 } else if (Instruction::isBinaryOp(E->getOpcode())) {
15205 VecCost =
15206 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
15207 VecCost +=
15208 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
15209 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
15210 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
15211 VecCost = TTIRef.getCmpSelInstrCost(
15212 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
15213 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15214 VL0);
15215 VecCost += TTIRef.getCmpSelInstrCost(
15216 E->getOpcode(), VecTy, MaskTy,
15217 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
15218 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15219 E->getAltOp());
15220 } else {
15221 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
15222 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
15223 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15224 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15225 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15226 unsigned SrcBWSz =
15227 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
15228 if (SrcIt != MinBWs.end()) {
15229 SrcBWSz = SrcIt->second.first;
15230 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
15231 SrcTy = getWidenedType(SrcSclTy, VL.size());
15232 }
15233 if (BWSz <= SrcBWSz) {
15234 if (BWSz < SrcBWSz)
15235 VecCost =
15236 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15237 TTI::CastContextHint::None, CostKind);
15238 LLVM_DEBUG({
15239 dbgs()
15240 << "SLP: alternate extension, which should be truncated.\n";
15241 E->dump();
15242 });
15243 return VecCost;
15244 }
15245 }
15246 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
15247 TTI::CastContextHint::None, CostKind);
15248 VecCost +=
15249 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
15250 TTI::CastContextHint::None, CostKind);
15251 }
15252 SmallVector<int> Mask;
15253 E->buildAltOpShuffleMask(
15254 [&](Instruction *I) {
15255 assert(E->getMatchingMainOpOrAltOp(I) &&
15256 "Unexpected main/alternate opcode");
15257 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15258 *TLI);
15259 },
15260 Mask);
15261 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
15262 FinalVecTy, Mask, CostKind);
15263 // Patterns like [fadd,fsub] can be combined into a single instruction
15264 // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
15265 // need to take their order into account when looking for the most used
15266 // order.
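// For example, a target with an addsub-style vector instruction may report a
// cheaper alternate-instruction cost below than the sum of the costs of the
// two separate vector opcodes.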
15267 unsigned Opcode0 = E->getOpcode();
15268 unsigned Opcode1 = E->getAltOpcode();
15269 SmallBitVector OpcodeMask(
15270 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
15271 // If this pattern is supported by the target then we consider the
15272 // order.
15273 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15274 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15275 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15276 return AltVecCost < VecCost ? AltVecCost : VecCost;
15277 }
15278 // TODO: Check the reverse order too.
15279 return VecCost;
15280 };
15281 if (SLPReVec && !E->isAltShuffle())
15282 return GetCostDiff(
15283 GetScalarCost, [&](InstructionCost) -> InstructionCost {
15284 // If a group uses mask in order, the shufflevector can be
15285 // eliminated by instcombine. Then the cost is 0.
15286 assert(isa<ShuffleVectorInst>(VL.front()) &&
15287 "Not supported shufflevector usage.");
15288 auto *SV = cast<ShuffleVectorInst>(VL.front());
15289 unsigned SVNumElements =
15290 cast<FixedVectorType>(SV->getOperand(0)->getType())
15291 ->getNumElements();
15292 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15293 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15294 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
15295 int NextIndex = 0;
15296 if (!all_of(Group, [&](Value *V) {
15297 assert(isa<ShuffleVectorInst>(V) &&
15298 "Not supported shufflevector usage.");
15299 auto *SV = cast<ShuffleVectorInst>(V);
15300 int Index;
15301 [[maybe_unused]] bool IsExtractSubvectorMask =
15302 SV->isExtractSubvectorMask(Index);
15303 assert(IsExtractSubvectorMask &&
15304 "Not supported shufflevector usage.");
15305 if (NextIndex != Index)
15306 return false;
15307 NextIndex += SV->getShuffleMask().size();
15308 return true;
15309 }))
15310 return ::getShuffleCost(
15311 *TTI, TTI::SK_PermuteSingleSrc, VecTy,
15312 calculateShufflevectorMask(E->Scalars));
15313 }
15314 return TTI::TCC_Free;
15315 });
15316 return GetCostDiff(GetScalarCost, GetVectorCost);
15317 }
15318 case Instruction::Freeze:
15319 return CommonCost;
15320 default:
15321 llvm_unreachable("Unknown instruction");
15322 }
15323}
15324
15325bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15326 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
15327 << VectorizableTree.size() << " is fully vectorizable.\n");
15328
15329 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15330 SmallVector<int> Mask;
15331 return TE->isGather() &&
15332 !any_of(TE->Scalars,
15333 [this](Value *V) { return EphValues.contains(V); }) &&
15334 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
15335 TE->Scalars.size() < Limit ||
15336 (((TE->hasState() &&
15337 TE->getOpcode() == Instruction::ExtractElement) ||
15338 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
15339 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
15340 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
15341 !TE->isAltShuffle()) ||
15342 any_of(TE->Scalars, IsaPred<LoadInst>));
15343 };
15344
15345 // We only handle trees of heights 1 and 2.
15346 if (VectorizableTree.size() == 1 &&
15347 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15348 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15349 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15350 (ForReduction &&
15351 AreVectorizableGathers(VectorizableTree[0].get(),
15352 VectorizableTree[0]->Scalars.size()) &&
15353 VectorizableTree[0]->getVectorFactor() > 2)))
15354 return true;
15355
15356 if (VectorizableTree.size() != 2)
15357 return false;
15358
15359 // Handle splat and all-constant stores. Also try to vectorize tiny trees
15360 // with the second gather node if it has fewer scalar operands than the
15361 // initial tree element (it may be profitable to shuffle the second gather)
15362 // or if its scalars are extractelements, which form a shuffle.
15363 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15364 AreVectorizableGathers(VectorizableTree[1].get(),
15365 VectorizableTree[0]->Scalars.size()))
15366 return true;
15367
15368 // Gathering cost would be too much for tiny trees.
15369 if (VectorizableTree[0]->isGather() ||
15370 (VectorizableTree[1]->isGather() &&
15371 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15372 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15373 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15374 return false;
15375
15376 return true;
15377}
15378
15379 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
15380 TargetTransformInfo *TTI,
15381 bool MustMatchOrInst) {
15382 // Look past the root to find a source value. Arbitrarily follow the
15383 // path through operand 0 of any 'or'. Also, peek through optional
15384 // shift-left-by-multiple-of-8-bits.
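// For example, a root such as
//   or (shl (zext i8 %hi to i32), 8), (zext i8 %lo to i32)
// where %hi and %lo are loads (names here are illustrative) reassembles a
// wider integer and is treated as a load-combine candidate.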
15385 Value *ZextLoad = Root;
15386 const APInt *ShAmtC;
15387 bool FoundOr = false;
15388 while (!isa<ConstantExpr>(ZextLoad) &&
15389 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
15390 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
15391 ShAmtC->urem(8) == 0))) {
15392 auto *BinOp = cast<BinaryOperator>(ZextLoad);
15393 ZextLoad = BinOp->getOperand(0);
15394 if (BinOp->getOpcode() == Instruction::Or)
15395 FoundOr = true;
15396 }
15397 // Check if the input is an extended load of the required or/shift expression.
15398 Value *Load;
15399 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15400 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
15401 return false;
15402
15403 // Require that the total load bit width is a legal integer type.
15404 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
15405 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
15406 Type *SrcTy = Load->getType();
15407 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15408 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
15409 return false;
15410
15411 // Everything matched - assume that we can fold the whole sequence using
15412 // load combining.
15413 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
15414 << *(cast<Instruction>(Root)) << "\n");
15415
15416 return true;
15417}
15418
15419 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
15420 if (RdxKind != RecurKind::Or)
15421 return false;
15422
15423 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15424 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15425 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
15426 /* MatchOr */ false);
15427}
15428
15429 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
15430 // Peek through a final sequence of stores and check if all operations are
15431 // likely to be load-combined.
15432 unsigned NumElts = Stores.size();
15433 for (Value *Scalar : Stores) {
15434 Value *X;
15435 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
15436 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
15437 return false;
15438 }
15439 return true;
15440}
15441
15442bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
15443 if (!DebugCounter::shouldExecute(VectorizedGraphs))
15444 return true;
15445
15446 // Graph is empty - do nothing.
15447 if (VectorizableTree.empty()) {
15448 assert(ExternalUses.empty() && "We shouldn't have any external users");
15449
15450 return true;
15451 }
15452
15453 // No need to vectorize inserts of gathered values.
15454 if (VectorizableTree.size() == 2 &&
15455 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
15456 VectorizableTree[1]->isGather() &&
15457 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15458 !(isSplat(VectorizableTree[1]->Scalars) ||
15459 allConstant(VectorizableTree[1]->Scalars))))
15460 return true;
15461
15462 // If the graph includes only PHI nodes and gathers, it is definitely not
15463 // profitable for the vectorization, we can skip it, if the cost threshold is
15464 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
15465 // gathers/buildvectors.
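// For example, a graph whose only vectorized node is a PHI fed entirely by
// gather nodes spends essentially all of its cost on buildvectors, with no
// real vector computation to amortize it.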
15466 constexpr int Limit = 4;
15467 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15468 !VectorizableTree.empty() &&
15469 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15470 return (TE->isGather() &&
15471 (!TE->hasState() ||
15472 TE->getOpcode() != Instruction::ExtractElement) &&
15473 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
15474 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15475 }))
15476 return true;
15477
15478 // Do not vectorize a small tree consisting only of phis, if all vectorized
15479 // phis are also gathered.
15480 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15481 VectorizableTree.size() <= Limit &&
15482 all_of(VectorizableTree,
15483 [&](const std::unique_ptr<TreeEntry> &TE) {
15484 return (TE->isGather() &&
15485 (!TE->hasState() ||
15486 TE->getOpcode() != Instruction::ExtractElement) &&
15487 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
15488 Limit) ||
15489 (TE->hasState() &&
15490 (TE->getOpcode() == Instruction::InsertElement ||
15491 (TE->getOpcode() == Instruction::PHI &&
15492 all_of(TE->Scalars, [&](Value *V) {
15493 return isa<PoisonValue>(V) || MustGather.contains(V);
15494 }))));
15495 }) &&
15496 any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15497 return TE->State == TreeEntry::Vectorize &&
15498 TE->getOpcode() == Instruction::PHI;
15499 }))
15500 return true;
15501
15502 // If the tree contains only phis, buildvectors, split nodes and
15503 // small nodes with reuses, we can skip it.
15504 SmallVector<const TreeEntry *> StoreLoadNodes;
15505 unsigned NumGathers = 0;
15506 constexpr int LimitTreeSize = 36;
15507 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
15508 all_of(VectorizableTree,
15509 [&](const std::unique_ptr<TreeEntry> &TE) {
15510 if (!TE->isGather() && TE->hasState() &&
15511 (TE->getOpcode() == Instruction::Load ||
15512 TE->getOpcode() == Instruction::Store)) {
15513 StoreLoadNodes.push_back(TE.get());
15514 return true;
15515 }
15516 if (TE->isGather())
15517 ++NumGathers;
15518 return TE->State == TreeEntry::SplitVectorize ||
15519 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15520 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15521 VectorizableTree.size() > LimitTreeSize) ||
15522 (TE->isGather() &&
15523 none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
15524 (TE->hasState() &&
15525 (TE->getOpcode() == Instruction::PHI ||
15526 (TE->hasCopyableElements() &&
15527 static_cast<unsigned>(count_if(
15528 TE->Scalars, IsaPred<PHINode, Constant>)) >=
15529 TE->Scalars.size() / 2) ||
15530 ((!TE->ReuseShuffleIndices.empty() ||
15531 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15532 TE->Scalars.size() == 2)));
15533 }) &&
15534 (StoreLoadNodes.empty() ||
15535 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
15536 (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
15537 return TE->getOpcode() == Instruction::Store ||
15538 all_of(TE->Scalars, [&](Value *V) {
15539 return !isa<LoadInst>(V) ||
15540 areAllUsersVectorized(cast<Instruction>(V));
15541 });
15542 })))))
15543 return true;
15544
15545 // If the tree contains only a split root node, 2 non-buildvector nodes used
15546 // by that root, and other buildvectors, we can skip it.
15547 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15548 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15549 VectorizableTree.size() >= Limit &&
15550 count_if(ArrayRef(VectorizableTree).drop_front(),
15551 [&](const std::unique_ptr<TreeEntry> &TE) {
15552 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15553 TE->UserTreeIndex.UserTE->Idx == 0;
15554 }) == 2)
15555 return true;
15556
15557 // If the tree only vectorizes an insertelement (buildvector) root fed by a
15558 // vectorized phi node, with all other nodes gathered - skip it.
15559 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
15560 VectorizableTree.size() > 2 &&
15561 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15562 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15563 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15564 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15565 all_of(
15566 ArrayRef(VectorizableTree).drop_front(2),
15567 [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
15568 return true;
15569
15570 // We can vectorize the tree if its size is greater than or equal to the
15571 // minimum size specified by the MinTreeSize command line option.
15572 if (VectorizableTree.size() >= MinTreeSize)
15573 return false;
15574
15575 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
15576 // can vectorize it if we can prove it fully vectorizable.
15577 if (isFullyVectorizableTinyTree(ForReduction))
15578 return false;
15579
15580 // Check if any of the gather nodes forms an insertelement buildvector
15581 // somewhere.
15582 bool IsAllowedSingleBVNode =
15583 VectorizableTree.size() > 1 ||
15584 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15585 !VectorizableTree.front()->isAltShuffle() &&
15586 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15587 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15588 allSameBlock(VectorizableTree.front()->Scalars));
15589 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
15590 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
15591 return isa<ExtractElementInst, Constant>(V) ||
15592 (IsAllowedSingleBVNode &&
15593 !V->hasNUsesOrMore(UsesLimit) &&
15594 any_of(V->users(), IsaPred<InsertElementInst>));
15595 });
15596 }))
15597 return false;
15598
15599 if (VectorizableTree.back()->isGather() &&
15600 VectorizableTree.back()->hasState() &&
15601 VectorizableTree.back()->isAltShuffle() &&
15602 VectorizableTree.back()->getVectorFactor() > 2 &&
15603 allSameBlock(VectorizableTree.back()->Scalars) &&
15604 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15605 TTI->getScalarizationOverhead(
15606 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15607 VectorizableTree.back()->getVectorFactor()),
15608 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
15609 /*Insert=*/true, /*Extract=*/false,
15611 return false;
15612
15613 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
15614 // vectorizable.
15615 return true;
15616}
15617
15618 bool BoUpSLP::isTreeNotExtendable() const {
15619 if (getCanonicalGraphSize() != getTreeSize()) {
15620 constexpr unsigned SmallTree = 3;
15621 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15622 getCanonicalGraphSize() <= SmallTree &&
15623 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15624 [](const std::unique_ptr<TreeEntry> &TE) {
15625 return TE->isGather() && TE->hasState() &&
15626 TE->getOpcode() == Instruction::Load &&
15627 !allSameBlock(TE->Scalars);
15628 }) == 1)
15629 return true;
15630 return false;
15631 }
15632 bool Res = false;
15633 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
15634 TreeEntry &E = *VectorizableTree[Idx];
15635 if (E.State == TreeEntry::SplitVectorize)
15636 return false;
15637 if (!E.isGather())
15638 continue;
15639 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15640 (!E.hasState() &&
15642 (isa<ExtractElementInst>(E.Scalars.front()) &&
15643 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
15644 return false;
15645 if (isSplat(E.Scalars) || allConstant(E.Scalars))
15646 continue;
15647 Res = true;
15648 }
15649 return Res;
15650}
15651
15652 InstructionCost BoUpSLP::getSpillCost() {
15653 // Walk from the bottom of the tree to the top, tracking which values are
15654 // live. When we see a call instruction that is not part of our tree,
15655 // query TTI to see if there is a cost to keeping values live over it
15656 // (for example, if spills and fills are required).
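// For instance, if the result of a vectorized operand entry must stay live
// across a non-vectorized call before its user entry is reached, the cost of
// keeping the widened vector type live over that call is added below.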
15657
15658 const TreeEntry *Root = VectorizableTree.front().get();
15659 if (Root->isGather())
15660 return 0;
15661
15662 InstructionCost Cost = 0;
15663 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
15664 EntriesToOperands;
15665 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
15666 SmallPtrSet<const Instruction *, 8> LastInstructions;
15667 for (const auto &TEPtr : VectorizableTree) {
15668 if (!TEPtr->isGather()) {
15669 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15670 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
15671 LastInstructions.insert(LastInst);
15672 }
15673 if (TEPtr->UserTreeIndex)
15674 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15675 }
15676
15677 auto NoCallIntrinsic = [this](const Instruction *I) {
15678 const auto *II = dyn_cast<IntrinsicInst>(I);
15679 if (!II)
15680 return false;
15681 if (II->isAssumeLikeIntrinsic())
15682 return true;
15683 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
15684 InstructionCost IntrCost =
15685 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
15686 InstructionCost CallCost = TTI->getCallInstrCost(
15687 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
15688 return IntrCost < CallCost;
15689 };
15690
15691 // Maps the last instruction of an entry to the last instruction of one of
15692 // its operand entries and a flag. If the flag is true, there are no calls
15693 // in between these instructions.
15694 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
15695 CheckedInstructions;
15696 unsigned Budget = 0;
15697 const unsigned BudgetLimit =
15698 ScheduleRegionSizeBudget / VectorizableTree.size();
15699 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
15700 const Instruction *Last) {
15701 assert(First->getParent() == Last->getParent() &&
15702 "Expected instructions in same block.");
15703 if (auto It = CheckedInstructions.find(Last);
15704 It != CheckedInstructions.end()) {
15705 const Instruction *Checked = It->second.getPointer();
15706 if (Checked == First || Checked->comesBefore(First))
15707 return It->second.getInt() != 0;
15708 Last = Checked;
15709 } else if (Last == First || Last->comesBefore(First)) {
15710 return true;
15711 }
15713 ++First->getIterator().getReverse(),
15714 PrevInstIt =
15715 Last->getIterator().getReverse();
15716 SmallVector<const Instruction *> LastInstsInRange;
15717 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15718 // Debug information does not impact spill cost.
15719 // Vectorized calls, represented as vector intrinsics, do not impact spill
15720 // cost.
15721 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
15722 CB && !NoCallIntrinsic(CB) && !isVectorized(CB)) {
15723 for (const Instruction *LastInst : LastInstsInRange)
15724 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
15725 return false;
15726 }
15727 if (LastInstructions.contains(&*PrevInstIt))
15728 LastInstsInRange.push_back(&*PrevInstIt);
15729
15730 ++PrevInstIt;
15731 ++Budget;
15732 }
15733 for (const Instruction *LastInst : LastInstsInRange)
15734 CheckedInstructions.try_emplace(
15735 LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
15736 Budget <= BudgetLimit ? 1 : 0);
15737 return Budget <= BudgetLimit;
15738 };
15739 auto AddCosts = [&](const TreeEntry *Op) {
15740 Type *ScalarTy = Op->Scalars.front()->getType();
15741 auto It = MinBWs.find(Op);
15742 if (It != MinBWs.end())
15743 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
15744 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
15745 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
15746 if (ScalarTy->isVectorTy()) {
15747 // Handle revec dead vector instructions.
15748 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15749 }
15750 };
15751 // Memoize the relationship between blocks, i.e. if there is (at least one)
15752 // non-vectorized call between the blocks. This allows skipping the analysis
15753 // of the same block paths multiple times.
15754 SmallDenseMap<std::pair<BasicBlock *, BasicBlock *>, bool>
15755 ParentOpParentToPreds;
15756 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
15757 BasicBlock *OpParent) {
15758 auto Key = std::make_pair(Root, OpParent);
15759 if (auto It = ParentOpParentToPreds.find(Key);
15760 It != ParentOpParentToPreds.end())
15761 return It->second;
15762 SmallVector<BasicBlock *> Worklist;
15763 if (Pred)
15764 Worklist.push_back(Pred);
15765 else
15766 Worklist.append(pred_begin(Root), pred_end(Root));
15767 SmallPtrSet<BasicBlock *, 16> Visited;
15768 SmallDenseSet<std::pair<BasicBlock *, BasicBlock *>, 8>
15769 ParentsPairsToAdd;
15770 bool Res = false;
15771 auto Cleanup = make_scope_exit([&]() {
15772 for (const auto &KeyPair : ParentsPairsToAdd) {
15773 assert(!ParentOpParentToPreds.contains(KeyPair) &&
15774 "Should not have been added before.");
15775 ParentOpParentToPreds.try_emplace(KeyPair, Res);
15776 }
15777 });
15778 while (!Worklist.empty()) {
15779 BasicBlock *BB = Worklist.pop_back_val();
15780 if (BB == OpParent || !Visited.insert(BB).second)
15781 continue;
15782 auto Pair = std::make_pair(BB, OpParent);
15783 if (auto It = ParentOpParentToPreds.find(Pair);
15784 It != ParentOpParentToPreds.end()) {
15785 Res = It->second;
15786 return Res;
15787 }
15788 ParentsPairsToAdd.insert(Pair);
15789 unsigned BlockSize = BB->size();
15790 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
15791 return Res;
15792 Budget += BlockSize;
15793 if (Budget > BudgetLimit)
15794 return Res;
15795 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
15796 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
15797 BB->getTerminator()))
15798 return Res;
15799 Worklist.append(pred_begin(BB), pred_end(BB));
15800 }
15801 Res = true;
15802 return Res;
15803 };
15804 SmallVector<const TreeEntry *> LiveEntries(1, Root);
15805 while (!LiveEntries.empty()) {
15806 const TreeEntry *Entry = LiveEntries.pop_back_val();
15807 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
15808 if (Operands.empty())
15809 continue;
15810 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
15811 BasicBlock *Parent = LastInst->getParent();
15812 for (const TreeEntry *Op : Operands) {
15813 if (!Op->isGather())
15814 LiveEntries.push_back(Op);
15815 if (Entry->State == TreeEntry::SplitVectorize ||
15816 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
15817 (Op->isGather() && allConstant(Op->Scalars)))
15818 continue;
15819 Budget = 0;
15820 BasicBlock *Pred = nullptr;
15821 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
15822 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15823 BasicBlock *OpParent;
15824 Instruction *OpLastInst;
15825 if (Op->isGather()) {
15826 assert(Entry->getOpcode() == Instruction::PHI &&
15827 "Expected phi node only.");
15828 OpParent = cast<PHINode>(Entry->getMainOp())
15829 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
15830 OpLastInst = OpParent->getTerminator();
15831 for (Value *V : Op->Scalars) {
15832 auto *Inst = dyn_cast<Instruction>(V);
15833 if (!Inst)
15834 continue;
15835 if (isVectorized(V)) {
15836 OpParent = Inst->getParent();
15837 OpLastInst = Inst;
15838 break;
15839 }
15840 }
15841 } else {
15842 OpLastInst = EntriesToLastInstruction.at(Op);
15843 OpParent = OpLastInst->getParent();
15844 }
15845 // Check the call instructions within the same basic block.
15846 if (OpParent == Parent) {
15847 if (Entry->getOpcode() == Instruction::PHI) {
15848 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
15849 AddCosts(Op);
15850 continue;
15851 }
15852 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
15853 AddCosts(Op);
15854 continue;
15855 }
15856 // Check for call instructions in between blocks.
15857 // 1. Check the entry's block from its head down to the entry's last instruction.
15858 if (Entry->getOpcode() != Instruction::PHI &&
15859 !CheckForNonVecCallsInSameBlock(
15860 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
15861 LastInst)) {
15862 AddCosts(Op);
15863 continue;
15864 }
15865 // 2. Check op's block from the end.
15866 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
15867 OpParent->getTerminator())) {
15868 AddCosts(Op);
15869 continue;
15870 }
15871 // 3. Check the predecessors of entry's block till op's block.
15872 if (!CheckPredecessors(Parent, Pred, OpParent)) {
15873 AddCosts(Op);
15874 continue;
15875 }
15876 }
15877 }
15878
15879 return Cost;
15880}
15881
15882 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
15883 /// the buildvector sequence.
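/// For example, for the chain
///   %v0 = insertelement <2 x i32> poison, i32 %a, i32 0
///   %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
/// (illustrative IR) this returns true for (IE1 = %v0, IE2 = %v1) and false
/// for the swapped pair.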
15884 static bool isFirstInsertElement(const InsertElementInst *IE1,
15885 const InsertElementInst *IE2) {
15886 if (IE1 == IE2)
15887 return false;
15888 const auto *I1 = IE1;
15889 const auto *I2 = IE2;
15890 const InsertElementInst *PrevI1;
15891 const InsertElementInst *PrevI2;
15892 unsigned Idx1 = *getElementIndex(IE1);
15893 unsigned Idx2 = *getElementIndex(IE2);
15894 do {
15895 if (I2 == IE1)
15896 return true;
15897 if (I1 == IE2)
15898 return false;
15899 PrevI1 = I1;
15900 PrevI2 = I2;
15901 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
15902 getElementIndex(I1).value_or(Idx2) != Idx2)
15903 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
15904 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
15905 getElementIndex(I2).value_or(Idx1) != Idx1)
15906 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
15907 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
15908 llvm_unreachable("Two different buildvectors not expected.");
15909}
15910
15911namespace {
15912 /// Returns the incoming Value * if the requested type is Value * too, or a
15913 /// default-constructed value otherwise.
15914struct ValueSelect {
15915 template <typename U>
15916 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
15917 return V;
15918 }
15919 template <typename U>
15920 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
15921 return U();
15922 }
15923};
15924} // namespace
15925
15926/// Does the analysis of the provided shuffle masks and performs the requested
15927/// actions on the vectors with the given shuffle masks. It tries to do it in
15928/// several steps.
15929 /// 1. If the Base vector is not an undef vector, resize the very first mask to
15930 /// have a common VF and perform the action for 2 input vectors (including the
15931 /// non-undef Base). Other shuffle masks are combined with the result of the
15932 /// first stage and processed as a shuffle of 2 elements.
15933 /// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
15934 /// the action only for 1 vector with the given mask, if it is not the identity
15935 /// mask.
15936 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
15937 /// vectors, combining the masks properly between the steps.
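/// For example, with an undef Base and two inputs of the same VF = 4, the
/// masks {0,1,poison,poison} and {poison,poison,2,3} are merged into the
/// single two-source mask {0,1,6,7}, where second-input indices are offset
/// by VF.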
15938 template <typename T>
15939 static T *performExtractsShuffleAction(
15940 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
15941 function_ref<unsigned(T *)> GetVF,
15942 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
15943 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
15944 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
15945 SmallVector<int> Mask(ShuffleMask.begin()->second);
15946 auto VMIt = std::next(ShuffleMask.begin());
15947 T *Prev = nullptr;
15948 SmallBitVector UseMask =
15949 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
15950 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
15951 if (!IsBaseUndef.all()) {
15952 // Base is not undef, need to combine it with the next subvectors.
15953 std::pair<T *, bool> Res =
15954 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
15955 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
15956 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
15957 if (Mask[Idx] == PoisonMaskElem)
15958 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
15959 else
15960 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
15961 }
15962 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
15963 assert((!V || GetVF(V) == Mask.size()) &&
15964 "Expected base vector of VF number of elements.");
15965 Prev = Action(Mask, {nullptr, Res.first});
15966 } else if (ShuffleMask.size() == 1) {
15967 // Base is undef and only 1 vector is shuffled - perform the action only for
15968 // single vector, if the mask is not the identity mask.
15969 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
15970 /*ForSingleMask=*/true);
15971 if (Res.second)
15972 // Identity mask is found.
15973 Prev = Res.first;
15974 else
15975 Prev = Action(Mask, {ShuffleMask.begin()->first});
15976 } else {
15977 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
15978 // shuffles step by step, combining shuffle between the steps.
15979 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
15980 unsigned Vec2VF = GetVF(VMIt->first);
15981 if (Vec1VF == Vec2VF) {
15982 // No need to resize the input vectors since they are of the same size, we
15983 // can shuffle them directly.
15984 ArrayRef<int> SecMask = VMIt->second;
15985 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
15986 if (SecMask[I] != PoisonMaskElem) {
15987 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
15988 Mask[I] = SecMask[I] + Vec1VF;
15989 }
15990 }
15991 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
15992 } else {
15993 // Vectors of different sizes - resize and reshuffle.
15994 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
15995 /*ForSingleMask=*/false);
15996 std::pair<T *, bool> Res2 =
15997 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
15998 ArrayRef<int> SecMask = VMIt->second;
15999 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16000 if (Mask[I] != PoisonMaskElem) {
16001 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16002 if (Res1.second)
16003 Mask[I] = I;
16004 } else if (SecMask[I] != PoisonMaskElem) {
16005 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16006 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
16007 }
16008 }
16009 Prev = Action(Mask, {Res1.first, Res2.first});
16010 }
16011 VMIt = std::next(VMIt);
16012 }
16013 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
16014 // Perform requested actions for the remaining masks/vectors.
16015 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
16016 // Shuffle other input vectors, if any.
16017 std::pair<T *, bool> Res =
16018 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16019 ArrayRef<int> SecMask = VMIt->second;
16020 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16021 if (SecMask[I] != PoisonMaskElem) {
16022 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
16023 "Multiple uses of scalars.");
16024 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
16025 } else if (Mask[I] != PoisonMaskElem) {
16026 Mask[I] = I;
16027 }
16028 }
16029 Prev = Action(Mask, {Prev, Res.first});
16030 }
16031 return Prev;
16032}
16033
16034namespace {
16035/// Data type for handling buildvector sequences with the reused scalars from
16036/// other tree entries.
16037template <typename T> struct ShuffledInsertData {
16038 /// List of insertelements to be replaced by shuffles.
16039 SmallVector<InsertElementInst *> InsertElements;
16040 /// The parent vectors and shuffle mask for the given list of inserts.
16041 MapVector<T, SmallVector<int>> ValueMasks;
16042};
16043} // namespace
16044
16045 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
16046 InstructionCost ReductionCost) {
16047 InstructionCost Cost = ReductionCost;
16048 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
16049 << VectorizableTree.size() << ".\n");
16050
16051 SmallPtrSet<Value *, 4> CheckedExtracts;
16052 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
16053 TreeEntry &TE = *VectorizableTree[I];
16054 // No need to count the cost for combined entries; they are combined with
16055 // other nodes and their cost is just skipped.
16056 if (TE.State == TreeEntry::CombinedVectorize) {
16057 LLVM_DEBUG(
16058 dbgs() << "SLP: Skipping cost for combined node that starts with "
16059 << *TE.Scalars[0] << ".\n";
16060 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16061 continue;
16062 }
16063 if (TE.hasState() &&
16064 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16065 if (const TreeEntry *E =
16066 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
16067 E && E->getVectorFactor() == TE.getVectorFactor()) {
16068 // Some gather nodes might be exactly the same as some vectorizable
16069 // nodes after reordering; need to handle it.
16070 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
16071 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16072 << "SLP: Current total cost = " << Cost << "\n");
16073 continue;
16074 }
16075 }
16076
16077 // Exclude cost of gather loads nodes which are not used. These nodes were
16078 // built as part of the final attempt to vectorize gathered loads.
16079 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16080 "Expected gather nodes with users only.");
16081
16082 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
16083 Cost += C;
16084 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
16085 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16086 << "SLP: Current total cost = " << Cost << "\n");
16087 }
16088
16089 if (Cost >= -SLPCostThreshold &&
16090 none_of(ExternalUses, [](const ExternalUser &EU) {
16091 return isa_and_nonnull<InsertElementInst>(EU.User);
16092 }))
16093 return Cost;
16094
16095 SmallPtrSet<Value *, 16> ExtractCostCalculated;
16096 InstructionCost ExtractCost = 0;
16097 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
16098 SmallVector<APInt> DemandedElts;
16099 SmallDenseSet<Value *, 4> UsedInserts;
16100 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
16101 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
16103 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
16104 // Keep track of {Scalar, Index, User} tuples.
16105 // On AArch64, this helps in fusing a mov instruction, associated with
16106 // extractelement, with fmul in the backend so that extractelement is free.
16107 SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
16108 for (ExternalUser &EU : ExternalUses) {
16109 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
16110 }
16111 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
16112 for (ExternalUser &EU : ExternalUses) {
16113 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
16114 << EU.E.Idx << " in lane " << EU.Lane << "\n");
16115 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
16116 else dbgs() << " User: nullptr\n");
16117 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
16118
16119 // Uses by ephemeral values are free (because the ephemeral value will be
16120 // removed prior to code generation, and so the extraction will be
16121 // removed as well).
16122 if (EphValues.count(EU.User))
16123 continue;
16124
16125 // Check if the scalar for the given user or all users is already accounted for.
16126 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
16127 (EU.User &&
16128 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
16129 continue;
16130
16131 // Skip scalars used in unreachable blocks, in EH pads (rarely executed), or
16132 // in blocks terminated with an unreachable instruction.
16133 if (BasicBlock *UserParent =
16134 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
16135 UserParent &&
16136 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
16137 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
16138 continue;
16139
16140 // We only add extract cost once for the same scalar.
16141 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
16142 !ExtractCostCalculated.insert(EU.Scalar).second)
16143 continue;
16144
16145 // No extract cost for vector "scalar" if REVEC is disabled
16146 if (!SLPReVec && isa<FixedVectorType>(EU.Scalar->getType()))
16147 continue;
16148
16149 // If the found user is an insertelement, do not calculate the extract cost
16150 // but try to detect it as a final shuffled/identity match.
16151 // TODO: what if a user is insertvalue when REVEC is enabled?
16152 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
16153 VU && VU->getOperand(1) == EU.Scalar) {
16154 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
16155 if (!UsedInserts.insert(VU).second)
16156 continue;
16157 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16158 if (InsertIdx) {
16159 const TreeEntry *ScalarTE = &EU.E;
16160 auto *It = find_if(
16161 ShuffledInserts,
16162 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
16163 // Checks if 2 insertelements are from the same buildvector.
16164 InsertElementInst *VecInsert = Data.InsertElements.front();
16165 return areTwoInsertFromSameBuildVector(
16166 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
16167 Value *Op0 = II->getOperand(0);
16168 if (isVectorized(II) && !isVectorized(Op0))
16169 return nullptr;
16170 return Op0;
16171 });
16172 });
16173 int VecId = -1;
16174 if (It == ShuffledInserts.end()) {
16175 auto &Data = ShuffledInserts.emplace_back();
16176 Data.InsertElements.emplace_back(VU);
16177 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
16178 VecId = ShuffledInserts.size() - 1;
16179 auto It = MinBWs.find(ScalarTE);
16180 if (It != MinBWs.end() &&
16181 VectorCasts
16182 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
16183 .second) {
16184 unsigned BWSz = It->second.first;
16185 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16186 unsigned VecOpcode;
16187 if (DstBWSz < BWSz)
16188 VecOpcode = Instruction::Trunc;
16189 else
16190 VecOpcode =
16191 It->second.second ? Instruction::SExt : Instruction::ZExt;
16193 InstructionCost C = TTI->getCastInstrCost(
16194 VecOpcode, FTy,
16195 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
16196 FTy->getNumElements()),
16198 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16199 << " for extending externally used vector with "
16200 "non-equal minimum bitwidth.\n");
16201 Cost += C;
16202 }
16203 } else {
16204 if (isFirstInsertElement(VU, It->InsertElements.front()))
16205 It->InsertElements.front() = VU;
16206 VecId = std::distance(ShuffledInserts.begin(), It);
16207 }
16208 int InIdx = *InsertIdx;
16209 SmallVectorImpl<int> &Mask =
16210 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16211 if (Mask.empty())
16212 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16213 Mask[InIdx] = EU.Lane;
16214 DemandedElts[VecId].setBit(InIdx);
16215 continue;
16216 }
16217 }
16218 }
16219
16221 // If we plan to rewrite the tree in a smaller type, we will need to sign
16222 // extend the extracted value back to the original type. Here, we account
16223 // for the extract and the added cost of the sign extend if needed.
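// For example, if an i32 scalar was computed as i8 in the vectorized tree,
// its external use needs an extractelement plus a zext/sext back to i32, and
// both costs are accounted for here.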
16224 InstructionCost ExtraCost = TTI::TCC_Free;
16225 auto *ScalarTy = EU.Scalar->getType();
16226 const unsigned BundleWidth = EU.E.getVectorFactor();
16227 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
16228 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
16229 const TreeEntry *Entry = &EU.E;
16230 auto It = MinBWs.find(Entry);
16231 if (It != MinBWs.end()) {
16232 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
16233 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
16234 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
16235 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
16236 ? Instruction::ZExt
16237 : Instruction::SExt;
16238 VecTy = getWidenedType(MinTy, BundleWidth);
16239 ExtraCost =
16240 getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
16241 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
16242 << ExtraCost << "\n");
16243 } else {
16244 ExtraCost =
16245 getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
16246 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16247 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
16248 << *VecTy << ": " << ExtraCost << "\n");
16249 }
16250 // Leave the scalar instructions as is if they are cheaper than extracts.
16251 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16252 Entry->getOpcode() == Instruction::Load) {
16253 // Checks if the user of the external scalar is a phi in the loop body.
16254 auto IsPhiInLoop = [&](const ExternalUser &U) {
16255 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
16256 auto *I = cast<Instruction>(U.Scalar);
16257 const Loop *L = LI->getLoopFor(Phi->getParent());
16258 return L && (Phi->getParent() == I->getParent() ||
16259 L == LI->getLoopFor(I->getParent()));
16260 }
16261 return false;
16262 };
16263 if (!ValueToExtUses) {
16264 ValueToExtUses.emplace();
16265 for (const auto &P : enumerate(ExternalUses)) {
16266 // Ignore phis in loops.
16267 if (IsPhiInLoop(P.value()))
16268 continue;
16269
16270 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
16271 }
16272 }
16273 // Can use the original instruction if no operands are vectorized or they
16274 // are already marked as externally used.
16275 auto *Inst = cast<Instruction>(EU.Scalar);
16276 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
16277 auto OperandIsScalar = [&](Value *V) {
16278 if (!isVectorized(V)) {
16279 // Some extractelements might not be vectorized, but instead be
16280 // transformed into a shuffle and removed from the function;
16281 // consider that here.
16282 if (auto *EE = dyn_cast<ExtractElementInst>(V))
16283 return !EE->hasOneUse() || !MustGather.contains(EE);
16284 return true;
16285 }
16286 return ValueToExtUses->contains(V);
16287 };
16288 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
16289 bool CanBeUsedAsScalarCast = false;
16290 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
16291 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
16292 Op && all_of(Op->operands(), OperandIsScalar)) {
16293 InstructionCost OpCost =
16294 (isVectorized(Op) && !ValueToExtUses->contains(Op))
16295 ? TTI->getInstructionCost(Op, CostKind)
16296 : 0;
16297 if (ScalarCost + OpCost <= ExtraCost) {
16298 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
16299 ScalarCost += OpCost;
16300 }
16301 }
16302 }
16303 if (CanBeUsedAsScalar) {
16304 bool KeepScalar = ScalarCost <= ExtraCost;
16305 // Try to keep the original scalar if the user is a phi node from the same
16306 // block as the root phis currently being vectorized. It allows keeping
16307 // better ordering info for the PHIs being vectorized.
16308 bool IsProfitablePHIUser =
16309 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
16310 VectorizableTree.front()->Scalars.size() > 2)) &&
16311 VectorizableTree.front()->hasState() &&
16312 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16313 !Inst->hasNUsesOrMore(UsesLimit) &&
16314 none_of(Inst->users(),
16315 [&](User *U) {
16316 auto *PHIUser = dyn_cast<PHINode>(U);
16317 return (!PHIUser ||
16318 PHIUser->getParent() !=
16319 cast<Instruction>(
16320 VectorizableTree.front()->getMainOp())
16321 ->getParent()) &&
16322 !isVectorized(U);
16323 }) &&
16324 count_if(Entry->Scalars, [&](Value *V) {
16325 return ValueToExtUses->contains(V);
16326 }) <= 2;
16327 if (IsProfitablePHIUser) {
16328 KeepScalar = true;
16329 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
16330 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
16331 (!GatheredLoadsEntriesFirst.has_value() ||
16332 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16333 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
16334 return ValueToExtUses->contains(V);
16335 });
16336 auto It = ExtractsCount.find(Entry);
16337 if (It != ExtractsCount.end()) {
16338 assert(ScalarUsesCount >= It->getSecond().size() &&
16339 "Expected total number of external uses not less than "
16340 "number of scalar uses.");
16341 ScalarUsesCount -= It->getSecond().size();
16342 }
16343 // Keep the original scalar if the number of externally used instructions
16344 // in the same entry is not a power of 2. It may help to do some extra
16345 // vectorization for now.
16346 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
16347 }
16348 if (KeepScalar) {
16349 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16350 for (Value *V : Inst->operands()) {
16351 auto It = ValueToExtUses->find(V);
16352 if (It != ValueToExtUses->end()) {
16353 // Replace all uses to avoid compiler crash.
16354 ExternalUses[It->second].User = nullptr;
16355 }
16356 }
16357 ExtraCost = ScalarCost;
16358 if (!IsPhiInLoop(EU))
16359 ExtractsCount[Entry].insert(Inst);
16360 if (CanBeUsedAsScalarCast) {
16361 ScalarOpsFromCasts.insert(Inst->getOperand(0));
16362 // Update the users of the operands of the cast operand to avoid
16363 // compiler crash.
16364 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
16365 for (Value *V : IOp->operands()) {
16366 auto It = ValueToExtUses->find(V);
16367 if (It != ValueToExtUses->end()) {
16368 // Replace all uses to avoid compiler crash.
16369 ExternalUses[It->second].User = nullptr;
16370 }
16371 }
16372 }
16373 }
16374 }
16375 }
16376 }
16377
16378 ExtractCost += ExtraCost;
16379 }
16380 // Insert external uses for the operands of casts that are going to be emitted
16381 // as scalars instead of extractelements.
16382 for (Value *V : ScalarOpsFromCasts) {
16383 ExternalUsesAsOriginalScalar.insert(V);
16384 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
16385 ExternalUses.emplace_back(V, nullptr, *TEs.front(),
16386 TEs.front()->findLaneForValue(V));
16387 }
16388 }
16389 // Add reduced value cost, if resized.
16390 if (!VectorizedVals.empty()) {
16391 const TreeEntry &Root = *VectorizableTree.front();
16392 auto BWIt = MinBWs.find(&Root);
16393 if (BWIt != MinBWs.end()) {
16394 Type *DstTy = Root.Scalars.front()->getType();
16395 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
16396 unsigned SrcSz =
16397 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16398 if (OriginalSz != SrcSz) {
16399 unsigned Opcode = Instruction::Trunc;
16400 if (OriginalSz > SrcSz)
16401 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16402 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
16403 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
16404 assert(SLPReVec && "Only supported by REVEC.");
16405 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
16406 }
16407 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16408 TTI::CastContextHint::None,
16409 TTI::TCK_RecipThroughput);
16410 }
16411 }
16412 }
16413
16414 Cost += ExtractCost;
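// ResizeToVF accounts for the extra shuffle needed when the vector produced
// for a tree entry has a different vector factor than the mask required by the
// external insertelement users.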
16415 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
16416 bool ForSingleMask) {
16417 InstructionCost C = 0;
16418 unsigned VF = Mask.size();
16419 unsigned VecVF = TE->getVectorFactor();
16420 bool HasLargeIndex =
16421 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
16422 if ((VF != VecVF && HasLargeIndex) ||
16423 !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
16424
16425 if (HasLargeIndex) {
16426 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
16427 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16428 OrigMask.begin());
16429 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
16430 getWidenedType(TE->getMainOp()->getType(), VecVF),
16431 OrigMask);
16432 LLVM_DEBUG(
16433 dbgs() << "SLP: Adding cost " << C
16434 << " for final shuffle of insertelement external users.\n";
16435 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16436 Cost += C;
16437 return std::make_pair(TE, true);
16438 }
16439
16440 if (!ForSingleMask) {
16441 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16442 for (unsigned I = 0; I < VF; ++I) {
16443 if (Mask[I] != PoisonMaskElem)
16444 ResizeMask[Mask[I]] = Mask[I];
16445 }
16446 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
16447 C = ::getShuffleCost(
16448 *TTI, TTI::SK_PermuteSingleSrc,
16449 getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
16450 LLVM_DEBUG(
16451 dbgs() << "SLP: Adding cost " << C
16452 << " for final shuffle of insertelement external users.\n";
16453 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16454
16455 Cost += C;
16456 }
16457 }
16458 return std::make_pair(TE, false);
16459 };
16460 // Calculate the cost of the reshuffled vectors, if any.
16461 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16462 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
16463 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16464 unsigned VF = 0;
16465 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
16466 ArrayRef<const TreeEntry *> TEs) {
16467 assert((TEs.size() == 1 || TEs.size() == 2) &&
16468 "Expected exactly 1 or 2 tree entries.");
16469 if (TEs.size() == 1) {
16470 if (VF == 0)
16471 VF = TEs.front()->getVectorFactor();
16472 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16473 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
16474 !all_of(enumerate(Mask), [=](const auto &Data) {
16475 return Data.value() == PoisonMaskElem ||
16476 (Data.index() < VF &&
16477 static_cast<int>(Data.index()) == Data.value());
16478 })) {
16479 InstructionCost C =
16480 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
16481 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16482 << " for final shuffle of insertelement "
16483 "external users.\n";
16484 TEs.front()->dump();
16485 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16486 Cost += C;
16487 }
16488 } else {
16489 if (VF == 0) {
16490 if (TEs.front() &&
16491 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16492 VF = TEs.front()->getVectorFactor();
16493 else
16494 VF = Mask.size();
16495 }
16496 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16497 InstructionCost C =
16498 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
16499 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
16500 << " for final shuffle of vector node and external "
16501 "insertelement users.\n";
16502 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16503 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16504 Cost += C;
16505 }
16506 VF = Mask.size();
16507 return TEs.back();
16508 };
16509 (void)performExtractsShuffleAction<const TreeEntry>(
16510 MutableArrayRef(Vector.data(), Vector.size()), Base,
16511 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16512 EstimateShufflesCost);
16513 InstructionCost InsertCost = TTI->getScalarizationOverhead(
16514 cast<FixedVectorType>(
16515 ShuffledInserts[I].InsertElements.front()->getType()),
16516 DemandedElts[I],
16517 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
16518 Cost -= InsertCost;
16519 }
16520
16521 // Add the cost for reduced value resize (if required).
16522 if (ReductionBitWidth != 0) {
16523 assert(UserIgnoreList && "Expected reduction tree.");
16524 const TreeEntry &E = *VectorizableTree.front();
16525 auto It = MinBWs.find(&E);
16526 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16527 unsigned SrcSize = It->second.first;
16528 unsigned DstSize = ReductionBitWidth;
16529 unsigned Opcode = Instruction::Trunc;
16530 if (SrcSize < DstSize) {
16531 bool IsArithmeticExtendedReduction =
16532 all_of(*UserIgnoreList, [](Value *V) {
16533 auto *I = cast<Instruction>(V);
16534 return is_contained({Instruction::Add, Instruction::FAdd,
16535 Instruction::Mul, Instruction::FMul,
16536 Instruction::And, Instruction::Or,
16537 Instruction::Xor},
16538 I->getOpcode());
16539 });
16540 if (IsArithmeticExtendedReduction)
16541 Opcode =
16542 Instruction::BitCast; // Handle it by getExtendedReductionCost
16543 else
16544 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16545 }
16546 if (Opcode != Instruction::BitCast) {
16547 auto *SrcVecTy =
16548 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16549 auto *DstVecTy =
16550 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16551 TTI::CastContextHint CCH = getCastContextHint(E);
16552 InstructionCost CastCost;
16553 switch (E.getOpcode()) {
16554 case Instruction::SExt:
16555 case Instruction::ZExt:
16556 case Instruction::Trunc: {
16557 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16558 CCH = getCastContextHint(*OpTE);
16559 break;
16560 }
16561 default:
16562 break;
16563 }
16564 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16565 TTI::TCK_RecipThroughput);
16566 Cost += CastCost;
16567 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
16568 << " for final resize for reduction from " << SrcVecTy
16569 << " to " << DstVecTy << "\n";
16570 dbgs() << "SLP: Current total cost = " << Cost << "\n");
16571 }
16572 }
16573 }
16574
16575 std::optional<InstructionCost> SpillCost;
16576 if (Cost < -SLPCostThreshold) {
16577 SpillCost = getSpillCost();
16578 Cost += *SpillCost;
16579 }
16580#ifndef NDEBUG
16581 SmallString<256> Str;
16582 {
16583 raw_svector_ostream OS(Str);
16584 OS << "SLP: Spill Cost = ";
16585 if (SpillCost)
16586 OS << *SpillCost;
16587 else
16588 OS << "<skipped>";
16589 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
16590 << "SLP: Total Cost = " << Cost << ".\n";
16591 }
16592 LLVM_DEBUG(dbgs() << Str);
16593 if (ViewSLPTree)
16594 ViewGraph(this, "SLP" + F->getName(), false, Str);
16595#endif
16596
16597 return Cost;
16598}
16599
16600/// Tries to find extractelement instructions with constant indices from fixed
16601 /// vector type and gather such instructions into a bunch, which most likely
16602 /// can be detected as a shuffle of 1 or 2 input vectors. If this attempt was
16603/// successful, the matched scalars are replaced by poison values in \p VL for
16604/// future analysis.
16605std::optional<TTI::ShuffleKind>
16606 BoUpSLP::tryToGatherSingleRegisterExtractElements(
16607 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
16608 // Scan list of gathered scalars for extractelements that can be represented
16609 // as shuffles.
16610 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
16611 SmallVector<int> UndefVectorExtracts;
16612 for (int I = 0, E = VL.size(); I < E; ++I) {
16613 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16614 if (!EI) {
16615 if (isa<UndefValue>(VL[I]))
16616 UndefVectorExtracts.push_back(I);
16617 continue;
16618 }
16619 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
16620 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
16621 continue;
16622 std::optional<unsigned> Idx = getExtractIndex(EI);
16623 // Undefined index.
16624 if (!Idx) {
16625 UndefVectorExtracts.push_back(I);
16626 continue;
16627 }
16628 if (Idx >= VecTy->getNumElements()) {
16629 UndefVectorExtracts.push_back(I);
16630 continue;
16631 }
16632 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
16633 ExtractMask.reset(*Idx);
16634 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
16635 UndefVectorExtracts.push_back(I);
16636 continue;
16637 }
16638 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
16639 }
16640 // Sort the vector operands by the maximum number of uses in extractelements.
16641 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
16642 VectorOpToIdx.takeVector();
16643 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
16644 return P1.second.size() > P2.second.size();
16645 });
16646 // Find the best pair of the vectors or a single vector.
16647 const int UndefSz = UndefVectorExtracts.size();
16648 unsigned SingleMax = 0;
16649 unsigned PairMax = 0;
16650 if (!Vectors.empty()) {
16651 SingleMax = Vectors.front().second.size() + UndefSz;
16652 if (Vectors.size() > 1) {
16653 auto *ItNext = std::next(Vectors.begin());
16654 PairMax = SingleMax + ItNext->second.size();
16655 }
16656 }
16657 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16658 return std::nullopt;
16659 // Check if better to perform a shuffle of 2 vectors or just of a single
16660 // vector.
16661 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
16662 SmallVector<Value *> GatheredExtracts(
16663 VL.size(), PoisonValue::get(VL.front()->getType()));
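// Move the selected extractelements from VL into GatheredExtracts; the vacated
// VL lanes become poison and will be gathered separately.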
16664 if (SingleMax >= PairMax && SingleMax) {
16665 for (int Idx : Vectors.front().second)
16666 std::swap(GatheredExtracts[Idx], VL[Idx]);
16667 } else if (!Vectors.empty()) {
16668 for (unsigned Idx : {0, 1})
16669 for (int Idx : Vectors[Idx].second)
16670 std::swap(GatheredExtracts[Idx], VL[Idx]);
16671 }
16672 // Add extracts from undefs too.
16673 for (int Idx : UndefVectorExtracts)
16674 std::swap(GatheredExtracts[Idx], VL[Idx]);
16675 // Check that the gather of extractelements can be represented as just a
16676 // shuffle of one or two of the vectors the scalars are extracted from.
16677 std::optional<TTI::ShuffleKind> Res =
16678 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
16679 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16680 // TODO: try to check other subsets if possible.
16681 // Restore the original VL if attempt was not successful.
16682 copy(SavedVL, VL.begin());
16683 return std::nullopt;
16684 }
16685 // Restore unused scalars from mask, if some of the extractelements were not
16686 // selected for shuffle.
16687 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
16688 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
16689 isa<UndefValue>(GatheredExtracts[I])) {
16690 std::swap(VL[I], GatheredExtracts[I]);
16691 continue;
16692 }
16693 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
16694 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
16695 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
16696 is_contained(UndefVectorExtracts, I))
16697 continue;
16698 }
16699 return Res;
16700}
16701
16702/// Tries to find extractelement instructions with constant indices from fixed
16703 /// vector type and gather such instructions into a bunch, which most likely
16704 /// can be detected as a shuffle of 1 or 2 input vectors. If this attempt was
16705/// successful, the matched scalars are replaced by poison values in \p VL for
16706 /// future analysis.
16707 SmallVector<std::optional<TTI::ShuffleKind>>
16708BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16709 SmallVectorImpl<int> &Mask,
16710 unsigned NumParts) const {
16711 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
16712 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
16713 Mask.assign(VL.size(), PoisonMaskElem);
16714 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
16715 for (unsigned Part : seq<unsigned>(NumParts)) {
16716 // Scan list of gathered scalars for extractelements that can be represented
16717 // as shuffles.
16718 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
16719 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
16720 SmallVector<int> SubMask;
16721 std::optional<TTI::ShuffleKind> Res =
16722 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16723 ShufflesRes[Part] = Res;
16724 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
16725 }
16726 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
16727 return Res.has_value();
16728 }))
16729 ShufflesRes.clear();
16730 return ShufflesRes;
16731}
16732
16733std::optional<TargetTransformInfo::ShuffleKind>
16734BoUpSLP::isGatherShuffledSingleRegisterEntry(
16735 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
16736 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
16737 Entries.clear();
16738 // TODO: currently checking only for Scalars in the tree entry, need to count
16739 // reused elements too for better cost estimation.
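// GetUserEntry walks up through chains of gather users (EdgeIdx == UINT_MAX)
// to find the real user entry; the root entry is returned as its own user with
// edge index 0.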
16740 auto GetUserEntry = [&](const TreeEntry *TE) {
16741 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16742 TE = TE->UserTreeIndex.UserTE;
16743 if (TE == VectorizableTree.front().get())
16744 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
16745 return TE->UserTreeIndex;
16746 };
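// HasGatherUser reports whether some user edge along TE's chain up to the root
// is a gather edge (EdgeIdx == UINT_MAX).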
16747 auto HasGatherUser = [&](const TreeEntry *TE) {
16748 while (TE->Idx != 0 && TE->UserTreeIndex) {
16749 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16750 return true;
16751 TE = TE->UserTreeIndex.UserTE;
16752 }
16753 return false;
16754 };
16755 const EdgeInfo TEUseEI = GetUserEntry(TE);
16756 if (!TEUseEI)
16757 return std::nullopt;
16758 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16759 const BasicBlock *TEInsertBlock = nullptr;
16760 // Main node of PHI entries keeps the correct order of operands/incoming
16761 // blocks.
16762 if (auto *PHI = dyn_cast_or_null<PHINode>(
16763 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
16764 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16765 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16766 TEInsertPt = TEInsertBlock->getTerminator();
16767 } else {
16768 TEInsertBlock = TEInsertPt->getParent();
16769 }
16770 if (!DT->isReachableFromEntry(TEInsertBlock))
16771 return std::nullopt;
16772 auto *NodeUI = DT->getNode(TEInsertBlock);
16773 assert(NodeUI && "Should only process reachable instructions");
16774 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
16775 auto CheckOrdering = [&](const Instruction *InsertPt) {
16776 // Argument InsertPt is an instruction where vector code for some other
16777 // tree entry (one that shares one or more scalars with TE) is going to be
16778 // generated. This lambda returns true if insertion point of vector code
16779 // for the TE dominates that point (otherwise dependency is the other way
16780 // around). The other node is not limited to be of a gather kind. Gather
16781 // nodes are not scheduled and their vector code is inserted before their
16782 // first user. If user is PHI, that is supposed to be at the end of a
16783 // predecessor block. Otherwise it is the last instruction among scalars of
16784 // the user node. So, instead of checking dependency between instructions
16785 // themselves, we check dependency between their insertion points for vector
16786 // code (since each scalar instruction ends up as a lane of a vector
16787 // instruction).
16788 const BasicBlock *InsertBlock = InsertPt->getParent();
16789 auto *NodeEUI = DT->getNode(InsertBlock);
16790 if (!NodeEUI)
16791 return false;
16792 assert((NodeUI == NodeEUI) ==
16793 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16794 "Different nodes should have different DFS numbers");
16795 // Check the order of the gather nodes users.
16796 if (TEInsertPt->getParent() != InsertBlock &&
16797 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16798 return false;
16799 if (TEInsertPt->getParent() == InsertBlock &&
16800 TEInsertPt->comesBefore(InsertPt))
16801 return false;
16802 return true;
16803 };
16804 // Find all tree entries used by the gathered values. If no common entries
16805 // found - not a shuffle.
16806 // Here we build a set of tree nodes for each gathered value and try to
16807 // find the intersection between these sets. If we have at least one common
16808 // tree node for each gathered value - we have just a permutation of a
16809 // single vector. If we have 2 different sets, we're in a situation where we
16810 // have a permutation of 2 input vectors.
16811 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
16812 SmallDenseMap<Value *, int> UsedValuesEntry;
16813 SmallPtrSet<const Value *, 16> VisitedValue;
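// CheckAndUseSameNode: if TEPtr matches VL (or the scalars of TE) exactly,
// reuse it as the single source entry and map all non-constant scalars to it.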
16814 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
16815 // The node is reused - exit.
16816 if ((TEPtr->getVectorFactor() != VL.size() &&
16817 TEPtr->Scalars.size() != VL.size()) ||
16818 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
16819 return false;
16820 UsedTEs.clear();
16821 UsedTEs.emplace_back().insert(TEPtr);
16822 for (Value *V : VL) {
16823 if (isConstant(V))
16824 continue;
16825 UsedValuesEntry.try_emplace(V, 0);
16826 }
16827 return true;
16828 };
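// CheckParentNodes walks the user chains of both entries up to their lowest
// common ancestor and reports whether User1's side descends from it through a
// smaller operand index than User2's side.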
16829 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
16830 unsigned EdgeIdx) {
16831 const TreeEntry *Ptr1 = User1;
16832 const TreeEntry *Ptr2 = User2;
16833 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16834 while (Ptr2) {
16835 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
16836 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16837 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16838 }
16839 while (Ptr1) {
16840 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16841 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16842 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
16843 return Idx < It->second;
16844 }
16845 return false;
16846 };
16847 for (Value *V : VL) {
16848 if (isConstant(V) || !VisitedValue.insert(V).second)
16849 continue;
16850 // Build a list of tree entries where V is used.
16851 SmallPtrSet<const TreeEntry *, 4> VToTEs;
16852 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
16853 if (TEPtr == TE || TEPtr->Idx == 0)
16854 continue;
16855 assert(any_of(TEPtr->Scalars,
16856 [&](Value *V) { return GatheredScalars.contains(V); }) &&
16857 "Must contain at least single gathered value.");
16858 assert(TEPtr->UserTreeIndex &&
16859 "Expected only single user of a gather node.");
16860 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
16861
16862 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
16863 UseEI.UserTE->hasState())
16864 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
16865 : nullptr;
16866 Instruction *InsertPt =
16867 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
16868 : &getLastInstructionInBundle(UseEI.UserTE);
16869 if (TEInsertPt == InsertPt) {
16870 // Check nodes, which might be emitted first.
16871 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16872 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
16873 TEUseEI.UserTE->isAltShuffle()) &&
16874 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
16875 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
16876 (UseEI.UserTE->hasState() &&
16877 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16878 !UseEI.UserTE->isAltShuffle()) ||
16879 !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
16880 continue;
16881 }
16882
16883 // If the schedulable insertion point is used in multiple entries - just
16884 // exit, no known ordering at this point, available only after real
16885 // scheduling.
16886 if (!doesNotNeedToBeScheduled(InsertPt) &&
16887 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
16888 continue;
16889 // If the users are the PHI nodes with the same incoming blocks - skip.
16890 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16891 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
16892 UseEI.UserTE->State == TreeEntry::Vectorize &&
16893 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16894 TEUseEI.UserTE != UseEI.UserTE)
16895 continue;
16896 // If 2 gathers are operands of the same entry (regardless of whether the
16897 // user is a PHI or something else), compare operand indices and use the
16898 // earlier one as the base.
16899 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
16900 continue;
16901 // If the user instruction is used for some reason in different
16902 // vectorized nodes - make it depend on index.
16903 if (TEUseEI.UserTE != UseEI.UserTE &&
16904 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
16905 HasGatherUser(TEUseEI.UserTE)))
16906 continue;
16907 // If the user node is the operand of the other user node - skip.
16908 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
16909 continue;
16910 }
16911
16912 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
16913 TEUseEI.UserTE->doesNotNeedToSchedule() !=
16914 UseEI.UserTE->doesNotNeedToSchedule() &&
16915 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
16916 continue;
16917 // Check if the user node of the TE comes after user node of TEPtr,
16918 // otherwise TEPtr depends on TE.
16919 if ((TEInsertBlock != InsertPt->getParent() ||
16920 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
16921 (!CheckOrdering(InsertPt) ||
16922 (UseEI.UserTE->hasCopyableElements() &&
16923 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt)) &&
16924 is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
16925 continue;
16926 // The node is reused - exit.
16927 if (CheckAndUseSameNode(TEPtr))
16928 break;
16929 VToTEs.insert(TEPtr);
16930 }
16931 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
16932 const auto *It = find_if(
16933 VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
16934 if (It != VTEs.end()) {
16935 const TreeEntry *VTE = *It;
16936 if (none_of(TE->CombinedEntriesWithIndices,
16937 [&](const auto &P) { return P.first == VTE->Idx; })) {
16938 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16939 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16940 continue;
16941 }
16942 // The node is reused - exit.
16943 if (CheckAndUseSameNode(VTE))
16944 break;
16945 VToTEs.insert(VTE);
16946 }
16947 }
16948 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
16949 const TreeEntry *VTE = VTEs.front();
16950 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
16951 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
16952 VTEs = VTEs.drop_front();
16953 // Iterate through all vectorized nodes.
16954 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
16955 return MTE->State == TreeEntry::Vectorize;
16956 });
16957 if (MIt == VTEs.end())
16958 continue;
16959 VTE = *MIt;
16960 }
16961 if (none_of(TE->CombinedEntriesWithIndices,
16962 [&](const auto &P) { return P.first == VTE->Idx; })) {
16963 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16964 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16965 continue;
16966 }
16967 // The node is reused - exit.
16968 if (CheckAndUseSameNode(VTE))
16969 break;
16970 VToTEs.insert(VTE);
16971 }
16972 if (VToTEs.empty())
16973 continue;
16974 if (UsedTEs.empty()) {
16975 // The first iteration, just insert the list of nodes to vector.
16976 UsedTEs.push_back(VToTEs);
16977 UsedValuesEntry.try_emplace(V, 0);
16978 } else {
16979 // Need to check if there are any previously used tree nodes which use V.
16980 // If there are no such nodes, consider that we have one more input
16981 // vector.
16982 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
16983 unsigned Idx = 0;
16984 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
16985 // Do we have a non-empty intersection of previously listed tree entries
16986 // and tree entries using current V?
16987 set_intersect(VToTEs, Set);
16988 if (!VToTEs.empty()) {
16989 // Yes, write the new subset and continue analysis for the next
16990 // scalar.
16991 Set.swap(VToTEs);
16992 break;
16993 }
16994 VToTEs = SavedVToTEs;
16995 ++Idx;
16996 }
16997 // No non-empty intersection found - need to add a second set of possible
16998 // source vectors.
16999 if (Idx == UsedTEs.size()) {
17000 // If the number of input vectors is greater than 2 - not a permutation,
17001 // fall back to the regular gather.
17002 // TODO: support multiple reshuffled nodes.
17003 if (UsedTEs.size() == 2)
17004 continue;
17005 UsedTEs.push_back(SavedVToTEs);
17006 Idx = UsedTEs.size() - 1;
17007 }
17008 UsedValuesEntry.try_emplace(V, Idx);
17009 }
17010 }
17011
17012 if (UsedTEs.empty()) {
17013 Entries.clear();
17014 return std::nullopt;
17015 }
17016
17017 unsigned VF = 0;
17018 if (UsedTEs.size() == 1) {
17019 // Keep the order to avoid non-determinism.
17020 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
17021 UsedTEs.front().end());
17022 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17023 return TE1->Idx < TE2->Idx;
17024 });
17025 // Try to find the perfect match in another gather node at first.
17026 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
17027 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
17028 });
17029 if (It != FirstEntries.end() &&
17030 ((*It)->getVectorFactor() == VL.size() ||
17031 ((*It)->getVectorFactor() == TE->Scalars.size() &&
17032 TE->ReuseShuffleIndices.size() == VL.size() &&
17033 (*It)->isSame(TE->Scalars)))) {
17034 Entries.push_back(*It);
17035 if ((*It)->getVectorFactor() == VL.size()) {
17036 std::iota(std::next(Mask.begin(), Part * VL.size()),
17037 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
17038 } else {
17039 SmallVector<int> CommonMask = TE->getCommonMask();
17040 copy(CommonMask, Mask.begin());
17041 }
17042 // Clear undef scalars.
17043 for (unsigned I : seq<unsigned>(VL.size()))
17044 if (isa<PoisonValue>(VL[I]))
17045 Mask[Part * VL.size() + I] = PoisonMaskElem;
17046 return TargetTransformInfo::SK_PermuteSingleSrc;
17047 }
17048 // No perfect match, just shuffle, so choose the first tree node from the
17049 // tree.
17050 Entries.push_back(FirstEntries.front());
17051 // Update mapping between values and corresponding tree entries.
17052 for (auto &P : UsedValuesEntry)
17053 P.second = 0;
17054 VF = FirstEntries.front()->getVectorFactor();
17055 } else {
17056 // Try to find nodes with the same vector factor.
17057 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
17058 // Keep the order of tree nodes to avoid non-determinism.
17059 DenseMap<int, const TreeEntry *> VFToTE;
17060 for (const TreeEntry *TE : UsedTEs.front()) {
17061 unsigned VF = TE->getVectorFactor();
17062 auto It = VFToTE.find(VF);
17063 if (It != VFToTE.end()) {
17064 if (It->second->Idx > TE->Idx)
17065 It->getSecond() = TE;
17066 continue;
17067 }
17068 VFToTE.try_emplace(VF, TE);
17069 }
17070 // Same, keep the order to avoid non-determinism.
17071 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
17072 UsedTEs.back().end());
17073 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17074 return TE1->Idx < TE2->Idx;
17075 });
17076 for (const TreeEntry *TE : SecondEntries) {
17077 auto It = VFToTE.find(TE->getVectorFactor());
17078 if (It != VFToTE.end()) {
17079 VF = It->first;
17080 Entries.push_back(It->second);
17081 Entries.push_back(TE);
17082 break;
17083 }
17084 }
17085 // No 2 source vectors with the same vector factor - just choose 2 with max
17086 // index.
17087 if (Entries.empty()) {
17088 Entries.push_back(*llvm::max_element(
17089 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
17090 return TE1->Idx < TE2->Idx;
17091 }));
17092 Entries.push_back(SecondEntries.front());
17093 VF = std::max(Entries.front()->getVectorFactor(),
17094 Entries.back()->getVectorFactor());
17095 } else {
17096 VF = Entries.front()->getVectorFactor();
17097 }
17098 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
17099 for (const TreeEntry *E : Entries)
17100 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
17101 E->Scalars.end());
17102 // Update mapping between values and corresponding tree entries.
17103 for (auto &P : UsedValuesEntry) {
17104 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
17105 if (ValuesToEntries[Idx].contains(P.first)) {
17106 P.second = Idx;
17107 break;
17108 }
17109 }
17110 }
17111
17112 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
17113 // Checks if the 2 PHIs are compatible in terms of high possibility to be
17114 // vectorized.
17115 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
17116 auto *PHI = cast<PHINode>(V);
17117 auto *PHI1 = cast<PHINode>(V1);
17118 // Check that all incoming values are compatible/from same parent (if they
17119 // are instructions).
17120 // The incoming values are compatible if they all are constants, or
17121 // instruction with the same/alternate opcodes from the same basic block.
17122 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
17123 Value *In = PHI->getIncomingValue(I);
17124 Value *In1 = PHI1->getIncomingValue(I);
17125 if (isConstant(In) && isConstant(In1))
17126 continue;
17127 if (!getSameOpcode({In, In1}, *TLI))
17128 return false;
17129 if (cast<Instruction>(In)->getParent() !=
17130 cast<Instruction>(In1)->getParent())
17131 return false;
17132 }
17133 return true;
17134 };
17135 // Check if the value can be ignored during analysis for shuffled gathers.
17136 // We suppose it is better to ignore instructions which do not form splats,
17137 // are not vectorized/not extractelements (these instructions will be handled
17138 // by the extractelement processing) or may form a vector node in the future.
17139 auto MightBeIgnored = [=](Value *V) {
17140 auto *I = dyn_cast<Instruction>(V);
17141 return I && !IsSplatOrUndefs && !isVectorized(I) &&
17142 !isVectorLikeInstWithConstOps(I) &&
17143 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17144 };
17145 // Check that the neighbor instruction may form a full vector node with the
17146 // current instruction V. It is possible, if they have same/alternate opcode
17147 // and same parent basic block.
17148 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17149 Value *V1 = VL[Idx];
17150 bool UsedInSameVTE = false;
17151 auto It = UsedValuesEntry.find(V1);
17152 if (It != UsedValuesEntry.end())
17153 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17154 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17155 getSameOpcode({V, V1}, *TLI) &&
17156 cast<Instruction>(V)->getParent() ==
17157 cast<Instruction>(V1)->getParent() &&
17158 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
17159 };
17160 // Build a shuffle mask for better cost estimation and vector emission.
17161 SmallBitVector UsedIdxs(Entries.size());
17162 SmallVector<std::pair<unsigned, int>> EntryLanes;
17163 for (int I = 0, E = VL.size(); I < E; ++I) {
17164 Value *V = VL[I];
17165 auto It = UsedValuesEntry.find(V);
17166 if (It == UsedValuesEntry.end())
17167 continue;
17168 // Do not try to shuffle scalars if they are constants or instructions
17169 // that can be vectorized as a result of the subsequent buildvector
17170 // vectorization.
17171 if (isConstant(V) || (MightBeIgnored(V) &&
17172 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17173 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17174 continue;
17175 unsigned Idx = It->second;
17176 EntryLanes.emplace_back(Idx, I);
17177 UsedIdxs.set(Idx);
17178 }
17179 // Iterate through all shuffled scalars and select entries, which can be used
17180 // for final shuffle.
17181 SmallVector<const TreeEntry *> TempEntries;
17182 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17183 if (!UsedIdxs.test(I))
17184 continue;
17185 // Fix the entry number for the given scalar. If it is the first entry, set
17186 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
17187 // These indices are used when calculating final shuffle mask as the vector
17188 // offset.
17189 for (std::pair<unsigned, int> &Pair : EntryLanes)
17190 if (Pair.first == I)
17191 Pair.first = TempEntries.size();
17192 TempEntries.push_back(Entries[I]);
17193 }
17194 Entries.swap(TempEntries);
17195 if (EntryLanes.size() == Entries.size() &&
17196 !VL.equals(ArrayRef(TE->Scalars)
17197 .slice(Part * VL.size(),
17198 std::min<int>(VL.size(), TE->Scalars.size())))) {
17199 // We may have here 1 or 2 entries only. If the number of scalars is equal
17200 // to the number of entries, no need to do the analysis, it is not very
17201 // profitable. Since VL is not the same as TE->Scalars, it means we already
17202 // have some shuffles before. Cut off the non-profitable case.
17203 Entries.clear();
17204 return std::nullopt;
17205 }
17206 // Build the final mask, check for the identity shuffle, if possible.
17207 bool IsIdentity = Entries.size() == 1;
17208 // Pair.first is the offset to the vector, while Pair.second is the index of
17209 // scalar in the list.
17210 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17211 unsigned Idx = Part * VL.size() + Pair.second;
17212 Mask[Idx] =
17213 Pair.first * VF +
17214 (ForOrder ? std::distance(
17215 Entries[Pair.first]->Scalars.begin(),
17216 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17217 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17218 IsIdentity &= Mask[Idx] == Pair.second;
17219 }
17220 if (ForOrder || IsIdentity || Entries.empty()) {
17221 switch (Entries.size()) {
17222 case 1:
17223 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17224 return TargetTransformInfo::SK_PermuteSingleSrc;
17225 break;
17226 case 2:
17227 if (EntryLanes.size() > 2 || VL.size() <= 2)
17228 return TargetTransformInfo::SK_PermuteTwoSrc;
17229 break;
17230 default:
17231 break;
17232 }
17233 } else if (!isa<VectorType>(VL.front()->getType()) &&
17234 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17235 // Estimate whether the shuffle is more beneficial than a buildvector.
17236 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17237 std::next(Mask.begin(), (Part + 1) * VL.size()));
17238 int MinElement = SubMask.front(), MaxElement = SubMask.front();
17239 for (int Idx : SubMask) {
17240 if (Idx == PoisonMaskElem)
17241 continue;
17242 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
17243 MinElement = Idx;
17244 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
17245 MaxElement = Idx;
17246 }
17247 assert(MaxElement >= 0 && MinElement >= 0 &&
17248 MaxElement % VF >= MinElement % VF &&
17249 "Expected at least single element.");
17250 unsigned NewVF = std::max<unsigned>(
17251 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
17252 (MaxElement % VF) -
17253 (MinElement % VF) + 1));
17254 if (NewVF < VF) {
17255 for (int &Idx : SubMask) {
17256 if (Idx == PoisonMaskElem)
17257 continue;
17258 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17259 (Idx >= static_cast<int>(VF) ? NewVF : 0);
17260 }
17261 } else {
17262 NewVF = VF;
17263 }
17264 
17265 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17266 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
17267 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17268 auto GetShuffleCost = [&,
17269 &TTI = *TTI](ArrayRef<int> Mask,
17270 ArrayRef<const TreeEntry *> Entries,
17271 VectorType *VecTy) -> InstructionCost {
17272 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17273 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
17274 Mask, Entries.front()->getInterleaveFactor()))
17275 return TTI::TCC_Free;
17276 return ::getShuffleCost(TTI,
17277 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
17278 : TTI::SK_PermuteSingleSrc,
17279 VecTy, Mask, CostKind);
17280 };
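// Compare three alternatives: shuffling from both entries, shuffling from a
// single entry plus inserting the remaining scalars, and building the whole
// subvector from scalars; pick the cheapest and drop the unused entry.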
17281 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17282 InstructionCost FirstShuffleCost = 0;
17283 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17284 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17285 FirstShuffleCost = ShuffleCost;
17286 } else {
17287 // Transform the mask to include only the first entry.
17288 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17289 bool IsIdentity = true;
17290 for (auto [I, Idx] : enumerate(FirstMask)) {
17291 if (Idx >= static_cast<int>(NewVF)) {
17292 Idx = PoisonMaskElem;
17293 } else {
17294 DemandedElts.clearBit(I);
17295 if (Idx != PoisonMaskElem)
17296 IsIdentity &= static_cast<int>(I) == Idx;
17297 }
17298 }
17299 if (!IsIdentity)
17300 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17301 FirstShuffleCost += getScalarizationOverhead(
17302 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17303 /*Extract=*/false, CostKind);
17304 }
17305 InstructionCost SecondShuffleCost = 0;
17306 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17307 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17308 SecondShuffleCost = ShuffleCost;
17309 } else {
17310 // Transform the mask to include only the second entry.
17311 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17312 bool IsIdentity = true;
17313 for (auto [I, Idx] : enumerate(SecondMask)) {
17314 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
17315 Idx = PoisonMaskElem;
17316 } else {
17317 DemandedElts.clearBit(I);
17318 if (Idx != PoisonMaskElem) {
17319 Idx -= NewVF;
17320 IsIdentity &= static_cast<int>(I) == Idx;
17321 }
17322 }
17323 }
17324 if (!IsIdentity)
17325 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17326 SecondShuffleCost += getScalarizationOverhead(
17327 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17328 /*Extract=*/false, CostKind);
17329 }
17330 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
17331 for (auto [I, Idx] : enumerate(SubMask))
17332 if (Idx == PoisonMaskElem)
17333 DemandedElts.clearBit(I);
17334 InstructionCost BuildVectorCost = getScalarizationOverhead(
17335 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
17336 /*Extract=*/false, CostKind);
17337 const TreeEntry *BestEntry = nullptr;
17338 if (FirstShuffleCost < ShuffleCost) {
17339 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17340 std::next(Mask.begin(), (Part + 1) * VL.size()),
17341 [&](int &Idx) {
17342 if (Idx >= static_cast<int>(VF))
17343 Idx = PoisonMaskElem;
17344 });
17345 BestEntry = Entries.front();
17346 ShuffleCost = FirstShuffleCost;
17347 }
17348 if (SecondShuffleCost < ShuffleCost) {
17349 std::for_each(std::next(Mask.begin(), Part * VL.size()),
17350 std::next(Mask.begin(), (Part + 1) * VL.size()),
17351 [&](int &Idx) {
17352 if (Idx < static_cast<int>(VF))
17353 Idx = PoisonMaskElem;
17354 else
17355 Idx -= VF;
17356 });
17357 BestEntry = Entries[1];
17358 ShuffleCost = SecondShuffleCost;
17359 }
17360 if (BuildVectorCost >= ShuffleCost) {
17361 if (BestEntry) {
17362 Entries.clear();
17363 Entries.push_back(BestEntry);
17364 }
17365 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
17366 : TargetTransformInfo::SK_PermuteSingleSrc;
17367 }
17368 }
17369 Entries.clear();
17370 // Clear the corresponding mask elements.
17371 std::fill(std::next(Mask.begin(), Part * VL.size()),
17372 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
17373 return std::nullopt;
17374}
17375 
17376 SmallVector<std::optional<TTI::ShuffleKind>>
17377BoUpSLP::isGatherShuffledEntry(
17378 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
17379 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
17380 bool ForOrder) {
17381 assert(NumParts > 0 && NumParts < VL.size() &&
17382 "Expected positive number of registers.");
17383 Entries.clear();
17384 // No need to check for the topmost gather node.
17385 if (TE == VectorizableTree.front().get() &&
17386 (!GatheredLoadsEntriesFirst.has_value() ||
17387 none_of(ArrayRef(VectorizableTree).drop_front(),
17388 [](const std::unique_ptr<TreeEntry> &TE) {
17389 return !TE->isGather();
17390 })))
17391 return {};
17392 // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
17393 // implemented yet.
17394 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17395 return {};
17396 Mask.assign(VL.size(), PoisonMaskElem);
17397 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17398 "Expected only single user of the gather node.");
17399 assert(VL.size() % NumParts == 0 &&
17400 "Number of scalars must be divisible by NumParts.");
17401 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
17402 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17403 (TE->Idx == 0 ||
17404 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
17405 isSplat(TE->Scalars) ||
17406 (TE->hasState() &&
17407 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
17408 return {};
17409 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
17410 SmallVector<std::optional<TTI::ShuffleKind>> Res;
17411 for (unsigned Part : seq<unsigned>(NumParts)) {
17412 ArrayRef<Value *> SubVL =
17413 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
17414 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17415 std::optional<TTI::ShuffleKind> SubRes =
17416 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17417 ForOrder);
17418 if (!SubRes)
17419 SubEntries.clear();
17420 Res.push_back(SubRes);
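// Special case: if a single entry covers the whole VL with an identity
// permutation, use it for the full gather and stop splitting into parts.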
17421 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
17422 SubEntries.front()->getVectorFactor() == VL.size() &&
17423 (SubEntries.front()->isSame(TE->Scalars) ||
17424 SubEntries.front()->isSame(VL))) {
17425 SmallVector<const TreeEntry *> LocalSubEntries;
17426 LocalSubEntries.swap(SubEntries);
17427 Entries.clear();
17428 Res.clear();
17429 std::iota(Mask.begin(), Mask.end(), 0);
17430 // Clear undef scalars.
17431 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
17432 if (isa<PoisonValue>(VL[I]))
17433 Mask[I] = PoisonMaskElem;
17434 Entries.emplace_back(1, LocalSubEntries.front());
17435 Res.assign(1, TTI::SK_PermuteSingleSrc);
17436 return Res;
17437 }
17438 }
17439 if (all_of(Res,
17440 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
17441 Entries.clear();
17442 return {};
17443 }
17444 return Res;
17445}
17446
17447InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
17448 Type *ScalarTy) const {
17449 const unsigned VF = VL.size();
17450 auto *VecTy = getWidenedType(ScalarTy, VF);
17451 // Find the cost of inserting/extracting values from the vector.
17452 // Check if the same elements are inserted several times and count them as
17453 // shuffle candidates.
17454 APInt DemandedElements = APInt::getZero(VF);
17455 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17456 InstructionCost Cost;
17457 auto EstimateInsertCost = [&](unsigned I, Value *V) {
17458 DemandedElements.setBit(I);
17459 if (V->getType() != ScalarTy)
17460 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
17461 TTI::CastContextHint::None, CostKind);
17462 };
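// Start from an identity mask; lanes holding non-undef constants are remapped
// to a second (constant) source so they can be merged with one two-source
// shuffle instead of per-lane inserts.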
17463 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
17464 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17465 for (auto [I, V] : enumerate(VL)) {
17466 // No need to shuffle duplicates for constants.
17467 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
17468 continue;
17469
17470 if (isConstant(V)) {
17471 ConstantShuffleMask[I] = I + VF;
17472 continue;
17473 }
17474 EstimateInsertCost(I, V);
17475 }
17476 // FIXME: add a cost for constant vector materialization.
17477 bool IsAnyNonUndefConst =
17478 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
17479 // 1. Shuffle input source vector and constant vector.
17480 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17481 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
17482 ConstantShuffleMask);
17483 }
17484
17485 // 2. Insert unique non-constants.
17486 if (!DemandedElements.isZero())
17487 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
17488 /*Insert=*/true,
17489 /*Extract=*/false, CostKind,
17490 ForPoisonSrc && !IsAnyNonUndefConst, VL);
17491 return Cost;
17492}
17493
17494Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
17495 auto It = EntryToLastInstruction.find(E);
17496 if (It != EntryToLastInstruction.end())
17497 return *cast<Instruction>(It->second);
17498 Instruction *Res = nullptr;
17499 // Get the basic block this bundle is in. All instructions in the bundle
17500 // should be in this block (except for extractelement-like instructions with
17501 // constant indices or gathered loads or copyables).
17502 Instruction *Front;
17503 unsigned Opcode;
17504 if (E->hasState()) {
17505 Front = E->getMainOp();
17506 Opcode = E->getOpcode();
17507 } else {
17508 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
17509 Opcode = Front->getOpcode();
17510 }
17511 auto *BB = Front->getParent();
17512 assert(
17513 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
17514 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
17515 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
17516 all_of(E->Scalars,
17517 [=](Value *V) -> bool {
17518 if (Opcode == Instruction::GetElementPtr &&
17519 !isa<GetElementPtrInst>(V))
17520 return true;
17521 auto *I = dyn_cast<Instruction>(V);
17522 return !I || !E->getMatchingMainOpOrAltOp(I) ||
17523 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
17524 })) &&
17525 "Expected gathered loads or GEPs or instructions from same basic "
17526 "block.");
17527
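// FindLastInst returns the bundle scalar that executes last: program order is
// used within a block and dominator-tree DFS-in numbers across blocks, while
// copyable elements and non-instructions are skipped.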
17528 auto FindLastInst = [&]() {
17529 Instruction *LastInst = Front;
17530 for (Value *V : E->Scalars) {
17531 auto *I = dyn_cast<Instruction>(V);
17532 if (!I)
17533 continue;
17534 if (E->isCopyableElement(I))
17535 continue;
17536 if (LastInst->getParent() == I->getParent()) {
17537 if (LastInst->comesBefore(I))
17538 LastInst = I;
17539 continue;
17540 }
17541 assert(((Opcode == Instruction::GetElementPtr &&
17542 !isa<GetElementPtrInst>(I)) ||
17543 E->State == TreeEntry::SplitVectorize ||
17544 (isVectorLikeInstWithConstOps(LastInst) &&
17545 isVectorLikeInstWithConstOps(I)) ||
17546 (GatheredLoadsEntriesFirst.has_value() &&
17547 Opcode == Instruction::Load && E->isGather() &&
17548 E->Idx < *GatheredLoadsEntriesFirst)) &&
17549 "Expected vector-like or non-GEP in GEP node insts only.");
17550 if (!DT->isReachableFromEntry(LastInst->getParent())) {
17551 LastInst = I;
17552 continue;
17553 }
17554 if (!DT->isReachableFromEntry(I->getParent()))
17555 continue;
17556 auto *NodeA = DT->getNode(LastInst->getParent());
17557 auto *NodeB = DT->getNode(I->getParent());
17558 assert(NodeA && "Should only process reachable instructions");
17559 assert(NodeB && "Should only process reachable instructions");
17560 assert((NodeA == NodeB) ==
17561 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17562 "Different nodes should have different DFS numbers");
17563 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
17564 LastInst = I;
17565 }
17566 BB = LastInst->getParent();
17567 return LastInst;
17568 };
17569
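// FindFirstInst is the symmetric helper: it returns the bundle scalar that
// executes first, again skipping copyable elements.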
17570 auto FindFirstInst = [&]() {
17571 Instruction *FirstInst = Front;
17572 for (Value *V : E->Scalars) {
17573 auto *I = dyn_cast<Instruction>(V);
17574 if (!I)
17575 continue;
17576 if (E->isCopyableElement(I))
17577 continue;
17578 if (FirstInst->getParent() == I->getParent()) {
17579 if (I->comesBefore(FirstInst))
17580 FirstInst = I;
17581 continue;
17582 }
17583 assert(((Opcode == Instruction::GetElementPtr &&
17584 !isa<GetElementPtrInst>(I)) ||
17585 (isVectorLikeInstWithConstOps(FirstInst) &&
17586 isVectorLikeInstWithConstOps(I))) &&
17587 "Expected vector-like or non-GEP in GEP node insts only.");
17588 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
17589 FirstInst = I;
17590 continue;
17591 }
17592 if (!DT->isReachableFromEntry(I->getParent()))
17593 continue;
17594 auto *NodeA = DT->getNode(FirstInst->getParent());
17595 auto *NodeB = DT->getNode(I->getParent());
17596 assert(NodeA && "Should only process reachable instructions");
17597 assert(NodeB && "Should only process reachable instructions");
17598 assert((NodeA == NodeB) ==
17599 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
17600 "Different nodes should have different DFS numbers");
17601 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
17602 FirstInst = I;
17603 }
17604 return FirstInst;
17605 };
17606
17607 if (E->State == TreeEntry::SplitVectorize) {
17608 Res = FindLastInst();
17609 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
17610 for (auto *E : Entries) {
17611 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
17612 if (!I)
17613 I = &getLastInstructionInBundle(E);
17614 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
17615 Res = I;
17616 }
17617 }
17618 EntryToLastInstruction.try_emplace(E, Res);
17619 return *Res;
17620 }
17621
17622 // Set the insert point for gathered loads to the very first load.
17623 if (GatheredLoadsEntriesFirst.has_value() &&
17624 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17625 Opcode == Instruction::Load) {
17626 Res = FindFirstInst();
17627 EntryToLastInstruction.try_emplace(E, Res);
17628 return *Res;
17629 }
17630
17631 // Set the insert point to the beginning of the basic block if the entry
17632 // should not be scheduled.
17633 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
17634 if (E->isGather())
17635 return nullptr;
17636 // Found previously that the instructions do not need to be scheduled.
17637 const auto *It = BlocksSchedules.find(BB);
17638 if (It == BlocksSchedules.end())
17639 return nullptr;
17640 for (Value *V : E->Scalars) {
17641 auto *I = dyn_cast<Instruction>(V);
17642 if (!I || isa<PHINode>(I) ||
17643 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
17644 continue;
17645 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
17646 if (Bundles.empty())
17647 continue;
17648 const auto *It = find_if(
17649 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
17650 if (It != Bundles.end())
17651 return *It;
17652 }
17653 return nullptr;
17654 };
17655 const ScheduleBundle *Bundle = FindScheduleBundle(E);
17656 if (!E->isGather() && !Bundle) {
17657 if ((Opcode == Instruction::GetElementPtr &&
17658 any_of(E->Scalars,
17659 [](Value *V) {
17660 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17661 })) ||
17662 (all_of(E->Scalars,
17663 [&](Value *V) {
17664 return isa<PoisonValue>(V) ||
17665 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
17666 E->isCopyableElement(V) ||
17667 (!isVectorLikeInstWithConstOps(V) &&
17668 isUsedOutsideBlock(V));
17669 }) &&
17670 (!E->doesNotNeedToSchedule() ||
17671 any_of(E->Scalars,
17672 [&](Value *V) {
17673 if (!isa<Instruction>(V) ||
17674 (E->hasCopyableElements() && E->isCopyableElement(V)))
17675 return false;
17676 return !areAllOperandsNonInsts(V);
17677 }) ||
17678 none_of(E->Scalars, [&](Value *V) {
17679 if (!isa<Instruction>(V) ||
17680 (E->hasCopyableElements() && E->isCopyableElement(V)))
17681 return false;
17682 return MustGather.contains(V);
17683 }))))
17684 Res = FindLastInst();
17685 else
17686 Res = FindFirstInst();
17687 EntryToLastInstruction.try_emplace(E, Res);
17688 return *Res;
17689 }
17690
17691 // Find the last instruction. The common case should be that BB has been
17692 // scheduled, and the last instruction is VL.back(). So we start with
17693 // VL.back() and iterate over schedule data until we reach the end of the
17694 // bundle. The end of the bundle is marked by null ScheduleData.
17695 if (Bundle) {
17696 assert(!E->isGather() && "Gathered instructions should not be scheduled");
17697 Res = Bundle->getBundle().back()->getInst();
17698 EntryToLastInstruction.try_emplace(E, Res);
17699 return *Res;
17700 }
17701
17702 // LastInst can still be null at this point if there's either not an entry
17703 // for BB in BlocksSchedules or there's no ScheduleData available for
17704 // VL.back(). This can be the case if buildTreeRec aborts for various
17705 // reasons (e.g., the maximum recursion depth is reached, the maximum region
17706 // size is reached, etc.). ScheduleData is initialized in the scheduling
17707 // "dry-run".
17708 //
17709 // If this happens, we can still find the last instruction by brute force. We
17710 // iterate forwards from Front (inclusive) until we either see all
17711 // instructions in the bundle or reach the end of the block. If Front is the
17712 // last instruction in program order, LastInst will be set to Front, and we
17713 // will visit all the remaining instructions in the block.
17714 //
17715 // One of the reasons we exit early from buildTreeRec is to place an upper
17716 // bound on compile-time. Thus, taking an additional compile-time hit here is
17717 // not ideal. However, this should be exceedingly rare since it requires that
17718 // we both exit early from buildTreeRec and that the bundle be out-of-order
17719 // (causing us to iterate all the way to the end of the block).
17720 if (!Res)
17721 Res = FindLastInst();
17722 assert(Res && "Failed to find last instruction in bundle");
17723 EntryToLastInstruction.try_emplace(E, Res);
17724 return *Res;
17725}
17726
17727void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
17728 auto *Front = E->getMainOp();
17729 Instruction *LastInst = &getLastInstructionInBundle(E);
17730 assert(LastInst && "Failed to find last instruction in bundle");
17731 BasicBlock::iterator LastInstIt = LastInst->getIterator();
17732 // If the instruction is PHI, set the insert point after all the PHIs.
17733 bool IsPHI = isa<PHINode>(LastInst);
17734 if (IsPHI) {
17735 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
17736 if (LastInstIt != LastInst->getParent()->end() &&
17737 LastInstIt->getParent()->isLandingPad())
17738 LastInstIt = std::next(LastInstIt);
17739 }
17740 if (IsPHI ||
17741 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
17742 (E->doesNotNeedToSchedule() ||
17743 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
17744 isUsedOutsideBlock(LastInst)))) ||
17745 (GatheredLoadsEntriesFirst.has_value() &&
17746 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
17747 E->getOpcode() == Instruction::Load)) {
17748 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
17749 } else {
17750 // Set the insertion point after the last instruction in the bundle. Set the
17751 // debug location to Front.
17752 Builder.SetInsertPoint(
17753 LastInst->getParent(),
17754 LastInst->getNextNode()->getIterator());
17755 }
17756 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
17757}
17758
17759Value *BoUpSLP::gather(
17760 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
17761 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
17762 // List of instructions/lanes from current block and/or the blocks which are
17763 // part of the current loop. These instructions will be inserted at the end to
17764 // make it possible to optimize loops and hoist invariant instructions out of
17765 // the loop's body with better chances for success.
17766 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
17767 SmallSet<int, 4> PostponedIndices;
17768 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
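// CheckPredecessor reports whether the scalar's block is reached from the
// insertion block by walking only single predecessors, i.e. the scalar's block
// dominates the insertion point along a straight-line chain.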
17769 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
17770 SmallPtrSet<BasicBlock *, 4> Visited;
17771 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
17772 InsertBB = InsertBB->getSinglePredecessor();
17773 return InsertBB && InsertBB == InstBB;
17774 };
17775 for (int I = 0, E = VL.size(); I < E; ++I) {
17776 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
17777 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
17778 isVectorized(Inst) ||
17779 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
17780 PostponedIndices.insert(I).second)
17781 PostponedInsts.emplace_back(Inst, I);
17782 }
17783
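// CreateInsertElement inserts scalar V into lane Pos of Vec, first casting it
// if its type differs from the minimized bitwidth type, and records the new
// instruction for CSE and external-use bookkeeping.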
17784 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
17785 Type *Ty) {
17786 Value *Scalar = V;
17787 if (Scalar->getType() != Ty) {
17788 assert(Scalar->getType()->isIntOrIntVectorTy() &&
17789 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
17790 Value *V = Scalar;
17791 if (auto *CI = dyn_cast<CastInst>(Scalar);
17792 isa_and_present<SExtInst, ZExtInst>(CI)) {
17793 Value *Op = CI->getOperand(0);
17794 if (auto *IOp = dyn_cast<Instruction>(Op);
17795 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
17796 V = Op;
17797 }
17798 Scalar = Builder.CreateIntCast(
17799 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
17800 }
17801
17802 Instruction *InsElt;
17803 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
17804 assert(SLPReVec && "FixedVectorType is not expected.");
17805 Vec =
17806 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
17807 auto *II = dyn_cast<Instruction>(Vec);
17808 if (!II)
17809 return Vec;
17810 InsElt = II;
17811 } else {
17812 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
17813 InsElt = dyn_cast<InsertElementInst>(Vec);
17814 if (!InsElt)
17815 return Vec;
17816 }
17817 GatherShuffleExtractSeq.insert(InsElt);
17818 CSEBlocks.insert(InsElt->getParent());
17819 // Add to our 'need-to-extract' list.
17820 if (isa<Instruction>(V)) {
17821 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
17822 // Find which lane we need to extract.
17823 User *UserOp = nullptr;
17824 if (Scalar != V) {
17825 if (auto *SI = dyn_cast<Instruction>(Scalar))
17826 UserOp = SI;
17827 } else {
17828 if (V->getType()->isVectorTy()) {
17829 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
17830 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
17831 // Find shufflevector, caused by resize.
17832 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
17833 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
17834 if (SV->getOperand(0) == V)
17835 return SV;
17836 if (SV->getOperand(1) == V)
17837 return SV;
17838 }
17839 return nullptr;
17840 };
17841 InsElt = nullptr;
17842 if (Instruction *User = FindOperand(SV->getOperand(0), V))
17843 InsElt = User;
17844 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
17845 InsElt = User;
17846 assert(InsElt &&
17847 "Failed to find shufflevector, caused by resize.");
17848 }
17849 }
17850 UserOp = InsElt;
17851 }
17852 if (UserOp) {
17853 unsigned FoundLane = Entries.front()->findLaneForValue(V);
17854 ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
17855 }
17856 }
17857 }
17858 return Vec;
17859 };
17860 auto *VecTy = getWidenedType(ScalarTy, VL.size());
17861 Value *Vec = PoisonValue::get(VecTy);
17862 SmallVector<int> NonConsts;
17863 SmallVector<int> Mask(VL.size());
17864 std::iota(Mask.begin(), Mask.end(), 0);
17865 Value *OriginalRoot = Root;
17866 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
17867 SV && isa<PoisonValue>(SV->getOperand(1)) &&
17868 SV->getOperand(0)->getType() == VecTy) {
17869 Root = SV->getOperand(0);
17870 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
17871 }
17872 // Insert constant values first.
17873 for (int I = 0, E = VL.size(); I < E; ++I) {
17874 if (PostponedIndices.contains(I))
17875 continue;
17876 if (!isConstant(VL[I])) {
17877 NonConsts.push_back(I);
17878 continue;
17879 }
17880 if (isa<PoisonValue>(VL[I]))
17881 continue;
17882 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17883 Mask[I] = I + E;
17884 }
17885 if (Root) {
17886 if (isa<PoisonValue>(Vec)) {
17887 Vec = OriginalRoot;
17888 } else {
17889 Vec = CreateShuffle(Root, Vec, Mask);
17890 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
17891 OI && OI->use_empty() &&
17892 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
17893 return TE->VectorizedValue == OI;
17894 }))
17895 eraseInstruction(OI);
17896 }
17897 }
17898 // Insert non-constant values.
17899 for (int I : NonConsts)
17900 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
17901 // Append instructions which are/may be part of the loop at the end, to make
17902 // it possible to hoist non-loop-based instructions.
17903 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
17904 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
17905
17906 return Vec;
17907}
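// Illustrative example (assumed values, not from the original source):
// gathering VL = {i32 7, i32 %x, i32 7, i32 7} with no Root emits roughly
//   %c0 = insertelement <4 x i32> poison, i32 7, i32 0
//   ...                                             ; lanes 2 and 3 likewise
//   %g  = insertelement <4 x i32> %cN, i32 %x, i32 1
// Constants are inserted first, then the non-constant scalars; scalars that
// are defined inside the current loop are postponed and inserted last so the
// loop-invariant prefix of the sequence can still be hoisted.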
17908
17909/// Merges shuffle masks and emits the final shuffle instruction, if required.
17910/// It supports shuffling of 2 input vectors. It implements lazy shuffle
17911/// emission, where the actual shuffle instruction is generated only if it is
17912/// really required. Otherwise, the shuffle instruction emission is delayed
17913/// till the end of the process, to reduce the number of emitted instructions
17914/// and to simplify further analysis/transformations.
17915/// The class will also look through the previously emitted shuffle
17916/// instructions and properly mark indices in the mask as undef.
17917/// For example, given the code
17918/// \code
17919/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
17920/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
17921/// \endcode
17922/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
17923/// look through %s1 and %s2 and emit
17924/// \code
17925/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17926/// \endcode
17927/// instead.
17928/// If the 2 operands are of different sizes, the smaller one will be resized and
17929/// the mask recalculated properly.
17930/// For example, given the code
17931/// \code
17932/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
17933/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
17934/// \endcode
17935/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
17936/// look through %s1 and %s2 and emit
17937/// \code
17938/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
17939/// \endcode
17940/// instead.
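/// A typical, illustrative usage (mirroring the FinalShuffle helper further
/// down in this file) is:
/// \code
/// ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
/// ShuffleBuilder.add(V, Mask);  // only records the operand and the mask
/// Value *Res = ShuffleBuilder.finalize(ReuseShuffleIndices, {}, {});
/// \endcode
/// add() emits no shuffle unless operands must be combined; the remaining
/// shuffles are produced by finalize().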
17941class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
17942 bool IsFinalized = false;
17943 /// Combined mask for all applied operands and masks. It is built during
17944 /// analysis and actual emission of shuffle vector instructions.
17945 SmallVector<int> CommonMask;
17946 /// List of operands for the shuffle vector instruction. It holds at most 2
17947 /// operands. If a 3rd one is going to be added, the first 2 are combined into
17948 /// a shuffle with the \p CommonMask mask, the first operand is set to the
17949 /// resulting shuffle and the second operand is set to the newly added
17950 /// operand. The \p CommonMask is transformed in the proper way after that.
17951 SmallVector<Value *, 2> InVectors;
17952 IRBuilderBase &Builder;
17953 BoUpSLP &R;
17954
17955 class ShuffleIRBuilder {
17956 IRBuilderBase &Builder;
17957 /// Holds all of the instructions that we gathered.
17958 SetVector<Instruction *> &GatherShuffleExtractSeq;
17959 /// A list of blocks that we are going to CSE.
17960 DenseSet<BasicBlock *> &CSEBlocks;
17961 /// Data layout.
17962 const DataLayout &DL;
17963
17964 public:
17965 ShuffleIRBuilder(IRBuilderBase &Builder,
17966 SetVector<Instruction *> &GatherShuffleExtractSeq,
17967 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
17968 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
17969 CSEBlocks(CSEBlocks), DL(DL) {}
17970 ~ShuffleIRBuilder() = default;
17971 /// Creates shufflevector for the 2 operands with the given mask.
17972 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
17973 if (V1->getType() != V2->getType()) {
17974 assert(V1->getType()->isIntOrIntVectorTy() &&
17975 V2->getType()->isIntOrIntVectorTy() &&
17976 "Expected integer vector types only.");
17977 if (V1->getType() != V2->getType()) {
17978 if (cast<VectorType>(V2->getType())
17979 ->getElementType()
17980 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
17981 ->getElementType()
17982 ->getIntegerBitWidth())
17983 V2 = Builder.CreateIntCast(
17984 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
17985 else
17986 V1 = Builder.CreateIntCast(
17987 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
17988 }
17989 }
17990 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
17991 if (auto *I = dyn_cast<Instruction>(Vec)) {
17992 GatherShuffleExtractSeq.insert(I);
17993 CSEBlocks.insert(I->getParent());
17994 }
17995 return Vec;
17996 }
17997 /// Creates permutation of the single vector operand with the given mask, if
17998 /// it is not identity mask.
17999 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
18000 if (Mask.empty())
18001 return V1;
18002 unsigned VF = Mask.size();
18003 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
18004 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
18005 return V1;
18006 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
18007 if (auto *I = dyn_cast<Instruction>(Vec)) {
18008 GatherShuffleExtractSeq.insert(I);
18009 CSEBlocks.insert(I->getParent());
18010 }
18011 return Vec;
18012 }
18013 Value *createIdentity(Value *V) { return V; }
18014 Value *createPoison(Type *Ty, unsigned VF) {
18015 return PoisonValue::get(getWidenedType(Ty, VF));
18016 }
18017 /// Resizes the 2 input vectors to match in size, if they are not equal
18018 /// yet. The smaller vector is resized to the size of the larger vector.
18019 void resizeToMatch(Value *&V1, Value *&V2) {
18020 if (V1->getType() == V2->getType())
18021 return;
18022 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
18023 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
18024 int VF = std::max(V1VF, V2VF);
18025 int MinVF = std::min(V1VF, V2VF);
18026 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
18027 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
18028 0);
18029 Value *&Op = MinVF == V1VF ? V1 : V2;
18030 Op = Builder.CreateShuffleVector(Op, IdentityMask);
18031 if (auto *I = dyn_cast<Instruction>(Op)) {
18032 GatherShuffleExtractSeq.insert(I);
18033 CSEBlocks.insert(I->getParent());
18034 }
18035 if (MinVF == V1VF)
18036 V1 = Op;
18037 else
18038 V2 = Op;
18039 }
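// Illustrative example (not from the original source): resizing a
// <2 x i32> %v to match a <4 x i32> operand emits
//   %r = shufflevector <2 x i32> %v, <2 x i32> poison,
//                      <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
// i.e. an identity mask padded with poison up to the larger VF.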
18040 };
18041
18042 /// Smart shuffle instruction emission, walks through shuffles trees and
18043 /// tries to find the best matching vector for the actual shuffle
18044 /// instruction.
18045 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
18046 assert(V1 && "Expected at least one vector value.");
18047 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
18048 R.CSEBlocks, *R.DL);
18049 return BaseShuffleAnalysis::createShuffle<Value *>(
18050 V1, V2, Mask, ShuffleBuilder, ScalarTy);
18051 }
18052
18053 /// Cast value \p V to the vector type with the same number of elements, but
18054 /// the base type \p ScalarTy.
18055 Value *castToScalarTyElem(Value *V,
18056 std::optional<bool> IsSigned = std::nullopt) {
18057 auto *VecTy = cast<VectorType>(V->getType());
18058 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
18059 if (VecTy->getElementType() == ScalarTy->getScalarType())
18060 return V;
18061 return Builder.CreateIntCast(
18062 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
18063 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
18064 }
18065
18066 Value *getVectorizedValue(const TreeEntry &E) {
18067 Value *Vec = E.VectorizedValue;
18068 if (!Vec->getType()->isIntOrIntVectorTy())
18069 return Vec;
18070 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
18071 return !isa<PoisonValue>(V) &&
18072 !isKnownNonNegative(
18073 V, SimplifyQuery(*R.DL));
18074 }));
18075 }
18076
18077public:
18078 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
18079 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
18080
18081 /// Adjusts extractelements after reusing them.
18082 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
18083 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
18084 unsigned NumParts, bool &UseVecBaseAsInput) {
18085 UseVecBaseAsInput = false;
18086 SmallPtrSet<Value *, 4> UniqueBases;
18087 Value *VecBase = nullptr;
18088 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
18089 if (!E->ReorderIndices.empty()) {
18090 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18091 E->ReorderIndices.end());
18092 reorderScalars(VL, ReorderMask);
18093 }
18094 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18095 int Idx = Mask[I];
18096 if (Idx == PoisonMaskElem)
18097 continue;
18098 auto *EI = cast<ExtractElementInst>(VL[I]);
18099 VecBase = EI->getVectorOperand();
18100 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
18101 VecBase = TEs.front()->VectorizedValue;
18102 assert(VecBase && "Expected vectorized value.");
18103 UniqueBases.insert(VecBase);
18104 // If the extractelement's only use is vectorized, the extractelement
18105 // itself can be deleted.
18106 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
18107 (NumParts != 1 && count(VL, EI) > 1) ||
18108 any_of(EI->users(), [&](User *U) {
18109 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
18110 return UTEs.empty() || UTEs.size() > 1 ||
18111 (isa<GetElementPtrInst>(U) &&
18112 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
18113 (!UTEs.empty() &&
18114 count_if(R.VectorizableTree,
18115 [&](const std::unique_ptr<TreeEntry> &TE) {
18116 return TE->UserTreeIndex.UserTE ==
18117 UTEs.front() &&
18118 is_contained(VL, EI);
18119 }) != 1);
18120 }))
18121 continue;
18122 R.eraseInstruction(EI);
18123 }
18124 if (NumParts == 1 || UniqueBases.size() == 1) {
18125 assert(VecBase && "Expected vectorized value.");
18126 return castToScalarTyElem(VecBase);
18127 }
18128 UseVecBaseAsInput = true;
18129 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
18130 for (auto [I, Idx] : enumerate(Mask))
18131 if (Idx != PoisonMaskElem)
18132 Idx = I;
18133 };
18134 // Perform a multi-register vector shuffle, joining the registers into a
18135 // single virtual long vector.
18136 // Each part needs to be shuffled independently and then all these parts are
18137 // inserted into a long virtual vector register, forming the original vector.
18138 Value *Vec = nullptr;
18139 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18140 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
18141 for (unsigned Part : seq<unsigned>(NumParts)) {
18142 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
18143 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
18144 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
18145 constexpr int MaxBases = 2;
18146 SmallVector<Value *, MaxBases> Bases(MaxBases);
18147 auto VLMask = zip(SubVL, SubMask);
18148 const unsigned VF = std::accumulate(
18149 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
18150 if (std::get<1>(D) == PoisonMaskElem)
18151 return S;
18152 Value *VecOp =
18153 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
18154 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
18155 !TEs.empty())
18156 VecOp = TEs.front()->VectorizedValue;
18157 assert(VecOp && "Expected vectorized value.");
18158 const unsigned Size =
18159 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18160 return std::max(S, Size);
18161 });
18162 for (const auto [V, I] : VLMask) {
18163 if (I == PoisonMaskElem)
18164 continue;
18165 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
18166 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
18167 VecOp = TEs.front()->VectorizedValue;
18168 assert(VecOp && "Expected vectorized value.");
18169 VecOp = castToScalarTyElem(VecOp);
18170 Bases[I / VF] = VecOp;
18171 }
18172 if (!Bases.front())
18173 continue;
18174 Value *SubVec;
18175 if (Bases.back()) {
18176 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18177 TransformToIdentity(SubMask);
18178 } else {
18179 SubVec = Bases.front();
18180 }
18181 if (!Vec) {
18182 Vec = SubVec;
18183 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
18184 [&](unsigned P) {
18185 ArrayRef<int> SubMask =
18186 Mask.slice(P * SliceSize,
18187 getNumElems(Mask.size(),
18188 SliceSize, P));
18189 return all_of(SubMask, [](int Idx) {
18190 return Idx == PoisonMaskElem;
18191 });
18192 })) &&
18193 "Expected first part or all previous parts masked.");
18194 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18195 } else {
18196 unsigned NewVF =
18197 cast<FixedVectorType>(Vec->getType())->getNumElements();
18198 if (Vec->getType() != SubVec->getType()) {
18199 unsigned SubVecVF =
18200 cast<FixedVectorType>(SubVec->getType())->getNumElements();
18201 NewVF = std::max(NewVF, SubVecVF);
18202 }
18203 // Adjust SubMask.
18204 for (int &Idx : SubMask)
18205 if (Idx != PoisonMaskElem)
18206 Idx += NewVF;
18207 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18208 Vec = createShuffle(Vec, SubVec, VecMask);
18209 TransformToIdentity(VecMask);
18210 }
18211 }
18212 copy(VecMask, Mask.begin());
18213 return Vec;
18214 }
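  // Illustrative example (assumed values): if the gathered scalars are
  //   %e0 = extractelement <2 x float> %vec, i32 1
  //   %e1 = extractelement <2 x float> %vec, i32 0
  // and %vec is the single (already vectorized) base, the gather degenerates
  // into a single shuffle of that base,
  //   %r = shufflevector <2 x float> %vec, <2 x float> poison, <2 x i32> <i32 1, i32 0>
  // and extractelements whose only use was vectorized can be erased.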
18215 /// Checks if the specified entry \p E needs to be delayed because of its
18216 /// dependency nodes.
18217 std::optional<Value *>
18218 needToDelay(const TreeEntry *E,
18219 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
18220 // No need to delay emission if all deps are ready.
18221 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
18222 return all_of(
18223 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18224 }))
18225 return std::nullopt;
18226 // Postpone gather emission; it will be emitted after the end of the
18227 // process to keep the correct order.
18228 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18229 return Builder.CreateAlignedLoad(
18230 ResVecTy,
18231 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
18232 MaybeAlign());
18233 }
18234 /// Reset the builder to handle perfect diamond match.
18235 void resetForSameNode() {
18236 IsFinalized = false;
18237 CommonMask.clear();
18238 InVectors.clear();
18239 }
18240 /// Adds 2 input vectors (in form of tree entries) and the mask for their
18241 /// shuffling.
18242 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
18243 Value *V1 = getVectorizedValue(E1);
18244 Value *V2 = getVectorizedValue(E2);
18245 add(V1, V2, Mask);
18246 }
18247 /// Adds single input vector (in form of tree entry) and the mask for its
18248 /// shuffling.
18249 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
18250 Value *V1 = getVectorizedValue(E1);
18251 add(V1, Mask);
18252 }
18253 /// Adds 2 input vectors and the mask for their shuffling.
18254 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
18255 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18256 assert(isa<FixedVectorType>(V1->getType()) &&
18257 isa<FixedVectorType>(V2->getType()) &&
18258 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18259 V1 = castToScalarTyElem(V1);
18260 V2 = castToScalarTyElem(V2);
18261 if (InVectors.empty()) {
18262 InVectors.push_back(V1);
18263 InVectors.push_back(V2);
18264 CommonMask.assign(Mask.begin(), Mask.end());
18265 return;
18266 }
18267 Value *Vec = InVectors.front();
18268 if (InVectors.size() == 2) {
18269 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18270 transformMaskAfterShuffle(CommonMask, CommonMask);
18271 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
18272 Mask.size()) {
18273 Vec = createShuffle(Vec, nullptr, CommonMask);
18274 transformMaskAfterShuffle(CommonMask, CommonMask);
18275 }
18276 V1 = createShuffle(V1, V2, Mask);
18277 unsigned VF = std::max(getVF(V1), getVF(Vec));
18278 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18279 if (Mask[Idx] != PoisonMaskElem)
18280 CommonMask[Idx] = Idx + VF;
18281 InVectors.front() = Vec;
18282 if (InVectors.size() == 2)
18283 InVectors.back() = V1;
18284 else
18285 InVectors.push_back(V1);
18286 }
18287 /// Adds another input vector and the mask for its shuffling.
18288 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
18289 assert(isa<FixedVectorType>(V1->getType()) &&
18290 "castToScalarTyElem expects V1 to be FixedVectorType");
18291 V1 = castToScalarTyElem(V1);
18292 if (InVectors.empty()) {
18293 InVectors.push_back(V1);
18294 CommonMask.assign(Mask.begin(), Mask.end());
18295 return;
18296 }
18297 const auto *It = find(InVectors, V1);
18298 if (It == InVectors.end()) {
18299 if (InVectors.size() == 2 ||
18300 InVectors.front()->getType() != V1->getType()) {
18301 Value *V = InVectors.front();
18302 if (InVectors.size() == 2) {
18303 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18304 transformMaskAfterShuffle(CommonMask, CommonMask);
18305 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
18306 CommonMask.size()) {
18307 V = createShuffle(InVectors.front(), nullptr, CommonMask);
18308 transformMaskAfterShuffle(CommonMask, CommonMask);
18309 }
18310 unsigned VF = std::max(CommonMask.size(), Mask.size());
18311 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18312 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
18313 CommonMask[Idx] = V->getType() != V1->getType()
18314 ? Idx + VF
18315 : Mask[Idx] + getVF(V1);
18316 if (V->getType() != V1->getType())
18317 V1 = createShuffle(V1, nullptr, Mask);
18318 InVectors.front() = V;
18319 if (InVectors.size() == 2)
18320 InVectors.back() = V1;
18321 else
18322 InVectors.push_back(V1);
18323 return;
18324 }
18325 // Check if second vector is required if the used elements are already
18326 // used from the first one.
18327 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18328 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
18329 InVectors.push_back(V1);
18330 break;
18331 }
18332 }
18333 unsigned VF = 0;
18334 for (Value *V : InVectors)
18335 VF = std::max(VF, getVF(V));
18336 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
18337 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
18338 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
18339 }
18340 /// Adds another input vector and the reorder indices for its shuffling.
18341 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
18342 SmallVector<int> NewMask;
18343 inversePermutation(Order, NewMask);
18344 add(V1, NewMask);
18345 }
18346 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
18347 Value *Root = nullptr) {
18348 return R.gather(VL, Root, ScalarTy,
18349 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
18350 return createShuffle(V1, V2, Mask);
18351 });
18352 }
18353 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
18354 /// Finalize emission of the shuffles.
18355 /// \param Action the action (if any) to be performed before final applying of
18356 /// the \p ExtMask mask.
18357 Value *finalize(
18358 ArrayRef<int> ExtMask,
18359 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
18360 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
18361 function_ref<void(Value *&, SmallVectorImpl<int> &,
18362 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
18363 Action = {}) {
18364 IsFinalized = true;
18365 if (Action) {
18366 Value *Vec = InVectors.front();
18367 if (InVectors.size() == 2) {
18368 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18369 InVectors.pop_back();
18370 } else {
18371 Vec = createShuffle(Vec, nullptr, CommonMask);
18372 }
18373 transformMaskAfterShuffle(CommonMask, CommonMask);
18374 assert(VF > 0 &&
18375 "Expected vector length for the final value before action.");
18376 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
18377 if (VecVF < VF) {
18378 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18379 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
18380 Vec = createShuffle(Vec, nullptr, ResizeMask);
18381 }
18382 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
18383 return createShuffle(V1, V2, Mask);
18384 });
18385 InVectors.front() = Vec;
18386 }
18387 if (!SubVectors.empty()) {
18388 Value *Vec = InVectors.front();
18389 if (InVectors.size() == 2) {
18390 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18391 InVectors.pop_back();
18392 } else {
18393 Vec = createShuffle(Vec, nullptr, CommonMask);
18394 }
18395 transformMaskAfterShuffle(CommonMask, CommonMask);
18396 auto CreateSubVectors = [&](Value *Vec,
18397 SmallVectorImpl<int> &CommonMask) {
18398 for (auto [E, Idx] : SubVectors) {
18399 Value *V = getVectorizedValue(*E);
18400 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
18401 // Use the scalar version of ScalarTy to correctly handle shuffles for
18402 // revectorization. The revectorization mode operates on vectors, but
18403 // here we need to operate on the scalars, because the masks were
18404 // already transformed for the vector elements and we don't need to do
18405 // this transformation again.
18406 Type *OrigScalarTy = ScalarTy;
18407 ScalarTy = ScalarTy->getScalarType();
18408 Vec = createInsertVector(
18409 Builder, Vec, V, InsertionIndex,
18410 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
18411 _3));
18412 ScalarTy = OrigScalarTy;
18413 if (!CommonMask.empty()) {
18414 std::iota(std::next(CommonMask.begin(), Idx),
18415 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
18416 Idx);
18417 }
18418 }
18419 return Vec;
18420 };
18421 if (SubVectorsMask.empty()) {
18422 Vec = CreateSubVectors(Vec, CommonMask);
18423 } else {
18424 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
18425 copy(SubVectorsMask, SVMask.begin());
18426 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
18427 if (I2 != PoisonMaskElem) {
18428 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
18429 I1 = I2 + CommonMask.size();
18430 }
18431 }
18432 Value *InsertVec =
18433 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
18434 Vec = createShuffle(InsertVec, Vec, SVMask);
18435 transformMaskAfterShuffle(CommonMask, SVMask);
18436 }
18437 InVectors.front() = Vec;
18438 }
18439
18440 if (!ExtMask.empty()) {
18441 if (CommonMask.empty()) {
18442 CommonMask.assign(ExtMask.begin(), ExtMask.end());
18443 } else {
18444 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
18445 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
18446 if (ExtMask[I] == PoisonMaskElem)
18447 continue;
18448 NewMask[I] = CommonMask[ExtMask[I]];
18449 }
18450 CommonMask.swap(NewMask);
18451 }
18452 }
18453 if (CommonMask.empty()) {
18454 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
18455 return InVectors.front();
18456 }
18457 if (InVectors.size() == 2)
18458 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
18459 return createShuffle(InVectors.front(), nullptr, CommonMask);
18460 }
18461
18462 ~ShuffleInstructionBuilder() {
18463 assert((IsFinalized || CommonMask.empty()) &&
18464 "Shuffle construction must be finalized.");
18465 }
18466};
18467
18468Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
18469 return vectorizeTree(getOperandEntry(E, NodeIdx));
18470}
18471
18472template <typename BVTy, typename ResTy, typename... Args>
18473ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
18474 Args &...Params) {
18475 assert(E->isGather() && "Expected gather node.");
18476 unsigned VF = E->getVectorFactor();
18477
18478 bool NeedFreeze = false;
18479 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
18480 // Clear values, to be replaced by insertvector instructions.
18481 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
18482 for_each(MutableArrayRef(GatheredScalars)
18483 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
18484 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
18485 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
18486 E->CombinedEntriesWithIndices.size());
18487 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
18488 [&](const auto &P) {
18489 return std::make_pair(VectorizableTree[P.first].get(), P.second);
18490 });
18491 // Build a mask out of the reorder indices and reorder scalars per this
18492 // mask.
18493 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
18494 E->ReorderIndices.end());
18495 if (!ReorderMask.empty())
18496 reorderScalars(GatheredScalars, ReorderMask);
18497 SmallVector<int> SubVectorsMask;
18498 inversePermutation(E->ReorderIndices, SubVectorsMask);
18499 // Transform non-clustered elements in the mask to poison (-1).
18500 // "Clustered" operations will be reordered using this mask later.
18501 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
18502 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
18503 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
18504 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
18505 } else {
18506 SubVectorsMask.clear();
18507 }
18508 SmallVector<Value *> StoredGS(GatheredScalars);
18509 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
18510 unsigned I, unsigned SliceSize,
18511 bool IsNotPoisonous) {
18512 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
18513 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18514 }))
18515 return false;
18516 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
18517 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
18518 if (UserTE->getNumOperands() != 2)
18519 return false;
18520 if (!IsNotPoisonous) {
18521 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
18522 [=](const std::unique_ptr<TreeEntry> &TE) {
18523 return TE->UserTreeIndex.UserTE == UserTE &&
18524 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
18525 });
18526 if (It == VectorizableTree.end())
18527 return false;
18528 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
18529 if (!(*It)->ReorderIndices.empty()) {
18530 inversePermutation((*It)->ReorderIndices, ReorderMask);
18531 reorderScalars(GS, ReorderMask);
18532 }
18533 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
18534 Value *V0 = std::get<0>(P);
18535 Value *V1 = std::get<1>(P);
18536 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
18537 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
18538 is_contained(E->Scalars, V1));
18539 }))
18540 return false;
18541 }
18542 int Idx;
18543 if ((Mask.size() < InputVF &&
18544 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
18545 Idx == 0) ||
18546 (Mask.size() == InputVF &&
18547 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
18548 std::iota(
18549 std::next(Mask.begin(), I * SliceSize),
18550 std::next(Mask.begin(),
18551 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18552 0);
18553 } else {
18554 unsigned IVal =
18555 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
18556 std::fill(
18557 std::next(Mask.begin(), I * SliceSize),
18558 std::next(Mask.begin(),
18559 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
18560 IVal);
18561 }
18562 return true;
18563 };
18564 BVTy ShuffleBuilder(ScalarTy, Params...);
18565 ResTy Res = ResTy();
18566 SmallVector<int> Mask;
18567 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
18568 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
18569 Value *ExtractVecBase = nullptr;
18570 bool UseVecBaseAsInput = false;
18571 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
18572 SmallVector<SmallVector<const TreeEntry *>> Entries;
18573 Type *OrigScalarTy = GatheredScalars.front()->getType();
18574 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
18575 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
18576 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
18577 // Check for gathered extracts.
18578 bool Resized = false;
18579 ExtractShuffles =
18580 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
18581 if (!ExtractShuffles.empty()) {
18582 SmallVector<const TreeEntry *> ExtractEntries;
18583 for (auto [Idx, I] : enumerate(ExtractMask)) {
18584 if (I == PoisonMaskElem)
18585 continue;
18586 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
18587 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
18588 !TEs.empty())
18589 ExtractEntries.append(TEs.begin(), TEs.end());
18590 }
18591 if (std::optional<ResTy> Delayed =
18592 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
18593 // Delay emission of gathers which are not ready yet.
18594 PostponedGathers.insert(E);
18595 // Postpone gather emission; it will be emitted after the end of the
18596 // process to keep the correct order.
18597 return *Delayed;
18598 }
18599 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
18600 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
18601 ExtractVecBase = VecBase;
18602 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
18603 if (VF == VecBaseTy->getNumElements() &&
18604 GatheredScalars.size() != VF) {
18605 Resized = true;
18606 GatheredScalars.append(VF - GatheredScalars.size(),
18607 PoisonValue::get(OrigScalarTy));
18608 NumParts =
18609 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF), VF);
18610 }
18611 }
18612 }
18613 // Gather extracts only after we check for fully matched gathers.
18614 if (!ExtractShuffles.empty() || !E->hasState() ||
18615 E->getOpcode() != Instruction::Load ||
18616 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
18617 any_of(E->Scalars, IsaPred<LoadInst>)) &&
18618 any_of(E->Scalars,
18619 [this](Value *V) {
18620 return isa<LoadInst>(V) && isVectorized(V);
18621 })) ||
18622 (E->hasState() && E->isAltShuffle()) ||
18623 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
18624 isSplat(E->Scalars) ||
18625 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
18626 GatherShuffles =
18627 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
18628 }
18629 if (!GatherShuffles.empty()) {
18630 if (std::optional<ResTy> Delayed =
18631 ShuffleBuilder.needToDelay(E, Entries)) {
18632 // Delay emission of gathers which are not ready yet.
18633 PostponedGathers.insert(E);
18634 // Postpone gather emission; it will be emitted after the end of the
18635 // process to keep the correct order.
18636 return *Delayed;
18637 }
18638 if (GatherShuffles.size() == 1 &&
18639 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
18640 Entries.front().front()->isSame(E->Scalars)) {
18641 // Perfect match in the graph; we will reuse the previously vectorized
18642 // node. The cost is 0.
18643 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
18644 << shortBundleName(E->Scalars, E->Idx) << ".\n");
18645 // Restore the mask for previous partially matched values.
18646 Mask.resize(E->Scalars.size());
18647 const TreeEntry *FrontTE = Entries.front().front();
18648 if (FrontTE->ReorderIndices.empty() &&
18649 ((FrontTE->ReuseShuffleIndices.empty() &&
18650 E->Scalars.size() == FrontTE->Scalars.size()) ||
18651 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
18652 std::iota(Mask.begin(), Mask.end(), 0);
18653 } else {
18654 for (auto [I, V] : enumerate(E->Scalars)) {
18655 if (isa<PoisonValue>(V)) {
18656 Mask[I] = PoisonMaskElem;
18657 continue;
18658 }
18659 Mask[I] = FrontTE->findLaneForValue(V);
18660 }
18661 }
18662 // Reset the builder(s) to correctly handle perfect diamond matched
18663 // nodes.
18664 ShuffleBuilder.resetForSameNode();
18665 ShuffleBuilder.add(*FrontTE, Mask);
18666 // Full matched entry found, no need to insert subvectors.
18667 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
18668 return Res;
18669 }
18670 if (!Resized) {
18671 if (GatheredScalars.size() != VF &&
18672 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
18673 return any_of(TEs, [&](const TreeEntry *TE) {
18674 return TE->getVectorFactor() == VF;
18675 });
18676 }))
18677 GatheredScalars.append(VF - GatheredScalars.size(),
18678 PoisonValue::get(OrigScalarTy));
18679 }
18680 // Remove shuffled elements from list of gathers.
18681 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18682 if (Mask[I] != PoisonMaskElem)
18683 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18684 }
18685 }
18686 }
18687 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
18688 SmallVectorImpl<int> &ReuseMask,
18689 bool IsRootPoison) {
18690 // For splats we can emit broadcasts instead of gathers, so try to find
18691 // such sequences.
18692 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
18693 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
18694 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
18695 SmallVector<int> UndefPos;
18696 DenseMap<Value *, unsigned> UniquePositions;
18697 // Gather unique non-const values and all constant values.
18698 // For repeated values, just shuffle them.
18699 int NumNonConsts = 0;
18700 int SinglePos = 0;
18701 for (auto [I, V] : enumerate(Scalars)) {
18702 if (isa<UndefValue>(V)) {
18703 if (!isa<PoisonValue>(V)) {
18704 ReuseMask[I] = I;
18705 UndefPos.push_back(I);
18706 }
18707 continue;
18708 }
18709 if (isConstant(V)) {
18710 ReuseMask[I] = I;
18711 continue;
18712 }
18713 ++NumNonConsts;
18714 SinglePos = I;
18715 Value *OrigV = V;
18716 Scalars[I] = PoisonValue::get(OrigScalarTy);
18717 if (IsSplat) {
18718 Scalars.front() = OrigV;
18719 ReuseMask[I] = 0;
18720 } else {
18721 const auto Res = UniquePositions.try_emplace(OrigV, I);
18722 Scalars[Res.first->second] = OrigV;
18723 ReuseMask[I] = Res.first->second;
18724 }
18725 }
18726 if (NumNonConsts == 1) {
18727 // Restore single insert element.
18728 if (IsSplat) {
18729 ReuseMask.assign(VF, PoisonMaskElem);
18730 std::swap(Scalars.front(), Scalars[SinglePos]);
18731 if (!UndefPos.empty() && UndefPos.front() == 0)
18732 Scalars.front() = UndefValue::get(OrigScalarTy);
18733 }
18734 ReuseMask[SinglePos] = SinglePos;
18735 } else if (!UndefPos.empty() && IsSplat) {
18736 // For undef values, try to replace them with the simple broadcast.
18737 // We can do it if the broadcasted value is guaranteed to be
18738 // non-poisonous, or by freezing the incoming scalar value first.
18739 auto *It = find_if(Scalars, [this, E](Value *V) {
18740 return !isa<UndefValue>(V) &&
18741 (isGuaranteedNotToBePoison(V, AC) ||
18742 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
18743 // Check if the value already used in the same operation in
18744 // one of the nodes already.
18745 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
18746 is_contained(E->UserTreeIndex.UserTE->Scalars,
18747 U.getUser());
18748 })));
18749 });
18750 if (It != Scalars.end()) {
18751 // Replace undefs by the non-poisoned scalars and emit broadcast.
18752 int Pos = std::distance(Scalars.begin(), It);
18753 for (int I : UndefPos) {
18754 // Set the undef position to the non-poisoned scalar.
18755 ReuseMask[I] = Pos;
18756 // Replace the undef by poison; in the mask it is already replaced by
18757 // the non-poisoned scalar.
18758 if (I != Pos)
18759 Scalars[I] = PoisonValue::get(OrigScalarTy);
18760 }
18761 } else {
18762 // Replace undefs by the poisons, emit broadcast and then emit
18763 // freeze.
18764 for (int I : UndefPos) {
18765 ReuseMask[I] = PoisonMaskElem;
18766 if (isa<UndefValue>(Scalars[I]))
18767 Scalars[I] = PoisonValue::get(OrigScalarTy);
18768 }
18769 NeedFreeze = true;
18770 }
18771 }
18772 };
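  // Illustrative sketch (not from the original source) of the splat path in
  // TryPackScalars: packing {%x, undef, %x, %x} keeps a single copy of %x in
  // lane 0, so the later gather/shuffle becomes a broadcast of the form
  //   %v = insertelement <4 x i32> poison, i32 %x, i32 0
  //   %b = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> zeroinitializer
  // The undef lane is either remapped to %x (if %x is known non-poisonous) or
  // left as poison with NeedFreeze set, so the result is frozen afterwards.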
18773 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
18774 bool IsNonPoisoned = true;
18775 bool IsUsedInExpr = true;
18776 Value *Vec1 = nullptr;
18777 if (!ExtractShuffles.empty()) {
18778 // Gather of extractelements can be represented as just a shuffle of
18779 // a single/two vectors the scalars are extracted from.
18780 // Find input vectors.
18781 Value *Vec2 = nullptr;
18782 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18783 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
18784 ExtractMask[I] = PoisonMaskElem;
18785 }
18786 if (UseVecBaseAsInput) {
18787 Vec1 = ExtractVecBase;
18788 } else {
18789 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
18790 if (ExtractMask[I] == PoisonMaskElem)
18791 continue;
18792 if (isa<UndefValue>(StoredGS[I]))
18793 continue;
18794 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
18795 Value *VecOp = EI->getVectorOperand();
18796 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
18797 !TEs.empty() && TEs.front()->VectorizedValue)
18798 VecOp = TEs.front()->VectorizedValue;
18799 if (!Vec1) {
18800 Vec1 = VecOp;
18801 } else if (Vec1 != VecOp) {
18802 assert((!Vec2 || Vec2 == VecOp) &&
18803 "Expected only 1 or 2 vectors shuffle.");
18804 Vec2 = VecOp;
18805 }
18806 }
18807 }
18808 if (Vec2) {
18809 IsUsedInExpr = false;
18810 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
18811 isGuaranteedNotToBePoison(Vec2, AC);
18812 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
18813 } else if (Vec1) {
18814 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
18815 IsUsedInExpr &= FindReusedSplat(
18816 ExtractMask,
18817 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
18818 ExtractMask.size(), IsNotPoisonedVec);
18819 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
18820 IsNonPoisoned &= IsNotPoisonedVec;
18821 } else {
18822 IsUsedInExpr = false;
18823 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
18824 /*ForExtracts=*/true);
18825 }
18826 }
18827 if (!GatherShuffles.empty()) {
18828 unsigned SliceSize =
18829 getPartNumElems(E->Scalars.size(),
18830 ::getNumberOfParts(*TTI, VecTy, E->Scalars.size()));
18831 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
18832 for (const auto [I, TEs] : enumerate(Entries)) {
18833 if (TEs.empty()) {
18834 assert(!GatherShuffles[I] &&
18835 "No shuffles with empty entries list expected.");
18836 continue;
18837 }
18838 assert((TEs.size() == 1 || TEs.size() == 2) &&
18839 "Expected shuffle of 1 or 2 entries.");
18840 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
18841 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
18842 VecMask.assign(VecMask.size(), PoisonMaskElem);
18843 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
18844 if (TEs.size() == 1) {
18845 bool IsNotPoisonedVec =
18846 TEs.front()->VectorizedValue
18847 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
18848 : true;
18849 IsUsedInExpr &=
18850 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
18851 SliceSize, IsNotPoisonedVec);
18852 ShuffleBuilder.add(*TEs.front(), VecMask);
18853 IsNonPoisoned &= IsNotPoisonedVec;
18854 } else {
18855 IsUsedInExpr = false;
18856 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
18857 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
18858 IsNonPoisoned &=
18859 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
18860 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
18861 }
18862 }
18863 }
18864 // Try to figure out the best way to combine values: build a shuffle and
18865 // insert elements, or just build several shuffles.
18866 // Insert non-constant scalars.
18867 SmallVector<Value *> NonConstants(GatheredScalars);
18868 int EMSz = ExtractMask.size();
18869 int MSz = Mask.size();
18870 // Try to build a constant vector and shuffle with it only if we currently
18871 // have a single permutation and more than 1 scalar constant.
18872 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
18873 bool IsIdentityShuffle =
18874 ((UseVecBaseAsInput ||
18875 all_of(ExtractShuffles,
18876 [](const std::optional<TTI::ShuffleKind> &SK) {
18877 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18878 TTI::SK_PermuteSingleSrc;
18879 })) &&
18880 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
18881 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
18882 (!GatherShuffles.empty() &&
18883 all_of(GatherShuffles,
18884 [](const std::optional<TTI::ShuffleKind> &SK) {
18885 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
18886 TTI::SK_PermuteSingleSrc;
18887 }) &&
18888 none_of(Mask, [&](int I) { return I >= MSz; }) &&
18889 ShuffleVectorInst::isIdentityMask(Mask, MSz));
18890 bool EnoughConstsForShuffle =
18891 IsSingleShuffle &&
18892 (none_of(GatheredScalars,
18893 [](Value *V) {
18894 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
18895 }) ||
18896 any_of(GatheredScalars,
18897 [](Value *V) {
18898 return isa<Constant>(V) && !isa<UndefValue>(V);
18899 })) &&
18900 (!IsIdentityShuffle ||
18901 (GatheredScalars.size() == 2 &&
18902 any_of(GatheredScalars,
18903 [](Value *V) { return !isa<UndefValue>(V); })) ||
18904 count_if(GatheredScalars, [](Value *V) {
18905 return isa<Constant>(V) && !isa<PoisonValue>(V);
18906 }) > 1);
18907 // The NonConstants array contains just the non-constant values; GatheredScalars
18908 // contains only constants, used to build the final vector and then shuffle.
18909 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
18910 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
18911 NonConstants[I] = PoisonValue::get(OrigScalarTy);
18912 else
18913 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
18914 }
18915 // Generate constants for final shuffle and build a mask for them.
18916 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
18917 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
18918 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
18919 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
18920 ShuffleBuilder.add(BV, BVMask);
18921 }
18922 if (all_of(NonConstants, [=](Value *V) {
18923 return isa<PoisonValue>(V) ||
18924 (IsSingleShuffle && ((IsIdentityShuffle &&
18925 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
18926 }))
18927 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
18928 SubVectorsMask);
18929 else
18930 Res = ShuffleBuilder.finalize(
18931 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
18932 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
18933 bool IsSplat = isSplat(NonConstants);
18934 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
18935 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
18936 auto CheckIfSplatIsProfitable = [&]() {
18937 // Estimate the cost of splatting + shuffle and compare with
18938 // insert + shuffle.
18939 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18940 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18941 if (isa<ExtractElementInst>(V) || isVectorized(V))
18942 return false;
18943 InstructionCost SplatCost = TTI->getVectorInstrCost(
18944 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
18945 PoisonValue::get(VecTy), V);
18946 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18947 for (auto [Idx, I] : enumerate(BVMask))
18948 if (I != PoisonMaskElem)
18949 NewMask[Idx] = Mask.size();
18950 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
18951 NewMask, CostKind);
18952 InstructionCost BVCost = TTI->getVectorInstrCost(
18953 Instruction::InsertElement, VecTy, CostKind,
18954 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
18955 Vec, V);
18956 // Shuffle required?
18957 if (count(BVMask, PoisonMaskElem) <
18958 static_cast<int>(BVMask.size() - 1)) {
18959 SmallVector<int> NewMask(Mask.begin(), Mask.end());
18960 for (auto [Idx, I] : enumerate(BVMask))
18961 if (I != PoisonMaskElem)
18962 NewMask[Idx] = I;
18963 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
18964 VecTy, NewMask, CostKind);
18965 }
18966 return SplatCost <= BVCost;
18967 };
18968 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
18969 for (auto [Idx, I] : enumerate(BVMask))
18970 if (I != PoisonMaskElem)
18971 Mask[Idx] = I;
18972 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
18973 } else {
18974 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
18975 SmallVector<Value *> Values(NonConstants.size(),
18976 PoisonValue::get(ScalarTy));
18977 Values[0] = V;
18978 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
18979 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
18980 transform(BVMask, SplatMask.begin(), [](int I) {
18981 return I == PoisonMaskElem ? PoisonMaskElem : 0;
18982 });
18983 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
18984 BV = CreateShuffle(BV, nullptr, SplatMask);
18985 for (auto [Idx, I] : enumerate(BVMask))
18986 if (I != PoisonMaskElem)
18987 Mask[Idx] = BVMask.size() + Idx;
18988 Vec = CreateShuffle(Vec, BV, Mask);
18989 for (auto [Idx, I] : enumerate(Mask))
18990 if (I != PoisonMaskElem)
18991 Mask[Idx] = Idx;
18992 }
18993 });
18994 } else if (!allConstant(GatheredScalars)) {
18995 // Gather unique scalars and all constants.
18996 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
18997 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
18998 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
18999 ShuffleBuilder.add(BV, ReuseMask);
19000 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19001 SubVectorsMask);
19002 } else {
19003 // Gather all constants.
19004 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
19005 for (auto [I, V] : enumerate(GatheredScalars)) {
19006 if (!isa<PoisonValue>(V))
19007 Mask[I] = I;
19008 }
19009 Value *BV = ShuffleBuilder.gather(GatheredScalars);
19010 ShuffleBuilder.add(BV, Mask);
19011 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19012 SubVectorsMask);
19013 }
19014
19015 if (NeedFreeze)
19016 Res = ShuffleBuilder.createFreeze(Res);
19017 return Res;
19018}
19019
19020Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
19021 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
19022 (void)vectorizeTree(VectorizableTree[EIdx].get());
19023 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
19024 Builder, *this);
19025}
19026
19027/// \returns \p Inst after propagating metadata from \p VL only for
19028/// instructions in \p VL.
19029static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
19030 SmallVector<Value *> Insts;
19031 for (Value *V : VL)
19032 if (isa<Instruction>(V))
19033 Insts.push_back(V);
19034 return llvm::propagateMetadata(Inst, Insts);
19035}
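// Illustrative note (assumed behavior): this wrapper forwards to
// llvm::propagateMetadata, which transfers only metadata that stays valid for
// the combined access, e.g. for two scalar loads
//   %l0 = load i32, ptr %p, !tbaa !0
//   %l1 = load i32, ptr %q, !tbaa !0
// the resulting vector load keeps !tbaa !0, while metadata present on only one
// of the scalars is dropped or widened conservatively.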
19036
19037static DebugLoc getDebugLocFromPHI(PHINode &PN) {
19038 if (DebugLoc DL = PN.getDebugLoc())
19039 return DL;
19040 return DebugLoc::getUnknown();
19041}
19042
19043Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
19044 IRBuilderBase::InsertPointGuard Guard(Builder);
19045
19046 Value *V = E->Scalars.front();
19047 Type *ScalarTy = V->getType();
19048 if (!isa<CmpInst>(V))
19049 ScalarTy = getValueType(V);
19050 auto It = MinBWs.find(E);
19051 if (It != MinBWs.end()) {
19052 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
19053 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
19054 if (VecTy)
19055 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
19056 }
19057 if (E->VectorizedValue)
19058 return E->VectorizedValue;
19059 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
19060 if (E->isGather()) {
19061 // Set insert point for non-reduction initial nodes.
19062 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
19063 setInsertPointAfterBundle(E);
19064 Value *Vec = createBuildVector(E, ScalarTy);
19065 E->VectorizedValue = Vec;
19066 return Vec;
19067 }
19068 if (E->State == TreeEntry::SplitVectorize) {
19069 assert(E->CombinedEntriesWithIndices.size() == 2 &&
19070 "Expected exactly 2 combined entries.");
19071 setInsertPointAfterBundle(E);
19072 TreeEntry &OpTE1 =
19073 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
19074 assert(OpTE1.isSame(
19075 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
19076 "Expected same first part of scalars.");
19077 Value *Op1 = vectorizeTree(&OpTE1);
19078 TreeEntry &OpTE2 =
19079 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
19080 assert(
19081 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
19082 "Expected same second part of scalars.");
19083 Value *Op2 = vectorizeTree(&OpTE2);
19084 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
19085 bool IsSigned = false;
19086 auto It = MinBWs.find(OpE);
19087 if (It != MinBWs.end())
19088 IsSigned = It->second.second;
19089 else
19090 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19091 if (isa<PoisonValue>(V))
19092 return false;
19093 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19094 });
19095 return IsSigned;
19096 };
19097 if (cast<VectorType>(Op1->getType())->getElementType() !=
19098 ScalarTy->getScalarType()) {
19099 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19100 Op1 = Builder.CreateIntCast(
19101 Op1,
19102 getWidenedType(
19103 ScalarTy,
19104 cast<FixedVectorType>(Op1->getType())->getNumElements()),
19105 GetOperandSignedness(&OpTE1));
19106 }
19107 if (cast<VectorType>(Op2->getType())->getElementType() !=
19108 ScalarTy->getScalarType()) {
19109 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19110 Op2 = Builder.CreateIntCast(
19111 Op2,
19112 getWidenedType(
19113 ScalarTy,
19114 cast<FixedVectorType>(Op2->getType())->getNumElements()),
19115 GetOperandSignedness(&OpTE2));
19116 }
19117 if (E->ReorderIndices.empty()) {
19118 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
19119 std::iota(
19120 Mask.begin(),
19121 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
19122 0);
19123 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
19124 if (ScalarTyNumElements != 1) {
19125 assert(SLPReVec && "Only supported by REVEC.");
19126 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
19127 }
19128 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
19129 Vec = createInsertVector(Builder, Vec, Op2,
19130 E->CombinedEntriesWithIndices.back().second *
19131 ScalarTyNumElements);
19132 E->VectorizedValue = Vec;
19133 return Vec;
19134 }
19135 unsigned CommonVF =
19136 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
19137 if (getNumElements(Op1->getType()) != CommonVF) {
19138 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19139 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
19140 0);
19141 Op1 = Builder.CreateShuffleVector(Op1, Mask);
19142 }
19143 if (getNumElements(Op2->getType()) != CommonVF) {
19144 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
19145 std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
19146 0);
19147 Op2 = Builder.CreateShuffleVector(Op2, Mask);
19148 }
19149 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
19150 E->VectorizedValue = Vec;
19151 return Vec;
19152 }
19153
19154 bool IsReverseOrder =
19155 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
19156 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
19157 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
19158 if (E->getOpcode() == Instruction::Store &&
19159 E->State == TreeEntry::Vectorize) {
19160 ArrayRef<int> Mask =
19161 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19162 E->ReorderIndices.size());
19163 ShuffleBuilder.add(V, Mask);
19164 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19165 E->State == TreeEntry::CompressVectorize) {
19166 ShuffleBuilder.addOrdered(V, {});
19167 } else {
19168 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19169 }
19170 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19171 E->CombinedEntriesWithIndices.size());
19172 transform(
19173 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19174 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19175 });
19176 assert(
19177 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19178 "Expected either combined subnodes or reordering");
19179 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19180 };
19181
19182 assert(!E->isGather() && "Unhandled state");
19183 unsigned ShuffleOrOp =
19184 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19185 Instruction *VL0 = E->getMainOp();
19186 auto GetOperandSignedness = [&](unsigned Idx) {
19187 const TreeEntry *OpE = getOperandEntry(E, Idx);
19188 bool IsSigned = false;
19189 auto It = MinBWs.find(OpE);
19190 if (It != MinBWs.end())
19191 IsSigned = It->second.second;
19192 else
19193 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
19194 if (isa<PoisonValue>(V))
19195 return false;
19196 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19197 });
19198 return IsSigned;
19199 };
19200 switch (ShuffleOrOp) {
19201 case Instruction::PHI: {
19202 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19203 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19204 "PHI reordering is free.");
19205 auto *PH = cast<PHINode>(VL0);
19206 Builder.SetInsertPoint(PH->getParent(),
19207 PH->getParent()->getFirstNonPHIIt());
19208 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19209 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19210 Value *V = NewPhi;
19211
19212 // Adjust insertion point once all PHI's have been generated.
19213 Builder.SetInsertPoint(PH->getParent(),
19214 PH->getParent()->getFirstInsertionPt());
19215 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19216
19217 V = FinalShuffle(V, E);
19218
19219 E->VectorizedValue = V;
19220 // If phi node is fully emitted - exit.
19221 if (NewPhi->getNumIncomingValues() != 0)
19222 return NewPhi;
19223
19224 // PHINodes may have multiple entries from the same block. We want to
19225 // visit every block once.
19226 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19227
19228 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
19229 BasicBlock *IBB = PH->getIncomingBlock(I);
19230
19231 // Stop emission if all incoming values are generated.
19232 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
19233 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19234 return NewPhi;
19235 }
19236
19237 if (!VisitedBBs.insert(IBB).second) {
19238 Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
19239 NewPhi->addIncoming(VecOp, IBB);
19240 TreeEntry *OpTE = getOperandEntry(E, I);
19241 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19242 OpTE->VectorizedValue = VecOp;
19243 continue;
19244 }
19245
19246 Builder.SetInsertPoint(IBB->getTerminator());
19247 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
19248 Value *Vec = vectorizeOperand(E, I);
19249 if (VecTy != Vec->getType()) {
19250 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
19251 MinBWs.contains(getOperandEntry(E, I))) &&
19252 "Expected item in MinBWs.");
19253 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19254 }
19255 NewPhi->addIncoming(Vec, IBB);
19256 }
19257
19258 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
19259 "Invalid number of incoming values");
19260 assert(E->VectorizedValue && "Expected vectorized value.");
19261 return E->VectorizedValue;
19262 }
19263
19264 case Instruction::ExtractElement: {
19265 Value *V = E->getSingleOperand(0);
19266 setInsertPointAfterBundle(E);
19267 V = FinalShuffle(V, E);
19268 E->VectorizedValue = V;
19269 return V;
19270 }
19271 case Instruction::ExtractValue: {
19272 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
19273 Builder.SetInsertPoint(LI);
19274 Value *Ptr = LI->getPointerOperand();
19275 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19276 Value *NewV = ::propagateMetadata(V, E->Scalars);
19277 NewV = FinalShuffle(NewV, E);
19278 E->VectorizedValue = NewV;
19279 return NewV;
19280 }
19281 case Instruction::InsertElement: {
19282 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
19283 if (const TreeEntry *OpE = getOperandEntry(E, 1);
19284 OpE && !OpE->isGather() && OpE->hasState() &&
19285 !OpE->hasCopyableElements())
19286 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
19287 else
19288 setInsertPointAfterBundle(E);
19289 Value *V = vectorizeOperand(E, 1);
19290 ArrayRef<Value *> Op = E->getOperand(1);
19291 Type *ScalarTy = Op.front()->getType();
19292 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
19293 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
19294 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
19295 assert(Res.first > 0 && "Expected item in MinBWs.");
19296 V = Builder.CreateIntCast(
19297 V,
19298 getWidenedType(
19299 ScalarTy,
19300 cast<FixedVectorType>(V->getType())->getNumElements()),
19301 Res.second);
19302 }
19303
19304 // Create InsertVector shuffle if necessary
19305 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
19306 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
19307 }));
19308 const unsigned NumElts =
19309 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
19310 const unsigned NumScalars = E->Scalars.size();
19311
19312 unsigned Offset = *getElementIndex(VL0);
19313 assert(Offset < NumElts && "Failed to find vector index offset");
19314
19315 // Create shuffle to resize vector
19316 SmallVector<int> Mask;
19317 if (!E->ReorderIndices.empty()) {
19318 inversePermutation(E->ReorderIndices, Mask);
19319 Mask.append(NumElts - NumScalars, PoisonMaskElem);
19320 } else {
19321 Mask.assign(NumElts, PoisonMaskElem);
19322 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
19323 }
19324 // Create InsertVector shuffle if necessary
19325 bool IsIdentity = true;
19326 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
19327 Mask.swap(PrevMask);
19328 for (unsigned I = 0; I < NumScalars; ++I) {
19329 Value *Scalar = E->Scalars[PrevMask[I]];
19330 unsigned InsertIdx = *getElementIndex(Scalar);
19331 IsIdentity &= InsertIdx - Offset == I;
19332 Mask[InsertIdx - Offset] = I;
19333 }
19334 if (!IsIdentity || NumElts != NumScalars) {
19335 Value *V2 = nullptr;
19336 bool IsVNonPoisonous =
19337 isGuaranteedNotToBePoison(V, AC) && !isConstant(V);
19338 SmallVector<int> InsertMask(Mask);
19339 if (NumElts != NumScalars && Offset == 0) {
19340 // Follow all insert element instructions from the current buildvector
19341 // sequence.
19342 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
19343 do {
19344 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
19345 if (!InsertIdx)
19346 break;
19347 if (InsertMask[*InsertIdx] == PoisonMaskElem)
19348 InsertMask[*InsertIdx] = *InsertIdx;
19349 if (!Ins->hasOneUse())
19350 break;
19351 Ins = dyn_cast_or_null<InsertElementInst>(
19352 Ins->getUniqueUndroppableUser());
19353 } while (Ins);
19354 SmallBitVector UseMask =
19355 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19356 SmallBitVector IsFirstPoison =
19357 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19358 SmallBitVector IsFirstUndef =
19359 isUndefVector(FirstInsert->getOperand(0), UseMask);
19360 if (!IsFirstPoison.all()) {
19361 unsigned Idx = 0;
19362 for (unsigned I = 0; I < NumElts; I++) {
19363 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
19364 IsFirstUndef.test(I)) {
19365 if (IsVNonPoisonous) {
19366 InsertMask[I] = I < NumScalars ? I : 0;
19367 continue;
19368 }
19369 if (!V2)
19370 V2 = UndefValue::get(V->getType());
19371 if (Idx >= NumScalars)
19372 Idx = NumScalars - 1;
19373 InsertMask[I] = NumScalars + Idx;
19374 ++Idx;
19375 } else if (InsertMask[I] != PoisonMaskElem &&
19376 Mask[I] == PoisonMaskElem) {
19377 InsertMask[I] = PoisonMaskElem;
19378 }
19379 }
19380 } else {
19381 InsertMask = Mask;
19382 }
19383 }
19384 if (!V2)
19385 V2 = PoisonValue::get(V->getType());
19386 V = Builder.CreateShuffleVector(V, V2, InsertMask);
19387 if (auto *I = dyn_cast<Instruction>(V)) {
19388 GatherShuffleExtractSeq.insert(I);
19389 CSEBlocks.insert(I->getParent());
19390 }
19391 }
19392
19393 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
19394 for (unsigned I = 0; I < NumElts; I++) {
19395 if (Mask[I] != PoisonMaskElem)
19396 InsertMask[Offset + I] = I;
19397 }
19398 SmallBitVector UseMask =
19399 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
19400 SmallBitVector IsFirstUndef =
19401 isUndefVector(FirstInsert->getOperand(0), UseMask);
19402 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
19403 NumElts != NumScalars) {
19404 if (IsFirstUndef.all()) {
19405 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
19406 SmallBitVector IsFirstPoison =
19407 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19408 if (!IsFirstPoison.all()) {
19409 for (unsigned I = 0; I < NumElts; I++) {
19410 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
19411 InsertMask[I] = I + NumElts;
19412 }
19413 }
19414 V = Builder.CreateShuffleVector(
19415 V,
19416 IsFirstPoison.all() ? PoisonValue::get(V->getType())
19417 : FirstInsert->getOperand(0),
19418 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
19419 if (auto *I = dyn_cast<Instruction>(V)) {
19420 GatherShuffleExtractSeq.insert(I);
19421 CSEBlocks.insert(I->getParent());
19422 }
19423 }
19424 } else {
19425 SmallBitVector IsFirstPoison =
19426 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
19427 for (unsigned I = 0; I < NumElts; I++) {
19428 if (InsertMask[I] == PoisonMaskElem)
19429 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
19430 else
19431 InsertMask[I] += NumElts;
19432 }
19433 V = Builder.CreateShuffleVector(
19434 FirstInsert->getOperand(0), V, InsertMask,
19435 cast<Instruction>(E->Scalars.back())->getName());
19436 if (auto *I = dyn_cast<Instruction>(V)) {
19437 GatherShuffleExtractSeq.insert(I);
19438 CSEBlocks.insert(I->getParent());
19439 }
19440 }
19441 }
19442
19443 ++NumVectorInstructions;
19444 E->VectorizedValue = V;
19445 return V;
19446 }
19447 case Instruction::ZExt:
19448 case Instruction::SExt:
19449 case Instruction::FPToUI:
19450 case Instruction::FPToSI:
19451 case Instruction::FPExt:
19452 case Instruction::PtrToInt:
19453 case Instruction::IntToPtr:
19454 case Instruction::SIToFP:
19455 case Instruction::UIToFP:
19456 case Instruction::Trunc:
19457 case Instruction::FPTrunc:
19458 case Instruction::BitCast: {
19459 setInsertPointAfterBundle(E);
19460
19461 Value *InVec = vectorizeOperand(E, 0);
19462
19463 auto *CI = cast<CastInst>(VL0);
19464 Instruction::CastOps VecOpcode = CI->getOpcode();
19465 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
19466 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
19467 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
19468 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
19469 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
19470 // Check if the values are candidates to demote.
19471 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
19472 if (SrcIt != MinBWs.end())
19473 SrcBWSz = SrcIt->second.first;
19474 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
19475 if (BWSz == SrcBWSz) {
19476 VecOpcode = Instruction::BitCast;
19477 } else if (BWSz < SrcBWSz) {
19478 VecOpcode = Instruction::Trunc;
19479 } else if (It != MinBWs.end()) {
19480 assert(BWSz > SrcBWSz && "Invalid cast!");
19481 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19482 } else if (SrcIt != MinBWs.end()) {
19483 assert(BWSz > SrcBWSz && "Invalid cast!");
19484 VecOpcode =
19485 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
19486 }
19487 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
19488 !SrcIt->second.second) {
19489 VecOpcode = Instruction::UIToFP;
19490 }
19491 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
19492 ? InVec
19493 : Builder.CreateCast(VecOpcode, InVec, VecTy);
19494 V = FinalShuffle(V, E);
19495
19496 E->VectorizedValue = V;
19497 ++NumVectorInstructions;
19498 return V;
19499 }
19500 case Instruction::FCmp:
19501 case Instruction::ICmp: {
19502 setInsertPointAfterBundle(E);
19503
19504 Value *L = vectorizeOperand(E, 0);
19505 Value *R = vectorizeOperand(E, 1);
19506 if (L->getType() != R->getType()) {
19507 assert((getOperandEntry(E, 0)->isGather() ||
19508 getOperandEntry(E, 1)->isGather() ||
19509 MinBWs.contains(getOperandEntry(E, 0)) ||
19510 MinBWs.contains(getOperandEntry(E, 1))) &&
19511 "Expected item in MinBWs.");
19512 if (cast<VectorType>(L->getType())
19513 ->getElementType()
19514 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
19515 ->getElementType()
19516 ->getIntegerBitWidth()) {
19517 Type *CastTy = R->getType();
19518 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
19519 } else {
19520 Type *CastTy = L->getType();
19521 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
19522 }
19523 }
19524
19525 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
19526 Value *V = Builder.CreateCmp(P0, L, R);
19527 propagateIRFlags(V, E->Scalars, VL0);
19528 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
19529 ICmp->setSameSign(/*B=*/false);
19530 // Do not cast for cmps.
19531 VecTy = cast<FixedVectorType>(V->getType());
19532 V = FinalShuffle(V, E);
19533
19534 E->VectorizedValue = V;
19535 ++NumVectorInstructions;
19536 return V;
19537 }
19538 case Instruction::Select: {
19539 setInsertPointAfterBundle(E);
19540
19541 Value *Cond = vectorizeOperand(E, 0);
19542 Value *True = vectorizeOperand(E, 1);
19543 Value *False = vectorizeOperand(E, 2);
19544 if (True->getType() != VecTy || False->getType() != VecTy) {
19545 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
19546 getOperandEntry(E, 2)->isGather() ||
19547 MinBWs.contains(getOperandEntry(E, 1)) ||
19548 MinBWs.contains(getOperandEntry(E, 2))) &&
19549 "Expected item in MinBWs.");
19550 if (True->getType() != VecTy)
19551 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
19552 if (False->getType() != VecTy)
19553 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
19554 }
19555
19556 unsigned CondNumElements = getNumElements(Cond->getType());
19557 unsigned TrueNumElements = getNumElements(True->getType());
19558 assert(TrueNumElements >= CondNumElements &&
19559 TrueNumElements % CondNumElements == 0 &&
19560 "Cannot vectorize Instruction::Select");
19561 assert(TrueNumElements == getNumElements(False->getType()) &&
19562 "Cannot vectorize Instruction::Select");
19563 if (CondNumElements != TrueNumElements) {
19564 // When the return type is i1 but the source is fixed vector type, we
19565 // need to duplicate the condition value.
19566 Cond = Builder.CreateShuffleVector(
19567 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
19568 CondNumElements));
19569 }
19570 assert(getNumElements(Cond->getType()) == TrueNumElements &&
19571 "Cannot vectorize Instruction::Select");
19572 Value *V =
19573 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
19574 V = FinalShuffle(V, E);
19575
19576 E->VectorizedValue = V;
19577 ++NumVectorInstructions;
19578 return V;
19579 }
19580 case Instruction::FNeg: {
19581 setInsertPointAfterBundle(E);
19582
19583 Value *Op = vectorizeOperand(E, 0);
19584
19585 Value *V = Builder.CreateUnOp(
19586 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
19587 propagateIRFlags(V, E->Scalars, VL0);
19588 if (auto *I = dyn_cast<Instruction>(V))
19589 V = ::propagateMetadata(I, E->Scalars);
19590
19591 V = FinalShuffle(V, E);
19592
19593 E->VectorizedValue = V;
19594 ++NumVectorInstructions;
19595
19596 return V;
19597 }
19598 case Instruction::Freeze: {
19599 setInsertPointAfterBundle(E);
19600
19601 Value *Op = vectorizeOperand(E, 0);
19602
19603 if (Op->getType() != VecTy) {
19604 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19605 MinBWs.contains(getOperandEntry(E, 0))) &&
19606 "Expected item in MinBWs.");
19607 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
19608 }
19609 Value *V = Builder.CreateFreeze(Op);
19610 V = FinalShuffle(V, E);
19611
19612 E->VectorizedValue = V;
19613 ++NumVectorInstructions;
19614
19615 return V;
19616 }
19617 case Instruction::Add:
19618 case Instruction::FAdd:
19619 case Instruction::Sub:
19620 case Instruction::FSub:
19621 case Instruction::Mul:
19622 case Instruction::FMul:
19623 case Instruction::UDiv:
19624 case Instruction::SDiv:
19625 case Instruction::FDiv:
19626 case Instruction::URem:
19627 case Instruction::SRem:
19628 case Instruction::FRem:
19629 case Instruction::Shl:
19630 case Instruction::LShr:
19631 case Instruction::AShr:
19632 case Instruction::And:
19633 case Instruction::Or:
19634 case Instruction::Xor: {
19635 setInsertPointAfterBundle(E);
19636
19637 Value *LHS = vectorizeOperand(E, 0);
19638 Value *RHS = vectorizeOperand(E, 1);
19639 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
19640 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
19641 ArrayRef<Value *> Ops = E->getOperand(I);
19642 if (all_of(Ops, [&](Value *Op) {
19643 auto *CI = dyn_cast<ConstantInt>(Op);
19644 return CI && CI->getValue().countr_one() >= It->second.first;
19645 })) {
19646 Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
19647 E->VectorizedValue = V;
19648 ++NumVectorInstructions;
19649 return V;
19650 }
19651 }
19652 }
19653 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
19654 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
19655 getOperandEntry(E, 1)->isGather() ||
19656 MinBWs.contains(getOperandEntry(E, 0)) ||
19657 MinBWs.contains(getOperandEntry(E, 1))) &&
19658 "Expected item in MinBWs.");
19659 if (LHS->getType() != VecTy)
19660 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
19661 if (RHS->getType() != VecTy)
19662 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
19663 }
19664
19665 Value *V = Builder.CreateBinOp(
19666 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
19667 RHS);
19668 propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
19669 if (auto *I = dyn_cast<Instruction>(V)) {
19670 V = ::propagateMetadata(I, E->Scalars);
19671 // Drop nuw flags for abs(sub(commutative), true).
19672 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
19673 any_of(E->Scalars, [](Value *V) {
19674 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
19675 }))
19676 I->setHasNoUnsignedWrap(/*b=*/false);
19677 }
19678
19679 V = FinalShuffle(V, E);
19680
19681 E->VectorizedValue = V;
19682 ++NumVectorInstructions;
19683
19684 return V;
19685 }
19686 case Instruction::Load: {
19687 // Loads are inserted at the head of the tree because we don't want to
19688 // sink them all the way down past store instructions.
19689 setInsertPointAfterBundle(E);
19690
19691 LoadInst *LI = cast<LoadInst>(VL0);
19692 Instruction *NewLI;
19693 FixedVectorType *StridedLoadTy = nullptr;
19694 Value *PO = LI->getPointerOperand();
19695 if (E->State == TreeEntry::Vectorize) {
19696 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
19697 } else if (E->State == TreeEntry::CompressVectorize) {
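// Compressed loads read a wider (possibly masked) contiguous vector and then
// shuffle the required lanes into place using the precomputed CompressMask.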
19698 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
19699 CompressEntryToData.at(E);
19700 Align CommonAlignment = LI->getAlign();
19701 if (IsMasked) {
19702 unsigned VF = getNumElements(LoadVecTy);
19703 SmallVector<Constant *> MaskValues(
19704 VF / getNumElements(LI->getType()),
19705 ConstantInt::getFalse(VecTy->getContext()));
19706 for (int I : CompressMask)
19707 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
19708 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19709 assert(SLPReVec && "Only supported by REVEC.");
19710 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
19711 }
19712 Constant *MaskValue = ConstantVector::get(MaskValues);
19713 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
19714 MaskValue);
19715 } else {
19716 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
19717 }
19718 NewLI = ::propagateMetadata(NewLI, E->Scalars);
19719 // TODO: include this cost into CommonCost.
19720 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
19721 assert(SLPReVec && "FixedVectorType is not expected.");
19722 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
19723 CompressMask);
19724 }
19725 NewLI =
19726 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
19727 } else if (E->State == TreeEntry::StridedVectorize) {
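// Strided loads are emitted as llvm.experimental.vp.strided.load with a byte
// stride; for reversed orders the last pointer becomes the base and the
// stride is negated.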
19728 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
19729 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
19730 PO = IsReverseOrder ? PtrN : Ptr0;
19731 Type *StrideTy = DL->getIndexType(PO->getType());
19732 Value *StrideVal;
19733 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
19734 StridedLoadTy = SPtrInfo.Ty;
19735 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
19736 unsigned StridedLoadEC =
19737 StridedLoadTy->getElementCount().getKnownMinValue();
19738
19739 Value *Stride = SPtrInfo.StrideVal;
19740 if (!Stride) {
19741 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
19742 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
19743 SCEVExpander Expander(*SE, *DL, "strided-load-vec");
19744 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
19745 &*Builder.GetInsertPoint());
19746 }
19747 Value *NewStride =
19748 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
19749 StrideVal = Builder.CreateMul(
19750 NewStride, ConstantInt::get(
19751 StrideTy, (IsReverseOrder ? -1 : 1) *
19752 static_cast<int>(
19753 DL->getTypeAllocSize(ScalarTy))));
19754 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19755 auto *Inst = Builder.CreateIntrinsic(
19756 Intrinsic::experimental_vp_strided_load,
19757 {StridedLoadTy, PO->getType(), StrideTy},
19758 {PO, StrideVal,
19759 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
19760 Builder.getInt32(StridedLoadEC)});
19761 Inst->addParamAttr(
19762 /*ArgNo=*/0,
19763 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19764 NewLI = Inst;
19765 } else {
19766 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
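// Scatter-vectorized loads become a masked gather; with REVEC the pointer
// vector is additionally widened to match the element count of VecTy.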
19767 Value *VecPtr = vectorizeOperand(E, 0);
19768 if (isa<FixedVectorType>(ScalarTy)) {
19769 assert(SLPReVec && "FixedVectorType is not expected.");
19770 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We need
19771 // to expand VecPtr if ScalarTy is a vector type.
19772 unsigned ScalarTyNumElements =
19773 cast<FixedVectorType>(ScalarTy)->getNumElements();
19774 unsigned VecTyNumElements =
19775 cast<FixedVectorType>(VecTy)->getNumElements();
19776 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
19777 "Cannot expand getelementptr.");
19778 unsigned VF = VecTyNumElements / ScalarTyNumElements;
19779 SmallVector<Constant *> Indices(VecTyNumElements);
19780 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
19781 return Builder.getInt64(I % ScalarTyNumElements);
19782 });
19783 VecPtr = Builder.CreateGEP(
19784 VecTy->getElementType(),
19785 Builder.CreateShuffleVector(
19786 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
19787 ConstantVector::get(Indices));
19788 }
19789 // Use the minimum alignment of the gathered loads.
19790 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
19791 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
19792 }
19793 Value *V = E->State == TreeEntry::CompressVectorize
19794 ? NewLI
19795 : ::propagateMetadata(NewLI, E->Scalars);
19796
19797 V = FinalShuffle(V, E);
19798 E->VectorizedValue = V;
19799 ++NumVectorInstructions;
19800 return V;
19801 }
19802 case Instruction::Store: {
19803 auto *SI = cast<StoreInst>(VL0);
19804
19805 setInsertPointAfterBundle(E);
19806
19807 Value *VecValue = vectorizeOperand(E, 0);
19808 if (VecValue->getType() != VecTy)
19809 VecValue =
19810 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
19811 VecValue = FinalShuffle(VecValue, E);
19812
19813 Value *Ptr = SI->getPointerOperand();
19814 Instruction *ST;
19815 if (E->State == TreeEntry::Vectorize) {
19816 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
19817 } else {
19818 assert(E->State == TreeEntry::StridedVectorize &&
19819 "Expected either strided or consecutive stores.");
19820 if (!E->ReorderIndices.empty()) {
19821 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
19822 Ptr = SI->getPointerOperand();
19823 }
19824 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
19825 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
19826 auto *Inst = Builder.CreateIntrinsic(
19827 Intrinsic::experimental_vp_strided_store,
19828 {VecTy, Ptr->getType(), StrideTy},
19829 {VecValue, Ptr,
19830 ConstantInt::get(
19831 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
19832 Builder.getAllOnesMask(VecTy->getElementCount()),
19833 Builder.getInt32(E->Scalars.size())});
19834 Inst->addParamAttr(
19835 /*ArgNo=*/1,
19836 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
19837 ST = Inst;
19838 }
19839
19840 Value *V = ::propagateMetadata(ST, E->Scalars);
19841
19842 E->VectorizedValue = V;
19843 ++NumVectorInstructions;
19844 return V;
19845 }
19846 case Instruction::GetElementPtr: {
19847 auto *GEP0 = cast<GetElementPtrInst>(VL0);
19848 setInsertPointAfterBundle(E);
19849
19850 Value *Op0 = vectorizeOperand(E, 0);
19851
19852 SmallVector<Value *> OpVecs;
19853 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
19854 Value *OpVec = vectorizeOperand(E, J);
19855 OpVecs.push_back(OpVec);
19856 }
19857
19858 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
19859 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
19860 SmallVector<Value *> GEPs;
19861 for (Value *V : E->Scalars) {
19862 if (isa<GetElementPtrInst>(V))
19863 GEPs.push_back(V);
19864 }
19865 V = ::propagateMetadata(I, GEPs);
19866 }
19867
19868 V = FinalShuffle(V, E);
19869
19870 E->VectorizedValue = V;
19871 ++NumVectorInstructions;
19872
19873 return V;
19874 }
19875 case Instruction::Call: {
19876 CallInst *CI = cast<CallInst>(VL0);
19877 setInsertPointAfterBundle(E);
19878
19879 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
19880 
19881 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
19882 CI, ID, VecTy->getNumElements(),
19883 It != MinBWs.end() ? It->second.first : 0, TTI);
19884 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
19885 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
19886 VecCallCosts.first <= VecCallCosts.second;
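// Prefer the intrinsic form only when its cost does not exceed that of a
// vector library call; otherwise a vectorized function from VFDatabase is
// used below.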
19887
19888 Value *ScalarArg = nullptr;
19889 SmallVector<Value *> OpVecs;
19890 SmallVector<Type *, 2> TysForDecl;
19891 // Add return type if intrinsic is overloaded on it.
19892 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
19893 TysForDecl.push_back(VecTy);
19894 auto *CEI = cast<CallInst>(VL0);
19895 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
19896 // Some intrinsics have scalar arguments. This argument should not be
19897 // vectorized.
19898 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
19899 ScalarArg = CEI->getArgOperand(I);
19900 // If we decided to reduce the bitwidth of the abs intrinsic, its second
19901 // argument must be set to false (do not return poison if the value is signed min).
19902 if (ID == Intrinsic::abs && It != MinBWs.end() &&
19903 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
19904 ScalarArg = Builder.getFalse();
19905 OpVecs.push_back(ScalarArg);
19906 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19907 TysForDecl.push_back(ScalarArg->getType());
19908 continue;
19909 }
19910
19911 Value *OpVec = vectorizeOperand(E, I);
19912 ScalarArg = CEI->getArgOperand(I);
19913 if (cast<VectorType>(OpVec->getType())->getElementType() !=
19914 ScalarArg->getType()->getScalarType() &&
19915 It == MinBWs.end()) {
19916 auto *CastTy =
19917 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
19918 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
19919 } else if (It != MinBWs.end()) {
19920 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
19921 }
19922 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
19923 OpVecs.push_back(OpVec);
19924 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
19925 TysForDecl.push_back(OpVec->getType());
19926 }
19927
19928 Function *CF;
19929 if (!UseIntrinsic) {
19930 VFShape Shape =
19931 VFShape::get(CI->getFunctionType(),
19932 ElementCount::getFixed(VecTy->getNumElements()),
19933 false /*HasGlobalPred*/);
19934 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
19935 } else {
19936 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
19937 }
19938
19939 SmallVector<OperandBundleDef, 1> OpBundles;
19940 CI->getOperandBundlesAsDefs(OpBundles);
19941 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
19942
19943 propagateIRFlags(V, E->Scalars, VL0);
19944 V = FinalShuffle(V, E);
19945
19946 E->VectorizedValue = V;
19947 ++NumVectorInstructions;
19948 return V;
19949 }
19950 case Instruction::ShuffleVector: {
19951 Value *V;
19952 if (SLPReVec && !E->isAltShuffle()) {
19953 setInsertPointAfterBundle(E);
19954 Value *Src = vectorizeOperand(E, 0);
19955 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
19956 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
19957 SmallVector<int> NewMask(ThisMask.size());
19958 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
19959 return SVSrc->getShuffleMask()[Mask];
19960 });
19961 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
19962 SVSrc->getOperand(1), NewMask);
19963 } else {
19964 V = Builder.CreateShuffleVector(Src, ThisMask);
19965 }
19966 propagateIRFlags(V, E->Scalars, VL0);
19967 if (auto *I = dyn_cast<Instruction>(V))
19968 V = ::propagateMetadata(I, E->Scalars);
19969 V = FinalShuffle(V, E);
19970 } else {
19971 assert(E->isAltShuffle() &&
19972 ((Instruction::isBinaryOp(E->getOpcode()) &&
19973 Instruction::isBinaryOp(E->getAltOpcode())) ||
19974 (Instruction::isCast(E->getOpcode()) &&
19975 Instruction::isCast(E->getAltOpcode())) ||
19976 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
19977 "Invalid Shuffle Vector Operand");
19978
19979 Value *LHS = nullptr, *RHS = nullptr;
19980 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
19981 setInsertPointAfterBundle(E);
19982 LHS = vectorizeOperand(E, 0);
19983 RHS = vectorizeOperand(E, 1);
19984 } else {
19985 setInsertPointAfterBundle(E);
19986 LHS = vectorizeOperand(E, 0);
19987 }
19988 if (LHS && RHS &&
19989 ((Instruction::isBinaryOp(E->getOpcode()) &&
19990 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
19991 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
19992 assert((It != MinBWs.end() ||
19993 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
19994 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
19995 MinBWs.contains(getOperandEntry(E, 0)) ||
19996 MinBWs.contains(getOperandEntry(E, 1))) &&
19997 "Expected item in MinBWs.");
19998 Type *CastTy = VecTy;
19999 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
20000 if (cast<VectorType>(LHS->getType())
20001 ->getElementType()
20002 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
20003 ->getElementType()
20004 ->getIntegerBitWidth())
20005 CastTy = RHS->getType();
20006 else
20007 CastTy = LHS->getType();
20008 }
20009 if (LHS->getType() != CastTy)
20010 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
20011 if (RHS->getType() != CastTy)
20012 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
20013 }
20014
20015 Value *V0, *V1;
20016 if (Instruction::isBinaryOp(E->getOpcode())) {
20017 V0 = Builder.CreateBinOp(
20018 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
20019 V1 = Builder.CreateBinOp(
20020 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
20021 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
20022 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
20023 auto *AltCI = cast<CmpInst>(E->getAltOp());
20024 CmpInst::Predicate AltPred = AltCI->getPredicate();
20025 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
20026 } else {
20027 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
20028 unsigned SrcBWSz = DL->getTypeSizeInBits(
20029 cast<VectorType>(LHS->getType())->getElementType());
20030 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
20031 if (BWSz <= SrcBWSz) {
20032 if (BWSz < SrcBWSz)
20033 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
20034 assert(LHS->getType() == VecTy &&
20035 "Expected same type as operand.");
20036 if (auto *I = dyn_cast<Instruction>(LHS))
20037 LHS = ::propagateMetadata(I, E->Scalars);
20038 LHS = FinalShuffle(LHS, E);
20039 E->VectorizedValue = LHS;
20040 ++NumVectorInstructions;
20041 return LHS;
20042 }
20043 }
20044 V0 = Builder.CreateCast(
20045 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
20046 V1 = Builder.CreateCast(
20047 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
20048 }
20049 // Add V0 and V1 to later analysis to try to find and remove matching
20050 // instruction, if any.
20051 for (Value *V : {V0, V1}) {
20052 if (auto *I = dyn_cast<Instruction>(V)) {
20053 GatherShuffleExtractSeq.insert(I);
20054 CSEBlocks.insert(I->getParent());
20055 }
20056 }
20057
20058 // Create shuffle to take alternate operations from the vector.
20059 // Also, gather up main and alt scalar ops to propagate IR flags to
20060 // each vector operation.
20061 ValueList OpScalars, AltScalars;
20062 SmallVector<int> Mask;
20063 E->buildAltOpShuffleMask(
20064 [E, this](Instruction *I) {
20065 assert(E->getMatchingMainOpOrAltOp(I) &&
20066 "Unexpected main/alternate opcode");
20067 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
20068 *TLI);
20069 },
20070 Mask, &OpScalars, &AltScalars);
20071
20072 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
20073 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
20074 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
20075 // Drop nuw flags for abs(sub(commutative), true).
20076 if (auto *I = dyn_cast<Instruction>(Vec);
20077 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
20078 any_of(E->Scalars, [](Value *V) {
20079 if (isa<PoisonValue>(V))
20080 return false;
20081 auto *IV = cast<Instruction>(V);
20082 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
20083 }))
20084 I->setHasNoUnsignedWrap(/*b=*/false);
20085 };
20086 DropNuwFlag(V0, E->getOpcode());
20087 DropNuwFlag(V1, E->getAltOpcode());
20088
20089 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20090 assert(SLPReVec && "FixedVectorType is not expected.");
20091 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
20092 }
20093 V = Builder.CreateShuffleVector(V0, V1, Mask);
20094 if (auto *I = dyn_cast<Instruction>(V)) {
20095 V = ::propagateMetadata(I, E->Scalars);
20096 GatherShuffleExtractSeq.insert(I);
20097 CSEBlocks.insert(I->getParent());
20098 }
20099 }
20100
20101 E->VectorizedValue = V;
20102 ++NumVectorInstructions;
20103
20104 return V;
20105 }
20106 default:
20107 llvm_unreachable("unknown inst");
20108 }
20109 return nullptr;
20110}
20111
20112 Value *BoUpSLP::vectorizeTree() {
20113 ExtraValueToDebugLocsMap ExternallyUsedValues;
20114 return vectorizeTree(ExternallyUsedValues);
20115}
20116
20117 Value *BoUpSLP::vectorizeTree(
20118 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
20119 Instruction *ReductionRoot,
20120 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20121 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
20122 // need to rebuild it.
20123 EntryToLastInstruction.clear();
20124 // All blocks must be scheduled before any instructions are inserted.
20125 for (auto &BSIter : BlocksSchedules)
20126 scheduleBlock(*this, BSIter.second.get());
20127 // Cache last instructions for the nodes to avoid side effects, which may
20128 // appear during vectorization, like extra uses, etc.
20129 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20130 if (TE->isGather())
20131 continue;
20132 (void)getLastInstructionInBundle(TE.get());
20133 }
20134
20135 if (ReductionRoot)
20136 Builder.SetInsertPoint(ReductionRoot->getParent(),
20137 ReductionRoot->getIterator());
20138 else
20139 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20140
20141 // Vectorize gather operands of the nodes with the external uses only.
20142 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
20143 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20144 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20145 TE->UserTreeIndex.UserTE->hasState() &&
20146 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20147 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20148 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20149 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20150 all_of(TE->UserTreeIndex.UserTE->Scalars,
20151 [](Value *V) { return isUsedOutsideBlock(V); })) {
20152 Instruction &LastInst =
20153 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20154 GatherEntries.emplace_back(TE.get(), &LastInst);
20155 }
20156 }
20157 for (auto &Entry : GatherEntries) {
20158 IRBuilderBase::InsertPointGuard Guard(Builder);
20159 Builder.SetInsertPoint(Entry.second);
20160 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20161 (void)vectorizeTree(Entry.first);
20162 }
20163 // Emit gathered loads first to emit better code for the users of those
20164 // gathered loads.
20165 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20166 if (GatheredLoadsEntriesFirst.has_value() &&
20167 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20168 (!TE->isGather() || TE->UserTreeIndex)) {
20169 assert((TE->UserTreeIndex ||
20170 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20171 "Expected gathered load node.");
20172 (void)vectorizeTree(TE.get());
20173 }
20174 }
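// Emit the vector code for the whole tree, starting from the root entry.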
20175 (void)vectorizeTree(VectorizableTree[0].get());
20176 // Run through the list of postponed gathers and emit them, replacing the temp
20177 // emitted allocas with actual vector instructions.
20178 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
20179 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
20180 for (const TreeEntry *E : PostponedNodes) {
20181 auto *TE = const_cast<TreeEntry *>(E);
20182 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
20183 TE->VectorizedValue = nullptr;
20184 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
20185 // If the user is a PHI node, its vector code has to be inserted right before
20186 // the block terminator. Since the node was delayed, there were some unresolved
20187 // dependencies at the moment when the stub instruction was emitted. If any of
20188 // these dependencies turn out to be an operand of another PHI coming from the
20189 // same block, the position of the stub instruction becomes invalid. This is
20190 // because the source vector that is supposed to feed this gather node was
20191 // inserted at the end of the block [after the stub instruction]. So we need
20192 // to adjust the insertion point again to the end of the block.
20193 if (isa<PHINode>(UserI) ||
20194 (TE->UserTreeIndex.UserTE->hasState() &&
20195 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
20196 // Insert before all users.
20197 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
20198 for (User *U : PrevVec->users()) {
20199 if (U == UserI)
20200 continue;
20201 auto *UI = dyn_cast<Instruction>(U);
20202 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
20203 continue;
20204 if (UI->comesBefore(InsertPt))
20205 InsertPt = UI;
20206 }
20207 Builder.SetInsertPoint(InsertPt);
20208 } else {
20209 Builder.SetInsertPoint(PrevVec);
20210 }
20211 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20212 Value *Vec = vectorizeTree(TE);
20213 if (auto *VecI = dyn_cast<Instruction>(Vec);
20214 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20215 Builder.GetInsertPoint()->comesBefore(VecI))
20216 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20217 Builder.GetInsertPoint());
20218 if (Vec->getType() != PrevVec->getType()) {
20219 assert(Vec->getType()->isIntOrIntVectorTy() &&
20220 PrevVec->getType()->isIntOrIntVectorTy() &&
20221 "Expected integer vector types only.");
20222 std::optional<bool> IsSigned;
20223 for (Value *V : TE->Scalars) {
20224 if (isVectorized(V)) {
20225 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20226 auto It = MinBWs.find(MNTE);
20227 if (It != MinBWs.end()) {
20228 IsSigned = IsSigned.value_or(false) || It->second.second;
20229 if (*IsSigned)
20230 break;
20231 }
20232 }
20233 if (IsSigned.value_or(false))
20234 break;
20235 // Scan through gather nodes.
20236 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20237 auto It = MinBWs.find(BVE);
20238 if (It != MinBWs.end()) {
20239 IsSigned = IsSigned.value_or(false) || It->second.second;
20240 if (*IsSigned)
20241 break;
20242 }
20243 }
20244 if (IsSigned.value_or(false))
20245 break;
20246 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
20247 IsSigned =
20248 IsSigned.value_or(false) ||
20249 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
20250 continue;
20251 }
20252 if (IsSigned.value_or(false))
20253 break;
20254 }
20255 }
20256 if (!IsSigned) {
20257 // Final attempt - check user node.
20258 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20259 if (It != MinBWs.end())
20260 IsSigned = It->second.second;
20261 }
20262 assert(IsSigned &&
20263 "Expected user node or perfect diamond match in MinBWs.");
20264 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20265 }
20266 PrevVec->replaceAllUsesWith(Vec);
20267 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
20268 // Replace the stub vector node, if it was used before for one of the
20269 // buildvector nodes already.
20270 auto It = PostponedValues.find(PrevVec);
20271 if (It != PostponedValues.end()) {
20272 for (TreeEntry *VTE : It->getSecond())
20273 VTE->VectorizedValue = Vec;
20274 }
20275 eraseInstruction(PrevVec);
20276 }
20277
20278 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
20279 << " values .\n");
20280
20281 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
20282 // Maps vector instruction to original insertelement instruction
20283 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
20284 // Maps extract Scalar to the corresponding extractelement instruction in the
20285 // basic block. Only one extractelement per block should be emitted.
20286 SmallDenseMap<Value *, SmallDenseMap<BasicBlock *, std::pair<Value *, Value *>>>
20287 ScalarToEEs;
20288 SmallDenseSet<Value *, 4> UsedInserts;
20289 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
20290 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
20291 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
20292 // Extract all of the elements with the external uses.
20293 for (const auto &ExternalUse : ExternalUses) {
20294 Value *Scalar = ExternalUse.Scalar;
20295 llvm::User *User = ExternalUse.User;
20296
20297 // Skip users that we have already RAUWed. This happens when one instruction
20298 // has multiple uses of the same value.
20299 if (User && !is_contained(Scalar->users(), User))
20300 continue;
20301 const TreeEntry *E = &ExternalUse.E;
20302 assert(E && "Invalid scalar");
20303 assert(!E->isGather() && "Extracting from a gather list");
20304 // Non-instruction pointers are not deleted, just skip them.
20305 if (E->getOpcode() == Instruction::GetElementPtr &&
20306 !isa<GetElementPtrInst>(Scalar))
20307 continue;
20308
20309 Value *Vec = E->VectorizedValue;
20310 assert(Vec && "Can't find vectorizable value");
20311
20312 Value *Lane = Builder.getInt32(ExternalUse.Lane);
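// ExtractAndExtendIfNeeded materializes the scalar from the vectorized value,
// reusing a previously emitted extractelement in the same block when possible
// and widening the result if the vector was built with a reduced bitwidth.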
20313 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
20314 if (Scalar->getType() != Vec->getType()) {
20315 Value *Ex = nullptr;
20316 Value *ExV = nullptr;
20317 auto *Inst = dyn_cast<Instruction>(Scalar);
20318 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20319 auto It = ScalarToEEs.find(Scalar);
20320 if (It != ScalarToEEs.end()) {
20321 // No need to emit many extracts, just move the only one in the
20322 // current block.
20323 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20324 : Builder.GetInsertBlock());
20325 if (EEIt != It->second.end()) {
20326 Value *PrevV = EEIt->second.first;
20327 if (auto *I = dyn_cast<Instruction>(PrevV);
20328 I && !ReplaceInst &&
20329 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20330 Builder.GetInsertPoint()->comesBefore(I)) {
20331 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20332 Builder.GetInsertPoint());
20333 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
20334 CI->moveAfter(I);
20335 }
20336 Ex = PrevV;
20337 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20338 }
20339 }
20340 if (!Ex) {
20341 // "Reuse" the existing extract to improve final codegen.
20342 if (ReplaceInst) {
20343 // Leave the instruction as is, if extracting it is cheaper and all its
20344 // operands are scalar.
20345 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
20346 IgnoredExtracts.insert(EE);
20347 Ex = EE;
20348 } else {
20349 auto *CloneInst = Inst->clone();
20350 CloneInst->insertBefore(Inst->getIterator());
20351 if (Inst->hasName())
20352 CloneInst->takeName(Inst);
20353 Ex = CloneInst;
20354 }
20355 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
20356 ES && isa<Instruction>(Vec)) {
20357 Value *V = ES->getVectorOperand();
20358 auto *IVec = cast<Instruction>(Vec);
20359 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
20360 V = ETEs.front()->VectorizedValue;
20361 if (auto *IV = dyn_cast<Instruction>(V);
20362 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
20363 IV->comesBefore(IVec))
20364 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20365 else
20366 Ex = Builder.CreateExtractElement(Vec, Lane);
20367 } else if (auto *VecTy =
20368 dyn_cast<FixedVectorType>(Scalar->getType())) {
20369 assert(SLPReVec && "FixedVectorType is not expected.");
20370 unsigned VecTyNumElements = VecTy->getNumElements();
20371 // When REVEC is enabled, we need to extract a vector.
20372 // Note: The element size of Scalar may be different from the
20373 // element size of Vec.
20374 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
20375 ExternalUse.Lane * VecTyNumElements);
20376 } else {
20377 Ex = Builder.CreateExtractElement(Vec, Lane);
20378 }
20379 // If necessary, sign-extend or zero-extend ScalarRoot
20380 // to the larger type.
20381 ExV = Ex;
20382 if (Scalar->getType() != Ex->getType())
20383 ExV = Builder.CreateIntCast(
20384 Ex, Scalar->getType(),
20385 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
20386 auto *I = dyn_cast<Instruction>(Ex);
20387 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
20388 : &F->getEntryBlock(),
20389 std::make_pair(Ex, ExV));
20390 }
20391 // The then branch of the previous if may produce constants, since operand 0
20392 // might be a constant.
20393 if (auto *ExI = dyn_cast<Instruction>(Ex);
20394 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
20395 GatherShuffleExtractSeq.insert(ExI);
20396 CSEBlocks.insert(ExI->getParent());
20397 }
20398 return ExV;
20399 }
20400 assert(isa<FixedVectorType>(Scalar->getType()) &&
20401 isa<InsertElementInst>(Scalar) &&
20402 "In-tree scalar of vector type is not insertelement?");
20403 auto *IE = cast<InsertElementInst>(Scalar);
20404 VectorToInsertElement.try_emplace(Vec, IE);
20405 return Vec;
20406 };
20407 // If User == nullptr, the Scalar remains as scalar in vectorized
20408 // instructions or is used as extra arg. Generate ExtractElement instruction
20409 // and update the record for this scalar in ExternallyUsedValues.
20410 if (!User) {
20411 if (!ScalarsWithNullptrUser.insert(Scalar).second)
20412 continue;
20413 assert(
20414 (ExternallyUsedValues.count(Scalar) ||
20415 ExternalUsesWithNonUsers.count(Scalar) ||
20416 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20417 any_of(
20418 Scalar->users(),
20419 [&, TTI = TTI](llvm::User *U) {
20420 if (ExternalUsesAsOriginalScalar.contains(U))
20421 return true;
20422 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20423 return !UseEntries.empty() &&
20424 (E->State == TreeEntry::Vectorize ||
20425 E->State == TreeEntry::StridedVectorize ||
20426 E->State == TreeEntry::CompressVectorize) &&
20427 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20428 return (UseEntry->State == TreeEntry::Vectorize ||
20429 UseEntry->State ==
20430 TreeEntry::StridedVectorize ||
20431 UseEntry->State ==
20432 TreeEntry::CompressVectorize) &&
20433 doesInTreeUserNeedToExtract(
20434 Scalar, getRootEntryInstruction(*UseEntry),
20435 TLI, TTI);
20436 });
20437 })) &&
20438 "Scalar with nullptr User must be registered in "
20439 "ExternallyUsedValues map or remain as scalar in vectorized "
20440 "instructions");
20441 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20442 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
20443 if (PHI->getParent()->isLandingPad())
20444 Builder.SetInsertPoint(
20445 PHI->getParent(),
20446 std::next(
20447 PHI->getParent()->getLandingPadInst()->getIterator()));
20448 else
20449 Builder.SetInsertPoint(PHI->getParent(),
20450 PHI->getParent()->getFirstNonPHIIt());
20451 } else {
20452 Builder.SetInsertPoint(VecI->getParent(),
20453 std::next(VecI->getIterator()));
20454 }
20455 } else {
20456 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20457 }
20458 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20459 // Required to update internally referenced instructions.
20460 if (Scalar != NewInst) {
20461 assert((!isa<ExtractElementInst>(Scalar) ||
20462 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
20463 "Extractelements should not be replaced.");
20464 Scalar->replaceAllUsesWith(NewInst);
20465 }
20466 continue;
20467 }
20468
20469 if (auto *VU = dyn_cast<InsertElementInst>(User);
20470 VU && VU->getOperand(1) == Scalar) {
20471 // Skip if the scalar is another vector op or Vec is not an instruction.
20472 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
20473 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
20474 if (!UsedInserts.insert(VU).second)
20475 continue;
20476 // Need to use original vector, if the root is truncated.
20477 auto BWIt = MinBWs.find(E);
20478 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
20479 auto *ScalarTy = FTy->getElementType();
20480 auto Key = std::make_pair(Vec, ScalarTy);
20481 auto VecIt = VectorCasts.find(Key);
20482 if (VecIt == VectorCasts.end()) {
20483 IRBuilderBase::InsertPointGuard Guard(Builder);
20484 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
20485 if (IVec->getParent()->isLandingPad())
20486 Builder.SetInsertPoint(IVec->getParent(),
20487 std::next(IVec->getParent()
20488 ->getLandingPadInst()
20489 ->getIterator()));
20490 else
20491 Builder.SetInsertPoint(
20492 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20493 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
20494 Builder.SetInsertPoint(IVec->getNextNode());
20495 }
20496 Vec = Builder.CreateIntCast(
20497 Vec,
20498 getWidenedType(
20499 ScalarTy,
20500 cast<FixedVectorType>(Vec->getType())->getNumElements()),
20501 BWIt->second.second);
20502 VectorCasts.try_emplace(Key, Vec);
20503 } else {
20504 Vec = VecIt->second;
20505 }
20506 }
20507
20508 std::optional<unsigned> InsertIdx = getElementIndex(VU);
20509 if (InsertIdx) {
20510 auto *It = find_if(
20511 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
20512 // Checks if 2 insertelements are from the same buildvector.
20513 InsertElementInst *VecInsert = Data.InsertElements.front();
20514 return areTwoInsertFromSameBuildVector(
20515 VU, VecInsert,
20516 [](InsertElementInst *II) { return II->getOperand(0); });
20517 });
20518 unsigned Idx = *InsertIdx;
20519 if (It == ShuffledInserts.end()) {
20520 (void)ShuffledInserts.emplace_back();
20521 It = std::next(ShuffledInserts.begin(),
20522 ShuffledInserts.size() - 1);
20523 }
20524 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
20525 if (Mask.empty())
20526 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
20527 Mask[Idx] = ExternalUse.Lane;
20528 It->InsertElements.push_back(cast<InsertElementInst>(User));
20529 continue;
20530 }
20531 }
20532 }
20533 }
20534
20535 // Generate extracts for out-of-tree users.
20536 // Find the insertion point for the extractelement lane.
20537 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
20538 if (PHINode *PH = dyn_cast<PHINode>(User)) {
20539 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
20540 if (PH->getIncomingValue(I) == Scalar) {
20541 Instruction *IncomingTerminator =
20542 PH->getIncomingBlock(I)->getTerminator();
20543 if (isa<CatchSwitchInst>(IncomingTerminator)) {
20544 Builder.SetInsertPoint(VecI->getParent(),
20545 std::next(VecI->getIterator()));
20546 } else {
20547 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
20548 }
20549 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20550 PH->setOperand(I, NewInst);
20551 }
20552 }
20553 } else {
20554 Builder.SetInsertPoint(cast<Instruction>(User));
20555 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20556 User->replaceUsesOfWith(Scalar, NewInst);
20557 }
20558 } else {
20559 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20560 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20561 User->replaceUsesOfWith(Scalar, NewInst);
20562 }
20563
20564 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
20565 }
20566
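// CreateShuffle splits a combined two-source mask into per-source masks and
// lets ShuffleInstructionBuilder emit the (possibly simplified) shuffle.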
20567 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
20568 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
20569 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
20570 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
20571 for (int I = 0, E = Mask.size(); I < E; ++I) {
20572 if (Mask[I] < VF)
20573 CombinedMask1[I] = Mask[I];
20574 else
20575 CombinedMask2[I] = Mask[I] - VF;
20576 }
20577 ShuffleInstructionBuilder ShuffleBuilder(
20578 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
20579 ShuffleBuilder.add(V1, CombinedMask1);
20580 if (V2)
20581 ShuffleBuilder.add(V2, CombinedMask2);
20582 return ShuffleBuilder.finalize({}, {}, {});
20583 };
20584
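// ResizeToVF adjusts Vec to the mask's vector factor; the returned flag
// signals that the mask has already been applied.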
20585 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
20586 bool ForSingleMask) {
20587 unsigned VF = Mask.size();
20588 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
20589 if (VF != VecVF) {
20590 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
20591 Vec = CreateShuffle(Vec, nullptr, Mask);
20592 return std::make_pair(Vec, true);
20593 }
20594 if (!ForSingleMask) {
20595 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20596 for (unsigned I = 0; I < VF; ++I) {
20597 if (Mask[I] != PoisonMaskElem)
20598 ResizeMask[Mask[I]] = Mask[I];
20599 }
20600 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
20601 }
20602 }
20603
20604 return std::make_pair(Vec, false);
20605 };
20606 // Perform shuffling of the vectorized tree entries for better handling of
20607 // external extracts.
20608 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20609 // Find the first and the last instruction in the list of insertelements.
20610 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
20611 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
20612 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
20613 Builder.SetInsertPoint(LastInsert);
20614 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20615 Value *NewInst = performExtractsShuffleAction<Value>(
20616 MutableArrayRef(Vector.data(), Vector.size()),
20617 FirstInsert->getOperand(0),
20618 [](Value *Vec) {
20619 return cast<VectorType>(Vec->getType())
20620 ->getElementCount()
20621 .getKnownMinValue();
20622 },
20623 ResizeToVF,
20624 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
20625 ArrayRef<Value *> Vals) {
20626 assert((Vals.size() == 1 || Vals.size() == 2) &&
20627 "Expected exactly 1 or 2 input values.");
20628 if (Vals.size() == 1) {
20629 // Do not create shuffle if the mask is a simple identity
20630 // non-resizing mask.
20631 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
20632 ->getNumElements() ||
20633 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
20634 return CreateShuffle(Vals.front(), nullptr, Mask);
20635 return Vals.front();
20636 }
20637 return CreateShuffle(Vals.front() ? Vals.front()
20638 : FirstInsert->getOperand(0),
20639 Vals.back(), Mask);
20640 });
20641 auto It = ShuffledInserts[I].InsertElements.rbegin();
20642 // Rebuild buildvector chain.
20643 InsertElementInst *II = nullptr;
20644 if (It != ShuffledInserts[I].InsertElements.rend())
20645 II = *It;
20646 SmallVector<Instruction *> Inserts;
20647 while (It != ShuffledInserts[I].InsertElements.rend()) {
20648 assert(II && "Must be an insertelement instruction.");
20649 if (*It == II)
20650 ++It;
20651 else
20652 Inserts.push_back(cast<Instruction>(II));
20653 II = dyn_cast<InsertElementInst>(II->getOperand(0));
20654 }
20655 for (Instruction *II : reverse(Inserts)) {
20656 II->replaceUsesOfWith(II->getOperand(0), NewInst);
20657 if (auto *NewI = dyn_cast<Instruction>(NewInst))
20658 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
20659 II->moveAfter(NewI);
20660 NewInst = II;
20661 }
20662 LastInsert->replaceAllUsesWith(NewInst);
20663 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
20664 IE->replaceUsesOfWith(IE->getOperand(0),
20665 PoisonValue::get(IE->getOperand(0)->getType()));
20666 IE->replaceUsesOfWith(IE->getOperand(1),
20667 PoisonValue::get(IE->getOperand(1)->getType()));
20668 eraseInstruction(IE);
20669 }
20670 CSEBlocks.insert(LastInsert->getParent());
20671 }
20672
20673 SmallVector<Instruction *> RemovedInsts;
20674 // For each vectorized value:
20675 for (auto &TEPtr : VectorizableTree) {
20676 TreeEntry *Entry = TEPtr.get();
20677
20678 // No need to handle users of gathered values.
20679 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
20680 continue;
20681
20682 assert(Entry->VectorizedValue && "Can't find vectorizable value");
20683
20684 // For each lane:
20685 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
20686 Value *Scalar = Entry->Scalars[Lane];
20687
20688 if (Entry->getOpcode() == Instruction::GetElementPtr &&
20689 !isa<GetElementPtrInst>(Scalar))
20690 continue;
20691 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
20692 EE && IgnoredExtracts.contains(EE))
20693 continue;
20694 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
20695 continue;
20696#ifndef NDEBUG
20697 Type *Ty = Scalar->getType();
20698 if (!Ty->isVoidTy()) {
20699 for (User *U : Scalar->users()) {
20700 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
20701
20702 // It is legal to delete users in the ignorelist.
20703 assert((isVectorized(U) ||
20704 (UserIgnoreList && UserIgnoreList->contains(U)) ||
20707 "Deleting out-of-tree value");
20708 }
20709 }
20710#endif
20711 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
20712 auto *I = cast<Instruction>(Scalar);
20713 RemovedInsts.push_back(I);
20714 }
20715 }
20716
20717 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
20718 // new vector instruction.
20719 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
20720 V->mergeDIAssignID(RemovedInsts);
20721
20722 // Clear up reduction references, if any.
20723 if (UserIgnoreList) {
20724 for (Instruction *I : RemovedInsts) {
20725 const TreeEntry *IE = getTreeEntries(I).front();
20726 if (IE->Idx != 0 &&
20727 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
20728 (ValueToGatherNodes.lookup(I).contains(
20729 VectorizableTree.front().get()) ||
20730 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
20731 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
20732 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
20733 IE->UserTreeIndex &&
20734 is_contained(VectorizableTree.front()->Scalars, I)) &&
20735 !(GatheredLoadsEntriesFirst.has_value() &&
20736 IE->Idx >= *GatheredLoadsEntriesFirst &&
20737 VectorizableTree.front()->isGather() &&
20738 is_contained(VectorizableTree.front()->Scalars, I)) &&
20739 !(!VectorizableTree.front()->isGather() &&
20740 VectorizableTree.front()->isCopyableElement(I)))
20741 continue;
20742 SmallVector<SelectInst *> LogicalOpSelects;
20743 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
20744 // Do not replace condition of the logical op in form select <cond>.
20745 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
20746 (match(U.getUser(), m_LogicalAnd()) ||
20747 match(U.getUser(), m_LogicalOr())) &&
20748 U.getOperandNo() == 0;
20749 if (IsPoisoningLogicalOp) {
20750 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
20751 return false;
20752 }
20753 return UserIgnoreList->contains(U.getUser());
20754 });
20755 // Replace conditions of the poisoning logical ops with the non-poison
20756 // constant value.
20757 for (SelectInst *SI : LogicalOpSelects)
20758 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
20759 }
20760 }
20761 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
20762 // cache correctness.
20763 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
20764 // - instructions are not deleted until later.
20765 removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
20766
20767 Builder.ClearInsertionPoint();
20768 InstrElementSize.clear();
20769
20770 const TreeEntry &RootTE = *VectorizableTree.front();
20771 Value *Vec = RootTE.VectorizedValue;
20772 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
20773 It != MinBWs.end() &&
20774 ReductionBitWidth != It->second.first) {
20775 IRBuilder<>::InsertPointGuard Guard(Builder);
20776 Builder.SetInsertPoint(ReductionRoot->getParent(),
20777 ReductionRoot->getIterator());
20778 Vec = Builder.CreateIntCast(
20779 Vec,
20780 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
20781 cast<VectorType>(Vec->getType())->getElementCount()),
20782 It->second.second);
20783 }
20784 return Vec;
20785}
20786
20787 void BoUpSLP::optimizeGatherSequence() {
20788 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
20789 << " gather sequences instructions.\n");
20790 // LICM InsertElementInst sequences.
20791 for (Instruction *I : GatherShuffleExtractSeq) {
20792 if (isDeleted(I))
20793 continue;
20794
20795 // Check if this block is inside a loop.
20796 Loop *L = LI->getLoopFor(I->getParent());
20797 if (!L)
20798 continue;
20799
20800 // Check if it has a preheader.
20801 BasicBlock *PreHeader = L->getLoopPreheader();
20802 if (!PreHeader)
20803 continue;
20804
20805 // If the vector or the element that we insert into it are
20806 // instructions that are defined inside the loop, then we can't
20807 // hoist this instruction.
20808 if (any_of(I->operands(), [L](Value *V) {
20809 auto *OpI = dyn_cast<Instruction>(V);
20810 return OpI && L->contains(OpI);
20811 }))
20812 continue;
20813
20814 // We can hoist this instruction. Move it to the pre-header.
20815 I->moveBefore(PreHeader->getTerminator()->getIterator());
20816 CSEBlocks.insert(PreHeader);
20817 }
20818
20819 // Make a list of all reachable blocks in our CSE queue.
20820 SmallVector<const DomTreeNode *, 8> CSEWorkList;
20821 CSEWorkList.reserve(CSEBlocks.size());
20822 for (BasicBlock *BB : CSEBlocks)
20823 if (DomTreeNode *N = DT->getNode(BB)) {
20824 assert(DT->isReachableFromEntry(N));
20825 CSEWorkList.push_back(N);
20826 }
20827
20828 // Sort blocks by domination. This ensures we visit a block after all blocks
20829 // dominating it are visited.
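// (In the dominator tree DFS numbering, an ancestor always has a smaller
// DFS-in number than the nodes it dominates, which is what the comparator
// below relies on.)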
20830 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
20831 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
20832 "Different nodes should have different DFS numbers");
20833 return A->getDFSNumIn() < B->getDFSNumIn();
20834 });
20835
20836 // Less defined shuffles can be replaced by the more defined copies.
20838 // Between two shuffles, one is less defined if it has the same vector operands
20839 // and its mask indices are the same as in the first one or undefs. E.g.
20839 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
20840 // poison, <0, 0, 0, 0>.
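// For illustration (a hedged example): with identical operands,
//   %a = shufflevector <4 x i32> %0, <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 2, i32 3>
// is less defined than
//   %b = shufflevector <4 x i32> %0, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// so uses of %a can be redirected to %b; NewMask below is the kept shuffle's
// mask with its poison lanes filled in from the mask of the replaced one.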
20841 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
20842 Instruction *I2,
20843 SmallVectorImpl<int> &NewMask) {
20844 if (I1->getType() != I2->getType())
20845 return false;
20846 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
20847 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
20848 if (!SI1 || !SI2)
20849 return I1->isIdenticalTo(I2);
20850 if (SI1->isIdenticalTo(SI2))
20851 return true;
20852 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
20853 if (SI1->getOperand(I) != SI2->getOperand(I))
20854 return false;
20855 // Check if the second instruction is more defined than the first one.
20856 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
20857 ArrayRef<int> SM1 = SI1->getShuffleMask();
20858 // Count trailing undefs in the mask to check the final number of used
20859 // registers.
20860 unsigned LastUndefsCnt = 0;
20861 for (int I = 0, E = NewMask.size(); I < E; ++I) {
20862 if (SM1[I] == PoisonMaskElem)
20863 ++LastUndefsCnt;
20864 else
20865 LastUndefsCnt = 0;
20866 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
20867 NewMask[I] != SM1[I])
20868 return false;
20869 if (NewMask[I] == PoisonMaskElem)
20870 NewMask[I] = SM1[I];
20871 }
20872 // Check if the last undefs actually change the final number of used vector
20873 // registers.
20874 return SM1.size() - LastUndefsCnt > 1 &&
20875 ::getNumberOfParts(*TTI, SI1->getType()) ==
20876 ::getNumberOfParts(
20877 *TTI, getWidenedType(SI1->getType()->getElementType(),
20878 SM1.size() - LastUndefsCnt));
20879 };
20880 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
20881 // instructions. TODO: We can further optimize this scan if we split the
20882 // instructions into different buckets based on the insert lane.
20883 SmallVector<Instruction *> Visited;
20884 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
20885 assert(*I &&
20886 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
20887 "Worklist not sorted properly!");
20888 BasicBlock *BB = (*I)->getBlock();
20889 // For all instructions in blocks containing gather sequences:
20890 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
20891 if (isDeleted(&In))
20892 continue;
20893 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
20894 !GatherShuffleExtractSeq.contains(&In))
20895 continue;
20896
20897 // Check if we can replace this instruction with any of the
20898 // visited instructions.
20899 bool Replaced = false;
20900 for (Instruction *&V : Visited) {
20901 SmallVector<int> NewMask;
20902 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
20903 DT->dominates(V->getParent(), In.getParent())) {
20904 In.replaceAllUsesWith(V);
20905 eraseInstruction(&In);
20906 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
20907 if (!NewMask.empty())
20908 SI->setShuffleMask(NewMask);
20909 Replaced = true;
20910 break;
20911 }
20912 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
20913 GatherShuffleExtractSeq.contains(V) &&
20914 IsIdenticalOrLessDefined(V, &In, NewMask) &&
20915 DT->dominates(In.getParent(), V->getParent())) {
20916 In.moveAfter(V);
20917 V->replaceAllUsesWith(&In);
20918 eraseInstruction(V);
20919 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
20920 if (!NewMask.empty())
20921 SI->setShuffleMask(NewMask);
20922 V = &In;
20923 Replaced = true;
20924 break;
20925 }
20926 }
20927 if (!Replaced) {
20928 assert(!is_contained(Visited, &In));
20929 Visited.push_back(&In);
20930 }
20931 }
20932 }
20933 CSEBlocks.clear();
20934 GatherShuffleExtractSeq.clear();
20935}
20936
20937BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
20938 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
20939 auto &BundlePtr =
20940 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
20941 for (Value *V : VL) {
20942 if (S.isNonSchedulable(V))
20943 continue;
20944 auto *I = cast<Instruction>(V);
20945 if (S.isCopyableElement(V)) {
20946 // Add a copyable element model.
20947 ScheduleCopyableData &SD =
20948 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
20949 // Group the instructions to a bundle.
20950 BundlePtr->add(&SD);
20951 continue;
20952 }
20953 ScheduleData *BundleMember = getScheduleData(V);
20954 assert(BundleMember && "no ScheduleData for bundle member "
20955 "(maybe not in same basic block)");
20956 // Group the instructions to a bundle.
20957 BundlePtr->add(BundleMember);
20958 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
20959 BundlePtr.get());
20960 }
20961 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
20962 return *BundlePtr;
20963}
20964
20965 // Groups the instructions into a bundle (which is then a single scheduling entity)
20966// and schedules instructions until the bundle gets ready.
20967std::optional<BoUpSLP::ScheduleBundle *>
20968BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
20969 const InstructionsState &S,
20970 const EdgeInfo &EI) {
20971 // No need to schedule PHIs, insertelement, extractelement and extractvalue
20972 // instructions.
20973 if (isa<PHINode>(S.getMainOp()) ||
20974 isVectorLikeInstWithConstOps(S.getMainOp()))
20975 return nullptr;
20976 // If the parent node is non-schedulable and the current node is copyable, and
20977 // any of the parent's instructions are used across several basic blocks or in
20978 // a bin-op node, cancel scheduling: it may cause wrong def-use deps in the
20979 // analysis, leading to a crash.
20980 // Non-scheduled nodes may not have a related ScheduleData model, which may
20981 // lead to skipped dep analysis.
20982 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
20983 EI.UserTE->doesNotNeedToSchedule() &&
20984 EI.UserTE->getOpcode() != Instruction::PHI &&
20985 any_of(EI.UserTE->Scalars, [](Value *V) {
20986 auto *I = dyn_cast<Instruction>(V);
20987 if (!I || I->hasOneUser())
20988 return false;
20989 for (User *U : I->users()) {
20990 auto *UI = cast<Instruction>(U);
20991 if (isa<BinaryOperator>(UI))
20992 return true;
20993 }
20994 return false;
20995 }))
20996 return std::nullopt;
20997 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
20998 EI.UserTE->hasCopyableElements() &&
20999 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
21000 all_of(VL, [&](Value *V) {
21001 if (S.isCopyableElement(V))
21002 return true;
21003 return isUsedOutsideBlock(V);
21004 }))
21005 return std::nullopt;
21006 bool HasCopyables = S.areInstructionsWithCopyableElements();
21007 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
21008 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
21009 // If all operands were replaced by copyables, the operands of this node
21010 // might not be, so we need to recalculate dependencies for the schedule
21011 // data replaced by copyable schedule data.
21012 SmallVector<ScheduleData *> ControlDependentMembers;
21013 for (Value *V : VL) {
21014 auto *I = dyn_cast<Instruction>(V);
21015 if (!I || (HasCopyables && S.isCopyableElement(V)))
21016 continue;
21017 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21018 for (const Use &U : I->operands()) {
21019 unsigned &NumOps =
21020 UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
21021 .first->getSecond();
21022 ++NumOps;
21023 if (auto *Op = dyn_cast<Instruction>(U.get());
21024 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
21025 if (ScheduleData *OpSD = getScheduleData(Op);
21026 OpSD && OpSD->hasValidDependencies()) {
21027 OpSD->clearDirectDependencies();
21028 if (RegionHasStackSave ||
21029 !isGuaranteedToTransferExecutionToSuccessor(Op))
21030 ControlDependentMembers.push_back(OpSD);
21031 }
21032 }
21033 }
21034 }
21035 if (!ControlDependentMembers.empty()) {
21036 ScheduleBundle Invalid = ScheduleBundle::invalid();
21037 calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
21038 ControlDependentMembers);
21039 }
21040 return nullptr;
21041 }
21042
21043 // Initialize the instruction bundle.
21044 Instruction *OldScheduleEnd = ScheduleEnd;
21045 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
21046
21047 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
21048 // Clear deps or recalculate the region, if the memory instruction is a
21049 // copyable. It may have memory deps, which must be recalculated.
21050 SmallVector<ScheduleData *> ControlDependentMembers;
21051 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
21052 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21053 for (ScheduleEntity *SE : Bundle.getBundle()) {
21054 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
21055 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
21056 BundleMember && BundleMember->hasValidDependencies()) {
21057 BundleMember->clearDirectDependencies();
21058 if (RegionHasStackSave ||
21059 !isGuaranteedToTransferExecutionToSuccessor(
21060 BundleMember->getInst()))
21061 ControlDependentMembers.push_back(BundleMember);
21062 }
21063 continue;
21064 }
21065 auto *SD = cast<ScheduleData>(SE);
21066 if (SD->hasValidDependencies() &&
21067 (!S.areInstructionsWithCopyableElements() ||
21068 !S.isCopyableElement(SD->getInst())) &&
21069 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
21070 EI.UserTE->hasState() &&
21071 (!EI.UserTE->hasCopyableElements() ||
21072 !EI.UserTE->isCopyableElement(SD->getInst())))
21073 SD->clearDirectDependencies();
21074 for (const Use &U : SD->getInst()->operands()) {
21075 unsigned &NumOps =
21076 UserOpToNumOps
21077 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
21078 .first->getSecond();
21079 ++NumOps;
21080 if (auto *Op = dyn_cast<Instruction>(U.get());
21081 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
21082 *SLP, NumOps)) {
21083 if (ScheduleData *OpSD = getScheduleData(Op);
21084 OpSD && OpSD->hasValidDependencies()) {
21085 OpSD->clearDirectDependencies();
21086 if (RegionHasStackSave ||
21087 !isGuaranteedToTransferExecutionToSuccessor(Op))
21088 ControlDependentMembers.push_back(OpSD);
21089 }
21090 }
21091 }
21092 }
21093 };
21094 // The scheduling region got new instructions at the lower end (or it is a
21095 // new region for the first bundle). This makes it necessary to
21096 // recalculate all dependencies.
21097 // It is seldom that this needs to be done a second time after adding the
21098 // initial bundle to the region.
21099 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
21100 for_each(ScheduleDataMap, [&](auto &P) {
21101 if (BB != P.first->getParent())
21102 return;
21103 ScheduleData *SD = P.second;
21104 if (isInSchedulingRegion(*SD))
21105 SD->clearDependencies();
21106 });
21107 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21108 for_each(P.second, [&](ScheduleCopyableData *SD) {
21109 if (isInSchedulingRegion(*SD))
21110 SD->clearDependencies();
21111 });
21112 });
21113 ReSchedule = true;
21114 }
21115 // Check if the bundle data has deps for copyable elements already. In
21116 // this case we need to reset the deps and recalculate them.
21117 if (Bundle && !Bundle.getBundle().empty()) {
21118 if (S.areInstructionsWithCopyableElements() ||
21119 !ScheduleCopyableDataMap.empty())
21120 CheckIfNeedToClearDeps(Bundle);
21121 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
21122 << BB->getName() << "\n");
21123 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
21124 ControlDependentMembers);
21125 } else if (!ControlDependentMembers.empty()) {
21126 ScheduleBundle Invalid = ScheduleBundle::invalid();
21127 calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
21128 ControlDependentMembers);
21129 }
21130
21131 if (ReSchedule) {
21132 resetSchedule();
21133 initialFillReadyList(ReadyInsts);
21134 }
21135
21136 // Now try to schedule the new bundle or (if no bundle) just calculate
21137 // dependencies. As soon as the bundle is "ready" it means that there are no
21138 // cyclic dependencies and we can schedule it. Note that it's important that
21139 // we don't "schedule" the bundle yet.
21140 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
21141 !ReadyInsts.empty()) {
21142 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
21143 assert(Picked->isReady() && "must be ready to schedule");
21144 schedule(*SLP, S, EI, Picked, ReadyInsts);
21145 if (Picked == &Bundle)
21146 break;
21147 }
21148 };
21149
21150 // Make sure that the scheduling region contains all
21151 // instructions of the bundle.
21152 for (Value *V : VL) {
21153 if (S.isNonSchedulable(V))
21154 continue;
21155 if (!extendSchedulingRegion(V, S)) {
21156 // The scheduling region got new instructions at the lower end (or it
21157 // is a new region for the first bundle), which makes it necessary to
21158 // recalculate all dependencies.
21159 // Otherwise the compiler may crash trying to incorrectly calculate
21160 // dependencies and emit instructions in the wrong order during the actual
21161 // scheduling.
21162 ScheduleBundle Invalid = ScheduleBundle::invalid();
21163 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
21164 return std::nullopt;
21165 }
21166 }
21167
21168 bool ReSchedule = false;
21169 for (Value *V : VL) {
21170 if (S.isNonSchedulable(V))
21171 continue;
21172 SmallVector<ScheduleCopyableData *> CopyableData =
21173 getScheduleCopyableData(cast<Instruction>(V));
21174 if (!CopyableData.empty()) {
21175 for (ScheduleCopyableData *SD : CopyableData)
21176 ReadyInsts.remove(SD);
21177 }
21178 ScheduleData *BundleMember = getScheduleData(V);
21179 assert((BundleMember || S.isCopyableElement(V)) &&
21180 "no ScheduleData for bundle member (maybe not in same basic block)");
21181 if (!BundleMember)
21182 continue;
21183
21184 // Make sure we don't leave the pieces of the bundle in the ready list when
21185 // the whole bundle might not be ready.
21186 ReadyInsts.remove(BundleMember);
21187 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
21188 !Bundles.empty()) {
21189 for (ScheduleBundle *B : Bundles)
21190 ReadyInsts.remove(B);
21191 }
21192
21193 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21194 continue;
21195 // A bundle member was scheduled as a single instruction before and now
21196 // needs to be scheduled as part of the bundle. We just get rid of the
21197 // existing schedule.
21198 // A bundle member may have had its deps calculated before it became a
21199 // copyable element - we need to reschedule.
21200 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
21201 << " was already scheduled\n");
21202 ReSchedule = true;
21203 }
21204
21205 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21206 TryScheduleBundleImpl(ReSchedule, Bundle);
21207 if (!Bundle.isReady()) {
21208 for (ScheduleEntity *BD : Bundle.getBundle()) {
21209 // Copyable data scheduling is just removed.
21210 if (isa<ScheduleCopyableData>(BD))
21211 continue;
21212 if (BD->isReady()) {
21213 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
21214 if (Bundles.empty()) {
21215 ReadyInsts.insert(BD);
21216 continue;
21217 }
21218 for (ScheduleBundle *B : Bundles)
21219 if (B->isReady())
21220 ReadyInsts.insert(B);
21221 }
21222 }
21223 ScheduledBundlesList.pop_back();
21224 SmallVector<ScheduleData *> ControlDependentMembers;
21225 for (Value *V : VL) {
21226 if (S.isNonSchedulable(V))
21227 continue;
21228 auto *I = cast<Instruction>(V);
21229 if (S.isCopyableElement(I)) {
21230 // Remove the copyable data from the scheduling region and restore
21231 // previous mappings.
21232 auto KV = std::make_pair(EI, I);
21233 assert(ScheduleCopyableDataMap.contains(KV) &&
21234 "no ScheduleCopyableData for copyable element");
21235 ScheduleCopyableData *SD =
21236 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
21237 ScheduleCopyableDataMapByUsers[I].remove(SD);
21238 if (EI.UserTE) {
21239 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21240 const auto *It = find(Op, I);
21241 assert(It != Op.end() && "Lane not set");
21242 SmallPtrSet<Instruction *, 4> Visited;
21243 do {
21244 int Lane = std::distance(Op.begin(), It);
21245 assert(Lane >= 0 && "Lane not set");
21246 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21247 !EI.UserTE->ReorderIndices.empty())
21248 Lane = EI.UserTE->ReorderIndices[Lane];
21249 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21250 "Couldn't find extract lane");
21251 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21252 if (!Visited.insert(In).second) {
21253 It = find(make_range(std::next(It), Op.end()), I);
21254 break;
21255 }
21256 ScheduleCopyableDataMapByInstUser
21257 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
21258 .pop_back();
21259 It = find(make_range(std::next(It), Op.end()), I);
21260 } while (It != Op.end());
21261 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
21262 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
21263 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
21264 }
21265 if (ScheduleCopyableDataMapByUsers[I].empty())
21266 ScheduleCopyableDataMapByUsers.erase(I);
21267 ScheduleCopyableDataMap.erase(KV);
21268 // Need to recalculate dependencies for the actual schedule data.
21269 if (ScheduleData *OpSD = getScheduleData(I);
21270 OpSD && OpSD->hasValidDependencies()) {
21271 OpSD->clearDirectDependencies();
21272 if (RegionHasStackSave ||
21273 !isGuaranteedToTransferExecutionToSuccessor(I))
21274 ControlDependentMembers.push_back(OpSD);
21275 }
21276 continue;
21277 }
21278 ScheduledBundles.find(I)->getSecond().pop_back();
21279 }
21280 if (!ControlDependentMembers.empty()) {
21281 ScheduleBundle Invalid = ScheduleBundle::invalid();
21282 calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
21283 ControlDependentMembers);
21284 }
21285 return std::nullopt;
21286 }
21287 return &Bundle;
21288}
21289
21290BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
21291 // Allocate a new ScheduleData for the instruction.
21292 if (ChunkPos >= ChunkSize) {
21293 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
21294 ChunkPos = 0;
21295 }
21296 return &(ScheduleDataChunks.back()[ChunkPos++]);
21297}
21298
21299bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
21300 Value *V, const InstructionsState &S) {
21301 auto *I = dyn_cast<Instruction>(V);
21302 assert(I && "bundle member must be an instruction");
21303 if (getScheduleData(I))
21304 return true;
21305 if (!ScheduleStart) {
21306 // It's the first instruction in the new region.
21307 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
21308 ScheduleStart = I;
21309 ScheduleEnd = I->getNextNode();
21310 assert(ScheduleEnd && "tried to vectorize a terminator?");
21311 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
21312 return true;
21313 }
21314 // Search up and down at the same time, because we don't know if the new
21315 // instruction is above or below the existing scheduling region.
21316 // Ignore debug info (and other "AssumeLike" intrinsics) so they're not counted
21317 // against the budget. Otherwise debug info could affect codegen.
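// For illustration (a hedged sketch): if the region currently spans
// [ScheduleStart, ScheduleEnd) and the new bundle member sits a few
// instructions above ScheduleStart, UpIter reaches it first and the region
// start is extended upwards to it; each step of the walk (in either
// direction) counts once against ScheduleRegionSizeLimit.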
21318 BasicBlock::reverse_iterator UpIter =
21319 ++ScheduleStart->getIterator().getReverse();
21320 BasicBlock::reverse_iterator UpperEnd = BB->rend();
21321 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
21322 BasicBlock::iterator LowerEnd = BB->end();
21323 auto IsAssumeLikeIntr = [](const Instruction &I) {
21324 if (auto *II = dyn_cast<IntrinsicInst>(&I))
21325 return II->isAssumeLikeIntrinsic();
21326 return false;
21327 };
21328 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21329 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21330 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
21331 &*DownIter != I) {
21332 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
21333 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
21334 return false;
21335 }
21336
21337 ++UpIter;
21338 ++DownIter;
21339
21340 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
21341 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
21342 }
21343 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
21344 assert(I->getParent() == ScheduleStart->getParent() &&
21345 "Instruction is in wrong basic block.");
21346 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
21347 ScheduleStart = I;
21348 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
21349 << "\n");
21350 return true;
21351 }
21352 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
21353 "Expected to reach top of the basic block or instruction down the "
21354 "lower end.");
21355 assert(I->getParent() == ScheduleEnd->getParent() &&
21356 "Instruction is in wrong basic block.");
21357 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
21358 nullptr);
21359 ScheduleEnd = I->getNextNode();
21360 assert(ScheduleEnd && "tried to vectorize a terminator?");
21361 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
21362 return true;
21363}
21364
21365void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
21366 Instruction *ToI,
21367 ScheduleData *PrevLoadStore,
21368 ScheduleData *NextLoadStore) {
21369 ScheduleData *CurrentLoadStore = PrevLoadStore;
21370 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
21371 // No need to allocate data for non-schedulable instructions.
21372 if (isa<PHINode>(I))
21373 continue;
21374 ScheduleData *SD = ScheduleDataMap.lookup(I);
21375 if (!SD) {
21376 SD = allocateScheduleDataChunks();
21377 ScheduleDataMap[I] = SD;
21378 }
21379 assert(!isInSchedulingRegion(*SD) &&
21380 "new ScheduleData already in scheduling region");
21381 SD->init(SchedulingRegionID, I);
21382
21383 if (I->mayReadOrWriteMemory() &&
21384 (!isa<IntrinsicInst>(I) ||
21385 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
21386 cast<IntrinsicInst>(I)->getIntrinsicID() !=
21387 Intrinsic::pseudoprobe))) {
21388 // Update the linked list of memory accessing instructions.
21389 if (CurrentLoadStore) {
21390 CurrentLoadStore->setNextLoadStore(SD);
21391 } else {
21392 FirstLoadStoreInRegion = SD;
21393 }
21394 CurrentLoadStore = SD;
21395 }
21396
21397 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21398 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21399 RegionHasStackSave = true;
21400 }
21401 if (NextLoadStore) {
21402 if (CurrentLoadStore)
21403 CurrentLoadStore->setNextLoadStore(NextLoadStore);
21404 } else {
21405 LastLoadStoreInRegion = CurrentLoadStore;
21406 }
21407}
21408
21409void BoUpSLP::BlockScheduling::calculateDependencies(
21410 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
21411 ArrayRef<ScheduleData *> ControlDeps) {
21412 SmallVector<ScheduleEntity *> WorkList;
21413 auto ProcessNode = [&](ScheduleEntity *SE) {
21414 if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
21415 if (CD->hasValidDependencies())
21416 return;
21417 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
21418 CD->initDependencies();
21419 CD->resetUnscheduledDeps();
21420 const EdgeInfo &EI = CD->getEdgeInfo();
21421 if (EI.UserTE) {
21422 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
21423 const auto *It = find(Op, CD->getInst());
21424 assert(It != Op.end() && "Lane not set");
21425 SmallPtrSet<Instruction *, 4> Visited;
21426 do {
21427 int Lane = std::distance(Op.begin(), It);
21428 assert(Lane >= 0 && "Lane not set");
21429 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
21430 !EI.UserTE->ReorderIndices.empty())
21431 Lane = EI.UserTE->ReorderIndices[Lane];
21432 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
21433 "Couldn't find extract lane");
21434 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
21435 if (EI.UserTE->isCopyableElement(In)) {
21436 // We may not have related copyable scheduling data if the
21437 // instruction is non-schedulable.
21438 if (ScheduleCopyableData *UseSD =
21439 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
21440 CD->incDependencies();
21441 if (!UseSD->isScheduled())
21442 CD->incrementUnscheduledDeps(1);
21443 if (!UseSD->hasValidDependencies() ||
21444 (InsertInReadyList && UseSD->isReady()))
21445 WorkList.push_back(UseSD);
21446 }
21447 } else if (Visited.insert(In).second) {
21448 if (ScheduleData *UseSD = getScheduleData(In)) {
21449 CD->incDependencies();
21450 if (!UseSD->isScheduled())
21451 CD->incrementUnscheduledDeps(1);
21452 if (!UseSD->hasValidDependencies() ||
21453 (InsertInReadyList && UseSD->isReady()))
21454 WorkList.push_back(UseSD);
21455 }
21456 }
21457 It = find(make_range(std::next(It), Op.end()), CD->getInst());
21458 } while (It != Op.end());
21459 if (CD->isReady() && CD->getDependencies() == 0 &&
21460 (EI.UserTE->hasState() &&
21461 (EI.UserTE->getMainOp()->getParent() !=
21462 CD->getInst()->getParent() ||
21463 (isa<PHINode>(EI.UserTE->getMainOp()) &&
21464 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
21465 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
21466 auto *IU = dyn_cast<Instruction>(U);
21467 if (!IU)
21468 return true;
21469 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
21470 })))))) {
21471 // If there are no uses in the block, mark it as having a pseudo-use,
21472 // which cannot be scheduled.
21473 // This prevents incorrect def-use tracking between an external user and
21474 // the actual instruction.
21475 CD->incDependencies();
21476 CD->incrementUnscheduledDeps(1);
21477 }
21478 }
21479 return;
21480 }
21481 auto *BundleMember = cast<ScheduleData>(SE);
21482 if (BundleMember->hasValidDependencies())
21483 return;
21484 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
21485 BundleMember->initDependencies();
21486 BundleMember->resetUnscheduledDeps();
21487 // Handle def-use chain dependencies.
21488 SmallDenseMap<Value *, unsigned> UserToNumOps;
21489 for (User *U : BundleMember->getInst()->users()) {
21490 if (isa<PHINode>(U))
21491 continue;
21492 if (ScheduleData *UseSD = getScheduleData(U)) {
21493 // The operand is a copyable element - skip.
21494 unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
21495 ++NumOps;
21496 if (areAllOperandsReplacedByCopyableData(
21497 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
21498 continue;
21499 BundleMember->incDependencies();
21500 if (!UseSD->isScheduled())
21501 BundleMember->incrementUnscheduledDeps(1);
21502 if (!UseSD->hasValidDependencies() ||
21503 (InsertInReadyList && UseSD->isReady()))
21504 WorkList.push_back(UseSD);
21505 }
21506 }
21507 for (ScheduleCopyableData *UseSD :
21508 getScheduleCopyableDataUsers(BundleMember->getInst())) {
21509 BundleMember->incDependencies();
21510 if (!UseSD->isScheduled())
21511 BundleMember->incrementUnscheduledDeps(1);
21512 if (!UseSD->hasValidDependencies() ||
21513 (InsertInReadyList && UseSD->isReady()))
21514 WorkList.push_back(UseSD);
21515 }
21516
21517 SmallPtrSet<const Instruction *, 4> Visited;
21518 auto MakeControlDependent = [&](Instruction *I) {
21519 // Do not mark control dependent twice.
21520 if (!Visited.insert(I).second)
21521 return;
21522 auto *DepDest = getScheduleData(I);
21523 assert(DepDest && "must be in schedule window");
21524 DepDest->addControlDependency(BundleMember);
21525 BundleMember->incDependencies();
21526 if (!DepDest->isScheduled())
21527 BundleMember->incrementUnscheduledDeps(1);
21528 if (!DepDest->hasValidDependencies() ||
21529 (InsertInReadyList && DepDest->isReady()))
21530 WorkList.push_back(DepDest);
21531 };
21532
21533 // Any instruction which isn't safe to speculate at the beginning of the
21534 // block is control dependent on any early exit or non-willreturn call
21535 // which precedes it.
21536 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
21537 for (Instruction *I = BundleMember->getInst()->getNextNode();
21538 I != ScheduleEnd; I = I->getNextNode()) {
21539 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
21540 continue;
21541
21542 // Add the dependency
21543 MakeControlDependent(I);
21544
21546 // Everything past here must be control dependent on I.
21547 break;
21548 }
21549 }
21550
21551 if (RegionHasStackSave) {
21552 // If we have an inalloca alloca instruction, it needs to be scheduled
21553 // after any preceding stacksave. We also need to prevent any alloca
21554 // from reordering above a preceding stackrestore.
21555 if (match(BundleMember->getInst(), m_Intrinsic<Intrinsic::stacksave>()) ||
21556 match(BundleMember->getInst(),
21557 m_Intrinsic<Intrinsic::stackrestore>())) {
21558 for (Instruction *I = BundleMember->getInst()->getNextNode();
21559 I != ScheduleEnd; I = I->getNextNode()) {
21560 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
21561 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21562 // Any allocas past here must be control dependent on I, and I
21563 // must be memory dependent on BundleMember->Inst.
21564 break;
21565
21566 if (!isa<AllocaInst>(I))
21567 continue;
21568
21569 // Add the dependency
21570 MakeControlDependent(I);
21571 }
21572 }
21573
21574 // In addition to the cases handled just above, we need to prevent
21575 // allocas and loads/stores from moving below a stacksave or a
21576 // stackrestore. Avoiding moving allocas below a stackrestore is currently
21577 // thought to be conservatism. Moving loads/stores below a stackrestore
21578 // can lead to incorrect code.
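// For example (hedged reasoning): a load or store moved below a stackrestore
// could end up accessing stack memory that the restore has already
// deallocated (e.g. an alloca created after the matching stacksave).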
21579 if (isa<AllocaInst>(BundleMember->getInst()) ||
21580 BundleMember->getInst()->mayReadOrWriteMemory()) {
21581 for (Instruction *I = BundleMember->getInst()->getNextNode();
21582 I != ScheduleEnd; I = I->getNextNode()) {
21583 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
21584 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
21585 continue;
21586
21587 // Add the dependency
21588 MakeControlDependent(I);
21589 break;
21590 }
21591 }
21592 }
21593
21594 // Handle the memory dependencies (if any).
21595 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
21596 if (!NextLoadStore)
21597 return;
21598 Instruction *SrcInst = BundleMember->getInst();
21599 assert(SrcInst->mayReadOrWriteMemory() &&
21600 "NextLoadStore list for non-memory-affecting bundle?");
21601 MemoryLocation SrcLoc = getLocation(SrcInst);
21602 bool SrcMayWrite = SrcInst->mayWriteToMemory();
21603 unsigned NumAliased = 0;
21604 unsigned DistToSrc = 1;
21605 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
21606
21607 for (ScheduleData *DepDest = NextLoadStore; DepDest;
21608 DepDest = DepDest->getNextLoadStore()) {
21609 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
21610
21611 // We have two limits to reduce the complexity:
21612 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
21613 // SLP->isAliased (which is the expensive part in this loop).
21614 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
21615 // the whole loop (even if the loop is fast, it's quadratic).
21616 // It's important for the loop break condition (see below) to
21617 // check this limit even between two read-only instructions.
21618 if (DistToSrc >= MaxMemDepDistance ||
21619 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
21620 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
21621 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
21622
21623 // We increment the counter only if the locations are aliased
21624 // (instead of counting all alias checks). This gives a better
21625 // balance between reduced runtime and accurate dependencies.
21626 NumAliased++;
21627
21628 DepDest->addMemoryDependency(BundleMember);
21629 BundleMember->incDependencies();
21630 if (!DepDest->isScheduled())
21631 BundleMember->incrementUnscheduledDeps(1);
21632 if (!DepDest->hasValidDependencies() ||
21633 (InsertInReadyList && DepDest->isReady()))
21634 WorkList.push_back(DepDest);
21635 }
21636
21637 // Example, explaining the loop break condition: Let's assume our
21638 // starting instruction is i0 and MaxMemDepDistance = 3.
21639 //
21640 // +--------v--v--v
21641 // i0,i1,i2,i3,i4,i5,i6,i7,i8
21642 // +--------^--^--^
21643 //
21644 // MaxMemDepDistance let us stop alias-checking at i3 and we add
21645 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
21646 // Previously we already added dependencies from i3 to i6,i7,i8
21647 // (because of MaxMemDepDistance). As we added a dependency from
21648 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
21649 // and we can abort this loop at i6.
21650 if (DistToSrc >= 2 * MaxMemDepDistance)
21651 break;
21652 DistToSrc++;
21653 }
21654 };
21655
21656 assert((Bundle || !ControlDeps.empty()) &&
21657 "expected at least one instruction to schedule");
21658 if (Bundle)
21659 WorkList.push_back(Bundle.getBundle().front());
21660 WorkList.append(ControlDeps.begin(), ControlDeps.end());
21661 SmallPtrSet<ScheduleBundle *, 16> Visited;
21662 while (!WorkList.empty()) {
21663 ScheduleEntity *SD = WorkList.pop_back_val();
21664 SmallVector<ScheduleBundle *, 1> CopyableBundle;
21665 ArrayRef<ScheduleBundle *> Bundles;
21666 if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
21667 CopyableBundle.push_back(&CD->getBundle());
21668 Bundles = CopyableBundle;
21669 } else {
21670 Bundles = getScheduleBundles(SD->getInst());
21671 }
21672 if (Bundles.empty()) {
21673 if (!SD->hasValidDependencies())
21674 ProcessNode(SD);
21675 if (InsertInReadyList && SD->isReady()) {
21676 ReadyInsts.insert(SD);
21677 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
21678 }
21679 continue;
21680 }
21681 for (ScheduleBundle *Bundle : Bundles) {
21682 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
21683 continue;
21684 assert(isInSchedulingRegion(*Bundle) &&
21685 "ScheduleData not in scheduling region");
21686 for_each(Bundle->getBundle(), ProcessNode);
21687 }
21688 if (InsertInReadyList && SD->isReady()) {
21689 for (ScheduleBundle *Bundle : Bundles) {
21690 assert(isInSchedulingRegion(*Bundle) &&
21691 "ScheduleData not in scheduling region");
21692 if (!Bundle->isReady())
21693 continue;
21694 ReadyInsts.insert(Bundle);
21695 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
21696 << "\n");
21697 }
21698 }
21699 }
21700}
21701
21702void BoUpSLP::BlockScheduling::resetSchedule() {
21703 assert(ScheduleStart &&
21704 "tried to reset schedule on block which has not been scheduled");
21705 for_each(ScheduleDataMap, [&](auto &P) {
21706 if (BB != P.first->getParent())
21707 return;
21708 ScheduleData *SD = P.second;
21709 if (isInSchedulingRegion(*SD)) {
21710 SD->setScheduled(/*Scheduled=*/false);
21711 SD->resetUnscheduledDeps();
21712 }
21713 });
21714 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21715 for_each(P.second, [&](ScheduleCopyableData *SD) {
21716 if (isInSchedulingRegion(*SD)) {
21717 SD->setScheduled(/*Scheduled=*/false);
21718 SD->resetUnscheduledDeps();
21719 }
21720 });
21721 });
21722 for_each(ScheduledBundles, [&](auto &P) {
21723 for_each(P.second, [&](ScheduleBundle *Bundle) {
21724 if (isInSchedulingRegion(*Bundle))
21725 Bundle->setScheduled(/*Scheduled=*/false);
21726 });
21727 });
21728 // Reset schedule data for copyable elements.
21729 for (auto &P : ScheduleCopyableDataMap) {
21730 if (isInSchedulingRegion(*P.second)) {
21731 P.second->setScheduled(/*Scheduled=*/false);
21732 P.second->resetUnscheduledDeps();
21733 }
21734 }
21735 ReadyInsts.clear();
21736}
21737
21738void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
21739 if (!BS->ScheduleStart)
21740 return;
21741
21742 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
21743
21744 // A key point - if we got here, pre-scheduling was able to find a valid
21745 // scheduling of the sub-graph of the scheduling window which consists
21746 // of all vector bundles and their transitive users. As such, we do not
21747 // need to reschedule anything *outside of* that subgraph.
21748
21749 BS->resetSchedule();
21750
21751 // For the real scheduling we use a more sophisticated ready-list: it is
21752 // sorted by the original instruction location. This lets the final schedule
21753 // be as close as possible to the original instruction order.
21754 // WARNING: If changing this order causes a correctness issue, that means
21755 // there is some missing dependence edge in the schedule data graph.
21756 struct ScheduleDataCompare {
21757 bool operator()(const ScheduleEntity *SD1,
21758 const ScheduleEntity *SD2) const {
21759 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
21760 }
21761 };
21762 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
21763
21764 // Ensure that all dependency data is updated (for nodes in the sub-graph)
21765 // and fill the ready-list with initial instructions.
21766 int Idx = 0;
21767 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21768 I = I->getNextNode()) {
21769 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21770 if (!Bundles.empty()) {
21771 for (ScheduleBundle *Bundle : Bundles) {
21772 Bundle->setSchedulingPriority(Idx++);
21773 if (!Bundle->hasValidDependencies())
21774 BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
21775 }
21776 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
21777 for (ScheduleCopyableData *SD : reverse(SDs)) {
21778 ScheduleBundle &Bundle = SD->getBundle();
21779 Bundle.setSchedulingPriority(Idx++);
21780 if (!Bundle.hasValidDependencies())
21781 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21782 }
21783 continue;
21784 }
21785 auto CopyableData =
21786 BS->getScheduleCopyableDataUsers(I);
21787 if (ScheduleData *SD = BS->getScheduleData(I)) {
21788 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
21789 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
21790 SDTEs.front()->doesNotNeedToSchedule() ||
21791 doesNotNeedToBeScheduled(SD->getInst())) &&
21792 "scheduler and vectorizer bundle mismatch");
21793 SD->setSchedulingPriority(Idx++);
21794 if (!SD->hasValidDependencies() &&
21795 (!CopyableData.empty() ||
21796 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
21797 assert(TE->isGather() && "expected gather node");
21798 return TE->hasState() && TE->hasCopyableElements() &&
21799 TE->isCopyableElement(I);
21800 }))) {
21801 // Need to calculate deps for these nodes to correctly handle copyable
21802 // dependencies, even if they were cancelled.
21803 // If the copyable bundle was cancelled, the deps are cleared and we need
21804 // to recalculate them.
21805 ScheduleBundle Bundle;
21806 Bundle.add(SD);
21807 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21808 }
21809 }
21810 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
21811 ScheduleBundle &Bundle = SD->getBundle();
21812 Bundle.setSchedulingPriority(Idx++);
21813 if (!Bundle.hasValidDependencies())
21814 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
21815 }
21816 }
21817 BS->initialFillReadyList(ReadyInsts);
21818
21819 Instruction *LastScheduledInst = BS->ScheduleEnd;
21820
21821 // Do the "real" scheduling.
21822 SmallPtrSet<Instruction *, 16> Scheduled;
21823 while (!ReadyInsts.empty()) {
21824 auto *Picked = *ReadyInsts.begin();
21825 ReadyInsts.erase(ReadyInsts.begin());
21826
21827 // Move the scheduled instruction(s) to their dedicated places, if not
21828 // there yet.
21829 if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
21830 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
21831 Instruction *PickedInst = BundleMember->getInst();
21832 // If a copyable must be scheduled as part of something else, skip it.
21833 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
21834 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
21835 (!IsCopyable && !Scheduled.insert(PickedInst).second))
21836 continue;
21837 if (PickedInst->getNextNode() != LastScheduledInst)
21838 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21839 LastScheduledInst = PickedInst;
21840 }
21841 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
21842 LastScheduledInst);
21843 } else {
21844 auto *SD = cast<ScheduleData>(Picked);
21845 Instruction *PickedInst = SD->getInst();
21846 if (PickedInst->getNextNode() != LastScheduledInst)
21847 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
21848 LastScheduledInst = PickedInst;
21849 }
21850 auto Invalid = InstructionsState::invalid();
21851 BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
21852 }
21853
21854 // Check that we didn't break any of our invariants.
21855#ifdef EXPENSIVE_CHECKS
21856 BS->verify();
21857#endif
21858
21859#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
21860 // Check that all schedulable entities got scheduled
21861 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
21862 I = I->getNextNode()) {
21863 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
21864 assert(all_of(Bundles,
21865 [](const ScheduleBundle *Bundle) {
21866 return Bundle->isScheduled();
21867 }) &&
21868 "must be scheduled at this point");
21869 }
21870#endif
21871
21872 // Avoid duplicate scheduling of the block.
21873 BS->ScheduleStart = nullptr;
21874}
21875
21876 unsigned BoUpSLP::getVectorElementSize(Value *V) {
21877 // If V is a store, just return the width of the stored value (or value
21878 // truncated just before storing) without traversing the expression tree.
21879 // This is the common case.
21880 if (auto *Store = dyn_cast<StoreInst>(V))
21881 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
21882
21883 if (auto *IEI = dyn_cast<InsertElementInst>(V))
21884 return getVectorElementSize(IEI->getOperand(1));
21885
21886 auto E = InstrElementSize.find(V);
21887 if (E != InstrElementSize.end())
21888 return E->second;
21889
21890 // If V is not a store, we can traverse the expression tree to find loads
21891 // that feed it. The type of the loaded value may indicate a more suitable
21892 // width than V's type. We want to base the vector element size on the width
21893 // of memory operations where possible.
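// For illustration (a hedged example): for an i32 add fed (within the same
// block) by zero-extended i8 loads, the traversal below finds the i8 loads
// and returns a width of 8 rather than 32.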
21894 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
21895 SmallPtrSet<Instruction *, 16> Visited;
21896 if (auto *I = dyn_cast<Instruction>(V)) {
21897 Worklist.emplace_back(I, I->getParent(), 0);
21898 Visited.insert(I);
21899 }
21900
21901 // Traverse the expression tree in bottom-up order looking for loads. If we
21902 // encounter an instruction we don't yet handle, we give up.
21903 auto Width = 0u;
21904 Value *FirstNonBool = nullptr;
21905 while (!Worklist.empty()) {
21906 auto [I, Parent, Level] = Worklist.pop_back_val();
21907
21908 // We should only be looking at scalar instructions here. If the current
21909 // instruction has a vector type, skip.
21910 auto *Ty = I->getType();
21911 if (isa<VectorType>(Ty))
21912 continue;
21913 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
21914 FirstNonBool = I;
21915 if (Level > RecursionMaxDepth)
21916 continue;
21917
21918 // If the current instruction is a load, update MaxWidth to reflect the
21919 // width of the loaded value.
21920 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
21921 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
21922
21923 // Otherwise, we need to visit the operands of the instruction. We only
21924 // handle the interesting cases from buildTree here. If an operand is an
21925 // instruction we haven't yet visited and from the same basic block as the
21926 // user or the use is a PHI node, we add it to the worklist.
21927 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
21928 BinaryOperator, UnaryOperator>(I)) {
21929 for (Use &U : I->operands()) {
21930 if (auto *J = dyn_cast<Instruction>(U.get()))
21931 if (Visited.insert(J).second &&
21932 (isa<PHINode>(I) || J->getParent() == Parent)) {
21933 Worklist.emplace_back(J, J->getParent(), Level + 1);
21934 continue;
21935 }
21936 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
21937 FirstNonBool = U.get();
21938 }
21939 } else {
21940 break;
21941 }
21942 }
21943
21944 // If we didn't encounter a memory access in the expression tree, or if we
21945 // gave up for some reason, just return the width of V. Otherwise, return the
21946 // maximum width we found.
21947 if (!Width) {
21948 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
21949 V = FirstNonBool;
21950 Width = DL->getTypeSizeInBits(V->getType());
21951 }
21952
21953 for (Instruction *I : Visited)
21954 InstrElementSize[I] = Width;
21955
21956 return Width;
21957}
21958
21959bool BoUpSLP::collectValuesToDemote(
21960 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
21961 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
21962 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
21963 bool &IsProfitableToDemote, bool IsTruncRoot) const {
21964 // We can always demote constants.
21965 if (all_of(E.Scalars, IsaPred<Constant>))
21966 return true;
21967
21968 unsigned OrigBitWidth =
21969 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
21970 if (OrigBitWidth == BitWidth) {
21971 MaxDepthLevel = 1;
21972 return true;
21973 }
21974
21975 // Check if the node was analyzed already and must keep its original bitwidth.
21976 if (NodesToKeepBWs.contains(E.Idx))
21977 return false;
21978
21979 // If the value is not a vectorized instruction in the expression and not used
21980 // by the insertelement instruction and not used in multiple vector nodes, it
21981 // cannot be demoted.
21982 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
21983 if (isa<PoisonValue>(R))
21984 return false;
21985 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21986 });
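// A hedged worked example for the check below: with OrigBitWidth = 32 and
// ComputeNumSignBits(V) = 25, BitWidth1 = 32 - 25 = 7 (8 for a signed node);
// after clamping against the demanded bits, demotion is accepted only when
// OrigBitWidth >= 2 * BitWidth, e.g. truncating i32 down to i16 or i8.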
21987 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
21988 if (isa<PoisonValue>(V))
21989 return true;
21990 if (getTreeEntries(V).size() > 1)
21991 return false;
21992 // For the last shuffle of sext/zext with many uses, we need to check the
21993 // extra bit for unsigned values, otherwise we may have incorrect casting
21994 // for reused scalars.
21995 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
21996 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
21997 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21998 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
21999 return true;
22000 }
22001 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22002 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
22003 if (IsSignedNode)
22004 ++BitWidth1;
22005 if (auto *I = dyn_cast<Instruction>(V)) {
22006 APInt Mask = DB->getDemandedBits(I);
22007 unsigned BitWidth2 =
22008 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
22009 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
22010 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
22011 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
22012 break;
22013 BitWidth2 *= 2;
22014 }
22015 BitWidth1 = std::min(BitWidth1, BitWidth2);
22016 }
22017 BitWidth = std::max(BitWidth, BitWidth1);
22018 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
22019 };
22020 auto FinalAnalysis = [&, TTI = TTI]() {
22021 if (!IsProfitableToDemote)
22022 return false;
22023 bool Res = all_of(
22024 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
22025 // Demote gathers.
22026 if (Res && E.isGather()) {
22027 if (E.hasState()) {
22028 if (const TreeEntry *SameTE =
22029 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
22030 SameTE)
22031 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
22032 ToDemote, Visited, NodesToKeepBWs,
22033 MaxDepthLevel, IsProfitableToDemote,
22034 IsTruncRoot)) {
22035 ToDemote.push_back(E.Idx);
22036 return true;
22037 }
22038 }
22039 // Check possible extractelement instructions bases and final vector
22040 // length.
22041 SmallPtrSet<Value *, 4> UniqueBases;
22042 for (Value *V : E.Scalars) {
22043 auto *EE = dyn_cast<ExtractElementInst>(V);
22044 if (!EE)
22045 continue;
22046 UniqueBases.insert(EE->getVectorOperand());
22047 }
22048 const unsigned VF = E.Scalars.size();
22049 Type *OrigScalarTy = E.Scalars.front()->getType();
22050 if (UniqueBases.size() <= 2 ||
22051 ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) >=
22052 ::getNumberOfParts(
22053 *TTI,
22054 getWidenedType(
22055 IntegerType::get(OrigScalarTy->getContext(), BitWidth),
22056 VF))) {
22057 ToDemote.push_back(E.Idx);
22058 return true;
22059 }
22060 }
22061 return Res;
22062 };
22063 if (E.isGather() || !Visited.insert(&E).second ||
22064 any_of(E.Scalars, [&](Value *V) {
22065 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
22066 return isa<InsertElementInst>(U) && !isVectorized(U);
22067 });
22068 }))
22069 return FinalAnalysis();
22070
22071 if (any_of(E.Scalars, [&](Value *V) {
22072 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
22073 return isVectorized(U) ||
22074 (E.Idx == 0 && UserIgnoreList &&
22075 UserIgnoreList->contains(U)) ||
22076 (!isa<CmpInst>(U) && U->getType()->isSized() &&
22077 !U->getType()->isScalableTy() &&
22078 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
22079 }) && !IsPotentiallyTruncated(V, BitWidth);
22080 }))
22081 return false;
22082
22083 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
22084 bool &NeedToExit) {
22085 NeedToExit = false;
22086 unsigned InitLevel = MaxDepthLevel;
22087 for (const TreeEntry *Op : Operands) {
22088 unsigned Level = InitLevel;
22089 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
22090 ToDemote, Visited, NodesToKeepBWs, Level,
22091 IsProfitableToDemote, IsTruncRoot)) {
22092 if (!IsProfitableToDemote)
22093 return false;
22094 NeedToExit = true;
22095 if (!FinalAnalysis())
22096 return false;
22097 continue;
22098 }
22099 MaxDepthLevel = std::max(MaxDepthLevel, Level);
22100 }
22101 return true;
22102 };
22103 auto AttemptCheckBitwidth =
22104 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
22105 // Try all bitwidth < OrigBitWidth.
22106 NeedToExit = false;
22107 unsigned BestFailBitwidth = 0;
22108 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
22109 if (Checker(BitWidth, OrigBitWidth))
22110 return true;
22111 if (BestFailBitwidth == 0 && FinalAnalysis())
22112 BestFailBitwidth = BitWidth;
22113 }
22114 if (BitWidth >= OrigBitWidth) {
22115 if (BestFailBitwidth == 0) {
22116 BitWidth = OrigBitWidth;
22117 return false;
22118 }
22119 MaxDepthLevel = 1;
22120 BitWidth = BestFailBitwidth;
22121 NeedToExit = true;
22122 return true;
22123 }
22124 return false;
22125 };
22126 auto TryProcessInstruction =
22127 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
22128 function_ref<bool(unsigned, unsigned)> Checker = {}) {
22129 if (Operands.empty()) {
22130 if (!IsTruncRoot)
22131 MaxDepthLevel = 1;
22132 for (Value *V : E.Scalars)
22133 (void)IsPotentiallyTruncated(V, BitWidth);
22134 } else {
22135 // Several vectorized uses? Check if we can truncate it, otherwise -
22136 // exit.
22137 if (any_of(E.Scalars, [&](Value *V) {
22138 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
22139 }))
22140 return false;
22141 bool NeedToExit = false;
22142 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
22143 return false;
22144 if (NeedToExit)
22145 return true;
22146 if (!ProcessOperands(Operands, NeedToExit))
22147 return false;
22148 if (NeedToExit)
22149 return true;
22150 }
22151
22152 ++MaxDepthLevel;
22153 // Record the entry that we can demote.
22154 ToDemote.push_back(E.Idx);
22155 return IsProfitableToDemote;
22156 };
22157
22158 if (E.State == TreeEntry::SplitVectorize)
22159 return TryProcessInstruction(
22160 BitWidth,
22161 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
22162 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
22163
22164 if (E.isAltShuffle()) {
22165 // Combining these opcodes may lead to incorrect analysis, skip for now.
22166 auto IsDangerousOpcode = [](unsigned Opcode) {
22167 switch (Opcode) {
22168 case Instruction::Shl:
22169 case Instruction::AShr:
22170 case Instruction::LShr:
22171 case Instruction::UDiv:
22172 case Instruction::SDiv:
22173 case Instruction::URem:
22174 case Instruction::SRem:
22175 return true;
22176 default:
22177 break;
22178 }
22179 return false;
22180 };
22181 if (IsDangerousOpcode(E.getAltOpcode()))
22182 return FinalAnalysis();
22183 }
22184
22185 switch (E.getOpcode()) {
22186
22187 // We can always demote truncations and extensions. Since truncations can
22188 // seed additional demotion, we save the truncated value.
22189 case Instruction::Trunc:
22190 if (IsProfitableToDemoteRoot)
22191 IsProfitableToDemote = true;
22192 return TryProcessInstruction(BitWidth);
22193 case Instruction::ZExt:
22194 case Instruction::SExt:
22195 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
22196 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
22197 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
22198 return false;
22199 IsProfitableToDemote = true;
22200 return TryProcessInstruction(BitWidth);
22201
22202 // We can demote certain binary operations if we can demote both of their
22203 // operands.
22204 case Instruction::Add:
22205 case Instruction::Sub:
22206 case Instruction::Mul:
22207 case Instruction::And:
22208 case Instruction::Or:
22209 case Instruction::Xor: {
22210 return TryProcessInstruction(
22211 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
22212 }
22213 case Instruction::Freeze:
22214 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
22215 case Instruction::Shl: {
22216 // If we are truncating the result of this SHL, and if it is a shift by an
22217 // in-range amount, we can always perform the SHL in a smaller type.
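// E.g. (illustrative): when demoting i32 -> i16, "shl i32 %x, 3" can be
// rewritten as a 16-bit shl because the shift amount 3 is known to be less
// than 16; an amount that might be >= 16 would block the demotion.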
22218 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
22219 return all_of(E.Scalars, [&](Value *V) {
22220 if (isa<PoisonValue>(V))
22221 return true;
22222 if (E.isCopyableElement(V))
22223 return true;
22224 auto *I = cast<Instruction>(V);
22225 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22226 return AmtKnownBits.getMaxValue().ult(BitWidth);
22227 });
22228 };
22229 return TryProcessInstruction(
22230 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
22231 }
22232 case Instruction::LShr: {
22233 // If this is a truncate of a logical shr, we can truncate it to a smaller
22234 // lshr iff we know that the bits we would otherwise be shifting in are
22235 // already zeros.
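// E.g. (illustrative): for i32 -> i16, "lshr i32 %x, 4" can be demoted only if
// bits 16..31 of %x are known zero (so nothing non-zero is shifted into the
// kept low bits) and the shift amount is known to be less than 16.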
22236 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22237 return all_of(E.Scalars, [&](Value *V) {
22238 if (isa<PoisonValue>(V))
22239 return true;
22240 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22241 if (E.isCopyableElement(V))
22242 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
22243 auto *I = cast<Instruction>(V);
22244 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22245 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22246 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
22247 SimplifyQuery(*DL));
22248 });
22249 };
22250 return TryProcessInstruction(
22251 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22252 LShrChecker);
22253 }
22254 case Instruction::AShr: {
22255 // If this is a truncate of an arithmetic shr, we can truncate it to a
22256 // smaller ashr iff we know that all the bits between the sign bit of the
22257 // original type and the sign bit of the truncated type are the same.
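// E.g. (illustrative): for i32 -> i16, "ashr i32 %x, 5" can be demoted only if
// %x has more than 16 sign bits (bits 15..31 are all copies of the sign bit)
// and the shift amount is known to be less than 16.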
22258 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22259 return all_of(E.Scalars, [&](Value *V) {
22260 if (isa<PoisonValue>(V))
22261 return true;
22262 auto *I = cast<Instruction>(V);
22263 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
22264 unsigned ShiftedBits = OrigBitWidth - BitWidth;
22265 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
22266 ShiftedBits <
22267 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22268 });
22269 };
22270 return TryProcessInstruction(
22271 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
22272 AShrChecker);
22273 }
22274 case Instruction::UDiv:
22275 case Instruction::URem: {
22276 // UDiv and URem can be truncated if all the truncated bits are zero.
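// E.g. (illustrative): "udiv i32 %a, %b" can be performed as a 16-bit udiv
// when the upper 16 bits of both %a and %b are known to be zero.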
22277 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22278 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22279 return all_of(E.Scalars, [&](Value *V) {
22280 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22281 if (E.hasCopyableElements() && E.isCopyableElement(V))
22282 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22283 auto *I = cast<Instruction>(V);
22284 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
22285 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22286 });
22287 };
22288 return TryProcessInstruction(
22289 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
22290 }
22291
22292 // We can demote selects if we can demote their true and false values.
22293 case Instruction::Select: {
22294 return TryProcessInstruction(
22295 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
22296 }
22297
22298 // We can demote phis if we can demote all their incoming operands.
22299 case Instruction::PHI: {
22300 const unsigned NumOps = E.getNumOperands();
22301 SmallVector<const TreeEntry *> Ops(NumOps);
22302 transform(seq<unsigned>(0, NumOps), Ops.begin(),
22303 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
22304
22305 return TryProcessInstruction(BitWidth, Ops);
22306 }
22307
22308 case Instruction::Call: {
22309 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
22310 if (!IC)
22311 break;
22312 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
22313 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
22314 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
22315 break;
22316 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
22317 function_ref<bool(unsigned, unsigned)> CallChecker;
22318 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22319 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22320 return all_of(E.Scalars, [&](Value *V) {
22321 auto *I = cast<Instruction>(V);
22322 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
22323 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
22324 return MaskedValueIsZero(I->getOperand(0), Mask,
22325 SimplifyQuery(*DL)) &&
22326 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
22327 }
22328 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
22329 "Expected min/max intrinsics only.");
22330 unsigned SignBits = OrigBitWidth - BitWidth;
22331 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22332 unsigned Op0SignBits =
22333 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22334 unsigned Op1SignBits =
22335 ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
22336 return SignBits <= Op0SignBits &&
22337 ((SignBits != Op0SignBits &&
22338 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22339 MaskedValueIsZero(I->getOperand(0), Mask,
22340 SimplifyQuery(*DL))) &&
22341 SignBits <= Op1SignBits &&
22342 ((SignBits != Op1SignBits &&
22343 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
22344 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
22345 });
22346 };
22347 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
22348 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
22349 return all_of(E.Scalars, [&](Value *V) {
22350 auto *I = cast<Instruction>(V);
22351 unsigned SignBits = OrigBitWidth - BitWidth;
22352 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
22353 unsigned Op0SignBits =
22354 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
22355 return SignBits <= Op0SignBits &&
22356 ((SignBits != Op0SignBits &&
22357 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
22358 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
22359 });
22360 };
22361 if (ID != Intrinsic::abs) {
22362 Operands.push_back(getOperandEntry(&E, 1));
22363 CallChecker = CompChecker;
22364 } else {
22365 CallChecker = AbsChecker;
22366 }
22367 InstructionCost BestCost =
22368 std::numeric_limits<InstructionCost::CostType>::max();
22369 unsigned BestBitWidth = BitWidth;
22370 unsigned VF = E.Scalars.size();
22371 // Choose the best bitwidth based on cost estimations.
22372 auto Checker = [&](unsigned BitWidth, unsigned) {
22373 unsigned MinBW = PowerOf2Ceil(BitWidth);
22374 SmallVector<Type *> ArgTys =
22375 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
22376 auto VecCallCosts = getVectorCallCosts(
22377 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
22378 TTI, TLI, ArgTys);
22379 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
22380 if (Cost < BestCost) {
22381 BestCost = Cost;
22382 BestBitWidth = BitWidth;
22383 }
22384 return false;
22385 };
22386 [[maybe_unused]] bool NeedToExit;
22387 (void)AttemptCheckBitwidth(Checker, NeedToExit);
22388 BitWidth = BestBitWidth;
22389 return TryProcessInstruction(BitWidth, Operands, CallChecker);
22390 }
22391
22392 // Otherwise, conservatively give up.
22393 default:
22394 break;
22395 }
22396 MaxDepthLevel = 1;
22397 return FinalAnalysis();
22398}
22399
22400static RecurKind getRdxKind(Value *V);
22401
22402void BoUpSLP::computeMinimumValueSizes() {
22403 // We only attempt to truncate integer expressions.
22404 bool IsStoreOrInsertElt =
22405 VectorizableTree.front()->hasState() &&
22406 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
22407 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
22408 if ((IsStoreOrInsertElt || UserIgnoreList) &&
22409 ExtraBitWidthNodes.size() <= 1 &&
22410 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
22411 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
22412 return;
22413
22414 unsigned NodeIdx = 0;
22415 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
22416 NodeIdx = 1;
22417
22418 // Ensure the roots of the vectorizable tree don't form a cycle.
22419 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
22420 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
22421 "Unexpected tree is graph.");
22422
22423 // If the first value node for the store/insertelement is a sext/zext/trunc,
22424 // skip it and resize to the final type.
22425 bool IsTruncRoot = false;
22426 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
22427 SmallVector<unsigned> RootDemotes;
22428 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
22429 if (NodeIdx != 0 &&
22430 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22431 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22432 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
22433 IsTruncRoot = true;
22434 RootDemotes.push_back(NodeIdx);
22435 IsProfitableToDemoteRoot = true;
22436 ++NodeIdx;
22437 }
22438
22439 // The reduction was already analyzed and found not profitable - exit.
22440 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
22441 return;
22442
22443 SmallVector<unsigned> ToDemote;
22444 auto ComputeMaxBitWidth =
22445 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
22446 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
22447 ToDemote.clear();
22448 // If the root is a trunc and the next node is a gather/buildvector, keep the
22449 // trunc in scalars, which is free in most cases.
22450 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
22451 !NodesToKeepBWs.contains(E.Idx) &&
22452 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
22453 all_of(E.Scalars, [&](Value *V) {
22454 return V->hasOneUse() || isa<Constant>(V) ||
22455 (!V->hasNUsesOrMore(UsesLimit) &&
22456 none_of(V->users(), [&](User *U) {
22457 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
22458 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22459 if (TEs.empty() || is_contained(TEs, UserTE))
22460 return false;
22461 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22462 SelectInst>(U) ||
22463 isa<SIToFPInst, UIToFPInst>(U) ||
22464 (UserTE->hasState() &&
22465 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
22466 SelectInst>(UserTE->getMainOp()) ||
22467 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
22468 return true;
22469 unsigned UserTESz = DL->getTypeSizeInBits(
22470 UserTE->Scalars.front()->getType());
22471 if (all_of(TEs, [&](const TreeEntry *TE) {
22472 auto It = MinBWs.find(TE);
22473 return It != MinBWs.end() &&
22474 It->second.first > UserTESz;
22475 }))
22476 return true;
22477 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
22478 }));
22479 })) {
22480 ToDemote.push_back(E.Idx);
22481 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
22482 auto It = MinBWs.find(UserTE);
22483 if (It != MinBWs.end())
22484 return It->second.first;
22485 unsigned MaxBitWidth =
22486 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
22487 MaxBitWidth = bit_ceil(MaxBitWidth);
22488 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22489 MaxBitWidth = 8;
22490 return MaxBitWidth;
22491 }
22492
22493 if (!E.hasState())
22494 return 0u;
22495
22496 unsigned VF = E.getVectorFactor();
22497 Type *ScalarTy = E.Scalars.front()->getType();
22498 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
22499 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
22500 if (!TreeRootIT)
22501 return 0u;
22502
22503 if (any_of(E.Scalars,
22504 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
22505 return 0u;
22506
22507 unsigned NumParts = ::getNumberOfParts(
22508 *TTI, getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
22509
22510 // The maximum bit width required to represent all the values that can be
22511 // demoted without loss of precision. It would be safe to truncate the roots
22512 // of the expression to this width.
22513 unsigned MaxBitWidth = 1u;
22514
22515 // True if the roots can be zero-extended back to their original type,
22516 // rather than sign-extended. We know that if the leading bits are not
22517 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
22518 // True.
22519 // Determine if the sign bit of all the roots is known to be zero. If not,
22520 // IsKnownPositive is set to False.
22521 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
22522 if (isa<PoisonValue>(R))
22523 return true;
22524 KnownBits Known = computeKnownBits(R, *DL);
22525 return Known.isNonNegative();
22526 });
22527
22528 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
22529 E.UserTreeIndex.UserTE->hasState() &&
22530 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
22531 MaxBitWidth =
22532 std::min(DL->getTypeSizeInBits(
22533 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
22534 DL->getTypeSizeInBits(ScalarTy));
22535
22536 // We first check if all the bits of the roots are demanded. If they're not,
22537 // we can truncate the roots to this narrower type.
22538 for (Value *Root : E.Scalars) {
22539 if (isa<PoisonValue>(Root))
22540 continue;
22541 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
22542 TypeSize NumTypeBits =
22543 DL->getTypeSizeInBits(Root->getType()->getScalarType());
22544 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22545 // If we can't prove that the sign bit is zero, we must add one to the
22546 // maximum bit width to account for the unknown sign bit. This preserves
22547 // the existing sign bit so we can safely sign-extend the root back to the
22548 // original type. Otherwise, if we know the sign bit is zero, we will
22549 // zero-extend the root instead.
22550 //
22551 // FIXME: This is somewhat suboptimal, as there will be cases where adding
22552 // one to the maximum bit width will yield a larger-than-necessary
22553 // type. In general, we need to add an extra bit only if we can't
22554 // prove that the upper bit of the original type is equal to the
22555 // upper bit of the proposed smaller type. If these two bits are
22556 // the same (either zero or one) we know that sign-extending from
22557 // the smaller type will result in the same value. Here, since we
22558 // can't yet prove this, we are just making the proposed smaller
22559 // type larger to ensure correctness.
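// E.g. (illustrative): a root of type i32 with 24 known sign bits needs
// BitWidth1 = 32 - 24 = 8 bits; if we cannot prove it is non-negative, this is
// bumped to 9 bits so that sign-extension reproduces the original value.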
22560 if (!IsKnownPositive)
22561 ++BitWidth1;
22562
22563 auto *I = dyn_cast<Instruction>(Root);
22564 if (!I) {
22565 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
22566 continue;
22567 }
22568 APInt Mask = DB->getDemandedBits(I);
22569 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22570 MaxBitWidth =
22571 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
22572 }
22573
22574 if (MaxBitWidth < 8 && MaxBitWidth > 1)
22575 MaxBitWidth = 8;
22576
22577 // If the original type is large but the reduced type does not improve
22578 // register usage - ignore it.
22579 if (NumParts > 1 &&
22580 NumParts ==
22581 ::getNumberOfParts(
22582 *TTI, getWidenedType(IntegerType::get(F->getContext(),
22583 bit_ceil(MaxBitWidth)),
22584 VF)))
22585 return 0u;
22586
22587 unsigned Opcode = E.getOpcode();
22588 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
22589 Opcode == Instruction::SExt ||
22590 Opcode == Instruction::ZExt || NumParts > 1;
22591 // Conservatively determine if we can actually truncate the roots of the
22592 // expression. Collect the values that can be demoted in ToDemote and
22593 // additional roots that require investigating in Roots.
22594 DenseSet<const TreeEntry *> Visited;
22595 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
22596 bool NeedToDemote = IsProfitableToDemote;
22597
22598 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
22599 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
22600 NeedToDemote, IsTruncRoot) ||
22601 (MaxDepthLevel <= Limit &&
22602 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
22603 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
22604 DL->getTypeSizeInBits(TreeRootIT) /
22605 DL->getTypeSizeInBits(
22606 E.getMainOp()->getOperand(0)->getType()) >
22607 2)))))
22608 return 0u;
22609 // Round MaxBitWidth up to the next power-of-two.
22610 MaxBitWidth = bit_ceil(MaxBitWidth);
22611
22612 return MaxBitWidth;
22613 };
22614
22615 // If we can truncate the root, we must collect additional values that might
22616 // be demoted as a result. That is, those seeded by truncations we will
22617 // modify.
22618 // Add reduction ops sizes, if any.
22619 if (UserIgnoreList &&
22620 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
22621 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
22622 // x i1> to iN)).
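// E.g. (illustrative):
//   %e = zext <8 x i1> %m to <8 x i32>
//   %s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
// can be rewritten as
//   %b = bitcast <8 x i1> %m to i8
//   %c = call i8 @llvm.ctpop.i8(i8 %b)
//   %s = zext i8 %c to i32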
22623 if (all_of(*UserIgnoreList,
22624 [](Value *V) {
22625 return isa<PoisonValue>(V) ||
22626 cast<Instruction>(V)->getOpcode() == Instruction::Add;
22627 }) &&
22628 VectorizableTree.front()->State == TreeEntry::Vectorize &&
22629 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
22630 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
22631 Builder.getInt1Ty()) {
22632 ReductionBitWidth = 1;
22633 } else {
22634 for (Value *V : *UserIgnoreList) {
22635 if (isa<PoisonValue>(V))
22636 continue;
22637 unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
22638 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
22639 unsigned BitWidth1 = NumTypeBits - NumSignBits;
22640 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
22641 ++BitWidth1;
22642 unsigned BitWidth2 = BitWidth1;
22643 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
22644 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
22645 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
22646 }
22647 ReductionBitWidth =
22648 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
22649 }
22650 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
22651 ReductionBitWidth = 8;
22652
22653 ReductionBitWidth = bit_ceil(ReductionBitWidth);
22654 }
22655 }
22656 bool IsTopRoot = NodeIdx == 0;
22657 while (NodeIdx < VectorizableTree.size() &&
22658 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
22659 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
22660 RootDemotes.push_back(NodeIdx);
22661 ++NodeIdx;
22662 IsTruncRoot = true;
22663 }
22664 bool IsSignedCmp = false;
22665 if (UserIgnoreList &&
22666 all_of(*UserIgnoreList,
22668 m_SMax(m_Value(), m_Value())))))
22669 IsSignedCmp = true;
22670 while (NodeIdx < VectorizableTree.size()) {
22671 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
22672 unsigned Limit = 2;
22673 if (IsTopRoot &&
22674 ReductionBitWidth ==
22675 DL->getTypeSizeInBits(
22676 VectorizableTree.front()->Scalars.front()->getType()))
22677 Limit = 3;
22678 unsigned MaxBitWidth = ComputeMaxBitWidth(
22679 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
22680 IsTruncRoot, IsSignedCmp);
22681 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
22682 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
22683 ReductionBitWidth = bit_ceil(MaxBitWidth);
22684 else if (MaxBitWidth == 0)
22685 ReductionBitWidth = 0;
22686 }
22687
22688 for (unsigned Idx : RootDemotes) {
22689 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
22690 uint32_t OrigBitWidth =
22691 DL->getTypeSizeInBits(V->getType()->getScalarType());
22692 if (OrigBitWidth > MaxBitWidth) {
22693 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
22694 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
22695 }
22696 return false;
22697 }))
22698 ToDemote.push_back(Idx);
22699 }
22700 RootDemotes.clear();
22701 IsTopRoot = false;
22702 IsProfitableToDemoteRoot = true;
22703
22704 if (ExtraBitWidthNodes.empty()) {
22705 NodeIdx = VectorizableTree.size();
22706 } else {
22707 unsigned NewIdx = 0;
22708 do {
22709 NewIdx = *ExtraBitWidthNodes.begin();
22710 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
22711 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
22712 NodeIdx = NewIdx;
22713 IsTruncRoot =
22714 NodeIdx < VectorizableTree.size() &&
22715 VectorizableTree[NodeIdx]->UserTreeIndex &&
22716 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
22717 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22718 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22719 Instruction::Trunc &&
22720 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
22721 IsSignedCmp =
22722 NodeIdx < VectorizableTree.size() &&
22723 VectorizableTree[NodeIdx]->UserTreeIndex &&
22724 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
22725 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
22726 Instruction::ICmp &&
22727 any_of(
22728 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
22729 [&](Value *V) {
22730 auto *IC = dyn_cast<ICmpInst>(V);
22731 return IC && (IC->isSigned() ||
22732 !isKnownNonNegative(IC->getOperand(0),
22733 SimplifyQuery(*DL)) ||
22734 !isKnownNonNegative(IC->getOperand(1),
22735 SimplifyQuery(*DL)));
22736 });
22737 }
22738
22739 // If the maximum bit width we compute is less than the width of the roots'
22740 // type, we can proceed with the narrowing. Otherwise, do nothing.
22741 if (MaxBitWidth == 0 ||
22742 MaxBitWidth >=
22743 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
22744 ->getBitWidth()) {
22745 if (UserIgnoreList)
22746 AnalyzedMinBWVals.insert_range(TreeRoot);
22747 NodesToKeepBWs.insert_range(ToDemote);
22748 continue;
22749 }
22750
22751 // Finally, map the values we can demote to the maximum bit width we
22752 // computed.
22753 for (unsigned Idx : ToDemote) {
22754 TreeEntry *TE = VectorizableTree[Idx].get();
22755 if (MinBWs.contains(TE))
22756 continue;
22757 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
22758 if (isa<PoisonValue>(R))
22759 return false;
22760 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22761 });
22762 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
22763 }
22764 }
22765}
22766
22767PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
22768 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
22769 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
22770 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
22771 auto *AA = &AM.getResult<AAManager>(F);
22772 auto *LI = &AM.getResult<LoopAnalysis>(F);
22773 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
22774 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
22775 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
22776 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
22777
22778 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
22779 if (!Changed)
22780 return PreservedAnalyses::all();
22781
22782 PreservedAnalyses PA;
22783 PA.preserveSet<CFGAnalyses>();
22784 return PA;
22785}
22786
22787bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
22788 TargetTransformInfo *TTI_,
22789 TargetLibraryInfo *TLI_, AAResults *AA_,
22790 LoopInfo *LI_, DominatorTree *DT_,
22791 AssumptionCache *AC_, DemandedBits *DB_,
22792 OptimizationRemarkEmitter *ORE_) {
22793 if (!RunSLPVectorization)
22794 return false;
22795 SE = SE_;
22796 TTI = TTI_;
22797 TLI = TLI_;
22798 AA = AA_;
22799 LI = LI_;
22800 DT = DT_;
22801 AC = AC_;
22802 DB = DB_;
22803 DL = &F.getDataLayout();
22804
22805 Stores.clear();
22806 GEPs.clear();
22807 bool Changed = false;
22808
22809 // If the target claims to have no vector registers don't attempt
22810 // vectorization.
22811 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
22812 LLVM_DEBUG(
22813 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
22814 return false;
22815 }
22816
22817 // Don't vectorize when the attribute NoImplicitFloat is used.
22818 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
22819 return false;
22820
22821 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
22822
22823 // Use the bottom up slp vectorizer to construct chains that start with
22824 // store instructions.
22825 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
22826
22827 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
22828 // delete instructions.
22829
22830 // Update DFS numbers now so that we can use them for ordering.
22831 DT->updateDFSNumbers();
22832
22833 // Scan the blocks in the function in post order.
22834 for (auto *BB : post_order(&F.getEntryBlock())) {
22836 continue;
22837
22838 // Start new block - clear the list of reduction roots.
22839 R.clearReductionData();
22840 collectSeedInstructions(BB);
22841
22842 // Vectorize trees that end at stores.
22843 if (!Stores.empty()) {
22844 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
22845 << " underlying objects.\n");
22846 Changed |= vectorizeStoreChains(R);
22847 }
22848
22849 // Vectorize trees that end at reductions.
22850 Changed |= vectorizeChainsInBlock(BB, R);
22851
22852 // Vectorize the index computations of getelementptr instructions. This
22853 // is primarily intended to catch gather-like idioms ending at
22854 // non-consecutive loads.
22855 if (!GEPs.empty()) {
22856 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
22857 << " underlying objects.\n");
22858 Changed |= vectorizeGEPIndices(BB, R);
22859 }
22860 }
22861
22862 if (Changed) {
22863 R.optimizeGatherSequence();
22864 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
22865 }
22866 return Changed;
22867}
22868
22869std::optional<bool>
22870SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
22871 unsigned Idx, unsigned MinVF,
22872 unsigned &Size) {
22873 Size = 0;
22874 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
22875 << "\n");
22876 const unsigned Sz = R.getVectorElementSize(Chain[0]);
22877 unsigned VF = Chain.size();
22878
22879 if (!has_single_bit(Sz) ||
22880 !hasFullVectorsOrPowerOf2(
22881 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
22882 VF) ||
22883 VF < 2 || VF < MinVF) {
22884 // Check if vectorizing with a non-power-of-2 VF should be considered. At
22885 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
22886 // all vector lanes are used.
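// E.g. (illustrative): VF values such as 3, 7 or 15 qualify, since only a
// single vector lane would be left unused.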
22887 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
22888 return false;
22889 }
22890
22891 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
22892 << "\n");
22893
22894 SetVector<Value *> ValOps;
22895 for (Value *V : Chain)
22896 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
22897 // Exit if the operands do not share the same/alternate opcode or form a non-power-of-2 set of unique values.
22898 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
22899 InstructionsState S = Analysis.buildInstructionsState(
22900 ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
22901 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
22902 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
22903 bool IsAllowedSize =
22904 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
22905 ValOps.size()) ||
22906 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
22907 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
22908 (!S.getMainOp()->isSafeToRemove() ||
22909 any_of(ValOps.getArrayRef(),
22910 [&](Value *V) {
22911 return !isa<ExtractElementInst>(V) &&
22912 (V->getNumUses() > Chain.size() ||
22913 any_of(V->users(), [&](User *U) {
22914 return !Stores.contains(U);
22915 }));
22916 }))) ||
22917 (ValOps.size() > Chain.size() / 2 && !S)) {
22918 Size = (!IsAllowedSize && S) ? 1 : 2;
22919 return false;
22920 }
22921 }
22922 if (R.isLoadCombineCandidate(Chain))
22923 return true;
22924 R.buildTree(Chain);
22925 // Check if the tree is tiny and the store itself or its value is not vectorized.
22926 if (R.isTreeTinyAndNotFullyVectorizable()) {
22927 if (R.isGathered(Chain.front()) ||
22928 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
22929 return std::nullopt;
22930 Size = R.getCanonicalGraphSize();
22931 return false;
22932 }
22933 if (R.isProfitableToReorder()) {
22934 R.reorderTopToBottom();
22935 R.reorderBottomToTop();
22936 }
22937 R.transformNodes();
22938 R.buildExternalUses();
22939
22940 R.computeMinimumValueSizes();
22941
22942 Size = R.getCanonicalGraphSize();
22943 if (S && S.getOpcode() == Instruction::Load)
22944 Size = 2; // cut off masked gather small trees
22945 InstructionCost Cost = R.getTreeCost();
22946
22947 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
22948 if (Cost < -SLPCostThreshold) {
22949 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
22950
22951 using namespace ore;
22952
22953 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
22954 cast<StoreInst>(Chain[0]))
22955 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
22956 << " and with tree size "
22957 << NV("TreeSize", R.getTreeSize()));
22958
22959 R.vectorizeTree();
22960 return true;
22961 }
22962
22963 return false;
22964}
22965
22966 /// Checks that the tree sizes are roughly uniform: the variance of the sizes must be small relative to the squared mean (Dev * 96 < Mean * Mean).
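/// E.g. (illustrative): for sizes {10, 10, 10, 11} the mean is 10 and the
/// integer variance rounds down to 0, so the sizes count as uniform; for
/// {4, 4, 4, 8} the variance is 3 and 3 * 96 >= 5 * 5, so they do not.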
22967static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
22968 bool First) {
22969 unsigned Num = 0;
22970 uint64_t Sum = std::accumulate(
22971 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22972 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22973 unsigned Size = First ? Val.first : Val.second;
22974 if (Size == 1)
22975 return V;
22976 ++Num;
22977 return V + Size;
22978 });
22979 if (Num == 0)
22980 return true;
22981 uint64_t Mean = Sum / Num;
22982 if (Mean == 0)
22983 return true;
22984 uint64_t Dev = std::accumulate(
22985 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
22986 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
22987 unsigned P = First ? Val.first : Val.second;
22988 if (P == 1)
22989 return V;
22990 return V + (P - Mean) * (P - Mean);
22991 }) /
22992 Num;
22993 return Dev * 96 / (Mean * Mean) == 0;
22994}
22995
22996namespace {
22997
22998/// A group of stores that we'll try to bundle together using vector ops.
22999/// They are ordered using the signed distance of their address operand to the
23000/// address of this group's BaseInstr.
23001class RelatedStoreInsts {
23002public:
23003 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
23004 : AllStores(AllStores) {
23005 reset(BaseInstrIdx);
23006 }
23007
23008 void reset(unsigned NewBaseInstr) {
23009 assert(NewBaseInstr < AllStores.size() &&
23010 "Instruction index out of bounds");
23011 BaseInstrIdx = NewBaseInstr;
23012 Instrs.clear();
23013 insertOrLookup(NewBaseInstr, 0);
23014 }
23015
23016 /// Tries to insert \p InstrIdx as the store with a pointer distance of
23017 /// \p PtrDist.
23018 /// Does nothing if there is already a store with that \p PtrDist.
23019 /// \returns The previously associated Instruction index, or std::nullopt
23020 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
23021 auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
23022 return Inserted ? std::nullopt : std::make_optional(It->second);
23023 }
23024
23025 using DistToInstMap = std::map<int64_t, unsigned>;
23026 const DistToInstMap &getStores() const { return Instrs; }
23027
23028 /// If \p SI is related to this group of stores, return the distance of its
23029 /// pointer operand to that of the group's BaseInstr.
23030 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
23031 ScalarEvolution &SE) const {
23032 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
23033 return getPointersDiff(
23034 BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
23035 SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
23036 /*StrictCheck=*/true);
23037 }
23038
23039 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
23040 /// Stores whose index is less than \p MinSafeIdx will be dropped.
23041 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
23042 int64_t DistFromCurBase) {
23043 DistToInstMap PrevSet = std::move(Instrs);
23044 reset(NewBaseInstIdx);
23045
23046 // Re-insert stores that come after MinSafeIdx to try and vectorize them
23047 // again. Their distance will be "rebased" to use NewBaseInstIdx as
23048 // reference.
23049 for (auto [Dist, InstIdx] : PrevSet) {
23050 if (InstIdx >= MinSafeIdx)
23051 insertOrLookup(InstIdx, Dist - DistFromCurBase);
23052 }
23053 }
23054
23055 /// Remove all stores that have been vectorized from this group.
23056 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
23057 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
23058 reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
23059 return VectorizedStores.contains(AllStores[DistAndIdx.second]);
23060 });
23061
23062 // Get a forward iterator pointing after the last vectorized store and erase
23063 // all stores before it so we don't try to vectorize them again.
23064 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
23065 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
23066 }
23067
23068private:
23069 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
23070 unsigned BaseInstrIdx;
23071
23072 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
23073 DistToInstMap Instrs;
23074
23075 /// Reference to all the stores in the BB being analyzed.
23076 ArrayRef<StoreInst *> AllStores;
23077};
23078
23079} // end anonymous namespace
23080
23081bool SLPVectorizerPass::vectorizeStores(
23082 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
23083 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
23084 &Visited) {
23085 // We may run into multiple chains that merge into a single chain. We mark the
23086 // stores that we vectorized so that we don't visit the same store twice.
23087 BoUpSLP::ValueSet VectorizedStores;
23088 bool Changed = false;
23089
23090 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
23091 int64_t PrevDist = -1;
23092 BoUpSLP::ValueList Operands;
23093 // Collect the chain into a list.
23094 for (auto [Idx, Data] : enumerate(StoreSeq)) {
23095 auto &[Dist, InstIdx] = Data;
23096 if (Operands.empty() || Dist - PrevDist == 1) {
23097 Operands.push_back(Stores[InstIdx]);
23098 PrevDist = Dist;
23099 if (Idx != StoreSeq.size() - 1)
23100 continue;
23101 }
23102 auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
23103 Operands.clear();
23104 Operands.push_back(Stores[InstIdx]);
23105 PrevDist = Dist;
23106 });
23107
23108 if (Operands.size() <= 1 ||
23109 !Visited
23110 .insert({Operands.front(),
23111 cast<StoreInst>(Operands.front())->getValueOperand(),
23112 Operands.back(),
23113 cast<StoreInst>(Operands.back())->getValueOperand(),
23114 Operands.size()})
23115 .second)
23116 continue;
23117
23118 unsigned MaxVecRegSize = R.getMaxVecRegSize();
23119 unsigned EltSize = R.getVectorElementSize(Operands[0]);
23120 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
23121
23122 unsigned MaxVF =
23123 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
23124 auto *Store = cast<StoreInst>(Operands[0]);
23125 Type *StoreTy = Store->getValueOperand()->getType();
23126 Type *ValueTy = StoreTy;
23127 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
23128 ValueTy = Trunc->getSrcTy();
23129 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
23130 // getStoreMinimumVF only supports scalar types as arguments. As a result,
23131 // we need to use the element types of StoreTy and ValueTy to retrieve the
23132 // VF and then transform it back.
23133 // Remember: VF is defined as the number of instructions we want to
23134 // vectorize, not the number of elements in the final vector.
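// E.g. (illustrative): with <2 x i32> stores (REVEC) and a scalar minimum of 4
// elements reported by getStoreMinimumVF, MinVF becomes 4 / 2 = 2 store
// instructions.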
23135 Type *StoreScalarTy = StoreTy->getScalarType();
23136 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
23137 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
23138 ValueTy->getScalarType()));
23139 MinVF /= getNumElements(StoreTy);
23140 MinVF = std::max<unsigned>(2, MinVF);
23141
23142 if (MaxVF < MinVF) {
23143 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23144 << ") < "
23145 << "MinVF (" << MinVF << ")\n");
23146 continue;
23147 }
23148
23149 unsigned NonPowerOf2VF = 0;
23150 if (VectorizeNonPowerOf2) {
23151 // First try vectorizing with a non-power-of-2 VF. At the moment, only
23152 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
23153 // lanes are used.
23154 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
23155 if (has_single_bit(CandVF + 1)) {
23156 NonPowerOf2VF = CandVF;
23157 assert(NonPowerOf2VF != MaxVF &&
23158 "Non-power-of-2 VF should not be equal to MaxVF");
23159 }
23160 }
23161
23162 // MaxRegVF represents the number of instructions (scalar, or vector in
23163 // case of revec) that can be vectorized to naturally fit in a vector
23164 // register.
23165 unsigned MaxRegVF = MaxVF;
23166
23167 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
23168 if (MaxVF < MinVF) {
23169 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23170 << ") < "
23171 << "MinVF (" << MinVF << ")\n");
23172 continue;
23173 }
23174
23175 SmallVector<unsigned> CandidateVFs;
23176 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23177 VF = divideCeil(VF, 2))
23178 CandidateVFs.push_back(VF);
23179
23180 unsigned End = Operands.size();
23181 unsigned Repeat = 0;
23182 constexpr unsigned MaxAttempts = 4;
23183 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
23184 for (std::pair<unsigned, unsigned> &P : RangeSizes)
23185 P.first = P.second = 1;
23186 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23187 auto IsNotVectorized = [](bool First,
23188 const std::pair<unsigned, unsigned> &P) {
23189 return First ? P.first > 0 : P.second > 0;
23190 };
23191 auto IsVectorized = [](bool First,
23192 const std::pair<unsigned, unsigned> &P) {
23193 return First ? P.first == 0 : P.second == 0;
23194 };
23195 auto VFIsProfitable = [](bool First, unsigned Size,
23196 const std::pair<unsigned, unsigned> &P) {
23197 return First ? Size >= P.first : Size >= P.second;
23198 };
23199 auto FirstSizeSame = [](unsigned Size,
23200 const std::pair<unsigned, unsigned> &P) {
23201 return Size == P.first;
23202 };
23203 while (true) {
23204 ++Repeat;
23205 bool RepeatChanged = false;
23206 bool AnyProfitableGraph = false;
23207 for (unsigned VF : CandidateVFs) {
23208 AnyProfitableGraph = false;
23209 unsigned FirstUnvecStore =
23210 std::distance(RangeSizes.begin(),
23211 find_if(RangeSizes, std::bind(IsNotVectorized,
23212 VF >= MaxRegVF, _1)));
23213
23214 // Form slices of size VF starting from FirstUnvecStore and try to
23215 // vectorize them.
23216 while (FirstUnvecStore < End) {
23217 unsigned FirstVecStore = std::distance(
23218 RangeSizes.begin(),
23219 find_if(RangeSizes.drop_front(FirstUnvecStore),
23220 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23221 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23222 for (unsigned SliceStartIdx = FirstUnvecStore;
23223 SliceStartIdx + VF <= MaxSliceEnd;) {
23224 if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
23225 VF >= MaxRegVF)) {
23226 ++SliceStartIdx;
23227 continue;
23228 }
23229 ArrayRef<Value *> Slice =
23230 ArrayRef(Operands).slice(SliceStartIdx, VF);
23231 assert(all_of(Slice,
23232 [&](Value *V) {
23233 return cast<StoreInst>(V)
23234 ->getValueOperand()
23235 ->getType() ==
23236 cast<StoreInst>(Slice.front())
23237 ->getValueOperand()
23238 ->getType();
23239 }) &&
23240 "Expected all operands of same type.");
23241 if (!NonSchedulable.empty()) {
23242 auto [NonSchedSizeMax, NonSchedSizeMin] =
23243 NonSchedulable.lookup(Slice.front());
23244 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23245 // VF is too ambitious. Try to vectorize another slice before
23246 // trying a smaller VF.
23247 SliceStartIdx += NonSchedSizeMax;
23248 continue;
23249 }
23250 }
23251 unsigned TreeSize;
23252 std::optional<bool> Res =
23253 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23254 if (!Res) {
23255 // Update the range of non schedulable VFs for slices starting
23256 // at SliceStartIdx.
23257 NonSchedulable
23258 .try_emplace(Slice.front(), std::make_pair(VF, VF))
23259 .first->getSecond()
23260 .second = VF;
23261 } else if (*Res) {
23262 // Mark the vectorized stores so that we don't vectorize them
23263 // again.
23264 VectorizedStores.insert_range(Slice);
23265 // Record that this slice was profitably vectorized and the IR
23266 // changed.
23267 AnyProfitableGraph = RepeatChanged = Changed = true;
23268 // If we vectorized initial block, no need to try to vectorize
23269 // it again.
23270 for (std::pair<unsigned, unsigned> &P :
23271 RangeSizes.slice(SliceStartIdx, VF))
23272 P.first = P.second = 0;
23273 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23274 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
23275 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23276 P.first = P.second = 0;
23277 FirstUnvecStore = SliceStartIdx + VF;
23278 }
23279 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23280 for (std::pair<unsigned, unsigned> &P :
23281 RangeSizes.slice(SliceStartIdx + VF,
23282 MaxSliceEnd - (SliceStartIdx + VF)))
23283 P.first = P.second = 0;
23284 if (MaxSliceEnd == End)
23285 End = SliceStartIdx;
23286 MaxSliceEnd = SliceStartIdx;
23287 }
23288 SliceStartIdx += VF;
23289 continue;
23290 }
23291 if (VF > 2 && Res &&
23292 !all_of(RangeSizes.slice(SliceStartIdx, VF),
23293 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23294 _1))) {
23295 SliceStartIdx += VF;
23296 continue;
23297 }
23298 // For very big VFs, check that we are not rebuilding the same
23299 // trees, just with a larger number of elements.
23300 if (VF > MaxRegVF && TreeSize > 1 &&
23301 all_of(RangeSizes.slice(SliceStartIdx, VF),
23302 std::bind(FirstSizeSame, TreeSize, _1))) {
23303 SliceStartIdx += VF;
23304 while (SliceStartIdx != MaxSliceEnd &&
23305 RangeSizes[SliceStartIdx].first == TreeSize)
23306 ++SliceStartIdx;
23307 continue;
23308 }
23309 if (TreeSize > 1) {
23310 for (std::pair<unsigned, unsigned> &P :
23311 RangeSizes.slice(SliceStartIdx, VF)) {
23312 if (VF >= MaxRegVF)
23313 P.second = std::max(P.second, TreeSize);
23314 else
23315 P.first = std::max(P.first, TreeSize);
23316 }
23317 }
23318 ++SliceStartIdx;
23319 AnyProfitableGraph = true;
23320 }
23321 if (FirstUnvecStore >= End)
23322 break;
23323 if (MaxSliceEnd - FirstUnvecStore < VF &&
23324 MaxSliceEnd - FirstUnvecStore >= MinVF)
23325 AnyProfitableGraph = true;
23326 FirstUnvecStore = std::distance(
23327 RangeSizes.begin(),
23328 find_if(RangeSizes.drop_front(MaxSliceEnd),
23329 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23330 }
23331 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23332 break;
23333 }
23334 // All values vectorized - exit.
23335 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23336 return P.first == 0 && P.second == 0;
23337 }))
23338 break;
23339 // Stop if we have tried all attempts or the remaining attempts cannot help.
23340 if (Repeat >= MaxAttempts ||
23341 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23342 break;
23343 constexpr unsigned StoresLimit = 64;
23344 const unsigned MaxTotalNum = std::min<unsigned>(
23345 Operands.size(),
23346 static_cast<unsigned>(
23347 End -
23348 std::distance(
23349 RangeSizes.begin(),
23350 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23351 1));
23352 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23353 unsigned Limit =
23354 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
23355 CandidateVFs.clear();
23356 if (bit_floor(Limit) == VF)
23357 CandidateVFs.push_back(Limit);
23358 if (VF > MaxTotalNum || VF >= StoresLimit)
23359 break;
23360 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23361 if (P.first != 0)
23362 P.first = std::max(P.second, P.first);
23363 }
23364 // Last attempt to vectorize the maximum number of elements, if all previous
23365 // attempts were unsuccessful because of cost issues.
23366 CandidateVFs.push_back(VF);
23367 }
23368 }
23369 };
23370
23371 /// Groups of stores to vectorize
23372 SmallVector<RelatedStoreInsts> SortedStores;
23373
23374 // Inserts the specified store SI with the given index Idx into the set of
23375 // stores. If a store with the same distance is already present - stop the
23376 // insertion and try to vectorize the stores found so far. If some stores from
23377 // this sequence were not vectorized - try to vectorize them together with the
23378 // new store later. But this logic is applied only to the stores that come
23379 // before the previous store with the same distance.
23380 // Example:
23381 // 1. store x, %p
23382 // 2. store y, %p+1
23383 // 3. store z, %p+2
23384 // 4. store a, %p
23385 // 5. store b, %p+3
23386 // - Scan this from the last to first store. The very first bunch of stores is
23387 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
23388 // vector).
23389 // - The next store in the list - #1 - has the same distance from store #5 as
23390 // the store #4.
23391 // - Try to vectorize sequence of stores 4,2,3,5.
23392 // - If all these stores are vectorized - just drop them.
23393 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
23394 // - Start new stores sequence.
23395 // The new bunch of stores is {1, {1, 0}}.
23396 // - Add the stores from the previous sequence that were not vectorized.
23397 // Here we consider the stores in reverse order, rather than the order in which
23398 // they appear in the IR (Stores is already reversed, see vectorizeStoreChains()).
23399 // Store #3 can be added -> comes after store #4 with the same distance as
23400 // store #1.
23401 // Store #5 cannot be added - comes before store #4.
23402 // This logic improves compile time: we assume that the stores after the
23403 // previous store with the same distance most likely have memory dependencies,
23404 // so there is no need to waste compile time trying to vectorize them.
23405 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
23406 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23407 std::optional<int64_t> PtrDist;
23408 auto *RelatedStores = find_if(
23409 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23410 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23411 return PtrDist.has_value();
23412 });
23413
23414 // We did not find a comparable store, start a new group.
23415 if (RelatedStores == SortedStores.end()) {
23416 SortedStores.emplace_back(Idx, Stores);
23417 return;
23418 }
23419
23420 // If there is already a store in the group with the same PtrDiff, try to
23421 // vectorize the existing instructions before adding the current store.
23422 // Otherwise, insert this store and keep collecting.
23423 if (std::optional<unsigned> PrevInst =
23424 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23425 TryToVectorize(RelatedStores->getStores());
23426 RelatedStores->clearVectorizedStores(VectorizedStores);
23427 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
23428 /*NewBaseInstIdx=*/Idx,
23429 /*DistFromCurBase=*/*PtrDist);
23430 }
23431 };
23432 Type *PrevValTy = nullptr;
23433 for (auto [I, SI] : enumerate(Stores)) {
23434 if (R.isDeleted(SI))
23435 continue;
23436 if (!PrevValTy)
23437 PrevValTy = SI->getValueOperand()->getType();
23438 // Check that we do not try to vectorize stores of different types.
23439 if (PrevValTy != SI->getValueOperand()->getType()) {
23440 for (RelatedStoreInsts &StoreSeq : SortedStores)
23441 TryToVectorize(StoreSeq.getStores());
23442 SortedStores.clear();
23443 PrevValTy = SI->getValueOperand()->getType();
23444 }
23445 FillStoresSet(I, SI);
23446 }
23447
23448 // Final vectorization attempt.
23449 for (RelatedStoreInsts &StoreSeq : SortedStores)
23450 TryToVectorize(StoreSeq.getStores());
23451
23452 return Changed;
23453}
23454
23455void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23456 // Initialize the collections. We will make a single pass over the block.
23457 Stores.clear();
23458 GEPs.clear();
23459
23460 // Visit the store and getelementptr instructions in BB and organize them in
23461 // Stores and GEPs according to the underlying objects of their pointer
23462 // operands.
23463 for (Instruction &I : *BB) {
23464 // Ignore store instructions that are volatile or have a pointer operand
23465 // that doesn't point to a scalar type.
23466 if (auto *SI = dyn_cast<StoreInst>(&I)) {
23467 if (!SI->isSimple())
23468 continue;
23469 if (!isValidElementType(SI->getValueOperand()->getType()))
23470 continue;
23471 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
23472 }
23473
23474 // Ignore getelementptr instructions that have more than one index, a
23475 // constant index, or a pointer operand that doesn't point to a scalar
23476 // type.
23477 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
23478 if (GEP->getNumIndices() != 1)
23479 continue;
23480 Value *Idx = GEP->idx_begin()->get();
23481 if (isa<Constant>(Idx))
23482 continue;
23483 if (!isValidElementType(Idx->getType()))
23484 continue;
23485 if (GEP->getType()->isVectorTy())
23486 continue;
23487 GEPs[GEP->getPointerOperand()].push_back(GEP);
23488 }
23489 }
23490}
23491
23492bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
23493 bool MaxVFOnly) {
23494 if (VL.size() < 2)
23495 return false;
23496
23497 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23498 << VL.size() << ".\n");
23499
23500 // Check that all of the parts are instructions of the same type;
23501 // we permit an alternate opcode via InstructionsState.
23502 InstructionsState S = getSameOpcode(VL, *TLI);
23503 if (!S)
23504 return false;
23505
23506 Instruction *I0 = S.getMainOp();
23507 // Make sure invalid types (including vector type) are rejected before
23508 // determining vectorization factor for scalar instructions.
23509 for (Value *V : VL) {
23510 Type *Ty = V->getType();
23511 if (!isValidElementType(Ty)) {
23512 // NOTE: the following will give the user an internal LLVM type name, which
23513 // may not be useful.
23514 R.getORE()->emit([&]() {
23515 std::string TypeStr;
23516 llvm::raw_string_ostream OS(TypeStr);
23517 Ty->print(OS);
23518 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23519 << "Cannot SLP vectorize list: type "
23520 << TypeStr + " is unsupported by vectorizer";
23521 });
23522 return false;
23523 }
23524 }
23525
23526 Type *ScalarTy = getValueType(VL[0]);
23527 unsigned Sz = R.getVectorElementSize(I0);
23528 unsigned MinVF = R.getMinVF(Sz);
23529 unsigned MaxVF = std::max<unsigned>(
23530 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
23531 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23532 if (MaxVF < 2) {
23533 R.getORE()->emit([&]() {
23534 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23535 << "Cannot SLP vectorize list: vectorization factor "
23536 << "less than 2 is not supported";
23537 });
23538 return false;
23539 }
23540
23541 bool Changed = false;
23542 bool CandidateFound = false;
23543 InstructionCost MinCost = SLPCostThreshold.getValue();
23544
23545 unsigned NextInst = 0, MaxInst = VL.size();
23546 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23547 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
23548 // No actual vectorization should happen if the number of parts is the same
23549 // as the provided vectorization factor (i.e. the scalar type is used for
23550 // vector code during codegen).
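// E.g. (illustrative): if each element of the would-be vector already occupies
// a full vector register, the VF-wide vector is legalized into VF parts and no
// real vectorization would happen, so such VFs are skipped.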
23551 auto *VecTy = getWidenedType(ScalarTy, VF);
23552 if (TTI->getNumberOfParts(VecTy) == VF)
23553 continue;
23554 for (unsigned I = NextInst; I < MaxInst; ++I) {
23555 unsigned ActualVF = std::min(MaxInst - I, VF);
23556
23557 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
23558 continue;
23559
23560 if (MaxVFOnly && ActualVF < MaxVF)
23561 break;
23562 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23563 break;
23564
23565 SmallVector<Value *> Ops(ActualVF, nullptr);
23566 unsigned Idx = 0;
23567 for (Value *V : VL.drop_front(I)) {
23568 // Check that a previous iteration of this loop did not delete the
23569 // Value.
23570 if (auto *Inst = dyn_cast<Instruction>(V);
23571 !Inst || !R.isDeleted(Inst)) {
23572 Ops[Idx] = V;
23573 ++Idx;
23574 if (Idx == ActualVF)
23575 break;
23576 }
23577 }
23578 // Not enough vectorizable instructions - exit.
23579 if (Idx != ActualVF)
23580 break;
23581
23582 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23583 << "\n");
23584
23585 R.buildTree(Ops);
23586 if (R.isTreeTinyAndNotFullyVectorizable())
23587 continue;
23588 if (R.isProfitableToReorder()) {
23589 R.reorderTopToBottom();
23590 R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
23591 }
23592 R.transformNodes();
23593 R.buildExternalUses();
23594
23595 R.computeMinimumValueSizes();
23596 InstructionCost Cost = R.getTreeCost();
23597 CandidateFound = true;
23598 MinCost = std::min(MinCost, Cost);
23599
23600 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
23601 << " for VF=" << ActualVF << "\n");
23602 if (Cost < -SLPCostThreshold) {
23603 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
23604 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23606 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23607 << " and with tree size "
23608 << ore::NV("TreeSize", R.getTreeSize()));
23609
23610 R.vectorizeTree();
23611 // Move to the next bundle.
23612 I += VF - 1;
23613 NextInst = I + 1;
23614 Changed = true;
23615 }
23616 }
23617 }
23618
23619 if (!Changed && CandidateFound) {
23620 R.getORE()->emit([&]() {
23621 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23622 << "List vectorization was possible but not beneficial with cost "
23623 << ore::NV("Cost", MinCost) << " >= "
23624 << ore::NV("Treshold", -SLPCostThreshold);
23625 });
23626 } else if (!Changed) {
23627 R.getORE()->emit([&]() {
23628 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23629 << "Cannot SLP vectorize list: vectorization was impossible"
23630 << " with available vectorization factors";
23631 });
23632 }
23633 return Changed;
23634}
23635
23636namespace {
23637
23638/// Model horizontal reductions.
23639///
23640/// A horizontal reduction is a tree of reduction instructions that has values
23641/// that can be put into a vector as its leaves. For example:
23642///
23643/// mul mul mul mul
23644/// \ / \ /
23645/// + +
23646/// \ /
23647/// +
23648/// This tree has "mul" as its leaf values and "+" as its reduction
23649/// instructions. A reduction can feed into a store or a binary operation
23650/// feeding a phi.
23651/// ...
23652/// \ /
23653/// +
23654/// |
23655/// phi +=
23656///
23657/// Or:
23658/// ...
23659/// \ /
23660/// +
23661/// |
23662/// *p =
23663///
23664class HorizontalReduction {
23665 using ReductionOpsType = SmallVector<Value *, 16>;
23666 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23667 ReductionOpsListType ReductionOps;
23668 /// List of possibly reduced values.
23669 SmallVector<SmallVector<Value *>> ReducedVals;
23670 /// Maps reduced value to the corresponding reduction operation.
23671 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23672 WeakTrackingVH ReductionRoot;
23673 /// The type of reduction operation.
23674 RecurKind RdxKind;
23675 /// Checks if the optimization of original scalar identity operations on
23676 /// matched horizontal reductions is enabled and allowed.
23677 bool IsSupportedHorRdxIdentityOp = false;
23678 /// The minimum number of the reduced values.
23679 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
23680 /// Contains vector values for reduction including their scale factor and
23681 /// signedness.
23682 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
23683
23684 static bool isCmpSelMinMax(Instruction *I) {
23685 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
23686 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
23687 }
23688
23689 // And/or are potentially poison-safe logical patterns like:
23690 // select x, y, false
23691 // select x, true, y
23692 static bool isBoolLogicOp(Instruction *I) {
23693 return isa<SelectInst>(I) &&
23694 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
23695 }
23696
23697 /// Checks if instruction is associative and can be vectorized.
23698 static bool isVectorizable(RecurKind Kind, Instruction *I,
23699 bool TwoElementReduction = false) {
23700 if (Kind == RecurKind::None)
23701 return false;
23702
23703 // Integer ops that map to select instructions or intrinsics are fine.
23704 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
23705 isBoolLogicOp(I))
23706 return true;
23707
23708 // No need to check for associativity, if 2 reduced values.
23709 if (TwoElementReduction)
23710 return true;
23711
23712 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23713 // FP min/max are associative except for NaN and -0.0. We do not
23714 // have to rule out -0.0 here because the intrinsic semantics do not
23715 // specify a fixed result for it.
23716 return I->getFastMathFlags().noNaNs();
23717 }
23718
23719 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23720 return true;
23721
23722 return I->isAssociative();
23723 }
23724
23725 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23726 // Poison-safe 'or' takes the form: select X, true, Y
23727 // To make that work with the normal operand processing, we skip the
23728 // true value operand.
23729 // TODO: Change the code and data structures to handle this without a hack.
23730 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
23731 return I->getOperand(2);
23732 return I->getOperand(Index);
23733 }
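// Example (illustrative): for the poison-safe 'or' form
//   %r = select i1 %c, i1 true, i1 %y
// requesting operand 1 is redirected to %y (operand 2 of the select), so
// callers see the two actually reduced values %c and %y.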
23734
23735 /// Creates reduction operation with the current opcode.
23736 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
23737 Value *RHS, const Twine &Name, bool UseSelect) {
23738 Type *OpTy = LHS->getType();
23739 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
23740 switch (Kind) {
23741 case RecurKind::Or: {
23742 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23743 return Builder.CreateSelectWithUnknownProfile(
23744 LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
23745 RHS, DEBUG_TYPE, Name);
23746 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23747 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23748 Name);
23749 }
23750 case RecurKind::And: {
23751 if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
23752 return Builder.CreateSelectWithUnknownProfile(
23753 LHS, RHS,
23754 ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
23755 DEBUG_TYPE, Name);
23756 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23757 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23758 Name);
23759 }
23760 case RecurKind::Add:
23761 case RecurKind::Mul:
23762 case RecurKind::Xor:
23763 case RecurKind::FAdd:
23764 case RecurKind::FMul: {
23765 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
23766 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
23767 Name);
23768 }
23769 case RecurKind::SMax:
23770 case RecurKind::SMin:
23771 case RecurKind::UMax:
23772 case RecurKind::UMin:
23773 if (UseSelect) {
23774 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
23775 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
23776 return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
23777 Name);
23778 }
23779 [[fallthrough]];
23780 case RecurKind::FMax:
23781 case RecurKind::FMin:
23782 case RecurKind::FMaximum:
23783 case RecurKind::FMinimum:
23784 case RecurKind::FMaximumNum:
23785 case RecurKind::FMinimumNum: {
23786 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(Kind);
23787 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
23788 }
23789 default:
23790 llvm_unreachable("Unknown reduction operation.");
23791 }
23792 }
23793
23794 /// Creates reduction operation with the current opcode with the IR flags
23795 /// from \p ReductionOps, dropping nuw/nsw flags.
23796 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
23797 Value *RHS, const Twine &Name,
23798 const ReductionOpsListType &ReductionOps) {
23799 bool UseSelect = ReductionOps.size() == 2 ||
23800 // Logical or/and.
23801 (ReductionOps.size() == 1 &&
23802 any_of(ReductionOps.front(), IsaPred<SelectInst>));
23803 assert((!UseSelect || ReductionOps.size() != 2 ||
23804 isa<SelectInst>(ReductionOps[1][0])) &&
23805 "Expected cmp + select pairs for reduction");
23806 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
23807 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
23808 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
23809 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
23810 /*IncludeWrapFlags=*/false);
23811 propagateIRFlags(Op, ReductionOps[1], nullptr,
23812 /*IncludeWrapFlags=*/false);
23813 return Op;
23814 }
23815 }
23816 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
23817 return Op;
23818 }
23819
23820public:
23821 static RecurKind getRdxKind(Value *V) {
23822 auto *I = dyn_cast<Instruction>(V);
23823 if (!I)
23824 return RecurKind::None;
23825 if (match(I, m_Add(m_Value(), m_Value())))
23826 return RecurKind::Add;
23827 if (match(I, m_Mul(m_Value(), m_Value())))
23828 return RecurKind::Mul;
23829 if (match(I, m_And(m_Value(), m_Value())) ||
23830 match(I, m_LogicalAnd(m_Value(), m_Value())))
23831 return RecurKind::And;
23832 if (match(I, m_Or(m_Value(), m_Value())) ||
23833 match(I, m_LogicalOr(m_Value(), m_Value())))
23834 return RecurKind::Or;
23835 if (match(I, m_Xor(m_Value(), m_Value())))
23836 return RecurKind::Xor;
23837 if (match(I, m_FAdd(m_Value(), m_Value())))
23838 return RecurKind::FAdd;
23839 if (match(I, m_FMul(m_Value(), m_Value())))
23840 return RecurKind::FMul;
23841
23842 if (match(I, m_FMax(m_Value(), m_Value())))
23843 return RecurKind::FMax;
23844 if (match(I, m_FMin(m_Value(), m_Value())))
23845 return RecurKind::FMin;
23846
23847 if (match(I, m_FMaximum(m_Value(), m_Value())))
23848 return RecurKind::FMaximum;
23849 if (match(I, m_FMinimum(m_Value(), m_Value())))
23850 return RecurKind::FMinimum;
23851 // This matches either cmp+select or intrinsics. SLP is expected to handle
23852 // either form.
23853 // TODO: If we are canonicalizing to intrinsics, we can remove several
23854 // special-case paths that deal with selects.
23855 if (match(I, m_SMax(m_Value(), m_Value())))
23856 return RecurKind::SMax;
23857 if (match(I, m_SMin(m_Value(), m_Value())))
23858 return RecurKind::SMin;
23859 if (match(I, m_UMax(m_Value(), m_Value())))
23860 return RecurKind::UMax;
23861 if (match(I, m_UMin(m_Value(), m_Value())))
23862 return RecurKind::UMin;
23863
23864 if (auto *Select = dyn_cast<SelectInst>(I)) {
23865 // Try harder: look for min/max pattern based on instructions producing
23866 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
23867 // During the intermediate stages of SLP, it's very common to have
23868 // pattern like this (since optimizeGatherSequence is run only once
23869 // at the end):
23870 // %1 = extractelement <2 x i32> %a, i32 0
23871 // %2 = extractelement <2 x i32> %a, i32 1
23872 // %cond = icmp sgt i32 %1, %2
23873 // %3 = extractelement <2 x i32> %a, i32 0
23874 // %4 = extractelement <2 x i32> %a, i32 1
23875 // %select = select i1 %cond, i32 %3, i32 %4
23876 CmpPredicate Pred;
23877 Instruction *L1;
23878 Instruction *L2;
23879
23880 Value *LHS = Select->getTrueValue();
23881 Value *RHS = Select->getFalseValue();
23882 Value *Cond = Select->getCondition();
23883
23884 // TODO: Support inverse predicates.
23885 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
23886 if (!isa<ExtractElementInst>(RHS) ||
23887 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23888 return RecurKind::None;
23889 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
23890 if (!isa<ExtractElementInst>(LHS) ||
23891 !L1->isIdenticalTo(cast<Instruction>(LHS)))
23892 return RecurKind::None;
23893 } else {
23894 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
23895 return RecurKind::None;
23896 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
23897 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
23898 !L2->isIdenticalTo(cast<Instruction>(RHS)))
23899 return RecurKind::None;
23900 }
23901
23902 switch (Pred) {
23903 default:
23904 return RecurKind::None;
23905 case CmpInst::ICMP_SGT:
23906 case CmpInst::ICMP_SGE:
23907 return RecurKind::SMax;
23908 case CmpInst::ICMP_SLT:
23909 case CmpInst::ICMP_SLE:
23910 return RecurKind::SMin;
23911 case CmpInst::ICMP_UGT:
23912 case CmpInst::ICMP_UGE:
23913 return RecurKind::UMax;
23914 case CmpInst::ICMP_ULT:
23915 case CmpInst::ICMP_ULE:
23916 return RecurKind::UMin;
23917 }
23918 }
23919 return RecurKind::None;
23920 }
23921
23922 /// Get the index of the first operand.
23923 static unsigned getFirstOperandIndex(Instruction *I) {
23924 return isCmpSelMinMax(I) ? 1 : 0;
23925 }
23926
23927private:
23928 /// Total number of operands in the reduction operation.
23929 static unsigned getNumberOfOperands(Instruction *I) {
23930 return isCmpSelMinMax(I) ? 3 : 2;
23931 }
23932
23933 /// Checks if the instruction is in basic block \p BB.
23934 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
23935 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
23936 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
23937 auto *Sel = cast<SelectInst>(I);
23938 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
23939 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
23940 }
23941 return I->getParent() == BB;
23942 }
23943
23944 /// Expected number of uses for reduction operations/reduced values.
23945 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
23946 if (IsCmpSelMinMax) {
23947 // SelectInst must be used twice while the condition op must have single
23948 // use only.
23949 if (auto *Sel = dyn_cast<SelectInst>(I))
23950 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
23951 return I->hasNUses(2);
23952 }
23953
23954 // Arithmetic reduction operation must be used once only.
23955 return I->hasOneUse();
23956 }
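// Illustration (not from the original source): in an SMax chain such as
//   %c0 = icmp sgt i32 %a, %b
//   %m0 = select i1 %c0, i32 %a, i32 %b
//   %c1 = icmp sgt i32 %m0, %x
//   %m1 = select i1 %c1, i32 %m0, i32 %x
// the inner select %m0 has exactly two uses (%c1 and %m1) and its condition
// %c0 has a single use, which is the shape these checks expect.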
23957
23958 /// Initializes the list of reduction operations.
23959 void initReductionOps(Instruction *I) {
23960 if (isCmpSelMinMax(I))
23961 ReductionOps.assign(2, ReductionOpsType());
23962 else
23963 ReductionOps.assign(1, ReductionOpsType());
23964 }
23965
23966 /// Add all reduction operations for the reduction instruction \p I.
23967 void addReductionOps(Instruction *I) {
23968 if (isCmpSelMinMax(I)) {
23969 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
23970 ReductionOps[1].emplace_back(I);
23971 } else {
23972 ReductionOps[0].emplace_back(I);
23973 }
23974 }
23975
23976 static bool isGoodForReduction(ArrayRef<Value *> Data) {
23977 int Sz = Data.size();
23978 auto *I = dyn_cast<Instruction>(Data.front());
23979 return Sz > 1 || isConstant(Data.front()) ||
23980 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
23981 }
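// E.g. (illustrative): a group holding a single plain load is not considered
// worth a separate reduction group on its own, whereas a single constant or a
// single non-load instruction with an alternation-friendly opcode is.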
23982
23983public:
23984 HorizontalReduction() = default;
23985 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
23986 : ReductionRoot(I), ReductionLimit(2) {
23987 RdxKind = HorizontalReduction::getRdxKind(I);
23988 ReductionOps.emplace_back().push_back(I);
23989 ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
23990 for (Value *V : Ops)
23991 ReducedValsToOps[V].push_back(I);
23992 }
23993
23994 bool matchReductionForOperands() const {
23995 // Analyze "regular" integer/FP types for reductions - no target-specific
23996 // types or pointers.
23997 assert(ReductionRoot && "Reduction root is not set!");
23998 if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
23999 all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
24000 return Ops.size() == 2;
24001 })))
24002 return false;
24003
24004 return true;
24005 }
24006
24007 /// Try to find a reduction tree.
24008 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
24009 ScalarEvolution &SE, const DataLayout &DL,
24010 const TargetLibraryInfo &TLI) {
24011 RdxKind = HorizontalReduction::getRdxKind(Root);
24012 if (!isVectorizable(RdxKind, Root))
24013 return false;
24014
24015 // Analyze "regular" integer/FP types for reductions - no target-specific
24016 // types or pointers.
24017 Type *Ty = Root->getType();
24018 if (!isValidElementType(Ty) || Ty->isPointerTy())
24019 return false;
24020
24021 // Though the ultimate reduction may have multiple uses, its condition must
24022 // have only single use.
24023 if (auto *Sel = dyn_cast<SelectInst>(Root))
24024 if (!Sel->getCondition()->hasOneUse())
24025 return false;
24026
24027 ReductionRoot = Root;
24028
24029 // Iterate through all the operands of the possible reduction tree and
24030 // gather all the reduced values, sorting them by their value id.
24031 BasicBlock *BB = Root->getParent();
24032 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
24033 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
24034 1, std::make_pair(Root, 0));
24035 // Checks if the operands of the \p TreeN instruction are also reduction
24036 // operations or should be treated as reduced values or an extra argument,
24037 // which is not part of the reduction.
24038 auto CheckOperands = [&](Instruction *TreeN,
24039 SmallVectorImpl<Value *> &PossibleReducedVals,
24040 SmallVectorImpl<Instruction *> &ReductionOps,
24041 unsigned Level) {
24042 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
24043 getNumberOfOperands(TreeN)))) {
24044 Value *EdgeVal = getRdxOperand(TreeN, I);
24045 ReducedValsToOps[EdgeVal].push_back(TreeN);
24046 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
24047 // If the edge is not an instruction, or it is different from the main
24048 // reduction opcode or has too many uses - possible reduced value.
24049 // Also, do not try to reduce const values, if the operation is not
24050 // foldable.
24051 if (!EdgeInst || Level > RecursionMaxDepth ||
24052 getRdxKind(EdgeInst) != RdxKind ||
24053 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
24054 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
24055 !isVectorizable(RdxKind, EdgeInst) ||
24056 (R.isAnalyzedReductionRoot(EdgeInst) &&
24057 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
24058 PossibleReducedVals.push_back(EdgeVal);
24059 continue;
24060 }
24061 ReductionOps.push_back(EdgeInst);
24062 }
24063 };
24064 // Try to regroup the reduced values so that reducing them becomes more
24065 // profitable. Values are grouped by their value ids, instructions by their
24066 // instruction opcode and/or alternate opcode, with extra analysis for
24067 // loads (grouped by the distance between their pointers) and cmp
24068 // instructions (grouped by the predicate).
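// Illustrative example (not from the original comment): for reduced values
// {load A[0], add X, load A[1], add Y}, the two loads fall into one
// (key, sub-key) bucket because their pointers differ by a known constant,
// the two adds fall into another, and each bucket is then tried as its own
// candidate sequence, longest first.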
24069 SmallMapVector<
24070 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
24071 8>
24072 PossibleReducedVals;
24073 initReductionOps(Root);
24074 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
24075 SmallSet<size_t, 2> LoadKeyUsed;
24076
24077 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
24078 Key = hash_combine(hash_value(LI->getParent()), Key);
24079 Value *Ptr =
24080 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
24081 if (!LoadKeyUsed.insert(Key).second) {
24082 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
24083 if (LIt != LoadsMap.end()) {
24084 for (LoadInst *RLI : LIt->second) {
24085 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
24086 LI->getType(), LI->getPointerOperand(), DL, SE,
24087 /*StrictCheck=*/true))
24088 return hash_value(RLI->getPointerOperand());
24089 }
24090 for (LoadInst *RLI : LIt->second) {
24091 if (arePointersCompatible(RLI->getPointerOperand(),
24092 LI->getPointerOperand(), TLI)) {
24093 hash_code SubKey = hash_value(RLI->getPointerOperand());
24094 return SubKey;
24095 }
24096 }
24097 if (LIt->second.size() > 2) {
24098 hash_code SubKey =
24099 hash_value(LIt->second.back()->getPointerOperand());
24100 return SubKey;
24101 }
24102 }
24103 }
24104 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
24105 .first->second.push_back(LI);
24106 return hash_value(LI->getPointerOperand());
24107 };
24108
24109 while (!Worklist.empty()) {
24110 auto [TreeN, Level] = Worklist.pop_back_val();
24111 SmallVector<Value *> PossibleRedVals;
24112 SmallVector<Instruction *> PossibleReductionOps;
24113 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24114 addReductionOps(TreeN);
24115 // Add reduction values. The values are sorted for better vectorization
24116 // results.
24117 for (Value *V : PossibleRedVals) {
24118 size_t Key, Idx;
24119 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
24120 /*AllowAlternate=*/false);
24121 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
24122 }
24123 for (Instruction *I : reverse(PossibleReductionOps))
24124 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
24125 }
24126 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
24127 // Sort values by the total number of value kinds so that the reduction
24128 // starts from the longest possible sequences of reduced values.
24129 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
24130 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
24131 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
24132 for (auto &Slice : PossibleRedVals) {
24133 PossibleRedValsVect.emplace_back();
24134 auto RedValsVect = Slice.second.takeVector();
24135 stable_sort(RedValsVect, llvm::less_second());
24136 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
24137 PossibleRedValsVect.back().append(Data.second, Data.first);
24138 }
24139 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
24140 return P1.size() > P2.size();
24141 });
24142 bool First = true;
24143 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
24144 if (First) {
24145 First = false;
24146 ReducedVals.emplace_back();
24147 } else if (!isGoodForReduction(Data)) {
24148 auto *LI = dyn_cast<LoadInst>(Data.front());
24149 auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
24150 if (!LI || !LastLI ||
24151 getUnderlyingObject(LI->getPointerOperand()) !=
24152 getUnderlyingObject(LastLI->getPointerOperand()))
24153 ReducedVals.emplace_back();
24154 }
24155 ReducedVals.back().append(Data.rbegin(), Data.rend());
24156 }
24157 }
24158 // Sort the reduced values by number of same/alternate opcode and/or pointer
24159 // operand.
24160 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
24161 return P1.size() > P2.size();
24162 });
24163 return true;
24164 }
24165
24166 /// Attempt to vectorize the tree found by matchAssociativeReduction.
24167 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
24168 const TargetLibraryInfo &TLI, AssumptionCache *AC,
24169 DominatorTree &DT) {
24170 constexpr unsigned RegMaxNumber = 4;
24171 constexpr unsigned RedValsMaxNumber = 128;
24172 // If there are a sufficient number of reduction values, reduce
24173 // to a nearby power-of-2. We can safely generate oversized
24174 // vectors and rely on the backend to split them to legal sizes.
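    // E.g. (illustrative): 24 reduced i32 values may be built as one wide
    // reduction tree even though no target register holds 24 x i32; type
    // legalization later splits the oversized vector into several legal parts.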
24175 if (unsigned NumReducedVals = std::accumulate(
24176 ReducedVals.begin(), ReducedVals.end(), 0,
24177 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
24178 if (!isGoodForReduction(Vals))
24179 return Num;
24180 return Num + Vals.size();
24181 });
24182 NumReducedVals < ReductionLimit &&
24183 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
24184 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
24185 })) {
24186 for (ReductionOpsType &RdxOps : ReductionOps)
24187 for (Value *RdxOp : RdxOps)
24188 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24189 return nullptr;
24190 }
24191
24192 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24193 TargetFolder(DL));
24194 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
24195
24196 // Track the reduced values in case they are replaced by extractelement
24197 // instructions because of the vectorization.
24198 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
24199 ReducedVals.front().size());
24200
24201 // The compare instruction of a min/max is the insertion point for new
24202 // instructions and may be replaced with a new compare instruction.
24203 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
24204 assert(isa<SelectInst>(RdxRootInst) &&
24205 "Expected min/max reduction to have select root instruction");
24206 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
24207 assert(isa<Instruction>(ScalarCond) &&
24208 "Expected min/max reduction to have compare condition");
24209 return cast<Instruction>(ScalarCond);
24210 };
24211
24212 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
24213 return isBoolLogicOp(cast<Instruction>(V));
24214 });
24215 // Return new VectorizedTree, based on previous value.
24216 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
24217 if (VectorizedTree) {
24218 // Update the final value in the reduction.
24219 Builder.SetCurrentDebugLocation(
24220 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
24221 if (AnyBoolLogicOp) {
24222 auto It = ReducedValsToOps.find(VectorizedTree);
24223 auto It1 = ReducedValsToOps.find(Res);
24224 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
24225 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
24226 (It != ReducedValsToOps.end() &&
24227 any_of(It->getSecond(), [&](Instruction *I) {
24228 return isBoolLogicOp(I) &&
24229 getRdxOperand(I, 0) == VectorizedTree;
24230 }))) {
24231 ;
24232 } else if (isGuaranteedNotToBePoison(Res, AC) ||
24233 (It1 != ReducedValsToOps.end() &&
24234 any_of(It1->getSecond(), [&](Instruction *I) {
24235 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
24236 }))) {
24237 std::swap(VectorizedTree, Res);
24238 } else {
24239 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
24240 }
24241 }
24242
24243 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
24244 ReductionOps);
24245 }
24246 // Initialize the final value in the reduction.
24247 return Res;
24248 };
24249 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
24250 ReductionOps.front().size());
24251 for (ReductionOpsType &RdxOps : ReductionOps)
24252 for (Value *RdxOp : RdxOps) {
24253 if (!RdxOp)
24254 continue;
24255 IgnoreList.insert(RdxOp);
24256 }
24257 // Intersect the fast-math-flags from all reduction operations.
24258 FastMathFlags RdxFMF;
24259 RdxFMF.set();
24260 for (Value *U : IgnoreList)
24261 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
24262 RdxFMF &= FPMO->getFastMathFlags();
24263 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
24264
24265 // Need to track reduced vals, they may be changed during vectorization of
24266 // subvectors.
24267 for (ArrayRef<Value *> Candidates : ReducedVals)
24268 for (Value *V : Candidates)
24269 TrackedVals.try_emplace(V, V);
24270
24271 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24272 Value *V) -> unsigned & {
24273 auto *It = MV.find(V);
24274 assert(It != MV.end() && "Unable to find given key.");
24275 return It->second;
24276 };
24277
24278 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24279 // List of the values that were reduced in other trees as part of gather
24280 // nodes and thus require an extract if fully vectorized in other trees.
24281 SmallPtrSet<Value *, 4> RequiredExtract;
24282 WeakTrackingVH VectorizedTree = nullptr;
24283 bool CheckForReusedReductionOps = false;
24284 // Try to vectorize elements based on their type.
24285 SmallVector<InstructionsState> States;
24286 for (ArrayRef<Value *> RV : ReducedVals)
24287 States.push_back(getSameOpcode(RV, TLI));
24288 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24289 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
24290 InstructionsState S = States[I];
24291 SmallVector<Value *> Candidates;
24292 Candidates.reserve(2 * OrigReducedVals.size());
24293 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24294 for (Value *ReducedVal : OrigReducedVals) {
24295 Value *RdxVal = TrackedVals.at(ReducedVal);
24296 // Check if the reduction value was not overridden by the extractelement
24297 // instruction because of the vectorization and exclude it, if it is not
24298 // compatible with other values.
24299 // Also check if the instruction was folded to constant/other value.
24300 auto *Inst = dyn_cast<Instruction>(RdxVal);
24301 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
24302 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
24303 (S && !Inst))
24304 continue;
24305 Candidates.push_back(RdxVal);
24306 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
24307 }
24308 bool ShuffledExtracts = false;
24309 // Try to handle shuffled extractelements.
24310 if (S && S.getOpcode() == Instruction::ExtractElement &&
24311 !S.isAltShuffle() && I + 1 < E) {
24312 SmallVector<Value *> CommonCandidates(Candidates);
24313 for (Value *RV : ReducedVals[I + 1]) {
24314 Value *RdxVal = TrackedVals.at(RV);
24315 // Check if the reduction value was not overridden by the
24316 // extractelement instruction because of the vectorization and
24317 // exclude it, if it is not compatible with other values.
24318 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
24319 if (!Inst)
24320 continue;
24321 CommonCandidates.push_back(RdxVal);
24322 TrackedToOrig.try_emplace(RdxVal, RV);
24323 }
24324 SmallVector<int> Mask;
24325 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
24326 ++I;
24327 Candidates.swap(CommonCandidates);
24328 ShuffledExtracts = true;
24329 }
24330 }
24331
24332 // Emit code for constant values.
24333 if (Candidates.size() > 1 && allConstant(Candidates)) {
24334 Value *Res = Candidates.front();
24335 Value *OrigV = TrackedToOrig.at(Candidates.front());
24336 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24337 for (Value *VC : ArrayRef(Candidates).drop_front()) {
24338 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24339 Value *OrigV = TrackedToOrig.at(VC);
24340 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24341 if (auto *ResI = dyn_cast<Instruction>(Res))
24342 V.analyzedReductionRoot(ResI);
24343 }
24344 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24345 continue;
24346 }
24347
24348 unsigned NumReducedVals = Candidates.size();
24349 if (NumReducedVals < ReductionLimit &&
24350 (NumReducedVals < 2 || !isSplat(Candidates)))
24351 continue;
24352
24353 // Check if we support repeated scalar values processing (optimization of
24354 // original scalar identity operations on matched horizontal reductions).
24355 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24356 RdxKind != RecurKind::FMul &&
24357 RdxKind != RecurKind::FMulAdd;
24358 // Gather same values.
24359 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24360 if (IsSupportedHorRdxIdentityOp)
24361 for (Value *V : Candidates) {
24362 Value *OrigV = TrackedToOrig.at(V);
24363 ++SameValuesCounter.try_emplace(OrigV).first->second;
24364 }
24365 // Used to check if the reduced values are used the same number of times. In
24366 // this case the compiler may produce better code. E.g. if reduced values are
24367 // aabbccdd (8 x values), then the first node of the tree will have a node
24368 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
24369 // Plus, the final reduction will be performed on <8 x aabbccdd>.
24370 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
24371 // x abcd) * 2.
24372 // Currently it only handles add/fadd/xor. and/or/min/max do not require
24373 // this analysis, other operations may require an extra estimation of
24374 // the profitability.
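// E.g. (illustrative), for an add reduction of aabbccdd:
//   (a+a)+(b+b)+(c+c)+(d+d) == 2*(a+b+c+d),
// so reducing <4 x abcd> once and multiplying the scalar result by 2 yields
// the same value with a smaller tree.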
24375 bool SameScaleFactor = false;
24376 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24377 SameValuesCounter.size() != Candidates.size();
24378 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
24379 if (OptReusedScalars) {
24380 SameScaleFactor =
24381 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24382 RdxKind == RecurKind::Xor) &&
24383 all_of(drop_begin(SameValuesCounter),
24384 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24385 return P.second == SameValuesCounter.front().second;
24386 });
24387 Candidates.resize(SameValuesCounter.size());
24388 transform(SameValuesCounter, Candidates.begin(),
24389 [&](const auto &P) { return TrackedVals.at(P.first); });
24390 NumReducedVals = Candidates.size();
24391 // Have a reduction of the same element.
24392 if (NumReducedVals == 1) {
24393 Value *OrigV = TrackedToOrig.at(Candidates.front());
24394 unsigned Cnt = At(SameValuesCounter, OrigV);
24395 Value *RedVal =
24396 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24397 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24398 VectorizedVals.try_emplace(OrigV, Cnt);
24399 ExternallyUsedValues.insert(OrigV);
24400 continue;
24401 }
24402 }
24403
24404 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24405 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24406 const unsigned MaxElts = std::clamp<unsigned>(
24407 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
24408 RegMaxNumber * RedValsMaxNumber);
24409
24410 unsigned ReduxWidth = NumReducedVals;
24411 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24412 unsigned NumParts, NumRegs;
24413 Type *ScalarTy = Candidates.front()->getType();
24414 ReduxWidth =
24415 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
24416 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24417 NumParts = ::getNumberOfParts(TTI, Tp);
24418 NumRegs =
24419 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector=*/true, Tp));
24420 while (NumParts > NumRegs) {
24421 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24422 ReduxWidth = bit_floor(ReduxWidth - 1);
24423 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
24424 NumParts = ::getNumberOfParts(TTI, Tp);
24425 NumRegs =
24426 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector=*/true, Tp));
24427 }
24428 if (NumParts > NumRegs / 2)
24429 ReduxWidth = bit_floor(ReduxWidth);
24430 return ReduxWidth;
24431 };
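// Illustration (not from the original source): if the widened type for the
// requested width needs more parts than there are registers in its class,
// the lambda keeps flooring the width to the next lower power of two until
// the type fits, so a width that does not fit is reduced to a smaller
// power-of-two factor.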
24432 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
24433 ReduxWidth = GetVectorFactor(ReduxWidth);
24434 ReduxWidth = std::min(ReduxWidth, MaxElts);
24435
24436 unsigned Start = 0;
24437 unsigned Pos = Start;
24438 // Restarts vectorization attempt with lower vector factor.
24439 unsigned PrevReduxWidth = ReduxWidth;
24440 bool CheckForReusedReductionOpsLocal = false;
24441 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24442 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24443 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24444 // Check if any of the reduction ops are gathered. If so, it is worth
24445 // trying again with a smaller number of reduction ops.
24446 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24447 }
24448 ++Pos;
24449 if (Pos < NumReducedVals - ReduxWidth + 1)
24450 return IsAnyRedOpGathered;
24451 Pos = Start;
24452 --ReduxWidth;
24453 if (ReduxWidth > 1)
24454 ReduxWidth = GetVectorFactor(ReduxWidth);
24455 return IsAnyRedOpGathered;
24456 };
24457 bool AnyVectorized = false;
24458 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24459 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24460 ReduxWidth >= ReductionLimit) {
24461 // Dependency in tree of the reduction ops - drop this attempt, try
24462 // later.
24463 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24464 Start == 0) {
24465 CheckForReusedReductionOps = true;
24466 break;
24467 }
24468 PrevReduxWidth = ReduxWidth;
24469 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
24470 // Been analyzed already - skip.
24471 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24472 (!has_single_bit(ReduxWidth) &&
24473 (IgnoredCandidates.contains(
24474 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24475 IgnoredCandidates.contains(
24476 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24477 bit_floor(ReduxWidth))))) ||
24478 V.areAnalyzedReductionVals(VL)) {
24479 (void)AdjustReducedVals(/*IgnoreVL=*/true);
24480 continue;
24481 }
24482 // Early exit if any of the reduction values were deleted during
24483 // previous vectorization attempts.
24484 if (any_of(VL, [&V](Value *RedVal) {
24485 auto *RedValI = dyn_cast<Instruction>(RedVal);
24486 return RedValI && V.isDeleted(RedValI);
24487 }))
24488 break;
24489 V.buildTree(VL, IgnoreList);
24490 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
24491 if (!AdjustReducedVals())
24492 V.analyzedReductionVals(VL);
24493 continue;
24494 }
24495 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24496 if (!AdjustReducedVals())
24497 V.analyzedReductionVals(VL);
24498 continue;
24499 }
24500 V.reorderTopToBottom();
24501 // No need to reorder the root node at all for reassociative reduction.
24502 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
24503 VL.front()->getType()->isIntOrIntVectorTy() ||
24504 ReductionLimit > 2);
24505 // Keep extracted other reduction values, if they are used in the
24506 // vectorization trees.
24507 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
24508 ExternallyUsedValues);
24509 // The reduction root is used as the insertion point for new
24510 // instructions, so set it as externally used to prevent it from being
24511 // deleted.
24512 LocalExternallyUsedValues.insert(ReductionRoot);
24513 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24514 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24515 continue;
24516 for (Value *V : ReducedVals[Cnt])
24517 if (isa<Instruction>(V))
24518 LocalExternallyUsedValues.insert(TrackedVals[V]);
24519 }
24520 if (!IsSupportedHorRdxIdentityOp) {
24521 // Number of uses of the candidates in the vector of values.
24522 assert(SameValuesCounter.empty() &&
24523 "Reused values counter map is not empty");
24524 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24525 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24526 continue;
24527 Value *V = Candidates[Cnt];
24528 Value *OrigV = TrackedToOrig.at(V);
24529 ++SameValuesCounter.try_emplace(OrigV).first->second;
24530 }
24531 }
24532 V.transformNodes();
24533 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
24534 // Gather externally used values.
24535 SmallPtrSet<Value *, 4> Visited;
24536 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24537 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24538 continue;
24539 Value *RdxVal = Candidates[Cnt];
24540 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24541 RdxVal = It->second;
24542 if (!Visited.insert(RdxVal).second)
24543 continue;
24544 // Check if the scalar was vectorized as part of the vectorization
24545 // tree but not the top node.
24546 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24547 LocalExternallyUsedValues.insert(RdxVal);
24548 continue;
24549 }
24550 Value *OrigV = TrackedToOrig.at(RdxVal);
24551 unsigned NumOps =
24552 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24553 if (NumOps != ReducedValsToOps.at(OrigV).size())
24554 LocalExternallyUsedValues.insert(RdxVal);
24555 }
24556 // Do not need the list of reused scalars in regular mode anymore.
24557 if (!IsSupportedHorRdxIdentityOp)
24558 SameValuesCounter.clear();
24559 for (Value *RdxVal : VL)
24560 if (RequiredExtract.contains(RdxVal))
24561 LocalExternallyUsedValues.insert(RdxVal);
24562 V.buildExternalUses(LocalExternallyUsedValues);
24563
24564 V.computeMinimumValueSizes();
24565
24566 // Estimate cost.
24567 InstructionCost ReductionCost =
24568 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24569 InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
24570 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24571 << " for reduction\n");
24572 if (!Cost.isValid())
24573 break;
24574 if (Cost >= -SLPCostThreshold) {
24575 V.getORE()->emit([&]() {
24576 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24577 ReducedValsToOps.at(VL[0]).front())
24578 << "Vectorizing horizontal reduction is possible "
24579 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24580 << " and threshold "
24581 << ore::NV("Threshold", -SLPCostThreshold);
24582 });
24583 if (!AdjustReducedVals()) {
24584 V.analyzedReductionVals(VL);
24585 unsigned Offset = Pos == Start ? Pos : Pos - 1;
24586 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24587 // Add subvectors of VL to the list of the analyzed values.
24588 for (unsigned VF = getFloorFullVectorNumberOfElements(
24589 *TTI, VL.front()->getType(), ReduxWidth - 1);
24590 VF >= ReductionLimit;
24591 VF = getFloorFullVectorNumberOfElements(
24592 *TTI, VL.front()->getType(), VF - 1)) {
24593 if (has_single_bit(VF) &&
24594 V.getCanonicalGraphSize() != V.getTreeSize())
24595 continue;
24596 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
24597 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24598 }
24599 }
24600 }
24601 continue;
24602 }
24603
24604 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24605 << Cost << ". (HorRdx)\n");
24606 V.getORE()->emit([&]() {
24607 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24608 ReducedValsToOps.at(VL[0]).front())
24609 << "Vectorized horizontal reduction with cost "
24610 << ore::NV("Cost", Cost) << " and with tree size "
24611 << ore::NV("TreeSize", V.getTreeSize());
24612 });
24613
24614 Builder.setFastMathFlags(RdxFMF);
24615
24616 // Emit a reduction. If the root is a select (min/max idiom), the insert
24617 // point is the compare condition of that select.
24618 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
24619 Instruction *InsertPt = RdxRootInst;
24620 if (IsCmpSelMinMax)
24621 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24622
24623 // Vectorize a tree.
24624 Value *VectorizedRoot = V.vectorizeTree(
24625 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24626 // Update TrackedToOrig mapping, since the tracked values might be
24627 // updated.
24628 for (Value *RdxVal : Candidates) {
24629 Value *OrigVal = TrackedToOrig.at(RdxVal);
24630 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24631 if (TransformedRdxVal != RdxVal)
24632 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24633 }
24634
24635 Builder.SetInsertPoint(InsertPt);
24636
24637 // To prevent poison from leaking across what used to be sequential,
24638 // safe, scalar boolean logic operations, the reduction operand must be
24639 // frozen.
24640 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
24641 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24642
24643 // Emit code to correctly handle reused reduced values, if required.
24644 if (OptReusedScalars && !SameScaleFactor) {
24645 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24646 SameValuesCounter, TrackedToOrig);
24647 }
24648
24649 Type *ScalarTy = VL.front()->getType();
24650 Type *VecTy = VectorizedRoot->getType();
24651 Type *RedScalarTy = VecTy->getScalarType();
24652 VectorValuesAndScales.emplace_back(
24653 VectorizedRoot,
24654 OptReusedScalars && SameScaleFactor
24655 ? SameValuesCounter.front().second
24656 : 1,
24657 RedScalarTy != ScalarTy->getScalarType()
24658 ? V.isSignedMinBitwidthRootNode()
24659 : true);
24660
24661 // Count vectorized reduced values to exclude them from final reduction.
24662 for (Value *RdxVal : VL) {
24663 Value *OrigV = TrackedToOrig.at(RdxVal);
24664 if (IsSupportedHorRdxIdentityOp) {
24665 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24666 continue;
24667 }
24668 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24669 if (!V.isVectorized(RdxVal))
24670 RequiredExtract.insert(RdxVal);
24671 }
24672 Pos += ReduxWidth;
24673 Start = Pos;
24674 ReduxWidth = NumReducedVals - Pos;
24675 if (ReduxWidth > 1)
24676 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24677 AnyVectorized = true;
24678 }
24679 if (OptReusedScalars && !AnyVectorized) {
24680 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24681 Value *RdxVal = TrackedVals.at(P.first);
24682 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24683 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24684 VectorizedVals.try_emplace(P.first, P.second);
24685 }
24686 continue;
24687 }
24688 }
24689 if (!VectorValuesAndScales.empty())
24690 VectorizedTree = GetNewVectorizedTree(
24691 VectorizedTree,
24692 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24693
24694 if (!VectorizedTree) {
24695 if (!CheckForReusedReductionOps) {
24696 for (ReductionOpsType &RdxOps : ReductionOps)
24697 for (Value *RdxOp : RdxOps)
24698 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
24699 }
24700 return nullptr;
24701 }
24702
24703 // Reorder operands of bool logical op in the natural order to avoid
24704 // possible problem with poison propagation. If not possible to reorder
24705 // (both operands are originally RHS), emit an extra freeze instruction
24706 // for the LHS operand.
24707 // I.e., if we have original code like this:
24708 // RedOp1 = select i1 ?, i1 LHS, i1 false
24709 // RedOp2 = select i1 RHS, i1 ?, i1 false
24710
24711 // Then, we swap LHS/RHS to create a new op that matches the poison
24712 // semantics of the original code.
24713
24714 // If we have original code like this and both values could be poison:
24715 // RedOp1 = select i1 ?, i1 LHS, i1 false
24716 // RedOp2 = select i1 ?, i1 RHS, i1 false
24717
24718 // Then, we must freeze LHS in the new op.
24719 auto FixBoolLogicalOps =
24720 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
24721 Instruction *RedOp2, bool InitStep) {
24722 if (!AnyBoolLogicOp)
24723 return;
24724 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24725 getRdxOperand(RedOp1, 0) == LHS ||
24726 isGuaranteedNotToBePoison(LHS, AC)))
24727 return;
24728 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24729 getRdxOperand(RedOp2, 0) == RHS ||
24730 isGuaranteedNotToBePoison(RHS, AC))) {
24731 std::swap(LHS, RHS);
24732 return;
24733 }
24734 if (LHS != VectorizedTree)
24735 LHS = Builder.CreateFreeze(LHS);
24736 };
24737 // Finish the reduction.
24738 // Need to add the extra arguments and the possible reduction values that were not vectorized.
24739 // Try to avoid dependencies between the scalar remainders after reductions.
24740 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
24741 bool InitStep) {
24742 unsigned Sz = InstVals.size();
24743 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
24744 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24745 Instruction *RedOp = InstVals[I + 1].first;
24746 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
24747 Value *RdxVal1 = InstVals[I].second;
24748 Value *StableRdxVal1 = RdxVal1;
24749 auto It1 = TrackedVals.find(RdxVal1);
24750 if (It1 != TrackedVals.end())
24751 StableRdxVal1 = It1->second;
24752 Value *RdxVal2 = InstVals[I + 1].second;
24753 Value *StableRdxVal2 = RdxVal2;
24754 auto It2 = TrackedVals.find(RdxVal2);
24755 if (It2 != TrackedVals.end())
24756 StableRdxVal2 = It2->second;
24757 // To prevent poison from leaking across what used to be sequential,
24758 // safe, scalar boolean logic operations, the reduction operand must be
24759 // frozen.
24760 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24761 RedOp, InitStep);
24762 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24763 StableRdxVal2, "op.rdx", ReductionOps);
24764 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24765 }
24766 if (Sz % 2 == 1)
24767 ExtraReds[Sz / 2] = InstVals.back();
24768 return ExtraReds;
24769 };
24770 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
24771 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
24772 VectorizedTree);
24773 SmallPtrSet<Value *, 8> Visited;
24774 for (ArrayRef<Value *> Candidates : ReducedVals) {
24775 for (Value *RdxVal : Candidates) {
24776 if (!Visited.insert(RdxVal).second)
24777 continue;
24778 unsigned NumOps = VectorizedVals.lookup(RdxVal);
24779 for (Instruction *RedOp :
24780 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
24781 ExtraReductions.emplace_back(RedOp, RdxVal);
24782 }
24783 }
24784 // Iterate through all not-vectorized reduction values/extra arguments.
24785 bool InitStep = true;
24786 while (ExtraReductions.size() > 1) {
24787 SmallVector<std::pair<Instruction *, Value *>> NewReds =
24788 FinalGen(ExtraReductions, InitStep);
24789 ExtraReductions.swap(NewReds);
24790 InitStep = false;
24791 }
24792 VectorizedTree = ExtraReductions.front().second;
24793
24794 ReductionRoot->replaceAllUsesWith(VectorizedTree);
24795
24796 // The original scalar reduction is expected to have no remaining
24797 // uses outside the reduction tree itself. Assert that we got this
24798 // correct, replace internal uses with poison, and mark for eventual
24799 // deletion.
24800#ifndef NDEBUG
24801 SmallPtrSet<Value *, 4> IgnoreSet;
24802 for (ArrayRef<Value *> RdxOps : ReductionOps)
24803 IgnoreSet.insert_range(RdxOps);
24804#endif
24805 for (ArrayRef<Value *> RdxOps : ReductionOps) {
24806 for (Value *Ignore : RdxOps) {
24807 if (!Ignore)
24808 continue;
24809#ifndef NDEBUG
24810 for (auto *U : Ignore->users()) {
24811 assert(IgnoreSet.count(U) &&
24812 "All users must be either in the reduction ops list.");
24813 }
24814#endif
24815 if (!Ignore->use_empty()) {
24816 Value *P = PoisonValue::get(Ignore->getType());
24817 Ignore->replaceAllUsesWith(P);
24818 }
24819 }
24820 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
24821 }
24822 return VectorizedTree;
24823 }
24824
24825private:
24826 /// Creates the reduction from the given \p Vec vector value with the given
24827 /// scale \p Scale and signedness \p IsSigned.
24828 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
24829 Value *Vec, unsigned Scale, bool IsSigned,
24830 Type *DestTy) {
24831 Value *Rdx;
24832 if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
24833 unsigned DestTyNumElements = getNumElements(VecTy);
24834 unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
24835 Rdx = PoisonValue::get(
24836 getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
24837 for (unsigned I : seq<unsigned>(DestTyNumElements)) {
24838 // Do reduction for each lane.
24839 // e.g., do reduce add for
24840 // VL[0] = <4 x Ty> <a, b, c, d>
24841 // VL[1] = <4 x Ty> <e, f, g, h>
24842 // Lane[0] = <2 x Ty> <a, e>
24843 // Lane[1] = <2 x Ty> <b, f>
24844 // Lane[2] = <2 x Ty> <c, g>
24845 // Lane[3] = <2 x Ty> <d, h>
24846 // result[0] = reduce add Lane[0]
24847 // result[1] = reduce add Lane[1]
24848 // result[2] = reduce add Lane[2]
24849 // result[3] = reduce add Lane[3]
24850 SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
24851 Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
24852 Rdx = Builder.CreateInsertElement(
24853 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
24854 }
24855 } else {
24856 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
24857 }
24858 if (Rdx->getType() != DestTy)
24859 Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
24860 // Improved analysis for add/fadd/xor reductions with same scale
24861 // factor for all operands of reductions. We can emit scalar ops for
24862 // them instead.
24863 if (Scale > 1)
24864 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
24865 return Rdx;
24866 }
24867
24868 /// Calculate the cost of a reduction.
24869 InstructionCost getReductionCost(TargetTransformInfo *TTI,
24870 ArrayRef<Value *> ReducedVals,
24871 bool IsCmpSelMinMax, FastMathFlags FMF,
24872 const BoUpSLP &R, DominatorTree &DT,
24873 const DataLayout &DL,
24874 const TargetLibraryInfo &TLI) {
24875 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
24876 Type *ScalarTy = ReducedVals.front()->getType();
24877 unsigned ReduxWidth = ReducedVals.size();
24878 FixedVectorType *VectorTy = R.getReductionType();
24879 InstructionCost VectorCost = 0, ScalarCost;
24880 // If all of the reduced values are constant, the vector cost is 0, since
24881 // the reduction value can be calculated at the compile time.
24882 bool AllConsts = allConstant(ReducedVals);
24883 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
24884 InstructionCost Cost = 0;
24885 // Scalar cost is repeated for N-1 elements.
24886 int Cnt = ReducedVals.size();
24887 for (Value *RdxVal : ReducedVals) {
24888 if (Cnt == 1)
24889 break;
24890 --Cnt;
24891 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
24892 Cost += GenCostFn();
24893 continue;
24894 }
24895 InstructionCost ScalarCost = 0;
24896 for (User *U : RdxVal->users()) {
24897 auto *RdxOp = cast<Instruction>(U);
24898 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
24899 if (RdxKind == RecurKind::FAdd) {
24900 InstructionCost FMACost = canConvertToFMA(
24901 RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
24902 if (FMACost.isValid()) {
24903 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
24904 if (auto *I = dyn_cast<Instruction>(RdxVal)) {
24905 // Also, exclude scalar fmul cost.
24906 InstructionCost FMulCost =
24907 TTI->getInstructionCost(I, CostKind);
24908 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
24909 FMACost -= FMulCost;
24910 }
24911 ScalarCost += FMACost;
24912 continue;
24913 }
24914 }
24915 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
24916 continue;
24917 }
24918 ScalarCost = InstructionCost::getInvalid();
24919 break;
24920 }
24921 if (ScalarCost.isValid())
24922 Cost += ScalarCost;
24923 else
24924 Cost += GenCostFn();
24925 }
24926 return Cost;
24927 };
24928 // Require reduction cost if:
24929 // 1. This type is not a full register type and no other vector with the
24930 // same type is in the storage yet (first vector with a small type).
24931 // 2. The storage does not yet contain any vector with full register use
24932 // (first vector with full register use).
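    // In other words (illustrative): once a vectorized sub-vector is already
    // stored in VectorValuesAndScales, later sub-vectors are only charged for
    // the element-wise combining operation (plus casts), because the final
    // reduction is emitted once for the combined vector.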
24933 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
24934 switch (RdxKind) {
24935 case RecurKind::Add:
24936 case RecurKind::Mul:
24937 case RecurKind::Or:
24938 case RecurKind::And:
24939 case RecurKind::Xor:
24940 case RecurKind::FAdd:
24941 case RecurKind::FMul: {
24942 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
24943 if (!AllConsts) {
24944 if (DoesRequireReductionOp) {
24945 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
24946 assert(SLPReVec && "FixedVectorType is not expected.");
24947 unsigned ScalarTyNumElements = VecTy->getNumElements();
24948 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
24949 VectorCost += TTI->getShuffleCost(
24950 TTI::SK_PermuteSingleSrc,
24951 getWidenedType(VecTy->getElementType(), ScalarTyNumElements *
24952 ReducedVals.size()),
24953 VectorTy,
24954 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
24955 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
24956 FMF, CostKind);
24957 }
24958 VectorCost += TTI->getScalarizationOverhead(
24959 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
24960 /*Extract*/ false, TTI::TCK_RecipThroughput);
24961 } else {
24962 Type *RedTy = VectorTy->getElementType();
24963 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24964 std::make_pair(RedTy, true));
24965 if (RType == RedTy) {
24966 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
24967 FMF, CostKind);
24968 } else {
24969 VectorCost = TTI->getExtendedReductionCost(
24970 RdxOpcode, !IsSigned, RedTy,
24971 getWidenedType(RType, ReduxWidth), FMF, CostKind);
24972 }
24973 }
24974 } else {
24975 Type *RedTy = VectorTy->getElementType();
24976 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
24977 std::make_pair(RedTy, true));
24978 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
24979 InstructionCost FMACost = InstructionCost::getInvalid();
24980 if (RdxKind == RecurKind::FAdd) {
24981 // Check if the reduction operands can be converted to FMA.
24982 SmallVector<Value *> Ops;
24983 FastMathFlags FMF;
24984 FMF.set();
24985 for (Value *RdxVal : ReducedVals) {
24986 if (!RdxVal->hasOneUse()) {
24987 Ops.clear();
24988 break;
24989 }
24990 if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
24991 FMF &= FPCI->getFastMathFlags();
24992 Ops.push_back(RdxVal->user_back());
24993 }
24994 if (!Ops.empty()) {
24995 FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
24996 *TTI, TLI);
24997 if (FMACost.isValid()) {
24998 // Calculate actual FMAD cost.
24999 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
25000 {RVecTy, RVecTy, RVecTy}, FMF);
25001 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
25002
25003 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
25004 // Also, exclude vector fmul cost.
25005 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
25006 Instruction::FMul, RVecTy, CostKind);
25007 LLVM_DEBUG(dbgs()
25008 << "Minus vector FMul cost: " << FMulCost << "\n");
25009 FMACost -= FMulCost;
25010 }
25011 }
25012 }
25013 if (FMACost.isValid())
25014 VectorCost += FMACost;
25015 else
25016 VectorCost +=
25017 TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
25018 if (RType != RedTy) {
25019 unsigned Opcode = Instruction::Trunc;
25020 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25021 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25022 VectorCost += TTI->getCastInstrCost(
25023 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25024 }
25025 }
25026 }
25027 ScalarCost = EvaluateScalarCost([&]() {
25028 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
25029 });
25030 break;
25031 }
25032 case RecurKind::FMax:
25033 case RecurKind::FMin:
25034 case RecurKind::FMaximum:
25035 case RecurKind::FMinimum:
25036 case RecurKind::SMax:
25037 case RecurKind::SMin:
25038 case RecurKind::UMax:
25039 case RecurKind::UMin: {
25040 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
25041 if (!AllConsts) {
25042 if (DoesRequireReductionOp) {
25043 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
25044 } else {
25045 // Check if a previous reduction already exists and account for it as a
25046 // series of operations plus a single reduction.
25047 Type *RedTy = VectorTy->getElementType();
25048 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25049 std::make_pair(RedTy, true));
25050 VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
25051 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
25052 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
25053 if (RType != RedTy) {
25054 unsigned Opcode = Instruction::Trunc;
25055 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
25056 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25057 VectorCost += TTI->getCastInstrCost(
25058 Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
25059 }
25060 }
25061 }
25062 ScalarCost = EvaluateScalarCost([&]() {
25063 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
25064 return TTI->getIntrinsicInstrCost(ICA, CostKind);
25065 });
25066 break;
25067 }
25068 default:
25069 llvm_unreachable("Expected arithmetic or min/max reduction operation");
25070 }
25071
25072 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
25073 << " for reduction of " << shortBundleName(ReducedVals)
25074 << " (It is a splitting reduction)\n");
25075 return VectorCost - ScalarCost;
25076 }
25077
25078 /// Splits the values, stored in VectorValuesAndScales, into registers/free
25079 /// sub-registers, combines them with the given reduction operation as a
25080 /// vector operation and then performs a single (small enough) reduction.
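/// For example (an illustrative sketch): two partial <4 x i32> values V0 and
/// V1 with scale 1 are first combined as "add <4 x i32> V0, V1" and only the
/// combined vector is fed into a single @llvm.vector.reduce.add call.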
25081 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25082 Type *DestTy) {
25083 Value *ReducedSubTree = nullptr;
25084 // Creates reduction and combines with the previous reduction.
25085 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
25086 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
25087 if (ReducedSubTree)
25088 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
25089 "op.rdx", ReductionOps);
25090 else
25091 ReducedSubTree = Rdx;
25092 };
25093 if (VectorValuesAndScales.size() == 1) {
25094 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
25095 CreateSingleOp(Vec, Scale, IsSigned);
25096 return ReducedSubTree;
25097 }
25098 // Scales Vec using given Cnt scale factor and then performs vector combine
25099 // with previous value of VecOp.
25100 Value *VecRes = nullptr;
25101 bool VecResSignedness = false;
25102 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
25103 Type *ScalarTy = Vec->getType()->getScalarType();
25104 // Scale Vec using given Cnt scale factor.
25105 if (Cnt > 1) {
25106 ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
25107 switch (RdxKind) {
25108 case RecurKind::Add: {
25109 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
25110 unsigned VF = getNumElements(Vec->getType());
25111 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
25112 << ". (HorRdx)\n");
25113 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
25114 for (unsigned I : seq<unsigned>(Cnt))
25115 std::iota(std::next(Mask.begin(), VF * I),
25116 std::next(Mask.begin(), VF * (I + 1)), 0);
25117 ++NumVectorInstructions;
25118 Vec = Builder.CreateShuffleVector(Vec, Mask);
25119 break;
25120 }
25121 // res = mul vv, n
25122 if (ScalarTy != DestTy->getScalarType())
25123 Vec = Builder.CreateIntCast(
25124 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25125 IsSigned);
25126 Value *Scale = ConstantVector::getSplat(
25127 EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
25128 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
25129 << ". (HorRdx)\n");
25130 ++NumVectorInstructions;
25131 Vec = Builder.CreateMul(Vec, Scale);
25132 break;
25133 }
25134 case RecurKind::Xor: {
25135 // res = n % 2 ? 0 : vv
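// (x xor x) == 0, so an even repeat count cancels to zero and an odd one
// leaves a single copy of the value.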
25137 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
25138 if (Cnt % 2 == 0)
25139 Vec = Constant::getNullValue(Vec->getType());
25140 break;
25141 }
25142 case RecurKind::FAdd: {
25143 // res = fmul v, n
25144 Value *Scale =
25145 ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
25146 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
25147 << ". (HorRdx)\n");
25148 ++NumVectorInstructions;
25149 Vec = Builder.CreateFMul(Vec, Scale);
25150 break;
25151 }
25152 case RecurKind::And:
25153 case RecurKind::Or:
25154 case RecurKind::SMax:
25155 case RecurKind::SMin:
25156 case RecurKind::UMax:
25157 case RecurKind::UMin:
25158 case RecurKind::FMax:
25159 case RecurKind::FMin:
25160 case RecurKind::FMaximum:
25161 case RecurKind::FMinimum:
25162 // res = vv
25163 break;
25164 case RecurKind::Sub:
25165 case RecurKind::AddChainWithSubs:
25166 case RecurKind::Mul:
25167 case RecurKind::FMul:
25168 case RecurKind::FMulAdd:
25169 case RecurKind::AnyOf:
25170 case RecurKind::FindFirstIVSMin:
25171 case RecurKind::FindFirstIVUMin:
25172 case RecurKind::FindLastIVSMax:
25173 case RecurKind::FindLastIVUMax:
25174 case RecurKind::FMaxNum:
25175 case RecurKind::FMinNum:
25176 case RecurKind::FMaximumNum:
25177 case RecurKind::FMinimumNum:
25178 case RecurKind::None:
25179 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25180 }
25181 }
25182 // Combine Vec with the previous VecOp.
25183 if (!VecRes) {
25184 VecRes = Vec;
25185 VecResSignedness = IsSigned;
25186 } else {
25187 ++NumVectorInstructions;
25188 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
25189 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
25190 // Handle ctpop.
25191 unsigned VecResVF = getNumElements(VecRes->getType());
25192 unsigned VecVF = getNumElements(Vec->getType());
25193 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
25194 std::iota(Mask.begin(), Mask.end(), 0);
25195 // Ensure that VecRes is always larger than Vec
25196 if (VecResVF < VecVF) {
25197 std::swap(VecRes, Vec);
25198 std::swap(VecResVF, VecVF);
25199 }
25200 if (VecResVF != VecVF) {
25201 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
25202 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
25203 Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
25204 }
25205 VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
25206 return;
25207 }
25208 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
25209 VecRes = Builder.CreateIntCast(
25210 VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
25211 VecResSignedness);
25212 if (ScalarTy != DestTy->getScalarType())
25213 Vec = Builder.CreateIntCast(
25214 Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
25215 IsSigned);
25216 unsigned VecResVF = getNumElements(VecRes->getType());
25217 unsigned VecVF = getNumElements(Vec->getType());
25218 // Ensure that VecRes is always larger than Vec
25219 if (VecResVF < VecVF) {
25220 std::swap(VecRes, Vec);
25221 std::swap(VecResVF, VecVF);
25222 }
25223 // extract + op + insert
25224 Value *Op = VecRes;
25225 if (VecResVF != VecVF)
25226 Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
25227 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
25228 if (VecResVF != VecVF)
25229 Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
25230 VecRes = Op;
25231 }
25232 };
25233 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
25234 CreateVecOp(Vec, Scale, IsSigned);
25235 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
25236
25237 return ReducedSubTree;
25238 }
25239
25240 /// Emit a horizontal reduction of the vectorized value.
25241 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
25242 const TargetTransformInfo *TTI, Type *DestTy) {
25243 assert(VectorizedValue && "Need to have a vectorized tree node");
25244 assert(RdxKind != RecurKind::FMulAdd &&
25245 "A call to the llvm.fmuladd intrinsic is not handled yet");
25246
25247 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
25248 if (FTy->getScalarType() == Builder.getInt1Ty() &&
25249 RdxKind == RecurKind::Add &&
25250 DestTy->getScalarType() != FTy->getScalarType()) {
25251 // Convert vector_reduce_add(ZExt(<n x i1>)) to
25252 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
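// Illustrative example (not from the source): reducing <8 x i1> %v by add into
// an i32 destination becomes
//   %int = bitcast <8 x i1> %v to i8
//   %cnt = call i8 @llvm.ctpop.i8(i8 %int)
// with the zext/trunc to the destination type expected to be applied by the
// caller of this helper.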
25253 Value *V = Builder.CreateBitCast(
25254 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
25255 ++NumVectorInstructions;
25256 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
25257 }
25258 ++NumVectorInstructions;
25259 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
25260 }
25261
25262 /// Emits optimized code for unique scalar value reused \p Cnt times.
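/// For example (illustrative, not from the source): if the reduced value is
/// reused Cnt = 3 times in an integer add reduction, the repeated uses
/// contribute 3 * vv, so a single 'mul vv, 3' replaces the extra adds; for xor,
/// an even Cnt cancels to zero and an odd Cnt leaves the value unchanged.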
25263 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
25264 unsigned Cnt) {
25265 assert(IsSupportedHorRdxIdentityOp &&
25266 "The optimization of matched scalar identity horizontal reductions "
25267 "must be supported.");
25268 if (Cnt == 1)
25269 return VectorizedValue;
25270 switch (RdxKind) {
25271 case RecurKind::Add: {
25272 // res = mul vv, n
25273 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
25274 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
25275 << VectorizedValue << ". (HorRdx)\n");
25276 return Builder.CreateMul(VectorizedValue, Scale);
25277 }
25278 case RecurKind::Xor: {
25279 // res = n % 2 ? 0 : vv
25280 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
25281 << ". (HorRdx)\n");
25282 if (Cnt % 2 == 0)
25283 return Constant::getNullValue(VectorizedValue->getType());
25284 return VectorizedValue;
25285 }
25286 case RecurKind::FAdd: {
25287 // res = fmul v, n
25288 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
25289 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
25290 << VectorizedValue << ". (HorRdx)\n");
25291 return Builder.CreateFMul(VectorizedValue, Scale);
25292 }
25293 case RecurKind::And:
25294 case RecurKind::Or:
25295 case RecurKind::SMax:
25296 case RecurKind::SMin:
25297 case RecurKind::UMax:
25298 case RecurKind::UMin:
25299 case RecurKind::FMax:
25300 case RecurKind::FMin:
25301 case RecurKind::FMaximum:
25302 case RecurKind::FMinimum:
25303 // res = vv
25304 return VectorizedValue;
25305 case RecurKind::Sub:
25306 case RecurKind::AddChainWithSubs:
25307 case RecurKind::Mul:
25308 case RecurKind::FMul:
25309 case RecurKind::FMulAdd:
25310 case RecurKind::AnyOf:
25311 case RecurKind::FindFirstIVSMin:
25312 case RecurKind::FindFirstIVUMin:
25313 case RecurKind::FindLastIVSMax:
25314 case RecurKind::FindLastIVUMax:
25315 case RecurKind::FMaxNum:
25316 case RecurKind::FMinNum:
25317 case RecurKind::FMaximumNum:
25318 case RecurKind::FMinimumNum:
25319 case RecurKind::None:
25320 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
25321 }
25322 return nullptr;
25323 }
25324
25325 /// Emits actual operation for the scalar identity values, found during
25326 /// horizontal reduction analysis.
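/// For example (illustrative, not from the source): with root scalars [%a, %b]
/// and SameValuesCounter recording %a -> 3, %b -> 1 for an add reduction, the
/// vectorized root is multiplied by <3, 1>; for xor, lanes whose value repeats
/// an even number of times are zeroed via a shuffle with zeroinitializer.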
25327 Value *
25328 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25329 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25330 const DenseMap<Value *, Value *> &TrackedToOrig) {
25331 assert(IsSupportedHorRdxIdentityOp &&
25332 "The optimization of matched scalar identity horizontal reductions "
25333 "must be supported.");
25334 ArrayRef<Value *> VL = R.getRootNodeScalars();
25335 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
25336 if (VTy->getElementType() != VL.front()->getType()) {
25337 VectorizedValue = Builder.CreateIntCast(
25338 VectorizedValue,
25339 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
25340 R.isSignedMinBitwidthRootNode());
25341 }
25342 switch (RdxKind) {
25343 case RecurKind::Add: {
25344 // root = mul prev_root, <1, 1, n, 1>
25345 SmallVector<Constant *> Vals;
25346 for (Value *V : VL) {
25347 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25348 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
25349 }
25350 auto *Scale = ConstantVector::get(Vals);
25351 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
25352 << VectorizedValue << ". (HorRdx)\n");
25353 return Builder.CreateMul(VectorizedValue, Scale);
25354 }
25355 case RecurKind::And:
25356 case RecurKind::Or:
25357 // No need for multiple or/and(s).
25358 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
25359 << ". (HorRdx)\n");
25360 return VectorizedValue;
25361 case RecurKind::SMax:
25362 case RecurKind::SMin:
25363 case RecurKind::UMax:
25364 case RecurKind::UMin:
25365 case RecurKind::FMax:
25366 case RecurKind::FMin:
25367 case RecurKind::FMaximum:
25368 case RecurKind::FMinimum:
25369 // No need for multiple min/max(s) of the same value.
25370 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
25371 << ". (HorRdx)\n");
25372 return VectorizedValue;
25373 case RecurKind::Xor: {
25374 // Replace values with even number of repeats with 0, since
25375 // x xor x = 0.
25376 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
25377 // 7>, if the 4th and 6th elements have an even number of repeats.
25378 SmallVector<int> Mask(
25379 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
25380 PoisonMaskElem);
25381 std::iota(Mask.begin(), Mask.end(), 0);
25382 bool NeedShuffle = false;
25383 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25384 Value *V = VL[I];
25385 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25386 if (Cnt % 2 == 0) {
25387 Mask[I] = VF;
25388 NeedShuffle = true;
25389 }
25390 }
25391 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
25392 : Mask) dbgs()
25393 << I << " ";
25394 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25395 if (NeedShuffle)
25396 VectorizedValue = Builder.CreateShuffleVector(
25397 VectorizedValue,
25398 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25399 return VectorizedValue;
25400 }
25401 case RecurKind::FAdd: {
25402 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
25403 SmallVector<Constant *> Vals;
25404 for (Value *V : VL) {
25405 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25406 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25407 }
25408 auto *Scale = ConstantVector::get(Vals);
25409 return Builder.CreateFMul(VectorizedValue, Scale);
25410 }
25411 case RecurKind::Sub:
25412 case RecurKind::AddChainWithSubs:
25413 case RecurKind::Mul:
25414 case RecurKind::FMul:
25415 case RecurKind::FMulAdd:
25416 case RecurKind::AnyOf:
25417 case RecurKind::FindFirstIVSMin:
25418 case RecurKind::FindFirstIVUMin:
25419 case RecurKind::FindLastIVSMax:
25420 case RecurKind::FindLastIVUMax:
25421 case RecurKind::FMaxNum:
25422 case RecurKind::FMinNum:
25423 case RecurKind::FMaximumNum:
25424 case RecurKind::FMinimumNum:
25425 case RecurKind::None:
25426 llvm_unreachable("Unexpected reduction kind for reused scalars.");
25427 }
25428 return nullptr;
25429 }
25430};
25431} // end anonymous namespace
25432
25433/// Gets recurrence kind from the specified value.
25434 static RecurKind getRdxKind(Value *V) {
25435 return HorizontalReduction::getRdxKind(V);
25436}
25437static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
25438 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
25439 return cast<FixedVectorType>(IE->getType())->getNumElements();
25440
25441 unsigned AggregateSize = 1;
25442 auto *IV = cast<InsertValueInst>(InsertInst);
25443 Type *CurrentType = IV->getType();
25444 do {
25445 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
25446 for (auto *Elt : ST->elements())
25447 if (Elt != ST->getElementType(0)) // check homogeneity
25448 return std::nullopt;
25449 AggregateSize *= ST->getNumElements();
25450 CurrentType = ST->getElementType(0);
25451 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
25452 AggregateSize *= AT->getNumElements();
25453 CurrentType = AT->getElementType();
25454 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
25455 AggregateSize *= VT->getNumElements();
25456 return AggregateSize;
25457 } else if (CurrentType->isSingleValueType()) {
25458 return AggregateSize;
25459 } else {
25460 return std::nullopt;
25461 }
25462 } while (true);
25463}
25464
25465static void findBuildAggregateRec(Instruction *LastInsertInst,
25466 TargetTransformInfo *TTI,
25467 SmallVectorImpl<Value *> &BuildVectorOpds,
25468 SmallVectorImpl<Value *> &InsertElts,
25469 unsigned OperandOffset, const BoUpSLP &R) {
25470 do {
25471 Value *InsertedOperand = LastInsertInst->getOperand(1);
25472 std::optional<unsigned> OperandIndex =
25473 getElementIndex(LastInsertInst, OperandOffset);
25474 if (!OperandIndex || R.isDeleted(LastInsertInst))
25475 return;
25476 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
25477 findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
25478 BuildVectorOpds, InsertElts, *OperandIndex, R);
25479
25480 } else {
25481 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25482 InsertElts[*OperandIndex] = LastInsertInst;
25483 }
25484 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
25485 } while (LastInsertInst != nullptr &&
25486 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
25487 LastInsertInst->hasOneUse());
25488}
25489
25490/// Recognize construction of vectors like
25491/// %ra = insertelement <4 x float> poison, float %s0, i32 0
25492/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
25493/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
25494/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
25495/// starting from the last insertelement or insertvalue instruction.
25496///
25497/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
25498/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
25499/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
25500///
25501/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
25502///
25503/// \return true if it matches.
25504static bool findBuildAggregate(Instruction *LastInsertInst,
25505 TargetTransformInfo *TTI,
25506 SmallVectorImpl<Value *> &BuildVectorOpds,
25507 SmallVectorImpl<Value *> &InsertElts,
25508 const BoUpSLP &R) {
25509
25510 assert((isa<InsertElementInst>(LastInsertInst) ||
25511 isa<InsertValueInst>(LastInsertInst)) &&
25512 "Expected insertelement or insertvalue instruction!");
25513
25514 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
25515 "Expected empty result vectors!");
25516
25517 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
25518 if (!AggregateSize)
25519 return false;
25520 BuildVectorOpds.resize(*AggregateSize);
25521 InsertElts.resize(*AggregateSize);
25522
25523 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
25524 llvm::erase(BuildVectorOpds, nullptr);
25525 llvm::erase(InsertElts, nullptr);
25526 if (BuildVectorOpds.size() >= 2)
25527 return true;
25528
25529 return false;
25530}
25531
25532/// Try and get a reduction instruction from a phi node.
25533///
25534/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
25535/// if they come from either \p ParentBB or a containing loop latch.
25536///
25537/// \returns A candidate reduction value if possible, or \code nullptr \endcode
25538/// if not possible.
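/// Illustrative example (not from the source): for
///   %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
///   %sum.next = add i32 %sum, %x
/// called with \p P = %sum and \p ParentBB = %loop, the incoming value
/// %sum.next comes from \p ParentBB and its block is dominated by the phi's
/// block, so it is returned as the reduction candidate.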
25539 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
25540 BasicBlock *ParentBB, LoopInfo *LI) {
25541 // There are situations where the reduction value is not dominated by the
25542 // reduction phi. Vectorizing such cases has been reported to cause
25543 // miscompiles. See PR25787.
25544 auto DominatedReduxValue = [&](Value *R) {
25545 return isa<Instruction>(R) &&
25546 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
25547 };
25548
25549 Instruction *Rdx = nullptr;
25550
25551 // Return the incoming value if it comes from the same BB as the phi node.
25552 if (P->getIncomingBlock(0) == ParentBB) {
25553 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25554 } else if (P->getIncomingBlock(1) == ParentBB) {
25555 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25556 }
25557
25558 if (Rdx && DominatedReduxValue(Rdx))
25559 return Rdx;
25560
25561 // Otherwise, check whether we have a loop latch to look at.
25562 Loop *BBL = LI->getLoopFor(ParentBB);
25563 if (!BBL)
25564 return nullptr;
25565 BasicBlock *BBLatch = BBL->getLoopLatch();
25566 if (!BBLatch)
25567 return nullptr;
25568
25569 // There is a loop latch, return the incoming value if it comes from
25570 // that. This reduction pattern occasionally turns up.
25571 if (P->getIncomingBlock(0) == BBLatch) {
25572 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
25573 } else if (P->getIncomingBlock(1) == BBLatch) {
25574 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
25575 }
25576
25577 if (Rdx && DominatedReduxValue(Rdx))
25578 return Rdx;
25579
25580 return nullptr;
25581}
25582
25583static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
25584 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
25585 return true;
25586 if (match(I, m_FMaxNum(m_Value(V0), m_Value(V1))))
25587 return true;
25588 if (match(I, m_FMinNum(m_Value(V0), m_Value(V1))))
25589 return true;
25590 if (match(I, m_FMaximum(m_Value(V0), m_Value(V1))))
25591 return true;
25592 if (match(I, m_FMinimum(m_Value(V0), m_Value(V1))))
25593 return true;
25594 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
25595 return true;
25596 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
25597 return true;
25598 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
25599 return true;
25600 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
25601 return true;
25602 return false;
25603}
25604
25605/// We could have an initial reduction that is not an add.
25606/// r *= v1 + v2 + v3 + v4
25607/// In such a case start looking for a tree rooted in the first '+'.
25608 /// \returns the new root if found, which may be nullptr if not an instruction.
25609 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
25610 Instruction *Root) {
25611 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
25612 isa<IntrinsicInst>(Root)) &&
25613 "Expected binop, select, or intrinsic for reduction matching");
25614 Value *LHS =
25615 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25616 Value *RHS =
25617 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25618 if (LHS == Phi)
25619 return dyn_cast<Instruction>(RHS);
25620 if (RHS == Phi)
25621 return dyn_cast<Instruction>(LHS);
25622 return nullptr;
25623}
25624
25625 /// \returns the first operand of \p I that does not match \p Phi. If the
25626 /// operand is not an instruction, returns nullptr.
25627 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
25628 Value *Op0 = nullptr;
25629 Value *Op1 = nullptr;
25630 if (!matchRdxBop(I, Op0, Op1))
25631 return nullptr;
25632 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
25633}
25634
25635 /// \returns true if \p I is a candidate instruction for reduction vectorization.
25636 static bool isReductionCandidate(Instruction *I) {
25637 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
25638 Value *B0 = nullptr, *B1 = nullptr;
25639 bool IsBinop = matchRdxBop(I, B0, B1);
25640 return IsBinop || IsSelect;
25641}
25642
25643bool SLPVectorizerPass::vectorizeHorReduction(
25644 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25645 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25646 if (!ShouldVectorizeHor)
25647 return false;
25648 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
25649
25650 if (Root->getParent() != BB || isa<PHINode>(Root))
25651 return false;
25652
25653 // If we can find a secondary reduction root, use that instead.
25654 auto SelectRoot = [&]() {
25655 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
25656 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25657 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
25658 return NewRoot;
25659 return Root;
25660 };
25661
25662 // Start analysis starting from Root instruction. If horizontal reduction is
25663 // found, try to vectorize it. If it is not a horizontal reduction or
25664 // vectorization is not possible or not effective, and currently analyzed
25665 // instruction is a binary operation, try to vectorize the operands, using
25666 // pre-order DFS traversal order. If the operands were not vectorized, repeat
25667 // the same procedure considering each operand as a possible root of the
25668 // horizontal reduction.
25669 // Interrupt the process if the Root instruction itself was vectorized or all
25670 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
25671 // If a horizontal reduction was not matched or vectorized, we collect
25672 // instructions for possible later attempts for vectorization.
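// Illustrative walk (not from the source): for a seed %r = add i32 %a, %b that
// does not match a profitable reduction, %r is recorded in PostponedInsts
// (unless it is a cmp or an insertelement/insertvalue), and %a/%b, if they are
// instructions in this block, are queued with Level + 1 to be retried as
// reduction roots.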
25673 std::queue<std::pair<Instruction *, unsigned>> Stack;
25674 Stack.emplace(SelectRoot(), 0);
25675 SmallPtrSet<Value *, 8> VisitedInstrs;
25676 bool Res = false;
25677 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
25678 if (R.isAnalyzedReductionRoot(Inst))
25679 return nullptr;
25680 if (!isReductionCandidate(Inst))
25681 return nullptr;
25682 HorizontalReduction HorRdx;
25683 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25684 return nullptr;
25685 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25686 };
25687 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25688 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25689 FutureSeed = getNonPhiOperand(Root, P);
25690 if (!FutureSeed)
25691 return false;
25692 }
25693 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
25694 // analysis is done separately.
25695 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
25696 PostponedInsts.push_back(FutureSeed);
25697 return true;
25698 };
25699
25700 while (!Stack.empty()) {
25701 Instruction *Inst;
25702 unsigned Level;
25703 std::tie(Inst, Level) = Stack.front();
25704 Stack.pop();
25705 // Do not try to analyze instruction that has already been vectorized.
25706 // This may happen when we vectorize instruction operands on a previous
25707 // iteration while stack was populated before that happened.
25708 if (R.isDeleted(Inst))
25709 continue;
25710 if (Value *VectorizedV = TryToReduce(Inst)) {
25711 Res = true;
25712 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
25713 // Try to find another reduction.
25714 Stack.emplace(I, Level);
25715 continue;
25716 }
25717 if (R.isDeleted(Inst))
25718 continue;
25719 } else {
25720 // We could not vectorize `Inst` so try to use it as a future seed.
25721 if (!TryAppendToPostponedInsts(Inst)) {
25722 assert(Stack.empty() && "Expected empty stack");
25723 break;
25724 }
25725 }
25726
25727 // Try to vectorize operands.
25728 // Continue analysis for the instruction from the same basic block only to
25729 // save compile time.
25730 if (++Level < RecursionMaxDepth)
25731 for (auto *Op : Inst->operand_values())
25732 if (VisitedInstrs.insert(Op).second)
25733 if (auto *I = dyn_cast<Instruction>(Op))
25734 // Do not try to vectorize CmpInst operands, this is done
25735 // separately.
25736 if (!isa<PHINode>(I) && !isa<CmpInst>(I) &&
25737 !R.isDeleted(I) && I->getParent() == BB)
25738 Stack.emplace(I, Level);
25739 }
25740 return Res;
25741}
25742
25743bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25744 if (!I)
25745 return false;
25746
25747 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
25748 return false;
25749 // Skip potential FMA candidates.
25750 if ((I->getOpcode() == Instruction::FAdd ||
25751 I->getOpcode() == Instruction::FSub) &&
25752 canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
25753 .isValid())
25754 return false;
25755
25756 Value *P = I->getParent();
25757
25758 // Vectorize in current basic block only.
25759 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
25760 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
25761 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25762 R.isDeleted(Op0) || R.isDeleted(Op1))
25763 return false;
25764
25765 // First collect all possible candidates
25766 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
25767 Candidates.emplace_back(Op0, Op1);
25768
25769 auto *A = dyn_cast<BinaryOperator>(Op0);
25770 auto *B = dyn_cast<BinaryOperator>(Op1);
25771 // Try to skip B.
25772 if (A && B && B->hasOneUse()) {
25773 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
25774 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
25775 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25776 Candidates.emplace_back(A, B0);
25777 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25778 Candidates.emplace_back(A, B1);
25779 }
25780 // Try to skip A.
25781 if (B && A && A->hasOneUse()) {
25782 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
25783 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
25784 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25785 Candidates.emplace_back(A0, B);
25786 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
25787 Candidates.emplace_back(A1, B);
25788 }
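// Illustrative example (not from the source): for
//   %i   = add i32 %op0, %op1
//   %op0 = mul i32 %a0, %a1
//   %op1 = add i32 %b0, %b1
// the candidates start with (%op0, %op1); if %op1 has a single use and %b0/%b1
// are binary operators in the same block, (%op0, %b0) and (%op0, %b1) are added
// as well (and symmetrically when skipping %op0). The best pair is then picked
// via findBestRootPair below.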
25789
25790 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
25791 ArrayRef<Value *> Ops) {
25792 if (!isReductionCandidate(Inst))
25793 return false;
25794 Type *Ty = Inst->getType();
25795 if (!isValidElementType(Ty) || Ty->isPointerTy())
25796 return false;
25797 HorizontalReduction HorRdx(Inst, Ops);
25798 if (!HorRdx.matchReductionForOperands())
25799 return false;
25800 // Check the cost of operations.
25801 VectorType *VecTy = getWidenedType(Ty, Ops.size());
25802 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
25803 InstructionCost ScalarCost =
25804 TTI.getScalarizationOverhead(
25805 VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
25806 /*Extract=*/true, CostKind) +
25807 TTI.getInstructionCost(Inst, CostKind);
25808 InstructionCost RedCost;
25809 switch (::getRdxKind(Inst)) {
25810 case RecurKind::Add:
25811 case RecurKind::Mul:
25812 case RecurKind::Or:
25813 case RecurKind::And:
25814 case RecurKind::Xor:
25815 case RecurKind::FAdd:
25816 case RecurKind::FMul: {
25817 FastMathFlags FMF;
25818 if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
25819 FMF = FPCI->getFastMathFlags();
25820 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
25821 CostKind);
25822 break;
25823 }
25824 default:
25825 return false;
25826 }
25827 if (RedCost >= ScalarCost)
25828 return false;
25829
25830 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
25831 };
25832 if (Candidates.size() == 1)
25833 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
25834
25835 // We have multiple options. Try to pick the single best.
25836 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
25837 if (!BestCandidate)
25838 return false;
25839 return (*BestCandidate == 0 &&
25840 TryToReduce(I, {Candidates[*BestCandidate].first,
25841 Candidates[*BestCandidate].second})) ||
25842 tryToVectorizeList({Candidates[*BestCandidate].first,
25843 Candidates[*BestCandidate].second},
25844 R);
25845}
25846
25847bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
25848 BasicBlock *BB, BoUpSLP &R) {
25849 SmallVector<WeakTrackingVH> PostponedInsts;
25850 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
25851 Res |= tryToVectorize(PostponedInsts, R);
25852 return Res;
25853}
25854
25855bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
25856 BoUpSLP &R) {
25857 bool Res = false;
25858 for (Value *V : Insts)
25859 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
25860 Res |= tryToVectorize(Inst, R);
25861 return Res;
25862}
25863
25864bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
25865 BasicBlock *BB, BoUpSLP &R,
25866 bool MaxVFOnly) {
25867 if (!R.canMapToVector(IVI->getType()))
25868 return false;
25869
25870 SmallVector<Value *, 16> BuildVectorOpds;
25871 SmallVector<Value *, 16> BuildVectorInsts;
25872 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
25873 return false;
25874
25875 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
25876 R.getORE()->emit([&]() {
25877 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
25878 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
25879 "trying reduction first.";
25880 });
25881 return false;
25882 }
25883 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
25884 // Aggregate value is unlikely to be processed in vector register.
25885 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
25886}
25887
25888bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
25889 BasicBlock *BB, BoUpSLP &R,
25890 bool MaxVFOnly) {
25891 SmallVector<Value *, 16> BuildVectorInsts;
25892 SmallVector<Value *, 16> BuildVectorOpds;
25893 SmallVector<int> Mask;
25894 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
25895 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
25896 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
25897 return false;
25898
25899 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
25900 R.getORE()->emit([&]() {
25901 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
25902 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
25903 "trying reduction first.";
25904 });
25905 return false;
25906 }
25907 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
25908 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
25909}
25910
25911template <typename T>
25912 static bool tryToVectorizeSequence(
25913 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
25914 function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
25915 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
25916 bool MaxVFOnly, BoUpSLP &R) {
25917 bool Changed = false;
25918 // Sort by type, parent, operands.
25919 stable_sort(Incoming, Comparator);
25920
25921 // Try to vectorize elements based on their type.
25922 SmallVector<T *> Candidates;
25923 SmallVector<T *> VL;
25924 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
25925 VL.clear()) {
25926 // Look for the next elements with the same type, parent and operand
25927 // kinds.
25928 auto *I = dyn_cast<Instruction>(*IncIt);
25929 if (!I || R.isDeleted(I)) {
25930 ++IncIt;
25931 continue;
25932 }
25933 auto *SameTypeIt = IncIt;
25934 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
25935 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
25936 AreCompatible(VL, *SameTypeIt))) {
25937 auto *I = dyn_cast<Instruction>(*SameTypeIt);
25938 ++SameTypeIt;
25939 if (I && !R.isDeleted(I))
25940 VL.push_back(cast<T>(I));
25941 }
25942
25943 // Try to vectorize them.
25944 unsigned NumElts = VL.size();
25945 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
25946 << NumElts << ")\n");
25947 // The vectorization is a 3-state attempt:
25948 // 1. Try to vectorize instructions with the same/alternate opcodes with the
25949 // size of maximal register at first.
25950 // 2. Try to vectorize remaining instructions with the same type, if
25951 // possible. This may produce better vectorization results than trying to
25952 // vectorize only instructions with the same/alternate opcodes.
25953 // 3. Final attempt to try to vectorize all instructions with the
25954 // same/alternate ops only, this may result in some extra final
25955 // vectorization.
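// Illustrative note (not from the source): a small group that fails the
// MaxVF-only attempt in step 1 may be accumulated in Candidates (as long as its
// instructions stay alive and share a type) and is re-attempted below with
// MaxVFOnly=false, possibly split into smaller same-type runs.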
25956 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
25957 // Success; start over because instructions might have been changed.
25958 Changed = true;
25959 VL.swap(Candidates);
25960 Candidates.clear();
25961 for (T *V : VL) {
25962 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25963 Candidates.push_back(V);
25964 }
25965 } else {
25966 /// \Returns the minimum number of elements that we will attempt to
25967 /// vectorize.
25968 auto GetMinNumElements = [&R](Value *V) {
25969 unsigned EltSize = R.getVectorElementSize(V);
25970 return std::max(2U, R.getMaxVecRegSize() / EltSize);
25971 };
25972 if (NumElts < GetMinNumElements(*IncIt) &&
25973 (Candidates.empty() ||
25974 Candidates.front()->getType() == (*IncIt)->getType())) {
25975 for (T *V : VL) {
25976 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
25977 Candidates.push_back(V);
25978 }
25979 }
25980 }
25981 // Final attempt to vectorize instructions with the same types.
25982 if (Candidates.size() > 1 &&
25983 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
25984 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
25985 // Success; start over because instructions might have been changed.
25986 Changed = true;
25987 } else if (MaxVFOnly) {
25988 // Try to vectorize using small vectors.
25989 SmallVector<T *> VL;
25990 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
25991 VL.clear()) {
25992 auto *I = dyn_cast<Instruction>(*It);
25993 if (!I || R.isDeleted(I)) {
25994 ++It;
25995 continue;
25996 }
25997 auto *SameTypeIt = It;
25998 while (SameTypeIt != End &&
25999 (!isa<Instruction>(*SameTypeIt) ||
26000 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
26001 AreCompatible(*SameTypeIt, *It))) {
26002 auto *I = dyn_cast<Instruction>(*SameTypeIt);
26003 ++SameTypeIt;
26004 if (I && !R.isDeleted(I))
26005 VL.push_back(cast<T>(I));
26006 }
26007 unsigned NumElts = VL.size();
26008 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
26009 /*MaxVFOnly=*/false))
26010 Changed = true;
26011 It = SameTypeIt;
26012 }
26013 }
26014 Candidates.clear();
26015 }
26016
26017 // Start over at the next instruction of a different type (or the end).
26018 IncIt = SameTypeIt;
26019 }
26020 return Changed;
26021}
26022
26023/// Compare two cmp instructions. If IsCompatibility is true, function returns
26024 /// true if the two cmps have the same/swapped predicates and compatible corresponding
26025/// operands. If IsCompatibility is false, function implements strict weak
26026/// ordering relation between two cmp instructions, returning true if the first
26027/// instruction is "less" than the second, i.e. its predicate is less than the
26028 /// predicate of the second, or its operand IDs are less than the operand IDs
26029/// of the second cmp instruction.
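/// Illustrative example (not from the source): %c1 = icmp slt i32 %x, %y and
/// %c2 = icmp sgt i32 %y, %x share the same base predicate after swapping, and
/// their (swapped) operands match, so compareCmp<true> treats them as
/// compatible, while compareCmp<false> considers neither one "less" than the
/// other.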
26030template <bool IsCompatibility>
26031static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
26032 const DominatorTree &DT) {
26033 assert(isValidElementType(V->getType()) &&
26034 isValidElementType(V2->getType()) &&
26035 "Expected valid element types only.");
26036 if (V == V2)
26037 return IsCompatibility;
26038 auto *CI1 = cast<CmpInst>(V);
26039 auto *CI2 = cast<CmpInst>(V2);
26040 if (CI1->getOperand(0)->getType()->getTypeID() <
26041 CI2->getOperand(0)->getType()->getTypeID())
26042 return !IsCompatibility;
26043 if (CI1->getOperand(0)->getType()->getTypeID() >
26044 CI2->getOperand(0)->getType()->getTypeID())
26045 return false;
26046 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
26047 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26048 return !IsCompatibility;
26049 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
26050 CI2->getOperand(0)->getType()->getScalarSizeInBits())
26051 return false;
26052 CmpInst::Predicate Pred1 = CI1->getPredicate();
26053 CmpInst::Predicate Pred2 = CI2->getPredicate();
26054 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
26055 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
26056 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
26057 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
26058 if (BasePred1 < BasePred2)
26059 return !IsCompatibility;
26060 if (BasePred1 > BasePred2)
26061 return false;
26062 // Compare operands.
26063 bool CI1Preds = Pred1 == BasePred1;
26064 bool CI2Preds = Pred2 == BasePred1;
26065 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
26066 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
26067 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
26068 if (Op1 == Op2)
26069 continue;
26070 if (Op1->getValueID() < Op2->getValueID())
26071 return !IsCompatibility;
26072 if (Op1->getValueID() > Op2->getValueID())
26073 return false;
26074 if (auto *I1 = dyn_cast<Instruction>(Op1))
26075 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
26076 if (IsCompatibility) {
26077 if (I1->getParent() != I2->getParent())
26078 return false;
26079 } else {
26080 // Try to compare nodes with same parent.
26081 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
26082 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
26083 if (!NodeI1)
26084 return NodeI2 != nullptr;
26085 if (!NodeI2)
26086 return false;
26087 assert((NodeI1 == NodeI2) ==
26088 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26089 "Different nodes should have different DFS numbers");
26090 if (NodeI1 != NodeI2)
26091 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26092 }
26093 InstructionsState S = getSameOpcode({I1, I2}, TLI);
26094 if (S && (IsCompatibility || !S.isAltShuffle()))
26095 continue;
26096 if (IsCompatibility)
26097 return false;
26098 if (I1->getOpcode() != I2->getOpcode())
26099 return I1->getOpcode() < I2->getOpcode();
26100 }
26101 }
26102 return IsCompatibility;
26103}
26104
26105template <typename ItT>
26106bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
26107 BasicBlock *BB, BoUpSLP &R) {
26108 bool Changed = false;
26109 // Try to find reductions first.
26110 for (CmpInst *I : CmpInsts) {
26111 if (R.isDeleted(I))
26112 continue;
26113 for (Value *Op : I->operands())
26114 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
26115 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
26116 if (R.isDeleted(I))
26117 break;
26118 }
26119 }
26120 // Try to vectorize operands as vector bundles.
26121 for (CmpInst *I : CmpInsts) {
26122 if (R.isDeleted(I))
26123 continue;
26124 Changed |= tryToVectorize(I, R);
26125 }
26126 // Try to vectorize list of compares.
26127 // Sort by type, compare predicate, etc.
26128 auto CompareSorter = [&](Value *V, Value *V2) {
26129 if (V == V2)
26130 return false;
26131 return compareCmp<false>(V, V2, *TLI, *DT);
26132 };
26133
26134 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
26135 if (VL.empty() || VL.back() == V1)
26136 return true;
26137 return compareCmp<true>(V1, VL.back(), *TLI, *DT);
26138 };
26139
26140 SmallVector<Value *> Vals;
26141 for (Instruction *V : CmpInsts)
26142 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
26143 Vals.push_back(V);
26144 if (Vals.size() <= 1)
26145 return Changed;
26146 Changed |= tryToVectorizeSequence<Value>(
26147 Vals, CompareSorter, AreCompatibleCompares,
26148 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26149 // Exclude possible reductions from other blocks.
26150 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
26151 return any_of(V->users(), [V](User *U) {
26152 auto *Select = dyn_cast<SelectInst>(U);
26153 return Select &&
26154 Select->getParent() != cast<Instruction>(V)->getParent();
26155 });
26156 });
26157 if (ArePossiblyReducedInOtherBlock)
26158 return false;
26159 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26160 },
26161 /*MaxVFOnly=*/true, R);
26162 return Changed;
26163}
26164
26165bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
26166 BasicBlock *BB, BoUpSLP &R) {
26168 "This function only accepts Insert instructions");
26169 bool OpsChanged = false;
26170 SmallVector<WeakTrackingVH> PostponedInsts;
26171 for (auto *I : reverse(Instructions)) {
26172 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
26173 if (R.isDeleted(I) || isa<CmpInst>(I))
26174 continue;
26175 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
26176 OpsChanged |=
26177 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
26178 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
26179 OpsChanged |=
26180 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
26181 }
26182 // pass2 - try to vectorize reductions only
26183 if (R.isDeleted(I))
26184 continue;
26185 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
26186 if (R.isDeleted(I) || isa<CmpInst>(I))
26187 continue;
26188 // pass3 - try to match and vectorize a buildvector sequence.
26189 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
26190 OpsChanged |=
26191 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
26192 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
26193 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
26194 /*MaxVFOnly=*/false);
26195 }
26196 }
26197 // Now try to vectorize postponed instructions.
26198 OpsChanged |= tryToVectorize(PostponedInsts, R);
26199
26200 Instructions.clear();
26201 return OpsChanged;
26202}
26203
26204bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
26205 bool Changed = false;
26206 SmallVector<Value *, 4> Incoming;
26207 SmallPtrSet<Value *, 16> VisitedInstrs;
26208 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
26209 // node. This makes it easier to identify the chains that can be vectorized
26210 // in the best way.
26211 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
26212 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
26213 assert(isValidElementType(V1->getType()) &&
26214 isValidElementType(V2->getType()) &&
26215 "Expected vectorizable types only.");
26216 if (V1 == V2)
26217 return false;
26218 // It is fine to compare type IDs here, since we expect only vectorizable
26219 // types, like ints, floats and pointers; we don't care about other types.
26220 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
26221 return true;
26222 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
26223 return false;
26224 if (V1->getType()->getScalarSizeInBits() <
26225 V2->getType()->getScalarSizeInBits())
26226 return true;
26227 if (V1->getType()->getScalarSizeInBits() >
26228 V2->getType()->getScalarSizeInBits())
26229 return false;
26230 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26231 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26232 if (Opcodes1.size() < Opcodes2.size())
26233 return true;
26234 if (Opcodes1.size() > Opcodes2.size())
26235 return false;
26236 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26237 {
26238 // Instructions come first.
26239 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
26240 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
26241 if (I1 && I2) {
26242 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
26243 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
26244 if (!NodeI1)
26245 return NodeI2 != nullptr;
26246 if (!NodeI2)
26247 return false;
26248 assert((NodeI1 == NodeI2) ==
26249 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26250 "Different nodes should have different DFS numbers");
26251 if (NodeI1 != NodeI2)
26252 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26253 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
26254 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
26255 const auto *E1 = dyn_cast<ExtractElementInst>(I1);
26256 const auto *E2 = dyn_cast<ExtractElementInst>(I2);
26257 if (!E1 || !E2)
26258 continue;
26259
26260 // Sort on ExtractElementInsts primarily by vector operands. Prefer
26261 // program order of the vector operands.
26262 const auto *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
26263 const auto *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
26264 if (V1 != V2) {
26265 if (V1 && !V2)
26266 return true;
26267 if (!V1 && V2)
26268 return false;
26269 DomTreeNodeBase<BasicBlock> *NodeI1 =
26270 DT->getNode(V1->getParent());
26271 DomTreeNodeBase<BasicBlock> *NodeI2 =
26272 DT->getNode(V2->getParent());
26273 if (!NodeI1)
26274 return NodeI2 != nullptr;
26275 if (!NodeI2)
26276 return false;
26277 assert((NodeI1 == NodeI2) ==
26278 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26279 "Different nodes should have different DFS numbers");
26280 if (NodeI1 != NodeI2)
26281 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26282 return V1->comesBefore(V2);
26283 }
26284 // If we have the same vector operand, try to sort by constant
26285 // index.
26286 std::optional<unsigned> Id1 = getExtractIndex(E1);
26287 std::optional<unsigned> Id2 = getExtractIndex(E2);
26288 // Bring constants to the top
26289 if (Id1 && !Id2)
26290 return true;
26291 if (!Id1 && Id2)
26292 return false;
26293 // First elements come first.
26294 if (Id1 && Id2)
26295 return *Id1 < *Id2;
26296
26297 continue;
26298 }
26299 if (I1->getOpcode() == I2->getOpcode())
26300 continue;
26301 return I1->getOpcode() < I2->getOpcode();
26302 }
26303 if (I1)
26304 return true;
26305 if (I2)
26306 return false;
26307 }
26308 {
26309 // Non-undef constants come next.
26310 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
26311 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
26312 if (C1 && C2)
26313 continue;
26314 if (C1)
26315 return true;
26316 if (C2)
26317 return false;
26318 }
26319 bool U1 = isa<UndefValue>(Opcodes1[I]);
26320 bool U2 = isa<UndefValue>(Opcodes2[I]);
26321 {
26322 // Non-constant non-instructions come next.
26323 if (!U1 && !U2) {
26324 auto ValID1 = Opcodes1[I]->getValueID();
26325 auto ValID2 = Opcodes2[I]->getValueID();
26326 if (ValID1 == ValID2)
26327 continue;
26328 if (ValID1 < ValID2)
26329 return true;
26330 if (ValID1 > ValID2)
26331 return false;
26332 }
26333 if (!U1)
26334 return true;
26335 if (!U2)
26336 return false;
26337 }
26338 // Undefs come last.
26339 assert(U1 && U2 && "The only thing left should be undef & undef.");
26340 }
26341 return false;
26342 };
26343 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
26344 Value *V1) {
26345 if (VL.empty() || V1 == VL.back())
26346 return true;
26347 Value *V2 = VL.back();
26348 if (V1->getType() != V2->getType())
26349 return false;
26350 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
26351 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
26352 if (Opcodes1.size() != Opcodes2.size())
26353 return false;
26354 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26355 // Undefs are compatible with any other value.
26356 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
26357 continue;
26358 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
26359 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
26360 if (R.isDeleted(I1) || R.isDeleted(I2))
26361 return false;
26362 if (I1->getParent() != I2->getParent())
26363 return false;
26364 if (getSameOpcode({I1, I2}, *TLI))
26365 continue;
26366 return false;
26367 }
26368 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
26369 continue;
26370 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26371 return false;
26372 }
26373 return true;
26374 };
26375
26376 bool HaveVectorizedPhiNodes = false;
26377 do {
26378 // Collect the incoming values from the PHIs.
26379 Incoming.clear();
26380 for (Instruction &I : *BB) {
26381 auto *P = dyn_cast<PHINode>(&I);
26382 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
26383 break;
26384
26385 // No need to analyze deleted, vectorized and non-vectorizable
26386 // instructions.
26387 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26388 isValidElementType(P->getType()))
26389 Incoming.push_back(P);
26390 }
26391
26392 if (Incoming.size() <= 1)
26393 break;
26394
26395 // Find the corresponding non-phi nodes for better matching when trying to
26396 // build the tree.
26397 for (Value *V : Incoming) {
26398 SmallVectorImpl<Value *> &Opcodes =
26399 PHIToOpcodes.try_emplace(V).first->getSecond();
26400 if (!Opcodes.empty())
26401 continue;
26402 SmallVector<Value *, 4> Nodes(1, V);
26403 SmallPtrSet<Value *, 4> Visited;
26404 while (!Nodes.empty()) {
26405 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
26406 if (!Visited.insert(PHI).second)
26407 continue;
26408 for (Value *V : PHI->incoming_values()) {
26409 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
26410 Nodes.push_back(PHI1);
26411 continue;
26412 }
26413 Opcodes.emplace_back(V);
26414 }
26415 }
26416 }
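// Illustrative example (not from the source): for
//   %p = phi i32 [ %a, %bb1 ], [ %q, %bb2 ]
// where %q is itself a phi, the walk above records only the non-phi incoming
// values reachable through %q (plus %a) in PHIToOpcodes[%p]; PHICompare and
// AreCompatiblePHIs then sort and group the phis on these lists.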
26417
26418 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
26419 Incoming, PHICompare, AreCompatiblePHIs,
26420 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
26421 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26422 },
26423 /*MaxVFOnly=*/true, R);
26424 Changed |= HaveVectorizedPhiNodes;
26425 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26426 auto *PHI = dyn_cast<PHINode>(P.first);
26427 return !PHI || R.isDeleted(PHI);
26428 }))
26429 PHIToOpcodes.clear();
26430 VisitedInstrs.insert_range(Incoming);
26431 } while (HaveVectorizedPhiNodes);
26432
26433 VisitedInstrs.clear();
26434
26435 InstSetVector PostProcessInserts;
26436 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26437 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
26438 // also vectorizes `PostProcessCmps`.
26439 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26440 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26441 if (VectorizeCmps) {
26442 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
26443 PostProcessCmps.clear();
26444 }
26445 PostProcessInserts.clear();
26446 return Changed;
26447 };
26448 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
26449 auto IsInPostProcessInstrs = [&](Instruction *I) {
26450 if (auto *Cmp = dyn_cast<CmpInst>(I))
26451 return PostProcessCmps.contains(Cmp);
26452 return isa<InsertElementInst, InsertValueInst>(I) &&
26453 PostProcessInserts.contains(I);
26454 };
26455 // Returns true if `I` is an instruction without users, like a terminator, a
26456 // store, or a function call with an ignored return value. Unused instructions
26457 // are identified by their (void) type, except for CallInst and InvokeInst.
26458 auto HasNoUsers = [](Instruction *I) {
26459 return I->use_empty() &&
26460 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
26461 };
26462 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
26463 // Skip instructions with scalable type. The num of elements is unknown at
26464 // compile-time for scalable type.
26465 if (isa<ScalableVectorType>(It->getType()))
26466 continue;
26467
26468 // Skip instructions marked for deletion.
26469 if (R.isDeleted(&*It))
26470 continue;
26471 // We may go through BB multiple times, so skip the ones we have already checked.
26472 if (!VisitedInstrs.insert(&*It).second) {
26473 if (HasNoUsers(&*It) &&
26474 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
26475 // We would like to start over since some instructions are deleted
26477 // and the iterator may become invalid.
26477 Changed = true;
26478 It = BB->begin();
26479 E = BB->end();
26480 }
26481 continue;
26482 }
26483
26484 // Try to vectorize reductions that use PHINodes.
26485 if (PHINode *P = dyn_cast<PHINode>(It)) {
26486 // Check that the PHI is a reduction PHI.
26487 if (P->getNumIncomingValues() == 2) {
26488 // Try to match and vectorize a horizontal reduction.
26489 Instruction *Root = getReductionInstr(DT, P, BB, LI);
26490 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26491 Changed = true;
26492 It = BB->begin();
26493 E = BB->end();
26494 continue;
26495 }
26496 }
26497 // Try to vectorize the incoming values of the PHI, to catch reductions
26498 // that feed into PHIs.
26499 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
26500 // Skip if the incoming block is the current BB for now. Also, bypass
26501 // unreachable IR for efficiency and to avoid crashing.
26502 // TODO: Collect the skipped incoming values and try to vectorize them
26503 // after processing BB.
26504 if (BB == P->getIncomingBlock(I) ||
26505 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26506 continue;
26507
26508 // Postponed instructions should not be vectorized here, delay their
26509 // vectorization.
26510 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
26511 PI && !IsInPostProcessInstrs(PI)) {
26512 bool Res =
26513 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26514 Changed |= Res;
26515 if (Res && R.isDeleted(P)) {
26516 It = BB->begin();
26517 E = BB->end();
26518 break;
26519 }
26520 }
26521 }
26522 continue;
26523 }
26524
26525 if (HasNoUsers(&*It)) {
26526 bool OpsChanged = false;
26527 auto *SI = dyn_cast<StoreInst>(It);
26528 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
26529 if (SI) {
26530 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
26531 // Try to vectorize chain in store, if this is the only store to the
26532 // address in the block.
26533 // TODO: This is just a temporary solution to save compile time. Need
26534 // to investigate if we can safely turn on slp-vectorize-hor-store
26535 // instead to allow lookup for reduction chains in all non-vectorized
26536 // stores (need to check side effects and compile time).
26537 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26538 SI->getValueOperand()->hasOneUse();
26539 }
26540 if (TryToVectorizeRoot) {
26541 for (auto *V : It->operand_values()) {
26542 // Postponed instructions should not be vectorized here, delay their
26543 // vectorization.
26544 if (auto *VI = dyn_cast<Instruction>(V);
26545 VI && !IsInPostProcessInstrs(VI))
26546 // Try to match and vectorize a horizontal reduction.
26547 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26548 }
26549 }
26550 // Start vectorization of post-process list of instructions from the
26551 // top-tree instructions to try to vectorize as many instructions as
26552 // possible.
26553 OpsChanged |=
26554 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
26555 if (OpsChanged) {
26556 // We would like to start over since some instructions are deleted
26557 // and the iterator may become invalid.
26558 Changed = true;
26559 It = BB->begin();
26560 E = BB->end();
26561 continue;
26562 }
26563 }
26564
26565 if (isa<InsertElementInst, InsertValueInst>(It))
26566 PostProcessInserts.insert(&*It);
26567 else if (isa<CmpInst>(It))
26568 PostProcessCmps.insert(cast<CmpInst>(&*It));
26569 }
26570
26571 return Changed;
26572}
26573
26574bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26575 auto Changed = false;
26576 for (auto &Entry : GEPs) {
26577 // If the getelementptr list has fewer than two elements, there's nothing
26578 // to do.
26579 if (Entry.second.size() < 2)
26580 continue;
26581
26582 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26583 << Entry.second.size() << ".\n");
26584
26585 // Process the GEP list in chunks suitable for the target's supported
26586 // vector size. If a vector register can't hold 1 element, we are done. We
26587 // are trying to vectorize the index computations, so the maximum number of
26588 // elements is based on the size of the index expression, rather than the
26589 // size of the GEP itself (the target's pointer size).
26590 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
26591 return !R.isDeleted(GEP);
26592 });
26593 if (It == Entry.second.end())
26594 continue;
26595 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26596 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26597 if (MaxVecRegSize < EltSize)
26598 continue;
26599
26600 unsigned MaxElts = MaxVecRegSize / EltSize;
26601 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26602 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26603 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
26604
26605 // Initialize a set of candidate getelementptrs. Note that we use a
26606 // SetVector here to preserve program order. If the index computations
26607 // are vectorizable and begin with loads, we want to minimize the chance
26608 // of having to reorder them later.
26609 SetVector<Value *> Candidates(llvm::from_range, GEPList);
26610
26611 // Some of the candidates may have already been vectorized after we
26612 // initially collected them, or their index was optimized to a constant value.
26613 // If so, they are marked as deleted, so remove them from the set of
26614 // candidates.
26615 Candidates.remove_if([&R](Value *I) {
26616 return R.isDeleted(cast<Instruction>(I)) ||
26617 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
26618 });
26619
26620 // Remove from the set of candidates all pairs of getelementptrs with
26621 // constant differences. Such getelementptrs are likely not good
26622 // candidates for vectorization in a bottom-up phase since one can be
26623 // computed from the other. We also ensure all candidate getelementptr
26624 // indices are unique.
26625 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26626 auto *GEPI = GEPList[I];
26627 if (!Candidates.count(GEPI))
26628 continue;
26629 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26630 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26631 auto *GEPJ = GEPList[J];
26632 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26633 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
26634 Candidates.remove(GEPI);
26635 Candidates.remove(GEPJ);
26636 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26637 Candidates.remove(GEPJ);
26638 }
26639 }
26640 }
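// Illustrative example (not from the source): two getelementptrs off the same
// base whose indices differ by a constant (e.g. %i and %i + 4) have a constant
// SCEV difference, so both were removed from Candidates above because one
// address is trivially computable from the other.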
26641
26642 // We break out of the above computation as soon as we know there are
26643 // fewer than two candidates remaining.
26644 if (Candidates.size() < 2)
26645 continue;
26646
26647 // Add the single, non-constant index of each candidate to the bundle. We
26648 // ensured the indices met these constraints when we originally collected
26649 // the getelementptrs.
26650 SmallVector<Value *, 16> Bundle(Candidates.size());
26651 auto BundleIndex = 0u;
26652 for (auto *V : Candidates) {
26653 auto *GEP = cast<GetElementPtrInst>(V);
26654 auto *GEPIdx = GEP->idx_begin()->get();
26655 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
26656 Bundle[BundleIndex++] = GEPIdx;
26657 }
26658
26659 // Try and vectorize the indices. We are currently only interested in
26660 // gather-like cases of the form:
26661 //
26662 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
26663 //
26664 // where the loads of "a", the loads of "b", and the subtractions can be
26665 // performed in parallel. It's likely that detecting this pattern in a
26666 // bottom-up phase will be simpler and less costly than building a
26667 // full-blown top-down phase beginning at the consecutive loads.
26668 Changed |= tryToVectorizeList(Bundle, R);
26669 }
26670 }
26671 return Changed;
26672}
26673
26674bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26675 bool Changed = false;
26676 // Sort by type, base pointers and value operands. Value operands must be
26677 // compatible (have the same opcode, same parent), otherwise it is
26678 // definitely not profitable to try to vectorize them.
26679 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26680 if (V->getValueOperand()->getType()->getTypeID() <
26681 V2->getValueOperand()->getType()->getTypeID())
26682 return true;
26683 if (V->getValueOperand()->getType()->getTypeID() >
26684 V2->getValueOperand()->getType()->getTypeID())
26685 return false;
26686 if (V->getPointerOperandType()->getTypeID() <
26687 V2->getPointerOperandType()->getTypeID())
26688 return true;
26689 if (V->getPointerOperandType()->getTypeID() >
26690 V2->getPointerOperandType()->getTypeID())
26691 return false;
26692 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26693 V2->getValueOperand()->getType()->getScalarSizeInBits())
26694 return true;
26695 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26696 V2->getValueOperand()->getType()->getScalarSizeInBits())
26697 return false;
26698 // UndefValues are compatible with all other values.
26699 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
26700 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
26701 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
26702 DT->getNode(I1->getParent());
26703 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
26704 DT->getNode(I2->getParent());
26705 assert(NodeI1 && "Should only process reachable instructions");
26706 assert(NodeI2 && "Should only process reachable instructions");
26707 assert((NodeI1 == NodeI2) ==
26708 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
26709 "Different nodes should have different DFS numbers");
26710 if (NodeI1 != NodeI2)
26711 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
26712 return I1->getOpcode() < I2->getOpcode();
26713 }
26714 return V->getValueOperand()->getValueID() <
26715 V2->getValueOperand()->getValueID();
26716 };
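// Illustrative note (not part of the original source; the example names are
// hypothetical): StoreSorter clusters stores by value-operand type ID, then
// pointer-operand type ID, then scalar bit width; stores whose value operands
// are instructions are further ordered by the DFS-in number of their defining
// block and by opcode, and the remaining stores by the ValueID of their value
// operand. For example, two 'store i32 %add' instructions whose adds live in
// the same block end up adjacent, while stores of float values form their own
// cluster.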
26717
26718 bool SameParent = true;
26719 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
26720 if (VL.empty()) {
26721 SameParent = true;
26722 return true;
26723 }
26724 StoreInst *V2 = VL.back();
26725 if (V1 == V2)
26726 return true;
26727 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
26728 return false;
26729 if (V1->getPointerOperandType() != V2->getPointerOperandType())
26730 return false;
26731 // Undefs are compatible with any other value.
26732 if (isa<UndefValue>(V1->getValueOperand()) ||
26733 isa<UndefValue>(V2->getValueOperand()))
26734 return true;
26735 if (isa<Constant>(V1->getValueOperand()) &&
26736 isa<Constant>(V2->getValueOperand()))
26737 return true;
26738 // Check if the value operands of the stores can be vectorized. They can be
26739 // vectorized if they have compatible operands, or operands that can be
26740 // vectorized as copyables.
26741 auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
26742 auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
26743 if (I1 || I2) {
26744 // Accept only tail-following non-compatible values for now.
26745 // TODO: investigate if it is possible to vectorize incompatible values,
26746 // if the copyables are first in the list.
26747 if (I1 && !I2)
26748 return false;
26749 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26750 SmallVector<Value *> NewVL(VL.size() + 1);
26751 for (auto [SI, V] : zip(VL, NewVL))
26752 V = SI->getValueOperand();
26753 NewVL.back() = V1->getValueOperand();
26754 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26755 InstructionsState S = Analysis.buildInstructionsState(
26756 NewVL, R, VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
26757 /*SkipSameCodeCheck=*/!SameParent);
26758 if (S)
26759 return true;
26760 if (!SameParent)
26761 return false;
26762 }
26763 return V1->getValueOperand()->getValueID() ==
26764 V2->getValueOperand()->getValueID();
26765 };
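// Illustrative note (not part of the original source): a store joins the
// group VL being grown only if its value operand has the same type and
// pointer-operand type as the last member and is either an undef or constant,
// an instruction that buildInstructionsState accepts together with the
// existing value operands (possibly as a copyable element), or a value whose
// ValueID matches that of the last member.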
26766
26767 // Attempt to sort and vectorize each of the store-groups.
26768 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
26769 for (auto &Pair : Stores) {
26770 if (Pair.second.size() < 2)
26771 continue;
26772
26773 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
26774 << Pair.second.size() << ".\n");
26775
26776 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
26777 continue;
26778
26779 // Reverse the stores to do a bottom-to-top analysis. This is important when
26780 // the same addresses are stored to several times; in that case we need to
26781 // follow the store order (reversed to respect the memory dependencies).
26782 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
26783 Pair.second.rend());
26784 Changed |= tryToVectorizeSequence<StoreInst>(
26785 ReversedStores, StoreSorter, AreCompatibleStores,
26786 [&](ArrayRef<StoreInst *> Candidates, bool) {
26787 return vectorizeStores(Candidates, R, Attempted);
26788 },
26789 /*MaxVFOnly=*/false, R);
26790 }
26791 return Changed;
26792}
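// Illustrative end-to-end sketch (not part of the original source; the C-like
// code below is hypothetical and assumes i32 elements): for
//   p[0] = a + 1; p[1] = b + 1; p[2] = c + 1; p[3] = d + 1;
// all four stores share the underlying object p, so they land in the same
// Stores bucket, StoreSorter/AreCompatibleStores keep them in one group, and
// vectorizeStores can replace them with a single <4 x i32> store fed by a
// vectorized add, provided the cost model finds the tree profitable.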
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
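The TargetTransformInfo entries above are the cost hooks the vectorizer queries when deciding whether a tree is profitable. A hedged sketch of such a query; the helper name and the fixed width of 4 are illustrative, not taken from this pass:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Compare the throughput cost of four scalar adds against one <4 x Ty> add.
static bool vectorAddLooksProfitable(const TargetTransformInfo &TTInfo,
                                     Type *ScalarTy) {
  auto *VecTy = FixedVectorType::get(ScalarTy, 4);
  TargetTransformInfo::TargetCostKind Kind =
      TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost ScalarCost =
      TTInfo.getArithmeticInstrCost(Instruction::Add, ScalarTy, Kind);
  InstructionCost VectorCost =
      TTInfo.getArithmeticInstrCost(Instruction::Add, VecTy, Kind);
  return VectorCost < ScalarCost * 4;
}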
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:180
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:296
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type while keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:119
op_iterator op_begin()
Definition User.h:284
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:24
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
iterator_range< value_op_iterator > operand_values()
Definition User.h:316
The Vector Function Database.
Definition VectorUtils.h:33
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:265
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
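The Value members above cover the use-list queries the vectorizer uses when rewriting scalars to their vector replacements. A hedged sketch (function name made up; note that replaceAllUsesWith requires the new value to have the same type):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>

using namespace llvm;

// List the users of Old and, if it has at most one use, redirect all of them
// to New.
static void replaceIfAtMostOneUse(Instruction *Old, Value *New) {
  for (User *U : Old->users())
    errs() << "user: " << *U << "\n";
  if (Old->use_empty() || Old->hasOneUse()) {
    Old->replaceAllUsesWith(New);
    assert(Old->use_empty());
  }
}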
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
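The VectorType members above are used to build the widened types for vectorized trees. A minimal illustrative sketch (helper name and the width of 4 are made up):

#include "llvm/IR/DerivedTypes.h"
#include <cassert>

using namespace llvm;

// Build a fixed <4 x EltTy> vector type, provided EltTy is a legal element
// type, and read back its element count and element type.
static VectorType *makeVec4(Type *EltTy) {
  if (!VectorType::isValidElementType(EltTy))
    return nullptr;
  auto *VecTy = VectorType::get(EltTy, ElementCount::getFixed(4));
  assert(VecTy->getElementCount().getKnownMinValue() == 4);
  assert(VecTy->getElementType() == EltTy);
  return VecTy;
}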
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
void insert_range(Range &&R)
Definition DenseSet.h:228
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition Hashing.h:76
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of already checked values for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen and returns its signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after the min-bitwidth analysis.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
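BoUpSLP is the pass-internal class that does the actual work; its members above are driven in roughly the order sketched below. This is pseudocode based on those member names, not a verbatim excerpt: construction of the BoUpSLP object (which needs SE, TTI, TLI, AA, LI, DT, AC, DB, DL and ORE) is omitted, the exact call sequence differs per caller, and SLPCostThreshold stands for the -slp-threshold option.

// Rough driver sketch (only meaningful inside SLPVectorizer.cpp, where
// BoUpSLP is defined).
void sketchVectorizeRoots(BoUpSLP &R, ArrayRef<Value *> Roots) {
  R.buildTree(Roots, /*UserIgnoreLst=*/{});  // build the vectorizable tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return;
  R.reorderTopToBottom();                    // pick profitable lane orders
  R.reorderBottomToTop();
  R.transformNodes();                        // target-specific node rewrites
  R.buildExternalUses();                     // scalars still used outside the tree
  R.computeMinimumValueSizes();              // min-bitwidth analysis
  InstructionCost Cost = R.getTreeCost();
  if (Cost.isValid() && Cost < -SLPCostThreshold)
    R.vectorizeTree();                       // emit vector code
}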
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
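The PatternMatch helpers above are how the vectorizer recognizes IR shapes such as reductions and load-combine patterns. A small illustrative matcher (the helper name is made up) that binds the pieces of a single-use (X << C) | Y:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true and binds X, Y and the shift amount C if V is a single-use
// 'or' whose first operand is a left shift by a constant.
static bool matchShlOr(Value *V, Value *&X, Value *&Y, const APInt *&C) {
  return match(V, m_OneUse(m_Or(m_Shl(m_Value(X), m_APInt(C)), m_Value(Y))));
}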
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:101
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2058
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1718
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
InstructionCost Cost
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1724
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2231
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
Definition STLExtras.h:1981
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:345
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2128
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1968
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1763
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:435
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
Definition Casting.h:669
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:339
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1397
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1920
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
TargetTransformInfo TTI
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ None
Not a recurrence.
@ Xor
Bitwise or logical XOR of integers.
@ FMul
Product of floats.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
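RecurKind above enumerates the reduction flavors the horizontal-reduction code handles, and createSimpleReduction (listed earlier) lowers a vector to a single scalar for a given kind. A minimal hedged sketch, assuming an IRBuilder already positioned where the reduction should be emitted:

#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

// Reduce an integer vector to one scalar sum; typically lowered to the
// llvm.vector.reduce.add intrinsic.
static Value *emitAddReduction(IRBuilderBase &Builder, Value *VecSrc) {
  return createSimpleReduction(Builder, VecSrc, RecurKind::Add);
}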
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2030
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1835
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1407
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
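Many of the llvm:: entries above are the STLExtras range helpers (all_of, any_of, count, sort, enumerate, and friends) that replace explicit begin/end iterator pairs. A small illustrative sketch:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <functional>

using namespace llvm;

static void demoRangeHelpers() {
  SmallVector<int, 8> V = {3, 1, 4, 1, 5};
  sort(V);                                  // llvm::sort over the whole range
  assert(is_sorted(V, std::less<int>()));
  assert(all_of(V, [](int X) { return X > 0; }));
  assert(count(V, 1) == 2);
  for (auto [Idx, Val] : enumerate(V))      // (index, element) pairs
    errs() << Idx << ": " << Val << "\n";
}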
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
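The GraphTraits and DOTGraphTraits specializations above let GraphWriter walk and render the SLP tree. A hedged sketch of requesting such a dump (only valid inside this file, where BoUpSLP and the specializations live; the function name is illustrative):

#include "llvm/Support/GraphWriter.h"

// Render the current vectorizable tree as a DOT graph via the
// GraphTraits<BoUpSLP *> specialization described above.
static void viewSLPTree(BoUpSLP &R) {
  ViewGraph(&R, "slp-tree", /*ShortNames=*/false, "SLP vectorizable tree");
}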
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
ScalarEvolution * SE
TargetTransformInfo * TTI
AssumptionCache * AC
TargetLibraryInfo * TLI
const DataLayout * DL
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:257
Describe known properties for a set of pointers.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1425
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1434
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const