SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
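//
// For illustration only, a sketch (not taken from any specific test) of the
// kind of rewrite this pass performs: four consecutive scalar stores such as
//   store i32 %a, ptr %p
//   store i32 %b, ptr %p1   ; %p1 = getelementptr i32, ptr %p, i64 1
//   store i32 %c, ptr %p2   ; %p2 = getelementptr i32, ptr %p, i64 2
//   store i32 %d, ptr %p3   ; %p3 = getelementptr i32, ptr %p, i64 3
// may be replaced, when the cost model deems it profitable, with a single
//   store <4 x i32> %vec, ptr %p
// where %vec is built from %a..%d (or, better, from already vectorized defs).
//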
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<bool>
117 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
118 cl::desc("Enable vectorization for wider vector utilization"));
119
120static cl::opt<int>
122 cl::desc("Only vectorize if you gain more than this "
123 "number "));
124
126 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
127 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
128 "heuristics and makes vectorization decision via cost modeling."));
129
130static cl::opt<bool>
131ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
132 cl::desc("Attempt to vectorize horizontal reductions"));
133
135 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
136 cl::desc(
137 "Attempt to vectorize horizontal reductions feeding into a store"));
138
139// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
140// even if we match a reduction but do not vectorize in the end.
142 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
143 cl::desc("Allow optimization of original scalar identity operations on "
144 "matched horizontal reductions."));
145
146static cl::opt<int>
148 cl::desc("Attempt to vectorize for this register size in bits"));
149
152 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
153
154/// Limits the size of scheduling regions in a block.
155/// It avoids long compile times for _very_ large blocks where vector
156/// instructions are spread over a wide range.
157/// This limit is way higher than needed by real-world functions.
158static cl::opt<int>
159ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
160 cl::desc("Limit the size of the SLP scheduling region per block"));
161
163 "slp-min-reg-size", cl::init(128), cl::Hidden,
164 cl::desc("Attempt to vectorize for this register size in bits"));
165
167 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
168 cl::desc("Limit the recursion depth when building a vectorizable tree"));
169
171 "slp-min-tree-size", cl::init(3), cl::Hidden,
172 cl::desc("Only vectorize small trees if they are fully vectorizable"));
173
174// The maximum depth that the look-ahead score heuristic will explore.
175// The higher this value, the higher the compilation time overhead.
177 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
178 cl::desc("The maximum look-ahead depth for operand reordering scores"));
179
180// The maximum depth that the look-ahead score heuristic will explore
181// when it is probing among candidates for vectorization tree roots.
182// The higher this value, the higher the compilation time overhead, but unlike
183// the similar limit for operand reordering this is used less frequently, so
184// the impact of a higher value is less noticeable.
186 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
187 cl::desc("The maximum look-ahead depth for searching best rooting option"));
188
190 "slp-min-strided-loads", cl::init(2), cl::Hidden,
191 cl::desc("The minimum number of loads, which should be considered strided, "
192 "if the stride is > 1 or is runtime value"));
193
195 "slp-max-stride", cl::init(8), cl::Hidden,
196 cl::desc("The maximum stride, considered to be profitable."));
197
198static cl::opt<bool>
199 ViewSLPTree("view-slp-tree", cl::Hidden,
200 cl::desc("Display the SLP trees with Graphviz"));
201
203 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
204 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
205
206// Limit the number of alias checks. The limit is chosen so that
207// it has no negative effect on the llvm benchmarks.
208static const unsigned AliasedCheckLimit = 10;
209
210// Limit on the number of uses for potentially transformed instructions/values,
211// used in checks to avoid compile-time explosion.
212static constexpr int UsesLimit = 64;
213
214// Another limit for the alias checks: The maximum distance between load/store
215// instructions where alias checks are done.
216// This limit is useful for very large basic blocks.
217static const unsigned MaxMemDepDistance = 160;
218
219/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
220/// regions to be handled.
221static const int MinScheduleRegionSize = 16;
222
223/// Maximum allowed number of operands in the PHI nodes.
224static const unsigned MaxPHINumOperands = 128;
225
226/// Predicate for the element types that the SLP vectorizer supports.
227///
228/// The most important thing to filter here are types which are invalid in LLVM
229/// vectors. We also filter target specific types which have absolutely no
230/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
231/// avoids spending time checking the cost model and realizing that they will
232/// be inevitably scalarized.
233static bool isValidElementType(Type *Ty) {
234 // TODO: Support ScalableVectorType.
235 if (SLPReVec && isa<FixedVectorType>(Ty))
236 Ty = Ty->getScalarType();
237 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
238 !Ty->isPPC_FP128Ty();
239}
240
241/// \returns the number of elements for Ty.
242static unsigned getNumElements(Type *Ty) {
243 assert(!isa<ScalableVectorType>(Ty) &&
244 "ScalableVectorType is not supported.");
245 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
246 return VecTy->getNumElements();
247 return 1;
248}
249
250/// \returns the vector type of ScalarTy based on vectorization factor.
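/// For example (illustrative values): getWidenedType(float, 4) yields
/// <4 x float>, and with REVEC a vector scalar type such as <2 x float> with
/// VF = 4 yields <8 x float>, since the element count is VF * getNumElements.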
251static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
252 return FixedVectorType::get(ScalarTy->getScalarType(),
253 VF * getNumElements(ScalarTy));
254}
255
256static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
257 SmallVectorImpl<int> &Mask) {
258 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
259 // But the element has a different meaning for SLP (scalar) and REVEC
260 // (vector). We need to expand Mask into a mask which shufflevector can use
261 // directly.
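  // For example (made-up values), with VecTyNumElements == 2 and
  // Mask == {1, 0}, the expanded mask becomes {2, 3, 0, 1}: each scalar index
  // is widened to VecTyNumElements consecutive vector-element indices, and
  // PoisonMaskElem entries stay PoisonMaskElem for the whole group.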
262 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
263 for (unsigned I : seq<unsigned>(Mask.size()))
264 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
265 I * VecTyNumElements, VecTyNumElements)))
266 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
267 : Mask[I] * VecTyNumElements + J;
268 Mask.swap(NewMask);
269}
270
271/// \returns True if the value is a constant (but not globals/constant
272/// expressions).
273static bool isConstant(Value *V) {
274 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
275}
276
277/// Checks if \p V is one of the vector-like instructions, i.e. undef, an
278/// insertelement/extractelement with constant indices on a fixed vector type,
279/// or an extractvalue instruction.
281 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
282 !isa<ExtractValueInst, UndefValue>(V))
283 return false;
284 auto *I = dyn_cast<Instruction>(V);
285 if (!I || isa<ExtractValueInst>(I))
286 return true;
287 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
288 return false;
289 if (isa<ExtractElementInst>(I))
290 return isConstant(I->getOperand(1));
291 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
292 return isConstant(I->getOperand(2));
293}
294
295/// Returns power-of-2 number of elements in a single register (part), given the
296/// total number of elements \p Size and number of registers (parts) \p
297/// NumParts.
298static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
299 return PowerOf2Ceil(divideCeil(Size, NumParts));
300}
301
302/// Returns the correct number of remaining elements, considering the total
303/// amount \p Size, the (power-of-2) number of elements in a single register
304/// \p PartNumElems, and the current register (part) \p Part.
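/// For example (illustrative numbers): with Size == 6 and NumParts == 2,
/// getPartNumElems returns PowerOf2Ceil(3) == 4, and getNumElems yields 4
/// elements for part 0 and the remaining 2 elements for part 1.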
305static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
306 unsigned Part) {
307 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
308}
309
310#if !defined(NDEBUG)
311/// Print a short descriptor of the instruction bundle suitable for debug output.
312static std::string shortBundleName(ArrayRef<Value *> VL) {
313 std::string Result;
314 raw_string_ostream OS(Result);
315 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
316 OS.flush();
317 return Result;
318}
319#endif
320
321/// \returns true if all of the instructions in \p VL are in the same block or
322/// false otherwise.
324 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
325 if (!I0)
326 return false;
328 return true;
329
330 BasicBlock *BB = I0->getParent();
331 for (int I = 1, E = VL.size(); I < E; I++) {
332 auto *II = dyn_cast<Instruction>(VL[I]);
333 if (!II)
334 return false;
335
336 if (BB != II->getParent())
337 return false;
338 }
339 return true;
340}
341
342/// \returns True if all of the values in \p VL are constants (but not
343/// globals/constant expressions).
345 // Constant expressions and globals can't be vectorized like normal integer/FP
346 // constants.
347 return all_of(VL, isConstant);
348}
349
350/// \returns True if all of the values in \p VL are identical or some of them
351/// are UndefValue.
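/// For example (illustrative): {%a, undef, %a} is a splat of %a, while a list
/// consisting only of UndefValues is not considered a splat, since there is no
/// defined value to broadcast.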
352static bool isSplat(ArrayRef<Value *> VL) {
353 Value *FirstNonUndef = nullptr;
354 for (Value *V : VL) {
355 if (isa<UndefValue>(V))
356 continue;
357 if (!FirstNonUndef) {
358 FirstNonUndef = V;
359 continue;
360 }
361 if (V != FirstNonUndef)
362 return false;
363 }
364 return FirstNonUndef != nullptr;
365}
366
367/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
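/// For example (illustrative IR): in
///   %d = sub i32 %x, %y
///   %c = icmp eq i32 %d, 0
/// the operands of the sub may be treated as commutable, because swapping them
/// does not change the result of the eq/ne comparison against zero; the same
/// holds when the only users are matching @llvm.abs calls.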
369 if (auto *Cmp = dyn_cast<CmpInst>(I))
370 return Cmp->isCommutative();
371 if (auto *BO = dyn_cast<BinaryOperator>(I))
372 return BO->isCommutative() ||
373 (BO->getOpcode() == Instruction::Sub &&
374 !BO->hasNUsesOrMore(UsesLimit) &&
375 all_of(
376 BO->uses(),
377 [](const Use &U) {
378 // Commutative, if icmp eq/ne sub, 0
379 ICmpInst::Predicate Pred;
380 if (match(U.getUser(),
381 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
382 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
383 return true;
384 // Commutative, if abs(sub nsw, true) or abs(sub, false).
385 ConstantInt *Flag;
386 return match(U.getUser(),
387 m_Intrinsic<Intrinsic::abs>(
388 m_Specific(U.get()), m_ConstantInt(Flag))) &&
389 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
390 Flag->isOne());
391 })) ||
392 (BO->getOpcode() == Instruction::FSub &&
393 !BO->hasNUsesOrMore(UsesLimit) &&
394 all_of(BO->uses(), [](const Use &U) {
395 return match(U.getUser(),
396 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
397 }));
398 return I->isCommutative();
399}
400
401template <typename T>
402static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
403 unsigned Offset) {
404 static_assert(std::is_same_v<T, InsertElementInst> ||
405 std::is_same_v<T, ExtractElementInst>,
406 "unsupported T");
407 int Index = Offset;
408 if (const auto *IE = dyn_cast<T>(Inst)) {
409 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
410 if (!VT)
411 return std::nullopt;
412 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
413 if (!CI)
414 return std::nullopt;
415 if (CI->getValue().uge(VT->getNumElements()))
416 return std::nullopt;
417 Index *= VT->getNumElements();
418 Index += CI->getZExtValue();
419 return Index;
420 }
421 return std::nullopt;
422}
423
424/// \returns inserting or extracting index of InsertElement, ExtractElement or
425/// InsertValue instruction, using Offset as base offset for index.
426/// \returns std::nullopt if the index is not an immediate.
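/// For example (illustrative): for
///   insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2
/// the returned flattened index is 1 * 3 + 2 == 5, i.e. the indices are
/// linearized level by level across the homogeneous aggregate.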
427static std::optional<unsigned> getElementIndex(const Value *Inst,
428 unsigned Offset = 0) {
429 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
430 return Index;
431 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
432 return Index;
433
434 int Index = Offset;
435
436 const auto *IV = dyn_cast<InsertValueInst>(Inst);
437 if (!IV)
438 return std::nullopt;
439
440 Type *CurrentType = IV->getType();
441 for (unsigned I : IV->indices()) {
442 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
443 Index *= ST->getNumElements();
444 CurrentType = ST->getElementType(I);
445 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
446 Index *= AT->getNumElements();
447 CurrentType = AT->getElementType();
448 } else {
449 return std::nullopt;
450 }
451 Index += I;
452 }
453 return Index;
454}
455
456namespace {
457/// Specifies the way the mask should be analyzed for undefs/poisonous elements
458/// in the shuffle mask.
459enum class UseMask {
460 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
461 ///< check for the mask elements for the first argument (mask
462 ///< indices are in range [0:VF)).
463 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
464 ///< for the mask elements for the second argument (mask indices
465 ///< are in range [VF:2*VF))
466 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
467 ///< future shuffle elements and mark them as being used in the
468 ///< future. Non-undef elements are considered unused since
469 ///< they are already marked as used in the mask.
470};
471} // namespace
472
473/// Prepares a use bitset for the given mask either for the first argument or
474/// for the second.
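/// For example (made-up values): with VF == 4, Mask == {0, 5, 1, -1} and
/// UseMask::FirstArg, the bits for lanes 0 and 1 are cleared because the mask
/// references those lanes of the first argument, index 5 is ignored since it
/// refers to the second argument, and bits 2 and 3 remain set (lanes not
/// referenced by the mask).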
476 UseMask MaskArg) {
477 SmallBitVector UseMask(VF, true);
478 for (auto [Idx, Value] : enumerate(Mask)) {
479 if (Value == PoisonMaskElem) {
480 if (MaskArg == UseMask::UndefsAsMask)
481 UseMask.reset(Idx);
482 continue;
483 }
484 if (MaskArg == UseMask::FirstArg && Value < VF)
485 UseMask.reset(Value);
486 else if (MaskArg == UseMask::SecondArg && Value >= VF)
487 UseMask.reset(Value - VF);
488 }
489 return UseMask;
490}
491
492/// Checks if the given value is actually an undefined constant vector.
493/// Also, if the \p UseMask is not empty, tries to check if the non-masked
494/// elements actually mask the insertelement buildvector, if any.
495template <bool IsPoisonOnly = false>
497 const SmallBitVector &UseMask = {}) {
498 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
499 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
500 if (isa<T>(V))
501 return Res;
502 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
503 if (!VecTy)
504 return Res.reset();
505 auto *C = dyn_cast<Constant>(V);
506 if (!C) {
507 if (!UseMask.empty()) {
508 const Value *Base = V;
509 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
510 Base = II->getOperand(0);
511 if (isa<T>(II->getOperand(1)))
512 continue;
513 std::optional<unsigned> Idx = getElementIndex(II);
514 if (!Idx) {
515 Res.reset();
516 return Res;
517 }
518 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
519 Res.reset(*Idx);
520 }
521 // TODO: Add analysis for shuffles here too.
522 if (V == Base) {
523 Res.reset();
524 } else {
525 SmallBitVector SubMask(UseMask.size(), false);
526 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
527 }
528 } else {
529 Res.reset();
530 }
531 return Res;
532 }
533 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
534 if (Constant *Elem = C->getAggregateElement(I))
535 if (!isa<T>(Elem) &&
536 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
537 Res.reset(I);
538 }
539 return Res;
540}
541
542/// Checks if the vector of instructions can be represented as a shuffle, like:
543/// %x0 = extractelement <4 x i8> %x, i32 0
544/// %x3 = extractelement <4 x i8> %x, i32 3
545/// %y1 = extractelement <4 x i8> %y, i32 1
546/// %y2 = extractelement <4 x i8> %y, i32 2
547/// %x0x0 = mul i8 %x0, %x0
548/// %x3x3 = mul i8 %x3, %x3
549/// %y1y1 = mul i8 %y1, %y1
550/// %y2y2 = mul i8 %y2, %y2
551/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
552/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
553/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
554/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
555/// ret <4 x i8> %ins4
556/// can be transformed into:
557/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
558/// i32 6>
559/// %2 = mul <4 x i8> %1, %1
560/// ret <4 x i8> %2
561/// Mask will return the Shuffle Mask equivalent to the extracted elements.
562/// TODO: Can we split off and reuse the shuffle mask detection from
563/// ShuffleVectorInst/getShuffleCost?
564static std::optional<TargetTransformInfo::ShuffleKind>
566 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
567 if (It == VL.end())
568 return std::nullopt;
569 unsigned Size =
570 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
571 auto *EI = dyn_cast<ExtractElementInst>(V);
572 if (!EI)
573 return S;
574 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
575 if (!VTy)
576 return S;
577 return std::max(S, VTy->getNumElements());
578 });
579
580 Value *Vec1 = nullptr;
581 Value *Vec2 = nullptr;
582 bool HasNonUndefVec = any_of(VL, [](Value *V) {
583 auto *EE = dyn_cast<ExtractElementInst>(V);
584 if (!EE)
585 return false;
586 Value *Vec = EE->getVectorOperand();
587 if (isa<UndefValue>(Vec))
588 return false;
589 return isGuaranteedNotToBePoison(Vec);
590 });
591 enum ShuffleMode { Unknown, Select, Permute };
592 ShuffleMode CommonShuffleMode = Unknown;
593 Mask.assign(VL.size(), PoisonMaskElem);
594 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
595 // Undef can be represented as an undef element in a vector.
596 if (isa<UndefValue>(VL[I]))
597 continue;
598 auto *EI = cast<ExtractElementInst>(VL[I]);
599 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
600 return std::nullopt;
601 auto *Vec = EI->getVectorOperand();
602 // We can extractelement from undef or poison vector.
603 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
604 continue;
605 // All vector operands must have the same number of vector elements.
606 if (isa<UndefValue>(Vec)) {
607 Mask[I] = I;
608 } else {
609 if (isa<UndefValue>(EI->getIndexOperand()))
610 continue;
611 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
612 if (!Idx)
613 return std::nullopt;
614 // Undefined behavior if Idx is negative or >= Size.
615 if (Idx->getValue().uge(Size))
616 continue;
617 unsigned IntIdx = Idx->getValue().getZExtValue();
618 Mask[I] = IntIdx;
619 }
620 if (isUndefVector(Vec).all() && HasNonUndefVec)
621 continue;
622 // For correct shuffling we have to have at most 2 different vector operands
623 // in all extractelement instructions.
624 if (!Vec1 || Vec1 == Vec) {
625 Vec1 = Vec;
626 } else if (!Vec2 || Vec2 == Vec) {
627 Vec2 = Vec;
628 Mask[I] += Size;
629 } else {
630 return std::nullopt;
631 }
632 if (CommonShuffleMode == Permute)
633 continue;
634 // If the extract index is not the same as the operation number, it is a
635 // permutation.
636 if (Mask[I] % Size != I) {
637 CommonShuffleMode = Permute;
638 continue;
639 }
640 CommonShuffleMode = Select;
641 }
642 // If we're not crossing lanes in different vectors, consider it as blending.
643 if (CommonShuffleMode == Select && Vec2)
645 // If Vec2 was never used, we have a permutation of a single vector, otherwise
646 // we have permutation of 2 vectors.
649}
650
651/// \returns True if Extract{Value,Element} instruction extracts element Idx.
652static std::optional<unsigned> getExtractIndex(Instruction *E) {
653 unsigned Opcode = E->getOpcode();
654 assert((Opcode == Instruction::ExtractElement ||
655 Opcode == Instruction::ExtractValue) &&
656 "Expected extractelement or extractvalue instruction.");
657 if (Opcode == Instruction::ExtractElement) {
658 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
659 if (!CI)
660 return std::nullopt;
661 return CI->getZExtValue();
662 }
663 auto *EI = cast<ExtractValueInst>(E);
664 if (EI->getNumIndices() != 1)
665 return std::nullopt;
666 return *EI->idx_begin();
667}
668
669namespace {
670
671/// Main data required for vectorization of instructions.
672struct InstructionsState {
673 /// The very first instruction in the list with the main opcode.
674 Value *OpValue = nullptr;
675
676 /// The main/alternate instruction.
677 Instruction *MainOp = nullptr;
678 Instruction *AltOp = nullptr;
679
680 /// The main/alternate opcodes for the list of instructions.
681 unsigned getOpcode() const {
682 return MainOp ? MainOp->getOpcode() : 0;
683 }
684
685 unsigned getAltOpcode() const {
686 return AltOp ? AltOp->getOpcode() : 0;
687 }
688
689 /// Some of the instructions in the list have alternate opcodes.
690 bool isAltShuffle() const { return AltOp != MainOp; }
691
692 bool isOpcodeOrAlt(Instruction *I) const {
693 unsigned CheckedOpcode = I->getOpcode();
694 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
695 }
696
697 InstructionsState() = delete;
698 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
699 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
700};
701
702} // end anonymous namespace
703
704/// Chooses the correct key for scheduling data. If \p Op has the same (or
705/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
706/// OpValue.
707static Value *isOneOf(const InstructionsState &S, Value *Op) {
708 auto *I = dyn_cast<Instruction>(Op);
709 if (I && S.isOpcodeOrAlt(I))
710 return Op;
711 return S.OpValue;
712}
713
714/// \returns true if \p Opcode is allowed as part of the main/alternate
715/// instruction for SLP vectorization.
716///
717/// Example of unsupported opcode is SDIV that can potentially cause UB if the
718/// "shuffled out" lane would result in division by zero.
719static bool isValidForAlternation(unsigned Opcode) {
720 if (Instruction::isIntDivRem(Opcode))
721 return false;
722
723 return true;
724}
725
726static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
727 const TargetLibraryInfo &TLI,
728 unsigned BaseIndex = 0);
729
730/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
731/// compatible instructions or constants, or just some other regular values.
732static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
733 Value *Op1, const TargetLibraryInfo &TLI) {
734 return (isConstant(BaseOp0) && isConstant(Op0)) ||
735 (isConstant(BaseOp1) && isConstant(Op1)) ||
736 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
737 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
738 BaseOp0 == Op0 || BaseOp1 == Op1 ||
739 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
740 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
741}
742
743/// \returns true if a compare instruction \p CI has similar "look" and
744/// same predicate as \p BaseCI, "as is" or with its operands and predicate
745/// swapped, false otherwise.
746static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
747 const TargetLibraryInfo &TLI) {
748 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
749 "Assessing comparisons of different types?");
750 CmpInst::Predicate BasePred = BaseCI->getPredicate();
751 CmpInst::Predicate Pred = CI->getPredicate();
753
754 Value *BaseOp0 = BaseCI->getOperand(0);
755 Value *BaseOp1 = BaseCI->getOperand(1);
756 Value *Op0 = CI->getOperand(0);
757 Value *Op1 = CI->getOperand(1);
758
759 return (BasePred == Pred &&
760 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
761 (BasePred == SwappedPred &&
762 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
763}
764
765/// \returns analysis of the Instructions in \p VL described in
766/// InstructionsState: the opcode with which we suppose the whole list
767/// could be vectorized even if its structure is diverse.
768static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
769 const TargetLibraryInfo &TLI,
770 unsigned BaseIndex) {
771 // Make sure these are all Instructions.
772 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
773 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
774
775 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
776 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
777 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
778 CmpInst::Predicate BasePred =
779 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
781 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
782 unsigned AltOpcode = Opcode;
783 unsigned AltIndex = BaseIndex;
784
785 bool SwappedPredsCompatible = [&]() {
786 if (!IsCmpOp)
787 return false;
788 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
789 UniquePreds.insert(BasePred);
790 UniqueNonSwappedPreds.insert(BasePred);
791 for (Value *V : VL) {
792 auto *I = dyn_cast<CmpInst>(V);
793 if (!I)
794 return false;
795 CmpInst::Predicate CurrentPred = I->getPredicate();
796 CmpInst::Predicate SwappedCurrentPred =
797 CmpInst::getSwappedPredicate(CurrentPred);
798 UniqueNonSwappedPreds.insert(CurrentPred);
799 if (!UniquePreds.contains(CurrentPred) &&
800 !UniquePreds.contains(SwappedCurrentPred))
801 UniquePreds.insert(CurrentPred);
802 }
803 // If the total number of predicates is > 2, but only 2 remain once swapped
804 // predicates are treated as compatible, consider swappable predicates as
805 // compatible opcodes rather than as alternates.
806 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
807 }();
808 // Check for one alternate opcode from another BinaryOperator.
809 // TODO - generalize to support all operators (types, calls etc.).
810 auto *IBase = cast<Instruction>(VL[BaseIndex]);
811 Intrinsic::ID BaseID = 0;
812 SmallVector<VFInfo> BaseMappings;
813 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
815 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
816 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
817 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
818 }
819 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
820 auto *I = cast<Instruction>(VL[Cnt]);
821 unsigned InstOpcode = I->getOpcode();
822 if (IsBinOp && isa<BinaryOperator>(I)) {
823 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
824 continue;
825 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
826 isValidForAlternation(Opcode)) {
827 AltOpcode = InstOpcode;
828 AltIndex = Cnt;
829 continue;
830 }
831 } else if (IsCastOp && isa<CastInst>(I)) {
832 Value *Op0 = IBase->getOperand(0);
833 Type *Ty0 = Op0->getType();
834 Value *Op1 = I->getOperand(0);
835 Type *Ty1 = Op1->getType();
836 if (Ty0 == Ty1) {
837 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
838 continue;
839 if (Opcode == AltOpcode) {
841 isValidForAlternation(InstOpcode) &&
842 "Cast isn't safe for alternation, logic needs to be updated!");
843 AltOpcode = InstOpcode;
844 AltIndex = Cnt;
845 continue;
846 }
847 }
848 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
849 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
850 Type *Ty0 = BaseInst->getOperand(0)->getType();
851 Type *Ty1 = Inst->getOperand(0)->getType();
852 if (Ty0 == Ty1) {
853 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
854 // Check for compatible operands. If the corresponding operands are not
855 // compatible, we need to perform alternate vectorization.
856 CmpInst::Predicate CurrentPred = Inst->getPredicate();
857 CmpInst::Predicate SwappedCurrentPred =
858 CmpInst::getSwappedPredicate(CurrentPred);
859
860 if ((E == 2 || SwappedPredsCompatible) &&
861 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
862 continue;
863
864 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
865 continue;
866 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
867 if (AltIndex != BaseIndex) {
868 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
869 continue;
870 } else if (BasePred != CurrentPred) {
871 assert(
872 isValidForAlternation(InstOpcode) &&
873 "CmpInst isn't safe for alternation, logic needs to be updated!");
874 AltIndex = Cnt;
875 continue;
876 }
877 CmpInst::Predicate AltPred = AltInst->getPredicate();
878 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
879 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
880 continue;
881 }
882 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
883 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
884 if (Gep->getNumOperands() != 2 ||
885 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
886 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
887 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
889 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
890 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
891 auto *BaseLI = cast<LoadInst>(IBase);
892 if (!LI->isSimple() || !BaseLI->isSimple())
893 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
894 } else if (auto *Call = dyn_cast<CallInst>(I)) {
895 auto *CallBase = cast<CallInst>(IBase);
896 if (Call->getCalledFunction() != CallBase->getCalledFunction())
897 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
898 if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() ||
899 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
900 Call->op_begin() + Call->getBundleOperandsEndIndex(),
901 CallBase->op_begin() +
903 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
905 if (ID != BaseID)
906 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
907 if (!ID) {
908 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
909 if (Mappings.size() != BaseMappings.size() ||
910 Mappings.front().ISA != BaseMappings.front().ISA ||
911 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
912 Mappings.front().VectorName != BaseMappings.front().VectorName ||
913 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
914 Mappings.front().Shape.Parameters !=
915 BaseMappings.front().Shape.Parameters)
916 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
917 }
918 }
919 continue;
920 }
921 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
922 }
923
924 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
925 cast<Instruction>(VL[AltIndex]));
926}
927
928/// \returns true if all of the values in \p VL have the same type or false
929/// otherwise.
931 Type *Ty = VL.front()->getType();
932 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
933}
934
935/// \returns True if in-tree use also needs extract. This refers to
936/// a possible scalar operand in a vectorized instruction.
937static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
938 TargetLibraryInfo *TLI) {
939 unsigned Opcode = UserInst->getOpcode();
940 switch (Opcode) {
941 case Instruction::Load: {
942 LoadInst *LI = cast<LoadInst>(UserInst);
943 return (LI->getPointerOperand() == Scalar);
944 }
945 case Instruction::Store: {
946 StoreInst *SI = cast<StoreInst>(UserInst);
947 return (SI->getPointerOperand() == Scalar);
948 }
949 case Instruction::Call: {
950 CallInst *CI = cast<CallInst>(UserInst);
952 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
953 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
954 Arg.value().get() == Scalar;
955 });
956 }
957 default:
958 return false;
959 }
960}
961
962/// \returns the AA location that is being accessed by the instruction.
964 if (StoreInst *SI = dyn_cast<StoreInst>(I))
965 return MemoryLocation::get(SI);
966 if (LoadInst *LI = dyn_cast<LoadInst>(I))
967 return MemoryLocation::get(LI);
968 return MemoryLocation();
969}
970
971/// \returns True if the instruction is not a volatile or atomic load/store.
972static bool isSimple(Instruction *I) {
973 if (LoadInst *LI = dyn_cast<LoadInst>(I))
974 return LI->isSimple();
975 if (StoreInst *SI = dyn_cast<StoreInst>(I))
976 return SI->isSimple();
977 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
978 return !MI->isVolatile();
979 return true;
980}
981
982/// Shuffles \p Mask in accordance with the given \p SubMask.
983/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
984/// one but two input vectors.
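/// For example (made-up values): with Mask == {1, 0, 3, 2} and
/// SubMask == {2, 3, 0, 1}, the result is {3, 2, 1, 0}, i.e. the two
/// permutations are composed as NewMask[I] = Mask[SubMask[I]].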
985static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
986 bool ExtendingManyInputs = false) {
987 if (SubMask.empty())
988 return;
989 assert(
990 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
991 // Check if input scalars were extended to match the size of other node.
992 (SubMask.size() == Mask.size() &&
993 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
994 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
995 "SubMask with many inputs support must be larger than the mask.");
996 if (Mask.empty()) {
997 Mask.append(SubMask.begin(), SubMask.end());
998 return;
999 }
1000 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1001 int TermValue = std::min(Mask.size(), SubMask.size());
1002 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1003 if (SubMask[I] == PoisonMaskElem ||
1004 (!ExtendingManyInputs &&
1005 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1006 continue;
1007 NewMask[I] = Mask[SubMask[I]];
1008 }
1009 Mask.swap(NewMask);
1010}
1011
1012/// Order may have elements assigned the special value (size) which is out of
1013/// bounds. Such indices only appear in places which correspond to undef values
1014/// (see canReuseExtract for details) and are used to avoid undef values
1015/// having an effect on operand ordering.
1016/// The first loop below simply finds all unused indices and then the next loop
1017/// nest assigns these indices for undef values positions.
1018/// As an example below Order has two undef positions and they have assigned
1019/// values 3 and 7 respectively:
1020/// before: 6 9 5 4 9 2 1 0
1021/// after: 6 3 5 4 7 2 1 0
1023 const unsigned Sz = Order.size();
1024 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1025 SmallBitVector MaskedIndices(Sz);
1026 for (unsigned I = 0; I < Sz; ++I) {
1027 if (Order[I] < Sz)
1028 UnusedIndices.reset(Order[I]);
1029 else
1030 MaskedIndices.set(I);
1031 }
1032 if (MaskedIndices.none())
1033 return;
1034 assert(UnusedIndices.count() == MaskedIndices.count() &&
1035 "Non-synced masked/available indices.");
1036 int Idx = UnusedIndices.find_first();
1037 int MIdx = MaskedIndices.find_first();
1038 while (MIdx >= 0) {
1039 assert(Idx >= 0 && "Indices must be synced.");
1040 Order[MIdx] = Idx;
1041 Idx = UnusedIndices.find_next(Idx);
1042 MIdx = MaskedIndices.find_next(MIdx);
1043 }
1044}
1045
1046/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1047/// Opcode1.
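/// For example (illustrative): for VL == {add, sub, add, sub} with
/// Opcode0 == Add and Opcode1 == Sub, the returned bitset has bits 1 and 3
/// set, selecting the lanes that use the alternate (Sub) opcode.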
1049 unsigned Opcode1) {
1050 SmallBitVector OpcodeMask(VL.size(), false);
1051 for (unsigned Lane : seq<unsigned>(VL.size()))
1052 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1053 OpcodeMask.set(Lane);
1054 return OpcodeMask;
1055}
1056
1057namespace llvm {
1058
1060 SmallVectorImpl<int> &Mask) {
1061 Mask.clear();
1062 const unsigned E = Indices.size();
1063 Mask.resize(E, PoisonMaskElem);
1064 for (unsigned I = 0; I < E; ++I)
1065 Mask[Indices[I]] = I;
1066}
1067
1068/// Reorders the list of scalars in accordance with the given \p Mask.
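/// For example (made-up values): with Scalars == {a, b, c, d} and
/// Mask == {2, 0, 1, 3}, the result is {b, c, a, d}, since each element is
/// moved to the position given by its mask entry (Scalars[Mask[I]] = Prev[I]).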
1070 ArrayRef<int> Mask) {
1071 assert(!Mask.empty() && "Expected non-empty mask.");
1072 SmallVector<Value *> Prev(Scalars.size(),
1073 PoisonValue::get(Scalars.front()->getType()));
1074 Prev.swap(Scalars);
1075 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1076 if (Mask[I] != PoisonMaskElem)
1077 Scalars[Mask[I]] = Prev[I];
1078}
1079
1080/// Checks if the provided value does not require scheduling. It does not
1081/// require scheduling if this is not an instruction or it is an instruction
1082/// that does not read/write memory and all operands are either not instructions,
1083/// or are phi nodes, or are instructions from different blocks.
1085 auto *I = dyn_cast<Instruction>(V);
1086 if (!I)
1087 return true;
1088 return !mayHaveNonDefUseDependency(*I) &&
1089 all_of(I->operands(), [I](Value *V) {
1090 auto *IO = dyn_cast<Instruction>(V);
1091 if (!IO)
1092 return true;
1093 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1094 });
1095}
1096
1097/// Checks if the provided value does not require scheduling. It does not
1098/// require scheduling if this is not an instruction or it is an instruction
1099/// that does not read/write memory and all users are phi nodes or instructions
1100/// from different blocks.
1101static bool isUsedOutsideBlock(Value *V) {
1102 auto *I = dyn_cast<Instruction>(V);
1103 if (!I)
1104 return true;
1105 // Limits the number of uses to save compile time.
1106 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1107 all_of(I->users(), [I](User *U) {
1108 auto *IU = dyn_cast<Instruction>(U);
1109 if (!IU)
1110 return true;
1111 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1112 });
1113}
1114
1115/// Checks if the specified value does not require scheduling. It does not
1116/// require scheduling if all operands and all users do not need to be scheduled
1117/// in the current basic block.
1120}
1121
1122/// Checks if the specified array of instructions does not require scheduling.
1123/// It is so if, for every instruction, either its operands do not require
1124/// scheduling or its users do not require scheduling because they are phis or
1125/// live in other basic blocks.
1127 return !VL.empty() &&
1129}
1130
1131namespace slpvectorizer {
1132
1133/// Bottom Up SLP Vectorizer.
1134class BoUpSLP {
1135 struct TreeEntry;
1136 struct ScheduleData;
1139
1140public:
1141 /// Tracks the state in which we can represent the loads in the given sequence.
1142 enum class LoadsState {
1143 Gather,
1144 Vectorize,
1147 };
1148
1156
1158 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1161 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1162 AC(AC), DB(DB), DL(DL), ORE(ORE),
1163 Builder(Se->getContext(), TargetFolder(*DL)) {
1164 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1165 // Use the vector register size specified by the target unless overridden
1166 // by a command-line option.
1167 // TODO: It would be better to limit the vectorization factor based on
1168 // data type rather than just register size. For example, x86 AVX has
1169 // 256-bit registers, but it does not support integer operations
1170 // at that width (that requires AVX2).
1171 if (MaxVectorRegSizeOption.getNumOccurrences())
1172 MaxVecRegSize = MaxVectorRegSizeOption;
1173 else
1174 MaxVecRegSize =
1176 .getFixedValue();
1177
1178 if (MinVectorRegSizeOption.getNumOccurrences())
1179 MinVecRegSize = MinVectorRegSizeOption;
1180 else
1181 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1182 }
1183
1184 /// Vectorize the tree that starts with the elements in \p VL.
1185 /// Returns the vectorized root.
1187
1188 /// Vectorize the tree but with the list of externally used values \p
1189 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1190 /// generated extractvalue instructions.
1191 /// \param ReplacedExternals contains the list of replaced external values
1192 /// {scalar, replacement} after emitting extractelement for external uses.
1193 Value *
1194 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1195 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1196 Instruction *ReductionRoot = nullptr);
1197
1198 /// \returns the cost incurred by unwanted spills and fills, caused by
1199 /// holding live values over call sites.
1201
1202 /// \returns the vectorization cost of the subtree that starts at \p VL.
1203 /// A negative number means that this is profitable.
1204 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1205
1206 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1207 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1208 void buildTree(ArrayRef<Value *> Roots,
1209 const SmallDenseSet<Value *> &UserIgnoreLst);
1210
1211 /// Construct a vectorizable tree that starts at \p Roots.
1212 void buildTree(ArrayRef<Value *> Roots);
1213
1214 /// Returns whether the root node has in-tree uses.
1216 return !VectorizableTree.empty() &&
1217 !VectorizableTree.front()->UserTreeIndices.empty();
1218 }
1219
1220 /// Return the scalars of the root node.
1222 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1223 return VectorizableTree.front()->Scalars;
1224 }
1225
1226 /// Checks if the root graph node can be emitted with narrower bitwidth at
1227 /// codegen and returns its signedness, if so.
1229 return MinBWs.at(VectorizableTree.front().get()).second;
1230 }
1231
1232 /// Builds external uses of the vectorized scalars, i.e. the list of
1233 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1234 /// ExternallyUsedValues contains an additional list of external uses to handle
1235 /// vectorization of reductions.
1236 void
1237 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1238
1239 /// Transforms graph nodes to target specific representations, if profitable.
1240 void transformNodes();
1241
1242 /// Clear the internal data structures that are created by 'buildTree'.
1243 void deleteTree() {
1244 VectorizableTree.clear();
1245 ScalarToTreeEntry.clear();
1246 MultiNodeScalars.clear();
1247 MustGather.clear();
1248 NonScheduledFirst.clear();
1249 EntryToLastInstruction.clear();
1250 ExternalUses.clear();
1251 ExternalUsesAsGEPs.clear();
1252 for (auto &Iter : BlocksSchedules) {
1253 BlockScheduling *BS = Iter.second.get();
1254 BS->clear();
1255 }
1256 MinBWs.clear();
1257 ReductionBitWidth = 0;
1258 CastMaxMinBWSizes.reset();
1259 ExtraBitWidthNodes.clear();
1260 InstrElementSize.clear();
1261 UserIgnoreList = nullptr;
1262 PostponedGathers.clear();
1263 ValueToGatherNodes.clear();
1264 }
1265
1266 unsigned getTreeSize() const { return VectorizableTree.size(); }
1267
1268 /// Perform LICM and CSE on the newly generated gather sequences.
1270
1271 /// Checks if the specified gather tree entry \p TE can be represented as a
1272 /// shuffled vector entry + (possibly) permutation with other gathers. It
1273 /// implements the checks only for possibly ordered scalars (Loads,
1274 /// ExtractElement, ExtractValue), which can be part of the graph.
1275 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1276
1277 /// Sort loads into increasing pointer offsets to allow greater clustering.
1278 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1279
1280 /// Gets reordering data for the given tree entry. If the entry is vectorized
1281 /// - just return ReorderIndices, otherwise check if the scalars can be
1282 /// reordered and return the most optimal order.
1283 /// \return std::nullopt if ordering is not important, empty order, if
1284 /// identity order is important, or the actual order.
1285 /// \param TopToBottom If true, include the order of vectorized stores and
1286 /// insertelement nodes, otherwise skip them.
1287 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1288 bool TopToBottom);
1289
1290 /// Reorders the current graph to the most profitable order starting from the
1291 /// root node to the leaf nodes. The best order is chosen only from the nodes
1292 /// of the same size (vectorization factor). Smaller nodes are considered
1293 /// parts of a subgraph with a smaller VF and are reordered independently. We
1294 /// can do this because we still need to extend smaller nodes to the wider VF
1295 /// and we can merge the reordering shuffles with the widening shuffles.
1296 void reorderTopToBottom();
1297
1298 /// Reorders the current graph to the most profitable order starting from
1299 /// leaves to the root. It allows rotating small subgraphs and reduces the
1300 /// number of reshuffles if the leaf nodes use the same order. In this case we
1301 /// can merge the orders and just shuffle the user node instead of shuffling its
1302 /// operands. Moreover, even if the leaf nodes have different orders, it allows
1303 /// sinking the reordering in the graph closer to the root node and merging it
1304 /// later during analysis.
1305 void reorderBottomToTop(bool IgnoreReorder = false);
1306
1307 /// \return The vector element size in bits to use when vectorizing the
1308 /// expression tree ending at \p V. If V is a store, the size is the width of
1309 /// the stored value. Otherwise, the size is the width of the largest loaded
1310 /// value reaching V. This method is used by the vectorizer to calculate
1311 /// vectorization factors.
1312 unsigned getVectorElementSize(Value *V);
1313
1314 /// Compute the minimum type sizes required to represent the entries in a
1315 /// vectorizable tree.
1317
1318 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1319 unsigned getMaxVecRegSize() const {
1320 return MaxVecRegSize;
1321 }
1322
1323 // \returns minimum vector register size as set by cl::opt.
1324 unsigned getMinVecRegSize() const {
1325 return MinVecRegSize;
1326 }
1327
1328 unsigned getMinVF(unsigned Sz) const {
1329 return std::max(2U, getMinVecRegSize() / Sz);
1330 }
1331
1332 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1333 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1334 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1335 return MaxVF ? MaxVF : UINT_MAX;
1336 }
1337
1338 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1339 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1340 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1341 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1342 ///
1343 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1344 unsigned canMapToVector(Type *T) const;
1345
1346 /// \returns True if the VectorizableTree is both tiny and not fully
1347 /// vectorizable. We do not vectorize such trees.
1348 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1349
1350 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1351 /// can be load combined in the backend. Load combining may not be allowed in
1352 /// the IR optimizer, so we do not want to alter the pattern. For example,
1353 /// partially transforming a scalar bswap() pattern into vector code is
1354 /// effectively impossible for the backend to undo.
1355 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1356 /// may not be necessary.
1357 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1358
1359 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1360 /// can be load combined in the backend. Load combining may not be allowed in
1361 /// the IR optimizer, so we do not want to alter the pattern. For example,
1362 /// partially transforming a scalar bswap() pattern into vector code is
1363 /// effectively impossible for the backend to undo.
1364 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1365 /// may not be necessary.
1366 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1367
1368 /// Checks if the given array of loads can be represented as a vectorized
1369 /// load, a scatter, or just a simple gather.
1370 /// \param VL list of loads.
1371 /// \param VL0 main load value.
1372 /// \param Order returned order of load instructions.
1373 /// \param PointerOps returned list of pointer operands.
1374 /// \param TryRecursiveCheck used to check if long masked gather can be
1375 /// represented as a series of loads/insert subvector, if profitable.
1378 SmallVectorImpl<Value *> &PointerOps,
1379 bool TryRecursiveCheck = true) const;
1380
1382
1383 /// This structure holds any data we need about the edges being traversed
1384 /// during buildTree_rec(). We keep track of:
1385 /// (i) the user TreeEntry index, and
1386 /// (ii) the index of the edge.
1387 struct EdgeInfo {
1388 EdgeInfo() = default;
1389 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1391 /// The user TreeEntry.
1392 TreeEntry *UserTE = nullptr;
1393 /// The operand index of the use.
1394 unsigned EdgeIdx = UINT_MAX;
1395#ifndef NDEBUG
1397 const BoUpSLP::EdgeInfo &EI) {
1398 EI.dump(OS);
1399 return OS;
1400 }
1401 /// Debug print.
1402 void dump(raw_ostream &OS) const {
1403 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1404 << " EdgeIdx:" << EdgeIdx << "}";
1405 }
1406 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1407#endif
1408 bool operator == (const EdgeInfo &Other) const {
1409 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1410 }
1411 };
1412
1413 /// A helper class used for scoring candidates for two consecutive lanes.
1415 const TargetLibraryInfo &TLI;
1416 const DataLayout &DL;
1417 ScalarEvolution &SE;
1418 const BoUpSLP &R;
1419 int NumLanes; // Total number of lanes (aka vectorization factor).
1420 int MaxLevel; // The maximum recursion depth for accumulating score.
1421
1422 public:
1424 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1425 int MaxLevel)
1426 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1427 MaxLevel(MaxLevel) {}
1428
1429 // The hard-coded scores listed here are not very important, though they
1430 // should be higher for better matches to improve the resulting cost. When
1431 // computing the scores of matching one sub-tree with another, we are
1432 // basically counting the number of values that are matching. So even if all
1433 // scores are set to 1, we would still get a decent matching result.
1434 // However, sometimes we have to break ties. For example we may have to
1435 // choose between matching loads vs matching opcodes. This is what these
1436 // scores are helping us with: they provide the order of preference. Also,
1437 // this is important if the scalar is externally used or used in another
1438 // tree entry node in the different lane.
1439
1440 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1441 static const int ScoreConsecutiveLoads = 4;
1443 /// The same load multiple times. This should have a better score than
1444 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1445 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
1446 /// a vector load plus 1.0 for a broadcast.
1446 static const int ScoreSplatLoads = 3;
1447 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1448 static const int ScoreReversedLoads = 3;
1449 /// A load candidate for masked gather.
1450 static const int ScoreMaskedGatherCandidate = 1;
1451 /// ExtractElementInst from same vector and consecutive indexes.
1452 static const int ScoreConsecutiveExtracts = 4;
1453 /// ExtractElementInst from same vector and reversed indices.
1454 static const int ScoreReversedExtracts = 3;
1455 /// Constants.
1456 static const int ScoreConstants = 2;
1457 /// Instructions with the same opcode.
1458 static const int ScoreSameOpcode = 2;
1459 /// Instructions with alt opcodes (e.g., add + sub).
1460 static const int ScoreAltOpcodes = 1;
1461 /// Identical instructions (a.k.a. splat or broadcast).
1462 static const int ScoreSplat = 1;
1463 /// Matching with an undef is preferable to failing.
1464 static const int ScoreUndef = 1;
1465 /// Score for failing to find a decent match.
1466 static const int ScoreFail = 0;
1467 /// Score if all users are vectorized.
1468 static const int ScoreAllUserVectorized = 1;
1469
1470 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1471 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1472 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1473 /// MainAltOps.
1475 ArrayRef<Value *> MainAltOps) const {
1476 if (!isValidElementType(V1->getType()) ||
1477 !isValidElementType(V2->getType()))
1479
1480 if (V1 == V2) {
1481 if (isa<LoadInst>(V1)) {
1482 // Returns true if the users of V1 and V2 won't need to be extracted.
1483 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1484 // Bail out if we have too many uses to save compilation time.
1485 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1486 return false;
1487
1488 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1489 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1490 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1491 });
1492 };
1493 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1494 };
1495 // A broadcast of a load can be cheaper on some targets.
1496 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1497 ElementCount::getFixed(NumLanes)) &&
1498 ((int)V1->getNumUses() == NumLanes ||
1499 AllUsersAreInternal(V1, V2)))
1501 }
1503 }
1504
1505 auto CheckSameEntryOrFail = [&]() {
1506 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1507 TE1 && TE1 == R.getTreeEntry(V2))
1510 };
1511
1512 auto *LI1 = dyn_cast<LoadInst>(V1);
1513 auto *LI2 = dyn_cast<LoadInst>(V2);
1514 if (LI1 && LI2) {
1515 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1516 !LI2->isSimple())
1517 return CheckSameEntryOrFail();
1518
1519 std::optional<int> Dist = getPointersDiff(
1520 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1521 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1522 if (!Dist || *Dist == 0) {
1523 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1524 getUnderlyingObject(LI2->getPointerOperand()) &&
1525 R.TTI->isLegalMaskedGather(
1526 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1528 return CheckSameEntryOrFail();
1529 }
1530 // The distance is too large - still may be profitable to use masked
1531 // loads/gathers.
1532 if (std::abs(*Dist) > NumLanes / 2)
1534 // This will still detect consecutive loads, but we might have "holes"
1535 // in some cases. It is ok for non-power-of-2 vectorization and may produce
1536 // better results. It should not affect current vectorization.
1539 }
1540
1541 auto *C1 = dyn_cast<Constant>(V1);
1542 auto *C2 = dyn_cast<Constant>(V2);
1543 if (C1 && C2)
1545
1546 // Extracts from consecutive indexes of the same vector better score as
1547 // the extracts could be optimized away.
1548 Value *EV1;
1549 ConstantInt *Ex1Idx;
1550 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1551 // Undefs are always profitable for extractelements.
1552 // Compiler can easily combine poison and extractelement <non-poison> or
1553 // undef and extractelement <poison>. But combining undef +
1554 // extractelement <non-poison-but-may-produce-poison> requires some
1555 // extra operations.
1556 if (isa<UndefValue>(V2))
1557 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1558 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1559 : LookAheadHeuristics::ScoreSameOpcode;
1560 Value *EV2 = nullptr;
1561 ConstantInt *Ex2Idx = nullptr;
1562 if (match(V2,
1563 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1564 m_Undef())))) {
1565 // Undefs are always profitable for extractelements.
1566 if (!Ex2Idx)
1567 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1568 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1569 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1570 if (EV2 == EV1) {
1571 int Idx1 = Ex1Idx->getZExtValue();
1572 int Idx2 = Ex2Idx->getZExtValue();
1573 int Dist = Idx2 - Idx1;
1574 // The distance is too large - still may be profitable to use
1575 // shuffles.
1576 if (std::abs(Dist) == 0)
1577 return LookAheadHeuristics::ScoreSplat;
1578 if (std::abs(Dist) > NumLanes / 2)
1579 return LookAheadHeuristics::ScoreSameOpcode;
1580 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1581 : LookAheadHeuristics::ScoreReversedExtracts;
1582 }
1583 return LookAheadHeuristics::ScoreAltOpcodes;
1584 }
1585 return CheckSameEntryOrFail();
1586 }
1587
1588 auto *I1 = dyn_cast<Instruction>(V1);
1589 auto *I2 = dyn_cast<Instruction>(V2);
1590 if (I1 && I2) {
1591 if (I1->getParent() != I2->getParent())
1592 return CheckSameEntryOrFail();
1593 SmallVector<Value *, 4> Ops(MainAltOps);
1594 Ops.push_back(I1);
1595 Ops.push_back(I2);
1596 InstructionsState S = getSameOpcode(Ops, TLI);
1597 // Note: Only consider instructions with <= 2 operands to avoid
1598 // complexity explosion.
1599 if (S.getOpcode() &&
1600 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1601 !S.isAltShuffle()) &&
1602 all_of(Ops, [&S](Value *V) {
1603 return cast<Instruction>(V)->getNumOperands() ==
1604 S.MainOp->getNumOperands();
1605 }))
1606 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1607 : LookAheadHeuristics::ScoreSameOpcode;
1608 }
1609
1610 if (isa<UndefValue>(V2))
1611 return LookAheadHeuristics::ScoreUndef;
1612
1613 return CheckSameEntryOrFail();
1614 }
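// For illustration (hypothetical IR, assuming 4-byte i32 loads and default
// NumLanes >= 2):
//   %a = load i32, ptr %p
//   %b = load i32, ptr %q   ; where %q is %p plus 4 bytes
// getPointersDiff() reports a distance of 1 element, so getShallowScore(%a, %b,
// ...) yields ScoreConsecutiveLoads, two distinct constants yield
// ScoreConstants, and two unrelated values fall back to CheckSameEntryOrFail().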
1615
1616 /// Go through the operands of \p LHS and \p RHS recursively until
1617 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1618 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1619 /// of \p U1 and \p U2), except at the beginning of the recursion where
1620 /// these are set to nullptr.
1621 ///
1622 /// For example:
1623 /// \verbatim
1624 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1625 /// \ / \ / \ / \ /
1626 /// + + + +
1627 /// G1 G2 G3 G4
1628 /// \endverbatim
1629 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1630 /// each level recursively, accumulating the score. It starts from matching
1631 /// the additions at level 0, then moves on to the loads (level 1). The
1632 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1633 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1634 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1635 /// Please note that the order of the operands does not matter, as we
1636 /// evaluate the score of all profitable combinations of operands. In
1637 /// other words the score of G1 and G4 is the same as G1 and G2. This
1638 /// heuristic is based on ideas described in:
1639 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1640 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1641 /// Luís F. W. Góes
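/// As a rough, self-contained sketch of the same greedy pairwise recursion
/// (the names and the shallowScore() helper below are illustrative
/// placeholders, not the actual implementation):
/// \code{.cpp}
/// #include <vector>
/// struct Node { std::vector<Node *> Ops; };
/// int shallowScore(const Node *A, const Node *B); // stand-in for getShallowScore()
/// int scoreAtLevelRec(const Node *A, const Node *B, int Level, int MaxLevel) {
///   int Score = shallowScore(A, B);
///   if (Level == MaxLevel || Score == 0 || A->Ops.empty() || B->Ops.empty())
///     return Score;
///   std::vector<bool> Used(B->Ops.size(), false);
///   for (Node *OpA : A->Ops) { // greedily pair each operand of A ...
///     int Best = 0;
///     size_t BestIdx = 0;
///     for (size_t I = 0; I < B->Ops.size(); ++I) { // ... with B's best unused operand
///       if (Used[I])
///         continue;
///       int S = scoreAtLevelRec(OpA, B->Ops[I], Level + 1, MaxLevel);
///       if (S > Best) { Best = S; BestIdx = I; }
///     }
///     if (Best > 0) { Used[BestIdx] = true; Score += Best; }
///   }
///   return Score;
/// }
/// \endcode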
1642 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1643 Instruction *U2, int CurrLevel,
1644 ArrayRef<Value *> MainAltOps) const {
1645
1646 // Get the shallow score of LHS and RHS.
1647 int ShallowScoreAtThisLevel =
1648 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1649
1650 // If reached MaxLevel,
1651 // or if LHS and RHS are not instructions,
1652 // or if they are SPLAT,
1653 // or if they are not consecutive,
1654 // or if profitable to vectorize loads or extractelements, early return
1655 // the current cost.
1656 auto *I1 = dyn_cast<Instruction>(LHS);
1657 auto *I2 = dyn_cast<Instruction>(RHS);
1658 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1659 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1660 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1661 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1662 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1663 ShallowScoreAtThisLevel))
1664 return ShallowScoreAtThisLevel;
1665 assert(I1 && I2 && "Should have early exited.");
1666
1667 // Contains the I2 operand indexes that got matched with I1 operands.
1668 SmallSet<unsigned, 4> Op2Used;
1669
1670 // Recursion towards the operands of I1 and I2. We are trying all possible
1671 // operand pairs, and keeping track of the best score.
1672 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1673 OpIdx1 != NumOperands1; ++OpIdx1) {
1674 // Try to pair operand OpIdx1 of I1 with the best operand of I2.
1675 int MaxTmpScore = 0;
1676 unsigned MaxOpIdx2 = 0;
1677 bool FoundBest = false;
1678 // If I2 is commutative try all combinations.
1679 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1680 unsigned ToIdx = isCommutative(I2)
1681 ? I2->getNumOperands()
1682 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1683 assert(FromIdx <= ToIdx && "Bad index");
1684 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1685 // Skip operands already paired with OpIdx1.
1686 if (Op2Used.count(OpIdx2))
1687 continue;
1688 // Recursively calculate the cost at each level
1689 int TmpScore =
1690 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1691 I1, I2, CurrLevel + 1, std::nullopt);
1692 // Look for the best score.
1693 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1694 TmpScore > MaxTmpScore) {
1695 MaxTmpScore = TmpScore;
1696 MaxOpIdx2 = OpIdx2;
1697 FoundBest = true;
1698 }
1699 }
1700 if (FoundBest) {
1701 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1702 Op2Used.insert(MaxOpIdx2);
1703 ShallowScoreAtThisLevel += MaxTmpScore;
1704 }
1705 }
1706 return ShallowScoreAtThisLevel;
1707 }
1708 };
1709 /// A helper data structure to hold the operands of a vector of instructions.
1710 /// This supports a fixed vector length for all operand vectors.
1711 class VLOperands {
1712 /// For each operand we need (i) the value, and (ii) the opcode that it
1713 /// would be attached to if the expression was in a left-linearized form.
1714 /// This is required to avoid illegal operand reordering.
1715 /// For example:
1716 /// \verbatim
1717 /// 0 Op1
1718 /// |/
1719 /// Op1 Op2 Linearized + Op2
1720 /// \ / ----------> |/
1721 /// - -
1722 ///
1723 /// Op1 - Op2 (0 + Op1) - Op2
1724 /// \endverbatim
1725 ///
1726 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1727 ///
1728 /// Another way to think of this is to track all the operations across the
1729 /// path from the operand all the way to the root of the tree and to
1730 /// calculate the operation that corresponds to this path. For example, the
1731 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1732 /// corresponding operation is a '-' (which matches the one in the
1733 /// linearized tree, as shown above).
1734 ///
1735 /// For lack of a better term, we refer to this operation as Accumulated
1736 /// Path Operation (APO).
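/// For instance (using the rule implemented in appendOperandsOfVL() below):
/// for X = A - B, operand A gets APO = false (the first operand is never
/// attached to an inverse operation in the left-linearized form) and operand
/// B gets APO = true (sub is non-commutative, so its RHS sits under the
/// inverse operation), whereas for Y = A + B both operands get APO = false.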
1737 struct OperandData {
1738 OperandData() = default;
1739 OperandData(Value *V, bool APO, bool IsUsed)
1740 : V(V), APO(APO), IsUsed(IsUsed) {}
1741 /// The operand value.
1742 Value *V = nullptr;
1743 /// TreeEntries only allow a single opcode, or an alternate sequence of
1744 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1745 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1746 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1747 /// (e.g., Add/Mul)
1748 bool APO = false;
1749 /// Helper data for the reordering function.
1750 bool IsUsed = false;
1751 };
1752
1753 /// During operand reordering, we are trying to select the operand at lane
1754 /// that matches best with the operand at the neighboring lane. Our
1755 /// selection is based on the type of value we are looking for. For example,
1756 /// if the neighboring lane has a load, we need to look for a load that is
1757 /// accessing a consecutive address. These strategies are summarized in the
1758 /// 'ReorderingMode' enumerator.
1759 enum class ReorderingMode {
1760 Load, ///< Matching loads to consecutive memory addresses
1761 Opcode, ///< Matching instructions based on opcode (same or alternate)
1762 Constant, ///< Matching constants
1763 Splat, ///< Matching the same instruction multiple times (broadcast)
1764 Failed, ///< We failed to create a vectorizable group
1765 };
1766
1767 using OperandDataVec = SmallVector<OperandData, 2>;
1768
1769 /// A vector of operand vectors.
1770 SmallVector<OperandDataVec, 4> OpsVec;
1771
1772 const TargetLibraryInfo &TLI;
1773 const DataLayout &DL;
1774 ScalarEvolution &SE;
1775 const BoUpSLP &R;
1776 const Loop *L = nullptr;
1777
1778 /// \returns the operand data at \p OpIdx and \p Lane.
1779 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1780 return OpsVec[OpIdx][Lane];
1781 }
1782
1783 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1784 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1785 return OpsVec[OpIdx][Lane];
1786 }
1787
1788 /// Clears the used flag for all entries.
1789 void clearUsed() {
1790 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1791 OpIdx != NumOperands; ++OpIdx)
1792 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1793 ++Lane)
1794 OpsVec[OpIdx][Lane].IsUsed = false;
1795 }
1796
1797 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1798 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1799 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1800 }
1801
1802 /// \param Lane lane of the operands under analysis.
1803 /// \param OpIdx operand index in lane \p Lane for which we're looking
1804 /// for the best candidate.
1805 /// \param Idx operand index of the current candidate value.
1806 /// \returns The additional score due to possible broadcasting of the
1807 /// elements in the lane. It is more profitable to have power-of-2 unique
1808 /// elements in the lane, since it will be vectorized with higher probability
1809 /// after removing duplicates. Currently the SLP vectorizer supports only
1810 /// vectorization of a power-of-2 number of unique scalars.
1811 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1812 Value *IdxLaneV = getData(Idx, Lane).V;
1813 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1814 return 0;
1815 SmallPtrSet<Value *, 4> Uniques;
1816 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1817 if (Ln == Lane)
1818 continue;
1819 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1820 if (!isa<Instruction>(OpIdxLnV))
1821 return 0;
1822 Uniques.insert(OpIdxLnV);
1823 }
1824 int UniquesCount = Uniques.size();
1825 int UniquesCntWithIdxLaneV =
1826 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1827 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1828 int UniquesCntWithOpIdxLaneV =
1829 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1830 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1831 return 0;
1832 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1833 UniquesCntWithOpIdxLaneV) -
1834 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1835 }
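  // Worked example for the formula above (illustrative numbers only): if the
  // other lanes already contribute 3 unique instructions, a candidate that is
  // already among them keeps the count at 3 (slack PowerOf2Ceil(3) - 3 = 1),
  // while the value currently at OpIdx would raise it to 4 (slack 0); the
  // result 0 - 1 = -1 then penalizes the candidate, steering the lane toward
  // a power-of-2 number of unique scalars.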
1836
1837 /// \param Lane lane of the operands under analysis.
1838 /// \param OpIdx operand index in lane \p Lane for which we're looking
1839 /// for the best candidate.
1840 /// \param Idx operand index of the current candidate value.
1841 /// \returns The additional score for the scalar whose users are all
1842 /// vectorized.
1843 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1844 Value *IdxLaneV = getData(Idx, Lane).V;
1845 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1846 // Do not care about number of uses for vector-like instructions
1847 // (extractelement/extractvalue with constant indices), they are extracts
1848 // themselves and already externally used. Vectorization of such
1849 /// instructions does not add an extra extractelement instruction, it may
1850 /// only remove one.
1851 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1852 isVectorLikeInstWithConstOps(OpIdxLaneV))
1853 return LookAheadHeuristics::ScoreAllUserVectorized;
1854 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1855 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1856 return 0;
1857 return R.areAllUsersVectorized(IdxLaneI)
1858 ? LookAheadHeuristics::ScoreAllUserVectorized
1859 : 0;
1860 }
1861
1862 /// Score scaling factor for fully compatible instructions but with
1863 /// different number of external uses. Allows better selection of the
1864 /// instructions with less external uses.
1865 static const int ScoreScaleFactor = 10;
1866
1867 /// \Returns the look-ahead score, which tells us how much the sub-trees
1868 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1869 /// score. This helps break ties in an informed way when we cannot decide on
1870 /// the order of the operands by just considering the immediate
1871 /// predecessors.
1872 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1873 int Lane, unsigned OpIdx, unsigned Idx,
1874 bool &IsUsed) {
1875 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1876 LookAheadMaxDepth);
1877 // Keep track of the instruction stack as we recurse into the operands
1878 // during the look-ahead score exploration.
1879 int Score =
1880 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1881 /*CurrLevel=*/1, MainAltOps);
1882 if (Score) {
1883 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1884 if (Score <= -SplatScore) {
1885 // Set the minimum score for splat-like sequence to avoid setting
1886 // failed state.
1887 Score = 1;
1888 } else {
1889 Score += SplatScore;
1890 // Scale score to see the difference between different operands
1891 // and similar operands but all vectorized/not all vectorized
1892 // uses. It does not affect actual selection of the best
1893 // compatible operand in general, just allows to select the
1894 // operand with all vectorized uses.
1895 Score *= ScoreScaleFactor;
1896 Score += getExternalUseScore(Lane, OpIdx, Idx);
1897 IsUsed = true;
1898 }
1899 }
1900 return Score;
1901 }
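  // For example (illustrative numbers only): two candidates that both reach a
  // look-ahead score of 4 become 40 after scaling by ScoreScaleFactor, and the
  // one whose users are all vectorized gets 40 + 1 = 41. External-use
  // information therefore only breaks ties and never outweighs a genuinely
  // better look-ahead score, which would differ by at least 10 after scaling.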
1902
1903 /// Best defined scores per lanes between the passes. Used to choose the
1904 /// best operand (with the highest score) between the passes.
1905 /// The key - {Operand Index, Lane}.
1906 /// The value - the best score between the passes for the lane and the
1907 /// operand.
1908 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1909 BestScoresPerLanes;
1910
1911 // Search all operands in Ops[*][Lane] for the one that matches best
1912 // Ops[OpIdx][LastLane] and return its operand index.
1913 // If no good match can be found, return std::nullopt.
1914 std::optional<unsigned>
1915 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1916 ArrayRef<ReorderingMode> ReorderingModes,
1917 ArrayRef<Value *> MainAltOps) {
1918 unsigned NumOperands = getNumOperands();
1919
1920 // The operand of the previous lane at OpIdx.
1921 Value *OpLastLane = getData(OpIdx, LastLane).V;
1922
1923 // Our strategy mode for OpIdx.
1924 ReorderingMode RMode = ReorderingModes[OpIdx];
1925 if (RMode == ReorderingMode::Failed)
1926 return std::nullopt;
1927
1928 // The linearized opcode of the operand at OpIdx, Lane.
1929 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1930
1931 // The best operand index and its score.
1932 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1933 // are using the score to differentiate between the two.
1934 struct BestOpData {
1935 std::optional<unsigned> Idx;
1936 unsigned Score = 0;
1937 } BestOp;
1938 BestOp.Score =
1939 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1940 .first->second;
1941
1942 // Track if the operand must be marked as used. If the operand is set to
1943 // Score 1 explicitly (because of non-power-of-2 unique scalars), we may
1944 // want to re-estimate the operands again on the following iterations.
1945 bool IsUsed = RMode == ReorderingMode::Splat ||
1946 RMode == ReorderingMode::Constant ||
1947 RMode == ReorderingMode::Load;
1948 // Iterate through all unused operands and look for the best.
1949 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1950 // Get the operand at Idx and Lane.
1951 OperandData &OpData = getData(Idx, Lane);
1952 Value *Op = OpData.V;
1953 bool OpAPO = OpData.APO;
1954
1955 // Skip already selected operands.
1956 if (OpData.IsUsed)
1957 continue;
1958
1959 // Skip if we are trying to move the operand to a position with a
1960 // different opcode in the linearized tree form. This would break the
1961 // semantics.
1962 if (OpAPO != OpIdxAPO)
1963 continue;
1964
1965 // Look for an operand that matches the current mode.
1966 switch (RMode) {
1967 case ReorderingMode::Load:
1968 case ReorderingMode::Opcode: {
1969 bool LeftToRight = Lane > LastLane;
1970 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1971 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1972 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1973 OpIdx, Idx, IsUsed);
1974 if (Score > static_cast<int>(BestOp.Score) ||
1975 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
1976 Idx == OpIdx)) {
1977 BestOp.Idx = Idx;
1978 BestOp.Score = Score;
1979 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1980 }
1981 break;
1982 }
1983 case ReorderingMode::Constant:
1984 if (isa<Constant>(Op) ||
1985 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
1986 BestOp.Idx = Idx;
1987 if (isa<Constant>(Op)) {
1988 BestOp.Score = LookAheadHeuristics::ScoreConstants;
1989 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1990 LookAheadHeuristics::ScoreConstants;
1991 }
1992 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
1993 IsUsed = false;
1994 }
1995 break;
1996 case ReorderingMode::Splat:
1997 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
1998 IsUsed = Op == OpLastLane;
1999 if (Op == OpLastLane) {
2000 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2001 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2002 LookAheadHeuristics::ScoreSplat;
2003 }
2004 BestOp.Idx = Idx;
2005 }
2006 break;
2007 case ReorderingMode::Failed:
2008 llvm_unreachable("Not expected Failed reordering mode.");
2009 }
2010 }
2011
2012 if (BestOp.Idx) {
2013 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2014 return BestOp.Idx;
2015 }
2016 // If we could not find a good match return std::nullopt.
2017 return std::nullopt;
2018 }
2019
2020 /// Helper for reorder().
2021 /// \returns the lane that we should start reordering from. This is the one
2022 /// with the fewest operands that can freely move about, or the least
2023 /// profitable lane because it already has the most optimal set of operands.
2024 unsigned getBestLaneToStartReordering() const {
2025 unsigned Min = UINT_MAX;
2026 unsigned SameOpNumber = 0;
2027 // std::pair<unsigned, unsigned> is used to implement a simple voting
2028 // algorithm and choose the lane with the fewest operands that can
2029 // freely move about, or the least profitable lane because it already has
2030 // the most optimal set of operands. The first unsigned is a counter for
2031 // voting, the second unsigned is the counter of lanes with instructions
2032 // with same/alternate opcodes and same parent basic block.
2033 SmallDenseMap<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
2034 // Try to be closer to the original results, if we have multiple lanes
2035 // with same cost. If 2 lanes have the same cost, use the one with the
2036 // lowest index.
2037 for (int I = getNumLanes(); I > 0; --I) {
2038 unsigned Lane = I - 1;
2039 OperandsOrderData NumFreeOpsHash =
2040 getMaxNumOperandsThatCanBeReordered(Lane);
2041 // Compare the number of operands that can move and choose the one with
2042 // the least number.
2043 if (NumFreeOpsHash.NumOfAPOs < Min) {
2044 Min = NumFreeOpsHash.NumOfAPOs;
2045 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2046 HashMap.clear();
2047 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2048 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2049 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2050 // Select the most optimal lane in terms of number of operands that
2051 // should be moved around.
2052 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2053 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2054 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2055 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2056 auto *It = HashMap.find(NumFreeOpsHash.Hash);
2057 if (It == HashMap.end())
2058 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2059 else
2060 ++It->second.first;
2061 }
2062 }
2063 // Select the lane with the minimum counter.
2064 unsigned BestLane = 0;
2065 unsigned CntMin = UINT_MAX;
2066 for (const auto &Data : reverse(HashMap)) {
2067 if (Data.second.first < CntMin) {
2068 CntMin = Data.second.first;
2069 BestLane = Data.second.second;
2070 }
2071 }
2072 return BestLane;
2073 }
2074
2075 /// Data structure that helps to reorder operands.
2076 struct OperandsOrderData {
2077 /// The best number of operands with the same APOs, which can be
2078 /// reordered.
2079 unsigned NumOfAPOs = UINT_MAX;
2080 /// Number of operands with the same/alternate instruction opcode and
2081 /// parent.
2082 unsigned NumOpsWithSameOpcodeParent = 0;
2083 /// Hash for the actual operands ordering.
2084 /// Used to count operands, actually their position id and opcode
2085 /// value. It is used in the voting mechanism to find the lane with the
2086 /// fewest operands that can freely move about, or the least profitable
2087 /// lane because it already has the most optimal set of operands. Could be
2088 /// replaced with a SmallVector<unsigned> instead, but the hash code is
2089 /// faster and requires less memory.
2090 unsigned Hash = 0;
2091 };
2092 /// \returns the maximum number of operands that are allowed to be reordered
2093 /// for \p Lane and the number of compatible instructions (with the same
2094 /// parent/opcode). This is used as a heuristic for selecting the first lane
2095 /// to start operand reordering.
2096 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2097 unsigned CntTrue = 0;
2098 unsigned NumOperands = getNumOperands();
2099 // Operands with the same APO can be reordered. We therefore need to count
2100 // how many of them we have for each APO, like this: Cnt[APO] = x.
2101 // Since we only have two APOs, namely true and false, we can avoid using
2102 // a map. Instead we can simply count the number of operands that
2103 // correspond to one of them (in this case the 'true' APO), and calculate
2104 // the other by subtracting it from the total number of operands.
2105 // Operands with the same instruction opcode and parent are more
2106 // profitable since we don't need to move them in many cases, with a high
2107 // probability such lane already can be vectorized effectively.
2108 bool AllUndefs = true;
2109 unsigned NumOpsWithSameOpcodeParent = 0;
2110 Instruction *OpcodeI = nullptr;
2111 BasicBlock *Parent = nullptr;
2112 unsigned Hash = 0;
2113 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2114 const OperandData &OpData = getData(OpIdx, Lane);
2115 if (OpData.APO)
2116 ++CntTrue;
2117 // Use Boyer-Moore majority voting for finding the majority opcode and
2118 // the number of times it occurs.
2119 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2120 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2121 I->getParent() != Parent) {
2122 if (NumOpsWithSameOpcodeParent == 0) {
2123 NumOpsWithSameOpcodeParent = 1;
2124 OpcodeI = I;
2125 Parent = I->getParent();
2126 } else {
2127 --NumOpsWithSameOpcodeParent;
2128 }
2129 } else {
2130 ++NumOpsWithSameOpcodeParent;
2131 }
2132 }
2133 Hash = hash_combine(
2134 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2135 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2136 }
2137 if (AllUndefs)
2138 return {};
2139 OperandsOrderData Data;
2140 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2141 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2142 Data.Hash = Hash;
2143 return Data;
2144 }
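  // The Boyer-Moore majority vote used above, as a standalone sketch
  // (illustrative only, operating on plain integers instead of the
  // opcode/parent pairs tracked by this function):
  //   int Candidate = 0, Count = 0;
  //   for (int X : Stream) {
  //     if (Count == 0) { Candidate = X; Count = 1; }
  //     else if (X == Candidate) ++Count;
  //     else --Count;
  //   }
  //   // Candidate now holds the majority element, if one exists.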
2145
2146 /// Go through the instructions in VL and append their operands.
2147 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2148 assert(!VL.empty() && "Bad VL");
2149 assert((empty() || VL.size() == getNumLanes()) &&
2150 "Expected same number of lanes");
2151 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2152 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2153 constexpr unsigned IntrinsicNumOperands = 2;
2154 if (isa<IntrinsicInst>(VL[0]))
2155 NumOperands = IntrinsicNumOperands;
2156 OpsVec.resize(NumOperands);
2157 unsigned NumLanes = VL.size();
2158 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2159 OpsVec[OpIdx].resize(NumLanes);
2160 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2161 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2162 // Our tree has just 3 nodes: the root and two operands.
2163 // It is therefore trivial to get the APO. We only need to check the
2164 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2165 // RHS operand. The LHS operand of both add and sub is never attached
2166 // to an inverse operation in the linearized form, therefore its APO
2167 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2168
2169 // Since operand reordering is performed on groups of commutative
2170 // operations or alternating sequences (e.g., +, -), we can safely
2171 // tell the inverse operations by checking commutativity.
2172 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2173 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2174 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2175 APO, false};
2176 }
2177 }
2178 }
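  // For illustration (hypothetical bundle): given VL = { a0 - b0, a1 - b1 },
  // appendOperandsOfVL() produces two operand vectors,
  //   OpsVec[0] = { {a0, APO=false}, {a1, APO=false} }
  //   OpsVec[1] = { {b0, APO=true }, {b1, APO=true } }
  // because sub is non-commutative, so only the second operand is attached to
  // the inverse operation in the left-linearized form.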
2179
2180 /// \returns the number of operands.
2181 unsigned getNumOperands() const { return OpsVec.size(); }
2182
2183 /// \returns the number of lanes.
2184 unsigned getNumLanes() const { return OpsVec[0].size(); }
2185
2186 /// \returns the operand value at \p OpIdx and \p Lane.
2187 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2188 return getData(OpIdx, Lane).V;
2189 }
2190
2191 /// \returns true if the data structure is empty.
2192 bool empty() const { return OpsVec.empty(); }
2193
2194 /// Clears the data.
2195 void clear() { OpsVec.clear(); }
2196
2197 /// \Returns true if there are enough operands identical to \p Op to fill
2198 /// the whole vector (possibly mixed with constants or loop-invariant values).
2199 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
2200 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2201 bool OpAPO = getData(OpIdx, Lane).APO;
2202 bool IsInvariant = L && L->isLoopInvariant(Op);
2203 unsigned Cnt = 0;
2204 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2205 if (Ln == Lane)
2206 continue;
2207 // This is set to true if we found a candidate for broadcast at Lane.
2208 bool FoundCandidate = false;
2209 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2210 OperandData &Data = getData(OpI, Ln);
2211 if (Data.APO != OpAPO || Data.IsUsed)
2212 continue;
2213 Value *OpILane = getValue(OpI, Lane);
2214 bool IsConstantOp = isa<Constant>(OpILane);
2215 // Consider the broadcast candidate if:
2216 // 1. Same value is found in one of the operands.
2217 if (Data.V == Op ||
2218 // 2. The operand in the given lane is not constant but there is a
2219 // constant operand in another lane (which can be moved to the
2220 // given lane). In this case we can represent it as a simple
2221 // permutation of constant and broadcast.
2222 (!IsConstantOp &&
2223 ((Lns > 2 && isa<Constant>(Data.V)) ||
2224 // 2.1. If we have only 2 lanes, need to check that value in the
2225 // next lane does not build same opcode sequence.
2226 (Lns == 2 &&
2227 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2228 .getOpcode() &&
2229 isa<Constant>(Data.V)))) ||
2230 // 3. The operand in the current lane is loop invariant (can be
2231 // hoisted out) and another operand is also a loop invariant
2232 // (though not a constant). In this case the whole vector can be
2233 // hoisted out.
2234 // FIXME: need to teach the cost model about this case for better
2235 // estimation.
2236 (IsInvariant && !isa<Constant>(Data.V) &&
2237 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2238 L->isLoopInvariant(Data.V))) {
2239 FoundCandidate = true;
2240 Data.IsUsed = Data.V == Op;
2241 if (Data.V == Op)
2242 ++Cnt;
2243 break;
2244 }
2245 }
2246 if (!FoundCandidate)
2247 return false;
2248 }
2249 return getNumLanes() == 2 || Cnt > 1;
2250 }
2251
2252 /// Checks if there is at least one operand in a lane other than \p Lane
2253 /// that is compatible with the operand \p Op.
2254 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2255 bool OpAPO = getData(OpIdx, Lane).APO;
2256 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2257 if (Ln == Lane)
2258 continue;
2259 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2260 const OperandData &Data = getData(OpI, Ln);
2261 if (Data.APO != OpAPO || Data.IsUsed)
2262 return true;
2263 Value *OpILn = getValue(OpI, Ln);
2264 return (L && L->isLoopInvariant(OpILn)) ||
2265 (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2266 Op->getParent() == cast<Instruction>(OpILn)->getParent());
2267 }))
2268 return true;
2269 }
2270 return false;
2271 }
2272
2273 public:
2274 /// Initialize with all the operands of the instruction vector \p RootVL.
2275 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2276 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2277 L(R.LI->getLoopFor(
2278 (cast<Instruction>(RootVL.front())->getParent()))) {
2279 // Append all the operands of RootVL.
2280 appendOperandsOfVL(RootVL);
2281 }
2282
2283 /// \Returns a value vector with the operands across all lanes for the
2284 /// operand at \p OpIdx.
2285 ValueList getVL(unsigned OpIdx) const {
2286 ValueList OpVL(OpsVec[OpIdx].size());
2287 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2288 "Expected same num of lanes across all operands");
2289 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2290 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2291 return OpVL;
2292 }
2293
2294 // Performs operand reordering for 2 or more operands.
2295 // The original operands are in OrigOps[OpIdx][Lane].
2296 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2297 void reorder() {
2298 unsigned NumOperands = getNumOperands();
2299 unsigned NumLanes = getNumLanes();
2300 // Each operand has its own mode. We are using this mode to help us select
2301 // the instructions for each lane, so that they match best with the ones
2302 // we have selected so far.
2303 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2304
2305 // This is a greedy single-pass algorithm. We are going over each lane
2306 // once and deciding on the best order right away with no back-tracking.
2307 // However, in order to increase its effectiveness, we start with the lane
2308 // that has operands that can move the least. For example, given the
2309 // following lanes:
2310 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2311 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2312 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2313 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2314 // we will start at Lane 1, since the operands of the subtraction cannot
2315 // be reordered. Then we will visit the rest of the lanes in a circular
2316 // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
2317
2318 // Find the first lane that we will start our search from.
2319 unsigned FirstLane = getBestLaneToStartReordering();
2320
2321 // Initialize the modes.
2322 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2323 Value *OpLane0 = getValue(OpIdx, FirstLane);
2324 // Keep track if we have instructions with all the same opcode on one
2325 // side.
2326 if (isa<LoadInst>(OpLane0))
2327 ReorderingModes[OpIdx] = ReorderingMode::Load;
2328 else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2329 // Check if OpLane0 should be broadcast.
2330 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2331 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2332 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2333 else
2334 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2335 } else if (isa<Constant>(OpLane0))
2336 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2337 else if (isa<Argument>(OpLane0))
2338 // Our best hope is a Splat. It may save some cost in some cases.
2339 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2340 else
2341 // NOTE: This should be unreachable.
2342 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2343 }
2344
2345 // Check that we don't have the same operands. No need to reorder if the
2346 // operands are just a perfect diamond or a shuffled diamond match. Do not
2347 // skip reordering only for possible broadcasts or a non-power-of-2 number
2348 // of scalars (just for now).
2349 auto &&SkipReordering = [this]() {
2350 SmallPtrSet<Value *, 4> UniqueValues;
2351 ArrayRef<OperandData> Op0 = OpsVec.front();
2352 for (const OperandData &Data : Op0)
2353 UniqueValues.insert(Data.V);
2354 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2355 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2356 return !UniqueValues.contains(Data.V);
2357 }))
2358 return false;
2359 }
2360 // TODO: Check if we can remove a check for non-power-2 number of
2361 // scalars after full support of non-power-2 vectorization.
2362 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2363 };
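      // For example (hypothetical lanes): for VL = { A+B, B+A, C+D, D+C } the
      // second operand column {B, A, D, C} only uses values already present in
      // the first column {A, B, C, D}, so this is a shuffled diamond match and
      // reordering is skipped (4 unique values, a power of 2); the 2-value and
      // non-power-of-2 cases are deliberately not skipped above.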
2364
2365 // If the initial strategy fails for any of the operand indexes, then we
2366 // perform reordering again in a second pass. This helps avoid assigning
2367 // high priority to the failed strategy, and should improve reordering for
2368 // the non-failed operand indexes.
2369 for (int Pass = 0; Pass != 2; ++Pass) {
2370 // Check if there is no need to reorder the operands because they are a
2371 // perfect or shuffled diamond match.
2372 // Need to do it to avoid extra external use cost counting for
2373 // shuffled matches, which may cause regressions.
2374 if (SkipReordering())
2375 break;
2376 // Skip the second pass if the first pass did not fail.
2377 bool StrategyFailed = false;
2378 // Mark all operand data as free to use.
2379 clearUsed();
2380 // We keep the original operand order for the FirstLane, so reorder the
2381 // rest of the lanes. We are visiting the nodes in a circular fashion,
2382 // using FirstLane as the center point and increasing the radius
2383 // distance.
2384 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2385 for (unsigned I = 0; I < NumOperands; ++I)
2386 MainAltOps[I].push_back(getData(I, FirstLane).V);
2387
2388 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2389 // Visit the lane on the right and then the lane on the left.
2390 for (int Direction : {+1, -1}) {
2391 int Lane = FirstLane + Direction * Distance;
2392 if (Lane < 0 || Lane >= (int)NumLanes)
2393 continue;
2394 int LastLane = Lane - Direction;
2395 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2396 "Out of bounds");
2397 // Look for a good match for each operand.
2398 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2399 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2400 std::optional<unsigned> BestIdx = getBestOperand(
2401 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2402 // By not selecting a value, we allow the operands that follow to
2403 // select a better matching value. We will get a non-null value in
2404 // the next run of getBestOperand().
2405 if (BestIdx) {
2406 // Swap the current operand with the one returned by
2407 // getBestOperand().
2408 swap(OpIdx, *BestIdx, Lane);
2409 } else {
2410 // Enable the second pass.
2411 StrategyFailed = true;
2412 }
2413 // Try to get the alternate opcode and follow it during analysis.
2414 if (MainAltOps[OpIdx].size() != 2) {
2415 OperandData &AltOp = getData(OpIdx, Lane);
2416 InstructionsState OpS =
2417 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2418 if (OpS.getOpcode() && OpS.isAltShuffle())
2419 MainAltOps[OpIdx].push_back(AltOp.V);
2420 }
2421 }
2422 }
2423 }
2424 // Skip second pass if the strategy did not fail.
2425 if (!StrategyFailed)
2426 break;
2427 }
2428 }
2429
2430#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2431 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2432 switch (RMode) {
2433 case ReorderingMode::Load:
2434 return "Load";
2435 case ReorderingMode::Opcode:
2436 return "Opcode";
2437 case ReorderingMode::Constant:
2438 return "Constant";
2439 case ReorderingMode::Splat:
2440 return "Splat";
2441 case ReorderingMode::Failed:
2442 return "Failed";
2443 }
2444 llvm_unreachable("Unimplemented Reordering Type");
2445 }
2446
2447 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2448 raw_ostream &OS) {
2449 return OS << getModeStr(RMode);
2450 }
2451
2452 /// Debug print.
2453 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2454 printMode(RMode, dbgs());
2455 }
2456
2457 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2458 return printMode(RMode, OS);
2459 }
2460
2461 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2462 const unsigned Indent = 2;
2463 unsigned Cnt = 0;
2464 for (const OperandDataVec &OpDataVec : OpsVec) {
2465 OS << "Operand " << Cnt++ << "\n";
2466 for (const OperandData &OpData : OpDataVec) {
2467 OS.indent(Indent) << "{";
2468 if (Value *V = OpData.V)
2469 OS << *V;
2470 else
2471 OS << "null";
2472 OS << ", APO:" << OpData.APO << "}\n";
2473 }
2474 OS << "\n";
2475 }
2476 return OS;
2477 }
2478
2479 /// Debug print.
2480 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2481#endif
2482 };
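  // Typical use of VLOperands elsewhere in this file (sketch; see
  // reorderInputsAccordingToOpcode()): build it from the bundle scalars,
  // reorder, then read the per-operand lanes back:
  //   VLOperands Ops(VL, R);
  //   Ops.reorder();
  //   Left = Ops.getVL(0);
  //   Right = Ops.getVL(1);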
2483
2484 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2485 /// for the pair with the highest score, deemed to have the best chance to
2486 /// form the root of a profitable tree to vectorize.
2487 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
2488 /// of the cost, considered to be a good enough score.
2489 std::optional<int>
2490 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2491 int Limit = LookAheadHeuristics::ScoreFail) const {
2492 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2493 RootLookAheadMaxDepth);
2494 int BestScore = Limit;
2495 std::optional<int> Index;
2496 for (int I : seq<int>(0, Candidates.size())) {
2497 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2498 Candidates[I].second,
2499 /*U1=*/nullptr, /*U2=*/nullptr,
2500 /*Level=*/1, std::nullopt);
2501 if (Score > BestScore) {
2502 BestScore = Score;
2503 Index = I;
2504 }
2505 }
2506 return Index;
2507 }
2508
2509 /// Checks if the instruction is marked for deletion.
2510 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2511
2512 /// Removes an instruction from its block and eventually deletes it.
2513 /// It's like Instruction::eraseFromParent() except that the actual deletion
2514 /// is delayed until BoUpSLP is destructed.
2515 void eraseInstruction(Instruction *I) {
2516 DeletedInstructions.insert(I);
2517 }
2518
2519 /// Remove instructions from the parent function and clear the operands of \p
2520 /// DeadVals instructions, marking for deletion trivially dead operands.
2521 template <typename T>
2522 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2523 SmallVector<WeakTrackingVH> DeadInsts;
2524 for (T *V : DeadVals) {
2525 auto *I = cast<Instruction>(V);
2526 DeletedInstructions.insert(I);
2527 }
2528 DenseSet<Value *> Processed;
2529 for (T *V : DeadVals) {
2530 if (!V || !Processed.insert(V).second)
2531 continue;
2532 auto *I = cast<Instruction>(V);
2533 salvageDebugInfo(*I);
2534 SmallVector<const TreeEntry *> Entries;
2535 if (const TreeEntry *Entry = getTreeEntry(I)) {
2536 Entries.push_back(Entry);
2537 auto It = MultiNodeScalars.find(I);
2538 if (It != MultiNodeScalars.end())
2539 Entries.append(It->second.begin(), It->second.end());
2540 }
2541 for (Use &U : I->operands()) {
2542 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2543 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2544 wouldInstructionBeTriviallyDead(OpI, TLI) &&
2545 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2546 return Entry->VectorizedValue == OpI;
2547 })))
2548 DeadInsts.push_back(OpI);
2549 }
2550 I->dropAllReferences();
2551 }
2552 for (T *V : DeadVals) {
2553 auto *I = cast<Instruction>(V);
2554 if (!I->getParent())
2555 continue;
2556 assert((I->use_empty() || all_of(I->uses(),
2557 [&](Use &U) {
2558 return isDeleted(
2559 cast<Instruction>(U.getUser()));
2560 })) &&
2561 "trying to erase instruction with users.");
2562 I->removeFromParent();
2563 SE->forgetValue(I);
2564 }
2565 // Process the dead instruction list until empty.
2566 while (!DeadInsts.empty()) {
2567 Value *V = DeadInsts.pop_back_val();
2568 Instruction *VI = cast_or_null<Instruction>(V);
2569 if (!VI || !VI->getParent())
2570 continue;
2571 assert(isInstructionTriviallyDead(VI, TLI) &&
2572 "Live instruction found in dead worklist!");
2573 assert(VI->use_empty() && "Instructions with uses are not dead.");
2574
2575 // Don't lose the debug info while deleting the instructions.
2576 salvageDebugInfo(*VI);
2577
2578 // Null out all of the instruction's operands to see if any operand
2579 // becomes dead as we go.
2580 for (Use &OpU : VI->operands()) {
2581 Value *OpV = OpU.get();
2582 if (!OpV)
2583 continue;
2584 OpU.set(nullptr);
2585
2586 if (!OpV->use_empty())
2587 continue;
2588
2589 // If the operand is an instruction that became dead as we nulled out
2590 // the operand, and if it is 'trivially' dead, delete it in a future
2591 // loop iteration.
2592 if (auto *OpI = dyn_cast<Instruction>(OpV))
2593 if (!DeletedInstructions.contains(OpI) &&
2594 isInstructionTriviallyDead(OpI, TLI))
2595 DeadInsts.push_back(OpI);
2596 }
2597
2598 VI->removeFromParent();
2599 DeletedInstructions.insert(VI);
2600 SE->forgetValue(VI);
2601 }
2602 }
2603
2604 /// Checks if the instruction was already analyzed for being possible
2605 /// reduction root.
2606 bool isAnalyzedReductionRoot(Instruction *I) const {
2607 return AnalyzedReductionsRoots.count(I);
2608 }
2609 /// Register given instruction as already analyzed for being possible
2610 /// reduction root.
2611 void analyzedReductionRoot(Instruction *I) {
2612 AnalyzedReductionsRoots.insert(I);
2613 }
2614 /// Checks if the provided list of reduced values was checked already for
2615 /// vectorization.
2616 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2617 return AnalyzedReductionVals.contains(hash_value(VL));
2618 }
2619 /// Adds the list of reduced values to list of already checked values for the
2620 /// vectorization.
2621 void analyzedReductionVals(ArrayRef<Value *> VL) {
2622 AnalyzedReductionVals.insert(hash_value(VL));
2623 }
2624 /// Clear the list of the analyzed reduction root instructions.
2625 void clearReductionData() {
2626 AnalyzedReductionsRoots.clear();
2627 AnalyzedReductionVals.clear();
2628 AnalyzedMinBWVals.clear();
2629 }
2630 /// Checks if the given value is gathered in one of the nodes.
2631 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2632 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2633 }
2634 /// Checks if the given value is gathered in one of the nodes.
2635 bool isGathered(const Value *V) const {
2636 return MustGather.contains(V);
2637 }
2638 /// Checks if the specified value was not scheduled.
2639 bool isNotScheduled(const Value *V) const {
2640 return NonScheduledFirst.contains(V);
2641 }
2642
2643 /// Check if the value is vectorized in the tree.
2644 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2645
2646 ~BoUpSLP();
2647
2648private:
2649 /// Determine if a node \p E can be demoted to a smaller type with a
2650 /// truncation. We collect the entries that will be demoted in ToDemote.
2651 /// \param E Node for analysis
2652 /// \param ToDemote indices of the nodes to be demoted.
2653 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2654 unsigned &BitWidth,
2655 SmallVectorImpl<unsigned> &ToDemote,
2656 DenseSet<const TreeEntry *> &Visited,
2657 unsigned &MaxDepthLevel,
2658 bool &IsProfitableToDemote,
2659 bool IsTruncRoot) const;
2660
2661 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2662 /// reordering (i.e. the operands can be reordered because they have only one
2663 /// user and are reorderable).
2664 /// \param ReorderableGathers List of all gather nodes that require reordering
2665 /// (e.g., gather of extractelements or partially vectorizable loads).
2666 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2667 /// reordering, subset of \p NonVectorized.
2668 bool
2669 canReorderOperands(TreeEntry *UserTE,
2670 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2671 ArrayRef<TreeEntry *> ReorderableGathers,
2672 SmallVectorImpl<TreeEntry *> &GatherOps);
2673
2674 /// Checks if the given \p TE is a gather node with clustered reused scalars
2675 /// and reorders it per given \p Mask.
2676 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2677
2678 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2679 /// if any. If it is not vectorized (gather node), returns nullptr.
2680 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2681 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2682 TreeEntry *TE = nullptr;
2683 const auto *It = find_if(VL, [&](Value *V) {
2684 TE = getTreeEntry(V);
2685 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2686 return true;
2687 auto It = MultiNodeScalars.find(V);
2688 if (It != MultiNodeScalars.end()) {
2689 for (TreeEntry *E : It->second) {
2690 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2691 TE = E;
2692 return true;
2693 }
2694 }
2695 }
2696 return false;
2697 });
2698 if (It != VL.end()) {
2699 assert(TE->isSame(VL) && "Expected same scalars.");
2700 return TE;
2701 }
2702 return nullptr;
2703 }
2704
2705 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2706 /// if any. If it is not vectorized (gather node), returns nullptr.
2707 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2708 unsigned OpIdx) const {
2709 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2710 const_cast<TreeEntry *>(UserTE), OpIdx);
2711 }
2712
2713 /// Checks if all users of \p I are the part of the vectorization tree.
2714 bool areAllUsersVectorized(
2715 Instruction *I,
2716 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2717
2718 /// Return information about the vector formed for the specified index
2719 /// of a vector of (the same) instruction.
2720 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2721
2722 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2723 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2724
2725 /// \returns Cast context for the given graph node.
2726 TargetTransformInfo::CastContextHint
2727 getCastContextHint(const TreeEntry &TE) const;
2728
2729 /// \returns the cost of the vectorizable entry.
2730 InstructionCost getEntryCost(const TreeEntry *E,
2731 ArrayRef<Value *> VectorizedVals,
2732 SmallPtrSetImpl<Value *> &CheckedExtracts);
2733
2734 /// This is the recursive part of buildTree.
2735 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2736 const EdgeInfo &EI);
2737
2738 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2739 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2740 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2741 /// returns false, setting \p CurrentOrder to either an empty vector or a
2742 /// non-identity permutation that allows reusing extract instructions.
2743 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2744 /// extract order.
2745 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2746 SmallVectorImpl<unsigned> &CurrentOrder,
2747 bool ResizeAllowed = false) const;
2748
2749 /// Vectorize a single entry in the tree.
2750 /// \param PostponedPHIs true if the emission of phi nodes needs to be
2751 /// postponed to avoid issues with def-use order.
2752 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2753
2754 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2755 /// \p E.
2756 /// \param PostponedPHIs true if the emission of phi nodes needs to be
2757 /// postponed to avoid issues with def-use order.
2758 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2759
2760 /// Create a new vector from a list of scalar values. Produces a sequence
2761 /// which exploits values reused across lanes, and arranges the inserts
2762 /// for ease of later optimization.
2763 template <typename BVTy, typename ResTy, typename... Args>
2764 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2765
2766 /// Create a new vector from a list of scalar values. Produces a sequence
2767 /// which exploits values reused across lanes, and arranges the inserts
2768 /// for ease of later optimization.
2769 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2770
2771 /// Returns the instruction in the bundle, which can be used as a base point
2772 /// for scheduling. Usually it is the last instruction in the bundle, except
2773 /// for the case when all operands are external (in this case, it is the first
2774 /// instruction in the list).
2775 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2776
2777 /// Tries to find extractelement instructions with constant indices from a
2778 /// fixed vector type and gather such instructions into a bunch, which can
2779 /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
2780 /// was successful, the matched scalars are replaced by poison values in \p VL
2781 /// for future analysis.
2782 std::optional<TargetTransformInfo::ShuffleKind>
2783 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2784 SmallVectorImpl<int> &Mask) const;
2785
2786 /// Tries to find extractelement instructions with constant indices from a
2787 /// fixed vector type and gather such instructions into a bunch, which can
2788 /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
2789 /// was successful, the matched scalars are replaced by poison values in \p VL
2790 /// for future analysis.
2791 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2792 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2793 SmallVectorImpl<SmallVector<int>> &Mask,
2794 unsigned NumParts) const;
2795
2796 /// Checks if the gathered \p VL can be represented as a single register
2797 /// shuffle(s) of previous tree entries.
2798 /// \param TE Tree entry checked for permutation.
2799 /// \param VL List of scalars (a subset of the TE scalars), checked for
2800 /// permutations. Must form single-register vector.
2801 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2802 /// commands to build the mask using the original vector value, without
2803 /// relying on the potential reordering.
2804 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2805 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2806 std::optional<TargetTransformInfo::ShuffleKind>
2807 isGatherShuffledSingleRegisterEntry(
2808 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2809 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2810 bool ForOrder);
2811
2812 /// Checks if the gathered \p VL can be represented as multi-register
2813 /// shuffle(s) of previous tree entries.
2814 /// \param TE Tree entry checked for permutation.
2815 /// \param VL List of scalars (a subset of the TE scalars), checked for
2816 /// permutations.
2817 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2818 /// commands to build the mask using the original vector value, without
2819 /// relying on the potential reordering.
2820 /// \returns per-register series of ShuffleKind, if gathered values can be
2821 /// represented as shuffles of previous tree entries. \p Mask is filled with
2822 /// the shuffle mask (also on a per-register basis).
2823 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2824 isGatherShuffledEntry(
2825 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2826 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2827 unsigned NumParts, bool ForOrder = false);
2828
2829 /// \returns the scalarization cost for this list of values. Assuming that
2830 /// this subtree gets vectorized, we may need to extract the values from the
2831 /// roots. This method calculates the cost of extracting the values.
2832 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2833 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2834 Type *ScalarTy) const;
2835
2836 /// Set the Builder insert point to one after the last instruction in
2837 /// the bundle
2838 void setInsertPointAfterBundle(const TreeEntry *E);
2839
2840 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2841 /// specified, the starting vector value is poison.
2842 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2843
2844 /// \returns whether the VectorizableTree is fully vectorizable and will
2845 /// be beneficial even if the tree height is tiny.
2846 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2847
2848 /// Reorder commutative or alt operands to get better probability of
2849 /// generating vectorized code.
2850 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2851 SmallVectorImpl<Value *> &Left,
2852 SmallVectorImpl<Value *> &Right,
2853 const BoUpSLP &R);
2854
2855 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2856 /// users of \p TE and collects the stores. It returns the map from the store
2857 /// pointers to the collected stores.
2858 DenseMap<Value *, SmallVector<StoreInst *>>
2859 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2860
2861 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2862 /// stores in \p StoresVec can form a vector instruction. If so it returns
2863 /// true and populates \p ReorderIndices with the shuffle indices of the
2864 /// stores when compared to the sorted vector.
2865 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2866 OrdersType &ReorderIndices) const;
2867
2868 /// Iterates through the users of \p TE, looking for scalar stores that can be
2869 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2870 /// their order and builds an order index vector for each store bundle. It
2871 /// returns all these order vectors found.
2872 /// We run this after the tree has formed, otherwise we may come across user
2873 /// instructions that are not yet in the tree.
2874 SmallVector<OrdersType, 1>
2875 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2876
2877 struct TreeEntry {
2878 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2879 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2880
2881 /// \returns Common mask for reorder indices and reused scalars.
2882 SmallVector<int> getCommonMask() const {
2883 SmallVector<int> Mask;
2884 inversePermutation(ReorderIndices, Mask);
2885 ::addMask(Mask, ReuseShuffleIndices);
2886 return Mask;
2887 }
2888
2889 /// \returns true if the scalars in VL are equal to this entry.
2890 bool isSame(ArrayRef<Value *> VL) const {
2891 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2892 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2893 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2894 return VL.size() == Mask.size() &&
2895 std::equal(VL.begin(), VL.end(), Mask.begin(),
2896 [Scalars](Value *V, int Idx) {
2897 return (isa<UndefValue>(V) &&
2898 Idx == PoisonMaskElem) ||
2899 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2900 });
2901 };
2902 if (!ReorderIndices.empty()) {
2903 // TODO: implement matching if the nodes are just reordered, still can
2904 // treat the vector as the same if the list of scalars matches VL
2905 // directly, without reordering.
2906 SmallVector<int> Mask;
2907 inversePermutation(ReorderIndices, Mask);
2908 if (VL.size() == Scalars.size())
2909 return IsSame(Scalars, Mask);
2910 if (VL.size() == ReuseShuffleIndices.size()) {
2911 ::addMask(Mask, ReuseShuffleIndices);
2912 return IsSame(Scalars, Mask);
2913 }
2914 return false;
2915 }
2916 return IsSame(Scalars, ReuseShuffleIndices);
2917 }
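    // For illustration (hypothetical entry): with Scalars = {a, b} and
    // ReuseShuffleIndices = {0, 1, 0, 1}, isSame({a, b, a, b}) is true,
    // since each VL element matches Scalars[Idx] for the corresponding
    // mask index.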
2918
2919 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2920 return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2921 UserTreeIndices.front().UserTE == UserEI.UserTE;
2922 }
2923
2924 /// \returns true if current entry has same operands as \p TE.
2925 bool hasEqualOperands(const TreeEntry &TE) const {
2926 if (TE.getNumOperands() != getNumOperands())
2927 return false;
2928 SmallBitVector Used(getNumOperands());
2929 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2930 unsigned PrevCount = Used.count();
2931 for (unsigned K = 0; K < E; ++K) {
2932 if (Used.test(K))
2933 continue;
2934 if (getOperand(K) == TE.getOperand(I)) {
2935 Used.set(K);
2936 break;
2937 }
2938 }
2939 // Check if we actually found the matching operand.
2940 if (PrevCount == Used.count())
2941 return false;
2942 }
2943 return true;
2944 }
2945
2946 /// \return Final vectorization factor for the node. Defined by the total
2947 /// number of vectorized scalars, including those used several times in the
2948 /// entry and counted in the \a ReuseShuffleIndices, if any.
2949 unsigned getVectorFactor() const {
2950 if (!ReuseShuffleIndices.empty())
2951 return ReuseShuffleIndices.size();
2952 return Scalars.size();
2953 };
2954
2955 /// Checks if the current node is a gather node.
2956 bool isGather() const { return State == NeedToGather; }
2957
2958 /// A vector of scalars.
2959 ValueList Scalars;
2960
2961 /// The Scalars are vectorized into this value. It is initialized to Null.
2962 WeakTrackingVH VectorizedValue = nullptr;
2963
2964 /// New vector phi instructions emitted for the vectorized phi nodes.
2965 PHINode *PHI = nullptr;
2966
2967 /// Do we need to gather this sequence or vectorize it
2968 /// (either with vector instruction or with scatter/gather
2969 /// intrinsics for store/load)?
2970 enum EntryState {
2971 Vectorize,
2972 ScatterVectorize,
2973 StridedVectorize,
2974 NeedToGather
2975 };
2976 EntryState State;
2977
2978 /// Does this sequence require some shuffling?
2979 SmallVector<int, 4> ReuseShuffleIndices;
2980
2981 /// Does this entry require reordering?
2982 SmallVector<unsigned, 4> ReorderIndices;
2983
2984 /// Points back to the VectorizableTree.
2985 ///
2986 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2987 /// to be a pointer and needs to be able to initialize the child iterator.
2988 /// Thus we need a reference back to the container to translate the indices
2989 /// to entries.
2990 VecTreeTy &Container;
2991
2992 /// The TreeEntry index containing the user of this entry. We can actually
2993 /// have multiple users so the data structure is not truly a tree.
2994 SmallVector<EdgeInfo, 1> UserTreeIndices;
2995
2996 /// The index of this treeEntry in VectorizableTree.
2997 int Idx = -1;
2998
2999 private:
3000 /// The operands of each instruction in each lane Operands[op_index][lane].
3001 /// Note: This helps avoid the replication of the code that performs the
3002 /// reordering of operands during buildTree_rec() and vectorizeTree().
3003 SmallVector<ValueList, 2> Operands;
3004
3005 /// The main/alternate instruction.
3006 Instruction *MainOp = nullptr;
3007 Instruction *AltOp = nullptr;
3008
3009 public:
3010 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3011 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3012 if (Operands.size() < OpIdx + 1)
3013 Operands.resize(OpIdx + 1);
3014 assert(Operands[OpIdx].empty() && "Already resized?");
3015 assert(OpVL.size() <= Scalars.size() &&
3016 "Number of operands is greater than the number of scalars.");
3017 Operands[OpIdx].resize(OpVL.size());
3018 copy(OpVL, Operands[OpIdx].begin());
3019 }
3020
3021 /// Set the operands of this bundle in their original order.
3022 void setOperandsInOrder() {
3023 assert(Operands.empty() && "Already initialized?");
3024 auto *I0 = cast<Instruction>(Scalars[0]);
3025 Operands.resize(I0->getNumOperands());
3026 unsigned NumLanes = Scalars.size();
3027 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3028 OpIdx != NumOperands; ++OpIdx) {
3029 Operands[OpIdx].resize(NumLanes);
3030 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3031 auto *I = cast<Instruction>(Scalars[Lane]);
3032 assert(I->getNumOperands() == NumOperands &&
3033 "Expected same number of operands");
3034 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
3035 }
3036 }
3037 }
3038
3039 /// Reorders operands of the node to the given mask \p Mask.
3040 void reorderOperands(ArrayRef<int> Mask) {
3041 for (ValueList &Operand : Operands)
3042 reorderScalars(Operand, Mask);
3043 }
3044
3045 /// \returns the \p OpIdx operand of this TreeEntry.
3046 ValueList &getOperand(unsigned OpIdx) {
3047 assert(OpIdx < Operands.size() && "Off bounds");
3048 return Operands[OpIdx];
3049 }
3050
3051 /// \returns the \p OpIdx operand of this TreeEntry.
3052 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3053 assert(OpIdx < Operands.size() && "Off bounds");
3054 return Operands[OpIdx];
3055 }
3056
3057 /// \returns the number of operands.
3058 unsigned getNumOperands() const { return Operands.size(); }
3059
3060 /// \return the single \p OpIdx operand.
3061 Value *getSingleOperand(unsigned OpIdx) const {
3062 assert(OpIdx < Operands.size() && "Off bounds");
3063 assert(!Operands[OpIdx].empty() && "No operand available");
3064 return Operands[OpIdx][0];
3065 }
3066
3067 /// Some of the instructions in the list have alternate opcodes.
3068 bool isAltShuffle() const { return MainOp != AltOp; }
3069
3070 bool isOpcodeOrAlt(Instruction *I) const {
3071 unsigned CheckedOpcode = I->getOpcode();
3072 return (getOpcode() == CheckedOpcode ||
3073 getAltOpcode() == CheckedOpcode);
3074 }
3075
3076 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3077 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
3078 /// \p OpValue.
3079 Value *isOneOf(Value *Op) const {
3080 auto *I = dyn_cast<Instruction>(Op);
3081 if (I && isOpcodeOrAlt(I))
3082 return Op;
3083 return MainOp;
3084 }
3085
3086 void setOperations(const InstructionsState &S) {
3087 MainOp = S.MainOp;
3088 AltOp = S.AltOp;
3089 }
3090
3091 Instruction *getMainOp() const {
3092 return MainOp;
3093 }
3094
3095 Instruction *getAltOp() const {
3096 return AltOp;
3097 }
3098
3099 /// The main/alternate opcodes for the list of instructions.
3100 unsigned getOpcode() const {
3101 return MainOp ? MainOp->getOpcode() : 0;
3102 }
3103
3104 unsigned getAltOpcode() const {
3105 return AltOp ? AltOp->getOpcode() : 0;
3106 }
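// Illustrative sketch with hypothetical values: for a bundle of scalars
// {a0 + b0, a1 - b1, a2 + b2, a3 - b3}, MainOp would be one of the add
// instructions and AltOp one of the subs, so isAltShuffle() is true,
// getOpcode() is Instruction::Add and getAltOpcode() is Instruction::Sub;
// such nodes are typically emitted as an add vector, a sub vector and a
// blending shuffle built via buildAltOpShuffleMask().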
3107
3108 /// When ReorderIndices and ReuseShuffleIndices are empty it just returns the
3109 /// position of \p V within Scalars. Otherwise, remaps it via the reorder/reuse indices.
3110 int findLaneForValue(Value *V) const {
3111 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
3112 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3113 if (!ReorderIndices.empty())
3114 FoundLane = ReorderIndices[FoundLane];
3115 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3116 if (!ReuseShuffleIndices.empty()) {
3117 FoundLane = std::distance(ReuseShuffleIndices.begin(),
3118 find(ReuseShuffleIndices, FoundLane));
3119 }
3120 return FoundLane;
3121 }
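// Illustrative sketch with hypothetical values: for Scalars = {A, B, C, D},
// empty ReorderIndices and ReuseShuffleIndices = {3, 2, 1, 0},
// findLaneForValue(B) finds B at position 1 in Scalars and then returns 2,
// the position of the value 1 inside the reuse mask.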
3122
3123 /// Build a shuffle mask for graph entry which represents a merge of main
3124 /// and alternate operations.
3125 void
3126 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3127 SmallVectorImpl<int> &Mask,
3128 SmallVectorImpl<Value *> *OpScalars = nullptr,
3129 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3130
3131 /// Return true if this is a non-power-of-2 node.
3132 bool isNonPowOf2Vec() const {
3133 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
3134 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3135 "Reshuffling not supported with non-power-of-2 vectors yet.");
3136 return IsNonPowerOf2;
3137 }
3138
3139#ifndef NDEBUG
3140 /// Debug printer.
3141 LLVM_DUMP_METHOD void dump() const {
3142 dbgs() << Idx << ".\n";
3143 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3144 dbgs() << "Operand " << OpI << ":\n";
3145 for (const Value *V : Operands[OpI])
3146 dbgs().indent(2) << *V << "\n";
3147 }
3148 dbgs() << "Scalars: \n";
3149 for (Value *V : Scalars)
3150 dbgs().indent(2) << *V << "\n";
3151 dbgs() << "State: ";
3152 switch (State) {
3153 case Vectorize:
3154 dbgs() << "Vectorize\n";
3155 break;
3156 case ScatterVectorize:
3157 dbgs() << "ScatterVectorize\n";
3158 break;
3159 case StridedVectorize:
3160 dbgs() << "StridedVectorize\n";
3161 break;
3162 case NeedToGather:
3163 dbgs() << "NeedToGather\n";
3164 break;
3165 }
3166 dbgs() << "MainOp: ";
3167 if (MainOp)
3168 dbgs() << *MainOp << "\n";
3169 else
3170 dbgs() << "NULL\n";
3171 dbgs() << "AltOp: ";
3172 if (AltOp)
3173 dbgs() << *AltOp << "\n";
3174 else
3175 dbgs() << "NULL\n";
3176 dbgs() << "VectorizedValue: ";
3177 if (VectorizedValue)
3178 dbgs() << *VectorizedValue << "\n";
3179 else
3180 dbgs() << "NULL\n";
3181 dbgs() << "ReuseShuffleIndices: ";
3182 if (ReuseShuffleIndices.empty())
3183 dbgs() << "Empty";
3184 else
3185 for (int ReuseIdx : ReuseShuffleIndices)
3186 dbgs() << ReuseIdx << ", ";
3187 dbgs() << "\n";
3188 dbgs() << "ReorderIndices: ";
3189 for (unsigned ReorderIdx : ReorderIndices)
3190 dbgs() << ReorderIdx << ", ";
3191 dbgs() << "\n";
3192 dbgs() << "UserTreeIndices: ";
3193 for (const auto &EInfo : UserTreeIndices)
3194 dbgs() << EInfo << ", ";
3195 dbgs() << "\n";
3196 }
3197#endif
3198 };
3199
3200#ifndef NDEBUG
3201 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3202 InstructionCost VecCost, InstructionCost ScalarCost,
3203 StringRef Banner) const {
3204 dbgs() << "SLP: " << Banner << ":\n";
3205 E->dump();
3206 dbgs() << "SLP: Costs:\n";
3207 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3208 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3209 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3210 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3211 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3212 }
3213#endif
3214
3215 /// Create a new VectorizableTree entry.
3216 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3217 std::optional<ScheduleData *> Bundle,
3218 const InstructionsState &S,
3219 const EdgeInfo &UserTreeIdx,
3220 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3221 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3222 TreeEntry::EntryState EntryState =
3223 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3224 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3225 ReuseShuffleIndices, ReorderIndices);
3226 }
3227
3228 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3229 TreeEntry::EntryState EntryState,
3230 std::optional<ScheduleData *> Bundle,
3231 const InstructionsState &S,
3232 const EdgeInfo &UserTreeIdx,
3233 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3234 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3235 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3236 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3237 "Need to vectorize gather entry?");
3238 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3239 TreeEntry *Last = VectorizableTree.back().get();
3240 Last->Idx = VectorizableTree.size() - 1;
3241 Last->State = EntryState;
3242 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3243 ReuseShuffleIndices.end());
3244 if (ReorderIndices.empty()) {
3245 Last->Scalars.assign(VL.begin(), VL.end());
3246 Last->setOperations(S);
3247 } else {
3248 // Reorder scalars and build final mask.
3249 Last->Scalars.assign(VL.size(), nullptr);
3250 transform(ReorderIndices, Last->Scalars.begin(),
3251 [VL](unsigned Idx) -> Value * {
3252 if (Idx >= VL.size())
3253 return UndefValue::get(VL.front()->getType());
3254 return VL[Idx];
3255 });
3256 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3257 Last->setOperations(S);
3258 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3259 }
3260 if (!Last->isGather()) {
3261 for (Value *V : VL) {
3262 const TreeEntry *TE = getTreeEntry(V);
3263 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3264 "Scalar already in tree!");
3265 if (TE) {
3266 if (TE != Last)
3267 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3268 continue;
3269 }
3270 ScalarToTreeEntry[V] = Last;
3271 }
3272 // Update the scheduler bundle to point to this TreeEntry.
3273 ScheduleData *BundleMember = *Bundle;
3274 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3275 isVectorLikeInstWithConstOps(S.MainOp) ||
3276 doesNotNeedToSchedule(VL)) &&
3277 "Bundle and VL out of sync");
3278 if (BundleMember) {
3279 for (Value *V : VL) {
3280 if (doesNotNeedToBeScheduled(V))
3281 continue;
3282 if (!BundleMember)
3283 continue;
3284 BundleMember->TE = Last;
3285 BundleMember = BundleMember->NextInBundle;
3286 }
3287 }
3288 assert(!BundleMember && "Bundle and VL out of sync");
3289 } else {
3290 // Build a map for gathered scalars to the nodes where they are used.
3291 bool AllConstsOrCasts = true;
3292 for (Value *V : VL)
3293 if (!isConstant(V)) {
3294 auto *I = dyn_cast<CastInst>(V);
3295 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3296 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3297 }
3298 if (AllConstsOrCasts)
3299 CastMaxMinBWSizes =
3300 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3301 MustGather.insert(VL.begin(), VL.end());
3302 }
3303
3304 if (UserTreeIdx.UserTE) {
3305 Last->UserTreeIndices.push_back(UserTreeIdx);
3306 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3307 "Reordering isn't implemented for non-power-of-2 nodes yet");
3308 }
3309 return Last;
3310 }
3311
3312 /// -- Vectorization State --
3313 /// Holds all of the tree entries.
3314 TreeEntry::VecTreeTy VectorizableTree;
3315
3316#ifndef NDEBUG
3317 /// Debug printer.
3318 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3319 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3320 VectorizableTree[Id]->dump();
3321 dbgs() << "\n";
3322 }
3323 }
3324#endif
3325
3326 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3327
3328 const TreeEntry *getTreeEntry(Value *V) const {
3329 return ScalarToTreeEntry.lookup(V);
3330 }
3331
3332 /// Check that the operand node of an alternate node does not generate a
3333 /// buildvector sequence. If it does, it is probably not worth building an
3334 /// alternate shuffle, since the number of buildvector operands plus the
3335 /// alternate instruction would exceed the number of buildvector instructions.
3336 /// \param S the instructions state of the analyzed values.
3337 /// \param VL list of the instructions with alternate opcodes.
3338 bool areAltOperandsProfitable(const InstructionsState &S,
3339 ArrayRef<Value *> VL) const;
3340
3341 /// Checks if the specified list of the instructions/values can be vectorized
3342 /// and fills required data before actual scheduling of the instructions.
3343 TreeEntry::EntryState getScalarsVectorizationState(
3344 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3345 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3346
3347 /// Maps a specific scalar to its tree entry.
3348 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3349
3350 /// List of scalars used in several vectorized nodes, mapped to the list of
3351 /// those nodes.
3352 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3353
3354 /// Maps a value to the proposed vectorizable size.
3355 SmallDenseMap<Value *, unsigned> InstrElementSize;
3356
3357 /// A list of scalars that we found that we need to keep as scalars.
3358 ValueSet MustGather;
3359
3360 /// A set of first non-schedulable values.
3361 ValueSet NonScheduledFirst;
3362
3363 /// A map between the vectorized entries and the last instructions in the
3364 /// bundles. The bundles are built in use order, not in the def order of the
3365 /// instructions, so we cannot rely directly on the last instruction in the
3366 /// bundle being the last instruction in program order during the
3367 /// vectorization process; since the basic blocks are affected, these
3368 /// instructions need to be pre-gathered beforehand.
3369 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3370
3371 /// List of gather nodes that depend on other gather/vector nodes and should
3372 /// be emitted after the vector instruction emission process to correctly
3373 /// handle the order of the vector instructions and shuffles.
3374 SetVector<const TreeEntry *> PostponedGathers;
3375
3376 using ValueToGatherNodesMap =
3377 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3378 ValueToGatherNodesMap ValueToGatherNodes;
3379
3380 /// This POD struct describes one external user in the vectorized tree.
3381 struct ExternalUser {
3382 ExternalUser(Value *S, llvm::User *U, int L)
3383 : Scalar(S), User(U), Lane(L) {}
3384
3385 // Which scalar in our function.
3386 Value *Scalar;
3387
3388 // Which user that uses the scalar.
3389 llvm::User *User;
3390
3391 // Which lane does the scalar belong to.
3392 int Lane;
3393 };
3394 using UserList = SmallVector<ExternalUser, 16>;
3395
3396 /// Checks if two instructions may access the same memory.
3397 ///
3398 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3399 /// is invariant in the calling loop.
3400 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3401 Instruction *Inst2) {
3402 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3403 return true;
3404 // First check if the result is already in the cache.
3405 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3406 auto It = AliasCache.find(Key);
3407 if (It != AliasCache.end())
3408 return It->second;
3409 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3410 // Store the result in the cache.
3411 AliasCache.try_emplace(Key, Aliased);
3412 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3413 return Aliased;
3414 }
3415
3416 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3417
3418 /// Cache for alias results.
3419 /// TODO: consider moving this to the AliasAnalysis itself.
3420 DenseMap<AliasCacheKey, bool> AliasCache;
3421
3422 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3423 // globally through SLP because we don't perform any action which
3424 // invalidates capture results.
3425 BatchAAResults BatchAA;
3426
3427 /// Temporary store for deleted instructions. Instructions will be deleted
3428 /// eventually when the BoUpSLP is destructed. The deferral is required to
3429 /// ensure that there are no incorrect collisions in the AliasCache, which
3430 /// can happen if a new instruction is allocated at the same address as a
3431 /// previously deleted instruction.
3432 DenseSet<Instruction *> DeletedInstructions;
3433
3434 /// Set of the instructions already being analyzed for reductions.
3435 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3436
3437 /// Set of hashes for the list of reduction values already being analyzed.
3438 DenseSet<size_t> AnalyzedReductionVals;
3439
3440 /// Values that have already been analyzed for minimal bitwidth and found to
3441 /// be non-profitable.
3442 DenseSet<Value *> AnalyzedMinBWVals;
3443
3444 /// A list of values that need to be extracted out of the tree.
3445 /// This list holds pairs of (Internal Scalar : External User). External User
3446 /// can be nullptr, meaning that this Internal Scalar will be used later,
3447 /// after vectorization.
3448 UserList ExternalUses;
3449
3450 /// A list of GEPs which can be replaced by scalar GEPs instead of
3451 /// extractelement instructions.
3452 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3453
3454 /// Values used only by @llvm.assume calls.
3455 SmallPtrSet<const Value *, 32> EphValues;
3456
3457 /// Holds all of the instructions that we gathered, shuffle instructions and
3458 /// extractelements.
3459 SetVector<Instruction *> GatherShuffleExtractSeq;
3460
3461 /// A list of blocks that we are going to CSE.
3462 DenseSet<BasicBlock *> CSEBlocks;
3463
3464 /// Contains all scheduling relevant data for an instruction.
3465 /// A ScheduleData either represents a single instruction or a member of an
3466 /// instruction bundle (= a group of instructions which is combined into a
3467 /// vector instruction).
3468 struct ScheduleData {
3469 // The initial value for the dependency counters. It means that the
3470 // dependencies are not calculated yet.
3471 enum { InvalidDeps = -1 };
3472
3473 ScheduleData() = default;
3474
3475 void init(int BlockSchedulingRegionID, Value *OpVal) {
3476 FirstInBundle = this;
3477 NextInBundle = nullptr;
3478 NextLoadStore = nullptr;
3479 IsScheduled = false;
3480 SchedulingRegionID = BlockSchedulingRegionID;
3481 clearDependencies();
3482 OpValue = OpVal;
3483 TE = nullptr;
3484 }
3485
3486 /// Verify basic self consistency properties
3487 void verify() {
3488 if (hasValidDependencies()) {
3489 assert(UnscheduledDeps <= Dependencies && "invariant");
3490 } else {
3491 assert(UnscheduledDeps == Dependencies && "invariant");
3492 }
3493
3494 if (IsScheduled) {
3495 assert(isSchedulingEntity() &&
3496 "unexpected scheduled state");
3497 for (const ScheduleData *BundleMember = this; BundleMember;
3498 BundleMember = BundleMember->NextInBundle) {
3499 assert(BundleMember->hasValidDependencies() &&
3500 BundleMember->UnscheduledDeps == 0 &&
3501 "unexpected scheduled state");
3502 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3503 "only bundle is marked scheduled");
3504 }
3505 }
3506
3507 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3508 "all bundle members must be in same basic block");
3509 }
3510
3511 /// Returns true if the dependency information has been calculated.
3512 /// Note that dependency validity can vary between instructions within
3513 /// a single bundle.
3514 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3515
3516 /// Returns true for single instructions and for bundle representatives
3517 /// (= the head of a bundle).
3518 bool isSchedulingEntity() const { return FirstInBundle == this; }
3519
3520 /// Returns true if it represents an instruction bundle and not only a
3521 /// single instruction.
3522 bool isPartOfBundle() const {
3523 return NextInBundle != nullptr || FirstInBundle != this || TE;
3524 }
3525
3526 /// Returns true if it is ready for scheduling, i.e. it has no more
3527 /// unscheduled depending instructions/bundles.
3528 bool isReady() const {
3529 assert(isSchedulingEntity() &&
3530 "can't consider non-scheduling entity for ready list");
3531 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3532 }
3533
3534 /// Modifies the number of unscheduled dependencies for this instruction,
3535 /// and returns the number of remaining dependencies for the containing
3536 /// bundle.
3537 int incrementUnscheduledDeps(int Incr) {
3538 assert(hasValidDependencies() &&
3539 "increment of unscheduled deps would be meaningless");
3540 UnscheduledDeps += Incr;
3541 return FirstInBundle->unscheduledDepsInBundle();
3542 }
3543
3544 /// Sets the number of unscheduled dependencies to the number of
3545 /// dependencies.
3546 void resetUnscheduledDeps() {
3547 UnscheduledDeps = Dependencies;
3548 }
3549
3550 /// Clears all dependency information.
3551 void clearDependencies() {
3552 Dependencies = InvalidDeps;
3553 resetUnscheduledDeps();
3554 MemoryDependencies.clear();
3555 ControlDependencies.clear();
3556 }
3557
3558 int unscheduledDepsInBundle() const {
3559 assert(isSchedulingEntity() && "only meaningful on the bundle");
3560 int Sum = 0;
3561 for (const ScheduleData *BundleMember = this; BundleMember;
3562 BundleMember = BundleMember->NextInBundle) {
3563 if (BundleMember->UnscheduledDeps == InvalidDeps)
3564 return InvalidDeps;
3565 Sum += BundleMember->UnscheduledDeps;
3566 }
3567 return Sum;
3568 }
3569
3570 void dump(raw_ostream &os) const {
3571 if (!isSchedulingEntity()) {
3572 os << "/ " << *Inst;
3573 } else if (NextInBundle) {
3574 os << '[' << *Inst;
3575 ScheduleData *SD = NextInBundle;
3576 while (SD) {
3577 os << ';' << *SD->Inst;
3578 SD = SD->NextInBundle;
3579 }
3580 os << ']';
3581 } else {
3582 os << *Inst;
3583 }
3584 }
3585
3586 Instruction *Inst = nullptr;
3587
3588 /// Opcode of the current instruction in the schedule data.
3589 Value *OpValue = nullptr;
3590
3591 /// The TreeEntry that this instruction corresponds to.
3592 TreeEntry *TE = nullptr;
3593
3594 /// Points to the head in an instruction bundle (and always to this for
3595 /// single instructions).
3596 ScheduleData *FirstInBundle = nullptr;
3597
3598 /// Single linked list of all instructions in a bundle. Null if it is a
3599 /// single instruction.
3600 ScheduleData *NextInBundle = nullptr;
3601
3602 /// Single linked list of all memory instructions (e.g. load, store, call)
3603 /// in the block - until the end of the scheduling region.
3604 ScheduleData *NextLoadStore = nullptr;
3605
3606 /// The dependent memory instructions.
3607 /// This list is derived on demand in calculateDependencies().
3608 SmallVector<ScheduleData *, 4> MemoryDependencies;
3609
3610 /// List of instructions which this instruction could be control dependent
3611 /// on. Allowing such nodes to be scheduled below this one could introduce
3612 /// a runtime fault which didn't exist in the original program.
3613 /// ex: this is a load or udiv following a readonly call which inf loops
3614 SmallVector<ScheduleData *, 4> ControlDependencies;
3615
3616 /// This ScheduleData is in the current scheduling region if this matches
3617 /// the current SchedulingRegionID of BlockScheduling.
3618 int SchedulingRegionID = 0;
3619
3620 /// Used for getting a "good" final ordering of instructions.
3621 int SchedulingPriority = 0;
3622
3623 /// The number of dependencies. Consists of the number of users of the
3624 /// instruction plus the number of dependent memory instructions (if any).
3625 /// This value is calculated on demand.
3626 /// If InvalidDeps, the number of dependencies is not calculated yet.
3627 int Dependencies = InvalidDeps;
3628
3629 /// The number of dependencies minus the number of dependencies of scheduled
3630 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3631 /// for scheduling.
3632 /// Note that this is negative as long as Dependencies is not calculated.
3633 int UnscheduledDeps = InvalidDeps;
3634
3635 /// True if this instruction is scheduled (or considered as scheduled in the
3636 /// dry-run).
3637 bool IsScheduled = false;
3638 };
3639
3640#ifndef NDEBUG
3641 friend inline raw_ostream &operator<<(raw_ostream &os,
3642 const BoUpSLP::ScheduleData &SD) {
3643 SD.dump(os);
3644 return os;
3645 }
3646#endif
3647
3648 friend struct GraphTraits<BoUpSLP *>;
3649 friend struct DOTGraphTraits<BoUpSLP *>;
3650
3651 /// Contains all scheduling data for a basic block.
3652 /// It does not schedule instructions that are not memory read/write
3653 /// instructions and whose operands are either constants, arguments, phis, or
3654 /// instructions from other blocks, or whose users are phis or live in other
3655 /// blocks. The resulting vector instructions can be placed at the
3656 /// beginning of the basic block without scheduling (if the operands do not
3657 /// need to be scheduled) or at the end of the block (if the users are outside
3658 /// of the block). This saves some compile time and memory used by the
3659 /// compiler.
3660 /// ScheduleData is assigned to each instruction between the boundaries of
3661 /// the tree entry, even to those that are not part of the graph. It is
3662 /// required to correctly follow the dependencies between the instructions and
3663 /// to schedule them correctly. ScheduleData is not allocated for
3664 /// instructions that do not require scheduling, such as phis, nodes containing
3665 /// only extractelements/insertelements, or nodes whose instructions have
3666 /// uses/operands outside of the block.
3667 struct BlockScheduling {
3668 BlockScheduling(BasicBlock *BB)
3669 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3670
3671 void clear() {
3672 ReadyInsts.clear();
3673 ScheduleStart = nullptr;
3674 ScheduleEnd = nullptr;
3675 FirstLoadStoreInRegion = nullptr;
3676 LastLoadStoreInRegion = nullptr;
3677 RegionHasStackSave = false;
3678
3679 // Reduce the maximum schedule region size by the size of the
3680 // previous scheduling run.
3681 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3682 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3683 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3684 ScheduleRegionSize = 0;
3685
3686 // Make a new scheduling region, i.e. all existing ScheduleData is not
3687 // in the new region yet.
3688 ++SchedulingRegionID;
3689 }
3690
3691 ScheduleData *getScheduleData(Instruction *I) {
3692 if (BB != I->getParent())
3693 // Avoid lookup if can't possibly be in map.
3694 return nullptr;
3695 ScheduleData *SD = ScheduleDataMap.lookup(I);
3696 if (SD && isInSchedulingRegion(SD))
3697 return SD;
3698 return nullptr;
3699 }
3700
3701 ScheduleData *getScheduleData(Value *V) {
3702 if (auto *I = dyn_cast<Instruction>(V))
3703 return getScheduleData(I);
3704 return nullptr;
3705 }
3706
3707 ScheduleData *getScheduleData(Value *V, Value *Key) {
3708 if (V == Key)
3709 return getScheduleData(V);
3710 auto I = ExtraScheduleDataMap.find(V);
3711 if (I != ExtraScheduleDataMap.end()) {
3712 ScheduleData *SD = I->second.lookup(Key);
3713 if (SD && isInSchedulingRegion(SD))
3714 return SD;
3715 }
3716 return nullptr;
3717 }
3718
3719 bool isInSchedulingRegion(ScheduleData *SD) const {
3720 return SD->SchedulingRegionID == SchedulingRegionID;
3721 }
3722
3723 /// Marks an instruction as scheduled and puts all dependent ready
3724 /// instructions into the ready-list.
3725 template <typename ReadyListType>
3726 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3727 SD->IsScheduled = true;
3728 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3729
3730 for (ScheduleData *BundleMember = SD; BundleMember;
3731 BundleMember = BundleMember->NextInBundle) {
3732 if (BundleMember->Inst != BundleMember->OpValue)
3733 continue;
3734
3735 // Handle the def-use chain dependencies.
3736
3737 // Decrement the unscheduled counter and insert to ready list if ready.
3738 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3739 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3740 if (OpDef && OpDef->hasValidDependencies() &&
3741 OpDef->incrementUnscheduledDeps(-1) == 0) {
3742 // There are no more unscheduled dependencies after
3743 // decrementing, so we can put the dependent instruction
3744 // into the ready list.
3745 ScheduleData *DepBundle = OpDef->FirstInBundle;
3746 assert(!DepBundle->IsScheduled &&
3747 "already scheduled bundle gets ready");
3748 ReadyList.insert(DepBundle);
3749 LLVM_DEBUG(dbgs()
3750 << "SLP: gets ready (def): " << *DepBundle << "\n");
3751 }
3752 });
3753 };
3754
3755 // If BundleMember is a vector bundle, its operands may have been
3756 // reordered during buildTree(). We therefore need to get its operands
3757 // through the TreeEntry.
3758 if (TreeEntry *TE = BundleMember->TE) {
3759 // Need to search for the lane since the tree entry can be reordered.
3760 int Lane = std::distance(TE->Scalars.begin(),
3761 find(TE->Scalars, BundleMember->Inst));
3762 assert(Lane >= 0 && "Lane not set");
3763
3764 // Since vectorization tree is being built recursively this assertion
3765 // ensures that the tree entry has all operands set before reaching
3766 // this code. Couple of exceptions known at the moment are extracts
3767 // where their second (immediate) operand is not added. Since
3768 // immediates do not affect scheduler behavior this is considered
3769 // okay.
3770 auto *In = BundleMember->Inst;
3771 assert(
3772 In &&
3773 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3774 In->getNumOperands() == TE->getNumOperands()) &&
3775 "Missed TreeEntry operands?");
3776 (void)In; // fake use to avoid build failure when assertions disabled
3777
3778 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3779 OpIdx != NumOperands; ++OpIdx)
3780 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3781 DecrUnsched(I);
3782 } else {
3783 // If BundleMember is a stand-alone instruction, no operand reordering
3784 // has taken place, so we directly access its operands.
3785 for (Use &U : BundleMember->Inst->operands())
3786 if (auto *I = dyn_cast<Instruction>(U.get()))
3787 DecrUnsched(I);
3788 }
3789 // Handle the memory dependencies.
3790 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3791 if (MemoryDepSD->hasValidDependencies() &&
3792 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3793 // There are no more unscheduled dependencies after decrementing,
3794 // so we can put the dependent instruction into the ready list.
3795 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3796 assert(!DepBundle->IsScheduled &&
3797 "already scheduled bundle gets ready");
3798 ReadyList.insert(DepBundle);
3800 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3801 }
3802 }
3803 // Handle the control dependencies.
3804 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3805 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3806 // There are no more unscheduled dependencies after decrementing,
3807 // so we can put the dependent instruction into the ready list.
3808 ScheduleData *DepBundle = DepSD->FirstInBundle;
3809 assert(!DepBundle->IsScheduled &&
3810 "already scheduled bundle gets ready");
3811 ReadyList.insert(DepBundle);
3813 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3814 }
3815 }
3816 }
3817 }
3818
3819 /// Verify basic self consistency properties of the data structure.
3820 void verify() {
3821 if (!ScheduleStart)
3822 return;
3823
3824 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3825 ScheduleStart->comesBefore(ScheduleEnd) &&
3826 "Not a valid scheduling region?");
3827
3828 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3829 auto *SD = getScheduleData(I);
3830 if (!SD)
3831 continue;
3832 assert(isInSchedulingRegion(SD) &&
3833 "primary schedule data not in window?");
3834 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3835 "entire bundle in window!");
3836 (void)SD;
3837 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3838 }
3839
3840 for (auto *SD : ReadyInsts) {
3841 assert(SD->isSchedulingEntity() && SD->isReady() &&
3842 "item in ready list not ready?");
3843 (void)SD;
3844 }
3845 }
3846
3847 void doForAllOpcodes(Value *V,
3848 function_ref<void(ScheduleData *SD)> Action) {
3849 if (ScheduleData *SD = getScheduleData(V))
3850 Action(SD);
3851 auto I = ExtraScheduleDataMap.find(V);
3852 if (I != ExtraScheduleDataMap.end())
3853 for (auto &P : I->second)
3854 if (isInSchedulingRegion(P.second))
3855 Action(P.second);
3856 }
3857
3858 /// Put all instructions into the ReadyList which are ready for scheduling.
3859 template <typename ReadyListType>
3860 void initialFillReadyList(ReadyListType &ReadyList) {
3861 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3862 doForAllOpcodes(I, [&](ScheduleData *SD) {
3863 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3864 SD->isReady()) {
3865 ReadyList.insert(SD);
3866 LLVM_DEBUG(dbgs()
3867 << "SLP: initially in ready list: " << *SD << "\n");
3868 }
3869 });
3870 }
3871 }
3872
3873 /// Build a bundle from the ScheduleData nodes corresponding to the
3874 /// scalar instruction for each lane.
3875 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3876
3877 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3878 /// cyclic dependencies. This is only a dry-run, no instructions are
3879 /// actually moved at this stage.
3880 /// \returns the scheduling bundle. The returned Optional value is not
3881 /// std::nullopt if \p VL is allowed to be scheduled.
3882 std::optional<ScheduleData *>
3883 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3884 const InstructionsState &S);
3885
3886 /// Un-bundles a group of instructions.
3887 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3888
3889 /// Allocates schedule data chunk.
3890 ScheduleData *allocateScheduleDataChunks();
3891
3892 /// Extends the scheduling region so that V is inside the region.
3893 /// \returns true if the region size is within the limit.
3894 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3895
3896 /// Initialize the ScheduleData structures for new instructions in the
3897 /// scheduling region.
3898 void initScheduleData(Instruction *FromI, Instruction *ToI,
3899 ScheduleData *PrevLoadStore,
3900 ScheduleData *NextLoadStore);
3901
3902 /// Updates the dependency information of a bundle and of all instructions/
3903 /// bundles which depend on the original bundle.
3904 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3905 BoUpSLP *SLP);
3906
3907 /// Sets all instructions in the scheduling region to un-scheduled.
3908 void resetSchedule();
3909
3910 BasicBlock *BB;
3911
3912 /// Simple memory allocation for ScheduleData.
3913 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3914
3915 /// The size of a ScheduleData array in ScheduleDataChunks.
3916 int ChunkSize;
3917
3918 /// The allocator position in the current chunk, which is the last entry
3919 /// of ScheduleDataChunks.
3920 int ChunkPos;
3921
3922 /// Attaches ScheduleData to Instruction.
3923 /// Note that the mapping survives during all vectorization iterations, i.e.
3924 /// ScheduleData structures are recycled.
3925 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3926
3927 /// Attaches ScheduleData to Instruction with the leading key.
3928 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3929 ExtraScheduleDataMap;
3930
3931 /// The ready-list for scheduling (only used for the dry-run).
3932 SetVector<ScheduleData *> ReadyInsts;
3933
3934 /// The first instruction of the scheduling region.
3935 Instruction *ScheduleStart = nullptr;
3936
3937 /// The first instruction _after_ the scheduling region.
3938 Instruction *ScheduleEnd = nullptr;
3939
3940 /// The first memory accessing instruction in the scheduling region
3941 /// (can be null).
3942 ScheduleData *FirstLoadStoreInRegion = nullptr;
3943
3944 /// The last memory accessing instruction in the scheduling region
3945 /// (can be null).
3946 ScheduleData *LastLoadStoreInRegion = nullptr;
3947
3948 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3949 /// region? Used to optimize the dependence calculation for the
3950 /// common case where there isn't.
3951 bool RegionHasStackSave = false;
3952
3953 /// The current size of the scheduling region.
3954 int ScheduleRegionSize = 0;
3955
3956 /// The maximum size allowed for the scheduling region.
3957 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3958
3959 /// The ID of the scheduling region. For a new vectorization iteration this
3960 /// is incremented which "removes" all ScheduleData from the region.
3961 /// Make sure that the initial SchedulingRegionID is greater than the
3962 /// initial SchedulingRegionID in ScheduleData (which is 0).
3963 int SchedulingRegionID = 1;
3964 };
3965
3966 /// Attaches the BlockScheduling structures to basic blocks.
3967 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3968
3969 /// Performs the "real" scheduling. Done before vectorization is actually
3970 /// performed in a basic block.
3971 void scheduleBlock(BlockScheduling *BS);
3972
3973 /// List of users to ignore during scheduling and that don't need extracting.
3974 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3975
3976 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3977 /// sorted SmallVectors of unsigned.
3978 struct OrdersTypeDenseMapInfo {
3979 static OrdersType getEmptyKey() {
3980 OrdersType V;
3981 V.push_back(~1U);
3982 return V;
3983 }
3984
3985 static OrdersType getTombstoneKey() {
3986 OrdersType V;
3987 V.push_back(~2U);
3988 return V;
3989 }
3990
3991 static unsigned getHashValue(const OrdersType &V) {
3992 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3993 }
3994
3995 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3996 return LHS == RHS;
3997 }
3998 };
3999
4000 // Analysis and block reference.
4001 Function *F;
4002 ScalarEvolution *SE;
4003 TargetTransformInfo *TTI;
4004 TargetLibraryInfo *TLI;
4005 LoopInfo *LI;
4006 DominatorTree *DT;
4007 AssumptionCache *AC;
4008 DemandedBits *DB;
4009 const DataLayout *DL;
4010 OptimizationRemarkEmitter *ORE;
4011
4012 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4013 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4014
4015 /// Instruction builder to construct the vectorized tree.
4016 IRBuilder<TargetFolder> Builder;
4017
4018 /// A map of scalar integer values to the smallest bit width with which they
4019 /// can legally be represented. The values map to (width, signed) pairs,
4020 /// where "width" indicates the minimum bit width and "signed" is True if the
4021 /// value must be signed-extended, rather than zero-extended, back to its
4022 /// original width.
4023 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
4024
4025 /// Final size of the reduced vector, if the current graph represents the
4026 /// input for the reduction and it was possible to narrow the size of the
4027 /// reduction.
4028 unsigned ReductionBitWidth = 0;
4029
4030 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4031 /// type sizes, used in the tree.
4032 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4033
4034 /// Indices of the vectorized nodes, which are supposed to be the roots of the
4035 /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
4036 DenseSet<unsigned> ExtraBitWidthNodes;
4037};
4038
4039} // end namespace slpvectorizer
4040
4041template <> struct GraphTraits<BoUpSLP *> {
4042 using TreeEntry = BoUpSLP::TreeEntry;
4043
4044 /// NodeRef has to be a pointer per the GraphWriter.
4045 using NodeRef = TreeEntry *;
4046
4047 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4048
4049 /// Add the VectorizableTree to the index iterator to be able to return
4050 /// TreeEntry pointers.
4051 struct ChildIteratorType
4052 : public iterator_adaptor_base<
4053 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4054 ContainerTy &VectorizableTree;
4055
4056 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4057 ContainerTy &VT)
4058 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4059
4060 NodeRef operator*() { return I->UserTE; }
4061 };
4062
4063 static NodeRef getEntryNode(BoUpSLP &R) {
4064 return R.VectorizableTree[0].get();
4065 }
4066
4067 static ChildIteratorType child_begin(NodeRef N) {
4068 return {N->UserTreeIndices.begin(), N->Container};
4069 }
4070
4071 static ChildIteratorType child_end(NodeRef N) {
4072 return {N->UserTreeIndices.end(), N->Container};
4073 }
4074
4075 /// For the node iterator we just need to turn the TreeEntry iterator into a
4076 /// TreeEntry* iterator so that it dereferences to NodeRef.
4077 class nodes_iterator {
4078 using ItTy = ContainerTy::iterator;
4079 ItTy It;
4080
4081 public:
4082 nodes_iterator(const ItTy &It2) : It(It2) {}
4083 NodeRef operator*() { return It->get(); }
4084 nodes_iterator operator++() {
4085 ++It;
4086 return *this;
4087 }
4088 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4089 };
4090
4091 static nodes_iterator nodes_begin(BoUpSLP *R) {
4092 return nodes_iterator(R->VectorizableTree.begin());
4093 }
4094
4095 static nodes_iterator nodes_end(BoUpSLP *R) {
4096 return nodes_iterator(R->VectorizableTree.end());
4097 }
4098
4099 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4100};
4101
4102template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4103 using TreeEntry = BoUpSLP::TreeEntry;
4104
4105 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4106
4107 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4108 std::string Str;
4109 raw_string_ostream OS(Str);
4110 OS << Entry->Idx << ".\n";
4111 if (isSplat(Entry->Scalars))
4112 OS << "<splat> ";
4113 for (auto *V : Entry->Scalars) {
4114 OS << *V;
4115 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4116 return EU.Scalar == V;
4117 }))
4118 OS << " <extract>";
4119 OS << "\n";
4120 }
4121 return Str;
4122 }
4123
4124 static std::string getNodeAttributes(const TreeEntry *Entry,
4125 const BoUpSLP *) {
4126 if (Entry->isGather())
4127 return "color=red";
4128 if (Entry->State == TreeEntry::ScatterVectorize ||
4129 Entry->State == TreeEntry::StridedVectorize)
4130 return "color=blue";
4131 return "";
4132 }
4133};
4134
4135} // end namespace llvm
4136
4137 BoUpSLP::~BoUpSLP() {
4138 SmallVector<WeakTrackingVH> DeadInsts;
4139 for (auto *I : DeletedInstructions) {
4140 if (!I->getParent()) {
4141 // Temporarily insert instructions back to erase them from their parent
4142 // and from memory later.
4143 if (isa<PHINode>(I))
4144 // Phi nodes must be the very first instructions in the block.
4145 I->insertBefore(F->getEntryBlock(),
4146 F->getEntryBlock().getFirstNonPHIIt());
4147 else
4148 I->insertBefore(F->getEntryBlock().getTerminator());
4149 continue;
4150 }
4151 for (Use &U : I->operands()) {
4152 auto *Op = dyn_cast<Instruction>(U.get());
4153 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4154 wouldInstructionBeTriviallyDead(Op, TLI))
4155 DeadInsts.emplace_back(Op);
4156 }
4157 I->dropAllReferences();
4158 }
4159 for (auto *I : DeletedInstructions) {
4160 assert(I->use_empty() &&
4161 "trying to erase instruction with users.");
4162 I->eraseFromParent();
4163 }
4164
4165 // Cleanup any dead scalar code feeding the vectorized instructions
4166 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4167
4168#ifdef EXPENSIVE_CHECKS
4169 // If we could guarantee that this call is not extremely slow, we could
4170 // remove the ifdef limitation (see PR47712).
4171 assert(!verifyFunction(*F, &dbgs()));
4172#endif
4173}
4174
4175/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4176 /// contains the original mask for the scalars reused in the node. The
4177 /// procedure transforms this mask in accordance with the given \p Mask.
4178 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4179 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4180 "Expected non-empty mask.");
4181 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4182 Prev.swap(Reuses);
4183 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4184 if (Mask[I] != PoisonMaskElem)
4185 Reuses[Mask[I]] = Prev[I];
4186}
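// Illustrative sketch with hypothetical values: for Reuses = {0, 1, 2, 3} and
// Mask = {2, 0, 1, 3}, the loop above assigns Reuses[Mask[I]] = Prev[I] and
// produces Reuses = {1, 2, 0, 3}; entries of Mask equal to PoisonMaskElem
// simply drop the corresponding Prev element.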
4187
4188 /// Reorders the given \p Order according to the given \p Mask. \p Order is
4189/// the original order of the scalars. Procedure transforms the provided order
4190/// in accordance with the given \p Mask. If the resulting \p Order is just an
4191/// identity order, \p Order is cleared.
4192 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4193 bool BottomOrder = false) {
4194 assert(!Mask.empty() && "Expected non-empty mask.");
4195 unsigned Sz = Mask.size();
4196 if (BottomOrder) {
4197 SmallVector<unsigned> PrevOrder;
4198 if (Order.empty()) {
4199 PrevOrder.resize(Sz);
4200 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4201 } else {
4202 PrevOrder.swap(Order);
4203 }
4204 Order.assign(Sz, Sz);
4205 for (unsigned I = 0; I < Sz; ++I)
4206 if (Mask[I] != PoisonMaskElem)
4207 Order[I] = PrevOrder[Mask[I]];
4208 if (all_of(enumerate(Order), [&](const auto &Data) {
4209 return Data.value() == Sz || Data.index() == Data.value();
4210 })) {
4211 Order.clear();
4212 return;
4213 }
4214 fixupOrderingIndices(Order);
4215 return;
4216 }
4217 SmallVector<int> MaskOrder;
4218 if (Order.empty()) {
4219 MaskOrder.resize(Sz);
4220 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4221 } else {
4222 inversePermutation(Order, MaskOrder);
4223 }
4224 reorderReuses(MaskOrder, Mask);
4225 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4226 Order.clear();
4227 return;
4228 }
4229 Order.assign(Sz, Sz);
4230 for (unsigned I = 0; I < Sz; ++I)
4231 if (MaskOrder[I] != PoisonMaskElem)
4232 Order[MaskOrder[I]] = I;
4233 fixupOrderingIndices(Order);
4234}
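// Illustrative sketch with hypothetical values: with an empty Order (treated
// as identity), Sz = 4, Mask = {1, 0, 3, 2} and BottomOrder = false, the code
// above builds MaskOrder = {0, 1, 2, 3}, reorders it through the mask into
// {1, 0, 3, 2}, which is not an identity mask, and therefore ends with
// Order = {1, 0, 3, 2}; an identity result would have cleared Order instead.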
4235
4236std::optional<BoUpSLP::OrdersType>
4237BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4238 assert(TE.isGather() && "Expected gather node only.");
4239 // Try to find subvector extract/insert patterns and reorder only such
4240 // patterns.
4241 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4242 Type *ScalarTy = GatheredScalars.front()->getType();
4243 int NumScalars = GatheredScalars.size();
4244 if (!isValidElementType(ScalarTy))
4245 return std::nullopt;
4246 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4247 int NumParts = TTI->getNumberOfParts(VecTy);
4248 if (NumParts == 0 || NumParts >= NumScalars)
4249 NumParts = 1;
4250 SmallVector<int> ExtractMask;
4251 SmallVector<int> Mask;
4252 SmallVector<SmallVector<const TreeEntry *>> Entries;
4253 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
4254 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4255 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
4256 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4257 /*ForOrder=*/true);
4258 // No shuffled operands - ignore.
4259 if (GatherShuffles.empty() && ExtractShuffles.empty())
4260 return std::nullopt;
4261 OrdersType CurrentOrder(NumScalars, NumScalars);
4262 if (GatherShuffles.size() == 1 &&
4263 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4264 Entries.front().front()->isSame(TE.Scalars)) {
4265 // Perfect match in the graph, will reuse the previously vectorized
4266 // node. Cost is 0.
4267 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4268 return CurrentOrder;
4269 }
4270 auto IsSplatMask = [](ArrayRef<int> Mask) {
4271 int SingleElt = PoisonMaskElem;
4272 return all_of(Mask, [&](int I) {
4273 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4274 SingleElt = I;
4275 return I == PoisonMaskElem || I == SingleElt;
4276 });
4277 };
4278 // Exclusive broadcast mask - ignore.
4279 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4280 (Entries.size() != 1 ||
4281 Entries.front().front()->ReorderIndices.empty())) ||
4282 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4283 return std::nullopt;
4284 SmallBitVector ShuffledSubMasks(NumParts);
4285 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4286 ArrayRef<int> Mask, int PartSz, int NumParts,
4287 function_ref<unsigned(unsigned)> GetVF) {
4288 for (int I : seq<int>(0, NumParts)) {
4289 if (ShuffledSubMasks.test(I))
4290 continue;
4291 const int VF = GetVF(I);
4292 if (VF == 0)
4293 continue;
4294 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4295 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4296 // Shuffle of at least 2 vectors - ignore.
4297 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4298 std::fill(Slice.begin(), Slice.end(), NumScalars);
4299 ShuffledSubMasks.set(I);
4300 continue;
4301 }
4302 // Try to include as many elements from the mask as possible.
4303 int FirstMin = INT_MAX;
4304 int SecondVecFound = false;
4305 for (int K : seq<int>(Limit)) {
4306 int Idx = Mask[I * PartSz + K];
4307 if (Idx == PoisonMaskElem) {
4308 Value *V = GatheredScalars[I * PartSz + K];
4309 if (isConstant(V) && !isa<PoisonValue>(V)) {
4310 SecondVecFound = true;
4311 break;
4312 }
4313 continue;
4314 }
4315 if (Idx < VF) {
4316 if (FirstMin > Idx)
4317 FirstMin = Idx;
4318 } else {
4319 SecondVecFound = true;
4320 break;
4321 }
4322 }
4323 FirstMin = (FirstMin / PartSz) * PartSz;
4324 // Shuffle of at least 2 vectors - ignore.
4325 if (SecondVecFound) {
4326 std::fill(Slice.begin(), Slice.end(), NumScalars);
4327 ShuffledSubMasks.set(I);
4328 continue;
4329 }
4330 for (int K : seq<int>(Limit)) {
4331 int Idx = Mask[I * PartSz + K];
4332 if (Idx == PoisonMaskElem)
4333 continue;
4334 Idx -= FirstMin;
4335 if (Idx >= PartSz) {
4336 SecondVecFound = true;
4337 break;
4338 }
4339 if (CurrentOrder[I * PartSz + Idx] >
4340 static_cast<unsigned>(I * PartSz + K) &&
4341 CurrentOrder[I * PartSz + Idx] !=
4342 static_cast<unsigned>(I * PartSz + Idx))
4343 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4344 }
4345 // Shuffle of at least 2 vectors - ignore.
4346 if (SecondVecFound) {
4347 std::fill(Slice.begin(), Slice.end(), NumScalars);
4348 ShuffledSubMasks.set(I);
4349 continue;
4350 }
4351 }
4352 };
4353 int PartSz = getPartNumElems(NumScalars, NumParts);
4354 if (!ExtractShuffles.empty())
4355 TransformMaskToOrder(
4356 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4357 if (!ExtractShuffles[I])
4358 return 0U;
4359 unsigned VF = 0;
4360 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4361 for (unsigned Idx : seq<unsigned>(Sz)) {
4362 int K = I * PartSz + Idx;
4363 if (ExtractMask[K] == PoisonMaskElem)
4364 continue;
4365 if (!TE.ReuseShuffleIndices.empty())
4366 K = TE.ReuseShuffleIndices[K];
4367 if (!TE.ReorderIndices.empty())
4368 K = std::distance(TE.ReorderIndices.begin(),
4369 find(TE.ReorderIndices, K));
4370 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4371 if (!EI)
4372 continue;
4373 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4374 ->getElementCount()
4375 .getKnownMinValue());
4376 }
4377 return VF;
4378 });
4379 // Check special corner case - single shuffle of the same entry.
4380 if (GatherShuffles.size() == 1 && NumParts != 1) {
4381 if (ShuffledSubMasks.any())
4382 return std::nullopt;
4383 PartSz = NumScalars;
4384 NumParts = 1;
4385 }
4386 if (!Entries.empty())
4387 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4388 if (!GatherShuffles[I])
4389 return 0U;
4390 return std::max(Entries[I].front()->getVectorFactor(),
4391 Entries[I].back()->getVectorFactor());
4392 });
4393 int NumUndefs =
4394 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4395 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4396 return std::nullopt;
4397 return std::move(CurrentOrder);
4398}
4399
4400static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4401 const TargetLibraryInfo &TLI,
4402 bool CompareOpcodes = true) {
4403 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4404 return false;
4405 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4406 if (!GEP1)
4407 return false;
4408 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4409 if (!GEP2)
4410 return false;
4411 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4412 ((isConstant(GEP1->getOperand(1)) &&
4413 isConstant(GEP2->getOperand(1))) ||
4414 !CompareOpcodes ||
4415 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4416 .getOpcode());
4417}
4418
4419/// Calculates minimal alignment as a common alignment.
4420template <typename T>
4422 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4423 for (Value *V : VL.drop_front())
4424 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4425 return CommonAlignment;
4426}
4427
4428/// Check if \p Order represents reverse order.
4430 unsigned Sz = Order.size();
4431 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4432 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4433 });
4434}
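// Illustrative sketch with hypothetical values: for Sz = 4, the order
// {3, 2, 1, 0} is recognized as reversed, and so is {3, 4, 1, 0}, because
// entries equal to Sz are treated as "don't care" positions by the check
// above.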
4435
4436 /// Checks if the provided list of pointers \p Pointers represents the strided
4437 /// pointers for type ElemTy. If they do not, std::nullopt is returned.
4438 /// Otherwise, if \p Inst is not specified, an initialized optional value is
4439 /// returned to show that the pointers represent strided pointers. If \p Inst
4440 /// is specified, the runtime stride is materialized before the given \p Inst.
4441 /// \returns std::nullopt if the pointers do not have a runtime stride;
4442 /// otherwise, nullptr or the actual stride value.
4443static std::optional<Value *>
4445 const DataLayout &DL, ScalarEvolution &SE,
4446 SmallVectorImpl<unsigned> &SortedIndices,
4447 Instruction *Inst = nullptr) {
4448 SmallVector<const SCEV *> SCEVs;
4449 const SCEV *PtrSCEVLowest = nullptr;
4450 const SCEV *PtrSCEVHighest = nullptr;
4451 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4452 // addresses).
4453 for (Value *Ptr : PointerOps) {
4454 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4455 if (!PtrSCEV)
4456 return std::nullopt;
4457 SCEVs.push_back(PtrSCEV);
4458 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4459 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4460 continue;
4461 }
4462 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4463 if (isa<SCEVCouldNotCompute>(Diff))
4464 return std::nullopt;
4465 if (Diff->isNonConstantNegative()) {
4466 PtrSCEVLowest = PtrSCEV;
4467 continue;
4468 }
4469 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4470 if (isa<SCEVCouldNotCompute>(Diff1))
4471 return std::nullopt;
4472 if (Diff1->isNonConstantNegative()) {
4473 PtrSCEVHighest = PtrSCEV;
4474 continue;
4475 }
4476 }
4477 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4478 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4479 if (isa<SCEVCouldNotCompute>(Dist))
4480 return std::nullopt;
4481 int Size = DL.getTypeStoreSize(ElemTy);
4482 auto TryGetStride = [&](const SCEV *Dist,
4483 const SCEV *Multiplier) -> const SCEV * {
4484 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4485 if (M->getOperand(0) == Multiplier)
4486 return M->getOperand(1);
4487 if (M->getOperand(1) == Multiplier)
4488 return M->getOperand(0);
4489 return nullptr;
4490 }
4491 if (Multiplier == Dist)
4492 return SE.getConstant(Dist->getType(), 1);
4493 return SE.getUDivExactExpr(Dist, Multiplier);
4494 };
4495 // Stride_in_elements = Dist / element_size * (num_elems - 1).
4496 const SCEV *Stride = nullptr;
4497 if (Size != 1 || SCEVs.size() > 2) {
4498 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4499 Stride = TryGetStride(Dist, Sz);
4500 if (!Stride)
4501 return std::nullopt;
4502 }
4503 if (!Stride || isa<SCEVConstant>(Stride))
4504 return std::nullopt;
4505 // Iterate through all pointers and check if all distances are
4506 // unique multiple of Stride.
4507 using DistOrdPair = std::pair<int64_t, int>;
4508 auto Compare = llvm::less_first();
4509 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4510 int Cnt = 0;
4511 bool IsConsecutive = true;
4512 for (const SCEV *PtrSCEV : SCEVs) {
4513 unsigned Dist = 0;
4514 if (PtrSCEV != PtrSCEVLowest) {
4515 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4516 const SCEV *Coeff = TryGetStride(Diff, Stride);
4517 if (!Coeff)
4518 return std::nullopt;
4519 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4520 if (!SC || isa<SCEVCouldNotCompute>(SC))
4521 return std::nullopt;
4522 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4523 SE.getMulExpr(Stride, SC)))
4524 ->isZero())
4525 return std::nullopt;
4526 Dist = SC->getAPInt().getZExtValue();
4527 }
4528 // If the strides are not the same or repeated, we can't vectorize.
4529 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4530 return std::nullopt;
4531 auto Res = Offsets.emplace(Dist, Cnt);
4532 if (!Res.second)
4533 return std::nullopt;
4534 // Consecutive order if the inserted element is the last one.
4535 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4536 ++Cnt;
4537 }
4538 if (Offsets.size() != SCEVs.size())
4539 return std::nullopt;
4540 SortedIndices.clear();
4541 if (!IsConsecutive) {
4542 // Fill SortedIndices array only if it is non-consecutive.
4543 SortedIndices.resize(PointerOps.size());
4544 Cnt = 0;
4545 for (const std::pair<int64_t, int> &Pair : Offsets) {
4546 SortedIndices[Cnt] = Pair.second;
4547 ++Cnt;
4548 }
4549 }
4550 if (!Inst)
4551 return nullptr;
4552 SCEVExpander Expander(SE, DL, "strided-load-vec");
4553 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4554}
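// Worked example for the runtime-stride calculation above (illustrative
// values only): for i32 loads from p, p + 4*s, p + 8*s and p + 12*s bytes,
// Dist is the SCEV 12*s, Size is 4 and SCEVs.size() - 1 is 3, so
// TryGetStride(Dist, 12) recovers the non-constant stride `s`. Each pointer's
// byte distance from the lowest one (0, 4, 8, 12) must then be a unique
// multiple of the element size and smaller than Size * SCEVs.size(); only
// then is the stride expanded for the strided-load form.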
4555
4556static std::pair<InstructionCost, InstructionCost>
4557 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4558             Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4559 Type *ScalarTy, VectorType *VecTy);
4560
4561 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4562 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4563 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4564 // Check that a vectorized load would load the same memory as a scalar
4565 // load. For example, we don't want to vectorize loads that are smaller
4566 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4567 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4568 // from such a struct, we read/write packed bits disagreeing with the
4569 // unvectorized version.
4570 Type *ScalarTy = VL0->getType();
4571
4572 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4573 return LoadsState::Gather;
4574
4575 // Make sure all loads in the bundle are simple - we can't vectorize
4576 // atomic or volatile loads.
4577 PointerOps.clear();
4578 const unsigned Sz = VL.size();
4579 PointerOps.resize(Sz);
4580 auto *POIter = PointerOps.begin();
4581 for (Value *V : VL) {
4582 auto *L = cast<LoadInst>(V);
4583 if (!L->isSimple())
4584 return LoadsState::Gather;
4585 *POIter = L->getPointerOperand();
4586 ++POIter;
4587 }
4588
4589 Order.clear();
4590 auto *VecTy = getWidenedType(ScalarTy, Sz);
4591 // Check the order of pointer operands or that all pointers are the same.
4592 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4593 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4594 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4595 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4596 "supported with VectorizeNonPowerOf2");
4597 return LoadsState::Gather;
4598 }
4599
4600 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4601 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4602 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4603 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4604     return LoadsState::StridedVectorize;
4605   if (IsSorted || all_of(PointerOps, [&](Value *P) {
4606 return arePointersCompatible(P, PointerOps.front(), *TLI);
4607 })) {
4608 if (IsSorted) {
4609 Value *Ptr0;
4610 Value *PtrN;
4611 if (Order.empty()) {
4612 Ptr0 = PointerOps.front();
4613 PtrN = PointerOps.back();
4614 } else {
4615 Ptr0 = PointerOps[Order.front()];
4616 PtrN = PointerOps[Order.back()];
4617 }
4618 std::optional<int> Diff =
4619 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4620 // Check that the sorted loads are consecutive.
4621 if (static_cast<unsigned>(*Diff) == Sz - 1)
4622 return LoadsState::Vectorize;
4623 // Simple check if not a strided access - clear order.
4624 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4625 // Try to generate strided load node if:
4626 // 1. Target with strided load support is detected.
4627 // 2. The number of loads is greater than MinProfitableStridedLoads,
4628 // or the potential stride <= MaxProfitableLoadStride and the
4629 // potential stride is power-of-2 (to avoid perf regressions for the very
4630 // small number of loads) and max distance > number of loads, or potential
4631 // stride is -1.
4632 // 3. The loads are ordered, or number of unordered loads <=
4633 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4634 // (this check is to avoid extra costs for very expensive shuffles).
4635 // 4. Any pointer operand is an instruction with the users outside of the
4636 // current graph (for masked gathers extra extractelement instructions
4637 // might be required).
4638 auto IsAnyPointerUsedOutGraph =
4639 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
4640 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
4641 return !getTreeEntry(U) && !MustGather.contains(U);
4642 });
4643 });
4644       if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
4645                                 ((Sz > MinProfitableStridedLoads ||
4646                                   (static_cast<unsigned>(std::abs(*Diff)) <=
4647                                        MaxProfitableLoadStride * Sz &&
4648                                    isPowerOf2_32(std::abs(*Diff)))) &&
4649 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4650 *Diff == -(static_cast<int>(Sz) - 1))) {
4651 int Stride = *Diff / static_cast<int>(Sz - 1);
4652 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4653 Align Alignment =
4654 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4655 ->getAlign();
4656 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4657 // Iterate through all pointers and check if all distances are
4658             // unique multiples of Stride.
4659 SmallSet<int, 4> Dists;
4660 for (Value *Ptr : PointerOps) {
4661 int Dist = 0;
4662 if (Ptr == PtrN)
4663 Dist = *Diff;
4664 else if (Ptr != Ptr0)
4665 Dist =
4666 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4667 // If the strides are not the same or repeated, we can't
4668 // vectorize.
4669 if (((Dist / Stride) * Stride) != Dist ||
4670 !Dists.insert(Dist).second)
4671 break;
4672 }
4673 if (Dists.size() == Sz)
4674               return LoadsState::StridedVectorize;
4675           }
4676 }
4677 }
4678 }
4679 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4680 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4681 unsigned MinVF = getMinVF(Sz);
4682 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4683 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4684 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4685 unsigned VectorizedCnt = 0;
4686       SmallVector<LoadsState> States;
4687       for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4688 Cnt += VF, ++VectorizedCnt) {
4689 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4690         SmallVector<unsigned> Order;
4691         SmallVector<Value *> PointerOps;
4692 LoadsState LS =
4693 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4694 /*TryRecursiveCheck=*/false);
4695 // Check that the sorted loads are consecutive.
4696 if (LS == LoadsState::Gather)
4697 break;
4698         // If the loads need reordering, treat them as a high-cost masked gather for now.
4699         if ((LS == LoadsState::Vectorize ||
4700              LS == LoadsState::StridedVectorize) &&
4701             !Order.empty() && !isReverseOrder(Order))
4702           LS = LoadsState::ScatterVectorize;
4703         States.push_back(LS);
4704 }
4705       // Can be vectorized later as a series of loads/insertelements.
4706 if (VectorizedCnt == VL.size() / VF) {
4707         // Compare masked gather cost and loads + insertsubvector costs.
4708         TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4709         auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4710 TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
4711 CostKind, ScalarTy, VecTy);
4712 InstructionCost MaskedGatherCost =
4713             TTI.getGatherScatterOpCost(
4714                 Instruction::Load, VecTy,
4715 cast<LoadInst>(VL0)->getPointerOperand(),
4716 /*VariableMask=*/false, CommonAlignment, CostKind) +
4717 VectorGEPCost - ScalarGEPCost;
4718 InstructionCost VecLdCost = 0;
4719 auto *SubVecTy = getWidenedType(ScalarTy, VF);
4720 for (auto [I, LS] : enumerate(States)) {
4721 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4722 switch (LS) {
4723 case LoadsState::Vectorize: {
4724 auto [ScalarGEPCost, VectorGEPCost] =
4725 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4726 LI0->getPointerOperand(), Instruction::Load,
4727 CostKind, ScalarTy, SubVecTy);
4728 VecLdCost += TTI.getMemoryOpCost(
4729 Instruction::Load, SubVecTy, LI0->getAlign(),
4730 LI0->getPointerAddressSpace(), CostKind,
4731                 TTI::OperandValueInfo()) +
4732                 VectorGEPCost - ScalarGEPCost;
4733 break;
4734 }
4735           case LoadsState::StridedVectorize: {
4736             auto [ScalarGEPCost, VectorGEPCost] =
4737 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4738 LI0->getPointerOperand(), Instruction::Load,
4739 CostKind, ScalarTy, SubVecTy);
4740 VecLdCost +=
4741                 TTI.getStridedMemoryOpCost(
4742                     Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4743 /*VariableMask=*/false, CommonAlignment, CostKind) +
4744 VectorGEPCost - ScalarGEPCost;
4745 break;
4746 }
4747           case LoadsState::ScatterVectorize: {
4748             auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4749 TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4750 LI0->getPointerOperand(), Instruction::GetElementPtr,
4751 CostKind, ScalarTy, SubVecTy);
4752 VecLdCost +=
4753                 TTI.getGatherScatterOpCost(
4754                     Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4755 /*VariableMask=*/false, CommonAlignment, CostKind) +
4756 VectorGEPCost - ScalarGEPCost;
4757 break;
4758 }
4759 case LoadsState::Gather:
4760             llvm_unreachable(
4761                 "Expected only consecutive, strided or masked gather loads.");
4762 }
4763 SmallVector<int> ShuffleMask(VL.size());
4764 for (int Idx : seq<int>(0, VL.size()))
4765 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4766 VecLdCost +=
4767 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
4768 CostKind, I * VF, SubVecTy);
4769 }
4770 // If masked gather cost is higher - better to vectorize, so
4771 // consider it as a gather node. It will be better estimated
4772 // later.
4773 if (MaskedGatherCost >= VecLdCost)
4774 return true;
4775 }
4776 }
4777 return false;
4778 };
4779 // TODO: need to improve analysis of the pointers, if not all of them are
4780 // GEPs or have > 2 operands, we end up with a gather node, which just
4781 // increases the cost.
4782 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4783 bool ProfitableGatherPointers =
4784 L && Sz > 2 &&
4785 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4786 return L->isLoopInvariant(V);
4787 })) <= Sz / 2;
4788 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4789 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4790 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4791 (GEP && GEP->getNumOperands() == 2 &&
4792 isa<Constant, Instruction>(GEP->getOperand(1)));
4793 })) {
4794 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4795 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4796 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4797 // Check if potential masked gather can be represented as series
4798 // of loads + insertsubvectors.
4799 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4800 // If masked gather cost is higher - better to vectorize, so
4801 // consider it as a gather node. It will be better estimated
4802 // later.
4803 return LoadsState::Gather;
4804 }
4805       return LoadsState::ScatterVectorize;
4806     }
4807 }
4808 }
4809
4810 return LoadsState::Gather;
4811}
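// Summary of the decision ladder implemented above: consecutive pointers give
// LoadsState::Vectorize, a supported constant or runtime stride gives
// LoadsState::StridedVectorize, profitable masked gathers give
// LoadsState::ScatterVectorize, and everything else falls back to
// LoadsState::Gather. The CheckForShuffledLoads probe only recurses one level
// (TryRecursiveCheck is passed as false) to keep compile time bounded.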
4812
4813 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4814                                    const DataLayout &DL, ScalarEvolution &SE,
4815 SmallVectorImpl<unsigned> &SortedIndices) {
4816   assert(llvm::all_of(
4817              VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4818 "Expected list of pointer operands.");
4819 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4820 // Ptr into, sort and return the sorted indices with values next to one
4821 // another.
4822   DenseMap<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4823   Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4824
4825 unsigned Cnt = 1;
4826 for (Value *Ptr : VL.drop_front()) {
4827 bool Found = any_of(Bases, [&](auto &Base) {
4828 std::optional<int> Diff =
4829 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4830 /*StrictCheck=*/true);
4831 if (!Diff)
4832 return false;
4833
4834 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4835 return true;
4836 });
4837
4838 if (!Found) {
4839 // If we haven't found enough to usefully cluster, return early.
4840 if (Bases.size() > VL.size() / 2 - 1)
4841 return false;
4842
4843 // Not found already - add a new Base
4844 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4845 }
4846 }
4847
4848   // For each of the bases, sort the pointers by Offset and check whether the
4849   // pointers within any base become consecutive.
4850 bool AnyConsecutive = false;
4851 for (auto &Base : Bases) {
4852 auto &Vec = Base.second;
4853 if (Vec.size() > 1) {
4854 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4855 const std::tuple<Value *, int, unsigned> &Y) {
4856 return std::get<1>(X) < std::get<1>(Y);
4857 });
4858 int InitialOffset = std::get<1>(Vec[0]);
4859 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4860 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4861 });
4862 }
4863 }
4864
4865 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4866 SortedIndices.clear();
4867 if (!AnyConsecutive)
4868 return false;
4869
4870 // If we have a better order, also sort the base pointers by increasing
4871 // (variable) values if possible, to try and keep the order more regular. In
4872 // order to create a valid strict-weak order we cluster by the Root of gep
4873 // chains and sort within each.
4874   SmallVector<std::tuple<Value *, Value *, Value *>> SortedBases;
4875   for (auto &Base : Bases) {
4876 Value *Strip = Base.first->stripInBoundsConstantOffsets();
4877 Value *Root = Strip;
4878 while (auto *Gep = dyn_cast<GetElementPtrInst>(Root))
4879 Root = Gep->getOperand(0);
4880 SortedBases.emplace_back(Base.first, Strip, Root);
4881 }
4882 auto *Begin = SortedBases.begin();
4883 auto *End = SortedBases.end();
4884 while (Begin != End) {
4885 Value *Root = std::get<2>(*Begin);
4886 auto *Mid = std::stable_partition(
4887 Begin, End, [&Root](auto V) { return std::get<2>(V) == Root; });
4888     DenseMap<Value *, DenseMap<Value *, bool>> LessThan;
4889     for (auto I = Begin; I < Mid; ++I)
4890 LessThan.try_emplace(std::get<1>(*I));
4891 for (auto I = Begin; I < Mid; ++I) {
4892 Value *V = std::get<1>(*I);
4893 while (auto *Gep = dyn_cast<GetElementPtrInst>(V)) {
4894 V = Gep->getOperand(0);
4895 if (LessThan.contains(V))
4896 LessThan[V][std::get<1>(*I)] = true;
4897 }
4898 }
4899 std::stable_sort(Begin, Mid, [&LessThan](auto &V1, auto &V2) {
4900 return LessThan[std::get<1>(V1)][std::get<1>(V2)];
4901 });
4902 Begin = Mid;
4903 }
4904
4905 // Collect the final order of sorted indices
4906 for (auto Base : SortedBases)
4907 for (auto &T : Bases[std::get<0>(Base)])
4908 SortedIndices.push_back(std::get<2>(T));
4909
4910 assert(SortedIndices.size() == VL.size() &&
4911 "Expected SortedIndices to be the size of VL");
4912 return true;
4913}
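// Illustrative example: for VL = {A, A+1, B, B+1} (offsets in elements, A and
// B unrelated bases) the function above builds two bases,
// A -> {(A,0,0), (A+1,1,1)} and B -> {(B,0,2), (B+1,1,3)}, finds that each
// base is internally consecutive, and emits SortedIndices that keep the
// pointers of one base next to each other, e.g. {0, 1, 2, 3} or {2, 3, 0, 1}
// depending on how the two bases compare.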
4914
4915std::optional<BoUpSLP::OrdersType>
4916BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4917 assert(TE.isGather() && "Expected gather node only.");
4918 Type *ScalarTy = TE.Scalars[0]->getType();
4919
4920   SmallVector<Value *> Ptrs;
4921   Ptrs.reserve(TE.Scalars.size());
4922 for (Value *V : TE.Scalars) {
4923 auto *L = dyn_cast<LoadInst>(V);
4924 if (!L || !L->isSimple())
4925 return std::nullopt;
4926 Ptrs.push_back(L->getPointerOperand());
4927 }
4928
4929 BoUpSLP::OrdersType Order;
4930 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4931 return std::move(Order);
4932 return std::nullopt;
4933}
4934
4935/// Check if two insertelement instructions are from the same buildvector.
4936 static bool areTwoInsertFromSameBuildVector(
4937     InsertElementInst *VU, InsertElementInst *V,
4938     function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4939 // Instructions must be from the same basic blocks.
4940 if (VU->getParent() != V->getParent())
4941 return false;
4942 // Checks if 2 insertelements are from the same buildvector.
4943 if (VU->getType() != V->getType())
4944 return false;
4945 // Multiple used inserts are separate nodes.
4946 if (!VU->hasOneUse() && !V->hasOneUse())
4947 return false;
4948 auto *IE1 = VU;
4949 auto *IE2 = V;
4950 std::optional<unsigned> Idx1 = getElementIndex(IE1);
4951 std::optional<unsigned> Idx2 = getElementIndex(IE2);
4952 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4953 return false;
4954 // Go through the vector operand of insertelement instructions trying to find
4955 // either VU as the original vector for IE2 or V as the original vector for
4956 // IE1.
4957 SmallBitVector ReusedIdx(
4958 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4959 bool IsReusedIdx = false;
4960 do {
4961 if (IE2 == VU && !IE1)
4962 return VU->hasOneUse();
4963 if (IE1 == V && !IE2)
4964 return V->hasOneUse();
4965 if (IE1 && IE1 != V) {
4966 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
4967 IsReusedIdx |= ReusedIdx.test(Idx1);
4968 ReusedIdx.set(Idx1);
4969 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4970 IE1 = nullptr;
4971 else
4972 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4973 }
4974 if (IE2 && IE2 != VU) {
4975 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
4976 IsReusedIdx |= ReusedIdx.test(Idx2);
4977 ReusedIdx.set(Idx2);
4978 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4979 IE2 = nullptr;
4980 else
4981 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4982 }
4983 } while (!IsReusedIdx && (IE1 || IE2));
4984 return false;
4985}
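// Illustrative example: for the chain
//   %v0 = insertelement <4 x float> poison, float %a, i64 0
//   %v1 = insertelement <4 x float> %v0,    float %b, i64 1
// the helper above walks the vector operands via GetBaseOperand and reports
// that %v1 and %v0 belong to the same buildvector, while inserts that write
// the same lane twice or have extra uses are treated as separate nodes.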
4986
4987std::optional<BoUpSLP::OrdersType>
4988BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4989 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4990 if (TE.isNonPowOf2Vec())
4991 return std::nullopt;
4992
4993 // No need to reorder if need to shuffle reuses, still need to shuffle the
4994 // node.
4995 if (!TE.ReuseShuffleIndices.empty()) {
4996 if (isSplat(TE.Scalars))
4997 return std::nullopt;
4998 // Check if reuse shuffle indices can be improved by reordering.
4999     // For this, check that the reuse mask is "clustered", i.e. each scalar value
5000     // is used once in each submask of size <number_of_scalars>.
5001 // Example: 4 scalar values.
5002 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5003 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5004 // element 3 is used twice in the second submask.
5005 unsigned Sz = TE.Scalars.size();
5006 if (TE.isGather()) {
5007 if (std::optional<OrdersType> CurrentOrder =
5008               findReusedOrderedScalars(TE)) {
5009         SmallVector<int> Mask;
5010 fixupOrderingIndices(*CurrentOrder);
5011 inversePermutation(*CurrentOrder, Mask);
5012 ::addMask(Mask, TE.ReuseShuffleIndices);
5013 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5014 unsigned Sz = TE.Scalars.size();
5015 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5016 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5017 if (Idx != PoisonMaskElem)
5018 Res[Idx + K * Sz] = I + K * Sz;
5019 }
5020 return std::move(Res);
5021 }
5022 }
5023 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5024 TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5025 2 * TE.getVectorFactor())) == 1)
5026 return std::nullopt;
5027 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5028 Sz)) {
5029 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5030 if (TE.ReorderIndices.empty())
5031 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5032 else
5033 inversePermutation(TE.ReorderIndices, ReorderMask);
5034 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5035 unsigned VF = ReorderMask.size();
5036 OrdersType ResOrder(VF, VF);
5037 unsigned NumParts = divideCeil(VF, Sz);
5038 SmallBitVector UsedVals(NumParts);
5039 for (unsigned I = 0; I < VF; I += Sz) {
5040 int Val = PoisonMaskElem;
5041 unsigned UndefCnt = 0;
5042 unsigned Limit = std::min(Sz, VF - I);
5043 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5044 [&](int Idx) {
5045 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
5046 Val = Idx;
5047 if (Idx == PoisonMaskElem)
5048 ++UndefCnt;
5049 return Idx != PoisonMaskElem && Idx != Val;
5050 }) ||
5051 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
5052 UndefCnt > Sz / 2)
5053 return std::nullopt;
5054 UsedVals.set(Val);
5055 for (unsigned K = 0; K < NumParts; ++K)
5056 ResOrder[Val + Sz * K] = I + K;
5057 }
5058 return std::move(ResOrder);
5059 }
5060 unsigned VF = TE.getVectorFactor();
5061 // Try build correct order for extractelement instructions.
5062 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5063 TE.ReuseShuffleIndices.end());
5064 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
5065 all_of(TE.Scalars, [Sz](Value *V) {
5066 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5067 return Idx && *Idx < Sz;
5068 })) {
5069 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5070 if (TE.ReorderIndices.empty())
5071 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5072 else
5073 inversePermutation(TE.ReorderIndices, ReorderMask);
5074 for (unsigned I = 0; I < VF; ++I) {
5075 int &Idx = ReusedMask[I];
5076 if (Idx == PoisonMaskElem)
5077 continue;
5078 Value *V = TE.Scalars[ReorderMask[Idx]];
5079 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5080 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5081 }
5082 }
5083 // Build the order of the VF size, need to reorder reuses shuffles, they are
5084 // always of VF size.
5085 OrdersType ResOrder(VF);
5086 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5087 auto *It = ResOrder.begin();
5088 for (unsigned K = 0; K < VF; K += Sz) {
5089 OrdersType CurrentOrder(TE.ReorderIndices);
5090 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5091 if (SubMask.front() == PoisonMaskElem)
5092 std::iota(SubMask.begin(), SubMask.end(), 0);
5093 reorderOrder(CurrentOrder, SubMask);
5094 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5095 std::advance(It, Sz);
5096 }
5097 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5098 return Data.index() == Data.value();
5099 }))
5100 return std::nullopt; // No need to reorder.
5101 return std::move(ResOrder);
5102 }
5103 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5104 any_of(TE.UserTreeIndices,
5105 [](const EdgeInfo &EI) {
5106 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5107 }) &&
5108 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5109 return std::nullopt;
5110 if ((TE.State == TreeEntry::Vectorize ||
5111 TE.State == TreeEntry::StridedVectorize) &&
5112 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5113 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
5114 !TE.isAltShuffle())
5115 return TE.ReorderIndices;
5116 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5117 auto PHICompare = [&](unsigned I1, unsigned I2) {
5118 Value *V1 = TE.Scalars[I1];
5119 Value *V2 = TE.Scalars[I2];
5120 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5121 return false;
5122 if (V1->getNumUses() < V2->getNumUses())
5123 return true;
5124 if (V1->getNumUses() > V2->getNumUses())
5125 return false;
5126 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5127 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5128 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
5129 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
5130           if (!areTwoInsertFromSameBuildVector(
5131                   IE1, IE2,
5132 [](InsertElementInst *II) { return II->getOperand(0); }))
5133 return I1 < I2;
5134 return getElementIndex(IE1) < getElementIndex(IE2);
5135 }
5136 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
5137 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
5138 if (EE1->getOperand(0) != EE2->getOperand(0))
5139 return I1 < I2;
5140 return getElementIndex(EE1) < getElementIndex(EE2);
5141 }
5142 return I1 < I2;
5143 };
5144 auto IsIdentityOrder = [](const OrdersType &Order) {
5145 for (unsigned Idx : seq<unsigned>(0, Order.size()))
5146 if (Idx != Order[Idx])
5147 return false;
5148 return true;
5149 };
5150 if (!TE.ReorderIndices.empty())
5151 return TE.ReorderIndices;
5152     DenseMap<unsigned, unsigned> PhiToId;
5153     SmallVector<unsigned> Phis(TE.Scalars.size());
5154 std::iota(Phis.begin(), Phis.end(), 0);
5155 OrdersType ResOrder(TE.Scalars.size());
5156 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
5157 PhiToId[Id] = Id;
5158 stable_sort(Phis, PHICompare);
5159 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
5160 ResOrder[Id] = PhiToId[Phis[Id]];
5161 if (IsIdentityOrder(ResOrder))
5162 return std::nullopt; // No need to reorder.
5163 return std::move(ResOrder);
5164 }
5165 if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
5166 // TODO: add analysis of other gather nodes with extractelement
5167 // instructions and other values/instructions, not only undefs.
5168 if ((TE.getOpcode() == Instruction::ExtractElement ||
5169 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5170 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5171 all_of(TE.Scalars, [](Value *V) {
5172 auto *EE = dyn_cast<ExtractElementInst>(V);
5173 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5174 })) {
5175 // Check that gather of extractelements can be represented as
5176 // just a shuffle of a single vector.
5177 OrdersType CurrentOrder;
5178 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5179 /*ResizeAllowed=*/true);
5180 if (Reuse || !CurrentOrder.empty())
5181 return std::move(CurrentOrder);
5182 }
5183 // If the gather node is <undef, v, .., poison> and
5184 // insertelement poison, v, 0 [+ permute]
5185 // is cheaper than
5186 // insertelement poison, v, n - try to reorder.
5187 // If rotating the whole graph, exclude the permute cost, the whole graph
5188 // might be transformed.
5189 int Sz = TE.Scalars.size();
5190 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5191 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5192 const auto *It =
5193 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5194 if (It == TE.Scalars.begin())
5195 return OrdersType();
5196 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5197 if (It != TE.Scalars.end()) {
5198 OrdersType Order(Sz, Sz);
5199 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5200 Order[Idx] = 0;
5201 fixupOrderingIndices(Order);
5202 SmallVector<int> Mask;
5203 inversePermutation(Order, Mask);
5204 InstructionCost PermuteCost =
5205 TopToBottom
5206 ? 0
5207                 : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
5208         InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5209 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5210 PoisonValue::get(Ty), *It);
5211 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5212 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5213 PoisonValue::get(Ty), *It);
5214 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5215 OrdersType Order(Sz, Sz);
5216 Order[Idx] = 0;
5217 return std::move(Order);
5218 }
5219 }
5220 }
5221 if (isSplat(TE.Scalars))
5222 return std::nullopt;
5223 if (TE.Scalars.size() >= 4)
5224 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5225 return Order;
5226 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5227 return CurrentOrder;
5228 }
5229 return std::nullopt;
5230}
5231
5232/// Checks if the given mask is a "clustered" mask with the same clusters of
5233/// size \p Sz, which are not identity submasks.
5234 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5235                                                unsigned Sz) {
5236 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5237 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5238 return false;
5239 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5240 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5241 if (Cluster != FirstCluster)
5242 return false;
5243 }
5244 return true;
5245}
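// Example: with Sz == 2, the mask {1, 0, 1, 0} is a repeated non-identity
// cluster (every submask equals {1, 0}), whereas {0, 1, 0, 1} is rejected
// because its first cluster is an identity submask, and {1, 0, 0, 1} is
// rejected because the clusters differ.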
5246
5247void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5248 // Reorder reuses mask.
5249 reorderReuses(TE.ReuseShuffleIndices, Mask);
5250 const unsigned Sz = TE.Scalars.size();
5251   // For vectorized nodes and non-clustered reuses, no need to do anything else.
5252   if (!TE.isGather() ||
5253       !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5254                                                    Sz) ||
5255 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5256 return;
5257 SmallVector<int> NewMask;
5258 inversePermutation(TE.ReorderIndices, NewMask);
5259 addMask(NewMask, TE.ReuseShuffleIndices);
5260 // Clear reorder since it is going to be applied to the new mask.
5261 TE.ReorderIndices.clear();
5262 // Try to improve gathered nodes with clustered reuses, if possible.
5263 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5264 SmallVector<unsigned> NewOrder(Slice);
5265 inversePermutation(NewOrder, NewMask);
5266 reorderScalars(TE.Scalars, NewMask);
5267 // Fill the reuses mask with the identity submasks.
5268 for (auto *It = TE.ReuseShuffleIndices.begin(),
5269 *End = TE.ReuseShuffleIndices.end();
5270 It != End; std::advance(It, Sz))
5271 std::iota(It, std::next(It, Sz), 0);
5272}
5273
5274 static void combineOrders(MutableArrayRef<unsigned> Order,
5275                           ArrayRef<unsigned> SecondaryOrder) {
5276 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5277 "Expected same size of orders");
5278 unsigned Sz = Order.size();
5279 SmallBitVector UsedIndices(Sz);
5280 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5281 if (Order[Idx] != Sz)
5282 UsedIndices.set(Order[Idx]);
5283 }
5284 if (SecondaryOrder.empty()) {
5285 for (unsigned Idx : seq<unsigned>(0, Sz))
5286 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5287 Order[Idx] = Idx;
5288 } else {
5289 for (unsigned Idx : seq<unsigned>(0, Sz))
5290 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5291 !UsedIndices.test(SecondaryOrder[Idx]))
5292 Order[Idx] = SecondaryOrder[Idx];
5293 }
5294}
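// Example: for Order = {2, 4, 0, 4} with Sz == 4 (the value 4 marks an unset
// slot) and an empty SecondaryOrder, slots 1 and 3 hold unset values and the
// indices 1 and 3 are not used elsewhere, so they are filled in place and the
// result is {2, 1, 0, 3}. With a SecondaryOrder, an unset slot is instead
// taken from SecondaryOrder when that value is still unused.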
5295
5296 void BoUpSLP::reorderTopToBottom() {
5297   // Maps VF to the graph nodes.
5298   DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5299   // ExtractElement gather nodes which can be vectorized and need to handle
5300 // their ordering.
5301   DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5302
5303 // Phi nodes can have preferred ordering based on their result users
5304   DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5305
5306 // AltShuffles can also have a preferred ordering that leads to fewer
5307 // instructions, e.g., the addsub instruction in x86.
5308 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5309
5310 // Maps a TreeEntry to the reorder indices of external users.
5311   DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5312       ExternalUserReorderMap;
5313 // Find all reorderable nodes with the given VF.
5314   // Currently these are vectorized stores, loads, extracts + some gathering of
5315 // extracts.
5316 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5317 const std::unique_ptr<TreeEntry> &TE) {
5318 // Look for external users that will probably be vectorized.
5319 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5320 findExternalStoreUsersReorderIndices(TE.get());
5321 if (!ExternalUserReorderIndices.empty()) {
5322 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5323 ExternalUserReorderMap.try_emplace(TE.get(),
5324 std::move(ExternalUserReorderIndices));
5325 }
5326
5327 // Patterns like [fadd,fsub] can be combined into a single instruction in
5328 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5329 // to take into account their order when looking for the most used order.
5330 if (TE->isAltShuffle()) {
5331 VectorType *VecTy =
5332 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5333 unsigned Opcode0 = TE->getOpcode();
5334 unsigned Opcode1 = TE->getAltOpcode();
5335 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5336 // If this pattern is supported by the target then we consider the order.
5337 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5338 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5339 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5340 }
5341 // TODO: Check the reverse order too.
5342 }
5343
5344 if (std::optional<OrdersType> CurrentOrder =
5345 getReorderingData(*TE, /*TopToBottom=*/true)) {
5346 // Do not include ordering for nodes used in the alt opcode vectorization,
5347     // better to reorder them during the bottom-to-top stage. If we follow the
5348     // order here, it causes reordering of the whole graph, though actually it is
5349 // profitable just to reorder the subgraph that starts from the alternate
5350 // opcode vectorization node. Such nodes already end-up with the shuffle
5351 // instruction and it is just enough to change this shuffle rather than
5352 // rotate the scalars for the whole graph.
5353 unsigned Cnt = 0;
5354 const TreeEntry *UserTE = TE.get();
5355 while (UserTE && Cnt < RecursionMaxDepth) {
5356 if (UserTE->UserTreeIndices.size() != 1)
5357 break;
5358 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5359 return EI.UserTE->State == TreeEntry::Vectorize &&
5360 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5361 }))
5362 return;
5363 UserTE = UserTE->UserTreeIndices.back().UserTE;
5364 ++Cnt;
5365 }
5366 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5367 if (!(TE->State == TreeEntry::Vectorize ||
5368 TE->State == TreeEntry::StridedVectorize) ||
5369 !TE->ReuseShuffleIndices.empty())
5370 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5371 if (TE->State == TreeEntry::Vectorize &&
5372 TE->getOpcode() == Instruction::PHI)
5373 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5374 }
5375 });
5376
5377 // Reorder the graph nodes according to their vectorization factor.
5378 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5379 VF /= 2) {
5380 auto It = VFToOrderedEntries.find(VF);
5381 if (It == VFToOrderedEntries.end())
5382 continue;
5383     // Try to find the most profitable order. We are just looking for the most
5384 // used order and reorder scalar elements in the nodes according to this
5385 // mostly used order.
5386 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5387 // All operands are reordered and used only in this node - propagate the
5388 // most used order to the user node.
5389     MapVector<OrdersType, unsigned,
5390               DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5391         OrdersUses;
5393 for (const TreeEntry *OpTE : OrderedEntries) {
5394       // No need to reorder these nodes; still need to extend and to use shuffle,
5395 // just need to merge reordering shuffle and the reuse shuffle.
5396 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5397 continue;
5398 // Count number of orders uses.
5399 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5400 &PhisToOrders]() -> const OrdersType & {
5401 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5402 auto It = GathersToOrders.find(OpTE);
5403 if (It != GathersToOrders.end())
5404 return It->second;
5405 }
5406 if (OpTE->isAltShuffle()) {
5407 auto It = AltShufflesToOrders.find(OpTE);
5408 if (It != AltShufflesToOrders.end())
5409 return It->second;
5410 }
5411 if (OpTE->State == TreeEntry::Vectorize &&
5412 OpTE->getOpcode() == Instruction::PHI) {
5413 auto It = PhisToOrders.find(OpTE);
5414 if (It != PhisToOrders.end())
5415 return It->second;
5416 }
5417 return OpTE->ReorderIndices;
5418 }();
5419 // First consider the order of the external scalar users.
5420 auto It = ExternalUserReorderMap.find(OpTE);
5421 if (It != ExternalUserReorderMap.end()) {
5422 const auto &ExternalUserReorderIndices = It->second;
5423 // If the OpTE vector factor != number of scalars - use natural order,
5424 // it is an attempt to reorder node with reused scalars but with
5425 // external uses.
5426 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5427 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5428 ExternalUserReorderIndices.size();
5429 } else {
5430 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5431 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5432 }
5433 // No other useful reorder data in this entry.
5434 if (Order.empty())
5435 continue;
5436 }
5437 // Stores actually store the mask, not the order, need to invert.
5438 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5439 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5440 SmallVector<int> Mask;
5441 inversePermutation(Order, Mask);
5442 unsigned E = Order.size();
5443 OrdersType CurrentOrder(E, E);
5444 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5445 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5446 });
5447 fixupOrderingIndices(CurrentOrder);
5448 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5449 } else {
5450 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5451 }
5452 }
5453 if (OrdersUses.empty())
5454 continue;
5455 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5456 const unsigned Sz = Order.size();
5457 for (unsigned Idx : seq<unsigned>(0, Sz))
5458 if (Idx != Order[Idx] && Order[Idx] != Sz)
5459 return false;
5460 return true;
5461 };
5462 // Choose the most used order.
5463 unsigned IdentityCnt = 0;
5464 unsigned FilledIdentityCnt = 0;
5465 OrdersType IdentityOrder(VF, VF);
5466 for (auto &Pair : OrdersUses) {
5467 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5468 if (!Pair.first.empty())
5469 FilledIdentityCnt += Pair.second;
5470 IdentityCnt += Pair.second;
5471 combineOrders(IdentityOrder, Pair.first);
5472 }
5473 }
5474 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5475 unsigned Cnt = IdentityCnt;
5476 for (auto &Pair : OrdersUses) {
5477 // Prefer identity order. But, if filled identity found (non-empty order)
5478 // with same number of uses, as the new candidate order, we can choose
5479 // this candidate order.
5480 if (Cnt < Pair.second ||
5481 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5482 Cnt == Pair.second && !BestOrder.empty() &&
5483 IsIdentityOrder(BestOrder))) {
5484 combineOrders(Pair.first, BestOrder);
5485 BestOrder = Pair.first;
5486 Cnt = Pair.second;
5487 } else {
5488 combineOrders(BestOrder, Pair.first);
5489 }
5490 }
5491 // Set order of the user node.
5492 if (IsIdentityOrder(BestOrder))
5493 continue;
5494 fixupOrderingIndices(BestOrder);
5495 SmallVector<int> Mask;
5496 inversePermutation(BestOrder, Mask);
5497 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5498 unsigned E = BestOrder.size();
5499 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5500 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5501 });
5502 // Do an actual reordering, if profitable.
5503 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5504 // Just do the reordering for the nodes with the given VF.
5505 if (TE->Scalars.size() != VF) {
5506 if (TE->ReuseShuffleIndices.size() == VF) {
5507 // Need to reorder the reuses masks of the operands with smaller VF to
5508 // be able to find the match between the graph nodes and scalar
5509 // operands of the given node during vectorization/cost estimation.
5510 assert(all_of(TE->UserTreeIndices,
5511 [VF, &TE](const EdgeInfo &EI) {
5512 return EI.UserTE->Scalars.size() == VF ||
5513 EI.UserTE->Scalars.size() ==
5514 TE->Scalars.size();
5515 }) &&
5516 "All users must be of VF size.");
5517 // Update ordering of the operands with the smaller VF than the given
5518 // one.
5519 reorderNodeWithReuses(*TE, Mask);
5520 }
5521 continue;
5522 }
5523 if ((TE->State == TreeEntry::Vectorize ||
5524 TE->State == TreeEntry::StridedVectorize) &&
5525           isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
5526               InsertElementInst>(TE->getMainOp()) &&
5527 !TE->isAltShuffle()) {
5528 // Build correct orders for extract{element,value}, loads and
5529 // stores.
5530 reorderOrder(TE->ReorderIndices, Mask);
5531 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5532 TE->reorderOperands(Mask);
5533 } else {
5534 // Reorder the node and its operands.
5535 TE->reorderOperands(Mask);
5536 assert(TE->ReorderIndices.empty() &&
5537 "Expected empty reorder sequence.");
5538 reorderScalars(TE->Scalars, Mask);
5539 }
5540 if (!TE->ReuseShuffleIndices.empty()) {
5541 // Apply reversed order to keep the original ordering of the reused
5542 // elements to avoid extra reorder indices shuffling.
5543 OrdersType CurrentOrder;
5544 reorderOrder(CurrentOrder, MaskOrder);
5545 SmallVector<int> NewReuses;
5546 inversePermutation(CurrentOrder, NewReuses);
5547 addMask(NewReuses, TE->ReuseShuffleIndices);
5548 TE->ReuseShuffleIndices.swap(NewReuses);
5549 }
5550 }
5551 }
5552}
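// reorderTopToBottom above works per vectorization factor, from the widest
// nodes down: it counts how often each candidate order is requested (external
// store users, alternate-opcode shuffles, PHI result users, plain reorder
// indices), prefers the identity order on ties, and then applies the single
// most used order to every node of that VF, fixing up reuse masks for
// operands with a smaller VF.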
5553
5554bool BoUpSLP::canReorderOperands(
5555 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5556 ArrayRef<TreeEntry *> ReorderableGathers,
5557 SmallVectorImpl<TreeEntry *> &GatherOps) {
5558 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5559 if (UserTE->isNonPowOf2Vec())
5560 return false;
5561
5562 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5563 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5564 return OpData.first == I &&
5565 (OpData.second->State == TreeEntry::Vectorize ||
5566 OpData.second->State == TreeEntry::StridedVectorize);
5567 }))
5568 continue;
5569 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5570 // Do not reorder if operand node is used by many user nodes.
5571 if (any_of(TE->UserTreeIndices,
5572 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5573 return false;
5574 // Add the node to the list of the ordered nodes with the identity
5575 // order.
5576 Edges.emplace_back(I, TE);
5577 // Add ScatterVectorize nodes to the list of operands, where just
5578 // reordering of the scalars is required. Similar to the gathers, so
5579 // simply add to the list of gathered ops.
5580 // If there are reused scalars, process this node as a regular vectorize
5581 // node, just reorder reuses mask.
5582 if (TE->State != TreeEntry::Vectorize &&
5583 TE->State != TreeEntry::StridedVectorize &&
5584 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5585 GatherOps.push_back(TE);
5586 continue;
5587 }
5588 TreeEntry *Gather = nullptr;
5589 if (count_if(ReorderableGathers,
5590 [&Gather, UserTE, I](TreeEntry *TE) {
5591 assert(TE->State != TreeEntry::Vectorize &&
5592 TE->State != TreeEntry::StridedVectorize &&
5593 "Only non-vectorized nodes are expected.");
5594 if (any_of(TE->UserTreeIndices,
5595 [UserTE, I](const EdgeInfo &EI) {
5596 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5597 })) {
5598 assert(TE->isSame(UserTE->getOperand(I)) &&
5599 "Operand entry does not match operands.");
5600 Gather = TE;
5601 return true;
5602 }
5603 return false;
5604 }) > 1 &&
5605 !allConstant(UserTE->getOperand(I)))
5606 return false;
5607 if (Gather)
5608 GatherOps.push_back(Gather);
5609 }
5610 return true;
5611}
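// In short, canReorderOperands succeeds only when every vectorized operand of
// UserTE is used by UserTE alone, and it gives up when more than one
// non-vectorized entry matches the same operand slot (unless that operand is
// entirely constant). The GatherOps it collects are the gather operands whose
// scalars must be permuted together with the user node.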
5612
5613void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5614 SetVector<TreeEntry *> OrderedEntries;
5615 DenseSet<const TreeEntry *> GathersToOrders;
5616 // Find all reorderable leaf nodes with the given VF.
5617   // Currently these are vectorized loads, extracts without alternate operands +
5618 // some gathering of extracts.
5619 SmallVector<TreeEntry *> NonVectorized;
5620 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5621 if (TE->State != TreeEntry::Vectorize &&
5622 TE->State != TreeEntry::StridedVectorize)
5623 NonVectorized.push_back(TE.get());
5624 if (std::optional<OrdersType> CurrentOrder =
5625 getReorderingData(*TE, /*TopToBottom=*/false)) {
5626 OrderedEntries.insert(TE.get());
5627 if (!(TE->State == TreeEntry::Vectorize ||
5628 TE->State == TreeEntry::StridedVectorize) ||
5629 !TE->ReuseShuffleIndices.empty())
5630 GathersToOrders.insert(TE.get());
5631 }
5632 }
5633
5634 // 1. Propagate order to the graph nodes, which use only reordered nodes.
5635 // I.e., if the node has operands, that are reordered, try to make at least
5636 // one operand order in the natural order and reorder others + reorder the
5637 // user node itself.
5638   SmallPtrSet<const TreeEntry *, 4> Visited;
5639   while (!OrderedEntries.empty()) {
5640 // 1. Filter out only reordered nodes.
5641 // 2. If the entry has multiple uses - skip it and jump to the next node.
5642     DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5643     SmallVector<TreeEntry *> Filtered;
5644 for (TreeEntry *TE : OrderedEntries) {
5645 if (!(TE->State == TreeEntry::Vectorize ||
5646 TE->State == TreeEntry::StridedVectorize ||
5647 (TE->isGather() && GathersToOrders.contains(TE))) ||
5648 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5649 !all_of(drop_begin(TE->UserTreeIndices),
5650 [TE](const EdgeInfo &EI) {
5651 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5652 }) ||
5653 !Visited.insert(TE).second) {
5654 Filtered.push_back(TE);
5655 continue;
5656 }
5657 // Build a map between user nodes and their operands order to speedup
5658 // search. The graph currently does not provide this dependency directly.
5659 for (EdgeInfo &EI : TE->UserTreeIndices) {
5660 TreeEntry *UserTE = EI.UserTE;
5661 auto It = Users.find(UserTE);
5662 if (It == Users.end())
5663 It = Users.insert({UserTE, {}}).first;
5664 It->second.emplace_back(EI.EdgeIdx, TE);
5665 }
5666 }
5667 // Erase filtered entries.
5668 for (TreeEntry *TE : Filtered)
5669 OrderedEntries.remove(TE);
5670     SmallVector<
5671         std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5672 UsersVec(Users.begin(), Users.end());
5673 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5674 return Data1.first->Idx > Data2.first->Idx;
5675 });
5676 for (auto &Data : UsersVec) {
5677 // Check that operands are used only in the User node.
5678 SmallVector<TreeEntry *> GatherOps;
5679 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5680 GatherOps)) {
5681 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5682 OrderedEntries.remove(Op.second);
5683 continue;
5684 }
5685 // All operands are reordered and used only in this node - propagate the
5686 // most used order to the user node.
5687       MapVector<OrdersType, unsigned,
5688                 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5689           OrdersUses;
5690 // Do the analysis for each tree entry only once, otherwise the order of
5691       // the same node may be considered several times, though it might not be
5692       // profitable.
5693       SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5694       SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
5695       for (const auto &Op : Data.second) {
5696 TreeEntry *OpTE = Op.second;
5697 if (!VisitedOps.insert(OpTE).second)
5698 continue;
5699 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5700 continue;
5701 const auto Order = [&]() -> const OrdersType {
5702 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
5703 return getReorderingData(*OpTE, /*TopToBottom=*/false)
5704 .value_or(OrdersType(1));
5705 return OpTE->ReorderIndices;
5706 }();
5707 // The order is partially ordered, skip it in favor of fully non-ordered
5708 // orders.
5709 if (Order.size() == 1)
5710 continue;
5711 unsigned NumOps = count_if(
5712 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5713 return P.second == OpTE;
5714 });
5715 // Stores actually store the mask, not the order, need to invert.
5716 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5717 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5718 SmallVector<int> Mask;
5719 inversePermutation(Order, Mask);
5720 unsigned E = Order.size();
5721 OrdersType CurrentOrder(E, E);
5722 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5723 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5724 });
5725 fixupOrderingIndices(CurrentOrder);
5726 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5727 NumOps;
5728 } else {
5729 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5730 }
5731 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5732 const auto AllowsReordering = [&](const TreeEntry *TE) {
5733 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5734 if (TE->isNonPowOf2Vec())
5735 return false;
5736 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5737 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5738 (IgnoreReorder && TE->Idx == 0))
5739 return true;
5740 if (TE->isGather()) {
5741 if (GathersToOrders.contains(TE))
5742 return !getReorderingData(*TE, /*TopToBottom=*/false)
5743 .value_or(OrdersType(1))
5744 .empty();
5745 return true;
5746 }
5747 return false;
5748 };
5749 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5750 TreeEntry *UserTE = EI.UserTE;
5751 if (!VisitedUsers.insert(UserTE).second)
5752 continue;
5753 // May reorder user node if it requires reordering, has reused
5754 // scalars, is an alternate op vectorize node or its op nodes require
5755 // reordering.
5756 if (AllowsReordering(UserTE))
5757 continue;
5758 // Check if users allow reordering.
5759 // Currently look up just 1 level of operands to avoid increase of
5760 // the compile time.
5761 // Profitable to reorder if definitely more operands allow
5762 // reordering rather than those with natural order.
5763           ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
5764           if (static_cast<unsigned>(count_if(
5765 Ops, [UserTE, &AllowsReordering](
5766 const std::pair<unsigned, TreeEntry *> &Op) {
5767 return AllowsReordering(Op.second) &&
5768 all_of(Op.second->UserTreeIndices,
5769 [UserTE](const EdgeInfo &EI) {
5770 return EI.UserTE == UserTE;
5771 });
5772 })) <= Ops.size() / 2)
5773 ++Res.first->second;
5774 }
5775 }
5776 if (OrdersUses.empty()) {
5777 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5778 OrderedEntries.remove(Op.second);
5779 continue;
5780 }
5781 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5782 const unsigned Sz = Order.size();
5783 for (unsigned Idx : seq<unsigned>(0, Sz))
5784 if (Idx != Order[Idx] && Order[Idx] != Sz)
5785 return false;
5786 return true;
5787 };
5788 // Choose the most used order.
5789 unsigned IdentityCnt = 0;
5790 unsigned VF = Data.second.front().second->getVectorFactor();
5791 OrdersType IdentityOrder(VF, VF);
5792 for (auto &Pair : OrdersUses) {
5793 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5794 IdentityCnt += Pair.second;
5795 combineOrders(IdentityOrder, Pair.first);
5796 }
5797 }
5798 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5799 unsigned Cnt = IdentityCnt;
5800 for (auto &Pair : OrdersUses) {
5801 // Prefer identity order. But, if filled identity found (non-empty
5802 // order) with same number of uses, as the new candidate order, we can
5803 // choose this candidate order.
5804 if (Cnt < Pair.second) {
5805 combineOrders(Pair.first, BestOrder);
5806 BestOrder = Pair.first;
5807 Cnt = Pair.second;
5808 } else {
5809 combineOrders(BestOrder, Pair.first);
5810 }
5811 }
5812 // Set order of the user node.
5813 if (IsIdentityOrder(BestOrder)) {
5814 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5815 OrderedEntries.remove(Op.second);
5816 continue;
5817 }
5818 fixupOrderingIndices(BestOrder);
5819 // Erase operands from OrderedEntries list and adjust their orders.
5820 VisitedOps.clear();
5821 SmallVector<int> Mask;
5822 inversePermutation(BestOrder, Mask);
5823 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5824 unsigned E = BestOrder.size();
5825 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5826 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5827 });
5828 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5829 TreeEntry *TE = Op.second;
5830 OrderedEntries.remove(TE);
5831 if (!VisitedOps.insert(TE).second)
5832 continue;
5833 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5834 reorderNodeWithReuses(*TE, Mask);
5835 continue;
5836 }
5837 // Gathers are processed separately.
5838 if (TE->State != TreeEntry::Vectorize &&
5839 TE->State != TreeEntry::StridedVectorize &&
5840 (TE->State != TreeEntry::ScatterVectorize ||
5841 TE->ReorderIndices.empty()))
5842 continue;
5843 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5844 TE->ReorderIndices.empty()) &&
5845 "Non-matching sizes of user/operand entries.");
5846 reorderOrder(TE->ReorderIndices, Mask);
5847 if (IgnoreReorder && TE == VectorizableTree.front().get())
5848 IgnoreReorder = false;
5849 }
5850 // For gathers just need to reorder its scalars.
5851 for (TreeEntry *Gather : GatherOps) {
5852 assert(Gather->ReorderIndices.empty() &&
5853 "Unexpected reordering of gathers.");
5854 if (!Gather->ReuseShuffleIndices.empty()) {
5855 // Just reorder reuses indices.
5856 reorderReuses(Gather->ReuseShuffleIndices, Mask);
5857 continue;
5858 }
5859 reorderScalars(Gather->Scalars, Mask);
5860 OrderedEntries.remove(Gather);
5861 }
5862 // Reorder operands of the user node and set the ordering for the user
5863 // node itself.
5864 if (Data.first->State != TreeEntry::Vectorize ||
5865 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5866 Data.first->getMainOp()) ||
5867 Data.first->isAltShuffle())
5868 Data.first->reorderOperands(Mask);
5869 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5870 Data.first->isAltShuffle() ||
5871 Data.first->State == TreeEntry::StridedVectorize) {
5872 reorderScalars(Data.first->Scalars, Mask);
5873 reorderOrder(Data.first->ReorderIndices, MaskOrder,
5874 /*BottomOrder=*/true);
5875 if (Data.first->ReuseShuffleIndices.empty() &&
5876 !Data.first->ReorderIndices.empty() &&
5877 !Data.first->isAltShuffle()) {
5878 // Insert user node to the list to try to sink reordering deeper in
5879 // the graph.
5880 OrderedEntries.insert(Data.first);
5881 }
5882 } else {
5883 reorderOrder(Data.first->ReorderIndices, Mask);
5884 }
5885 }
5886 }
5887 // If the reordering is unnecessary, just remove the reorder.
5888 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5889 VectorizableTree.front()->ReuseShuffleIndices.empty())
5890 VectorizableTree.front()->ReorderIndices.clear();
5891}
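// reorderBottomToTop above walks users of already-ordered leaves: for every
// user whose operands can all be reordered it again counts candidate orders,
// prefers identity, and then either sinks the chosen order into the user node
// (re-inserting it into the worklist so the reordering keeps moving up the
// graph) or records it as the user's own ReorderIndices.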
5892
5893 void BoUpSLP::buildExternalUses(
5894     const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5895 DenseMap<Value *, unsigned> ScalarToExtUses;
5896 // Collect the values that we need to extract from the tree.
5897 for (auto &TEPtr : VectorizableTree) {
5898 TreeEntry *Entry = TEPtr.get();
5899
5900 // No need to handle users of gathered values.
5901 if (Entry->isGather())
5902 continue;
5903
5904 // For each lane:
5905 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5906 Value *Scalar = Entry->Scalars[Lane];
5907 if (!isa<Instruction>(Scalar))
5908 continue;
5909 // All uses must be replaced already? No need to do it again.
5910 auto It = ScalarToExtUses.find(Scalar);
5911 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5912 continue;
5913
5914 // Check if the scalar is externally used as an extra arg.
5915 const auto *ExtI = ExternallyUsedValues.find(Scalar);
5916 if (ExtI != ExternallyUsedValues.end()) {
5917 int FoundLane = Entry->findLaneForValue(Scalar);
5918 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5919 << FoundLane << " from " << *Scalar << ".\n");
5920 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
5921 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5922 continue;
5923 }
5924 for (User *U : Scalar->users()) {
5925 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5926
5927 Instruction *UserInst = dyn_cast<Instruction>(U);
5928 if (!UserInst || isDeleted(UserInst))
5929 continue;
5930
5931 // Ignore users in the user ignore list.
5932 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5933 continue;
5934
5935 // Skip in-tree scalars that become vectors
5936 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5937 // Some in-tree scalars will remain as scalar in vectorized
5938 // instructions. If that is the case, the one in FoundLane will
5939 // be used.
5940 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5941               !doesInTreeUserNeedToExtract(
5942                   Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5943 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5944 << ".\n");
5945 assert(!UseEntry->isGather() && "Bad state");
5946 continue;
5947 }
5948 U = nullptr;
5949 if (It != ScalarToExtUses.end()) {
5950 ExternalUses[It->second].User = nullptr;
5951 break;
5952 }
5953 }
5954
5955 if (U && Scalar->hasNUsesOrMore(UsesLimit))
5956 U = nullptr;
5957 int FoundLane = Entry->findLaneForValue(Scalar);
5958 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5959 << " from lane " << FoundLane << " from " << *Scalar
5960 << ".\n");
5961 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
5962 ExternalUses.emplace_back(Scalar, U, FoundLane);
5963 if (!U)
5964 break;
5965 }
5966 }
5967 }
5968}
5969
5970 DenseMap<Value *, SmallVector<StoreInst *>>
5971 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5972   DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5973   for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5974 Value *V = TE->Scalars[Lane];
5975 // To save compilation time we don't visit if we have too many users.
5976 if (V->hasNUsesOrMore(UsesLimit))
5977 break;
5978
5979 // Collect stores per pointer object.
5980 for (User *U : V->users()) {
5981 auto *SI = dyn_cast<StoreInst>(U);
5982 if (SI == nullptr || !SI->isSimple() ||
5983 !isValidElementType(SI->getValueOperand()->getType()))
5984 continue;
5985       // Skip the entry if it is already part of the vectorization tree.
5986 if (getTreeEntry(U))
5987 continue;
5988
5989 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5990 auto &StoresVec = PtrToStoresMap[Ptr];
5991 // For now just keep one store per pointer object per lane.
5992 // TODO: Extend this to support multiple stores per pointer per lane
5993 if (StoresVec.size() > Lane)
5994 continue;
5995 // Skip if in different BBs.
5996 if (!StoresVec.empty() &&
5997 SI->getParent() != StoresVec.back()->getParent())
5998 continue;
5999 // Make sure that the stores are of the same type.
6000 if (!StoresVec.empty() &&
6001 SI->getValueOperand()->getType() !=
6002 StoresVec.back()->getValueOperand()->getType())
6003 continue;
6004 StoresVec.push_back(SI);
6005 }
6006 }
6007 return PtrToStoresMap;
6008}
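// Example of what collectUserStores produces: if every lane of the node feeds
// a simple store into the same underlying object %arr (illustrative name),
// the result maps %arr to one StoreInst per lane, in lane order. Stores into
// other underlying objects go into their own map entries, while non-simple
// stores, stores in a different block, or stores of a different value type
// than the first one collected are skipped.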
6009
6010bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6011 OrdersType &ReorderIndices) const {
6012   // We check whether the stores in StoresVec can form a vector by sorting them
6013 // and checking whether they are consecutive.
6014
6015 // To avoid calling getPointersDiff() while sorting we create a vector of
6016 // pairs {store, offset from first} and sort this instead.
6017 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
6018 StoreInst *S0 = StoresVec[0];
6019 StoreOffsetVec[0] = {S0, 0};
6020 Type *S0Ty = S0->getValueOperand()->getType();
6021 Value *S0Ptr = S0->getPointerOperand();
6022 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6023 StoreInst *SI = StoresVec[Idx];
6024 std::optional<int> Diff =
6025 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
6026 SI->getPointerOperand(), *DL, *SE,
6027 /*StrictCheck=*/true);
6028 // We failed to compare the pointers so just abandon this StoresVec.
6029 if (!Diff)
6030 return false;
6031 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
6032 }
6033
6034 // Sort the vector based on the pointers. We create a copy because we may
6035 // need the original later for calculating the reorder (shuffle) indices.
6036 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
6037 const std::pair<StoreInst *, int> &Pair2) {
6038 int Offset1 = Pair1.second;
6039 int Offset2 = Pair2.second;
6040 return Offset1 < Offset2;
6041 });
6042
6043 // Check if the stores are consecutive by checking if their difference is 1.
6044 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
6045 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
6046 return false;
6047
6048 // Calculate the shuffle indices according to their offset against the sorted
6049 // StoreOffsetVec.
6050 ReorderIndices.reserve(StoresVec.size());
6051 for (StoreInst *SI : StoresVec) {
6052 unsigned Idx = find_if(StoreOffsetVec,
6053 [SI](const std::pair<StoreInst *, int> &Pair) {
6054 return Pair.first == SI;
6055 }) -
6056 StoreOffsetVec.begin();
6057 ReorderIndices.push_back(Idx);
6058 }
6059 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6060 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6061 // same convention here.
6062 auto IsIdentityOrder = [](const OrdersType &Order) {
6063 for (unsigned Idx : seq<unsigned>(0, Order.size()))
6064 if (Idx != Order[Idx])
6065 return false;
6066 return true;
6067 };
6068 if (IsIdentityOrder(ReorderIndices))
6069 ReorderIndices.clear();
6070
6071 return true;
6072}
6073
6074#ifndef NDEBUG
6075static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6076 for (unsigned Idx : Order)
6077 dbgs() << Idx << ", ";
6078 dbgs() << "\n";
6079}
6080#endif
6081
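/// Inspects the user stores collected for \p TE and, for every group that
/// covers all lanes and is consecutive in memory, records the corresponding
/// reorder indices. The caller may use these orders to reorder the tree entry
/// so that the external stores become vectorizable.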
6082SmallVector<BoUpSLP::OrdersType, 1>
6083BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6084 unsigned NumLanes = TE->Scalars.size();
6085
6086 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
6087 collectUserStores(TE);
6088
6089 // Holds the reorder indices for each candidate store vector that is a user of
6090 // the current TreeEntry.
6091 SmallVector<OrdersType, 1> ExternalReorderIndices;
6092
6093 // Now inspect the stores collected per pointer and look for vectorization
6094 // candidates. For each candidate calculate the reorder index vector and push
6095 // it into `ExternalReorderIndices`
6096 for (const auto &Pair : PtrToStoresMap) {
6097 auto &StoresVec = Pair.second;
6098 // If we have fewer than NumLanes stores, then we can't form a vector.
6099 if (StoresVec.size() != NumLanes)
6100 continue;
6101
6102 // If the stores are not consecutive then abandon this StoresVec.
6103 OrdersType ReorderIndices;
6104 if (!canFormVector(StoresVec, ReorderIndices))
6105 continue;
6106
6107 // We now know that the scalars in StoresVec can form a vector instruction,
6108 // so set the reorder indices.
6109 ExternalReorderIndices.push_back(ReorderIndices);
6110 }
6111 return ExternalReorderIndices;
6112}
6113
6114void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6115 const SmallDenseSet<Value *> &UserIgnoreLst) {
6116 deleteTree();
6117 UserIgnoreList = &UserIgnoreLst;
6118 if (!allSameType(Roots))
6119 return;
6120 buildTree_rec(Roots, 0, EdgeInfo());
6121}
6122
6123void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6124 deleteTree();
6125 if (!allSameType(Roots))
6126 return;
6127 buildTree_rec(Roots, 0, EdgeInfo());
6128}
6129
6130/// \return true if the specified list of values has only one instruction that
6131/// requires scheduling, false otherwise.
6132#ifndef NDEBUG
6133static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
6134 Value *NeedsScheduling = nullptr;
6135 for (Value *V : VL) {
6136 if (doesNotNeedToBeScheduled(V))
6137 continue;
6138 if (!NeedsScheduling) {
6139 NeedsScheduling = V;
6140 continue;
6141 }
6142 return false;
6143 }
6144 return NeedsScheduling;
6145}
6146#endif
6147
6148/// Generates a key/subkey pair for the given value to provide effective
6149/// sorting of the values and better detection of vectorizable value
6150/// sequences. The keys can be used for sorting the values themselves, and the
6151/// subkeys for sorting within subgroups of values.
6152static std::pair<size_t, size_t> generateKeySubkey(
6153 Value *V, const TargetLibraryInfo *TLI,
6154 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
6155 bool AllowAlternate) {
6156 hash_code Key = hash_value(V->getValueID() + 2);
6157 hash_code SubKey = hash_value(0);
6158 // Sort the loads by the distance between the pointers.
6159 if (auto *LI = dyn_cast<LoadInst>(V)) {
6160 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
6161 if (LI->isSimple())
6162 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
6163 else
6164 Key = SubKey = hash_value(LI);
6165 } else if (isVectorLikeInstWithConstOps(V)) {
6166 // Sort extracts by the vector operands.
6167 if (isa<ExtractElementInst, UndefValue>(V))
6168 Key = hash_value(Value::UndefValueVal + 1);
6169 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
6170 if (!isUndefVector(EI->getVectorOperand()).all() &&
6171 !isa<UndefValue>(EI->getIndexOperand()))
6172 SubKey = hash_value(EI->getVectorOperand());
6173 }
6174 } else if (auto *I = dyn_cast<Instruction>(V)) {
6175 // Sort other instructions just by the opcodes except for CMPInst.
6176 // For CMP also sort by the predicate kind.
6177 if ((isa<BinaryOperator, CastInst>(I)) &&
6178 isValidForAlternation(I->getOpcode())) {
6179 if (AllowAlternate)
6180 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
6181 else
6182 Key = hash_combine(hash_value(I->getOpcode()), Key);
6183 SubKey = hash_combine(
6184 hash_value(I->getOpcode()), hash_value(I->getType()),
6185 hash_value(isa<BinaryOperator>(I)
6186 ? I->getType()
6187 : cast<CastInst>(I)->getOperand(0)->getType()));
6188 // For casts, look through the only operand to improve compile time.
6189 if (isa<CastInst>(I)) {
6190 std::pair<size_t, size_t> OpVals =
6191 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
6192 /*AllowAlternate=*/true);
6193 Key = hash_combine(OpVals.first, Key);
6194 SubKey = hash_combine(OpVals.first, SubKey);
6195 }
6196 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
6197 CmpInst::Predicate Pred = CI->getPredicate();
6198 if (CI->isCommutative())
6199 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
6200 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
6201 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
6202 hash_value(SwapPred),
6203 hash_value(CI->getOperand(0)->getType()));
6204 } else if (auto *Call = dyn_cast<CallInst>(I)) {
6205 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
6206 if (isTriviallyVectorizable(ID)) {
6207 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
6208 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
6209 SubKey = hash_combine(hash_value(I->getOpcode()),
6210 hash_value(Call->getCalledFunction()));
6211 } else {
6212 Key = hash_combine(hash_value(Call), Key);
6213 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
6214 }
6215 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
6216 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
6217 hash_value(Op.Tag), SubKey);
6218 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
6219 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
6220 SubKey = hash_value(Gep->getPointerOperand());
6221 else
6222 SubKey = hash_value(Gep);
6223 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
6224 !isa<ConstantInt>(I->getOperand(1))) {
6225 // Do not try to vectorize instructions with potentially high cost.
6226 SubKey = hash_value(I);
6227 } else {
6228 SubKey = hash_value(I->getOpcode());
6229 }
6230 Key = hash_combine(hash_value(I->getParent()), Key);
6231 }
6232 return std::make_pair(Key, SubKey);
6233}
6234
6235/// Checks if the specified instruction \p I is an alternate operation for
6236/// the given \p MainOp and \p AltOp instructions.
6237static bool isAlternateInstruction(const Instruction *I,
6238 const Instruction *MainOp,
6239 const Instruction *AltOp,
6240 const TargetLibraryInfo &TLI);
6241
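/// Heuristic for alternate-opcode bundles (e.g. mixed add/sub): accept the
/// bundle if the target supports the alternating pattern directly, otherwise
/// estimate whether the vectorized form (main op + alternate op + blending
/// shuffle, plus any operand shuffles) is likely cheaper than building the
/// scalars into a vector.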
6242bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
6243 ArrayRef<Value *> VL) const {
6244 unsigned Opcode0 = S.getOpcode();
6245 unsigned Opcode1 = S.getAltOpcode();
6246 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
6247 // If this pattern is supported by the target then consider it profitable.
6248 if (TTI->isLegalAltInstr(getWidenedType(S.MainOp->getType(), VL.size()),
6249 Opcode0, Opcode1, OpcodeMask))
6250 return true;
6251 SmallVector<ValueList> Operands;
6252 for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
6253 Operands.emplace_back();
6254 // Prepare the operand vector.
6255 for (Value *V : VL)
6256 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
6257 }
6258 if (Operands.size() == 2) {
6259 // Try to find the best operand candidates.
6260 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
6261 SmallVector<std::pair<Value *, Value *>> Candidates(3);
6262 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
6263 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
6264 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
6265 std::optional<int> Res = findBestRootPair(Candidates);
6266 switch (Res.value_or(0)) {
6267 case 0:
6268 break;
6269 case 1:
6270 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
6271 break;
6272 case 2:
6273 std::swap(Operands[0][I], Operands[1][I]);
6274 break;
6275 default:
6276 llvm_unreachable("Unexpected index.");
6277 }
6278 }
6279 }
6280 DenseSet<unsigned> UniqueOpcodes;
6281 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
6282 unsigned NonInstCnt = 0;
6283 // Estimate number of instructions, required for the vectorized node and for
6284 // the buildvector node.
6285 unsigned UndefCnt = 0;
6286 // Count the number of extra shuffles, required for vector nodes.
6287 unsigned ExtraShuffleInsts = 0;
6288 // Check that operands do not contain same values and create either perfect
6289 // diamond match or shuffled match.
6290 if (Operands.size() == 2) {
6291 // Do not count same operands twice.
6292 if (Operands.front() == Operands.back()) {
6293 Operands.erase(Operands.begin());
6294 } else if (!allConstant(Operands.front()) &&
6295 all_of(Operands.front(), [&](Value *V) {
6296 return is_contained(Operands.back(), V);
6297 })) {
6298 Operands.erase(Operands.begin());
6299 ++ExtraShuffleInsts;
6300 }
6301 }
6302 const Loop *L = LI->getLoopFor(S.MainOp->getParent());
6303 // Vectorize the node if:
6304 // 1. At least one operand is constant or a splat.
6305 // 2. Operands have many loop invariants (the instructions are not loop
6306 // invariants).
6307 // 3. At least one unique operand is supposed to be vectorized.
6308 return none_of(Operands,
6309 [&](ArrayRef<Value *> Op) {
6310 if (allConstant(Op) ||
6311 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
6312 getSameOpcode(Op, *TLI).MainOp))
6313 return false;
6314 DenseMap<Value *, unsigned> Uniques;
6315 for (Value *V : Op) {
6316 if (isa<Constant, ExtractElementInst>(V) ||
6317 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
6318 if (isa<UndefValue>(V))
6319 ++UndefCnt;
6320 continue;
6321 }
6322 auto Res = Uniques.try_emplace(V, 0);
6323 // Found first duplicate - need to add shuffle.
6324 if (!Res.second && Res.first->second == 1)
6325 ++ExtraShuffleInsts;
6326 ++Res.first->getSecond();
6327 if (auto *I = dyn_cast<Instruction>(V))
6328 UniqueOpcodes.insert(I->getOpcode());
6329 else if (Res.second)
6330 ++NonInstCnt;
6331 }
6332 return none_of(Uniques, [&](const auto &P) {
6333 return P.first->hasNUsesOrMore(P.second + 1) &&
6334 none_of(P.first->users(), [&](User *U) {
6335 return getTreeEntry(U) || Uniques.contains(U);
6336 });
6337 });
6338 }) ||
6339 // Do not vectorize node, if estimated number of vector instructions is
6340 // more than estimated number of buildvector instructions. Number of
6341 // vector operands is number of vector instructions + number of vector
6342 // instructions for operands (buildvectors). Number of buildvector
6343 // instructions is just number_of_operands * number_of_scalars.
6344 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6345 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
6346 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6347}
6348
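/// Performs the per-opcode legality checks for a candidate bundle and decides
/// how it is represented in the tree: Vectorize, ScatterVectorize or
/// StridedVectorize (loads only), or NeedToGather.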
6349BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6350 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
6351 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
6352 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
6353
6354 unsigned ShuffleOrOp =
6355 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
6356 auto *VL0 = cast<Instruction>(S.OpValue);
6357 switch (ShuffleOrOp) {
6358 case Instruction::PHI: {
6359 // Too many operands - gather, most probably won't be vectorized.
6360 if (VL0->getNumOperands() > MaxPHINumOperands)
6361 return TreeEntry::NeedToGather;
6362 // Check for terminator values (e.g. invoke).
6363 for (Value *V : VL)
6364 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
6365 Instruction *Term = dyn_cast<Instruction>(Incoming);
6366 if (Term && Term->isTerminator()) {
6367 LLVM_DEBUG(dbgs()
6368 << "SLP: Need to swizzle PHINodes (terminator use).\n");
6369 return TreeEntry::NeedToGather;
6370 }
6371 }
6372
6373 return TreeEntry::Vectorize;
6374 }
6375 case Instruction::ExtractValue:
6376 case Instruction::ExtractElement: {
6377 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6378 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6379 if (!isPowerOf2_32(VL.size()))
6380 return TreeEntry::NeedToGather;
6381 if (Reuse || !CurrentOrder.empty())
6382 return TreeEntry::Vectorize;
6383 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6384 return TreeEntry::NeedToGather;
6385 }
6386 case Instruction::InsertElement: {
6387 // Check that we have a buildvector and not a shuffle of 2 or more
6388 // different vectors.
6389 ValueSet SourceVectors;
6390 for (Value *V : VL) {
6391 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
6392 assert(getElementIndex(V) != std::nullopt &&
6393 "Non-constant or undef index?");
6394 }
6395
6396 if (count_if(VL, [&SourceVectors](Value *V) {
6397 return !SourceVectors.contains(V);
6398 }) >= 2) {
6399 // Found 2nd source vector - cancel.
6400 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6401 "different source vectors.\n");
6402 return TreeEntry::NeedToGather;
6403 }
6404
6405 return TreeEntry::Vectorize;
6406 }
6407 case Instruction::Load: {
6408 // Check that a vectorized load would load the same memory as a scalar
6409 // load. For example, we don't want to vectorize loads that are smaller
6410 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6411 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6412 // from such a struct, we read/write packed bits disagreeing with the
6413 // unvectorized version.
6414 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
6415 case LoadsState::Vectorize:
6416 return TreeEntry::Vectorize;
6417 case LoadsState::ScatterVectorize:
6418 return TreeEntry::ScatterVectorize;
6419 case LoadsState::StridedVectorize:
6420 return TreeEntry::StridedVectorize;
6421 case LoadsState::Gather:
6422#ifndef NDEBUG
6423 Type *ScalarTy = VL0->getType();
6424 if (DL->getTypeSizeInBits(ScalarTy) !=
6425 DL->getTypeAllocSizeInBits(ScalarTy))
6426 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6427 else if (any_of(VL,
6428 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6429 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6430 else
6431 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6432#endif // NDEBUG
6433 return TreeEntry::NeedToGather;
6434 }
6435 llvm_unreachable("Unexpected state of loads");
6436 }
6437 case Instruction::ZExt:
6438 case Instruction::SExt:
6439 case Instruction::FPToUI:
6440 case Instruction::FPToSI:
6441 case Instruction::FPExt:
6442 case Instruction::PtrToInt:
6443 case Instruction::IntToPtr:
6444 case Instruction::SIToFP:
6445 case Instruction::UIToFP:
6446 case Instruction::Trunc:
6447 case Instruction::FPTrunc:
6448 case Instruction::BitCast: {
6449 Type *SrcTy = VL0->getOperand(0)->getType();
6450 for (Value *V : VL) {
6451 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6452 if (Ty != SrcTy || !isValidElementType(Ty)) {
6453 LLVM_DEBUG(
6454 dbgs() << "SLP: Gathering casts with different src types.\n");
6455 return TreeEntry::NeedToGather;
6456 }
6457 }
6458 return TreeEntry::Vectorize;
6459 }
6460 case Instruction::ICmp:
6461 case Instruction::FCmp: {
6462 // Check that all of the compares have the same predicate.
6463 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6464 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
6465 Type *ComparedTy = VL0->getOperand(0)->getType();
6466 for (Value *V : VL) {
6467 CmpInst *Cmp = cast<CmpInst>(V);
6468 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6469 Cmp->getOperand(0)->getType() != ComparedTy) {
6470 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6471 return TreeEntry::NeedToGather;
6472 }
6473 }
6474 return TreeEntry::Vectorize;
6475 }
6476 case Instruction::Select:
6477 case Instruction::FNeg:
6478 case Instruction::Add:
6479 case Instruction::FAdd:
6480 case Instruction::Sub:
6481 case Instruction::FSub:
6482 case Instruction::Mul:
6483 case Instruction::FMul:
6484 case Instruction::UDiv:
6485 case Instruction::SDiv:
6486 case Instruction::FDiv:
6487 case Instruction::URem:
6488 case Instruction::SRem:
6489 case Instruction::FRem:
6490 case Instruction::Shl:
6491 case Instruction::LShr:
6492 case Instruction::AShr:
6493 case Instruction::And:
6494 case Instruction::Or:
6495 case Instruction::Xor:
6496 case Instruction::Freeze:
6497 return TreeEntry::Vectorize;
6498 case Instruction::GetElementPtr: {
6499 // We don't combine GEPs with complicated (nested) indexing.
6500 for (Value *V : VL) {
6501 auto *I = dyn_cast<GetElementPtrInst>(V);
6502 if (!I)
6503 continue;
6504 if (I->getNumOperands() != 2) {
6505 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6506 return TreeEntry::NeedToGather;
6507 }
6508 }
6509
6510 // We can't combine several GEPs into one vector if they operate on
6511 // different types.
6512 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6513 for (Value *V : VL) {
6514 auto *GEP = dyn_cast<GEPOperator>(V);
6515 if (!GEP)
6516 continue;
6517 Type *CurTy = GEP->getSourceElementType();
6518 if (Ty0 != CurTy) {
6519 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6520 return TreeEntry::NeedToGather;
6521 }
6522 }
6523
6524 // We don't combine GEPs with non-constant indexes.
6525 Type *Ty1 = VL0->getOperand(1)->getType();
6526 for (Value *V : VL) {
6527 auto *I = dyn_cast<GetElementPtrInst>(V);
6528 if (!I)
6529 continue;
6530 auto *Op = I->getOperand(1);
6531 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6532 (Op->getType() != Ty1 &&
6533 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6534 Op->getType()->getScalarSizeInBits() >
6535 DL->getIndexSizeInBits(
6536 V->getType()->getPointerAddressSpace())))) {
6537 LLVM_DEBUG(
6538 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6539 return TreeEntry::NeedToGather;
6540 }
6541 }
6542
6543 return TreeEntry::Vectorize;
6544 }
6545 case Instruction::Store: {
6546 // Check if the stores are consecutive or if we need to swizzle them.
6547 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6548 // Avoid types that are padded when being allocated as scalars, while
6549 // being packed together in a vector (such as i1).
6550 if (DL->getTypeSizeInBits(ScalarTy) !=
6551 DL->getTypeAllocSizeInBits(ScalarTy)) {
6552 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6553 return TreeEntry::NeedToGather;
6554 }
6555 // Make sure all stores in the bundle are simple - we can't vectorize
6556 // atomic or volatile stores.
6557 for (Value *V : VL) {
6558 auto *SI = cast<StoreInst>(V);
6559 if (!SI->isSimple()) {
6560 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6561 return TreeEntry::NeedToGather;
6562 }
6563 PointerOps.push_back(SI->getPointerOperand());
6564 }
6565
6566 // Check the order of pointer operands.
6567 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
6568 Value *Ptr0;
6569 Value *PtrN;
6570 if (CurrentOrder.empty()) {
6571 Ptr0 = PointerOps.front();
6572 PtrN = PointerOps.back();
6573 } else {
6574 Ptr0 = PointerOps[CurrentOrder.front()];
6575 PtrN = PointerOps[CurrentOrder.back()];
6576 }
6577 std::optional<int> Dist =
6578 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6579 // Check that the sorted pointer operands are consecutive.
6580 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6581 return TreeEntry::Vectorize;
6582 }
6583
6584 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6585 return TreeEntry::NeedToGather;
6586 }
6587 case Instruction::Call: {
6588 // Check if the calls are all to the same vectorizable intrinsic or
6589 // library function.
6590 CallInst *CI = cast<CallInst>(VL0);
6591 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6592
6593 VFShape Shape = VFShape::get(
6594 CI->getFunctionType(),
6595 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
6596 false /*HasGlobalPred*/);
6597 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6598
6599 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6600 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6601 return TreeEntry::NeedToGather;
6602 }
6603 Function *F = CI->getCalledFunction();
6604 unsigned NumArgs = CI->arg_size();
6605 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6606 for (unsigned J = 0; J != NumArgs; ++J)
6607 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
6608 ScalarArgs[J] = CI->getArgOperand(J);
6609 for (Value *V : VL) {
6610 CallInst *CI2 = dyn_cast<CallInst>(V);
6611 if (!CI2 || CI2->getCalledFunction() != F ||
6612 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
6613 (VecFunc &&
6614 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6615 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
6616 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6617 << "\n");
6618 return TreeEntry::NeedToGather;
6619 }
6620 // Some intrinsics have scalar arguments and should be same in order for
6621 // them to be vectorized.
6622 for (unsigned J = 0; J != NumArgs; ++J) {
6623 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
6624 Value *A1J = CI2->getArgOperand(J);
6625 if (ScalarArgs[J] != A1J) {
6626 LLVM_DEBUG(dbgs()
6627 << "SLP: mismatched arguments in call:" << *CI
6628 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6629 return TreeEntry::NeedToGather;
6630 }
6631 }
6632 }
6633 // Verify that the bundle operands are identical between the two calls.
6634 if (CI->hasOperandBundles() &&
6635 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6636 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6637 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6638 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6639 << "!=" << *V << '\n');
6640 return TreeEntry::NeedToGather;
6641 }
6642 }
6643
6644 return TreeEntry::Vectorize;
6645 }
6646 case Instruction::ShuffleVector: {
6647 // If this is not an alternate sequence of opcode like add-sub
6648 // then do not vectorize this instruction.
6649 if (!S.isAltShuffle()) {
6650 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6651 return TreeEntry::NeedToGather;
6652 }
6653 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6654 LLVM_DEBUG(
6655 dbgs()
6656 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6657 "the whole alt sequence is not profitable.\n");
6658 return TreeEntry::NeedToGather;
6659 }
6660
6661 return TreeEntry::Vectorize;
6662 }
6663 default:
6664 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6665 return TreeEntry::NeedToGather;
6666 }
6667}
6668
6669namespace {
6670/// Allows to correctly handle operands of the phi nodes based on the \p Main
6671/// PHINode order of incoming basic blocks/values.
6672class PHIHandler {
6673 DominatorTree &DT;
6674 PHINode *Main = nullptr;
6675 SmallVector<Value *> Phis;
6676 SmallVector<SmallVector<Value *>> Operands;
6677
6678public:
6679 PHIHandler() = delete;
6680 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
6681 : DT(DT), Main(Main), Phis(Phis),
6682 Operands(Main->getNumIncomingValues(),
6683 SmallVector<Value *>(Phis.size(), nullptr)) {}
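 /// Fills Operands[I] with the value each PHI in Phis receives from the I-th
 /// incoming block of Main. With few incoming blocks the values are looked up
 /// directly; otherwise the incoming blocks are bucketed first to avoid
 /// repeated getIncomingValueForBlock() lookups.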
6684 void buildOperands() {
6685 constexpr unsigned FastLimit = 4;
6686 if (Main->getNumIncomingValues() <= FastLimit) {
6687 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6688 BasicBlock *InBB = Main->getIncomingBlock(I);
6689 if (!DT.isReachableFromEntry(InBB)) {
6690 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6691 continue;
6692 }
6693 // Prepare the operand vector.
6694 for (auto [Idx, V] : enumerate(Phis)) {
6695 auto *P = cast<PHINode>(V);
6696 if (P->getIncomingBlock(I) == InBB)
6697 Operands[I][Idx] = P->getIncomingValue(I);
6698 else
6699 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
6700 }
6701 }
6702 return;
6703 }
6704 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
6705 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6706 BasicBlock *InBB = Main->getIncomingBlock(I);
6707 if (!DT.isReachableFromEntry(InBB)) {
6708 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6709 continue;
6710 }
6711 Blocks.try_emplace(InBB).first->second.push_back(I);
6712 }
6713 for (auto [Idx, V] : enumerate(Phis)) {
6714 auto *P = cast<PHINode>(V);
6715 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
6716 BasicBlock *InBB = P->getIncomingBlock(I);
6717 if (InBB == Main->getIncomingBlock(I)) {
6718 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
6719 continue;
6720 Operands[I][Idx] = P->getIncomingValue(I);
6721 continue;
6722 }
6723 auto It = Blocks.find(InBB);
6724 if (It == Blocks.end())
6725 continue;
6726 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
6727 }
6728 }
6729 for (const auto &P : Blocks) {
6730 if (P.getSecond().size() <= 1)
6731 continue;
6732 unsigned BasicI = P.getSecond().front();
6733 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
6734 assert(all_of(enumerate(Operands[I]),
6735 [&](const auto &Data) {
6736 return !Data.value() ||
6737 Data.value() == Operands[BasicI][Data.index()];
6738 }) &&
6739 "Expected empty operands list.");
6740 Operands[I] = Operands[BasicI];
6741 }
6742 }
6743 }
6744 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
6745};
6746} // namespace
6747
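/// Recursively builds the vectorizable tree for the bundle \p VL at the given
/// depth: either a vectorizable tree entry is created and the operands are
/// visited recursively, or a gather (buildvector) node is recorded when the
/// bundle cannot be vectorized.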
6748void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6749 const EdgeInfo &UserTreeIdx) {
6750 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6751
6752 SmallVector<int> ReuseShuffleIndices;
6753 SmallVector<Value *> UniqueValues;
6754 SmallVector<Value *> NonUniqueValueVL;
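 // Detects repeated scalars in VL. On success, VL is narrowed to the unique
 // scalars and ReuseShuffleIndices describes how to rebuild the original
 // bundle from them; returns false when the bundle has to be gathered instead.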
6755 auto TryToFindDuplicates = [&](const InstructionsState &S,
6756 bool DoNotFail = false) {
6757 // Check that every instruction appears once in this bundle.
6758 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6759 for (Value *V : VL) {
6760 if (isConstant(V)) {
6761 ReuseShuffleIndices.emplace_back(
6762 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6763 UniqueValues.emplace_back(V);
6764 continue;
6765 }
6766 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6767 ReuseShuffleIndices.emplace_back(Res.first->second);
6768 if (Res.second)
6769 UniqueValues.emplace_back(V);
6770 }
6771 size_t NumUniqueScalarValues = UniqueValues.size();
6772 if (NumUniqueScalarValues == VL.size()) {
6773 ReuseShuffleIndices.clear();
6774 } else {
6775 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6776 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6777 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6778 "for nodes with padding.\n");
6779 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6780 return false;
6781 }
6782 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6783 if (NumUniqueScalarValues <= 1 ||
6784 (UniquePositions.size() == 1 && all_of(UniqueValues,
6785 [](Value *V) {
6786 return isa<UndefValue>(V) ||
6787 !isConstant(V);
6788 })) ||
6789 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6790 if (DoNotFail && UniquePositions.size() > 1 &&
6791 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6792 all_of(UniqueValues, [=](Value *V) {
6793 return isa<ExtractElementInst>(V) ||
6794 areAllUsersVectorized(cast<Instruction>(V),
6795 UserIgnoreList);
6796 })) {
6797 unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6798 if (PWSz == VL.size()) {
6799 ReuseShuffleIndices.clear();
6800 } else {
6801 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6802 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6803 UniqueValues.back());
6804 VL = NonUniqueValueVL;
6805 }
6806 return true;
6807 }
6808 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6809 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6810 return false;
6811 }
6812 VL = UniqueValues;
6813 }
6814 return true;
6815 };
6816
6817 InstructionsState S = getSameOpcode(VL, *TLI);
6818
6819 // Don't vectorize ephemeral values.
6820 if (!EphValues.empty()) {
6821 for (Value *V : VL) {
6822 if (EphValues.count(V)) {
6823 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6824 << ") is ephemeral.\n");
6825 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6826 return;
6827 }
6828 }
6829 }
6830
6831 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6832 // a load), in which case peek through to include it in the tree, without
6833 // ballooning over-budget.
6834 if (Depth >= RecursionMaxDepth &&
6835 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6836 VL.size() >= 4 &&
6837 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6838 return match(I,
6839 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
6840 cast<Instruction>(I)->getOpcode() ==
6841 cast<Instruction>(S.MainOp)->getOpcode();
6842 })))) {
6843 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6844 if (TryToFindDuplicates(S))
6845 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6846 ReuseShuffleIndices);
6847 return;
6848 }
6849
6850 // Don't handle scalable vectors
6851 if (S.getOpcode() == Instruction::ExtractElement &&
6852 isa<ScalableVectorType>(
6853 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6854 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6855 if (TryToFindDuplicates(S))
6856 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6857 ReuseShuffleIndices);
6858 return;
6859 }
6860
6861 // Don't handle vectors.
6862 if (!SLPReVec && S.OpValue->getType()->isVectorTy() &&
6863 !isa<InsertElementInst>(S.OpValue)) {
6864 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6865 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6866 return;
6867 }
6868
6869 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6870 if (!SLPReVec && SI->getValueOperand()->getType()->isVectorTy()) {
6871 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6872 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6873 return;
6874 }
6875
6876 // If all of the operands are identical or constant we have a simple solution.
6877 // If we deal with insert/extract instructions, they all must have constant
6878 // indices, otherwise we should gather them, not try to vectorize.
6879 // If this is an alternate-opcode node with 2 elements whose operands would
6880 // be gathered, do not vectorize.
6881 auto &&NotProfitableForVectorization = [&S, this,
6882 Depth](ArrayRef<Value *> VL) {
6883 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6884 return false;
6885 if (VectorizableTree.size() < MinTreeSize)
6886 return false;
6887 if (Depth >= RecursionMaxDepth - 1)
6888 return true;
6889 // Check if all operands are extracts, part of vector node or can build a
6890 // regular vectorize node.
6891 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6892 for (Value *V : VL) {
6893 auto *I = cast<Instruction>(V);
6894 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6895 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6896 }));
6897 }
6898 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6899 if ((IsCommutative &&
6900 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6901 (!IsCommutative &&
6902 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6903 return true;
6904 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6905 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6906 auto *I1 = cast<Instruction>(VL.front());
6907 auto *I2 = cast<Instruction>(VL.back());
6908 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6909 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6910 I2->getOperand(Op));
6911 if (static_cast<unsigned>(count_if(
6912 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6913 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6914 })) >= S.MainOp->getNumOperands() / 2)
6915 return false;
6916 if (S.MainOp->getNumOperands() > 2)
6917 return true;
6918 if (IsCommutative) {
6919 // Check permuted operands.
6920 Candidates.clear();
6921 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6922 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6923 I2->getOperand((Op + 1) % E));
6924 if (any_of(
6925 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6926 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6927 }))
6928 return false;
6929 }
6930 return true;
6931 };
6932 SmallVector<unsigned> SortedIndices;
6933 BasicBlock *BB = nullptr;
6934 bool IsScatterVectorizeUserTE =
6935 UserTreeIdx.UserTE &&
6936 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6937 bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
6938 bool AreScatterAllGEPSameBlock =
6939 (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() &&
6940 VL.size() > 2 &&
6941 all_of(VL,
6942 [&BB](Value *V) {
6943 auto *I = dyn_cast<GetElementPtrInst>(V);
6944 if (!I)
6945 return doesNotNeedToBeScheduled(V);
6946 if (!BB)
6947 BB = I->getParent();
6948 return BB == I->getParent() && I->getNumOperands() == 2;
6949 }) &&
6950 BB &&
6951 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6952 SortedIndices));
6953 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
6954 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6955 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6956 S.OpValue) &&
6957 !all_of(VL, isVectorLikeInstWithConstOps)) ||
6958 NotProfitableForVectorization(VL)) {
6959 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6960 if (TryToFindDuplicates(S))
6961 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6962 ReuseShuffleIndices);
6963 return;
6964 }
6965
6966 // We now know that this is a vector of instructions of the same type from
6967 // the same block.
6968
6969 // Check if this is a duplicate of another entry.
6970 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6971 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6972 if (!E->isSame(VL)) {
6973 auto It = MultiNodeScalars.find(S.OpValue);
6974 if (It != MultiNodeScalars.end()) {
6975 auto *TEIt = find_if(It->getSecond(),
6976 [&](TreeEntry *ME) { return ME->isSame(VL); });
6977 if (TEIt != It->getSecond().end())
6978 E = *TEIt;
6979 else
6980 E = nullptr;
6981 } else {
6982 E = nullptr;
6983 }
6984 }
6985 if (!E) {
6986 if (!doesNotNeedToBeScheduled(S.OpValue)) {
6987 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6988 if (TryToFindDuplicates(S))
6989 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6990 ReuseShuffleIndices);
6991 return;
6992 }
6993 } else {
6994 // Record the reuse of the tree node. FIXME, currently this is only used
6995 // to properly draw the graph rather than for the actual vectorization.
6996 E->UserTreeIndices.push_back(UserTreeIdx);
6997 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6998 << ".\n");
6999 return;
7000 }
7001 }
7002
7003 // Check that none of the instructions in the bundle are already in the tree.
7004 for (Value *V : VL) {
7005 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
7006 doesNotNeedToBeScheduled(V))
7007 continue;
7008 if (getTreeEntry(V)) {
7009 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
7010 << ") is already in tree.\n");
7011 if (TryToFindDuplicates(S))
7012 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7013 ReuseShuffleIndices);
7014 return;
7015 }
7016 }
7017
7018 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
7019 if (UserIgnoreList && !UserIgnoreList->empty()) {
7020 for (Value *V : VL) {
7021 if (UserIgnoreList && UserIgnoreList->contains(V)) {
7022 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
7023 if (TryToFindDuplicates(S))
7024 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7025 ReuseShuffleIndices);
7026 return;
7027 }
7028 }
7029 }
7030
7031 // Special processing for sorted pointers for ScatterVectorize node with
7032 // constant indices only.
7033 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
7034 assert(S.OpValue->getType()->isPointerTy() &&
7035 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
7036 "Expected pointers only.");
7037 // Reset S to make it GetElementPtr kind of node.
7038 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
7039 assert(It != VL.end() && "Expected at least one GEP.");
7040 S = getSameOpcode(*It, *TLI);
7041 }
7042
7043 // Check that all of the users of the scalars that we want to vectorize are
7044 // schedulable.
7045 auto *VL0 = cast<Instruction>(S.OpValue);
7046 BB = VL0->getParent();
7047
7048 if (!DT->isReachableFromEntry(BB)) {
7049 // Don't go into unreachable blocks. They may contain instructions with
7050 // dependency cycles which confuse the final scheduling.
7051 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
7052 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7053 return;
7054 }
7055
7056 // Don't go into catchswitch blocks, which can happen with PHIs.
7057 // Such blocks can only have PHIs and the catchswitch. There is no
7058 // place to insert a shuffle if we need to, so just avoid that issue.
7059 if (isa<CatchSwitchInst>(BB->getTerminator())) {
7060 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
7061 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7062 return;
7063 }
7064
7065 // Check that every instruction appears once in this bundle.
7066 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
7067 return;
7068
7069 // Perform specific checks for each particular instruction kind.
7070 OrdersType CurrentOrder;
7071 SmallVector<Value *> PointerOps;
7072 TreeEntry::EntryState State = getScalarsVectorizationState(
7073 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
7074 if (State == TreeEntry::NeedToGather) {
7075 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7076 ReuseShuffleIndices);
7077 return;
7078 }
7079
7080 auto &BSRef = BlocksSchedules[BB];
7081 if (!BSRef)
7082 BSRef = std::make_unique<BlockScheduling>(BB);
7083
7084 BlockScheduling &BS = *BSRef;
7085
7086 std::optional<ScheduleData *> Bundle =
7087 BS.tryScheduleBundle(UniqueValues, this, S);
7088#ifdef EXPENSIVE_CHECKS
7089 // Make sure we didn't break any internal invariants
7090 BS.verify();
7091#endif
7092 if (!Bundle) {
7093 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
7094 assert((!BS.getScheduleData(VL0) ||
7095 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
7096 "tryScheduleBundle should cancelScheduling on failure");
7097 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7098 ReuseShuffleIndices);
7099 NonScheduledFirst.insert(VL.front());
7100 return;
7101 }
7102 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
7103
7104 unsigned ShuffleOrOp = S.isAltShuffle() ?
7105 (unsigned) Instruction::ShuffleVector : S.getOpcode();
7106 switch (ShuffleOrOp) {
7107 case Instruction::PHI: {
7108 auto *PH = cast<PHINode>(VL0);
7109
7110 TreeEntry *TE =
7111 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
7112 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
7113
7114 // Keeps the reordered operands to avoid code duplication.
7115 PHIHandler Handler(*DT, PH, VL);
7116 Handler.buildOperands();
7117 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7118 TE->setOperand(I, Handler.getOperands(I));
7119 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7120 buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I});
7121 return;
7122 }
7123 case Instruction::ExtractValue:
7124 case Instruction::ExtractElement: {
7125 if (CurrentOrder.empty()) {
7126 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
7127 } else {
7128 LLVM_DEBUG({
7129 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
7130 "with order";
7131 for (unsigned Idx : CurrentOrder)
7132 dbgs() << " " << Idx;
7133 dbgs() << "\n";
7134 });
7135 fixupOrderingIndices(CurrentOrder);
7136 }
7137 // Insert new order with initial value 0, if it does not exist,
7138 // otherwise return the iterator to the existing one.
7139 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7140 ReuseShuffleIndices, CurrentOrder);
7141 // This is a special case, as it does not gather, but at the same time
7142 // we are not extending buildTree_rec() towards the operands.
7143 ValueList Op0;
7144 Op0.assign(VL.size(), VL0->getOperand(0));
7145 VectorizableTree.back()->setOperand(0, Op0);
7146 return;
7147 }
7148 case Instruction::InsertElement: {
7149 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
7150
7151 auto OrdCompare = [](const std::pair<int, int> &P1,
7152 const std::pair<int, int> &P2) {
7153 return P1.first > P2.first;
7154 };
7155 std::priority_queue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
7156 decltype(OrdCompare)>
7157 Indices(OrdCompare);
7158 for (int I = 0, E = VL.size(); I < E; ++I) {
7159 unsigned Idx = *getElementIndex(VL[I]);
7160 Indices.emplace(Idx, I);
7161 }
7162 OrdersType CurrentOrder(VL.size(), VL.size());
7163 bool IsIdentity = true;
7164 for (int I = 0, E = VL.size(); I < E; ++I) {
7165 CurrentOrder[Indices.top().second] = I;
7166 IsIdentity &= Indices.top().second == I;
7167 Indices.pop();
7168 }
7169 if (IsIdentity)
7170 CurrentOrder.clear();
7171 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7172 std::nullopt, CurrentOrder);
7173 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
7174
7175 TE->setOperandsInOrder();
7176 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
7177 return;
7178 }
7179 case Instruction::Load: {
7180 // Check that a vectorized load would load the same memory as a scalar
7181 // load. For example, we don't want to vectorize loads that are smaller
7182 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7183 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7184 // from such a struct, we read/write packed bits disagreeing with the
7185 // unvectorized version.
7186 TreeEntry *TE = nullptr;
7187 fixupOrderingIndices(CurrentOrder);
7188 switch (State) {
7189 case TreeEntry::Vectorize:
7190 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7191 ReuseShuffleIndices, CurrentOrder);
7192 if (CurrentOrder.empty())
7193 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
7194 else
7195 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
7196 TE->setOperandsInOrder();
7197 break;
7198 case TreeEntry::StridedVectorize:
7199 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
7200 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
7201 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
7202 TE->setOperandsInOrder();
7203 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
7204 break;
7205 case TreeEntry::ScatterVectorize:
7206 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
7207 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
7208 UserTreeIdx, ReuseShuffleIndices);
7209 TE->setOperandsInOrder();
7210 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
7211 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
7212 break;
7213 case TreeEntry::NeedToGather:
7214 llvm_unreachable("Unexpected loads state.");
7215 }
7216 return;
7217 }
7218 case Instruction::ZExt:
7219 case Instruction::SExt:
7220 case Instruction::FPToUI:
7221 case Instruction::FPToSI:
7222 case Instruction::FPExt:
7223 case Instruction::PtrToInt:
7224 case Instruction::IntToPtr:
7225 case Instruction::SIToFP:
7226 case Instruction::UIToFP:
7227 case Instruction::Trunc:
7228 case Instruction::FPTrunc:
7229 case Instruction::BitCast: {
7230 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
7231 std::make_pair(std::numeric_limits<unsigned>::min(),
7232 std::numeric_limits<unsigned>::max()));
7233 if (ShuffleOrOp == Instruction::ZExt ||
7234 ShuffleOrOp == Instruction::SExt) {
7235 CastMaxMinBWSizes = std::make_pair(
7236 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7237 PrevMaxBW),
7238 std::min<unsigned>(
7239 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7240 PrevMinBW));
7241 } else if (ShuffleOrOp == Instruction::Trunc) {
7242 CastMaxMinBWSizes = std::make_pair(
7243 std::max<unsigned>(
7244 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7245 PrevMaxBW),
7246 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7247 PrevMinBW));
7248 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7249 } else if (ShuffleOrOp == Instruction::SIToFP ||
7250 ShuffleOrOp == Instruction::UIToFP) {
7251 unsigned NumSignBits =
7252 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7253 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7254 APInt Mask = DB->getDemandedBits(OpI);
7255 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
7256 }
7257 if (NumSignBits * 2 >=
7258 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7259 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7260 }
7261 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7262 ReuseShuffleIndices);
7263 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
7264
7265 TE->setOperandsInOrder();
7266 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7267 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7268 return;
7269 }
7270 case Instruction::ICmp:
7271 case Instruction::FCmp: {
7272 // Check that all of the compares have the same predicate.
7273 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7274 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7275 ReuseShuffleIndices);
7276 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
7277
7278 ValueList Left, Right;
7279 if (cast<CmpInst>(VL0)->isCommutative()) {
7280 // Commutative predicate - collect + sort operands of the instructions
7281 // so that each side is more likely to have the same opcode.
7282 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
7283 "Commutative Predicate mismatch");
7284 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7285 } else {
7286 // Collect operands - commute if it uses the swapped predicate.
7287 for (Value *V : VL) {
7288 auto *Cmp = cast<CmpInst>(V);
7289 Value *LHS = Cmp->getOperand(0);
7290 Value *RHS = Cmp->getOperand(1);
7291 if (Cmp->getPredicate() != P0)
7292 std::swap(LHS, RHS);
7293 Left.push_back(LHS);
7294 Right.push_back(RHS);
7295 }
7296 }
7297 TE->setOperand(0, Left);
7298 TE->setOperand(1, Right);
7299 buildTree_rec(Left, Depth + 1, {TE, 0});
7300 buildTree_rec(Right, Depth + 1, {TE, 1});
7301 if (ShuffleOrOp == Instruction::ICmp) {
7302 unsigned NumSignBits0 =
7303 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7304 if (NumSignBits0 * 2 >=
7305 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7306 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
7307 unsigned NumSignBits1 =
7308 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
7309 if (NumSignBits1 * 2 >=
7310 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
7311 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
7312 }
7313 return;
7314 }
7315 case Instruction::Select:
7316 case Instruction::FNeg:
7317 case Instruction::Add:
7318 case Instruction::FAdd:
7319 case Instruction::Sub:
7320 case Instruction::FSub:
7321 case Instruction::Mul:
7322 case Instruction::FMul:
7323 case Instruction::UDiv:
7324 case Instruction::SDiv:
7325 case Instruction::FDiv:
7326 case Instruction::URem:
7327 case Instruction::SRem:
7328 case Instruction::FRem:
7329 case Instruction::Shl:
7330 case Instruction::LShr:
7331 case Instruction::AShr:
7332 case Instruction::And:
7333 case Instruction::Or:
7334 case Instruction::Xor:
7335 case Instruction::Freeze: {
7336 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7337 ReuseShuffleIndices);
7338 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
7339
7340 // Sort operands of the instructions so that each side is more likely to
7341 // have the same opcode.
7342 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
7343 ValueList Left, Right;
7344 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7345 TE->setOperand(0, Left);
7346 TE->setOperand(1, Right);
7347 buildTree_rec(Left, Depth + 1, {TE, 0});
7348 buildTree_rec(Right, Depth + 1, {TE, 1});
7349 return;
7350 }
7351
7352 TE->setOperandsInOrder();
7353 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7354 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7355 return;
7356 }
7357 case Instruction::GetElementPtr: {
7358 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7359 ReuseShuffleIndices);
7360 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
7361 SmallVector<ValueList, 2> Operands(2);
7362 // Prepare the operand vector for pointer operands.
7363 for (Value *V : VL) {
7364 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7365 if (!GEP) {
7366 Operands.front().push_back(V);
7367 continue;
7368 }
7369 Operands.front().push_back(GEP->getPointerOperand());
7370 }
7371 TE->setOperand(0, Operands.front());
7372 // Need to cast all indices to the same type before vectorization to
7373 // avoid crash.
7374 // Required to be able to find correct matches between different gather
7375 // nodes and reuse the vectorized values rather than trying to gather them
7376 // again.
7377 int IndexIdx = 1;
7378 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7379 Type *Ty = all_of(VL,
7380 [VL0Ty, IndexIdx](Value *V) {
7381 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7382 if (!GEP)
7383 return true;
7384 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
7385 })
7386 ? VL0Ty
7387 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
7388 ->getPointerOperandType()
7389 ->getScalarType());
7390 // Prepare the operand vector.
7391 for (Value *V : VL) {
7392 auto *I = dyn_cast<GetElementPtrInst>(V);
7393 if (!I) {
7394 Operands.back().push_back(
7395 ConstantInt::get(Ty, 0, /*isSigned=*/false));
7396 continue;
7397 }
7398 auto *Op = I->getOperand(IndexIdx);
7399 auto *CI = dyn_cast<ConstantInt>(Op);
7400 if (!CI)
7401 Operands.back().push_back(Op);
7402 else
7403 Operands.back().push_back(ConstantFoldIntegerCast(
7404 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7405 }
7406 TE->setOperand(IndexIdx, Operands.back());
7407
7408 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7409 buildTree_rec(Operands[I], Depth + 1, {TE, I});
7410 return;
7411 }
7412 case Instruction::Store: {
7413 bool Consecutive = CurrentOrder.empty();
7414 if (!Consecutive)
7415 fixupOrderingIndices(CurrentOrder);
7416 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7417 ReuseShuffleIndices, CurrentOrder);
7418 TE->setOperandsInOrder();
7419 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
7420 if (Consecutive)
7421 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7422 else
7423 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7424 return;
7425 }
7426 case Instruction::Call: {
7427 // Check if the calls are all to the same vectorizable intrinsic or
7428 // library function.
7429 CallInst *CI = cast<CallInst>(VL0);
7430 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7431
7432 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7433 ReuseShuffleIndices);
7434 // Sort operands of the instructions so that each side is more likely to
7435 // have the same opcode.
7436 if (isCommutative(VL0)) {
7437 ValueList Left, Right;
7438 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7439 TE->setOperand(0, Left);
7440 TE->setOperand(1, Right);
7441 SmallVector<ValueList> Operands;
7442 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7443 Operands.emplace_back();
7444 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7445 continue;
7446 for (Value *V : VL) {
7447 auto *CI2 = cast<CallInst>(V);
7448 Operands.back().push_back(CI2->getArgOperand(I));
7449 }
7450 TE->setOperand(I, Operands.back());
7451 }
7452 buildTree_rec(Left, Depth + 1, {TE, 0});
7453 buildTree_rec(Right, Depth + 1, {TE, 1});
7454 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7455 if (Operands[I - 2].empty())
7456 continue;
7457 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7458 }
7459 return;
7460 }
7461 TE->setOperandsInOrder();
7462 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7463 // For scalar operands there is no need to create an entry since there is
7464 // nothing to vectorize.
7465 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7466 continue;
7467 ValueList Operands;
7468 // Prepare the operand vector.
7469 for (Value *V : VL) {
7470 auto *CI2 = cast<CallInst>(V);
7471 Operands.push_back(CI2->getArgOperand(I));
7472 }
7473 buildTree_rec(Operands, Depth + 1, {TE, I});
7474 }
7475 return;
7476 }
7477 case Instruction::ShuffleVector: {
7478 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7479 ReuseShuffleIndices);
7480 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7481
7482 // Reorder operands if reordering would enable vectorization.
7483 auto *CI = dyn_cast<CmpInst>(VL0);
7484 if (isa<BinaryOperator>(VL0) || CI) {
7485 ValueList Left, Right;
7486 if (!CI || all_of(VL, [](Value *V) {
7487 return cast<CmpInst>(V)->isCommutative();
7488 })) {
7489 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7490 } else {
7491 auto *MainCI = cast<CmpInst>(S.MainOp);
7492 auto *AltCI = cast<CmpInst>(S.AltOp);
7493 CmpInst::Predicate MainP = MainCI->getPredicate();
7494 CmpInst::Predicate AltP = AltCI->getPredicate();
7495 assert(MainP != AltP &&
7496 "Expected different main/alternate predicates.");
7497 // Collect operands - commute if it uses the swapped predicate or
7498 // alternate operation.
7499 for (Value *V : VL) {
7500 auto *Cmp = cast<CmpInst>(V);
7501 Value *LHS = Cmp->getOperand(0);
7502 Value *RHS = Cmp->getOperand(1);
7503
7504 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7505 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7506 std::swap(LHS, RHS);
7507 } else {
7508 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7509 std::swap(LHS, RHS);
7510 }
7511 Left.push_back(LHS);
7512 Right.push_back(RHS);
7513 }
7514 }
7515 TE->setOperand(0, Left);
7516 TE->setOperand(1, Right);
7517 buildTree_rec(Left, Depth + 1, {TE, 0});
7518 buildTree_rec(Right, Depth + 1, {TE, 1});
7519 return;
7520 }
7521
7522 TE->setOperandsInOrder();
7523 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7524 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7525 return;
7526 }
7527 default:
7528 break;
7529 }
7530 llvm_unreachable("Unexpected vectorization of the instructions.");
7531}
7532
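/// Returns the number of scalar elements that a homogeneous aggregate or
/// fixed-vector type \p T flattens to, or 0 if the flattened type would not
/// fit the target's vector register size constraints.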
7533unsigned BoUpSLP::canMapToVector(Type *T) const {
7534 unsigned N = 1;
7535 Type *EltTy = T;
7536
7537 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7538 if (auto *ST = dyn_cast<StructType>(EltTy)) {
7539 // Check that struct is homogeneous.
7540 for (const auto *Ty : ST->elements())
7541 if (Ty != *ST->element_begin())
7542 return 0;
7543 N *= ST->getNumElements();
7544 EltTy = *ST->element_begin();
7545 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7546 N *= AT->getNumElements();
7547 EltTy = AT->getElementType();
7548 } else {
7549 auto *VT = cast<FixedVectorType>(EltTy);
7550 N *= VT->getNumElements();
7551 EltTy = VT->getElementType();
7552 }
7553 }
7554
7555 if (!isValidElementType(EltTy))
7556 return 0;
7557 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
7558 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7559 VTSize != DL->getTypeStoreSizeInBits(T))
7560 return 0;
7561 return N;
7562}
7563
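/// Checks whether all extracts in \p VL read from one common source vector (or
/// from an aggregate that can be mapped to a vector), so that they could be
/// replaced by a shuffle of that source. \p CurrentOrder receives the lane
/// order and is left empty when the extracts are already in identity order.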
7564bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7565 SmallVectorImpl<unsigned> &CurrentOrder,
7566 bool ResizeAllowed) const {
7567 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7568 assert(It != VL.end() && "Expected at least one extract instruction.");
7569 auto *E0 = cast<Instruction>(*It);
7570 assert(
7571 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7572 "Invalid opcode");
7573 // Check if all of the extracts come from the same vector and from the
7574 // correct offset.
7575 Value *Vec = E0->getOperand(0);
7576
7577 CurrentOrder.clear();
7578
7579 // We have to extract from a vector/aggregate with the same number of elements.
7580 unsigned NElts;
7581 if (E0->getOpcode() == Instruction::ExtractValue) {
7582 NElts = canMapToVector(Vec->getType());
7583 if (!NElts)
7584 return false;
7585 // Check if load can be rewritten as load of vector.
7586 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7587 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7588 return false;
7589 } else {
7590 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7591 }
7592
7593 unsigned E = VL.size();
7594 if (!ResizeAllowed && NElts != E)
7595 return false;
7596 SmallVector<int> Indices(E, PoisonMaskElem);
7597 unsigned MinIdx = NElts, MaxIdx = 0;
7598 for (auto [I, V] : enumerate(VL)) {
7599 auto *Inst = dyn_cast<Instruction>(V);
7600 if (!Inst)
7601 continue;
7602 if (Inst->getOperand(0) != Vec)
7603 return false;
7604 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7605 if (isa<UndefValue>(EE->getIndexOperand()))
7606 continue;
7607 std::optional<unsigned> Idx = getExtractIndex(Inst);
7608 if (!Idx)
7609 return false;
7610 const unsigned ExtIdx = *Idx;
7611 if (ExtIdx >= NElts)
7612 continue;
7613 Indices[I] = ExtIdx;
7614 if (MinIdx > ExtIdx)
7615 MinIdx = ExtIdx;
7616 if (MaxIdx < ExtIdx)
7617 MaxIdx = ExtIdx;
7618 }
7619 if (MaxIdx - MinIdx + 1 > E)
7620 return false;
7621 if (MaxIdx + 1 <= E)
7622 MinIdx = 0;
7623
7624 // Check that all of the indices extract from the correct offset.
7625 bool ShouldKeepOrder = true;
7626 // Assign to all items the initial value E so we can check if the extract
7627 // instruction index was used already.
7628 // Also, later we can check that all the indices are used and we have a
7629 // consecutive access in the extract instructions, by checking that no
7630 // element of CurrentOrder still has value E.
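 // E.g., for extract indices <1, 2, 3, 0> the loop below fills
 // CurrentOrder = {3, 0, 1, 2} and ShouldKeepOrder ends up false, while for
 // already-ordered indices <0, 1, 2, 3> CurrentOrder is cleared and the
 // function returns true.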
7631 CurrentOrder.assign(E, E);
7632 for (unsigned I = 0; I < E; ++I) {
7633 if (Indices[I] == PoisonMaskElem)
7634 continue;
7635 const unsigned ExtIdx = Indices[I] - MinIdx;
7636 if (CurrentOrder[ExtIdx] != E) {
7637 CurrentOrder.clear();
7638 return false;
7639 }
7640 ShouldKeepOrder &= ExtIdx == I;
7641 CurrentOrder[ExtIdx] = I;
7642 }
7643 if (ShouldKeepOrder)
7644 CurrentOrder.clear();
7645
7646 return ShouldKeepOrder;
7647}
7648
7649bool BoUpSLP::areAllUsersVectorized(
7650 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7651 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7652 all_of(I->users(), [this](User *U) {
7653 return ScalarToTreeEntry.contains(U) ||
7654 isVectorLikeInstWithConstOps(U) ||
7655 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7656 });
7657}
7658
7659static std::pair<InstructionCost, InstructionCost>
7660getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7661 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7662 ArrayRef<Type *> ArgTys) {
7663 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7664
7665 // Calculate the cost of the scalar and vector calls.
7666 FastMathFlags FMF;
7667 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7668 FMF = FPCI->getFastMathFlags();
7669 SmallVector<const Value *> Arguments(CI->args());
7670 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7671 dyn_cast<IntrinsicInst>(CI));
7672 auto IntrinsicCost =
7673 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
7674
7675 auto Shape = VFShape::get(CI->getFunctionType(),
7676 ElementCount::getFixed(VecTy->getNumElements()),
7677 false /*HasGlobalPred*/);
7678 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7679 auto LibCost = IntrinsicCost;
7680 if (!CI->isNoBuiltin() && VecFunc) {
7681 // Calculate the cost of the vector library call.
7682 // If the corresponding vector call is cheaper, return its cost.
7683 LibCost =
7684 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7685 }
7686 return {IntrinsicCost, LibCost};
7687}
7688
7689void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7690 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7691 SmallVectorImpl<Value *> *OpScalars,
7692 SmallVectorImpl<Value *> *AltScalars) const {
7693 unsigned Sz = Scalars.size();
7694 Mask.assign(Sz, PoisonMaskElem);
7695 SmallVector<int> OrderMask;
7696 if (!ReorderIndices.empty())
7697 inversePermutation(ReorderIndices, OrderMask);
7698 for (unsigned I = 0; I < Sz; ++I) {
7699 unsigned Idx = I;
7700 if (!ReorderIndices.empty())
7701 Idx = OrderMask[I];
7702 auto *OpInst = cast<Instruction>(Scalars[Idx]);
7703 if (IsAltOp(OpInst)) {
7704 Mask[I] = Sz + Idx;
7705 if (AltScalars)
7706 AltScalars->push_back(OpInst);
7707 } else {
7708 Mask[I] = Idx;
7709 if (OpScalars)
7710 OpScalars->push_back(OpInst);
7711 }
7712 }
7713 if (!ReuseShuffleIndices.empty()) {
7714 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7715 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7716 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7717 });
7718 Mask.swap(NewMask);
7719 }
7720}
7721
7722static bool isAlternateInstruction(const Instruction *I,
7723 const Instruction *MainOp,
7724 const Instruction *AltOp,
7725 const TargetLibraryInfo &TLI) {
7726 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7727 auto *AltCI = cast<CmpInst>(AltOp);
7728 CmpInst::Predicate MainP = MainCI->getPredicate();
7729 CmpInst::Predicate AltP = AltCI->getPredicate();
7730 assert(MainP != AltP && "Expected different main/alternate predicates.");
7731 auto *CI = cast<CmpInst>(I);
7732 if (isCmpSameOrSwapped(MainCI, CI, TLI))
7733 return false;
7734 if (isCmpSameOrSwapped(AltCI, CI, TLI))
7735 return true;
7736 CmpInst::Predicate P = CI->getPredicate();
7737 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7738
7739 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7740 "CmpInst expected to match either main or alternate predicate or "
7741 "their swap.");
7742 (void)AltP;
7743 return MainP != P && MainP != SwappedP;
7744 }
7745 return I->getOpcode() == AltOp->getOpcode();
7746}
7747
7748TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7749 assert(!Ops.empty());
7750 const auto *Op0 = Ops.front();
7751
7752 const bool IsConstant = all_of(Ops, [](Value *V) {
7753 // TODO: We should allow undef elements here
7754 return isConstant(V) && !isa<UndefValue>(V);
7755 });
7756 const bool IsUniform = all_of(Ops, [=](Value *V) {
7757 // TODO: We should allow undef elements here
7758 return V == Op0;
7759 });
7760 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7761 // TODO: We should allow undef elements here
7762 if (auto *CI = dyn_cast<ConstantInt>(V))
7763 return CI->getValue().isPowerOf2();
7764 return false;
7765 });
7766 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7767 // TODO: We should allow undef elements here
7768 if (auto *CI = dyn_cast<ConstantInt>(V))
7769 return CI->getValue().isNegatedPowerOf2();
7770 return false;
7771 });
7772
7773 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7774 if (IsConstant && IsUniform)
7775 VK = TTI::OK_UniformConstantValue;
7776 else if (IsConstant)
7777 VK = TTI::OK_NonUniformConstantValue;
7778 else if (IsUniform)
7779 VK = TTI::OK_UniformValue;
7780
7781 TTI::OperandValueProperties VP = TTI::OP_None;
7782 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7783 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7784
7785 return {VK, VP};
7786}
7787
7788namespace {
7789/// The base class for shuffle instruction emission and shuffle cost estimation.
7790class BaseShuffleAnalysis {
7791protected:
7792 Type *ScalarTy = nullptr;
7793
7794 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
7795
7796 /// V is expected to be a vectorized value.
7797 /// When REVEC is disabled, there is no difference between VF and
7798 /// VNumElements.
7799 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
7800 /// e.g., if ScalarTy is <4 x Ty> and V is <8 x Ty>, 2 is returned instead
7801 /// of 8.
7802 unsigned getVF(Value *V) const {
7803 assert(V && "V cannot be nullptr");
7804 assert(isa<FixedVectorType>(V->getType()) &&
7805 "V does not have FixedVectorType");
7806 assert(ScalarTy && "ScalarTy cannot be nullptr");
7807 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
7808 unsigned VNumElements =
7809 cast<FixedVectorType>(V->getType())->getNumElements();
7810 assert(VNumElements > ScalarTyNumElements &&
7811 "the number of elements of V is not large enough");
7812 assert(VNumElements % ScalarTyNumElements == 0 &&
7813 "the number of elements of V is not a vectorized value");
7814 return VNumElements / ScalarTyNumElements;
7815 }
7816
7817 /// Checks if the mask is an identity mask.
7818 /// \param IsStrict if true, the function returns false if the mask size
7819 /// does not match the vector size.
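 /// E.g., <0, 1, poison, 3> over a 4-element vector is an identity mask in
 /// both modes; with \p IsStrict == false a longer mask such as
 /// <0, 1, 2, 3, poison, poison, poison, poison> is also accepted, because
 /// every 4-wide submask is either all-poison or an identity.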
7820 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7821 bool IsStrict) {
7822 int Limit = Mask.size();
7823 int VF = VecTy->getNumElements();
7824 int Index = -1;
7825 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7826 return true;
7827 if (!IsStrict) {
7828 // Consider extract subvector starting from index 0.
7829 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7830 Index == 0)
7831 return true;
7832 // All VF-size submasks are identity (e.g.
7833 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7834 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7835 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7836 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7837 ShuffleVectorInst::isIdentityMask(Slice, VF);
7838 }))
7839 return true;
7840 }
7841 return false;
7842 }
7843
7844 /// Tries to combine 2 different masks into a single one.
7845 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7846 /// change the size of the vector, \p LocalVF is the original size of the
7847 /// shuffled vector.
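 /// E.g., with \p LocalVF = 2, \p Mask = <1, 0> and \p ExtMask = <1, 0, 3, 2>
 /// the resulting mask is <0, 1, 0, 1>: applying \p ExtMask on top of the
 /// already permuted 2-element vector is folded into a single permutation of
 /// the original vector.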
7848 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7849 ArrayRef<int> ExtMask) {
7850 unsigned VF = Mask.size();
7851 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7852 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7853 if (ExtMask[I] == PoisonMaskElem)
7854 continue;
7855 int MaskedIdx = Mask[ExtMask[I] % VF];
7856 NewMask[I] =
7857 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7858 }
7859 Mask.swap(NewMask);
7860 }
7861
7862 /// Looks through shuffles trying to reduce final number of shuffles in the
7863 /// code. The function looks through the previously emitted shuffle
7864 /// instructions and properly marks indices in the mask as undef.
7865 /// For example, given the code
7866 /// \code
7867 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7868 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7869 /// \endcode
7870 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7871 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7872 /// <0, 1, 2, 3> for the shuffle.
7873 /// If 2 operands are of different size, the smallest one will be resized and
7874 /// the mask recalculated properly.
7875 /// For example, given the code
7876 /// \code
7877 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7878 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7879 /// \endcode
7880 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
7881 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7882 /// <0, 1, 2, 3> for the shuffle.
7883 /// So, it tries to transform permutations to simple vector merge, if
7884 /// possible.
7885 /// \param V The input vector which must be shuffled using the given \p Mask.
7886 /// If the better candidate is found, \p V is set to this best candidate
7887 /// vector.
7888 /// \param Mask The input mask for the shuffle. If the best candidate is found
7889 /// during looking-through-shuffles attempt, it is updated accordingly.
7890 /// \param SinglePermute true if the shuffle operation is originally a
7891 /// single-value-permutation. In this case the look-through-shuffles procedure
7892 /// may look for resizing shuffles as the best candidates.
7893 /// \return true if the shuffle results in the non-resizing identity shuffle
7894 /// (and thus can be ignored), false - otherwise.
7895 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7896 bool SinglePermute) {
7897 Value *Op = V;
7898 ShuffleVectorInst *IdentityOp = nullptr;
7899 SmallVector<int> IdentityMask;
7900 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7901 // Exit if not a fixed vector type or changing size shuffle.
7902 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7903 if (!SVTy)
7904 break;
7905 // Remember the identity or broadcast mask, if it is not a resizing
7906 // shuffle. If no better candidates are found, this Op and Mask will be
7907 // used in the final shuffle.
7908 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7909 if (!IdentityOp || !SinglePermute ||
7910 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7911 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7912 IdentityMask.size()))) {
7913 IdentityOp = SV;
7914 // Store the current mask in IdentityMask so that later we do not lose
7915 // this info if IdentityOp is selected as the best candidate for the
7916 // permutation.
7917 IdentityMask.assign(Mask);
7918 }
7919 }
7920 // Remember the broadcast mask. If no better candidates are found, this Op
7921 // and Mask will be used in the final shuffle.
7922 // Zero splat can be used as identity too, since it might be used with
7923 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7924 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
7925 // expensive, and the analysis finds out that the source vector is just a
7926 // broadcast, the original mask can be transformed to the identity mask <0,
7927 // 1, 2, 3>.
7928 // \code
7929 // %0 = shuffle %v, poison, zeroinitalizer
7930 // %res = shuffle %0, poison, <3, 1, 2, 0>
7931 // \endcode
7932 // may be transformed to
7933 // \code
7934 // %0 = shuffle %v, poison, zeroinitalizer
7935 // %res = shuffle %0, poison, <0, 1, 2, 3>
7936 // \endcode
7937 if (SV->isZeroEltSplat()) {
7938 IdentityOp = SV;
7939 IdentityMask.assign(Mask);
7940 }
7941 int LocalVF = Mask.size();
7942 if (auto *SVOpTy =
7943 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7944 LocalVF = SVOpTy->getNumElements();
7945 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7946 for (auto [Idx, I] : enumerate(Mask)) {
7947 if (I == PoisonMaskElem ||
7948 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7949 continue;
7950 ExtMask[Idx] = SV->getMaskValue(I);
7951 }
7952 bool IsOp1Undef =
7953 isUndefVector(SV->getOperand(0),
7954 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7955 .all();
7956 bool IsOp2Undef =
7957 isUndefVector(SV->getOperand(1),
7958 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7959 .all();
7960 if (!IsOp1Undef && !IsOp2Undef) {
7961 // Update mask and mark undef elems.
7962 for (int &I : Mask) {
7963 if (I == PoisonMaskElem)
7964 continue;
7965 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
7966 PoisonMaskElem)
7967 I = PoisonMaskElem;
7968 }
7969 break;
7970 }
7971 SmallVector<int> ShuffleMask(SV->getShuffleMask());
7972 combineMasks(LocalVF, ShuffleMask, Mask);
7973 Mask.swap(ShuffleMask);
7974 if (IsOp2Undef)
7975 Op = SV->getOperand(0);
7976 else
7977 Op = SV->getOperand(1);
7978 }
7979 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
7980 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7981 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
7982 if (IdentityOp) {
7983 V = IdentityOp;
7984 assert(Mask.size() == IdentityMask.size() &&
7985 "Expected masks of same sizes.");
7986 // Clear known poison elements.
7987 for (auto [I, Idx] : enumerate(Mask))
7988 if (Idx == PoisonMaskElem)
7989 IdentityMask[I] = PoisonMaskElem;
7990 Mask.swap(IdentityMask);
7991 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7992 return SinglePermute &&
7993 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
7994 /*IsStrict=*/true) ||
7995 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7996 Shuffle->isZeroEltSplat() &&
7997 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
7998 }
7999 V = Op;
8000 return false;
8001 }
8002 V = Op;
8003 return true;
8004 }
8005
8006 /// Smart shuffle instruction emission, walks through shuffles trees and
8007 /// tries to find the best matching vector for the actual shuffle
8008 /// instruction.
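 /// The \p Builder only needs to provide resizeToMatch(), createIdentity(),
 /// createPoison() and createShuffleVector() callbacks, so the same walk is
 /// shared between cost estimation (ShuffleCostBuilder below) and actual
 /// shuffle emission.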
8009 template <typename T, typename ShuffleBuilderTy>
8010 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
8011 ShuffleBuilderTy &Builder) {
8012 assert(V1 && "Expected at least one vector value.");
8013 if (V2)
8014 Builder.resizeToMatch(V1, V2);
8015 int VF = Mask.size();
8016 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
8017 VF = FTy->getNumElements();
8018 if (V2 &&
8019 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
8020 // Peek through shuffles.
8021 Value *Op1 = V1;
8022 Value *Op2 = V2;
8023 int VF =
8024 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8025 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
8026 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
8027 for (int I = 0, E = Mask.size(); I < E; ++I) {
8028 if (Mask[I] < VF)
8029 CombinedMask1[I] = Mask[I];
8030 else
8031 CombinedMask2[I] = Mask[I] - VF;
8032 }
8033 Value *PrevOp1;
8034 Value *PrevOp2;
8035 do {
8036 PrevOp1 = Op1;
8037 PrevOp2 = Op2;
8038 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
8039 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
8040 // Check if we have 2 resizing shuffles - need to peek through operands
8041 // again.
8042 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
8043 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
8044 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
8045 for (auto [Idx, I] : enumerate(CombinedMask1)) {
8046 if (I == PoisonMaskElem)
8047 continue;
8048 ExtMask1[Idx] = SV1->getMaskValue(I);
8049 }
8050 SmallBitVector UseMask1 = buildUseMask(
8051 cast<FixedVectorType>(SV1->getOperand(1)->getType())
8052 ->getNumElements(),
8053 ExtMask1, UseMask::SecondArg);
8054 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
8055 for (auto [Idx, I] : enumerate(CombinedMask2)) {
8056 if (I == PoisonMaskElem)
8057 continue;
8058 ExtMask2[Idx] = SV2->getMaskValue(I);
8059 }
8060 SmallBitVector UseMask2 = buildUseMask(
8061 cast<FixedVectorType>(SV2->getOperand(1)->getType())
8062 ->getNumElements(),
8063 ExtMask2, UseMask::SecondArg);
8064 if (SV1->getOperand(0)->getType() ==
8065 SV2->getOperand(0)->getType() &&
8066 SV1->getOperand(0)->getType() != SV1->getType() &&
8067 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
8068 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
8069 Op1 = SV1->getOperand(0);
8070 Op2 = SV2->getOperand(0);
8071 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
8072 int LocalVF = ShuffleMask1.size();
8073 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
8074 LocalVF = FTy->getNumElements();
8075 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
8076 CombinedMask1.swap(ShuffleMask1);
8077 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
8078 LocalVF = ShuffleMask2.size();
8079 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
8080 LocalVF = FTy->getNumElements();
8081 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
8082 CombinedMask2.swap(ShuffleMask2);
8083 }
8084 }
8085 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
8086 Builder.resizeToMatch(Op1, Op2);
8087 VF = std::max(cast<VectorType>(Op1->getType())
8088 ->getElementCount()
8089 .getKnownMinValue(),
8090 cast<VectorType>(Op2->getType())
8091 ->getElementCount()
8092 .getKnownMinValue());
8093 for (int I = 0, E = Mask.size(); I < E; ++I) {
8094 if (CombinedMask2[I] != PoisonMaskElem) {
8095 assert(CombinedMask1[I] == PoisonMaskElem &&
8096 "Expected undefined mask element");
8097 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
8098 }
8099 }
8100 if (Op1 == Op2 &&
8101 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
8102 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
8103 isa<ShuffleVectorInst>(Op1) &&
8104 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
8105 ArrayRef(CombinedMask1))))
8106 return Builder.createIdentity(Op1);
8107 return Builder.createShuffleVector(
8108 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
8109 CombinedMask1);
8110 }
8111 if (isa<PoisonValue>(V1))
8112 return Builder.createPoison(
8113 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
8114 SmallVector<int> NewMask(Mask);
8115 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
8116 assert(V1 && "Expected non-null value after looking through shuffles.");
8117
8118 if (!IsIdentity)
8119 return Builder.createShuffleVector(V1, NewMask);
8120 return Builder.createIdentity(V1);
8121 }
8122};
8123} // namespace
8124
8125/// Returns the cost of the shuffle instructions with the given \p Kind, vector
8126/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
8127/// subvector pattern.
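/// When \p Mask is recognized as an insert-subvector pattern whose inserted
/// part extends past the end of \p Tp, the shuffle is costed on the widened
/// type instead of as a generic two-source permutation.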
8128static InstructionCost
8129getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
8130 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
8131 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
8132 int Index = 0, VectorType *SubTp = nullptr,
8133 ArrayRef<const Value *> Args = std::nullopt) {
8134 if (Kind != TTI::SK_PermuteTwoSrc)
8135 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8136 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
8137 int NumSubElts;
8138 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
8139 Mask, NumSrcElts, NumSubElts, Index)) {
8140 if (Index + NumSubElts > NumSrcElts &&
8141 Index + NumSrcElts <= static_cast<int>(Mask.size()))
8142 return TTI.getShuffleCost(
8143 TTI::SK_InsertSubvector,
8144 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
8145 TTI::TCK_RecipThroughput, Index, SubTp);
8146 }
8147 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8148}
8149
8150/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
8151static std::pair<InstructionCost, InstructionCost>
8153 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
8154 Type *ScalarTy, VectorType *VecTy) {
8155 InstructionCost ScalarCost = 0;
8156 InstructionCost VecCost = 0;
8157 // Here we differentiate two cases: (1) when Ptrs represent a regular
8158 // vectorization tree node (as they are pointer arguments of scattered
8159 // loads) or (2) when Ptrs are the arguments of loads or stores being
8160 // vectorized as a plain wide unit-stride load/store since all the
8161 // loads/stores are known to be from/to adjacent locations.
8162 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
8163 // Case 2: estimate costs for pointer related costs when vectorizing to
8164 // a wide load/store.
8165 // Scalar cost is estimated as a set of pointers with known relationship
8166 // between them.
8167 // For vector code we will use BasePtr as argument for the wide load/store
8168 // but we also need to account all the instructions which are going to
8169 // stay in vectorized code due to uses outside of these scalar
8170 // loads/stores.
8171 ScalarCost = TTI.getPointersChainCost(
8172 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
8173 CostKind);
8174
8175 SmallVector<const Value *> PtrsRetainedInVecCode;
8176 for (Value *V : Ptrs) {
8177 if (V == BasePtr) {
8178 PtrsRetainedInVecCode.push_back(V);
8179 continue;
8180 }
8181 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8182 // For simplicity assume Ptr to stay in vectorized code if it's not a
8183 // GEP instruction. We don't care since its cost is considered free.
8184 // TODO: We should check for any uses outside of vectorizable tree
8185 // rather than just single use.
8186 if (!Ptr || !Ptr->hasOneUse())
8187 PtrsRetainedInVecCode.push_back(V);
8188 }
8189
8190 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
8191 // If all pointers stay in vectorized code then we don't have
8192 // any savings on that.
8193 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
8194 }
8195 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
8196 TTI::PointersChainInfo::getKnownStride(),
8197 VecTy, CostKind);
8198 } else {
8199 // Case 1: Ptrs are the arguments of loads that we are going to transform
8200 // into masked gather load intrinsic.
8201 // All the scalar GEPs will be removed as a result of vectorization.
8202 // For any external uses of some lanes extract element instructions will
8203 // be generated (which cost is estimated separately).
8204 TTI::PointersChainInfo PtrsInfo =
8205 all_of(Ptrs,
8206 [](const Value *V) {
8207 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8208 return Ptr && !Ptr->hasAllConstantIndices();
8209 })
8210 ? TTI::PointersChainInfo::getUnknownStride()
8211 : TTI::PointersChainInfo::getKnownStride();
8212
8213 ScalarCost =
8214 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
8215 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
8216 if (!BaseGEP) {
8217 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
8218 if (It != Ptrs.end())
8219 BaseGEP = cast<GEPOperator>(*It);
8220 }
8221 if (BaseGEP) {
8222 SmallVector<const Value *> Indices(BaseGEP->indices());
8223 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
8224 BaseGEP->getPointerOperand(), Indices, VecTy,
8225 CostKind);
8226 }
8227 }
8228
8229 return std::make_pair(ScalarCost, VecCost);
8230}
8231
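/// Post-processing of the built vectorizable tree: currently this only turns
/// reversed consecutive loads/stores into strided accesses (stride -1) when
/// the target supports strided memory ops and TTI reports them as cheaper.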
8232void BoUpSLP::transformNodes() {
8233 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8234 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8235 TreeEntry &E = *TE;
8236 switch (E.getOpcode()) {
8237 case Instruction::Load: {
8238 // No need to reorder masked gather loads, just reorder the scalar
8239 // operands.
8240 if (E.State != TreeEntry::Vectorize)
8241 break;
8242 Type *ScalarTy = E.getMainOp()->getType();
8243 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
8244 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
8245 // Check if profitable to represent consecutive load + reverse as strided
8246 // load with stride -1.
8247 if (isReverseOrder(E.ReorderIndices) &&
8248 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8249 SmallVector<int> Mask;
8250 inversePermutation(E.ReorderIndices, Mask);
8251 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
8252 InstructionCost OriginalVecCost =
8253 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
8254 BaseLI->getPointerAddressSpace(), CostKind,
8255 TTI::OperandValueInfo()) +
8256 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8257 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8258 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
8259 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
8260 if (StridedCost < OriginalVecCost)
8261 // Strided load is more profitable than consecutive load + reverse -
8262 // transform the node to strided load.
8263 E.State = TreeEntry::StridedVectorize;
8264 }
8265 break;
8266 }
8267 case Instruction::Store: {
8268 Type *ScalarTy =
8269 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
8270 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
8271 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
8272 // Check if profitable to represent consecutive store + reverse as strided
8273 // store with stride -1.
8274 if (isReverseOrder(E.ReorderIndices) &&
8275 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8276 SmallVector<int> Mask;
8277 inversePermutation(E.ReorderIndices, Mask);
8278 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
8279 InstructionCost OriginalVecCost =
8280 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
8281 BaseSI->getPointerAddressSpace(), CostKind,
8282 TTI::OperandValueInfo()) +
8283 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8284 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8285 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
8286 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
8287 if (StridedCost < OriginalVecCost)
8288 // Strided store is more profitable than consecutive store + reverse -
8289 // transform the node to strided store.
8290 E.State = TreeEntry::StridedVectorize;
8291 }
8292 break;
8293 }
8294 default:
8295 break;
8296 }
8297 }
8298}
8299
8300/// Merges shuffle masks and emits final shuffle instruction, if required. It
8301/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
8302/// when the actual shuffle instruction is generated only if this is actually
8303/// required. Otherwise, the shuffle instruction emission is delayed till the
8304/// end of the process, to reduce the number of emitted instructions and further
8305/// analysis/transformations.
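/// In this cost-estimation variant the "emission" is virtual: no IR is
/// created, only TTI costs are accumulated via ShuffleCostBuilder; the real
/// IR-building counterpart lives elsewhere in this file.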
8306class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
8307 bool IsFinalized = false;
8308 SmallVector<int> CommonMask;
8309 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
8310 const TargetTransformInfo &TTI;
8311 InstructionCost Cost = 0;
8312 SmallDenseSet<Value *> VectorizedVals;
8313 BoUpSLP &R;
8314 SmallPtrSetImpl<Value *> &CheckedExtracts;
8315 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8316 /// While set, we are still estimating the cost for the same nodes, so the
8317 /// actual cost estimation (virtual shuffle instruction emission) can be delayed.
8318 /// This may help to better estimate the cost if the same nodes must be permuted
8319 /// and allows moving most of the long shuffle cost estimation to TTI.
8320 bool SameNodesEstimated = true;
8321
8322 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
8323 if (Ty->getScalarType()->isPointerTy()) {
8324 Constant *Res = ConstantExpr::getIntToPtr(
8325 Constant::getAllOnesValue(
8326 IntegerType::get(Ty->getContext(),
8327 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
8328 Ty->getScalarType());
8329 if (auto *VTy = dyn_cast<VectorType>(Ty))
8330 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
8331 return Res;
8332 }
8333 return Constant::getAllOnesValue(Ty);
8334 }
8335
8336 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
8337 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
8338 return TTI::TCC_Free;
8339 auto *VecTy = getWidenedType(ScalarTy, VL.size());
8340 InstructionCost GatherCost = 0;
8341 SmallVector<Value *> Gathers(VL);
8342 // Improve gather cost for gather of loads, if we can group some of the
8343 // loads into vector loads.
8344 InstructionsState S = getSameOpcode(VL, *R.TLI);
8345 const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
8346 unsigned MinVF = R.getMinVF(2 * Sz);
8347 if (VL.size() > 2 &&
8348 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8349 (InVectors.empty() &&
8350 any_of(seq<unsigned>(0, VL.size() / MinVF),
8351 [&](unsigned Idx) {
8352 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
8353 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
8354 return S.getOpcode() == Instruction::Load &&
8355 !S.isAltShuffle();
8356 }))) &&
8357 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
8358 !isSplat(Gathers)) {
8359 InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
8360 SetVector<Value *> VectorizedLoads;
8361 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
8362 SmallVector<unsigned> ScatterVectorized;
8363 unsigned StartIdx = 0;
8364 unsigned VF = VL.size() / 2;
8365 for (; VF >= MinVF; VF /= 2) {
8366 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
8367 Cnt += VF) {
8368 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
8369 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8370 InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
8371 if (SliceS.getOpcode() != Instruction::Load ||
8372 SliceS.isAltShuffle())
8373 continue;
8374 }
8375 if (!VectorizedLoads.count(Slice.front()) &&
8376 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
8377 SmallVector<Value *> PointerOps;
8378 OrdersType CurrentOrder;
8379 LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
8380 CurrentOrder, PointerOps);
8381 switch (LS) {
8382 case LoadsState::Vectorize:
8383 case LoadsState::ScatterVectorize:
8384 case LoadsState::StridedVectorize:
8385 // Mark the vectorized loads so that we don't vectorize them
8386 // again.
8387 // TODO: better handling of loads with reorders.
8388 if (((LS == LoadsState::Vectorize ||
8389 LS == LoadsState::StridedVectorize) &&
8390 CurrentOrder.empty()) ||
8391 (LS == LoadsState::StridedVectorize &&
8392 isReverseOrder(CurrentOrder)))
8393 VectorizedStarts.emplace_back(Cnt, LS);
8394 else
8395 ScatterVectorized.push_back(Cnt);
8396 VectorizedLoads.insert(Slice.begin(), Slice.end());
8397 // If we vectorized initial block, no need to try to vectorize
8398 // it again.
8399 if (Cnt == StartIdx)
8400 StartIdx += VF;
8401 break;
8402 case LoadsState::Gather:
8403 break;
8404 }
8405 }
8406 }
8407 // Check if the whole array was vectorized already - exit.
8408 if (StartIdx >= VL.size())
8409 break;
8410 // Found vectorizable parts - exit.
8411 if (!VectorizedLoads.empty())
8412 break;
8413 }
8414 if (!VectorizedLoads.empty()) {
8415 unsigned NumParts = TTI.getNumberOfParts(VecTy);
8416 bool NeedInsertSubvectorAnalysis =
8417 !NumParts || (VL.size() / VF) > NumParts;
8418 // Get the cost for gathered loads.
8419 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
8420 if (VectorizedLoads.contains(VL[I]))
8421 continue;
8422 GatherCost +=
8423 getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
8424 }
8425 // Exclude potentially vectorized loads from list of gathered
8426 // scalars.
8427 Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
8428 // The cost for vectorized loads.
8429 InstructionCost ScalarsCost = 0;
8430 for (Value *V : VectorizedLoads) {
8431 auto *LI = cast<LoadInst>(V);
8432 ScalarsCost +=
8433 TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
8434 LI->getAlign(), LI->getPointerAddressSpace(),
8435 CostKind, TTI::OperandValueInfo(), LI);
8436 }
8437 auto *LoadTy = getWidenedType(VL.front()->getType(), VF);
8438 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
8439 auto *LI = cast<LoadInst>(VL[P.first]);
8440 Align Alignment = LI->getAlign();
8441 GatherCost +=
8442 P.second == LoadsState::Vectorize
8443 ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
8444 LI->getPointerAddressSpace(), CostKind,
8445 TTI::OperandValueInfo(), LI)
8446 : TTI.getStridedMemoryOpCost(
8447 Instruction::Load, LoadTy, LI->getPointerOperand(),
8448 /*VariableMask=*/false, Alignment, CostKind, LI);
8449 // Add external uses costs.
8450 for (auto [Idx, V] : enumerate(VL.slice(
8451 P.first, std::min<unsigned>(VL.size() - P.first, VF))))
8452 if (!R.areAllUsersVectorized(cast<Instruction>(V)))
8453 GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement,
8454 LoadTy, CostKind, Idx);
8455 // Estimate GEP cost.
8456 SmallVector<Value *> PointerOps(VF);
8457 for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
8458 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8459 auto [ScalarGEPCost, VectorGEPCost] =
8460 getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
8461 Instruction::Load, CostKind, LI->getType(), LoadTy);
8462 GatherCost += VectorGEPCost - ScalarGEPCost;
8463 }
8464 for (unsigned P : ScatterVectorized) {
8465 auto *LI0 = cast<LoadInst>(VL[P]);
8466 ArrayRef<Value *> Slice = VL.slice(P, VF);
8467 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8468 GatherCost += TTI.getGatherScatterOpCost(
8469 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8470 /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
8471 // Estimate GEP cost.
8472 SmallVector<Value *> PointerOps(VF);
8473 for (auto [I, V] : enumerate(Slice))
8474 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8475 OrdersType Order;
8476 if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
8477 Order)) {
8478 // TODO: improve checks if GEPs can be vectorized.
8479 Value *Ptr0 = PointerOps.front();
8480 Type *ScalarTy = Ptr0->getType();
8481 auto *VecTy = getWidenedType(ScalarTy, VF);
8482 auto [ScalarGEPCost, VectorGEPCost] =
8483 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
8484 CostKind, ScalarTy, VecTy);
8485 GatherCost += VectorGEPCost - ScalarGEPCost;
8486 if (!Order.empty()) {
8487 SmallVector<int> Mask;
8488 inversePermutation(Order, Mask);
8489 GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8490 VecTy, Mask, CostKind);
8491 }
8492 } else {
8493 GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
8494 PointerOps.front()->getType());
8495 }
8496 }
8497 if (NeedInsertSubvectorAnalysis) {
8498 // Add the cost for the subvectors insert.
8499 SmallVector<int> ShuffleMask(VL.size());
8500 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8501 for (unsigned Idx : seq<unsigned>(0, E))
8502 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8503 GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
8504 ShuffleMask, CostKind, I, LoadTy);
8505 }
8506 }
8507 GatherCost -= ScalarsCost;
8508 }
8509 GatherCost = std::min(BaseCost, GatherCost);
8510 } else if (!Root && isSplat(VL)) {
8511 // Found the broadcasting of the single scalar, calculate the cost as
8512 // the broadcast.
8513 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8514 assert(It != VL.end() && "Expected at least one non-undef value.");
8515 // Add broadcast for non-identity shuffle only.
8516 bool NeedShuffle =
8517 count(VL, *It) > 1 &&
8518 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
8519 if (!NeedShuffle)
8520 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
8521 CostKind, std::distance(VL.begin(), It),
8522 PoisonValue::get(VecTy), *It);
8523
8524 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8525 transform(VL, ShuffleMask.begin(), [](Value *V) {
8526 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8527 });
8528 InstructionCost InsertCost =
8529 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
8530 PoisonValue::get(VecTy), *It);
8531 return InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast,
8532 VecTy, ShuffleMask, CostKind,
8533 /*Index=*/0, /*SubTp=*/nullptr,
8534 /*Args=*/*It);
8535 }
8536 return GatherCost +
8537 (all_of(Gathers, IsaPred<UndefValue>)
8538 ? TTI::TCC_Free
8539 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
8540 ScalarTy));
8541 };
8542
8543 /// Compute the cost of creating a vector containing the extracted values from
8544 /// \p VL.
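 /// The mask is processed in per-register chunks: each chunk either reuses a
 /// source register directly, is priced as a subvector extract plus an
 /// in-register permute, or falls back to a full shuffle of the source, and
 /// the cheaper alternative is taken.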
8545 InstructionCost
8546 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8547 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8548 unsigned NumParts) {
8549 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8550 unsigned NumElts =
8551 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8552 auto *EE = dyn_cast<ExtractElementInst>(V);
8553 if (!EE)
8554 return Sz;
8555 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8556 if (!VecTy)
8557 return Sz;
8558 return std::max(Sz, VecTy->getNumElements());
8559 });
8560 // FIXME: this must be moved to TTI for better estimation.
8561 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
8562 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
8563 SmallVectorImpl<unsigned> &Indices)
8564 -> std::optional<TTI::ShuffleKind> {
8565 if (NumElts <= EltsPerVector)
8566 return std::nullopt;
8567 int OffsetReg0 =
8568 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
8569 [](int S, int I) {
8570 if (I == PoisonMaskElem)
8571 return S;
8572 return std::min(S, I);
8573 }),
8574 EltsPerVector);
8575 int OffsetReg1 = OffsetReg0;
8576 DenseSet<int> RegIndices;
8577 // Check if we are permuting the same single or 2 input vectors.
8578 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8579 int FirstRegId = -1;
8580 Indices.assign(1, OffsetReg0);
8581 for (auto [Pos, I] : enumerate(Mask)) {
8582 if (I == PoisonMaskElem)
8583 continue;
8584 int Idx = I - OffsetReg0;
8585 int RegId =
8586 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
8587 if (FirstRegId < 0)
8588 FirstRegId = RegId;
8589 RegIndices.insert(RegId);
8590 if (RegIndices.size() > 2)
8591 return std::nullopt;
8592 if (RegIndices.size() == 2) {
8593 ShuffleKind = TTI::SK_PermuteTwoSrc;
8594 if (Indices.size() == 1) {
8595 OffsetReg1 = alignDown(
8596 std::accumulate(
8597 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
8598 [&](int S, int I) {
8599 if (I == PoisonMaskElem)
8600 return S;
8601 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
8602 ((I - OffsetReg0) % NumElts) / EltsPerVector;
8603 if (RegId == FirstRegId)
8604 return S;
8605 return std::min(S, I);
8606 }),
8607 EltsPerVector);
8608 Indices.push_back(OffsetReg1 % NumElts);
8609 }
8610 Idx = I - OffsetReg1;
8611 }
8612 I = (Idx % NumElts) % EltsPerVector +
8613 (RegId == FirstRegId ? 0 : EltsPerVector);
8614 }
8615 return ShuffleKind;
8616 };
8617 InstructionCost Cost = 0;
8618
8619 // Process extracts in blocks of EltsPerVector to check if the source vector
8620 // operand can be re-used directly. If not, add the cost of creating a
8621 // shuffle to extract the values into a vector register.
8622 for (unsigned Part : seq<unsigned>(NumParts)) {
8623 if (!ShuffleKinds[Part])
8624 continue;
8625 ArrayRef<int> MaskSlice = Mask.slice(
8626 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
8627 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8628 copy(MaskSlice, SubMask.begin());
8629 SmallVector<unsigned, 2> Indices;
8630 std::optional<TTI::ShuffleKind> RegShuffleKind =
8631 CheckPerRegistersShuffle(SubMask, Indices);
8632 if (!RegShuffleKind) {
8633 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
8634 !ShuffleVectorInst::isIdentityMask(
8635 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
8636 Cost +=
8637 ::getShuffleCost(TTI, *ShuffleKinds[Part],
8638 getWidenedType(ScalarTy, NumElts), MaskSlice);
8639 continue;
8640 }
8641 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8642 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
8643 Cost +=
8644 ::getShuffleCost(TTI, *RegShuffleKind,
8645 getWidenedType(ScalarTy, EltsPerVector), SubMask);
8646 }
8647 for (unsigned Idx : Indices) {
8648 assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
8649 "SK_ExtractSubvector index out of range");
8650 Cost += ::getShuffleCost(
8651 TTI, TTI::SK_ExtractSubvector,
8652 getWidenedType(ScalarTy, alignTo(NumElts, EltsPerVector)),
8653 std::nullopt, CostKind, Idx,
8654 getWidenedType(ScalarTy, EltsPerVector));
8655 }
8656 // Second attempt to check, if just a permute is better estimated than
8657 // subvector extract.
8658 SubMask.assign(NumElts, PoisonMaskElem);
8659 copy(MaskSlice, SubMask.begin());
8660 InstructionCost OriginalCost = ::getShuffleCost(
8661 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
8662 if (OriginalCost < Cost)
8663 Cost = OriginalCost;
8664 }
8665 return Cost;
8666 }
8667 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
8668 /// shuffle emission.
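 /// Lanes whose corresponding \p Mask element is defined are rewritten to
 /// refer to their own position in the just-emitted shuffle result; poison
 /// lanes keep their previous value. E.g., with \p Mask = <2, poison, 1> and
 /// an all-poison common mask, the common mask becomes <0, poison, 2>.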
8669 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8670 ArrayRef<int> Mask) {
8671 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8672 if (Mask[Idx] != PoisonMaskElem)
8673 CommonMask[Idx] = Idx;
8674 }
8675 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
8676 /// mask \p Mask, register number \p Part, that includes \p SliceSize
8677 /// elements.
8678 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8679 ArrayRef<int> Mask, unsigned Part,
8680 unsigned SliceSize) {
8681 if (SameNodesEstimated) {
8682 // Delay the cost estimation if the same nodes are reshuffling.
8683 // If we already requested the cost of reshuffling of E1 and E2 before, no
8684 // need to estimate another cost with the sub-Mask, instead include this
8685 // sub-Mask into the CommonMask to estimate it later and avoid double cost
8686 // estimation.
8687 if ((InVectors.size() == 2 &&
8688 InVectors.front().get<const TreeEntry *>() == &E1 &&
8689 InVectors.back().get<const TreeEntry *>() == E2) ||
8690 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8691 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
8692 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
8693 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8694 "Expected all poisoned elements.");
8695 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
8696 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8697 return;
8698 }
8699 // Found non-matching nodes - need to estimate the cost for the matched
8700 // and transform mask.
8701 Cost += createShuffle(InVectors.front(),
8702 InVectors.size() == 1 ? nullptr : InVectors.back(),
8703 CommonMask);
8704 transformMaskAfterShuffle(CommonMask, CommonMask);
8705 }
8706 SameNodesEstimated = false;
8707 if (!E2 && InVectors.size() == 1) {
8708 unsigned VF = E1.getVectorFactor();
8709 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8710 VF = std::max(VF,
8711 cast<FixedVectorType>(V1->getType())->getNumElements());
8712 } else {
8713 const auto *E = InVectors.front().get<const TreeEntry *>();
8714 VF = std::max(VF, E->getVectorFactor());
8715 }
8716 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8717 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8718 CommonMask[Idx] = Mask[Idx] + VF;
8719 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8720 transformMaskAfterShuffle(CommonMask, CommonMask);
8721 } else {
8722 Cost += createShuffle(&E1, E2, Mask);
8723 transformMaskAfterShuffle(CommonMask, Mask);
8724 }
8725 }
8726
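 /// Builder callbacks used by BaseShuffleAnalysis::createShuffle for cost
 /// estimation only: identity shuffles, poison vectors and plain value reuse
 /// are free, real permutations are priced through TTI shuffle costs.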
8727 class ShuffleCostBuilder {
8728 const TargetTransformInfo &TTI;
8729
8730 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8731 int Index = -1;
8732 return Mask.empty() ||
8733 (VF == Mask.size() &&
8734 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
8735 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
8736 Index == 0);
8737 }
8738
8739 public:
8740 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8741 ~ShuffleCostBuilder() = default;
8742 InstructionCost createShuffleVector(Value *V1, Value *,
8743 ArrayRef<int> Mask) const {
8744 // Empty mask or identity mask are free.
8745 unsigned VF =
8746 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8747 if (isEmptyOrIdentity(Mask, VF))
8748 return TTI::TCC_Free;
8749 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8750 cast<VectorType>(V1->getType()), Mask);
8751 }
8752 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8753 // Empty mask or identity mask are free.
8754 unsigned VF =
8755 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8756 if (isEmptyOrIdentity(Mask, VF))
8757 return TTI::TCC_Free;
8758 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8759 cast<VectorType>(V1->getType()), Mask);
8760 }
8761 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8762 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8763 return TTI::TCC_Free;
8764 }
8765 void resizeToMatch(Value *&, Value *&) const {}
8766 };
8767
8768 /// Smart shuffle instruction emission, walks through shuffles trees and
8769 /// tries to find the best matching vector for the actual shuffle
8770 /// instruction.
8772 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8774 ArrayRef<int> Mask) {
8775 ShuffleCostBuilder Builder(TTI);
8776 SmallVector<int> CommonMask(Mask);
8777 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8778 unsigned CommonVF = Mask.size();
8779 InstructionCost ExtraCost = 0;
8780 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
8781 unsigned VF) -> InstructionCost {
8782 if (E.isGather() && allConstant(E.Scalars))
8783 return TTI::TCC_Free;
8784 Type *EScalarTy = E.Scalars.front()->getType();
8785 bool IsSigned = true;
8786 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8787 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
8788 IsSigned = It->second.second;
8789 }
8790 if (EScalarTy != ScalarTy) {
8791 unsigned CastOpcode = Instruction::Trunc;
8792 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8793 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8794 if (DstSz > SrcSz)
8795 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8796 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
8797 getWidenedType(EScalarTy, VF),
8798 TTI::CastContextHint::None, CostKind);
8799 }
8800 return TTI::TCC_Free;
8801 };
8802 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
8803 if (isa<Constant>(V))
8804 return TTI::TCC_Free;
8805 auto *VecTy = cast<VectorType>(V->getType());
8806 Type *EScalarTy = VecTy->getElementType();
8807 if (EScalarTy != ScalarTy) {
8808 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
8809 unsigned CastOpcode = Instruction::Trunc;
8810 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8811 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8812 if (DstSz > SrcSz)
8813 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8814 return TTI.getCastInstrCost(
8815 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
8816 VecTy, TTI::CastContextHint::None, CostKind);
8817 }
8818 return TTI::TCC_Free;
8819 };
8820 if (!V1 && !V2 && !P2.isNull()) {
8821 // Shuffle 2 entry nodes.
8822 const TreeEntry *E = P1.get<const TreeEntry *>();
8823 unsigned VF = E->getVectorFactor();
8824 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8825 CommonVF = std::max(VF, E2->getVectorFactor());
8826 assert(all_of(Mask,
8827 [=](int Idx) {
8828 return Idx < 2 * static_cast<int>(CommonVF);
8829 }) &&
8830 "All elements in mask must be less than 2 * CommonVF.");
8831 if (E->Scalars.size() == E2->Scalars.size()) {
8832 SmallVector<int> EMask = E->getCommonMask();
8833 SmallVector<int> E2Mask = E2->getCommonMask();
8834 if (!EMask.empty() || !E2Mask.empty()) {
8835 for (int &Idx : CommonMask) {
8836 if (Idx == PoisonMaskElem)
8837 continue;
8838 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8839 Idx = EMask[Idx];
8840 else if (Idx >= static_cast<int>(CommonVF))
8841 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8842 E->Scalars.size();
8843 }
8844 }
8845 CommonVF = E->Scalars.size();
8846 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8847 GetNodeMinBWAffectedCost(*E2, CommonVF);
8848 } else {
8849 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8850 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8851 }
8852 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8853 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8854 } else if (!V1 && P2.isNull()) {
8855 // Shuffle single entry node.
8856 const TreeEntry *E = P1.get<const TreeEntry *>();
8857 unsigned VF = E->getVectorFactor();
8858 CommonVF = VF;
8859 assert(
8860 all_of(Mask,
8861 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8862 "All elements in mask must be less than CommonVF.");
8863 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8864 SmallVector<int> EMask = E->getCommonMask();
8865 assert(!EMask.empty() && "Expected non-empty common mask.");
8866 for (int &Idx : CommonMask) {
8867 if (Idx != PoisonMaskElem)
8868 Idx = EMask[Idx];
8869 }
8870 CommonVF = E->Scalars.size();
8871 }
8872 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8873 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8874 // Not identity/broadcast? Try to see if the original vector is better.
8875 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8876 CommonVF == CommonMask.size() &&
8877 any_of(enumerate(CommonMask),
8878 [](const auto &&P) {
8879 return P.value() != PoisonMaskElem &&
8880 static_cast<unsigned>(P.value()) != P.index();
8881 }) &&
8882 any_of(CommonMask,
8883 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8884 SmallVector<int> ReorderMask;
8885 inversePermutation(E->ReorderIndices, ReorderMask);
8886 ::addMask(CommonMask, ReorderMask);
8887 }
8888 } else if (V1 && P2.isNull()) {
8889 // Shuffle single vector.
8890 ExtraCost += GetValueMinBWAffectedCost(V1);
8891 CommonVF = getVF(V1);
8892 assert(
8893 all_of(Mask,
8894 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8895 "All elements in mask must be less than CommonVF.");
8896 } else if (V1 && !V2) {
8897 // Shuffle vector and tree node.
8898 unsigned VF = getVF(V1);
8899 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8900 CommonVF = std::max(VF, E2->getVectorFactor());
8901 assert(all_of(Mask,
8902 [=](int Idx) {
8903 return Idx < 2 * static_cast<int>(CommonVF);
8904 }) &&
8905 "All elements in mask must be less than 2 * CommonVF.");
8906 if (E2->Scalars.size() == VF && VF != CommonVF) {
8907 SmallVector<int> E2Mask = E2->getCommonMask();
8908 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8909 for (int &Idx : CommonMask) {
8910 if (Idx == PoisonMaskElem)
8911 continue;
8912 if (Idx >= static_cast<int>(CommonVF))
8913 Idx = E2Mask[Idx - CommonVF] + VF;
8914 }
8915 CommonVF = VF;
8916 }
8917 ExtraCost += GetValueMinBWAffectedCost(V1);
8918 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8919 ExtraCost += GetNodeMinBWAffectedCost(
8920 *E2, std::min(CommonVF, E2->getVectorFactor()));
8921 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8922 } else if (!V1 && V2) {
8923 // Shuffle vector and tree node.
8924 unsigned VF = getVF(V2);
8925 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8926 CommonVF = std::max(VF, E1->getVectorFactor());
8927 assert(all_of(Mask,
8928 [=](int Idx) {
8929 return Idx < 2 * static_cast<int>(CommonVF);
8930 }) &&
8931 "All elements in mask must be less than 2 * CommonVF.");
8932 if (E1->Scalars.size() == VF && VF != CommonVF) {
8933 SmallVector<int> E1Mask = E1->getCommonMask();
8934 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8935 for (int &Idx : CommonMask) {
8936 if (Idx == PoisonMaskElem)
8937 continue;
8938 if (Idx >= static_cast<int>(CommonVF))
8939 Idx = E1Mask[Idx - CommonVF] + VF;
8940 else
8941 Idx = E1Mask[Idx];
8942 }
8943 CommonVF = VF;
8944 }
8945 ExtraCost += GetNodeMinBWAffectedCost(
8946 *E1, std::min(CommonVF, E1->getVectorFactor()));
8947 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8948 ExtraCost += GetValueMinBWAffectedCost(V2);
8949 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8950 } else {
8951 assert(V1 && V2 && "Expected both vectors.");
8952 unsigned VF = getVF(V1);
8953 CommonVF = std::max(VF, getVF(V2));
8954 assert(all_of(Mask,
8955 [=](int Idx) {
8956 return Idx < 2 * static_cast<int>(CommonVF);
8957 }) &&
8958 "All elements in mask must be less than 2 * CommonVF.");
8959 ExtraCost +=
8960 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8961 if (V1->getType() != V2->getType()) {
8962 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8963 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8964 } else {
8965 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
8966 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8967 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
8968 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8969 }
8970 }
8971 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
8972 assert(SLPReVec && "FixedVectorType is not expected.");
8974 CommonMask);
8975 }
8976 InVectors.front() =
8977 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
8978 if (InVectors.size() == 2)
8979 InVectors.pop_back();
8980 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8981 V1, V2, CommonMask, Builder);
8982 }
8983
8984public:
8985 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
8986 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8987 SmallPtrSetImpl<Value *> &CheckedExtracts)
8988 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
8989 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8990 CheckedExtracts(CheckedExtracts) {}
8991 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8992 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8993 unsigned NumParts, bool &UseVecBaseAsInput) {
8994 UseVecBaseAsInput = false;
8995 if (Mask.empty())
8996 return nullptr;
8997 Value *VecBase = nullptr;
8998 ArrayRef<Value *> VL = E->Scalars;
8999 // If the resulting type is scalarized, do not adjust the cost.
9000 if (NumParts == VL.size())
9001 return nullptr;
9002 // Check if it can be considered reused if same extractelements were
9003 // vectorized already.
9004 bool PrevNodeFound = any_of(
9005 ArrayRef(R.VectorizableTree).take_front(E->Idx),
9006 [&](const std::unique_ptr<TreeEntry> &TE) {
9007 return ((!TE->isAltShuffle() &&
9008 TE->getOpcode() == Instruction::ExtractElement) ||
9009 TE->isGather()) &&
9010 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
9011 return VL.size() > Data.index() &&
9012 (Mask[Data.index()] == PoisonMaskElem ||
9013 isa<UndefValue>(VL[Data.index()]) ||
9014 Data.value() == VL[Data.index()]);
9015 });
9016 });
9017 SmallPtrSet<Value *, 4> UniqueBases;
9018 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
9019 for (unsigned Part : seq<unsigned>(NumParts)) {
9020 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
9021 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
9022 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
9023 // Ignore non-extractelement scalars.
9024 if (isa<UndefValue>(V) ||
9025 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
9026 continue;
9027 // If all users of instruction are going to be vectorized and this
9028 // instruction itself is not going to be vectorized, consider this
9029 // instruction as dead and remove its cost from the final cost of the
9030 // vectorized tree.
9031 // Also, avoid adjusting the cost for extractelements with multiple uses
9032 // in different graph entries.
9033 auto *EE = cast<ExtractElementInst>(V);
9034 VecBase = EE->getVectorOperand();
9035 UniqueBases.insert(VecBase);
9036 const TreeEntry *VE = R.getTreeEntry(V);
9037 if (!CheckedExtracts.insert(V).second ||
9038 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
9039 any_of(EE->users(),
9040 [&](User *U) {
9041 return isa<GetElementPtrInst>(U) &&
9042 !R.areAllUsersVectorized(cast<Instruction>(U),
9043 &VectorizedVals);
9044 }) ||
9045 (VE && VE != E))
9046 continue;
9047 std::optional<unsigned> EEIdx = getExtractIndex(EE);
9048 if (!EEIdx)
9049 continue;
9050 unsigned Idx = *EEIdx;
9051 // Take credit for instruction that will become dead.
9052 if (EE->hasOneUse() || !PrevNodeFound) {
9053 Instruction *Ext = EE->user_back();
9054 if (isa<SExtInst, ZExtInst>(Ext) &&
9055 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9056 // Use getExtractWithExtendCost() to calculate the cost of
9057 // extractelement/ext pair.
9058 Cost -=
9059 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
9060 EE->getVectorOperandType(), Idx);
9061 // Add back the cost of s|zext which is subtracted separately.
9062 Cost += TTI.getCastInstrCost(
9063 Ext->getOpcode(), Ext->getType(), EE->getType(),
9064 TTI::getCastContextHint(Ext), CostKind, Ext);
9065 continue;
9066 }
9067 }
9068 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
9069 CostKind, Idx);
9070 }
9071 }
9072 // Check that gather of extractelements can be represented as just a
9073 // shuffle of one or two vectors the scalars are extracted from.
9074 // Found the bunch of extractelement instructions that must be gathered
9075 // into a vector and can be represented as a permutation of elements in a
9076 // single input vector or of 2 input vectors.
9077 // Also done for reused extracts if the same extractelements were vectorized already.
9078 if (!PrevNodeFound)
9079 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
9080 InVectors.assign(1, E);
9081 CommonMask.assign(Mask.begin(), Mask.end());
9082 transformMaskAfterShuffle(CommonMask, CommonMask);
9083 SameNodesEstimated = false;
9084 if (NumParts != 1 && UniqueBases.size() != 1) {
9085 UseVecBaseAsInput = true;
9086 VecBase =
9087 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
9088 }
9089 return VecBase;
9090 }
9091 /// Checks if the specified entry \p E needs to be delayed because of its
9092 /// dependency nodes.
9093 std::optional<InstructionCost>
9094 needToDelay(const TreeEntry *,
9095 ArrayRef<SmallVector<const TreeEntry *>>) const final {
9096 // No need to delay the cost estimation during analysis.
9097 return std::nullopt;
9098 }
9099 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
9100 if (&E1 == &E2) {
9101 assert(all_of(Mask,
9102 [&](int Idx) {
9103 return Idx < static_cast<int>(E1.getVectorFactor());
9104 }) &&
9105 "Expected single vector shuffle mask.");
9106 add(E1, Mask);
9107 return;
9108 }
9109 if (InVectors.empty()) {
9110 CommonMask.assign(Mask.begin(), Mask.end());
9111 InVectors.assign({&E1, &E2});
9112 return;
9113 }
9114 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9115 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9116 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9117 if (NumParts == 0 || NumParts >= Mask.size())
9118 NumParts = 1;
9119 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9120 const auto *It =
9121 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9122 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9123 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
9124 }
9125 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
9126 if (InVectors.empty()) {
9127 CommonMask.assign(Mask.begin(), Mask.end());
9128 InVectors.assign(1, &E1);
9129 return;
9130 }
9131 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9132 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9133 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9134 if (NumParts == 0 || NumParts >= Mask.size())
9135 NumParts = 1;
9136 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9137 const auto *It =
9138 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9139 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9140 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
9141 if (!SameNodesEstimated && InVectors.size() == 1)
9142 InVectors.emplace_back(&E1);
9143 }
9144 /// Adds 2 input vectors and the mask for their shuffling.
9145 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
9146 // May come only for shuffling of 2 vectors with extractelements, already
9147 // handled in adjustExtracts.
9148 assert(InVectors.size() == 1 &&
9149 all_of(enumerate(CommonMask),
9150 [&](auto P) {
9151 if (P.value() == PoisonMaskElem)
9152 return Mask[P.index()] == PoisonMaskElem;
9153 auto *EI =
9154 cast<ExtractElementInst>(InVectors.front()
9155 .get<const TreeEntry *>()
9156 ->Scalars[P.index()]);
9157 return EI->getVectorOperand() == V1 ||
9158 EI->getVectorOperand() == V2;
9159 }) &&
9160 "Expected extractelement vectors.");
9161 }
9162 /// Adds another one input vector and the mask for the shuffling.
9163 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
9164 if (InVectors.empty()) {
9165 assert(CommonMask.empty() && !ForExtracts &&
9166 "Expected empty input mask/vectors.");
9167 CommonMask.assign(Mask.begin(), Mask.end());
9168 InVectors.assign(1, V1);
9169 return;
9170 }
9171 if (ForExtracts) {
9172 // No need to add vectors here, already handled them in adjustExtracts.
9173 assert(InVectors.size() == 1 &&
9174 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
9175 all_of(enumerate(CommonMask),
9176 [&](auto P) {
9177 Value *Scalar = InVectors.front()
9178 .get<const TreeEntry *>()
9179 ->Scalars[P.index()];
9180 if (P.value() == PoisonMaskElem)
9181 return P.value() == Mask[P.index()] ||
9182 isa<UndefValue>(Scalar);
9183 if (isa<Constant>(V1))
9184 return true;
9185 auto *EI = cast<ExtractElementInst>(Scalar);
9186 return EI->getVectorOperand() == V1;
9187 }) &&
9188 "Expected only tree entry for extractelement vectors.");
9189 return;
9190 }
9191 assert(!InVectors.empty() && !CommonMask.empty() &&
9192 "Expected only tree entries from extracts/reused buildvectors.");
9193 unsigned VF = getVF(V1);
9194 if (InVectors.size() == 2) {
9195 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
9196 transformMaskAfterShuffle(CommonMask, CommonMask);
9197 VF = std::max<unsigned>(VF, CommonMask.size());
9198 } else if (const auto *InTE =
9199 InVectors.front().dyn_cast<const TreeEntry *>()) {
9200 VF = std::max(VF, InTE->getVectorFactor());
9201 } else {
9202 VF = std::max(
9203 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
9204 ->getNumElements());
9205 }
9206 InVectors.push_back(V1);
9207 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9208 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
9209 CommonMask[Idx] = Mask[Idx] + VF;
9210 }
9211 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
9212 Value *Root = nullptr) {
9213 Cost += getBuildVectorCost(VL, Root);
9214 if (!Root) {
9215 // FIXME: Need to find a way to avoid use of getNullValue here.
9216 SmallVector<Constant *> Vals;
9217 unsigned VF = VL.size();
9218 if (MaskVF != 0)
9219 VF = std::min(VF, MaskVF);
9220 for (Value *V : VL.take_front(VF)) {
9221 if (isa<UndefValue>(V)) {
9222 Vals.push_back(cast<Constant>(V));
9223 continue;
9224 }
9225 Vals.push_back(Constant::getNullValue(V->getType()));
9226 }
9227 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
9228 assert(SLPReVec && "FixedVectorType is not expected.");
9229 // When REVEC is enabled, we need to expand vector types into scalar
9230 // types.
9231 unsigned VecTyNumElements = VecTy->getNumElements();
9232 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
9233 for (auto [I, V] : enumerate(Vals)) {
9234 Type *ScalarTy = V->getType()->getScalarType();
9235 Constant *NewVal;
9236 if (isa<PoisonValue>(V))
9237 NewVal = PoisonValue::get(ScalarTy);
9238 else if (isa<UndefValue>(V))
9239 NewVal = UndefValue::get(ScalarTy);
9240 else
9241 NewVal = Constant::getNullValue(ScalarTy);
9242 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
9243 NewVal);
9244 }
9245 Vals.swap(NewVals);
9246 }
9247 return ConstantVector::get(Vals);
9248 }
9249 return ConstantVector::getSplat(
9250 ElementCount::getFixed(
9251 cast<FixedVectorType>(Root->getType())->getNumElements()),
9252 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
9253 }
9254 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
9255 /// Finalize emission of the shuffles.
9256 InstructionCost
9257 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
9258 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
9259 IsFinalized = true;
9260 if (Action) {
9261 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
9262 if (InVectors.size() == 2)
9263 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
9264 else
9265 Cost += createShuffle(Vec, nullptr, CommonMask);
9266 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9267 if (CommonMask[Idx] != PoisonMaskElem)
9268 CommonMask[Idx] = Idx;
9269 assert(VF > 0 &&
9270 "Expected vector length for the final value before action.");
9271 Value *V = Vec.get<Value *>();
9272 Action(V, CommonMask);
9273 InVectors.front() = V;
9274 }
9275 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
9276 if (CommonMask.empty()) {
9277 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
9278 return Cost;
9279 }
9280 return Cost +
9281 createShuffle(InVectors.front(),
9282 InVectors.size() == 2 ? InVectors.back() : nullptr,
9283 CommonMask);
9284 }
9285
9286 ~ShuffleCostEstimator() {
9287 assert((IsFinalized || CommonMask.empty()) &&
9288 "Shuffle construction must be finalized.");
9289 }
9290};
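// A rough usage sketch of the estimator above (it mirrors the emission path
// driven by processBuildVector): adjustExtracts() first credits
// extractelements that are expected to become dead, the add() overloads fold
// the per-part masks into a single CommonMask over one or two sources, and
// finalize() charges the remaining single- or two-source permutation. For
// example, gathering {a[0], a[1], b[0], b[1]} from two source vectors a and b
// is costed as one two-source shuffle rather than four scalar extracts.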
9291
9292const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
9293 unsigned Idx) const {
9294 Value *Op = E->getOperand(Idx).front();
9295 if (const TreeEntry *TE = getTreeEntry(Op)) {
9296 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9297 return EI.EdgeIdx == Idx && EI.UserTE == E;
9298 }) != TE->UserTreeIndices.end())
9299 return TE;
9300 auto MIt = MultiNodeScalars.find(Op);
9301 if (MIt != MultiNodeScalars.end()) {
9302 for (const TreeEntry *TE : MIt->second) {
9303 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9304 return EI.EdgeIdx == Idx && EI.UserTE == E;
9305 }) != TE->UserTreeIndices.end())
9306 return TE;
9307 }
9308 }
9309 }
9310 const auto *It =
9311 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9312 return TE->isGather() &&
9313 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9314 return EI.EdgeIdx == Idx && EI.UserTE == E;
9315 }) != TE->UserTreeIndices.end();
9316 });
9317 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
9318 return It->get();
9319}
9320
9321TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
9322 if (TE.State == TreeEntry::ScatterVectorize ||
9323 TE.State == TreeEntry::StridedVectorize)
9324 return TTI::CastContextHint::GatherScatter;
9325 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
9326 !TE.isAltShuffle()) {
9327 if (TE.ReorderIndices.empty())
9328 return TTI::CastContextHint::Normal;
9329 SmallVector<int> Mask;
9330 inversePermutation(TE.ReorderIndices, Mask);
9331 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
9332 return TTI::CastContextHint::Reversed;
9333 }
9334 return TTI::CastContextHint::None;
9335}
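// For illustration, the mapping implemented above: ScatterVectorize and
// StridedVectorize entries use CastContextHint::GatherScatter; a vectorized
// load without reordering uses CastContextHint::Normal; a vectorized load
// whose reorder mask is a reverse mask uses CastContextHint::Reversed; all
// other entries fall back to CastContextHint::None.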
9336
9337/// Builds the arguments types vector for the given call instruction with the
9338/// given \p ID for the specified vector factor.
9339static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
9340 const Intrinsic::ID ID,
9341 const unsigned VF,
9342 unsigned MinBW) {
9343 SmallVector<Type *> ArgTys;
9344 for (auto [Idx, Arg] : enumerate(CI->args())) {
9345 if (ID != Intrinsic::not_intrinsic) {
9346 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
9347 ArgTys.push_back(Arg->getType());
9348 continue;
9349 }
9350 if (MinBW > 0) {
9351 ArgTys.push_back(
9352 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9353 continue;
9354 }
9355 }
9356 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9357 }
9358 return ArgTys;
9359}
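// A minimal example of the widening above: for a call to llvm.smax.i32(%a, %b)
// with VF == 4 and MinBW == 0, both argument types become <4 x i32>; with
// MinBW == 16 they become <4 x i16>, matching the demoted expression type.
// Arguments that must stay scalar for the intrinsic keep their original type.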
9360
9361InstructionCost
9362BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9363 SmallPtrSetImpl<Value *> &CheckedExtracts) {
9364 ArrayRef<Value *> VL = E->Scalars;
9365
9366 Type *ScalarTy = VL[0]->getType();
9367 if (!E->isGather()) {
9368 if (auto *SI = dyn_cast<StoreInst>(VL[0]))
9369 ScalarTy = SI->getValueOperand()->getType();
9370 else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
9371 ScalarTy = CI->getOperand(0)->getType();
9372 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
9373 ScalarTy = IE->getOperand(1)->getType();
9374 }
9375 if (!isValidElementType(ScalarTy))
9376 return InstructionCost::getInvalid();
9377 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9378
9379 // If we have computed a smaller type for the expression, update VecTy so
9380 // that the costs will be accurate.
9381 auto It = MinBWs.find(E);
9382 Type *OrigScalarTy = ScalarTy;
9383 if (It != MinBWs.end())
9384 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
9385 auto *VecTy = getWidenedType(ScalarTy, VL.size());
9386 unsigned EntryVF = E->getVectorFactor();
9387 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
9388
9389 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9390 if (E->isGather()) {
9391 if (allConstant(VL))
9392 return 0;
9393 if (isa<InsertElementInst>(VL[0]))
9394 return InstructionCost::getInvalid();
9395 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9396 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
9397 }
9398 InstructionCost CommonCost = 0;
9399 SmallVector<int> Mask;
9400 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
9401 if (!E->ReorderIndices.empty() &&
9402 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9403 SmallVector<int> NewMask;
9404 if (E->getOpcode() == Instruction::Store) {
9405 // For stores the order is actually a mask.
9406 NewMask.resize(E->ReorderIndices.size());
9407 copy(E->ReorderIndices, NewMask.begin());
9408 } else {
9409 inversePermutation(E->ReorderIndices, NewMask);
9410 }
9411 ::addMask(Mask, NewMask);
9412 }
9413 if (NeedToShuffleReuses)
9414 ::addMask(Mask, E->ReuseShuffleIndices);
9415 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
9416 CommonCost =
9417 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
9418 assert((E->State == TreeEntry::Vectorize ||
9419 E->State == TreeEntry::ScatterVectorize ||
9420 E->State == TreeEntry::StridedVectorize) &&
9421 "Unhandled state");
9422 assert(E->getOpcode() &&
9423 ((allSameType(VL) && allSameBlock(VL)) ||
9424 (E->getOpcode() == Instruction::GetElementPtr &&
9425 E->getMainOp()->getType()->isPointerTy())) &&
9426 "Invalid VL");
9427 Instruction *VL0 = E->getMainOp();
9428 unsigned ShuffleOrOp =
9429 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
9430 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
9431 const unsigned Sz = UniqueValues.size();
9432 SmallBitVector UsedScalars(Sz, false);
9433 for (unsigned I = 0; I < Sz; ++I) {
9434 if (getTreeEntry(UniqueValues[I]) == E)
9435 continue;
9436 UsedScalars.set(I);
9437 }
9438 auto GetCastContextHint = [&](Value *V) {
9439 if (const TreeEntry *OpTE = getTreeEntry(V))
9440 return getCastContextHint(*OpTE);
9441 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
9442 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9443 return TTI::CastContextHint::GatherScatter;
9444 return TTI::CastContextHint::None;
9445 };
9446 auto GetCostDiff =
9447 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
9448 function_ref<InstructionCost(InstructionCost)> VectorCost) {
9449 // Calculate the cost of this instruction.
9450 InstructionCost ScalarCost = 0;
9451 if (isa<CastInst, CallInst>(VL0)) {
9452 // For some of the instructions no need to calculate cost for each
9453 // particular instruction, we can use the cost of the single
9454 // instruction x total number of scalar instructions.
9455 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9456 } else {
9457 for (unsigned I = 0; I < Sz; ++I) {
9458 if (UsedScalars.test(I))
9459 continue;
9460 ScalarCost += ScalarEltCost(I);
9461 }
9462 }
9463
9464 InstructionCost VecCost = VectorCost(CommonCost);
9465 // Check if the current node must be resized, if the parent node is not
9466 // resized.
9467 if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
9468 const EdgeInfo &EI = E->UserTreeIndices.front();
9469 if ((EI.UserTE->getOpcode() != Instruction::Select ||
9470 EI.EdgeIdx != 0) &&
9471 It != MinBWs.end()) {
9472 auto UserBWIt = MinBWs.find(EI.UserTE);
9473 Type *UserScalarTy =
9474 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
9475 if (UserBWIt != MinBWs.end())
9476 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
9477 UserBWIt->second.first);
9478 if (ScalarTy != UserScalarTy) {
9479 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9480 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
9481 unsigned VecOpcode;
9482 auto *UserVecTy =
9483 getWidenedType(UserScalarTy, E->getVectorFactor());
9484 if (BWSz > SrcBWSz)
9485 VecOpcode = Instruction::Trunc;
9486 else
9487 VecOpcode =
9488 It->second.second ? Instruction::SExt : Instruction::ZExt;
9489 TTI::CastContextHint CCH = GetCastContextHint(VL0);
9490 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
9491 CostKind);
9492 }
9493 }
9494 }
9495 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9496 ScalarCost, "Calculated costs for Tree"));
9497 return VecCost - ScalarCost;
9498 };
9499 // Calculate cost difference from vectorizing set of GEPs.
9500 // Negative value means vectorizing is profitable.
9501 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
9502 assert((E->State == TreeEntry::Vectorize ||
9503 E->State == TreeEntry::StridedVectorize) &&
9504 "Entry state expected to be Vectorize or StridedVectorize here.");
9505 InstructionCost ScalarCost = 0;
9506 InstructionCost VecCost = 0;
9507 std::tie(ScalarCost, VecCost) = getGEPCosts(
9508 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
9509 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9510 "Calculated GEPs cost for Tree"));
9511
9512 return VecCost - ScalarCost;
9513 };
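// For example, if four scalar GEPs cost 4 in total and the single vectorized
// address computation costs 1, GetGEPCostDiff returns -3, i.e. vectorizing the
// addressing is profitable. GetCostDiff above follows the same
// VecCost - ScalarCost convention.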
9514
9515 switch (ShuffleOrOp) {
9516 case Instruction::PHI: {
9517 // Count reused scalars.
9518 InstructionCost ScalarCost = 0;
9519 SmallPtrSet<const TreeEntry *, 4> CountedOps;
9520 for (Value *V : UniqueValues) {
9521 auto *PHI = dyn_cast<PHINode>(V);
9522 if (!PHI)
9523 continue;
9524
9525 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
9526 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
9527 Value *Op = PHI->getIncomingValue(I);
9528 Operands[I] = Op;
9529 }
9530 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
9531 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
9532 if (!OpTE->ReuseShuffleIndices.empty())
9533 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9534 OpTE->Scalars.size());
9535 }
9536
9537 return CommonCost - ScalarCost;
9538 }
9539 case Instruction::ExtractValue:
9540 case Instruction::ExtractElement: {
9541 auto GetScalarCost = [&](unsigned Idx) {
9542 auto *I = cast<Instruction>(UniqueValues[Idx]);
9543 VectorType *SrcVecTy;
9544 if (ShuffleOrOp == Instruction::ExtractElement) {
9545 auto *EE = cast<ExtractElementInst>(I);
9546 SrcVecTy = EE->getVectorOperandType();
9547 } else {
9548 auto *EV = cast<ExtractValueInst>(I);
9549 Type *AggregateTy = EV->getAggregateOperand()->getType();
9550 unsigned NumElts;
9551 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9552 NumElts = ATy->getNumElements();
9553 else
9554 NumElts = AggregateTy->getStructNumElements();
9555 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
9556 }
9557 if (I->hasOneUse()) {
9558 Instruction *Ext = I->user_back();
9559 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9560 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9561 // Use getExtractWithExtendCost() to calculate the cost of
9562 // extractelement/ext pair.
9563 InstructionCost Cost = TTI->getExtractWithExtendCost(
9564 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
9565 // Subtract the cost of s|zext which is subtracted separately.
9566 Cost -= TTI->getCastInstrCost(
9567 Ext->getOpcode(), Ext->getType(), I->getType(),
9568 TTI::getCastContextHint(Ext), CostKind, Ext);
9569 return Cost;
9570 }
9571 }
9572 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
9573 CostKind, *getExtractIndex(I));
9574 };
9575 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9576 return GetCostDiff(GetScalarCost, GetVectorCost);
9577 }
9578 case Instruction::InsertElement: {
9579 assert(E->ReuseShuffleIndices.empty() &&
9580 "Unique insertelements only are expected.");
9581 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
9582 unsigned const NumElts = SrcVecTy->getNumElements();
9583 unsigned const NumScalars = VL.size();
9584
9585 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
9586
9587 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9588 unsigned OffsetBeg = *getElementIndex(VL.front());
9589 unsigned OffsetEnd = OffsetBeg;
9590 InsertMask[OffsetBeg] = 0;
9591 for (auto [I, V] : enumerate(VL.drop_front())) {
9592 unsigned Idx = *getElementIndex(V);
9593 if (OffsetBeg > Idx)
9594 OffsetBeg = Idx;
9595 else if (OffsetEnd < Idx)
9596 OffsetEnd = Idx;
9597 InsertMask[Idx] = I + 1;
9598 }
9599 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
9600 if (NumOfParts > 0)
9601 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9602 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9603 VecScalarsSz;
9604 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9605 unsigned InsertVecSz = std::min<unsigned>(
9606 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
9607 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9608 bool IsWholeSubvector =
9609 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9610 // Check if we can safely insert a subvector. If it is not possible, just
9611 // generate a whole-sized vector and shuffle the source vector and the new
9612 // subvector.
9613 if (OffsetBeg + InsertVecSz > VecSz) {
9614 // Align OffsetBeg to generate correct mask.
9615 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
9616 InsertVecSz = VecSz;
9617 }
9618
9619 APInt DemandedElts = APInt::getZero(NumElts);
9620 // TODO: Add support for Instruction::InsertValue.
9621 SmallVector<int> Mask;
9622 if (!E->ReorderIndices.empty()) {
9623 inversePermutation(E->ReorderIndices, Mask);
9624 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
9625 } else {
9626 Mask.assign(VecSz, PoisonMaskElem);
9627 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9628 }
9629 bool IsIdentity = true;
9630 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9631 Mask.swap(PrevMask);
9632 for (unsigned I = 0; I < NumScalars; ++I) {
9633 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
9634 DemandedElts.setBit(InsertIdx);
9635 IsIdentity &= InsertIdx - OffsetBeg == I;
9636 Mask[InsertIdx - OffsetBeg] = I;
9637 }
9638 assert(Offset < NumElts && "Failed to find vector index offset");
9639
9640 InstructionCost Cost = 0;
9641 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
9642 /*Insert*/ true, /*Extract*/ false,
9643 CostKind);
9644
9645 // First cost - resize to actual vector size if not identity shuffle or
9646 // need to shift the vector.
9647 // Do not calculate the cost if the actual size is the register size and
9648 // we can merge this shuffle with the following SK_Select.
9649 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
9650 if (!IsIdentity)
9651 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
9652 InsertVecTy, Mask);
9653 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9654 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9655 }));
9656 // Second cost - permutation with subvector, if some elements are from the
9657 // initial vector or inserting a subvector.
9658 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9659 // subvector of ActualVecTy.
9660 SmallBitVector InMask =
9661 isUndefVector(FirstInsert->getOperand(0),
9662 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9663 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9664 if (InsertVecSz != VecSz) {
9665 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
9666 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy,
9667 std::nullopt, CostKind, OffsetBeg - Offset,
9668 InsertVecTy);
9669 } else {
9670 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9671 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
9672 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9673 I <= End; ++I)
9674 if (Mask[I] != PoisonMaskElem)
9675 Mask[I] = I + VecSz;
9676 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9677 Mask[I] =
9678 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
9679 Cost +=
9680 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
9681 }
9682 }
9683 return Cost;
9684 }
9685 case Instruction::ZExt:
9686 case Instruction::SExt:
9687 case Instruction::FPToUI:
9688 case Instruction::FPToSI:
9689 case Instruction::FPExt:
9690 case Instruction::PtrToInt:
9691 case Instruction::IntToPtr:
9692 case Instruction::SIToFP:
9693 case Instruction::UIToFP:
9694 case Instruction::Trunc:
9695 case Instruction::FPTrunc:
9696 case Instruction::BitCast: {
9697 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9698 Type *SrcScalarTy = VL0->getOperand(0)->getType();
9699 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
9700 unsigned Opcode = ShuffleOrOp;
9701 unsigned VecOpcode = Opcode;
9702 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9703 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9704 // Check if the values are candidates to demote.
9705 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
9706 if (SrcIt != MinBWs.end()) {
9707 SrcBWSz = SrcIt->second.first;
9708 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
9709 SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
9710 }
9711 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9712 if (BWSz == SrcBWSz) {
9713 VecOpcode = Instruction::BitCast;
9714 } else if (BWSz < SrcBWSz) {
9715 VecOpcode = Instruction::Trunc;
9716 } else if (It != MinBWs.end()) {
9717 assert(BWSz > SrcBWSz && "Invalid cast!");
9718 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9719 } else if (SrcIt != MinBWs.end()) {
9720 assert(BWSz > SrcBWSz && "Invalid cast!");
9721 VecOpcode =
9722 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9723 }
9724 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9725 !SrcIt->second.second) {
9726 VecOpcode = Instruction::UIToFP;
9727 }
9728 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9729 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9730 return TTI->getCastInstrCost(Opcode, VL0->getType(),
9731 VL0->getOperand(0)->getType(),
9732 TTI::getCastContextHint(VI), CostKind, VI);
9733 };
9734 auto GetVectorCost = [=](InstructionCost CommonCost) {
9735 // Do not count cost here if minimum bitwidth is in effect and it is just
9736 // a bitcast (here it is just a noop).
9737 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9738 return CommonCost;
9739 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9740 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
9741 return CommonCost +
9742 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
9743 VecOpcode == Opcode ? VI : nullptr);
9744 };
9745 return GetCostDiff(GetScalarCost, GetVectorCost);
9746 }
9747 case Instruction::FCmp:
9748 case Instruction::ICmp:
9749 case Instruction::Select: {
9750 CmpInst::Predicate VecPred, SwappedVecPred;
9751 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
9752 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
9753 match(VL0, MatchCmp))
9754 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
9755 else
9756 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9757 ? CmpInst::BAD_FCMP_PREDICATE
9758 : CmpInst::BAD_ICMP_PREDICATE;
9759 auto GetScalarCost = [&](unsigned Idx) {
9760 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9761 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9762 ? CmpInst::BAD_FCMP_PREDICATE
9763 : CmpInst::BAD_ICMP_PREDICATE;
9764 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
9765 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
9766 !match(VI, MatchCmp)) ||
9767 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9768 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9769 ? CmpInst::BAD_FCMP_PREDICATE
9770 : CmpInst::BAD_ICMP_PREDICATE;
9771
9772 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
9773 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
9774 CostKind, VI);
9775 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI);
9776 if (MinMaxID != Intrinsic::not_intrinsic) {
9777 Type *CanonicalType = OrigScalarTy;
9778 if (CanonicalType->isPtrOrPtrVectorTy())
9779 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9780 CanonicalType->getContext(),
9781 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9782
9783 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9784 {CanonicalType, CanonicalType});
9785 InstructionCost IntrinsicCost =
9786 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9787 // If the selects are the only uses of the compares, they will be
9788 // dead and we can adjust the cost by removing their cost.
9789 if (SelectOnly) {
9790 auto *CI = cast<CmpInst>(VI->getOperand(0));
9791 IntrinsicCost -= TTI->getCmpSelInstrCost(
9792 CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(),
9793 CI->getPredicate(), CostKind, CI);
9794 }
9795 ScalarCost = std::min(ScalarCost, IntrinsicCost);
9796 }
9797
9798 return ScalarCost;
9799 };
9800 auto GetVectorCost = [&](InstructionCost CommonCost) {
9801 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
9802
9803 InstructionCost VecCost = TTI->getCmpSelInstrCost(
9804 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9805 // Check if it is possible and profitable to use min/max for selects
9806 // in VL.
9807 //
9808 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL);
9809 if (MinMaxID != Intrinsic::not_intrinsic) {
9810 Type *CanonicalType = VecTy;
9811 if (CanonicalType->isPtrOrPtrVectorTy())
9812 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9813 CanonicalType->getContext(),
9814 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9815 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9816 {CanonicalType, CanonicalType});
9817 InstructionCost IntrinsicCost =
9818 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9819 // If the selects are the only uses of the compares, they will be
9820 // dead and we can adjust the cost by removing their cost.
9821 if (SelectOnly) {
9822 auto *CI =
9823 cast<CmpInst>(cast<Instruction>(VL.front())->getOperand(0));
9824 IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy,
9825 MaskTy, VecPred, CostKind);
9826 }
9827 VecCost = std::min(VecCost, IntrinsicCost);
9828 }
9829 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
9830 auto *CondType =
9831 getWidenedType(SI->getCondition()->getType(), VL.size());
9832 unsigned CondNumElements = CondType->getNumElements();
9833 unsigned VecTyNumElements = getNumElements(VecTy);
9834 assert(VecTyNumElements >= CondNumElements &&
9835 VecTyNumElements % CondNumElements == 0 &&
9836 "Cannot vectorize Instruction::Select");
9837 if (CondNumElements != VecTyNumElements) {
9838 // When the return type is i1 but the source is fixed vector type, we
9839 // need to duplicate the condition value.
9840 VecCost += TTI->getShuffleCost(
9841 TTI::SK_PermuteSingleSrc, CondType,
9842 createReplicatedMask(VecTyNumElements / CondNumElements,
9843 CondNumElements));
9844 }
9845 }
9846 return VecCost + CommonCost;
9847 };
9848 return GetCostDiff(GetScalarCost, GetVectorCost);
9849 }
9850 case Instruction::FNeg:
9851 case Instruction::Add:
9852 case Instruction::FAdd:
9853 case Instruction::Sub:
9854 case Instruction::FSub:
9855 case Instruction::Mul:
9856 case Instruction::FMul:
9857 case Instruction::UDiv:
9858 case Instruction::SDiv:
9859 case Instruction::FDiv:
9860 case Instruction::URem:
9861 case Instruction::SRem:
9862 case Instruction::FRem:
9863 case Instruction::Shl:
9864 case Instruction::LShr:
9865 case Instruction::AShr:
9866 case Instruction::And:
9867 case Instruction::Or:
9868 case Instruction::Xor: {
9869 auto GetScalarCost = [&](unsigned Idx) {
9870 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9871 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9872 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
9873 TTI::OperandValueInfo Op2Info =
9874 TTI::getOperandInfo(VI->getOperand(OpIdx));
9875 SmallVector<const Value *> Operands(VI->operand_values());
9876 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
9877 Op1Info, Op2Info, Operands, VI);
9878 };
9879 auto GetVectorCost = [=](InstructionCost CommonCost) {
9880 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
9881 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
9882 ArrayRef<Value *> Ops = E->getOperand(I);
9883 if (all_of(Ops, [&](Value *Op) {
9884 auto *CI = dyn_cast<ConstantInt>(Op);
9885 return CI && CI->getValue().countr_one() >= It->second.first;
9886 }))
9887 return CommonCost;
9888 }
9889 }
9890 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9891 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
9892 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
9893 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
9894 Op2Info, std::nullopt, nullptr, TLI) +
9895 CommonCost;
9896 };
9897 return GetCostDiff(GetScalarCost, GetVectorCost);
9898 }
9899 case Instruction::GetElementPtr: {
9900 return CommonCost + GetGEPCostDiff(VL, VL0);
9901 }
9902 case Instruction::Load: {
9903 auto GetScalarCost = [&](unsigned Idx) {
9904 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
9905 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
9906 VI->getAlign(), VI->getPointerAddressSpace(),
9907 CostKind, TTI::OperandValueInfo(), VI);
9908 };
9909 auto *LI0 = cast<LoadInst>(VL0);
9910 auto GetVectorCost = [&](InstructionCost CommonCost) {
9911 InstructionCost VecLdCost;
9912 if (E->State == TreeEntry::Vectorize) {
9913 VecLdCost = TTI->getMemoryOpCost(
9914 Instruction::Load, VecTy, LI0->getAlign(),
9915 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
9916 } else if (E->State == TreeEntry::StridedVectorize) {
9917 Align CommonAlignment =
9918 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9919 VecLdCost = TTI->getStridedMemoryOpCost(
9920 Instruction::Load, VecTy, LI0->getPointerOperand(),
9921 /*VariableMask=*/false, CommonAlignment, CostKind);
9922 } else {
9923 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9924 Align CommonAlignment =
9925 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9926 VecLdCost = TTI->getGatherScatterOpCost(
9927 Instruction::Load, VecTy, LI0->getPointerOperand(),
9928 /*VariableMask=*/false, CommonAlignment, CostKind);
9929 }
9930 return VecLdCost + CommonCost;
9931 };
9932
9933 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9934 // If this node generates masked gather load then it is not a terminal node.
9935 // Hence address operand cost is estimated separately.
9936 if (E->State == TreeEntry::ScatterVectorize)
9937 return Cost;
9938
9939 // Estimate cost of GEPs since this tree node is a terminator.
9940 SmallVector<Value *> PointerOps(VL.size());
9941 for (auto [I, V] : enumerate(VL))
9942 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
9943 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9944 }
9945 case Instruction::Store: {
9946 bool IsReorder = !E->ReorderIndices.empty();
9947 auto GetScalarCost = [=](unsigned Idx) {
9948 auto *VI = cast<StoreInst>(VL[Idx]);
9949 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
9950 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
9951 VI->getAlign(), VI->getPointerAddressSpace(),
9952 CostKind, OpInfo, VI);
9953 };
9954 auto *BaseSI =
9955 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9956 auto GetVectorCost = [=](InstructionCost CommonCost) {
9957 // We know that we can merge the stores. Calculate the cost.
9958 InstructionCost VecStCost;
9959 if (E->State == TreeEntry::StridedVectorize) {
9960 Align CommonAlignment =
9961 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
9962 VecStCost = TTI->getStridedMemoryOpCost(
9963 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9964 /*VariableMask=*/false, CommonAlignment, CostKind);
9965 } else {
9966 assert(E->State == TreeEntry::Vectorize &&
9967 "Expected either strided or consecutive stores.");
9968 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
9969 VecStCost = TTI->getMemoryOpCost(
9970 Instruction::Store, VecTy, BaseSI->getAlign(),
9971 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
9972 }
9973 return VecStCost + CommonCost;
9974 };
9975 SmallVector<Value *> PointerOps(VL.size());
9976 for (auto [I, V] : enumerate(VL)) {
9977 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9978 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
9979 }
9980
9981 return GetCostDiff(GetScalarCost, GetVectorCost) +
9982 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9983 }
9984 case Instruction::Call: {
9985 auto GetScalarCost = [&](unsigned Idx) {
9986 auto *CI = cast<CallInst>(UniqueValues[Idx]);
9987 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9988 if (ID != Intrinsic::not_intrinsic) {
9989 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9990 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9991 }
9992 return TTI->getCallInstrCost(CI->getCalledFunction(),
9993 CI->getFunctionType()->getReturnType(),
9994 CI->getFunctionType()->params(), CostKind);
9995 };
9996 auto GetVectorCost = [=](InstructionCost CommonCost) {
9997 auto *CI = cast<CallInst>(VL0);
9998 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9999 SmallVector<Type *> ArgTys =
10000 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
10001 It != MinBWs.end() ? It->second.first : 0);
10002 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10003 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
10004 };
10005 return GetCostDiff(GetScalarCost, GetVectorCost);
10006 }
10007 case Instruction::ShuffleVector: {
10008 assert(E->isAltShuffle() &&
10009 ((Instruction::isBinaryOp(E->getOpcode()) &&
10010 Instruction::isBinaryOp(E->getAltOpcode())) ||
10011 (Instruction::isCast(E->getOpcode()) &&
10012 Instruction::isCast(E->getAltOpcode())) ||
10013 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
10014 "Invalid Shuffle Vector Operand");
10015 // Try to find the previous shuffle node with the same operands and same
10016 // main/alternate ops.
10017 auto TryFindNodeWithEqualOperands = [=]() {
10018 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10019 if (TE.get() == E)
10020 break;
10021 if (TE->isAltShuffle() &&
10022 ((TE->getOpcode() == E->getOpcode() &&
10023 TE->getAltOpcode() == E->getAltOpcode()) ||
10024 (TE->getOpcode() == E->getAltOpcode() &&
10025 TE->getAltOpcode() == E->getOpcode())) &&
10026 TE->hasEqualOperands(*E))
10027 return true;
10028 }
10029 return false;
10030 };
10031 auto GetScalarCost = [&](unsigned Idx) {
10032 auto *VI = cast<Instruction>(UniqueValues[Idx]);
10033 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
10034 (void)E;
10035 return TTI->getInstructionCost(VI, CostKind);
10036 };
10037 // Need to clear CommonCost since the final shuffle cost is included into
10038 // vector cost.
10039 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
10040 // VecCost is equal to sum of the cost of creating 2 vectors
10041 // and the cost of creating shuffle.
10042 InstructionCost VecCost = 0;
10043 if (TryFindNodeWithEqualOperands()) {
10044 LLVM_DEBUG({
10045 dbgs() << "SLP: diamond match for alternate node found.\n";
10046 E->dump();
10047 });
10048 // No need to add new vector costs here since we're going to reuse
10049 // same main/alternate vector ops, just do different shuffling.
10050 } else if (Instruction::isBinaryOp(E->getOpcode())) {
10051 VecCost =
10052 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
10053 VecCost +=
10054 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
10055 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
10056 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
10057 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
10058 CI0->getPredicate(), CostKind, VL0);
10059 VecCost += TTIRef.getCmpSelInstrCost(
10060 E->getOpcode(), VecTy, MaskTy,
10061 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
10062 E->getAltOp());
10063 } else {
10064 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
10065 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
10066 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
10067 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
10068 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
10069 unsigned SrcBWSz =
10070 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
10071 if (SrcIt != MinBWs.end()) {
10072 SrcBWSz = SrcIt->second.first;
10073 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
10074 SrcTy = getWidenedType(SrcSclTy, VL.size());
10075 }
10076 if (BWSz <= SrcBWSz) {
10077 if (BWSz < SrcBWSz)
10078 VecCost =
10079 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
10080 TTI::CastContextHint::None, CostKind);
10081 LLVM_DEBUG({
10082 dbgs()
10083 << "SLP: alternate extension, which should be truncated.\n";
10084 E->dump();
10085 });
10086 return VecCost;
10087 }
10088 }
10089 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
10090 TTI::CastContextHint::None, CostKind);
10091 VecCost +=
10092 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
10093 TTI::CastContextHint::None, CostKind);
10094 }
10095 SmallVector<int> Mask;
10096 E->buildAltOpShuffleMask(
10097 [E](Instruction *I) {
10098 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
10099 return I->getOpcode() == E->getAltOpcode();
10100 },
10101 Mask);
10102 VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
10103 FinalVecTy, Mask);
10104 // Patterns like [fadd,fsub] can be combined into a single instruction
10105 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
10106 // need to take into account their order when looking for the most used
10107 // order.
10108 unsigned Opcode0 = E->getOpcode();
10109 unsigned Opcode1 = E->getAltOpcode();
10110 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
10111 // If this pattern is supported by the target then we consider the
10112 // order.
10113 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
10114 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
10115 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
10116 return AltVecCost < VecCost ? AltVecCost : VecCost;
10117 }
10118 // TODO: Check the reverse order too.
10119 return VecCost;
10120 };
10121 return GetCostDiff(GetScalarCost, GetVectorCost);
10122 }
10123 case Instruction::Freeze:
10124 return CommonCost;
10125 default:
10126 llvm_unreachable("Unknown instruction");
10127 }
10128}
10129
10130bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
10131 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
10132 << VectorizableTree.size() << " is fully vectorizable .\n");
10133
10134 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
10135 SmallVector<int> Mask;
10136 return TE->isGather() &&
10137 !any_of(TE->Scalars,
10138 [this](Value *V) { return EphValues.contains(V); }) &&
10139 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
10140 TE->Scalars.size() < Limit ||
10141 ((TE->getOpcode() == Instruction::ExtractElement ||
10142 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
10143 isFixedVectorShuffle(TE->Scalars, Mask)) ||
10144 (TE->isGather() && TE->getOpcode() == Instruction::Load &&
10145 !TE->isAltShuffle()));
10146 };
10147
10148 // We only handle trees of heights 1 and 2.
10149 if (VectorizableTree.size() == 1 &&
10150 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
10151 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
10152 (ForReduction &&
10153 AreVectorizableGathers(VectorizableTree[0].get(),
10154 VectorizableTree[0]->Scalars.size()) &&
10155 VectorizableTree[0]->getVectorFactor() > 2)))
10156 return true;
10157
10158 if (VectorizableTree.size() != 2)
10159 return false;
10160
10161 // Handle splat and all-constants stores. Also try to vectorize tiny trees
10162 // with the second gather nodes if they have fewer scalar operands than
10163 // the initial tree element (it may be profitable to shuffle the second
10164 // gather) or they are extractelements, which form a shuffle.
10165 SmallVector<int> Mask;
10166 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
10167 AreVectorizableGathers(VectorizableTree[1].get(),
10168 VectorizableTree[0]->Scalars.size()))
10169 return true;
10170
10171 // Gathering cost would be too much for tiny trees.
10172 if (VectorizableTree[0]->isGather() ||
10173 (VectorizableTree[1]->isGather() &&
10174 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
10175 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
10176 return false;
10177
10178 return true;
10179}
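// Illustrative example: a two-node tree whose root is a vectorized store
// bundle and whose second node is a splat or all-constant gather is accepted
// here, while a tree whose root is itself a gather node is rejected because
// the gathering cost would dominate such a tiny tree.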
10180
10181static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
10182 TargetTransformInfo *TTI,
10183 bool MustMatchOrInst) {
10184 // Look past the root to find a source value. Arbitrarily follow the
10185 // path through operand 0 of any 'or'. Also, peek through optional
10186 // shift-left-by-multiple-of-8-bits.
10187 Value *ZextLoad = Root;
10188 const APInt *ShAmtC;
10189 bool FoundOr = false;
10190 while (!isa<ConstantExpr>(ZextLoad) &&
10191 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
10192 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
10193 ShAmtC->urem(8) == 0))) {
10194 auto *BinOp = cast<BinaryOperator>(ZextLoad);
10195 ZextLoad = BinOp->getOperand(0);
10196 if (BinOp->getOpcode() == Instruction::Or)
10197 FoundOr = true;
10198 }
10199 // Check if the input is an extended load of the required or/shift expression.
10200 Value *Load;
10201 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
10202 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
10203 return false;
10204
10205 // Require that the total load bit width is a legal integer type.
10206 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
10207 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
10208 Type *SrcTy = Load->getType();
10209 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
10210 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
10211 return false;
10212
10213 // Everything matched - assume that we can fold the whole sequence using
10214 // load combining.
10215 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
10216 << *(cast<Instruction>(Root)) << "\n");
10217
10218 return true;
10219}
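// A minimal sketch of the shape accepted above, for one reduced value:
//   %z = zext i8 %loaded to i32
//   %s = shl i32 %z, 8
//   %r = or i32 %s, %rest
// Starting from %r, the walk follows operand 0 through the 'or' and the
// byte-aligned 'shl' until it reaches the zext of a load; with NumElts == 4
// i8 elements the combined load is i32, which is legal on most targets.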
10220
10221bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
10222 if (RdxKind != RecurKind::Or)
10223 return false;
10224
10225 unsigned NumElts = VectorizableTree[0]->Scalars.size();
10226 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
10227 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
10228 /* MatchOr */ false);
10229}
10230
10231bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
10232 // Peek through a final sequence of stores and check if all operations are
10233 // likely to be load-combined.
10234 unsigned NumElts = Stores.size();
10235 for (Value *Scalar : Stores) {
10236 Value *X;
10237 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
10238 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
10239 return false;
10240 }
10241 return true;
10242}
10243
10244bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
10245 // No need to vectorize inserts of gathered values.
10246 if (VectorizableTree.size() == 2 &&
10247 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
10248 VectorizableTree[1]->isGather() &&
10249 (VectorizableTree[1]->getVectorFactor() <= 2 ||
10250 !(isSplat(VectorizableTree[1]->Scalars) ||
10251 allConstant(VectorizableTree[1]->Scalars))))
10252 return true;
10253
10254 // If the graph includes only PHI nodes and gathers, it is definitely not
10255 // profitable for the vectorization, we can skip it, if the cost threshold is
10256 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
10257 // gathers/buildvectors.
10258 constexpr int Limit = 4;
10259 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
10260 !VectorizableTree.empty() &&
10261 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10262 return (TE->isGather() &&
10263 TE->getOpcode() != Instruction::ExtractElement &&
10264 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
10265 TE->getOpcode() == Instruction::PHI;
10266 }))
10267 return true;
10268
10269 // We can vectorize the tree if its size is greater than or equal to the
10270 // minimum size specified by the MinTreeSize command line option.
10271 if (VectorizableTree.size() >= MinTreeSize)
10272 return false;
10273
10274 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
10275 // can vectorize it if we can prove it fully vectorizable.
10276 if (isFullyVectorizableTinyTree(ForReduction))
10277 return false;
10278
10279 // Check if any of the gather node forms an insertelement buildvector
10280 // somewhere.
10281 bool IsAllowedSingleBVNode =
10282 VectorizableTree.size() > 1 ||
10283 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
10284 !VectorizableTree.front()->isAltShuffle() &&
10285 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
10286 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
10287 allSameBlock(VectorizableTree.front()->Scalars));
10288 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10289 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
10290 return isa<ExtractElementInst, UndefValue>(V) ||
10291 (IsAllowedSingleBVNode &&
10292 !V->hasNUsesOrMore(UsesLimit) &&
10293 any_of(V->users(), IsaPred<InsertElementInst>));
10294 });
10295 }))
10296 return false;
10297
10298 assert(VectorizableTree.empty()
10299 ? ExternalUses.empty()
10300 : true && "We shouldn't have any external users");
10301
10302 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
10303 // vectorizable.
10304 return true;
10305}
10306
10307InstructionCost BoUpSLP::getSpillCost() const {
10308 // Walk from the bottom of the tree to the top, tracking which values are
10309 // live. When we see a call instruction that is not part of our tree,
10310 // query TTI to see if there is a cost to keeping values live over it
10311 // (for example, if spills and fills are required).
10312 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
10313 InstructionCost Cost = 0;
10314
10315 SmallPtrSet<Instruction *, 4> LiveValues;
10316 Instruction *PrevInst = nullptr;
10317
10318 // The entries in VectorizableTree are not necessarily ordered by their
10319 // position in basic blocks. Collect them and order them by dominance so later
10320 // instructions are guaranteed to be visited first. For instructions in
10321 // different basic blocks, we only scan to the beginning of the block, so
10322 // their order does not matter, as long as all instructions in a basic block
10323 // are grouped together. Using dominance ensures a deterministic order.
10324 SmallVector<Instruction *, 16> OrderedScalars;
10325 for (const auto &TEPtr : VectorizableTree) {
10326 if (TEPtr->State != TreeEntry::Vectorize)
10327 continue;
10328 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
10329 if (!Inst)
10330 continue;
10331 OrderedScalars.push_back(Inst);
10332 }
10333 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
10334 auto *NodeA = DT->getNode(A->getParent());
10335 auto *NodeB = DT->getNode(B->getParent());
10336 assert(NodeA && "Should only process reachable instructions");
10337 assert(NodeB && "Should only process reachable instructions");
10338 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10339 "Different nodes should have different DFS numbers");
10340 if (NodeA != NodeB)
10341 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
10342 return B->comesBefore(A);
10343 });
10344
10345 for (Instruction *Inst : OrderedScalars) {
10346 if (!PrevInst) {
10347 PrevInst = Inst;
10348 continue;
10349 }
10350
10351 // Update LiveValues.
10352 LiveValues.erase(PrevInst);
10353 for (auto &J : PrevInst->operands()) {
10354 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
10355 LiveValues.insert(cast<Instruction>(&*J));
10356 }
10357
10358 LLVM_DEBUG({
10359 dbgs() << "SLP: #LV: " << LiveValues.size();
10360 for (auto *X : LiveValues)
10361 dbgs() << " " << X->getName();
10362 dbgs() << ", Looking at ";
10363 Inst->dump();
10364 });
10365
10366 // Now find the sequence of instructions between PrevInst and Inst.
10367 unsigned NumCalls = 0;
10368 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
10369 PrevInstIt =
10370 PrevInst->getIterator().getReverse();
10371 while (InstIt != PrevInstIt) {
10372 if (PrevInstIt == PrevInst->getParent()->rend()) {
10373 PrevInstIt = Inst->getParent()->rbegin();
10374 continue;
10375 }
10376
10377 auto NoCallIntrinsic = [this](Instruction *I) {
10378 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
10379 if (II->isAssumeLikeIntrinsic())
10380 return true;
10381 FastMathFlags FMF;
10382 SmallVector<Type *, 4> Tys;
10383 for (auto &ArgOp : II->args())
10384 Tys.push_back(ArgOp->getType());
10385 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
10386 FMF = FPMO->getFastMathFlags();
10387 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
10388 FMF);
10389 InstructionCost IntrCost =
10390 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
10391 InstructionCost CallCost = TTI->getCallInstrCost(
10392 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
10393 if (IntrCost < CallCost)
10394 return true;
10395 }
10396 return false;
10397 };
10398
10399 // Debug information does not impact spill cost.
10400 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10401 &*PrevInstIt != PrevInst)
10402 NumCalls++;
10403
10404 ++PrevInstIt;
10405 }
10406
10407 if (NumCalls) {
10408 SmallVector<Type *, 4> V;
10409 for (auto *II : LiveValues) {
10410 auto *ScalarTy = II->getType();
10411 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
10412 ScalarTy = VectorTy->getElementType();
10413 V.push_back(getWidenedType(ScalarTy, BundleWidth));
10414 }
10415 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
10416 }
10417
10418 PrevInst = Inst;
10419 }
10420
10421 return Cost;
10422}
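// Illustrative example: if a bundle of width 4 keeps two i32 values live
// across a single call that is neither assume-like nor a cheap vectorizable
// intrinsic, the loop above adds
// 1 * getCostOfKeepingLiveOverCall({<4 x i32>, <4 x i32>}) to the spill cost.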
10423
10424/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
10425/// the buildvector sequence.
10426static bool isFirstInsertElement(const InsertElementInst *IE1,
10427 const InsertElementInst *IE2) {
10428 if (IE1 == IE2)
10429 return false;
10430 const auto *I1 = IE1;
10431 const auto *I2 = IE2;
10432 const InsertElementInst *PrevI1;
10433 const InsertElementInst *PrevI2;
10434 unsigned Idx1 = *getElementIndex(IE1);
10435 unsigned Idx2 = *getElementIndex(IE2);
10436 do {
10437 if (I2 == IE1)
10438 return true;
10439 if (I1 == IE2)
10440 return false;
10441 PrevI1 = I1;
10442 PrevI2 = I2;
10443 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10444 getElementIndex(I1).value_or(Idx2) != Idx2)
10445 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
10446 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
10447 getElementIndex(I2).value_or(Idx1) != Idx1)
10448 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
10449 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10450 llvm_unreachable("Two different buildvectors not expected.");
10451}
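// Example: for the buildvector chain
//   %v0 = insertelement <2 x i32> poison, i32 %a, i32 0
//   %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
// the helper above returns true for (%v0, %v1): walking %v1 through its
// operand-0 chain of inserts reaches %v0, so %v0 comes first in the sequence.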
10452
10453namespace {
10454/// Returns incoming Value *, if the requested type is Value * too, or a default
10455/// value, otherwise.
10456struct ValueSelect {
10457 template <typename U>
10458 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
10459 return V;
10460 }
10461 template <typename U>
10462 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
10463 return U();
10464 }
10465};
10466} // namespace
10467
10468/// Does the analysis of the provided shuffle masks and performs the requested
10469/// actions on the vectors with the given shuffle masks. It tries to do it in
10470/// several steps.
10471/// 1. If the Base vector is not an undef vector, resize the very first mask to
10472/// have a common VF and perform the action for 2 input vectors (including the
10473/// non-undef Base). Other shuffle masks are combined with the result of the
10474/// first stage and processed as a shuffle of 2 elements.
10475/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
10476/// the action only for 1 vector with the given mask, if it is not the identity
10477/// mask.
10478/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
10479/// vectors, combining the masks properly between the steps.
10480template <typename T>
10481static T *performExtractsShuffleAction(
10482 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
10483 function_ref<unsigned(T *)> GetVF,
10484 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
10485 function_ref<T *(ArrayRef<int>, ArrayRef<const T *>)> Action) {
10486 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
10487 SmallVector<int> Mask(ShuffleMask.begin()->second);
10488 auto VMIt = std::next(ShuffleMask.begin());
10489 T *Prev = nullptr;
10490 SmallBitVector UseMask =
10491 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
10492 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
10493 if (!IsBaseUndef.all()) {
10494 // Base is not undef, need to combine it with the next subvectors.
10495 std::pair<T *, bool> Res =
10496 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
10497 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
10498 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
10499 if (Mask[Idx] == PoisonMaskElem)
10500 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
10501 else
10502 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
10503 }
10504 auto *V = ValueSelect::get<T *>(Base);
10505 (void)V;
10506 assert((!V || GetVF(V) == Mask.size()) &&
10507 "Expected base vector of VF number of elements.");
10508 Prev = Action(Mask, {nullptr, Res.first});
10509 } else if (ShuffleMask.size() == 1) {
10510 // Base is undef and only 1 vector is shuffled - perform the action only for
10511 // single vector, if the mask is not the identity mask.
10512 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10513 /*ForSingleMask=*/true);
10514 if (Res.second)
10515 // Identity mask is found.
10516 Prev = Res.first;
10517 else
10518 Prev = Action(Mask, {ShuffleMask.begin()->first});
10519 } else {
10520 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
10521 // shuffles step by step, combining shuffle between the steps.
10522 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10523 unsigned Vec2VF = GetVF(VMIt->first);
10524 if (Vec1VF == Vec2VF) {
10525 // No need to resize the input vectors since they are of the same size, we
10526 // can shuffle them directly.
10527 ArrayRef<int> SecMask = VMIt->second;
10528 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10529 if (SecMask[I] != PoisonMaskElem) {
10530 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10531 Mask[I] = SecMask[I] + Vec1VF;
10532 }
10533 }
10534 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10535 } else {
10536 // Vectors of different sizes - resize and reshuffle.
10537 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10538 /*ForSingleMask=*/false);
10539 std::pair<T *, bool> Res2 =
10540 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10541 ArrayRef<int> SecMask = VMIt->second;
10542 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10543 if (Mask[I] != PoisonMaskElem) {
10544 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10545 if (Res1.second)
10546 Mask[I] = I;
10547 } else if (SecMask[I] != PoisonMaskElem) {
10548 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10549 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
10550 }
10551 }
10552 Prev = Action(Mask, {Res1.first, Res2.first});
10553 }
10554 VMIt = std::next(VMIt);
10555 }
10556 bool IsBaseNotUndef = !IsBaseUndef.all();
10557 (void)IsBaseNotUndef;
10558 // Perform requested actions for the remaining masks/vectors.
10559 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10560 // Shuffle other input vectors, if any.
10561 std::pair<T *, bool> Res =
10562 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10563 ArrayRef<int> SecMask = VMIt->second;
10564 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10565 if (SecMask[I] != PoisonMaskElem) {
10566 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
10567 "Multiple uses of scalars.");
10568 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
10569 } else if (Mask[I] != PoisonMaskElem) {
10570 Mask[I] = I;
10571 }
10572 }
10573 Prev = Action(Mask, {Prev, Res.first});
10574 }
10575 return Prev;
10576}
10577
10578InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
10579 InstructionCost Cost = 0;
10580 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
10581 << VectorizableTree.size() << ".\n");
10582
10583 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10584
10585 SmallPtrSet<Value *, 4> CheckedExtracts;
10586 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10587 TreeEntry &TE = *VectorizableTree[I];
10588 if (TE.isGather()) {
10589 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
10590 E && E->getVectorFactor() == TE.getVectorFactor() &&
10591 E->isSame(TE.Scalars)) {
10592 // Some gather nodes might be exactly the same as some vectorizable
10593 // nodes after reordering; this case needs to be handled.
10594 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
10595 << shortBundleName(TE.Scalars) << ".\n"
10596 << "SLP: Current total cost = " << Cost << "\n");
10597 continue;
10598 }
10599 }
10600
10601 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
10602 Cost += C;
10603 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
10604 << shortBundleName(TE.Scalars) << ".\n"
10605 << "SLP: Current total cost = " << Cost << "\n");
10606 }
10607
10608 SmallPtrSet<Value *, 16> ExtractCostCalculated;
10609 InstructionCost ExtractCost = 0;
10610 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
10611 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
10612 SmallVector<APInt> DemandedElts;
10613 SmallDenseSet<Value *, 4> UsedInserts;
10614 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
10615 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10616 for (ExternalUser &EU : ExternalUses) {
10617 // We only add extract cost once for the same scalar.
10618 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10619 !ExtractCostCalculated.insert(EU.Scalar).second)
10620 continue;
10621
10622 // Uses by ephemeral values are free (because the ephemeral value will be
10623 // removed prior to code generation, and so the extraction will be
10624 // removed as well).
10625 if (EphValues.count(EU.User))
10626 continue;
10627
10628 // No extract cost for vector "scalar"
10629 if (isa<FixedVectorType>(EU.Scalar->getType()))
10630 continue;
10631
10632 // If the found user is an insertelement, do not calculate the extract cost
10633 // but try to detect it as a final shuffled/identity match.
10634 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
10635 VU && VU->getOperand(1) == EU.Scalar) {
10636 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
10637 if (!UsedInserts.insert(VU).second)
10638 continue;
10639 std::optional<unsigned> InsertIdx = getElementIndex(VU);
10640 if (InsertIdx) {
10641 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10642 auto *It = find_if(
10643 FirstUsers,
10644 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10645 return areTwoInsertFromSameBuildVector(
10646 VU, cast<InsertElementInst>(Pair.first),
10647 [this](InsertElementInst *II) -> Value * {
10648 Value *Op0 = II->getOperand(0);
10649 if (getTreeEntry(II) && !getTreeEntry(Op0))
10650 return nullptr;
10651 return Op0;
10652 });
10653 });
10654 int VecId = -1;
10655 if (It == FirstUsers.end()) {
10656 (void)ShuffleMasks.emplace_back();
10657 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10658 if (Mask.empty())
10659 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10660 // Find the insertvector, vectorized in tree, if any.
10661 Value *Base = VU;
10662 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10663 if (IEBase != EU.User &&
10664 (!IEBase->hasOneUse() ||
10665 getElementIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10666 break;
10667 // Build the mask for the vectorized insertelement instructions.
10668 if (const TreeEntry *E = getTreeEntry(IEBase)) {
10669 VU = IEBase;
10670 do {
10671 IEBase = cast<InsertElementInst>(Base);
10672 int Idx = *getElementIndex(IEBase);
10673 assert(Mask[Idx] == PoisonMaskElem &&
10674 "InsertElementInstruction used already.");
10675 Mask[Idx] = Idx;
10676 Base = IEBase->getOperand(0);
10677 } while (E == getTreeEntry(Base));
10678 break;
10679 }
10680 Base = cast<InsertElementInst>(Base)->getOperand(0);
10681 }
10682 FirstUsers.emplace_back(VU, ScalarTE);
10683 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10684 VecId = FirstUsers.size() - 1;
10685 auto It = MinBWs.find(ScalarTE);
10686 if (It != MinBWs.end() &&
10687 VectorCasts
10688 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10689 .second) {
10690 unsigned BWSz = It->second.first;
10691 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10692 unsigned VecOpcode;
10693 if (DstBWSz < BWSz)
10694 VecOpcode = Instruction::Trunc;
10695 else
10696 VecOpcode =
10697 It->second.second ? Instruction::SExt : Instruction::ZExt;
10700 VecOpcode, FTy,
10701 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
10702 FTy->getNumElements()),
10704 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10705 << " for extending externally used vector with "
10706 "non-equal minimum bitwidth.\n");
10707 Cost += C;
10708 }
10709 } else {
10710 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10711 It->first = VU;
10712 VecId = std::distance(FirstUsers.begin(), It);
10713 }
10714 int InIdx = *InsertIdx;
10715 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10716 if (Mask.empty())
10717 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10718 Mask[InIdx] = EU.Lane;
10719 DemandedElts[VecId].setBit(InIdx);
10720 continue;
10721 }
10722 }
10723 }
10724 // Leave the GEPs as-is; they are free in most cases, and it is better to keep
10725 // them as GEPs.
10726 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10727 if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10728 if (!ValueToExtUses) {
10729 ValueToExtUses.emplace();
10730 for_each(enumerate(ExternalUses), [&](const auto &P) {
10731 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10732 });
10733 }
10734 // The original GEP can be used if none of its operands are vectorized or if
10735 // they are already marked as externally used.
10736 bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10737 if (!getTreeEntry(V))
10738 return true;
10739 auto It = ValueToExtUses->find(V);
10740 if (It != ValueToExtUses->end()) {
10741 // Replace all uses to avoid compiler crash.
10742 ExternalUses[It->second].User = nullptr;
10743 return true;
10744 }
10745 return false;
10746 });
10747 if (CanBeUsedAsGEP) {
10748 ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10749 ExternalUsesAsGEPs.insert(EU.Scalar);
10750 continue;
10751 }
10752 }
10753
10754 // If we plan to rewrite the tree in a smaller type, we will need to sign
10755 // extend the extracted value back to the original type. Here, we account
10756 // for the extract and the added cost of the sign extend if needed.
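// For instance (illustrative values): if the tree was narrowed to i16 lanes but
// the externally used scalar has type i32, the cost accounted below is an
// extract from the <BundleWidth x i16> vector plus the sign/zero extension back
// to i32, which TTI::getExtractWithExtendCost models as a single query.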
10757 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
10758 auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10759 if (It != MinBWs.end()) {
10760 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10761 unsigned Extend =
10762 It->second.second ? Instruction::SExt : Instruction::ZExt;
10763 VecTy = getWidenedType(MinTy, BundleWidth);
10764 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10765 VecTy, EU.Lane);
10766 } else {
10767 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10768 CostKind, EU.Lane);
10769 }
10770 }
10771 // Add reduced value cost, if resized.
10772 if (!VectorizedVals.empty()) {
10773 const TreeEntry &Root = *VectorizableTree.front();
10774 auto BWIt = MinBWs.find(&Root);
10775 if (BWIt != MinBWs.end()) {
10776 Type *DstTy = Root.Scalars.front()->getType();
10777 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10778 unsigned SrcSz =
10779 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10780 if (OriginalSz != SrcSz) {
10781 unsigned Opcode = Instruction::Trunc;
10782 if (OriginalSz > SrcSz)
10783 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10784 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10785 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10786 TTI::CastContextHint::None,
10787 TTI::TCK_RecipThroughput);
10788 }
10789 }
10790 }
10791
10792 InstructionCost SpillCost = getSpillCost();
10793 Cost += SpillCost + ExtractCost;
10794 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10795 bool) {
10796 InstructionCost C = 0;
10797 unsigned VF = Mask.size();
10798 unsigned VecVF = TE->getVectorFactor();
10799 if (VF != VecVF &&
10800 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10801 !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
10802 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10803 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10804 OrigMask.begin());
10805 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
10806 getWidenedType(TE->getMainOp()->getType(), VecVF),
10807 OrigMask);
10808 LLVM_DEBUG(
10809 dbgs() << "SLP: Adding cost " << C
10810 << " for final shuffle of insertelement external users.\n";
10811 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10812 Cost += C;
10813 return std::make_pair(TE, true);
10814 }
10815 return std::make_pair(TE, false);
10816 };
10817 // Calculate the cost of the reshuffled vectors, if any.
10818 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10819 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10820 auto Vector = ShuffleMasks[I].takeVector();
10821 unsigned VF = 0;
10822 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10823 ArrayRef<const TreeEntry *> TEs) {
10824 assert((TEs.size() == 1 || TEs.size() == 2) &&
10825 "Expected exactly 1 or 2 tree entries.");
10826 if (TEs.size() == 1) {
10827 if (VF == 0)
10828 VF = TEs.front()->getVectorFactor();
10829 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
10830 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
10831 !all_of(enumerate(Mask), [=](const auto &Data) {
10832 return Data.value() == PoisonMaskElem ||
10833 (Data.index() < VF &&
10834 static_cast<int>(Data.index()) == Data.value());
10835 })) {
10836 InstructionCost C =
10837 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
10838 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10839 << " for final shuffle of insertelement "
10840 "external users.\n";
10841 TEs.front()->dump();
10842 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10843 Cost += C;
10844 }
10845 } else {
10846 if (VF == 0) {
10847 if (TEs.front() &&
10848 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10849 VF = TEs.front()->getVectorFactor();
10850 else
10851 VF = Mask.size();
10852 }
10853 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
10854 InstructionCost C =
10855 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
10856 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10857 << " for final shuffle of vector node and external "
10858 "insertelement users.\n";
10859 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10860 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10861 Cost += C;
10862 }
10863 VF = Mask.size();
10864 return TEs.back();
10865 };
10866 (void)performExtractsShuffleAction<const TreeEntry>(
10867 MutableArrayRef(Vector.data(), Vector.size()), Base,
10868 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10869 EstimateShufflesCost);
10870 InstructionCost InsertCost = TTI->getScalarizationOverhead(
10871 cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10872 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
10873 Cost -= InsertCost;
10874 }
10875
10876 // Add the cost for reduced value resize (if required).
10877 if (ReductionBitWidth != 0) {
10878 assert(UserIgnoreList && "Expected reduction tree.");
10879 const TreeEntry &E = *VectorizableTree.front();
10880 auto It = MinBWs.find(&E);
10881 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10882 unsigned SrcSize = It->second.first;
10883 unsigned DstSize = ReductionBitWidth;
10884 unsigned Opcode = Instruction::Trunc;
10885 if (SrcSize < DstSize)
10886 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10887 auto *SrcVecTy =
10888 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
10889 auto *DstVecTy =
10890 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
10891 TTI::CastContextHint CCH = getCastContextHint(E);
10892 InstructionCost CastCost;
10893 switch (E.getOpcode()) {
10894 case Instruction::SExt:
10895 case Instruction::ZExt:
10896 case Instruction::Trunc: {
10897 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10898 CCH = getCastContextHint(*OpTE);
10899 break;
10900 }
10901 default:
10902 break;
10903 }
10904 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
10905 TTI::TCK_RecipThroughput);
10906 Cost += CastCost;
10907 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10908 << " for final resize for reduction from " << SrcVecTy
10909 << " to " << DstVecTy << "\n";
10910 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10911 }
10912 }
10913
10914#ifndef NDEBUG
10915 SmallString<256> Str;
10916 {
10917 raw_svector_ostream OS(Str);
10918 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10919 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10920 << "SLP: Total Cost = " << Cost << ".\n";
10921 }
10922 LLVM_DEBUG(dbgs() << Str);
10923 if (ViewSLPTree)
10924 ViewGraph(this, "SLP" + F->getName(), false, Str);
10925#endif
10926
10927 return Cost;
10928}
10929
10930/// Tries to find extractelement instructions with constant indices from a fixed
10931/// vector type and gathers such instructions into a bunch, which is highly
10932/// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
10933/// was successful, the matched scalars are replaced by poison values in \p VL
10934/// for future analysis.
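/// As an illustration (hypothetical IR, not taken from a test), a gather of the
/// scalars
/// \code
///   %e0 = extractelement <4 x i32> %v, i32 1
///   %e1 = extractelement <4 x i32> %v, i32 0
/// \endcode
/// can be modelled as a single-source shuffle of %v with mask <1, 0>; both
/// scalars are then replaced by poison in \p VL and the mask is recorded in
/// \p Mask.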
10935std::optional<TTI::ShuffleKind>
10936BoUpSLP::tryToGatherSingleRegisterExtractElements(
10937 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10938 // Scan list of gathered scalars for extractelements that can be represented
10939 // as shuffles.
10940 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10941 SmallVector<int> UndefVectorExtracts;
10942 for (int I = 0, E = VL.size(); I < E; ++I) {
10943 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10944 if (!EI) {
10945 if (isa<UndefValue>(VL[I]))
10946 UndefVectorExtracts.push_back(I);
10947 continue;
10948 }
10949 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10950 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10951 continue;
10952 std::optional<unsigned> Idx = getExtractIndex(EI);
10953 // Undefined index.
10954 if (!Idx) {
10955 UndefVectorExtracts.push_back(I);
10956 continue;
10957 }
10958 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10959 ExtractMask.reset(*Idx);
10960 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
10961 UndefVectorExtracts.push_back(I);
10962 continue;
10963 }
10964 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10965 }
10966 // Sort the vector operands by the maximum number of uses in extractelements.
10967 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
10968 VectorOpToIdx.takeVector();
10969 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
10970 return P1.second.size() > P2.second.size();
10971 });
10972 // Find the best pair of the vectors or a single vector.
10973 const int UndefSz = UndefVectorExtracts.size();
10974 unsigned SingleMax = 0;
10975 unsigned PairMax = 0;
10976 if (!Vectors.empty()) {
10977 SingleMax = Vectors.front().second.size() + UndefSz;
10978 if (Vectors.size() > 1) {
10979 auto *ItNext = std::next(Vectors.begin());
10980 PairMax = SingleMax + ItNext->second.size();
10981 }
10982 }
10983 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10984 return std::nullopt;
10985 // Check if better to perform a shuffle of 2 vectors or just of a single
10986 // vector.
10987 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10988 SmallVector<Value *> GatheredExtracts(
10989 VL.size(), PoisonValue::get(VL.front()->getType()));
10990 if (SingleMax >= PairMax && SingleMax) {
10991 for (int Idx : Vectors.front().second)
10992 std::swap(GatheredExtracts[Idx], VL[Idx]);
10993 } else if (!Vectors.empty()) {
10994 for (unsigned Idx : {0, 1})
10995 for (int Idx : Vectors[Idx].second)
10996 std::swap(GatheredExtracts[Idx], VL[Idx]);
10997 }
10998 // Add extracts from undefs too.
10999 for (int Idx : UndefVectorExtracts)
11000 std::swap(GatheredExtracts[Idx], VL[Idx]);
11001 // Check that gather of extractelements can be represented as just a
11002 // shuffle of a single/two vectors the scalars are extracted from.
11003 std::optional<TTI::ShuffleKind> Res =
11004 isFixedVectorShuffle(GatheredExtracts, Mask);
11005 if (!Res) {
11006 // TODO: try to check other subsets if possible.
11007 // Restore the original VL if attempt was not successful.
11008 copy(SavedVL, VL.begin());
11009 return std::nullopt;
11010 }
11011 // Restore unused scalars from mask, if some of the extractelements were not
11012 // selected for shuffle.
11013 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
11014 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
11015 isa<UndefValue>(GatheredExtracts[I])) {
11016 std::swap(VL[I], GatheredExtracts[I]);
11017 continue;
11018 }
11019 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
11020 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
11021 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
11022 is_contained(UndefVectorExtracts, I))
11023 continue;
11024 }
11025 return Res;
11026}
11027
11028/// Tries to find extractelement instructions with constant indices from a fixed
11029/// vector type and gathers such instructions into a bunch, which is highly
11030/// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
11031/// was successful, the matched scalars are replaced by poison values in \p VL
11032/// for future analysis.
11033SmallVector<std::optional<TTI::ShuffleKind>>
11034BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
11035 SmallVectorImpl<int> &Mask,
11036 unsigned NumParts) const {
11037 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
11038 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
11039 Mask.assign(VL.size(), PoisonMaskElem);
11040 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
11041 for (unsigned Part : seq<unsigned>(NumParts)) {
11042 // Scan list of gathered scalars for extractelements that can be represented
11043 // as shuffles.
11044 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
11045 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
11046 SmallVector<int> SubMask;
11047 std::optional<TTI::ShuffleKind> Res =
11048 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
11049 ShufflesRes[Part] = Res;
11050 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
11051 }
11052 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
11053 return Res.has_value();
11054 }))
11055 ShufflesRes.clear();
11056 return ShufflesRes;
11057}
11058
11059std::optional<TargetTransformInfo::ShuffleKind>
11060BoUpSLP::isGatherShuffledSingleRegisterEntry(
11061 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
11062 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
11063 Entries.clear();
11064 // TODO: currently checking only for Scalars in the tree entry, need to count
11065 // reused elements too for better cost estimation.
11066 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
11067 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
11068 const BasicBlock *TEInsertBlock = nullptr;
11069 // Main node of PHI entries keeps the correct order of operands/incoming
11070 // blocks.
11071 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
11072 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
11073 TEInsertPt = TEInsertBlock->getTerminator();
11074 } else {
11075 TEInsertBlock = TEInsertPt->getParent();
11076 }
11077 if (!DT->isReachableFromEntry(TEInsertBlock))
11078 return std::nullopt;
11079 auto *NodeUI = DT->getNode(TEInsertBlock);
11080 assert(NodeUI && "Should only process reachable instructions");
11081 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
11082 auto CheckOrdering = [&](const Instruction *InsertPt) {
11083 // Argument InsertPt is an instruction where vector code for some other
11084 // tree entry (one that shares one or more scalars with TE) is going to be
11085 // generated. This lambda returns true if insertion point of vector code
11086 // for the TE dominates that point (otherwise dependency is the other way
11087 // around). The other node is not limited to be of a gather kind. Gather
11088 // nodes are not scheduled and their vector code is inserted before their
11089 // first user. If user is PHI, that is supposed to be at the end of a
11090 // predecessor block. Otherwise it is the last instruction among scalars of
11091 // the user node. So, instead of checking dependency between instructions
11092 // themselves, we check dependency between their insertion points for vector
11093 // code (since each scalar instruction ends up as a lane of a vector
11094 // instruction).
11095 const BasicBlock *InsertBlock = InsertPt->getParent();
11096 auto *NodeEUI = DT->getNode(InsertBlock);
11097 if (!NodeEUI)
11098 return false;
11099 assert((NodeUI == NodeEUI) ==
11100 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
11101 "Different nodes should have different DFS numbers");
11102 // Check the order of the gather nodes users.
11103 if (TEInsertPt->getParent() != InsertBlock &&
11104 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
11105 return false;
11106 if (TEInsertPt->getParent() == InsertBlock &&
11107 TEInsertPt->comesBefore(InsertPt))
11108 return false;
11109 return true;
11110 };
11111 // Find all tree entries used by the gathered values. If no common entries
11112 // are found - this is not a shuffle.
11113 // Here we build a set of tree nodes for each gathered value and try to
11114 // find the intersection between these sets. If we have at least one common
11115 // tree node for each gathered value - we have just a permutation of a
11116 // single vector. If we have 2 different sets, we are in a situation where we
11117 // have a permutation of 2 input vectors.
11118 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
11119 DenseMap<Value *, int> UsedValuesEntry;
11120 for (Value *V : VL) {
11121 if (isConstant(V))
11122 continue;
11123 // Build a list of tree entries where V is used.
11124 SmallPtrSet<const TreeEntry *, 4> VToTEs;
11125 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
11126 if (TEPtr == TE)
11127 continue;
11128 assert(any_of(TEPtr->Scalars,
11129 [&](Value *V) { return GatheredScalars.contains(V); }) &&
11130 "Must contain at least single gathered value.");
11131 assert(TEPtr->UserTreeIndices.size() == 1 &&
11132 "Expected only single user of a gather node.");
11133 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
11134
11135 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
11136 const Instruction *InsertPt =
11137 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
11138 : &getLastInstructionInBundle(UseEI.UserTE);
11139 if (TEInsertPt == InsertPt) {
11140 // If 2 gathers are operands of the same entry (regardless of whether the
11141 // user is a PHI or not), compare operand indices and use the earlier one
11142 // as the base.
11143 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
11144 continue;
11145 // If the user instruction is used for some reason in different
11146 // vectorized nodes - make it depend on index.
11147 if (TEUseEI.UserTE != UseEI.UserTE &&
11148 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
11149 continue;
11150 }
11151
11152 // Check if the user node of the TE comes after user node of TEPtr,
11153 // otherwise TEPtr depends on TE.
11154 if ((TEInsertBlock != InsertPt->getParent() ||
11155 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
11156 !CheckOrdering(InsertPt))
11157 continue;
11158 VToTEs.insert(TEPtr);
11159 }
11160 if (const TreeEntry *VTE = getTreeEntry(V)) {
11161 if (ForOrder) {
11162 if (VTE->State != TreeEntry::Vectorize) {
11163 auto It = MultiNodeScalars.find(V);
11164 if (It == MultiNodeScalars.end())
11165 continue;
11166 VTE = *It->getSecond().begin();
11167 // Iterate through all vectorized nodes.
11168 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
11169 return MTE->State == TreeEntry::Vectorize;
11170 });
11171 if (MIt == It->getSecond().end())
11172 continue;
11173 VTE = *MIt;
11174 }
11175 }
11176 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
11177 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
11178 continue;
11179 VToTEs.insert(VTE);
11180 }
11181 if (VToTEs.empty())
11182 continue;
11183 if (UsedTEs.empty()) {
11184 // On the first iteration, just insert the list of nodes into the vector.
11185 UsedTEs.push_back(VToTEs);
11186 UsedValuesEntry.try_emplace(V, 0);
11187 } else {
11188 // Need to check if there are any previously used tree nodes which use V.
11189 // If there are no such nodes, consider that we have one more input
11190 // vector.
11191 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
11192 unsigned Idx = 0;
11193 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
11194 // Do we have a non-empty intersection of previously listed tree entries
11195 // and tree entries using current V?
11196 set_intersect(VToTEs, Set);
11197 if (!VToTEs.empty()) {
11198 // Yes, write the new subset and continue analysis for the next
11199 // scalar.
11200 Set.swap(VToTEs);
11201 break;
11202 }
11203 VToTEs = SavedVToTEs;
11204 ++Idx;
11205 }
11206 // No non-empty intersection found - need to add a second set of possible
11207 // source vectors.
11208 if (Idx == UsedTEs.size()) {
11209 // If the number of input vectors is greater than 2 - not a permutation,
11210 // fall back to the regular gather.
11211 // TODO: support multiple reshuffled nodes.
11212 if (UsedTEs.size() == 2)
11213 continue;
11214 UsedTEs.push_back(SavedVToTEs);
11215 Idx = UsedTEs.size() - 1;
11216 }
11217 UsedValuesEntry.try_emplace(V, Idx);
11218 }
11219 }
11220
11221 if (UsedTEs.empty()) {
11222 Entries.clear();
11223 return std::nullopt;
11224 }
11225
11226 unsigned VF = 0;
11227 if (UsedTEs.size() == 1) {
11228 // Keep the order to avoid non-determinism.
11229 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
11230 UsedTEs.front().end());
11231 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
11232 return TE1->Idx < TE2->Idx;
11233 });
11234 // Try to find the perfect match in another gather node at first.
11235 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
11236 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
11237 });
11238 if (It != FirstEntries.end() &&
11239 ((*It)->getVectorFactor() == VL.size() ||
11240 ((*It)->getVectorFactor() == TE->Scalars.size() &&
11241 TE->ReuseShuffleIndices.size() == VL.size() &&
11242 (*It)->isSame(TE->Scalars)))) {
11243 Entries.push_back(*It);
11244 if ((*It)->getVectorFactor() == VL.size()) {
11245 std::iota(std::next(Mask.begin(), Part * VL.size()),
11246 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
11247 } else {
11248 SmallVector<int> CommonMask = TE->getCommonMask();
11249 copy(CommonMask, Mask.begin());
11250 }
11251 // Clear undef scalars.
11252 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11253 if (isa<PoisonValue>(VL[I]))
11254 Mask[Part * VL.size() + I] = PoisonMaskElem;
11255 return TargetTransformInfo::SK_PermuteSingleSrc;
11256 }
11257 // No perfect match, just shuffle, so choose the first tree node from the
11258 // tree.
11259 Entries.push_back(FirstEntries.front());
11260 } else {
11261 // Try to find nodes with the same vector factor.
11262 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
11263 // Keep the order of tree nodes to avoid non-determinism.
11264 DenseMap<unsigned, const TreeEntry *> VFToTE;
11265 for (const TreeEntry *TE : UsedTEs.front()) {
11266 unsigned VF = TE->getVectorFactor();
11267 auto It = VFToTE.find(VF);
11268 if (It != VFToTE.end()) {
11269 if (It->second->Idx > TE->Idx)
11270 It->getSecond() = TE;
11271 continue;
11272 }
11273 VFToTE.try_emplace(VF, TE);
11274 }
11275 // Same, keep the order to avoid non-determinism.
11276 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
11277 UsedTEs.back().end());
11278 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
11279 return TE1->Idx < TE2->Idx;
11280 });
11281 for (const TreeEntry *TE : SecondEntries) {
11282 auto It = VFToTE.find(TE->getVectorFactor());
11283 if (It != VFToTE.end()) {
11284 VF = It->first;
11285 Entries.push_back(It->second);
11286 Entries.push_back(TE);
11287 break;
11288 }
11289 }
11290 // No 2 source vectors with the same vector factor - just choose 2 with max
11291 // index.
11292 if (Entries.empty()) {
11293 Entries.push_back(*llvm::max_element(
11294 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
11295 return TE1->Idx < TE2->Idx;
11296 }));
11297 Entries.push_back(SecondEntries.front());
11298 VF = std::max(Entries.front()->getVectorFactor(),
11299 Entries.back()->getVectorFactor());
11300 }
11301 }
11302
11303 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
11304 // Checks if the 2 PHIs are compatible in terms of a high probability of
11305 // being vectorized.
11306 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
11307 auto *PHI = cast<PHINode>(V);
11308 auto *PHI1 = cast<PHINode>(V1);
11309 // Check that all incoming values are compatible/from the same parent (if they
11310 // are instructions).
11311 // The incoming values are compatible if they all are constants, or
11312 // instructions with the same/alternate opcodes from the same basic block.
11313 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
11314 Value *In = PHI->getIncomingValue(I);
11315 Value *In1 = PHI1->getIncomingValue(I);
11316 if (isConstant(In) && isConstant(In1))
11317 continue;
11318 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
11319 return false;
11320 if (cast<Instruction>(In)->getParent() !=
11321 cast<Instruction>(In1)->getParent())
11322 return false;
11323 }
11324 return true;
11325 };
11326 // Check if the value can be ignored during analysis for shuffled gathers.
11327 // We suppose it is better to ignore instructions which do not form splats,
11328 // are not vectorized/not extractelements (these instructions will be handled
11329 // by extractelement processing) or may form a vector node in the future.
11330 auto MightBeIgnored = [=](Value *V) {
11331 auto *I = dyn_cast<Instruction>(V);
11332 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
11334 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
11335 };
11336 // Check that the neighbor instruction may form a full vector node with the
11337 // current instruction V. It is possible, if they have same/alternate opcode
11338 // and same parent basic block.
11339 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
11340 Value *V1 = VL[Idx];
11341 bool UsedInSameVTE = false;
11342 auto It = UsedValuesEntry.find(V1);
11343 if (It != UsedValuesEntry.end())
11344 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
11345 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11346 getSameOpcode({V, V1}, *TLI).getOpcode() &&
11347 cast<Instruction>(V)->getParent() ==
11348 cast<Instruction>(V1)->getParent() &&
11349 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
11350 };
11351 // Build a shuffle mask for better cost estimation and vector emission.
11352 SmallBitVector UsedIdxs(Entries.size());
11353 SmallVector<std::pair<unsigned, int>> EntryLanes;
11354 for (int I = 0, E = VL.size(); I < E; ++I) {
11355 Value *V = VL[I];
11356 auto It = UsedValuesEntry.find(V);
11357 if (It == UsedValuesEntry.end())
11358 continue;
11359 // Do not try to shuffle scalars, if they are constants, or instructions
11360 // that can be vectorized as a result of the following vector build
11361 // vectorization.
11362 if (isConstant(V) || (MightBeIgnored(V) &&
11363 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
11364 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
11365 continue;
11366 unsigned Idx = It->second;
11367 EntryLanes.emplace_back(Idx, I);
11368 UsedIdxs.set(Idx);
11369 }
11370 // Iterate through all shuffled scalars and select entries, which can be used
11371 // for final shuffle.
11372 SmallVector<const TreeEntry *> TempEntries;
11373 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
11374 if (!UsedIdxs.test(I))
11375 continue;
11376 // Fix the entry number for the given scalar. If it is the first entry, set
11377 // Pair.first to 0, otherwise to 1 (currently we select at most 2 nodes).
11378 // These indices are used when calculating the final shuffle mask as the vector
11379 // offset.
11380 for (std::pair<unsigned, int> &Pair : EntryLanes)
11381 if (Pair.first == I)
11382 Pair.first = TempEntries.size();
11383 TempEntries.push_back(Entries[I]);
11384 }
11385 Entries.swap(TempEntries);
11386 if (EntryLanes.size() == Entries.size() &&
11387 !VL.equals(ArrayRef(TE->Scalars)
11388 .slice(Part * VL.size(),
11389 std::min<int>(VL.size(), TE->Scalars.size())))) {
11390 // We may have here 1 or 2 entries only. If the number of scalars is equal
11391 // to the number of entries, no need to do the analysis, it is not very
11392 // profitable. Since VL is not the same as TE->Scalars, it means we already
11393 // have some shuffles before. Cut off this unprofitable case.
11394 Entries.clear();
11395 return std::nullopt;
11396 }
11397 // Build the final mask, check for the identity shuffle, if possible.
11398 bool IsIdentity = Entries.size() == 1;
11399 // Pair.first is the offset to the vector, while Pair.second is the index of
11400 // scalar in the list.
11401 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
11402 unsigned Idx = Part * VL.size() + Pair.second;
11403 Mask[Idx] =
11404 Pair.first * VF +
11405 (ForOrder ? std::distance(
11406 Entries[Pair.first]->Scalars.begin(),
11407 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
11408 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
11409 IsIdentity &= Mask[Idx] == Pair.second;
11410 }
11411 switch (Entries.size()) {
11412 case 1:
11413 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11414 return TargetTransformInfo::SK_PermuteSingleSrc;
11415 break;
11416 case 2:
11417 if (EntryLanes.size() > 2 || VL.size() <= 2)
11418 return TargetTransformInfo::SK_PermuteTwoSrc;
11419 break;
11420 default:
11421 break;
11422 }
11423 Entries.clear();
11424 // Clear the corresponding mask elements.
11425 std::fill(std::next(Mask.begin(), Part * VL.size()),
11426 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
11427 return std::nullopt;
11428}
11429
11430SmallVector<std::optional<TTI::ShuffleKind>>
11431BoUpSLP::isGatherShuffledEntry(
11432 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
11433 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
11434 bool ForOrder) {
11435 assert(NumParts > 0 && NumParts < VL.size() &&
11436 "Expected positive number of registers.");
11437 Entries.clear();
11438 // No need to check for the topmost gather node.
11439 if (TE == VectorizableTree.front().get())
11440 return {};
11441 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11442 if (TE->isNonPowOf2Vec())
11443 return {};
11444 Mask.assign(VL.size(), PoisonMaskElem);
11445 assert(TE->UserTreeIndices.size() == 1 &&
11446 "Expected only single user of the gather node.");
11447 assert(VL.size() % NumParts == 0 &&
11448 "Number of scalars must be divisible by NumParts.");
11449 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
11450 SmallVector<std::optional<TTI::ShuffleKind>> Res;
11451 for (unsigned Part : seq<unsigned>(NumParts)) {
11452 ArrayRef<Value *> SubVL =
11453 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
11454 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
11455 std::optional<TTI::ShuffleKind> SubRes =
11456 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
11457 ForOrder);
11458 if (!SubRes)
11459 SubEntries.clear();
11460 Res.push_back(SubRes);
11461 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
11462 SubEntries.front()->getVectorFactor() == VL.size() &&
11463 (SubEntries.front()->isSame(TE->Scalars) ||
11464 SubEntries.front()->isSame(VL))) {
11465 SmallVector<const TreeEntry *> LocalSubEntries;
11466 LocalSubEntries.swap(SubEntries);
11467 Entries.clear();
11468 Res.clear();
11469 std::iota(Mask.begin(), Mask.end(), 0);
11470 // Clear undef scalars.
11471 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11472 if (isa<PoisonValue>(VL[I]))
11473 Mask[I] = PoisonMaskElem;
11474 Entries.emplace_back(1, LocalSubEntries.front());
11475 Res.push_back(TTI::SK_PermuteSingleSrc);
11476 return Res;
11477 }
11478 }
11479 if (all_of(Res,
11480 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
11481 Entries.clear();
11482 return {};
11483 }
11484 return Res;
11485}
11486
11487InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
11488 Type *ScalarTy) const {
11489 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11490 bool DuplicateNonConst = false;
11491 // Find the cost of inserting/extracting values from the vector.
11492 // Check if the same elements are inserted several times and count them as
11493 // shuffle candidates.
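// For example (illustrative): for VL = {a, b, a, c} the second occurrence of 'a'
// gets no insert cost; it is recorded in ShuffleMask as a reuse of the lane that
// already holds 'a' and is covered by the single permute added at the end when
// DuplicateNonConst is set.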
11494 APInt ShuffledElements = APInt::getZero(VL.size());
11495 DenseMap<Value *, unsigned> UniqueElements;
11496 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11497 InstructionCost Cost;
11498 auto EstimateInsertCost = [&](unsigned I, Value *V) {
11499 if (V->getType() != ScalarTy) {
11500 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
11501 TTI::CastContextHint::None, CostKind);
11502 V = nullptr;
11503 }
11504 if (!ForPoisonSrc)
11505 Cost +=
11506 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
11507 I, Constant::getNullValue(VecTy), V);
11508 };
11509 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
11510 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
11511 Value *V = VL[I];
11512 // No need to shuffle duplicates for constants.
11513 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
11514 ShuffledElements.setBit(I);
11515 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
11516 continue;
11517 }
11518
11519 auto Res = UniqueElements.try_emplace(V, I);
11520 if (Res.second) {
11521 EstimateInsertCost(I, V);
11522 ShuffleMask[I] = I;
11523 continue;
11524 }
11525
11526 DuplicateNonConst = true;
11527 ShuffledElements.setBit(I);
11528 ShuffleMask[I] = Res.first->second;
11529 }
11530 if (ForPoisonSrc)
11531 Cost =
11532 TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
11533 /*Extract*/ false, CostKind);
11534 if (DuplicateNonConst)
11535 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
11536 VecTy, ShuffleMask);
11537 return Cost;
11538}
11539
11540// Perform operand reordering on the instructions in VL and return the reordered
11541// operands in Left and Right.
11542void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
11543 SmallVectorImpl<Value *> &Left,
11544 SmallVectorImpl<Value *> &Right,
11545 const BoUpSLP &R) {
11546 if (VL.empty())
11547 return;
11548 VLOperands Ops(VL, R);
11549 // Reorder the operands in place.
11550 Ops.reorder();
11551 Left = Ops.getVL(0);
11552 Right = Ops.getVL(1);
11553}
11554
11555Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
11556 auto &Res = EntryToLastInstruction.FindAndConstruct(E);
11557 if (Res.second)
11558 return *Res.second;
11559 // Get the basic block this bundle is in. All instructions in the bundle
11560 // should be in this block (except for extractelement-like instructions with
11561 // constant indices).
11562 auto *Front = E->getMainOp();
11563 auto *BB = Front->getParent();
11564 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
11565 if (E->getOpcode() == Instruction::GetElementPtr &&
11566 !isa<GetElementPtrInst>(V))
11567 return true;
11568 auto *I = cast<Instruction>(V);
11569 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11570 isVectorLikeInstWithConstOps(I);
11571 }));
11572
11573 auto FindLastInst = [&]() {
11574 Instruction *LastInst = Front;
11575 for (Value *V : E->Scalars) {
11576 auto *I = dyn_cast<Instruction>(V);
11577 if (!I)
11578 continue;
11579 if (LastInst->getParent() == I->getParent()) {
11580 if (LastInst->comesBefore(I))
11581 LastInst = I;
11582 continue;
11583 }
11584 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11585 !isa<GetElementPtrInst>(I)) ||
11586 (isVectorLikeInstWithConstOps(LastInst) &&
11587 isVectorLikeInstWithConstOps(I))) &&
11588 "Expected vector-like or non-GEP in GEP node insts only.");
11589 if (!DT->isReachableFromEntry(LastInst->getParent())) {
11590 LastInst = I;
11591 continue;
11592 }
11593 if (!DT->isReachableFromEntry(I->getParent()))
11594 continue;
11595 auto *NodeA = DT->getNode(LastInst->getParent());
11596 auto *NodeB = DT->getNode(I->getParent());
11597 assert(NodeA && "Should only process reachable instructions");
11598 assert(NodeB && "Should only process reachable instructions");
11599 assert((NodeA == NodeB) ==
11600 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11601 "Different nodes should have different DFS numbers");
11602 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11603 LastInst = I;
11604 }
11605 BB = LastInst->getParent();
11606 return LastInst;
11607 };
11608
11609 auto FindFirstInst = [&]() {
11610 Instruction *FirstInst = Front;
11611 for (Value *V : E->Scalars) {
11612 auto *I = dyn_cast<Instruction>(V);
11613 if (!I)
11614 continue;
11615 if (FirstInst->getParent() == I->getParent()) {
11616 if (I->comesBefore(FirstInst))
11617 FirstInst = I;
11618 continue;
11619 }
11620 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11621 !isa<GetElementPtrInst>(I)) ||
11622 (isVectorLikeInstWithConstOps(FirstInst) &&
11623 isVectorLikeInstWithConstOps(I))) &&
11624 "Expected vector-like or non-GEP in GEP node insts only.");
11625 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
11626 FirstInst = I;
11627 continue;
11628 }
11629 if (!DT->isReachableFromEntry(I->getParent()))
11630 continue;
11631 auto *NodeA = DT->getNode(FirstInst->getParent());
11632 auto *NodeB = DT->getNode(I->getParent());
11633 assert(NodeA && "Should only process reachable instructions");
11634 assert(NodeB && "Should only process reachable instructions");
11635 assert((NodeA == NodeB) ==
11636 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11637 "Different nodes should have different DFS numbers");
11638 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11639 FirstInst = I;
11640 }
11641 return FirstInst;
11642 };
11643
11644 // Set the insert point to the beginning of the basic block if the entry
11645 // should not be scheduled.
11646 if (doesNotNeedToSchedule(E->Scalars) ||
11647 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
11648 if ((E->getOpcode() == Instruction::GetElementPtr &&
11649 any_of(E->Scalars,
11650 [](Value *V) {
11651 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11652 })) ||
11653 all_of(E->Scalars,
11654 [](Value *V) {
11655 return !isVectorLikeInstWithConstOps(V) &&
11656 isUsedOutsideBlock(V);
11657 }) ||
11658 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
11659 return isa<ExtractElementInst, UndefValue>(V) ||
11660 areAllOperandsNonInsts(V);
11661 })))
11662 Res.second = FindLastInst();
11663 else
11664 Res.second = FindFirstInst();
11665 return *Res.second;
11666 }
11667
11668 // Find the last instruction. The common case should be that BB has been
11669 // scheduled, and the last instruction is VL.back(). So we start with
11670 // VL.back() and iterate over schedule data until we reach the end of the
11671 // bundle. The end of the bundle is marked by null ScheduleData.
11672 if (BlocksSchedules.count(BB)) {
11673 Value *V = E->isOneOf(E->Scalars.back());
11674 if (doesNotNeedToBeScheduled(V))
11675 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11676 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11677 if (Bundle && Bundle->isPartOfBundle())
11678 for (; Bundle; Bundle = Bundle->NextInBundle)
11679 if (Bundle->OpValue == Bundle->Inst)
11680 Res.second = Bundle->Inst;
11681 }
11682
11683 // LastInst can still be null at this point if there's either not an entry
11684 // for BB in BlocksSchedules or there's no ScheduleData available for
11685 // VL.back(). This can be the case if buildTree_rec aborts for various
11686 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11687 // size is reached, etc.). ScheduleData is initialized in the scheduling
11688 // "dry-run".
11689 //
11690 // If this happens, we can still find the last instruction by brute force. We
11691 // iterate forwards from Front (inclusive) until we either see all
11692 // instructions in the bundle or reach the end of the block. If Front is the
11693 // last instruction in program order, LastInst will be set to Front, and we
11694 // will visit all the remaining instructions in the block.
11695 //
11696 // One of the reasons we exit early from buildTree_rec is to place an upper
11697 // bound on compile-time. Thus, taking an additional compile-time hit here is
11698 // not ideal. However, this should be exceedingly rare since it requires that
11699 // we both exit early from buildTree_rec and that the bundle be out-of-order
11700 // (causing us to iterate all the way to the end of the block).
11701 if (!Res.second)
11702 Res.second = FindLastInst();
11703 assert(Res.second && "Failed to find last instruction in bundle");
11704 return *Res.second;
11705}
11706
11707void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11708 auto *Front = E->getMainOp();
11709 Instruction *LastInst = &getLastInstructionInBundle(E);
11710 assert(LastInst && "Failed to find last instruction in bundle");
11711 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11712 // If the instruction is PHI, set the insert point after all the PHIs.
11713 bool IsPHI = isa<PHINode>(LastInst);
11714 if (IsPHI)
11715 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11716 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
11717 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11718 } else {
11719 // Set the insertion point after the last instruction in the bundle. Set the
11720 // debug location to Front.
11721 Builder.SetInsertPoint(
11722 LastInst->getParent(),
11724 }
11725 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11726}
11727
11728Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
11729 // List of instructions/lanes from current block and/or the blocks which are
11730 // part of the current loop. These instructions will be inserted at the end to
11731 // make it possible to optimize loops and hoist invariant instructions out of
11732 // the loop body with better chances for success.
11733 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
11734 SmallSet<int, 4> PostponedIndices;
11735 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11736 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11737 SmallPtrSet<BasicBlock *, 4> Visited;
11738 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11739 InsertBB = InsertBB->getSinglePredecessor();
11740 return InsertBB && InsertBB == InstBB;
11741 };
11742 for (int I = 0, E = VL.size(); I < E; ++I) {
11743 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11744 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11745 getTreeEntry(Inst) ||
11746 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11747 PostponedIndices.insert(I).second)
11748 PostponedInsts.emplace_back(Inst, I);
11749 }
11750
11751 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11752 Type *Ty) {
11753 Value *Scalar = V;
11754 if (Scalar->getType() != Ty) {
11755 assert(Scalar->getType()->isIntOrIntVectorTy() &&
11756 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
11757 Value *V = Scalar;
11758 if (auto *CI = dyn_cast<CastInst>(Scalar);
11759 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
11760 Value *Op = CI->getOperand(0);
11761 if (auto *IOp = dyn_cast<Instruction>(Op);
11762 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
11763 V = Op;
11764 }
11765 Scalar = Builder.CreateIntCast(
11766 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
11767 }
11768
11769 Instruction *InsElt;
11770 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
11771 assert(SLPReVec && "FixedVectorType is not expected.");
11772 Vec = InsElt = Builder.CreateInsertVector(
11773 Vec->getType(), Vec, V,
11774 Builder.getInt64(Pos * VecTy->getNumElements()));
11775 auto *II = dyn_cast<IntrinsicInst>(InsElt);
11776 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
11777 return Vec;
11778 } else {
11779 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11780 InsElt = dyn_cast<InsertElementInst>(Vec);
11781 if (!InsElt)
11782 return Vec;
11783 }
11784 GatherShuffleExtractSeq.insert(InsElt);
11785 CSEBlocks.insert(InsElt->getParent());
11786 // Add to our 'need-to-extract' list.
11787 if (isa<Instruction>(V)) {
11788 if (TreeEntry *Entry = getTreeEntry(V)) {
11789 // Find which lane we need to extract.
11790 User *UserOp = nullptr;
11791 if (Scalar != V) {
11792 if (auto *SI = dyn_cast<Instruction>(Scalar))
11793 UserOp = SI;
11794 } else {
11795 UserOp = InsElt;
11796 }
11797 if (UserOp) {
11798 unsigned FoundLane = Entry->findLaneForValue(V);
11799 ExternalUses.emplace_back(V, UserOp, FoundLane);
11800 }
11801 }
11802 }
11803 return Vec;
11804 };
11805 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11806 Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11807 SmallVector<int> NonConsts;
11808 // Insert constant values first.
11809 for (int I = 0, E = VL.size(); I < E; ++I) {
11810 if (PostponedIndices.contains(I))
11811 continue;
11812 if (!isConstant(VL[I])) {
11813 NonConsts.push_back(I);
11814 continue;
11815 }
11816 if (Root) {
11817 if (!isa<UndefValue>(VL[I])) {
11818 NonConsts.push_back(I);
11819 continue;
11820 }
11821 if (isa<PoisonValue>(VL[I]))
11822 continue;
11823 if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11824 if (SV->getMaskValue(I) == PoisonMaskElem)
11825 continue;
11826 }
11827 }
11828 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11829 }
11830 // Insert non-constant values.
11831 for (int I : NonConsts)
11832 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11833 // Append instructions, which are/may be part of the loop, at the end to make
11834 // it possible to hoist non-loop-based instructions.
11835 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11836 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11837
11838 return Vec;
11839}
11840
11841/// Merges shuffle masks and emits the final shuffle instruction, if required. It
11842/// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
11843/// the actual shuffle instruction is generated only if it is actually
11844/// required. Otherwise, the shuffle instruction emission is delayed till the
11845/// end of the process, to reduce the number of emitted instructions and to ease
11846/// further analysis/transformations.
11847/// The class will also look through the previously emitted shuffle instructions
11848/// and properly mark indices in the mask as undef.
11849/// For example, given the code
11850/// \code
11851/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11852/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11853/// \endcode
11854/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
11855/// look through %s1 and %s2 and emit
11856/// \code
11857/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11858/// \endcode
11859/// instead.
11860/// If 2 operands are of different size, the smallest one will be resized and
11861/// the mask recalculated properly.
11862/// For example, given the code
11863/// \code
11864/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11865/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11866/// \endcode
11867/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
11868/// look through %s1 and %s2 and emit
11869/// \code
11870/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11871/// \endcode
11872/// instead.
11873class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11874 bool IsFinalized = false;
11875 /// Combined mask for all applied operands and masks. It is built during
11876 /// analysis and actual emission of shuffle vector instructions.
11877 SmallVector<int> CommonMask;
11878 /// List of operands for the shuffle vector instruction. It holds at most 2
11879 /// operands; if a 3rd one is going to be added, the first 2 are combined into a
11880 /// shuffle with the \p CommonMask mask, the first operand is set to be the
11881 /// resulting shuffle and the second operand is set to be the newly added
11882 /// operand. The \p CommonMask is transformed in the proper way after that.
11883 SmallVector<Value *, 2> InVectors;
11884 IRBuilderBase &Builder;
11885 BoUpSLP &R;
11886
11887 class ShuffleIRBuilder {
11888 IRBuilderBase &Builder;
11889 /// Holds all of the instructions that we gathered.
11890 SetVector<Instruction *> &GatherShuffleExtractSeq;
11891 /// A list of blocks that we are going to CSE.
11892 DenseSet<BasicBlock *> &CSEBlocks;
11893 /// Data layout.
11894 const DataLayout &DL;
11895
11896 public:
11897 ShuffleIRBuilder(IRBuilderBase &Builder,
11898 SetVector<Instruction *> &GatherShuffleExtractSeq,
11899 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11900 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11901 CSEBlocks(CSEBlocks), DL(DL) {}
11902 ~ShuffleIRBuilder() = default;
11903 /// Creates shufflevector for the 2 operands with the given mask.
11904 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11905 if (V1->getType() != V2->getType()) {
11907 V1->getType()->isIntOrIntVectorTy() &&
11908 "Expected integer vector types only.");
11909 if (V1->getType() != V2->getType()) {
11910 if (cast<VectorType>(V2->getType())
11911 ->getElementType()
11912 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
11913 ->getElementType()
11914 ->getIntegerBitWidth())
11915 V2 = Builder.CreateIntCast(
11916 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
11917 else
11918 V1 = Builder.CreateIntCast(
11919 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
11920 }
11921 }
11922 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11923 if (auto *I = dyn_cast<Instruction>(Vec)) {
11924 GatherShuffleExtractSeq.insert(I);
11925 CSEBlocks.insert(I->getParent());
11926 }
11927 return Vec;
11928 }
11929 /// Creates a permutation of the single vector operand with the given mask, if
11930 /// it is not the identity mask.
11931 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11932 if (Mask.empty())
11933 return V1;
11934 unsigned VF = Mask.size();
11935 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
11936 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
11937 return V1;
11938 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
11939 if (auto *I = dyn_cast<Instruction>(Vec)) {
11940 GatherShuffleExtractSeq.insert(I);
11941 CSEBlocks.insert(I->getParent());
11942 }
11943 return Vec;
11944 }
11945 Value *createIdentity(Value *V) { return V; }
11946 Value *createPoison(Type *Ty, unsigned VF) {
11947 return PoisonValue::get(getWidenedType(Ty, VF));
11948 }
11949 /// Resizes 2 input vectors to match their sizes, if they are not equal
11950 /// yet. The smallest vector is resized to the size of the larger vector.
11951 void resizeToMatch(Value *&V1, Value *&V2) {
11952 if (V1->getType() == V2->getType())
11953 return;
11954 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
11955 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11956 int VF = std::max(V1VF, V2VF);
11957 int MinVF = std::min(V1VF, V2VF);
11958 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11959 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
11960 0);
11961 Value *&Op = MinVF == V1VF ? V1 : V2;
11962 Op = Builder.CreateShuffleVector(Op, IdentityMask);
11963 if (auto *I = dyn_cast<Instruction>(Op)) {
11964 GatherShuffleExtractSeq.insert(I);
11965 CSEBlocks.insert(I->getParent());
11966 }
11967 if (MinVF == V1VF)
11968 V1 = Op;
11969 else
11970 V2 = Op;
11971 }
11972 };
11973
11974 /// Smart shuffle instruction emission, walks through shuffle trees and
11975 /// tries to find the best matching vector for the actual shuffle
11976 /// instruction.
11977 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11978 assert(V1 && "Expected at least one vector value.");
11979 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11980 R.CSEBlocks, *R.DL);
11981 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11982 ShuffleBuilder);
11983 }
11984
11985 /// Transforms mask \p CommonMask per the given \p Mask to form the proper
11986 /// mask after shuffle emission.
11987 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11988 ArrayRef<int> Mask) {
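// Once the shuffle described by Mask has been emitted, every lane it defined
// already sits at its final position, so the common mask is rewritten to an
// identity for those lanes.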
11989 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11990 if (Mask[Idx] != PoisonMaskElem)
11991 CommonMask[Idx] = Idx;
11992 }
11993
11994 /// Cast value \p V to the vector type with the same number of elements, but
11995 /// the base type \p ScalarTy.
11996 Value *castToScalarTyElem(Value *V,
11997 std::optional<bool> IsSigned = std::nullopt) {
11998 auto *VecTy = cast<VectorType>(V->getType());
11999 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
12000 if (VecTy->getElementType() == ScalarTy->getScalarType())
12001 return V;
12002 return Builder.CreateIntCast(
12003 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
12004 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
12005 }
12006
12007public:
12008 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
12009 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
12010
12011 /// Adjusts extractelements after reusing them.
12012 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
12013 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
12014 unsigned NumParts, bool &UseVecBaseAsInput) {
12015 UseVecBaseAsInput = false;
12016 SmallPtrSet<Value *, 4> UniqueBases;
12017 Value *VecBase = nullptr;
12018 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12019 int Idx = Mask[I];
12020 if (Idx == PoisonMaskElem)
12021 continue;
12022 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12023 VecBase = EI->getVectorOperand();
12024 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
12025 VecBase = TE->VectorizedValue;
12026 assert(VecBase && "Expected vectorized value.");
12027 UniqueBases.insert(VecBase);
12028 // If the only use is vectorized, we can delete the extractelement
12029 // itself.
12030 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
12031 any_of(EI->users(), [&](User *U) {
12032 const TreeEntry *UTE = R.getTreeEntry(U);
12033 return !UTE || R.MultiNodeScalars.contains(U) ||
12034 (isa<GetElementPtrInst>(U) &&
12035 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
12036 count_if(R.VectorizableTree,
12037 [&](const std::unique_ptr<TreeEntry> &TE) {
12038 return any_of(TE->UserTreeIndices,
12039 [&](const EdgeInfo &Edge) {
12040 return Edge.UserTE == UTE;
12041 }) &&
12042 is_contained(TE->Scalars, EI);
12043 }) != 1;
12044 }))
12045 continue;
12046 R.eraseInstruction(EI);
12047 }
12048 if (NumParts == 1 || UniqueBases.size() == 1) {
12049 assert(VecBase && "Expected vectorized value.");
12050 return castToScalarTyElem(VecBase);
12051 }
12052 UseVecBaseAsInput = true;
12053 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
12054 for (auto [I, Idx] : enumerate(Mask))
12055 if (Idx != PoisonMaskElem)
12056 Idx = I;
12057 };
12058 // Perform a multi-register vector shuffle, joining the parts into a single
12059 // virtual long vector.
12060 // Each part is shuffled independently and then all the parts are inserted
12061 // into a long virtual vector register, forming the original vector.
12062 Value *Vec = nullptr;
12063 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12064 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
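// For example (illustrative), 8 extracted scalars with NumParts == 2 give
// SliceSize == 4: lanes [0, 4) and [4, 8) are shuffled separately and the
// per-part results are then folded into one vector below.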
12065 for (unsigned Part : seq<unsigned>(NumParts)) {
12066 unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
12067 ArrayRef<Value *> VL =
12068 ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
12069 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
12070 constexpr int MaxBases = 2;
12071 SmallVector<Value *, MaxBases> Bases(MaxBases);
12072 auto VLMask = zip(VL, SubMask);
12073 const unsigned VF = std::accumulate(
12074 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
12075 if (std::get<1>(D) == PoisonMaskElem)
12076 return S;
12077 Value *VecOp =
12078 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
12079 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
12080 VecOp = TE->VectorizedValue;
12081 assert(VecOp && "Expected vectorized value.");
12082 const unsigned Size =
12083 cast<FixedVectorType>(VecOp->getType())->getNumElements();
12084 return std::max(S, Size);
12085 });
12086 for (const auto [V, I] : VLMask) {
12087 if (I == PoisonMaskElem)
12088 continue;
12089 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
12090 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
12091 VecOp = TE->VectorizedValue;
12092 assert(VecOp && "Expected vectorized value.");
12093 VecOp = castToScalarTyElem(VecOp);
12094 Bases[I / VF] = VecOp;
12095 }
12096 if (!Bases.front())
12097 continue;
12098 Value *SubVec;
12099 if (Bases.back()) {
12100 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
12101 TransformToIdentity(SubMask);
12102 } else {
12103 SubVec = Bases.front();
12104 }
12105 if (!Vec) {
12106 Vec = SubVec;
12107 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
12108 [&](unsigned P) {
12109 ArrayRef<int> SubMask =
12110 Mask.slice(P * SliceSize,
12111 getNumElems(Mask.size(),
12112 SliceSize, P));
12113 return all_of(SubMask, [](int Idx) {
12114 return Idx == PoisonMaskElem;
12115 });
12116 })) &&
12117 "Expected first part or all previous parts masked.");
12118 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
12119 } else {
12120 unsigned NewVF =
12121 cast<FixedVectorType>(Vec->getType())->getNumElements();
12122 if (Vec->getType() != SubVec->getType()) {
12123 unsigned SubVecVF =
12124 cast<FixedVectorType>(SubVec->getType())->getNumElements();
12125 NewVF = std::max(NewVF, SubVecVF);
12126 }
12127 // Adjust SubMask.
12128 for (int &Idx : SubMask)
12129 if (Idx != PoisonMaskElem)
12130 Idx += NewVF;
12131 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
12132 Vec = createShuffle(Vec, SubVec, VecMask);
12133 TransformToIdentity(VecMask);
12134 }
12135 }
12136 copy(VecMask, Mask.begin());
12137 return Vec;
12138 }
12139 /// Checks if the specified entry \p E needs to be delayed because of its
12140 /// dependency nodes.
12141 std::optional<Value *>
12142 needToDelay(const TreeEntry *E,
12143 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
12144 // No need to delay emission if all deps are ready.
12145 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
12146 return all_of(
12147 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
12148 }))
12149 return std::nullopt;
12150 // Postpone gather emission; it will be emitted after the end of the
12151 // process to keep the correct order.
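// The value returned below is only a placeholder of the right vector type (a
// load from a poison pointer); it is replaced once the dependent entries have
// been vectorized and this gather is re-emitted.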
12152 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
12153 return Builder.CreateAlignedLoad(
12154 ResVecTy,
12155 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
12156 MaybeAlign());
12157 }
12158 /// Adds 2 input vectors (in the form of tree entries) and the mask for their
12159 /// shuffling.
12160 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
12161 Value *V1 = E1.VectorizedValue;
12162 if (V1->getType()->isIntOrIntVectorTy())
12163 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
12164 return !isKnownNonNegative(
12165 V, SimplifyQuery(*R.DL));
12166 }));
12167 Value *V2 = E2.VectorizedValue;
12168 if (V2->getType()->isIntOrIntVectorTy())
12169 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
12170 return !isKnownNonNegative(
12171 V, SimplifyQuery(*R.DL));
12172 }));
12173 add(V1, V2, Mask);
12174 }
12175 /// Adds a single input vector (in the form of a tree entry) and the mask for
12176 /// its shuffling.
12177 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
12178 Value *V1 = E1.VectorizedValue;
12179 if (V1->getType()->isIntOrIntVectorTy())
12180 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
12181 return !isKnownNonNegative(
12182 V, SimplifyQuery(*R.DL));
12183 }));
12184 add(V1, Mask);
12185 }
12186 /// Adds 2 input vectors and the mask for their shuffling.
12187 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
12188 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
12189 assert(isa<FixedVectorType>(V1->getType()) &&
12190 isa<FixedVectorType>(V2->getType()) &&
12191 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
12192 V1 = castToScalarTyElem(V1);
12193 V2 = castToScalarTyElem(V2);
12194 if (InVectors.empty()) {
12195 InVectors.push_back(V1);
12196 InVectors.push_back(V2);
12197 CommonMask.assign(Mask.begin(), Mask.end());
12198 return;
12199 }
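// Operands were already accumulated: first squash the existing operands into
// a single vector if needed, then shuffle the new pair; indices >= Sz in the
// common mask refer to lanes produced by the newly added pair.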
12200 Value *Vec = InVectors.front();
12201 if (InVectors.size() == 2) {
12202 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
12203 transformMaskAfterShuffle(CommonMask, CommonMask);
12204 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
12205 Mask.size()) {
12206 Vec = createShuffle(Vec, nullptr, CommonMask);
12207 transformMaskAfterShuffle(CommonMask, CommonMask);
12208 }
12209 V1 = createShuffle(V1, V2, Mask);
12210 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12211 if (Mask[Idx] != PoisonMaskElem)
12212 CommonMask[Idx] = Idx + Sz;
12213 InVectors.front() = Vec;
12214 if (InVectors.size() == 2)
12215 InVectors.back() = V1;
12216 else
12217 InVectors.push_back(V1);
12218 }
12219 /// Adds one more input vector and the mask for its shuffling.
12220 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
12221 assert(isa<FixedVectorType>(V1->getType()) &&
12222 "castToScalarTyElem expects V1 to be FixedVectorType");
12223 V1 = castToScalarTyElem(V1);
12224 if (InVectors.empty()) {
12225 InVectors.push_back(V1);
12226 CommonMask.assign(Mask.begin(), Mask.end());
12227 return;
12228 }
12229 const auto *It = find(InVectors, V1);
12230 if (It == InVectors.end()) {
12231 if (InVectors.size() == 2 ||
12232 InVectors.front()->getType() != V1->getType()) {
12233 Value *V = InVectors.front();
12234 if (InVectors.size() == 2) {
12235 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12236 transformMaskAfterShuffle(CommonMask, CommonMask);
12237 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
12238 CommonMask.size()) {
12239 V = createShuffle(InVectors.front(), nullptr, CommonMask);
12240 transformMaskAfterShuffle(CommonMask, CommonMask);
12241 }
12242 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12243 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
12244 CommonMask[Idx] =
12245 V->getType() != V1->getType()
12246 ? Idx + Sz
12247 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
12248 ->getNumElements();
12249 if (V->getType() != V1->getType())
12250 V1 = createShuffle(V1, nullptr, Mask);
12251 InVectors.front() = V;
12252 if (InVectors.size() == 2)
12253 InVectors.back() = V1;
12254 else
12255 InVectors.push_back(V1);
12256 return;
12257 }
12258 // Check if the second vector is required at all: it is needed only if it
12259 // supplies elements that are not already taken from the first one.
12260 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12261 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
12262 InVectors.push_back(V1);
12263 break;
12264 }
12265 }
12266 int VF = getVF(V1);
12267 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12268 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
12269 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
12270 }
12271 /// Adds one more input vector and the reordering for its shuffling.
12272 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
12273 SmallVector<int> NewMask;
12274 inversePermutation(Order, NewMask);
12275 add(V1, NewMask);
12276 }
12277 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
12278 Value *Root = nullptr) {
12279 return R.gather(VL, Root, ScalarTy);
12280 }
12281 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
12282 /// Finalize emission of the shuffles.
12283 /// \param Action the action (if any) to be performed before the final
12284 /// application of the \p ExtMask mask.
12285 Value *
12286 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
12287 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
12288 IsFinalized = true;
12289 SmallVector<int> NewExtMask(ExtMask);
12290 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
12291 assert(SLPReVec && "FixedVectorType is not expected.");
12292 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
12293 CommonMask);
12294 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
12295 NewExtMask);
12296 ExtMask = NewExtMask;
12297 }
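// The optional Action callback lets the caller patch the squashed vector
// (e.g. insert the remaining non-constant scalars) after all accumulated
// shuffles have been folded into a single value but before ExtMask is
// applied.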
12298 if (Action) {
12299 Value *Vec = InVectors.front();
12300 if (InVectors.size() == 2) {
12301 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
12302 InVectors.pop_back();
12303 } else {
12304 Vec = createShuffle(Vec, nullptr, CommonMask);
12305 }
12306 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12307 if (CommonMask[Idx] != PoisonMaskElem)
12308 CommonMask[Idx] = Idx;
12309 assert(VF > 0 &&
12310 "Expected vector length for the final value before action.");
12311 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
12312 if (VecVF < VF) {
12313 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
12314 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
12315 Vec = createShuffle(Vec, nullptr, ResizeMask);
12316 }
12317 Action(Vec, CommonMask);
12318 InVectors.front() = Vec;
12319 }
12320 if (!ExtMask.empty()) {
12321 if (CommonMask.empty()) {
12322 CommonMask.assign(ExtMask.begin(), ExtMask.end());
12323 } else {
12324 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12325 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12326 if (ExtMask[I] == PoisonMaskElem)
12327 continue;
12328 NewMask[I] = CommonMask[ExtMask[I]];
12329 }
12330 CommonMask.swap(NewMask);
12331 }
12332 }
12333 if (CommonMask.empty()) {
12334 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
12335 return InVectors.front();
12336 }
12337 if (InVectors.size() == 2)
12338 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12339 return createShuffle(InVectors.front(), nullptr, CommonMask);
12340 }
12341
12342 ~ShuffleInstructionBuilder() {
12343 assert((IsFinalized || CommonMask.empty()) &&
12344 "Shuffle construction must be finalized.");
12345 }
12346};
12347
12348Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12349 bool PostponedPHIs) {
12350 ValueList &VL = E->getOperand(NodeIdx);
12351 const unsigned VF = VL.size();
12352 InstructionsState S = getSameOpcode(VL, *TLI);
12353 // Special processing for GEPs bundle, which may include non-gep values.
12354 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
12355 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
12356 if (It != VL.end())
12357 S = getSameOpcode(*It, *TLI);
12358 }
12359 if (S.getOpcode()) {
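// A tree entry can be reused for this operand only if it holds the same
// scalars and is either already recorded as the operand (E, NodeIdx) or
// matches the gather node created for that operand.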
12360 auto CheckSameVE = [&](const TreeEntry *VE) {
12361 return VE->isSame(VL) &&
12362 (any_of(VE->UserTreeIndices,
12363 [E, NodeIdx](const EdgeInfo &EI) {
12364 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12365 }) ||
12366 any_of(VectorizableTree,
12367 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12368 return TE->isOperandGatherNode({E, NodeIdx}) &&
12369 VE->isSame(TE->Scalars);
12370 }));
12371 };
12372 TreeEntry *VE = getTreeEntry(S.OpValue);
12373 bool IsSameVE = VE && CheckSameVE(VE);
12374 if (!IsSameVE) {
12375 auto It = MultiNodeScalars.find(S.OpValue);
12376 if (It != MultiNodeScalars.end()) {
12377 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12378 return TE != VE && CheckSameVE(TE);
12379 });
12380 if (I != It->getSecond().end()) {
12381 VE = *I;
12382 IsSameVE = true;
12383 }
12384 }
12385 }
12386 if (IsSameVE) {
12387 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12388 ShuffleInstructionBuilder ShuffleBuilder(
12389 cast<VectorType>(V->getType())->getElementType(), Builder, *this);
12390 ShuffleBuilder.add(V, Mask);
12391 return ShuffleBuilder.finalize(std::nullopt);
12392 };
12393 Value *V = vectorizeTree(VE, PostponedPHIs);
12394 if (VF * getNumElements(VL[0]->getType()) !=
12395 cast<FixedVectorType>(V->getType())->getNumElements()) {
12396 if (!VE->ReuseShuffleIndices.empty()) {
12397 // Reshuffle to get only unique values.
12398 // If some of the scalars are duplicated in the vectorization
12399 // tree entry, we do not vectorize them but instead generate a
12400 // mask for the reuses. But if there are several users of the
12401 // same entry, they may have different vectorization factors.
12402 // This is especially important for PHI nodes. In this case, we
12403 // need to adapt the resulting instruction for the user
12404 // vectorization factor and have to reshuffle it again to take
12405 // only unique elements of the vector. Without this code the
12406 // function would incorrectly return a reduced vector instruction
12407 // with repeated elements instead of only the unique ones.
12408
12409 // block:
12410 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12411 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12412 // ... (use %2)
12413 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12414 // br %block
12415 SmallVector<int> Mask(VF, PoisonMaskElem);
12416 for (auto [I, V] : enumerate(VL)) {
12417 if (isa<PoisonValue>(V))
12418 continue;
12419 Mask[I] = VE->findLaneForValue(V);
12420 }
12421 V = FinalShuffle(V, Mask);
12422 } else {
12423 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12424 "Expected vectorization factor less "
12425 "than original vector size.");
12426 SmallVector<int> UniformMask(VF, 0);
12427 std::iota(UniformMask.begin(), UniformMask.end(), 0);
12428 V = FinalShuffle(V, UniformMask);
12429 }
12430 }
12431 // Need to update the operand gather node, if the operand is actually not a
12432 // vectorized node but a buildvector/gather node that matches one of
12433 // the vectorized nodes.
12434 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12435 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12436 }) == VE->UserTreeIndices.end()) {
12437 auto *It = find_if(
12438 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12439 return TE->isGather() &&
12440 TE->UserTreeIndices.front().UserTE == E &&
12441 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12442 });
12443 assert(It != VectorizableTree.end() && "Expected gather node operand.");
12444 (*It)->VectorizedValue = V;
12445 }
12446 return V;
12447 }
12448 }
12449
12450 // Find the corresponding gather entry and vectorize it.
12451 // This allows us to be more accurate with tree/graph transformations and
12452 // checks the correctness of the transformations in many cases.
12453 auto *I = find_if(VectorizableTree,
12454 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
12455 return TE->isOperandGatherNode({E, NodeIdx});
12456 });
12457 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
12458 assert(I->get()->UserTreeIndices.size() == 1 &&
12459 "Expected only single user for the gather node.");
12460 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
12461 return vectorizeTree(I->get(), PostponedPHIs);
12462}
12463
12464template <typename BVTy, typename ResTy, typename... Args>
12465ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
12466 Args &...Params) {
12467 assert(E->isGather() && "Expected gather node.");
12468 unsigned VF = E->getVectorFactor();
12469
12470 bool NeedFreeze = false;
12471 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
12472 E->ReuseShuffleIndices.end());
12473 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
12474 // Build a mask out of the reorder indices and reorder scalars per this
12475 // mask.
12476 SmallVector<int> ReorderMask;
12477 inversePermutation(E->ReorderIndices, ReorderMask);
12478 if (!ReorderMask.empty())
12479 reorderScalars(GatheredScalars, ReorderMask);
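// FindReusedSplat (below): for a splat that still contains non-poison undefs
// and whose user has a second, already vectorized operand, rewrite the
// relevant mask slice to an identity (or a broadcast of the single defined
// lane) so the splat can be taken directly from the existing input register.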
12480 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
12481 unsigned I, unsigned SliceSize) {
12482 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
12483 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12484 }))
12485 return false;
12486 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12487 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12488 if (UserTE->getNumOperands() != 2)
12489 return false;
12490 auto *It =
12491 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
12492 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
12493 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12494 }) != TE->UserTreeIndices.end();
12495 });
12496 if (It == VectorizableTree.end())
12497 return false;
12498 int Idx;
12499 if ((Mask.size() < InputVF &&
12500 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
12501 Idx == 0) ||
12502 (Mask.size() == InputVF &&
12503 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
12504 std::iota(
12505 std::next(Mask.begin(), I * SliceSize),
12506 std::next(Mask.begin(),
12507 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12508 0);
12509 } else {
12510 unsigned IVal =
12511 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
12512 std::fill(
12513 std::next(Mask.begin(), I * SliceSize),
12514 std::next(Mask.begin(),
12515 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12516 IVal);
12517 }
12518 return true;
12519 };
12520 BVTy ShuffleBuilder(ScalarTy, Params...);
12521 ResTy Res = ResTy();
12522 SmallVector<int> Mask;
12523 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
12524 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
12525 Value *ExtractVecBase = nullptr;
12526 bool UseVecBaseAsInput = false;
12527 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
12528 SmallVector<SmallVector<const TreeEntry *>> Entries;
12529 Type *OrigScalarTy = GatheredScalars.front()->getType();
12530 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
12531 unsigned NumParts = TTI->getNumberOfParts(VecTy);
12532 if (NumParts == 0 || NumParts >= GatheredScalars.size())
12533 NumParts = 1;
12534 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
12535 // Check for gathered extracts.
12536 bool Resized = false;
12537 ExtractShuffles =
12538 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
12539 if (!ExtractShuffles.empty()) {
12540 SmallVector<const TreeEntry *> ExtractEntries;
12541 for (auto [Idx, I] : enumerate(ExtractMask)) {
12542 if (I == PoisonMaskElem)
12543 continue;
12544 if (const auto *TE = getTreeEntry(
12545 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
12546 ExtractEntries.push_back(TE);
12547 }
12548 if (std::optional<ResTy> Delayed =
12549 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12550 // Delay emission of gathers which are not ready yet.
12551 PostponedGathers.insert(E);
12552 // Postpone gather emission; it will be emitted after the end of the
12553 // process to keep the correct order.
12554 return *Delayed;
12555 }
12556 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
12557 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12558 ExtractVecBase = VecBase;
12559 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
12560 if (VF == VecBaseTy->getNumElements() &&
12561 GatheredScalars.size() != VF) {
12562 Resized = true;
12563 GatheredScalars.append(VF - GatheredScalars.size(),
12564 PoisonValue::get(OrigScalarTy));
12565 }
12566 }
12567 }
12568 // Gather extracts only after we have checked for fully matched gathers.
12569 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
12570 E->isAltShuffle() ||
12571 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
12572 isSplat(E->Scalars) ||
12573 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12574 GatherShuffles =
12575 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
12576 }
12577 if (!GatherShuffles.empty()) {
12578 if (std::optional<ResTy> Delayed =
12579 ShuffleBuilder.needToDelay(E, Entries)) {
12580 // Delay emission of gathers which are not ready yet.
12581 PostponedGathers.insert(E);
12582 // Postpone gather emission; it will be emitted after the end of the
12583 // process to keep the correct order.
12584 return *Delayed;
12585 }
12586 if (GatherShuffles.size() == 1 &&
12587 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
12588 Entries.front().front()->isSame(E->Scalars)) {
12589 // Perfect match in the graph, will reuse the previously vectorized
12590 // node. Cost is 0.
12591 LLVM_DEBUG(
12592 dbgs()
12593 << "SLP: perfect diamond match for gather bundle "
12594 << shortBundleName(E->Scalars) << ".\n");
12595 // Restore the mask for previous partially matched values.
12596 Mask.resize(E->Scalars.size());
12597 const TreeEntry *FrontTE = Entries.front().front();
12598 if (FrontTE->ReorderIndices.empty() &&
12599 ((FrontTE->ReuseShuffleIndices.empty() &&
12600 E->Scalars.size() == FrontTE->Scalars.size()) ||
12601 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12602 std::iota(Mask.begin(), Mask.end(), 0);
12603 } else {
12604 for (auto [I, V] : enumerate(E->Scalars)) {
12605 if (isa<PoisonValue>(V)) {
12606 Mask[I] = PoisonMaskElem;
12607 continue;
12608 }
12609 Mask[I] = FrontTE->findLaneForValue(V);
12610 }
12611 }
12612 ShuffleBuilder.add(*FrontTE, Mask);
12613 Res = ShuffleBuilder.finalize(E->getCommonMask());
12614 return Res;
12615 }
12616 if (!Resized) {
12617 if (GatheredScalars.size() != VF &&
12618 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
12619 return any_of(TEs, [&](const TreeEntry *TE) {
12620 return TE->getVectorFactor() == VF;
12621 });
12622 }))
12623 GatheredScalars.append(VF - GatheredScalars.size(),
12624 PoisonValue::get(OrigScalarTy));
12625 }
12626 // Remove shuffled elements from list of gathers.
12627 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12628 if (Mask[I] != PoisonMaskElem)
12629 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12630 }
12631 }
12632 }
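// TryPackScalars (below) deduplicates the remaining scalars into the packed
// build-vector and records in ReuseMask which packed element each original
// lane should read, so repeated values become shuffle reuses rather than
// extra insertelements.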
12633 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
12634 SmallVectorImpl<int> &ReuseMask,
12635 bool IsRootPoison) {
12636 // For splats we can emit broadcasts instead of gathers, so try to find
12637 // such sequences.
12638 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
12639 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
12640 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
12641 SmallVector<int> UndefPos;
12642 DenseMap<Value *, unsigned> UniquePositions;
12643 // Gather unique non-const values and all constant values.
12644 // For repeated values, just shuffle them.
12645 int NumNonConsts = 0;
12646 int SinglePos = 0;
12647 for (auto [I, V] : enumerate(Scalars)) {
12648 if (isa<UndefValue>(V)) {
12649 if (!isa<PoisonValue>(V)) {
12650 ReuseMask[I] = I;
12651 UndefPos.push_back(I);
12652 }
12653 continue;
12654 }
12655 if (isConstant(V)) {
12656 ReuseMask[I] = I;
12657 continue;
12658 }
12659 ++NumNonConsts;
12660 SinglePos = I;
12661 Value *OrigV = V;
12662 Scalars[I] = PoisonValue::get(OrigScalarTy);
12663 if (IsSplat) {
12664 Scalars.front() = OrigV;
12665 ReuseMask[I] = 0;
12666 } else {
12667 const auto Res = UniquePositions.try_emplace(OrigV, I);
12668 Scalars[Res.first->second] = OrigV;
12669 ReuseMask[I] = Res.first->second;
12670 }
12671 }
12672 if (NumNonConsts == 1) {
12673 // Restore single insert element.
12674 if (IsSplat) {
12675 ReuseMask.assign(VF, PoisonMaskElem);
12676 std::swap(Scalars.front(), Scalars[SinglePos]);
12677 if (!UndefPos.empty() && UndefPos.front() == 0)
12678 Scalars.front() = UndefValue::get(OrigScalarTy);
12679 }
12680 ReuseMask[SinglePos] = SinglePos;
12681 } else if (!UndefPos.empty() && IsSplat) {
12682 // For undef values, try to replace them with the simple broadcast.
12683 // We can do it if the broadcasted value is guaranteed to be
12684 // non-poisonous, or by freezing the incoming scalar value first.
12685 auto *It = find_if(Scalars, [this, E](Value *V) {
12686 return !isa<UndefValue>(V) &&
12687 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12688 (E->UserTreeIndices.size() == 1 &&
12689 any_of(V->uses(), [E](const Use &U) {
12690 // Check if the value is already used in the same operation in
12691 // one of the nodes.
12692 return E->UserTreeIndices.front().EdgeIdx !=
12693 U.getOperandNo() &&
12694 is_contained(
12695 E->UserTreeIndices.front().UserTE->Scalars,
12696 U.getUser());
12697 })));
12698 });
12699 if (It != Scalars.end()) {
12700 // Replace undefs by the non-poisoned scalars and emit broadcast.
12701 int Pos = std::distance(Scalars.begin(), It);
12702 for (int I : UndefPos) {
12703 // Set the undef position to the non-poisoned scalar.
12704 ReuseMask[I] = Pos;
12705 // Replace the undef by poison; in the mask it is already replaced
12706 // by the non-poisoned scalar.
12707 if (I != Pos)
12708 Scalars[I] = PoisonValue::get(OrigScalarTy);
12709 }
12710 } else {
12711 // Replace undefs by the poisons, emit broadcast and then emit
12712 // freeze.
12713 for (int I : UndefPos) {
12714 ReuseMask[I] = PoisonMaskElem;
12715 if (isa<UndefValue>(Scalars[I]))
12716 Scalars[I] = PoisonValue::get(OrigScalarTy);
12717 }
12718 NeedFreeze = true;
12719 }
12720 }
12721 };
12722 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12723 bool IsNonPoisoned = true;
12724 bool IsUsedInExpr = true;
12725 Value *Vec1 = nullptr;
12726 if (!ExtractShuffles.empty()) {
12727 // A gather of extractelements can be represented as just a shuffle of
12728 // the one or two vectors the scalars are extracted from.
12729 // Find input vectors.
12730 Value *Vec2 = nullptr;
12731 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12732 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12733 ExtractMask[I] = PoisonMaskElem;
12734 }
12735 if (UseVecBaseAsInput) {
12736 Vec1 = ExtractVecBase;
12737 } else {
12738 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12739 if (ExtractMask[I] == PoisonMaskElem)
12740 continue;
12741 if (isa<UndefValue>(E->Scalars[I]))
12742 continue;
12743 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12744 Value *VecOp = EI->getVectorOperand();
12745 if (const auto *TE = getTreeEntry(VecOp))
12746 if (TE->VectorizedValue)
12747 VecOp = TE->VectorizedValue;
12748 if (!Vec1) {
12749 Vec1 = VecOp;
12750 } else if (Vec1 != VecOp) {
12751 assert((!Vec2 || Vec2 == VecOp) &&
12752 "Expected only 1 or 2 vectors shuffle.");
12753 Vec2 = VecOp;
12754 }
12755 }
12756 }
12757 if (Vec2) {
12758 IsUsedInExpr = false;
12759 IsNonPoisoned &=
12760 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12761 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12762 } else if (Vec1) {
12763 IsUsedInExpr &= FindReusedSplat(
12764 ExtractMask,
12765 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12766 ExtractMask.size());
12767 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12768 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12769 } else {
12770 IsUsedInExpr = false;
12771 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
12772 /*ForExtracts=*/true);
12773 }
12774 }
12775 if (!GatherShuffles.empty()) {
12776 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
12777 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12778 for (const auto [I, TEs] : enumerate(Entries)) {
12779 if (TEs.empty()) {
12780 assert(!GatherShuffles[I] &&
12781 "No shuffles with empty entries list expected.");
12782 continue;
12783 }
12784 assert((TEs.size() == 1 || TEs.size() == 2) &&
12785 "Expected shuffle of 1 or 2 entries.");
12786 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
12787 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
12788 VecMask.assign(VecMask.size(), PoisonMaskElem);
12789 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12790 if (TEs.size() == 1) {
12791 IsUsedInExpr &= FindReusedSplat(
12792 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12793 ShuffleBuilder.add(*TEs.front(), VecMask);
12794 if (TEs.front()->VectorizedValue)
12795 IsNonPoisoned &=
12796 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12797 } else {
12798 IsUsedInExpr = false;
12799 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12800 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12801 IsNonPoisoned &=
12802 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12803 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12804 }
12805 }
12806 }
12807 // Try to figure out the best way to combine the values: build a shuffle
12808 // and insert elements, or just build several shuffles.
12809 // Insert non-constant scalars.
12810 SmallVector<Value *> NonConstants(GatheredScalars);
12811 int EMSz = ExtractMask.size();
12812 int MSz = Mask.size();
12813 // Try to build a constant vector and shuffle with it only if currently we
12814 // have a single permutation and more than one scalar constant.
12815 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12816 bool IsIdentityShuffle =
12817 ((UseVecBaseAsInput ||
12818 all_of(ExtractShuffles,
12819 [](const std::optional<TTI::ShuffleKind> &SK) {
12820 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12821 TTI::SK_PermuteSingleSrc;
12822 })) &&
12823 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12824 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
12825 (!GatherShuffles.empty() &&
12826 all_of(GatherShuffles,
12827 [](const std::optional<TTI::ShuffleKind> &SK) {
12828 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12829 TTI::SK_PermuteSingleSrc;
12830 }) &&
12831 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12832 ShuffleVectorInst::isIdentityMask(Mask, MSz));
12833 bool EnoughConstsForShuffle =
12834 IsSingleShuffle &&
12835 (none_of(GatheredScalars,
12836 [](Value *V) {
12837 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12838 }) ||
12839 any_of(GatheredScalars,
12840 [](Value *V) {
12841 return isa<Constant>(V) && !isa<UndefValue>(V);
12842 })) &&
12843 (!IsIdentityShuffle ||
12844 (GatheredScalars.size() == 2 &&
12845 any_of(GatheredScalars,
12846 [](Value *V) { return !isa<UndefValue>(V); })) ||
12847 count_if(GatheredScalars, [](Value *V) {
12848 return isa<Constant>(V) && !isa<PoisonValue>(V);
12849 }) > 1);
12850 // The NonConstants array contains just the non-constant values;
12851 // GatheredScalars contains only constants used to build the final vector,
12852 // which is then shuffled.
12852 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12853 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
12854 NonConstants[I] = PoisonValue::get(OrigScalarTy);
12855 else
12856 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12857 }
12858 // Generate constants for final shuffle and build a mask for them.
12859 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12860 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12861 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12862 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12863 ShuffleBuilder.add(BV, BVMask);
12864 }
12865 if (all_of(NonConstants, [=](Value *V) {
12866 return isa<PoisonValue>(V) ||
12867 (IsSingleShuffle && ((IsIdentityShuffle &&
12868 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12869 }))
12870 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12871 else
12872 Res = ShuffleBuilder.finalize(
12873 E->ReuseShuffleIndices, E->Scalars.size(),
12874 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12875 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12876 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12877 });
12878 } else if (!allConstant(GatheredScalars)) {
12879 // Gather unique scalars and all constants.
12880 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12881 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12882 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12883 ShuffleBuilder.add(BV, ReuseMask);
12884 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12885 } else {
12886 // Gather all constants.
12887 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12888 for (auto [I, V] : enumerate(E->Scalars)) {
12889 if (!isa<PoisonValue>(V))
12890 Mask[I] = I;
12891 }
12892 Value *BV = ShuffleBuilder.gather(E->Scalars);
12893 ShuffleBuilder.add(BV, Mask);
12894 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12895 }
12896
12897 if (NeedFreeze)
12898 Res = ShuffleBuilder.createFreeze(Res);
12899 return Res;
12900}
12901
12902Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
12903 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12904 Builder, *this);
12905}
12906
12907Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12908 IRBuilderBase::InsertPointGuard Guard(Builder);
12909
12910 if (E->VectorizedValue &&
12911 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12912 E->isAltShuffle())) {
12913 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12914 return E->VectorizedValue;
12915 }
12916
12917 Value *V = E->Scalars.front();
12918 Type *ScalarTy = V->getType();
12919 if (auto *Store = dyn_cast<StoreInst>(V))
12920 ScalarTy = Store->getValueOperand()->getType();
12921 else if (auto *IE = dyn_cast<InsertElementInst>(V))
12922 ScalarTy = IE->getOperand(1)->getType();
12923 auto It = MinBWs.find(E);
12924 if (It != MinBWs.end())
12925 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
12926 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
12927 if (E->isGather()) {
12928 // Set insert point for non-reduction initial nodes.
12929 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12930 setInsertPointAfterBundle(E);
12931 Value *Vec = createBuildVector(E, ScalarTy);
12932 E->VectorizedValue = Vec;
12933 return Vec;
12934 }
12935
12936 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
12937 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12938 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
12939 if (E->getOpcode() == Instruction::Store &&
12940 E->State == TreeEntry::Vectorize) {
12941 ArrayRef<int> Mask =
12942 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12943 E->ReorderIndices.size());
12944 ShuffleBuilder.add(V, Mask);
12945 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12946 ShuffleBuilder.addOrdered(V, std::nullopt);
12947 } else {
12948 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12949 }
12950 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12951 };
12952
12953 assert((E->State == TreeEntry::Vectorize ||
12954 E->State == TreeEntry::ScatterVectorize ||
12955 E->State == TreeEntry::StridedVectorize) &&
12956 "Unhandled state");
12957 unsigned ShuffleOrOp =
12958 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12959 Instruction *VL0 = E->getMainOp();
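// Signedness of operand Idx for emitting int casts: taken from MinBWs if the
// operand entry was demoted, otherwise derived from whether any of its
// scalars may be negative.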
12960 auto GetOperandSignedness = [&](unsigned Idx) {
12961 const TreeEntry *OpE = getOperandEntry(E, Idx);
12962 bool IsSigned = false;
12963 auto It = MinBWs.find(OpE);
12964 if (It != MinBWs.end())
12965 IsSigned = It->second.second;
12966 else
12967 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
12968 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12969 });
12970 return IsSigned;
12971 };
12972 switch (ShuffleOrOp) {
12973 case Instruction::PHI: {
12974 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12975 E != VectorizableTree.front().get() ||
12976 !E->UserTreeIndices.empty()) &&
12977 "PHI reordering is free.");
12978 if (PostponedPHIs && E->VectorizedValue)
12979 return E->VectorizedValue;
12980 auto *PH = cast<PHINode>(VL0);
12981 Builder.SetInsertPoint(PH->getParent(),
12982 PH->getParent()->getFirstNonPHIIt());
12983 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
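// PHIs are emitted in two passes: the first pass (PostponedPHIs) only creates
// the empty vector phi so that users can reference it; the incoming values
// are filled in on a later pass, after the other nodes have been vectorized.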
12984 if (PostponedPHIs || !E->VectorizedValue) {
12985 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
12986 E->PHI = NewPhi;
12987 Value *V = NewPhi;
12988
12989 // Adjust the insertion point once all PHIs have been generated.
12990 Builder.SetInsertPoint(PH->getParent(),
12991 PH->getParent()->getFirstInsertionPt());
12992 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12993
12994 V = FinalShuffle(V, E, VecTy);
12995
12996 E->VectorizedValue = V;
12997 if (PostponedPHIs)
12998 return V;
12999 }
13000 PHINode *NewPhi = cast<PHINode>(E->PHI);
13001 // If the phi node is fully emitted, exit.
13002 if (NewPhi->getNumIncomingValues() != 0)
13003 return NewPhi;
13004
13005 // PHINodes may have multiple entries from the same block. We want to
13006 // visit every block once.
13007 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
13008
13009 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13011 BasicBlock *IBB = PH->getIncomingBlock(I);
13012
13013 // Stop emission if all incoming values are generated.
13014 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
13015 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13016 return NewPhi;
13017 }
13018
13019 if (!VisitedBBs.insert(IBB).second) {
13020 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
13021 continue;
13022 }
13023
13024 Builder.SetInsertPoint(IBB->getTerminator());
13025 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
13026 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
13027 if (VecTy != Vec->getType()) {
13028 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
13029 MinBWs.contains(getOperandEntry(E, I))) &&
13030 "Expected item in MinBWs.");
13031 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
13032 }
13033 NewPhi->addIncoming(Vec, IBB);
13034 }
13035
13036 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
13037 "Invalid number of incoming values");
13038 return NewPhi;
13039 }
13040
13041 case Instruction::ExtractElement: {
13042 Value *V = E->getSingleOperand(0);
13043 if (const TreeEntry *TE = getTreeEntry(V))
13044 V = TE->VectorizedValue;
13045 setInsertPointAfterBundle(E);
13046 V = FinalShuffle(V, E, VecTy);
13047 E->VectorizedValue = V;
13048 return V;
13049 }
13050 case Instruction::ExtractValue: {
13051 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
13052 Builder.SetInsertPoint(LI);
13053 Value *Ptr = LI->getPointerOperand();
13054 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
13055 Value *NewV = propagateMetadata(V, E->Scalars);
13056 NewV = FinalShuffle(NewV, E, VecTy);
13057 E->VectorizedValue = NewV;
13058 return NewV;
13059 }
13060 case Instruction::InsertElement: {
13061 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
13062 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
13063 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
13064 ArrayRef<Value *> Op = E->getOperand(1);
13065 Type *ScalarTy = Op.front()->getType();
13066 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
13067 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
13068 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
13069 assert(Res.first > 0 && "Expected item in MinBWs.");
13070 V = Builder.CreateIntCast(
13071 V,
13072 getWidenedType(
13073 ScalarTy,
13074 cast<FixedVectorType>(V->getType())->getNumElements()),
13075 Res.second);
13076 }
13077
13078 // Create InsertVector shuffle if necessary
13079 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
13080 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
13081 }));
13082 const unsigned NumElts =
13083 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
13084 const unsigned NumScalars = E->Scalars.size();
13085
13086 unsigned Offset = *getElementIndex(VL0);
13087 assert(Offset < NumElts && "Failed to find vector index offset");
13088
13089 // Create shuffle to resize vector
13090 SmallVector<int> Mask;
13091 if (!E->ReorderIndices.empty()) {
13092 inversePermutation(E->ReorderIndices, Mask);
13093 Mask.append(NumElts - NumScalars, PoisonMaskElem);
13094 } else {
13095 Mask.assign(NumElts, PoisonMaskElem);
13096 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
13097 }
13098 // Create InsertVector shuffle if necessary
13099 bool IsIdentity = true;
13100 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
13101 Mask.swap(PrevMask);
13102 for (unsigned I = 0; I < NumScalars; ++I) {
13103 Value *Scalar = E->Scalars[PrevMask[I]];
13104 unsigned InsertIdx = *getElementIndex(Scalar);
13105 IsIdentity &= InsertIdx - Offset == I;
13106 Mask[InsertIdx - Offset] = I;
13107 }
13108 if (!IsIdentity || NumElts != NumScalars) {
13109 Value *V2 = nullptr;
13110 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
13111 SmallVector<int> InsertMask(Mask);
13112 if (NumElts != NumScalars && Offset == 0) {
13113 // Follow all insert element instructions from the current buildvector
13114 // sequence.
13115 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
13116 do {
13117 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
13118 if (!InsertIdx)
13119 break;
13120 if (InsertMask[*InsertIdx] == PoisonMaskElem)
13121 InsertMask[*InsertIdx] = *InsertIdx;
13122 if (!Ins->hasOneUse())
13123 break;
13124 Ins = dyn_cast_or_null<InsertElementInst>(
13125 Ins->getUniqueUndroppableUser());
13126 } while (Ins);
13127 SmallBitVector UseMask =
13128 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
13129 SmallBitVector IsFirstPoison =
13130 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13131 SmallBitVector IsFirstUndef =
13132 isUndefVector(FirstInsert->getOperand(0), UseMask);
13133 if (!IsFirstPoison.all()) {
13134 unsigned Idx = 0;
13135 for (unsigned I = 0; I < NumElts; I++) {
13136 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
13137 IsFirstUndef.test(I)) {
13138 if (IsVNonPoisonous) {
13139 InsertMask[I] = I < NumScalars ? I : 0;
13140 continue;
13141 }
13142 if (!V2)
13143 V2 = UndefValue::get(V->getType());
13144 if (Idx >= NumScalars)
13145 Idx = NumScalars - 1;
13146 InsertMask[I] = NumScalars + Idx;
13147 ++Idx;
13148 } else if (InsertMask[I] != PoisonMaskElem &&
13149 Mask[I] == PoisonMaskElem) {
13150 InsertMask[I] = PoisonMaskElem;
13151 }
13152 }
13153 } else {
13154 InsertMask = Mask;
13155 }
13156 }
13157 if (!V2)
13158 V2 = PoisonValue::get(V->getType());
13159 V = Builder.CreateShuffleVector(V, V2, InsertMask);
13160 if (auto *I = dyn_cast<Instruction>(V)) {
13161 GatherShuffleExtractSeq.insert(I);
13162 CSEBlocks.insert(I->getParent());
13163 }
13164 }
13165
13166 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
13167 for (unsigned I = 0; I < NumElts; I++) {
13168 if (Mask[I] != PoisonMaskElem)
13169 InsertMask[Offset + I] = I;
13170 }
13171 SmallBitVector UseMask =
13172 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
13173 SmallBitVector IsFirstUndef =
13174 isUndefVector(FirstInsert->getOperand(0), UseMask);
13175 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
13176 NumElts != NumScalars) {
13177 if (IsFirstUndef.all()) {
13178 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
13179 SmallBitVector IsFirstPoison =
13180 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13181 if (!IsFirstPoison.all()) {
13182 for (unsigned I = 0; I < NumElts; I++) {
13183 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
13184 InsertMask[I] = I + NumElts;
13185 }
13186 }
13187 V = Builder.CreateShuffleVector(
13188 V,
13189 IsFirstPoison.all() ? PoisonValue::get(V->getType())
13190 : FirstInsert->getOperand(0),
13191 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
13192 if (auto *I = dyn_cast<Instruction>(V)) {
13193 GatherShuffleExtractSeq.insert(I);
13194 CSEBlocks.insert(I->getParent());
13195 }
13196 }
13197 } else {
13198 SmallBitVector IsFirstPoison =
13199 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13200 for (unsigned I = 0; I < NumElts; I++) {
13201 if (InsertMask[I] == PoisonMaskElem)
13202 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
13203 else
13204 InsertMask[I] += NumElts;
13205 }
13206 V = Builder.CreateShuffleVector(
13207 FirstInsert->getOperand(0), V, InsertMask,
13208 cast<Instruction>(E->Scalars.back())->getName());
13209 if (auto *I = dyn_cast<Instruction>(V)) {
13210 GatherShuffleExtractSeq.insert(I);
13211 CSEBlocks.insert(I->getParent());
13212 }
13213 }
13214 }
13215
13216 ++NumVectorInstructions;
13217 E->VectorizedValue = V;
13218 return V;
13219 }
13220 case Instruction::ZExt:
13221 case Instruction::SExt:
13222 case Instruction::FPToUI:
13223 case Instruction::FPToSI:
13224 case Instruction::FPExt:
13225 case Instruction::PtrToInt:
13226 case Instruction::IntToPtr:
13227 case Instruction::SIToFP:
13228 case Instruction::UIToFP:
13229 case Instruction::Trunc:
13230 case Instruction::FPTrunc:
13231 case Instruction::BitCast: {
13232 setInsertPointAfterBundle(E);
13233
13234 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
13235 if (E->VectorizedValue) {
13236 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13237 return E->VectorizedValue;
13238 }
13239
13240 auto *CI = cast<CastInst>(VL0);
13241 Instruction::CastOps VecOpcode = CI->getOpcode();
13242 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
13243 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
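// If either side of the cast was demoted to a narrower integer, the scalar
// cast opcode may not fit the vector types any more: equal widths collapse to
// a bitcast, narrowing becomes a trunc, and widening becomes sext/zext based
// on the recorded signedness.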
13244 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
13245 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
13246 SrcScalarTy != CI->getOperand(0)->getType())) {
13247 // Check if the values are candidates to demote.
13248 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
13249 if (SrcIt != MinBWs.end())
13250 SrcBWSz = SrcIt->second.first;
13251 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13252 if (BWSz == SrcBWSz) {
13253 VecOpcode = Instruction::BitCast;
13254 } else if (BWSz < SrcBWSz) {
13255 VecOpcode = Instruction::Trunc;
13256 } else if (It != MinBWs.end()) {
13257 assert(BWSz > SrcBWSz && "Invalid cast!");
13258 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
13259 } else if (SrcIt != MinBWs.end()) {
13260 assert(BWSz > SrcBWSz && "Invalid cast!");
13261 VecOpcode =
13262 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
13263 }
13264 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
13265 !SrcIt->second.second) {
13266 VecOpcode = Instruction::UIToFP;
13267 }
13268 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
13269 ? InVec
13270 : Builder.CreateCast(VecOpcode, InVec, VecTy);
13271 V = FinalShuffle(V, E, VecTy);
13272
13273 E->VectorizedValue = V;
13274 ++NumVectorInstructions;
13275 return V;
13276 }
13277 case Instruction::FCmp:
13278 case Instruction::ICmp: {
13279 setInsertPointAfterBundle(E);
13280
13281 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
13282 if (E->VectorizedValue) {
13283 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13284 return E->VectorizedValue;
13285 }
13286 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
13287 if (E->VectorizedValue) {
13288 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13289 return E->VectorizedValue;
13290 }
13291 if (L->getType() != R->getType()) {
13292 assert((getOperandEntry(E, 0)->isGather() ||
13293 getOperandEntry(E, 1)->isGather() ||
13294 MinBWs.contains(getOperandEntry(E, 0)) ||
13295 MinBWs.contains(getOperandEntry(E, 1))) &&
13296 "Expected item in MinBWs.");
13297 if (cast<VectorType>(L->getType())
13298 ->getElementType()
13299 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
13300 ->getElementType()
13301 ->getIntegerBitWidth()) {
13302 Type *CastTy = R->getType();
13303 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
13304 } else {
13305 Type *CastTy = L->getType();
13306 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
13307 }
13308 }
13309
13310 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
13311 Value *V = Builder.CreateCmp(P0, L, R);
13312 propagateIRFlags(V, E->Scalars, VL0);
13313 // Do not cast for cmps.
13314 VecTy = cast<FixedVectorType>(V->getType());
13315 V = FinalShuffle(V, E, VecTy);
13316
13317 E->VectorizedValue = V;
13318 ++NumVectorInstructions;
13319 return V;
13320 }
13321 case Instruction::Select: {
13322 setInsertPointAfterBundle(E);
13323
13324 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
13325 if (E->VectorizedValue) {
13326 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13327 return E->VectorizedValue;
13328 }
13329 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
13330 if (E->VectorizedValue) {
13331 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13332 return E->VectorizedValue;
13333 }
13334 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
13335 if (E->VectorizedValue) {
13336 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13337 return E->VectorizedValue;
13338 }
13339 if (True->getType() != VecTy || False->getType() != VecTy) {
13340 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
13341 getOperandEntry(E, 2)->isGather() ||
13342 MinBWs.contains(getOperandEntry(E, 1)) ||
13343 MinBWs.contains(getOperandEntry(E, 2))) &&
13344 "Expected item in MinBWs.");
13345 if (True->getType() != VecTy)
13346 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
13347 if (False->getType() != VecTy)
13348 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
13349 }
13350
13351 unsigned CondNumElements = getNumElements(Cond->getType());
13352 unsigned TrueNumElements = getNumElements(True->getType());
13353 assert(TrueNumElements >= CondNumElements &&
13354 TrueNumElements % CondNumElements == 0 &&
13355 "Cannot vectorize Instruction::Select");
13356 assert(TrueNumElements == getNumElements(False->getType()) &&
13357 "Cannot vectorize Instruction::Select");
13358 if (CondNumElements != TrueNumElements) {
13359 // When the return type is i1 but the source is a fixed vector type, we
13360 // need to duplicate the condition value.
13361 Cond = Builder.CreateShuffleVector(
13362 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
13363 CondNumElements));
13364 }
13365 assert(getNumElements(Cond->getType()) == TrueNumElements &&
13366 "Cannot vectorize Instruction::Select");
13367 Value *V = Builder.CreateSelect(Cond, True, False);
13368 V = FinalShuffle(V, E, VecTy);
13369
13370 E->VectorizedValue = V;
13371 ++NumVectorInstructions;
13372 return V;
13373 }
13374 case Instruction::FNeg: {
13375 setInsertPointAfterBundle(E);
13376
13377 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
13378
13379 if (E->VectorizedValue) {
13380 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13381 return E->VectorizedValue;
13382 }
13383
13384 Value *V = Builder.CreateUnOp(
13385 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
13386 propagateIRFlags(V, E->Scalars, VL0);
13387 if (auto *I = dyn_cast<Instruction>(V))
13388 V = propagateMetadata(I, E->Scalars);
13389
13390 V = FinalShuffle(V, E, VecTy);
13391
13392 E->VectorizedValue = V;
13393 ++NumVectorInstructions;
13394
13395 return V;
13396 }
13397 case Instruction::Freeze: {
13398 setInsertPointAfterBundle(E);
13399
13400 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
13401
13402 if (E->VectorizedValue) {
13403 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13404 return E->VectorizedValue;
13405 }
13406
13407 Value *V = Builder.CreateFreeze(Op);
13408 V = FinalShuffle(V, E, VecTy);
13409
13410 E->VectorizedValue = V;
13411 ++NumVectorInstructions;
13412
13413 return V;
13414 }
13415 case Instruction::Add:
13416 case Instruction::FAdd:
13417 case Instruction::Sub:
13418 case Instruction::FSub:
13419 case Instruction::Mul:
13420 case Instruction::FMul:
13421 case Instruction::UDiv:
13422 case Instruction::SDiv:
13423 case Instruction::FDiv:
13424 case Instruction::URem:
13425 case Instruction::SRem:
13426 case Instruction::FRem:
13427 case Instruction::Shl:
13428 case Instruction::LShr:
13429 case Instruction::AShr:
13430 case Instruction::And:
13431 case Instruction::Or:
13432 case Instruction::Xor: {
13433 setInsertPointAfterBundle(E);
13434
13435 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
13436 if (E->VectorizedValue) {
13437 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13438 return E->VectorizedValue;
13439 }
13440 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
13441 if (E->VectorizedValue) {
13442 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13443 return E->VectorizedValue;
13444 }
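// Special case for 'and' with demoted operands: if one operand is a constant
// whose low MinBW bits are all ones, the mask is a no-op at the demoted width
// and the other (shuffled) operand can be returned directly.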
13445 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13446 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
13447 ArrayRef<Value *> Ops = E->getOperand(I);
13448 if (all_of(Ops, [&](Value *Op) {
13449 auto *CI = dyn_cast<ConstantInt>(Op);
13450 return CI && CI->getValue().countr_one() >= It->second.first;
13451 })) {
13452 V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
13453 E->VectorizedValue = V;
13454 ++NumVectorInstructions;
13455 return V;
13456 }
13457 }
13458 }
13459 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
13460 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13461 getOperandEntry(E, 1)->isGather() ||
13462 MinBWs.contains(getOperandEntry(E, 0)) ||
13463 MinBWs.contains(getOperandEntry(E, 1))) &&
13464 "Expected item in MinBWs.");
13465 if (LHS->getType() != VecTy)
13466 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
13467 if (RHS->getType() != VecTy)
13468 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
13469 }
13470
13471 Value *V = Builder.CreateBinOp(
13472 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
13473 RHS);
13474 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
13475 if (auto *I = dyn_cast<Instruction>(V)) {
13476 V = propagateMetadata(I, E->Scalars);
13477 // Drop nuw flags for abs(sub(commutative), true).
13478 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
13479 any_of(E->Scalars, [](Value *V) {
13480 return isCommutative(cast<Instruction>(V));
13481 }))
13482 I->setHasNoUnsignedWrap(/*b=*/false);
13483 }
13484
13485 V = FinalShuffle(V, E, VecTy);
13486
13487 E->VectorizedValue = V;
13488 ++NumVectorInstructions;
13489
13490 return V;
13491 }
13492 case Instruction::Load: {
13493 // Loads are inserted at the head of the tree because we don't want to
13494 // sink them all the way down past store instructions.
13495 setInsertPointAfterBundle(E);
13496
13497 LoadInst *LI = cast<LoadInst>(VL0);
13498 Instruction *NewLI;
13499 Value *PO = LI->getPointerOperand();
13500 if (E->State == TreeEntry::Vectorize) {
13501 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
13502 } else if (E->State == TreeEntry::StridedVectorize) {
13503 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
13504 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
13505 PO = IsReverseOrder ? PtrN : Ptr0;
13506 std::optional<int> Diff = getPointersDiff(
13507 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
13508 Type *StrideTy = DL->getIndexType(PO->getType());
13509 Value *StrideVal;
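        // If the distance between the first and last pointer is known at compile
        // time, derive a constant stride (in bytes); otherwise compute the stride
        // at run time via calculateRtStride and scale it by the element size.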
13510 if (Diff) {
13511 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
13512 StrideVal =
13513 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
13514 DL->getTypeAllocSize(ScalarTy));
13515 } else {
13516 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
13517 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
13518 return cast<LoadInst>(V)->getPointerOperand();
13519 });
13520 OrdersType Order;
13521 std::optional<Value *> Stride =
13522 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
13523 &*Builder.GetInsertPoint());
13524 Value *NewStride =
13525 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
13526 StrideVal = Builder.CreateMul(
13527 NewStride,
13528 ConstantInt::get(
13529 StrideTy,
13530 (IsReverseOrder ? -1 : 1) *
13531 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
13532 }
13533 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13534 auto *Inst = Builder.CreateIntrinsic(
13535 Intrinsic::experimental_vp_strided_load,
13536 {VecTy, PO->getType(), StrideTy},
13537 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
13538 Builder.getInt32(E->Scalars.size())});
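        // This emits an EVL-style strided load; e.g. for four i32 loads walked in
        // reverse order the call would look roughly like
        //   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
        //            ptr %base, i64 -4, <4 x i1> splat (i1 true), i32 4)
        // (the exact mangling/printing here is only illustrative).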
13539 Inst->addParamAttr(
13540 /*ArgNo=*/0,
13541 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13542 NewLI = Inst;
13543 } else {
13544 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
13545 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
13546 if (E->VectorizedValue) {
13547 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13548 return E->VectorizedValue;
13549 }
13550 // Use the minimum alignment of the gathered loads.
13551 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13552 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
13553 }
13554 Value *V = propagateMetadata(NewLI, E->Scalars);
13555
13556 V = FinalShuffle(V, E, VecTy);
13557 E->VectorizedValue = V;
13558 ++NumVectorInstructions;
13559 return V;
13560 }
13561 case Instruction::Store: {
13562 auto *SI = cast<StoreInst>(VL0);
13563
13564 setInsertPointAfterBundle(E);
13565
13566 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
13567 if (VecValue->getType() != VecTy)
13568 VecValue =
13569 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
13570 VecValue = FinalShuffle(VecValue, E, VecTy);
13571
13572 Value *Ptr = SI->getPointerOperand();
13573 Instruction *ST;
13574 if (E->State == TreeEntry::Vectorize) {
13575 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
13576 } else {
13577 assert(E->State == TreeEntry::StridedVectorize &&
13578 "Expected either strided or conseutive stores.");
13579 if (!E->ReorderIndices.empty()) {
13580 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
13581 Ptr = SI->getPointerOperand();
13582 }
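        // The stride below is the negated element size, i.e. consecutive vector
        // lanes are stored at decreasing addresses starting from the (possibly
        // reordered) base pointer chosen above.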
13583 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
13584 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
13585 auto *Inst = Builder.CreateIntrinsic(
13586 Intrinsic::experimental_vp_strided_store,
13587 {VecTy, Ptr->getType(), StrideTy},
13588 {VecValue, Ptr,
13589 ConstantInt::get(
13590 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
13591 Builder.getAllOnesMask(VecTy->getElementCount()),
13592 Builder.getInt32(E->Scalars.size())});
13593 Inst->addParamAttr(
13594 /*ArgNo=*/1,
13595 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13596 ST = Inst;
13597 }
13598
13599 Value *V = propagateMetadata(ST, E->Scalars);
13600
13601 E->VectorizedValue = V;
13602 ++NumVectorInstructions;
13603 return V;
13604 }
13605 case Instruction::GetElementPtr: {
13606 auto *GEP0 = cast<GetElementPtrInst>(VL0);
13607 setInsertPointAfterBundle(E);
13608
13609 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
13610 if (E->VectorizedValue) {
13611 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13612 return E->VectorizedValue;
13613 }
13614
13615 SmallVector<Value *> OpVecs;
13616 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
13617 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
13618 if (E->VectorizedValue) {
13619 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13620 return E->VectorizedValue;
13621 }
13622 OpVecs.push_back(OpVec);
13623 }
13624
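      // Build a single vector GEP from the vectorized base pointer (operand 0)
      // and one vectorized operand per scalar GEP index.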
13625 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
13626 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
13627 SmallVector<Value *> GEPs;
13628 for (Value *V : E->Scalars) {
13629 if (isa<GetElementPtrInst>(V))
13630 GEPs.push_back(V);
13631 }
13632 V = propagateMetadata(I, GEPs);
13633 }
13634
13635 V = FinalShuffle(V, E, VecTy);
13636
13637 E->VectorizedValue = V;
13638 ++NumVectorInstructions;
13639
13640 return V;
13641 }
13642 case Instruction::Call: {
13643 CallInst *CI = cast<CallInst>(VL0);
13644 setInsertPointAfterBundle(E);
13645
13646 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13647
13648 SmallVector<Type *> ArgTys =
13649     buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
13650                            It != MinBWs.end() ? It->second.first : 0);
13651 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13652 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
13653 VecCallCosts.first <= VecCallCosts.second;
13654
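      // UseIntrinsic picks between emitting the vector intrinsic and calling a
      // vectorized library function (looked up via the VFDatabase below), based
      // on the two costs returned by getVectorCallCosts.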
13655 Value *ScalarArg = nullptr;
13656 SmallVector<Value *> OpVecs;
13657 SmallVector<Type *, 2> TysForDecl;
13658 // Add return type if intrinsic is overloaded on it.
13659 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
13660 TysForDecl.push_back(VecTy);
13661 auto *CEI = cast<CallInst>(VL0);
13662 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
13663 ValueList OpVL;
13664 // Some intrinsics have scalar arguments. This argument should not be
13665 // vectorized.
13666 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
13667 ScalarArg = CEI->getArgOperand(I);
13668 // If we decided to reduce the bitwidth of the abs intrinsic, its second
13669 // argument must be set to false (do not return poison if the value is signed min).
13670 if (ID == Intrinsic::abs && It != MinBWs.end() &&
13671 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
13672 ScalarArg = Builder.getFalse();
13673 OpVecs.push_back(ScalarArg);
13674 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13675   TysForDecl.push_back(ScalarArg->getType());
13676 continue;
13677 }
13678
13679 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
13680 if (E->VectorizedValue) {
13681 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13682 return E->VectorizedValue;
13683 }
13684 ScalarArg = CEI->getArgOperand(I);
13685 if (cast<VectorType>(OpVec->getType())->getElementType() !=
13686 ScalarArg->getType()->getScalarType() &&
13687 It == MinBWs.end()) {
13688 auto *CastTy =
13689 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
13690 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
13691 } else if (It != MinBWs.end()) {
13692 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
13693 }
13694 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
13695 OpVecs.push_back(OpVec);
13696 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13697 TysForDecl.push_back(OpVec->getType());
13698 }
13699
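      // Materialize the callee: either the matching vector library function from
      // the VFDatabase or the (possibly type-overloaded) intrinsic declaration.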
13700 Function *CF;
13701 if (!UseIntrinsic) {
13702 VFShape Shape =
13703     VFShape::get(CI->getFunctionType(),
13704                  ElementCount::getFixed(
13705                      static_cast<unsigned>(VecTy->getNumElements())),
13706                  false /*HasGlobalPred*/);
13707 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
13708 } else {
13709 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
13710 }
13711
13712 SmallVector<OperandBundleDef, 1> OpBundles;
13713 CI->getOperandBundlesAsDefs(OpBundles);
13714 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
13715
13716 propagateIRFlags(V, E->Scalars, VL0);
13717 V = FinalShuffle(V, E, VecTy);
13718
13719 E->VectorizedValue = V;
13720 ++NumVectorInstructions;
13721 return V;
13722 }
13723 case Instruction::ShuffleVector: {
13724 assert(E->isAltShuffle() &&
13725 ((Instruction::isBinaryOp(E->getOpcode()) &&
13726 Instruction::isBinaryOp(E->getAltOpcode())) ||
13727 (Instruction::isCast(E->getOpcode()) &&
13728 Instruction::isCast(E->getAltOpcode())) ||
13729 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13730 "Invalid Shuffle Vector Operand");
13731
13732 Value *LHS = nullptr, *RHS = nullptr;
13733 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
13734 setInsertPointAfterBundle(E);
13735 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13736 if (E->VectorizedValue) {
13737 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13738 return E->VectorizedValue;
13739 }
13740 RHS = vectorizeOperand(E, 1, PostponedPHIs);
13741 } else {
13742 setInsertPointAfterBundle(E);
13743 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13744 }
13745 if (E->VectorizedValue) {
13746 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13747 return E->VectorizedValue;
13748 }
13749 if (LHS && RHS &&
13750 ((Instruction::isBinaryOp(E->getOpcode()) &&
13751 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13752 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
13753 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13754 getOperandEntry(E, 1)->isGather() ||
13755 MinBWs.contains(getOperandEntry(E, 0)) ||
13756 MinBWs.contains(getOperandEntry(E, 1))) &&
13757 "Expected item in MinBWs.");
13758 Type *CastTy = VecTy;
13759 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
13760 if (cast<VectorType>(LHS->getType())
13761 ->getElementType()
13762 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
13763 ->getElementType()
13764 ->getIntegerBitWidth())
13765 CastTy = RHS->getType();
13766 else
13767 CastTy = LHS->getType();
13768 }
13769 if (LHS->getType() != CastTy)
13770 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
13771 if (RHS->getType() != CastTy)
13772 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
13773 }
13774
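      // Emit both the main and the alternate operation on the whole vectors; the
      // shufflevector created below then selects, per lane, which of the two
      // results to keep, as described by the alternate-op shuffle mask.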
13775 Value *V0, *V1;
13776 if (Instruction::isBinaryOp(E->getOpcode())) {
13777 V0 = Builder.CreateBinOp(
13778 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13779 V1 = Builder.CreateBinOp(
13780 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13781 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13782 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
13783 auto *AltCI = cast<CmpInst>(E->getAltOp());
13784 CmpInst::Predicate AltPred = AltCI->getPredicate();
13785 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
13786 } else {
13787 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13788 unsigned SrcBWSz = DL->getTypeSizeInBits(
13789 cast<VectorType>(LHS->getType())->getElementType());
13790 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13791 if (BWSz <= SrcBWSz) {
13792 if (BWSz < SrcBWSz)
13793 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
13794 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13795 if (auto *I = dyn_cast<Instruction>(LHS))
13796 LHS = propagateMetadata(I, E->Scalars);
13797 E->VectorizedValue = LHS;
13798 ++NumVectorInstructions;
13799 return LHS;
13800 }
13801 }
13802 V0 = Builder.CreateCast(
13803 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
13804 V1 = Builder.CreateCast(
13805 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
13806 }
13807 // Add V0 and V1 to later analysis to try to find and remove matching
13808 // instruction, if any.
13809 for (Value *V : {V0, V1}) {
13810 if (auto *I = dyn_cast<Instruction>(V)) {
13811 GatherShuffleExtractSeq.insert(I);
13812 CSEBlocks.insert(I->getParent());
13813 }
13814 }
13815
13816 // Create shuffle to take alternate operations from the vector.
13817 // Also, gather up main and alt scalar ops to propagate IR flags to
13818 // each vector operation.
13819 ValueList OpScalars, AltScalars;
13820 SmallVector<int> Mask;
13821 E->buildAltOpShuffleMask(
13822 [E, this](Instruction *I) {
13823 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13824 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
13825 *TLI);
13826 },
13827 Mask, &OpScalars, &AltScalars);
13828
13829 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
13830 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
13831 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13832 // Drop nuw flags for abs(sub(commutative), true).
13833 if (auto *I = dyn_cast<Instruction>(Vec);
13834 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
13835 any_of(E->Scalars, [](Value *V) {
13836 auto *IV = cast<Instruction>(V);
13837 return IV->getOpcode() == Instruction::Sub &&
13838 isCommutative(cast<Instruction>(IV));
13839 }))
13840 I->setHasNoUnsignedWrap(/*b=*/false);
13841 };
13842 DropNuwFlag(V0, E->getOpcode());
13843 DropNuwFlag(V1, E->getAltOpcode());
13844
13845 Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
13846 if (auto *I = dyn_cast<Instruction>(V)) {
13847 V = propagateMetadata(I, E->Scalars);
13848 GatherShuffleExtractSeq.insert(I);
13849 CSEBlocks.insert(I->getParent());
13850 }
13851
13852 E->VectorizedValue = V;
13853 ++NumVectorInstructions;
13854
13855 return V;
13856 }
13857 default:
13858 llvm_unreachable("unknown inst");
13859 }
13860 return nullptr;
13861}
13862
13863 Value *BoUpSLP::vectorizeTree() {
13864  ExtraValueToDebugLocsMap ExternallyUsedValues;
13865 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13866 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13867}
13868
13869namespace {
13870/// Data type for handling buildvector sequences with the reused scalars from
13871/// other tree entries.
13872struct ShuffledInsertData {
13873 /// List of insertelements to be replaced by shuffles.
13874 SmallVector<InsertElementInst *> InsertElements;
13875 /// The parent vectors and shuffle mask for the given list of inserts.
13876  MapVector<Value *, SmallVector<int>> ValueMasks;
13877};
13878} // namespace
13879
13880 Value *BoUpSLP::vectorizeTree(
13881     const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13882 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13883 Instruction *ReductionRoot) {
13884 // All blocks must be scheduled before any instructions are inserted.
13885 for (auto &BSIter : BlocksSchedules) {
13886 scheduleBlock(BSIter.second.get());
13887 }
13888  // Clear the Entry-to-LastInstruction table; it can be affected by scheduling
13889  // and needs to be rebuilt.
13890 EntryToLastInstruction.clear();
13891
13892 if (ReductionRoot)
13893 Builder.SetInsertPoint(ReductionRoot->getParent(),
13894 ReductionRoot->getIterator());
13895 else
13896 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13897
13898  // Postpone emission of PHI operands to avoid cyclic dependency issues.
13899 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13900 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13901 if (TE->State == TreeEntry::Vectorize &&
13902 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13903 TE->VectorizedValue)
13904 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
13905 // Run through the list of postponed gathers and emit them, replacing the temp
13906 // emitted allocas with actual vector instructions.
13907 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13908  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13909  for (const TreeEntry *E : PostponedNodes) {
13910 auto *TE = const_cast<TreeEntry *>(E);
13911 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
13912 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13913 TE->UserTreeIndices.front().EdgeIdx)) &&
13914 VecTE->isSame(TE->Scalars))
13915         // Found a gather node that is exactly the same as one of the
13916         // vectorized nodes. This may happen after reordering.
13917 continue;
13918 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13919 TE->VectorizedValue = nullptr;
13920 auto *UserI =
13921 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13922     // If the user is a PHI node, its vector code has to be inserted right
13923     // before the block terminator. Since the node was delayed, there were some
13924     // unresolved dependencies at the moment the stub instruction was emitted.
13925     // If any of these dependencies turn out to be an operand of another PHI
13926     // coming from this same block, the position of the stub instruction becomes
13927     // invalid, because the source vector that is supposed to feed this gather
13928     // node was inserted at the end of the block [after the stub instruction].
13929     // So we need to adjust the insertion point again to the end of the block.
13930 if (isa<PHINode>(UserI)) {
13931 // Insert before all users.
13932 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13933 for (User *U : PrevVec->users()) {
13934 if (U == UserI)
13935 continue;
13936 auto *UI = dyn_cast<Instruction>(U);
13937 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
13938 continue;
13939 if (UI->comesBefore(InsertPt))
13940 InsertPt = UI;
13941 }
13942 Builder.SetInsertPoint(InsertPt);
13943 } else {
13944 Builder.SetInsertPoint(PrevVec);
13945 }
13946 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13947 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
13948 if (Vec->getType() != PrevVec->getType()) {
13949 assert(Vec->getType()->isIntOrIntVectorTy() &&
13950 PrevVec->getType()->isIntOrIntVectorTy() &&
13951 "Expected integer vector types only.");
13952 std::optional<bool> IsSigned;
13953 for (Value *V : TE->Scalars) {
13954 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13955 auto It = MinBWs.find(BaseTE);
13956 if (It != MinBWs.end()) {
13957 IsSigned = IsSigned.value_or(false) || It->second.second;
13958 if (*IsSigned)
13959 break;
13960 }
13961 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
13962 auto It = MinBWs.find(MNTE);
13963 if (It != MinBWs.end()) {
13964 IsSigned = IsSigned.value_or(false) || It->second.second;
13965 if (*IsSigned)
13966 break;
13967 }
13968 }
13969 if (IsSigned.value_or(false))
13970 break;
13971 // Scan through gather nodes.
13972 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13973 auto It = MinBWs.find(BVE);
13974 if (It != MinBWs.end()) {
13975 IsSigned = IsSigned.value_or(false) || It->second.second;
13976 if (*IsSigned)
13977 break;
13978 }
13979 }
13980 if (IsSigned.value_or(false))
13981 break;
13982 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
13983 IsSigned =
13984 IsSigned.value_or(false) ||
13985 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
13986 continue;
13987 }
13988 if (IsSigned.value_or(false))
13989 break;
13990 }
13991 }
13992 if (IsSigned.value_or(false)) {
13993 // Final attempt - check user node.
13994 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
13995 if (It != MinBWs.end())
13996 IsSigned = It->second.second;
13997 }
13998 assert(IsSigned &&
13999 "Expected user node or perfect diamond match in MinBWs.");
14000 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
14001 }
14002 PrevVec->replaceAllUsesWith(Vec);
14003 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
14004 // Replace the stub vector node, if it was used before for one of the
14005 // buildvector nodes already.
14006 auto It = PostponedValues.find(PrevVec);
14007 if (It != PostponedValues.end()) {
14008 for (TreeEntry *VTE : It->getSecond())
14009 VTE->VectorizedValue = Vec;
14010 }
14011 eraseInstruction(PrevVec);
14012 }
14013
14014 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
14015 << " values .\n");
14016
14017 SmallVector<ShuffledInsertData> ShuffledInserts;
14018 // Maps vector instruction to original insertelement instruction
14019 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
14020 // Maps extract Scalar to the corresponding extractelement instruction in the
14021 // basic block. Only one extractelement per block should be emitted.
14022  DenseMap<Value *,
14023           SmallDenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
14024      ScalarToEEs;
14025 SmallDenseSet<Value *, 4> UsedInserts;
14026  DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
14027  SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
14028 // Extract all of the elements with the external uses.
14029 for (const auto &ExternalUse : ExternalUses) {
14030 Value *Scalar = ExternalUse.Scalar;
14031 llvm::User *User = ExternalUse.User;
14032
14033 // Skip users that we already RAUW. This happens when one instruction
14034 // has multiple uses of the same value.
14035 if (User && !is_contained(Scalar->users(), User))
14036 continue;
14037 TreeEntry *E = getTreeEntry(Scalar);
14038 assert(E && "Invalid scalar");
14039 assert(!E->isGather() && "Extracting from a gather list");
14040 // Non-instruction pointers are not deleted, just skip them.
14041 if (E->getOpcode() == Instruction::GetElementPtr &&
14042 !isa<GetElementPtrInst>(Scalar))
14043 continue;
14044
14045 Value *Vec = E->VectorizedValue;
14046 assert(Vec && "Can't find vectorizable value");
14047
14048 Value *Lane = Builder.getInt32(ExternalUse.Lane);
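    // Helper: emit (or reuse) an extract for this lane and, if the tree entry
    // was narrowed via MinBWs, widen the extracted value back to the original
    // scalar type before handing it to the external user.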
14049 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
14050 if (Scalar->getType() != Vec->getType()) {
14051 Value *Ex = nullptr;
14052 Value *ExV = nullptr;
14053 auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
14054 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
14055 auto It = ScalarToEEs.find(Scalar);
14056 if (It != ScalarToEEs.end()) {
14057 // No need to emit many extracts, just move the only one in the
14058 // current block.
14059 auto EEIt = It->second.find(Builder.GetInsertBlock());
14060 if (EEIt != It->second.end()) {
14061 Instruction *I = EEIt->second.first;
14062 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
14063 Builder.GetInsertPoint()->comesBefore(I)) {
14064 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
14065 Builder.GetInsertPoint());
14066 if (auto *CI = EEIt->second.second)
14067 CI->moveAfter(I);
14068 }
14069 Ex = I;
14070 ExV = EEIt->second.second ? EEIt->second.second : Ex;
14071 }
14072 }
14073 if (!Ex) {
14074 // "Reuse" the existing extract to improve final codegen.
14075 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
14076 ES && isa<Instruction>(Vec)) {
14077 Value *V = ES->getVectorOperand();
14078 auto *IVec = cast<Instruction>(Vec);
14079 if (const TreeEntry *ETE = getTreeEntry(V))
14080 V = ETE->VectorizedValue;
14081 if (auto *IV = dyn_cast<Instruction>(V);
14082 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
14083 IV->comesBefore(IVec))
14084 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
14085 else
14086 Ex = Builder.CreateExtractElement(Vec, Lane);
14087 } else if (ReplaceGEP) {
14088 // Leave the GEPs as is, they are free in most cases and better to
14089 // keep them as GEPs.
14090 auto *CloneGEP = GEP->clone();
14091 if (isa<Instruction>(Vec))
14092 CloneGEP->insertBefore(*Builder.GetInsertBlock(),
14093 Builder.GetInsertPoint());
14094 else
14095 CloneGEP->insertBefore(GEP);
14096 if (GEP->hasName())
14097 CloneGEP->takeName(GEP);
14098 Ex = CloneGEP;
14099 } else if (auto *VecTy =
14100 dyn_cast<FixedVectorType>(Scalar->getType())) {
14101 assert(SLPReVec && "FixedVectorType is not expected.");
14102 unsigned VecTyNumElements = VecTy->getNumElements();
14103 // When REVEC is enabled, we need to extract a vector.
14104 // Note: The element size of Scalar may be different from the
14105 // element size of Vec.
14106 Ex = Builder.CreateExtractVector(
14108 VecTyNumElements),
14109 Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
14110 } else {
14111 Ex = Builder.CreateExtractElement(Vec, Lane);
14112 }
14113 // If necessary, sign-extend or zero-extend ScalarRoot
14114 // to the larger type.
14115 ExV = Ex;
14116 if (Scalar->getType() != Ex->getType())
14117 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
14118 MinBWs.find(E)->second.second);
14119 if (auto *I = dyn_cast<Instruction>(Ex))
14120 ScalarToEEs[Scalar].try_emplace(
14121 Builder.GetInsertBlock(),
14122 std::make_pair(I, cast<Instruction>(ExV)));
14123 }
14124          // The then-branch of the previous if may produce constants, since
14125          // operand 0 might be a constant.
14126 if (auto *ExI = dyn_cast<Instruction>(Ex)) {
14127 GatherShuffleExtractSeq.insert(ExI);
14128 CSEBlocks.insert(ExI->getParent());
14129 }
14130 return ExV;
14131 }
14132 assert(isa<FixedVectorType>(Scalar->getType()) &&
14133 isa<InsertElementInst>(Scalar) &&
14134 "In-tree scalar of vector type is not insertelement?");
14135 auto *IE = cast<InsertElementInst>(Scalar);
14136 VectorToInsertElement.try_emplace(Vec, IE);
14137 return Vec;
14138 };
14139 // If User == nullptr, the Scalar remains as scalar in vectorized
14140 // instructions or is used as extra arg. Generate ExtractElement instruction
14141 // and update the record for this scalar in ExternallyUsedValues.
14142 if (!User) {
14143 if (!ScalarsWithNullptrUser.insert(Scalar).second)
14144 continue;
14145 assert((ExternallyUsedValues.count(Scalar) ||
14146 Scalar->hasNUsesOrMore(UsesLimit) ||
14147 any_of(Scalar->users(),
14148 [&](llvm::User *U) {
14149 if (ExternalUsesAsGEPs.contains(U))
14150 return true;
14151 TreeEntry *UseEntry = getTreeEntry(U);
14152 return UseEntry &&
14153 (UseEntry->State == TreeEntry::Vectorize ||
14154 UseEntry->State ==
14155 TreeEntry::StridedVectorize) &&
14156 (E->State == TreeEntry::Vectorize ||
14157 E->State == TreeEntry::StridedVectorize) &&
14158 doesInTreeUserNeedToExtract(
14159 Scalar,
14160 cast<Instruction>(UseEntry->Scalars.front()),
14161 TLI);
14162 })) &&
14163 "Scalar with nullptr User must be registered in "
14164 "ExternallyUsedValues map or remain as scalar in vectorized "
14165 "instructions");
14166 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
14167 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
14168 if (PHI->getParent()->isLandingPad())
14169 Builder.SetInsertPoint(
14170 PHI->getParent(),
14171 std::next(
14172 PHI->getParent()->getLandingPadInst()->getIterator()));
14173 else
14174 Builder.SetInsertPoint(PHI->getParent(),
14175 PHI->getParent()->getFirstNonPHIIt());
14176 } else {
14177 Builder.SetInsertPoint(VecI->getParent(),
14178 std::next(VecI->getIterator()));
14179 }
14180 } else {
14181 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
14182 }
14183 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14184 // Required to update internally referenced instructions.
14185 Scalar->replaceAllUsesWith(NewInst);
14186 ReplacedExternals.emplace_back(Scalar, NewInst);
14187 continue;
14188 }
14189
14190 if (auto *VU = dyn_cast<InsertElementInst>(User);
14191 VU && VU->getOperand(1) == Scalar) {
14192 // Skip if the scalar is another vector op or Vec is not an instruction.
14193 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
14194 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
14195 if (!UsedInserts.insert(VU).second)
14196 continue;
14197 // Need to use original vector, if the root is truncated.
14198 auto BWIt = MinBWs.find(E);
14199 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
14200 auto *ScalarTy = FTy->getElementType();
14201 auto Key = std::make_pair(Vec, ScalarTy);
14202 auto VecIt = VectorCasts.find(Key);
14203 if (VecIt == VectorCasts.end()) {
14204 IRBuilderBase::InsertPointGuard Guard(Builder);
14205 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
14206 if (IVec->getParent()->isLandingPad())
14207 Builder.SetInsertPoint(IVec->getParent(),
14208 std::next(IVec->getParent()
14209 ->getLandingPadInst()
14210 ->getIterator()));
14211 else
14212 Builder.SetInsertPoint(
14213 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
14214 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
14215 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
14216 }
14217 Vec = Builder.CreateIntCast(
14218 Vec,
14219                getWidenedType(
14220                    ScalarTy,
14221 cast<FixedVectorType>(Vec->getType())->getNumElements()),
14222 BWIt->second.second);
14223 VectorCasts.try_emplace(Key, Vec);
14224 } else {
14225 Vec = VecIt->second;
14226 }
14227 }
14228
14229 std::optional<unsigned> InsertIdx = getElementIndex(VU);
14230 if (InsertIdx) {
14231 auto *It =
14232 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
14233 // Checks if 2 insertelements are from the same buildvector.
14234 InsertElementInst *VecInsert = Data.InsertElements.front();
14235              return areTwoInsertFromSameBuildVector(
14236                  VU, VecInsert,
14237 [](InsertElementInst *II) { return II->getOperand(0); });
14238 });
14239 unsigned Idx = *InsertIdx;
14240 if (It == ShuffledInserts.end()) {
14241 (void)ShuffledInserts.emplace_back();
14242 It = std::next(ShuffledInserts.begin(),
14243 ShuffledInserts.size() - 1);
14244 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14245 if (Mask.empty())
14246 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14247 // Find the insertvector, vectorized in tree, if any.
14248 Value *Base = VU;
14249 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
14250 if (IEBase != User &&
14251 (!IEBase->hasOneUse() ||
14252 getElementIndex(IEBase).value_or(Idx) == Idx))
14253 break;
14254 // Build the mask for the vectorized insertelement instructions.
14255 if (const TreeEntry *E = getTreeEntry(IEBase)) {
14256 do {
14257 IEBase = cast<InsertElementInst>(Base);
14258 int IEIdx = *getElementIndex(IEBase);
14259 assert(Mask[IEIdx] == PoisonMaskElem &&
14260 "InsertElementInstruction used already.");
14261 Mask[IEIdx] = IEIdx;
14262 Base = IEBase->getOperand(0);
14263 } while (E == getTreeEntry(Base));
14264 break;
14265 }
14266 Base = cast<InsertElementInst>(Base)->getOperand(0);
14267 // After the vectorization the def-use chain has changed, need
14268 // to look through original insertelement instructions, if they
14269 // get replaced by vector instructions.
14270 auto It = VectorToInsertElement.find(Base);
14271 if (It != VectorToInsertElement.end())
14272 Base = It->second;
14273 }
14274 }
14275 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14276 if (Mask.empty())
14277 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14278 Mask[Idx] = ExternalUse.Lane;
14279 It->InsertElements.push_back(cast<InsertElementInst>(User));
14280 continue;
14281 }
14282 }
14283 }
14284 }
14285
14286 // Generate extracts for out-of-tree users.
14287 // Find the insertion point for the extractelement lane.
14288 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
14289 if (PHINode *PH = dyn_cast<PHINode>(User)) {
14290 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
14291 if (PH->getIncomingValue(I) == Scalar) {
14292 Instruction *IncomingTerminator =
14293 PH->getIncomingBlock(I)->getTerminator();
14294 if (isa<CatchSwitchInst>(IncomingTerminator)) {
14295 Builder.SetInsertPoint(VecI->getParent(),
14296 std::next(VecI->getIterator()));
14297 } else {
14298 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
14299 }
14300 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14301 PH->setOperand(I, NewInst);
14302 }
14303 }
14304 } else {
14305 Builder.SetInsertPoint(cast<Instruction>(User));
14306 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14307 User->replaceUsesOfWith(Scalar, NewInst);
14308 }
14309 } else {
14310 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
14311 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14312 User->replaceUsesOfWith(Scalar, NewInst);
14313 }
14314
14315 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
14316 }
14317
14318 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14319 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
14320 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
14321 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14322 for (int I = 0, E = Mask.size(); I < E; ++I) {
14323 if (Mask[I] < VF)
14324 CombinedMask1[I] = Mask[I];
14325 else
14326 CombinedMask2[I] = Mask[I] - VF;
14327 }
14328 ShuffleInstructionBuilder ShuffleBuilder(
14329 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
14330 ShuffleBuilder.add(V1, CombinedMask1);
14331 if (V2)
14332 ShuffleBuilder.add(V2, CombinedMask2);
14333 return ShuffleBuilder.finalize(std::nullopt);
14334 };
14335
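  // Helper: bring Vec to the width expected by Mask; returns the (possibly
  // shuffled) vector plus a flag that is true when the emitted shuffle has
  // already applied Mask itself.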
14336 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
14337 bool ForSingleMask) {
14338 unsigned VF = Mask.size();
14339 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14340 if (VF != VecVF) {
14341 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
14342 Vec = CreateShuffle(Vec, nullptr, Mask);
14343 return std::make_pair(Vec, true);
14344 }
14345 if (!ForSingleMask) {
14346 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14347 for (unsigned I = 0; I < VF; ++I) {
14348 if (Mask[I] != PoisonMaskElem)
14349 ResizeMask[Mask[I]] = Mask[I];
14350 }
14351 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
14352 }
14353 }
14354
14355 return std::make_pair(Vec, false);
14356 };
14357 // Perform shuffling of the vectorize tree entries for better handling of
14358 // external extracts.
14359 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
14360 // Find the first and the last instruction in the list of insertelements.
14361 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
14362 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
14363 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
14364 Builder.SetInsertPoint(LastInsert);
14365 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
14366 Value *NewInst = performExtractsShuffleAction<Value>(
14367 MutableArrayRef(Vector.data(), Vector.size()),
14368 FirstInsert->getOperand(0),
14369 [](Value *Vec) {
14370 return cast<VectorType>(Vec->getType())
14371 ->getElementCount()
14372 .getKnownMinValue();
14373 },
14374 ResizeToVF,
14375 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
14376 ArrayRef<Value *> Vals) {
14377 assert((Vals.size() == 1 || Vals.size() == 2) &&
14378 "Expected exactly 1 or 2 input values.");
14379 if (Vals.size() == 1) {
14380 // Do not create shuffle if the mask is a simple identity
14381 // non-resizing mask.
14382 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
14383 ->getNumElements() ||
14384 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14385 return CreateShuffle(Vals.front(), nullptr, Mask);
14386 return Vals.front();
14387 }
14388 return CreateShuffle(Vals.front() ? Vals.front()
14389 : FirstInsert->getOperand(0),
14390 Vals.back(), Mask);
14391 });
14392 auto It = ShuffledInserts[I].InsertElements.rbegin();
14393 // Rebuild buildvector chain.
14394 InsertElementInst *II = nullptr;
14395 if (It != ShuffledInserts[I].InsertElements.rend())
14396 II = *It;
14397     SmallVector<Instruction *> Inserts;
14398     while (It != ShuffledInserts[I].InsertElements.rend()) {
14399 assert(II && "Must be an insertelement instruction.");
14400 if (*It == II)
14401 ++It;
14402 else
14403 Inserts.push_back(cast<Instruction>(II));
14404 II = dyn_cast<InsertElementInst>(II->getOperand(0));
14405 }
14406 for (Instruction *II : reverse(Inserts)) {
14407 II->replaceUsesOfWith(II->getOperand(0), NewInst);
14408 if (auto *NewI = dyn_cast<Instruction>(NewInst))
14409 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
14410 II->moveAfter(NewI);
14411 NewInst = II;
14412 }
14413 LastInsert->replaceAllUsesWith(NewInst);
14414 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
14415 IE->replaceUsesOfWith(IE->getOperand(0),
14416 PoisonValue::get(IE->getOperand(0)->getType()));
14417 IE->replaceUsesOfWith(IE->getOperand(1),
14418 PoisonValue::get(IE->getOperand(1)->getType()));
14419 eraseInstruction(IE);
14420 }
14421 CSEBlocks.insert(LastInsert->getParent());
14422 }
14423
14424 SmallVector<Instruction *> RemovedInsts;
14425 // For each vectorized value:
14426 for (auto &TEPtr : VectorizableTree) {
14427 TreeEntry *Entry = TEPtr.get();
14428
14429 // No need to handle users of gathered values.
14430 if (Entry->isGather())
14431 continue;
14432
14433 assert(Entry->VectorizedValue && "Can't find vectorizable value");
14434
14435 // For each lane:
14436 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14437 Value *Scalar = Entry->Scalars[Lane];
14438
14439 if (Entry->getOpcode() == Instruction::GetElementPtr &&
14440 !isa<GetElementPtrInst>(Scalar))
14441 continue;
14442#ifndef NDEBUG
14443 Type *Ty = Scalar->getType();
14444 if (!Ty->isVoidTy()) {
14445 for (User *U : Scalar->users()) {
14446 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
14447
14448 // It is legal to delete users in the ignorelist.
14449 assert((getTreeEntry(U) ||
14450 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14451 (isa_and_nonnull<Instruction>(U) &&
14452 isDeleted(cast<Instruction>(U)))) &&
14453 "Deleting out-of-tree value");
14454 }
14455 }
14456#endif
14457 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
14458 auto *I = cast<Instruction>(Scalar);
14459 RemovedInsts.push_back(I);
14460 }
14461 }
14462
14463 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
14464 // new vector instruction.
14465 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
14466 V->mergeDIAssignID(RemovedInsts);
14467
14468 // Clear up reduction references, if any.
14469 if (UserIgnoreList) {
14470 for (Instruction *I : RemovedInsts) {
14471 if (getTreeEntry(I)->Idx != 0)
14472 continue;
14473 SmallVector<SelectInst *> LogicalOpSelects;
14474 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
14475 // Do not replace condition of the logical op in form select <cond>.
14476 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
14477 (match(U.getUser(), m_LogicalAnd()) ||
14478 match(U.getUser(), m_LogicalOr())) &&
14479 U.getOperandNo() == 0;
14480 if (IsPoisoningLogicalOp) {
14481 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
14482 return false;
14483 }
14484 return UserIgnoreList->contains(U.getUser());
14485 });
14486 // Replace conditions of the poisoning logical ops with the non-poison
14487 // constant value.
14488 for (SelectInst *SI : LogicalOpSelects)
14489 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
14490 }
14491 }
14492 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
14493 // cache correctness.
14494   // NOTE: removeInstructionsAndOperands only marks the instructions for
14495   // deletion - they are not deleted until later.
14496 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
14497
14498 Builder.ClearInsertionPoint();
14499 InstrElementSize.clear();
14500
14501 const TreeEntry &RootTE = *VectorizableTree.front();
14502 Value *Vec = RootTE.VectorizedValue;
14503 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
14504 It != MinBWs.end() &&
14505 ReductionBitWidth != It->second.first) {
14506 IRBuilder<>::InsertPointGuard Guard(Builder);
14507 Builder.SetInsertPoint(ReductionRoot->getParent(),
14508 ReductionRoot->getIterator());
14509 Vec = Builder.CreateIntCast(
14510 Vec,
14511 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
14512 cast<VectorType>(Vec->getType())->getElementCount()),
14513 It->second.second);
14514 }
14515 return Vec;
14516}
14517
14519 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
14520 << " gather sequences instructions.\n");
14521 // LICM InsertElementInst sequences.
14522 for (Instruction *I : GatherShuffleExtractSeq) {
14523 if (isDeleted(I))
14524 continue;
14525
14526 // Check if this block is inside a loop.
14527 Loop *L = LI->getLoopFor(I->getParent());
14528 if (!L)
14529 continue;
14530
14531 // Check if it has a preheader.
14532 BasicBlock *PreHeader = L->getLoopPreheader();
14533 if (!PreHeader)
14534 continue;
14535
14536 // If the vector or the element that we insert into it are
14537 // instructions that are defined in this basic block then we can't
14538 // hoist this instruction.
14539 if (any_of(I->operands(), [L](Value *V) {
14540 auto *OpI = dyn_cast<Instruction>(V);
14541 return OpI && L->contains(OpI);
14542 }))
14543 continue;
14544
14545 // We can hoist this instruction. Move it to the pre-header.
14546 I->moveBefore(PreHeader->getTerminator());
14547 CSEBlocks.insert(PreHeader);
14548 }
14549
14550 // Make a list of all reachable blocks in our CSE queue.
14551  SmallVector<const DomTreeNode *, 8> CSEWorkList;
14552  CSEWorkList.reserve(CSEBlocks.size());
14553 for (BasicBlock *BB : CSEBlocks)
14554 if (DomTreeNode *N = DT->getNode(BB)) {
14556 CSEWorkList.push_back(N);
14557 }
14558
14559 // Sort blocks by domination. This ensures we visit a block after all blocks
14560 // dominating it are visited.
14561 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
14562 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
14563 "Different nodes should have different DFS numbers");
14564 return A->getDFSNumIn() < B->getDFSNumIn();
14565 });
14566
14567 // Less defined shuffles can be replaced by the more defined copies.
14568 // Between two shuffles one is less defined if it has the same vector operands
14569  // and its mask indices are the same as in the first one or are undefs. E.g.
14570 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
14571 // poison, <0, 0, 0, 0>.
14572 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
14573 SmallVectorImpl<int> &NewMask) {
14574 if (I1->getType() != I2->getType())
14575 return false;
14576 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
14577 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
14578 if (!SI1 || !SI2)
14579 return I1->isIdenticalTo(I2);
14580 if (SI1->isIdenticalTo(SI2))
14581 return true;
14582 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
14583 if (SI1->getOperand(I) != SI2->getOperand(I))
14584 return false;
14585 // Check if the second instruction is more defined than the first one.
14586 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
14587 ArrayRef<int> SM1 = SI1->getShuffleMask();
14588 // Count trailing undefs in the mask to check the final number of used
14589 // registers.
14590 unsigned LastUndefsCnt = 0;
14591 for (int I = 0, E = NewMask.size(); I < E; ++I) {
14592 if (SM1[I] == PoisonMaskElem)
14593 ++LastUndefsCnt;
14594 else
14595 LastUndefsCnt = 0;
14596 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
14597 NewMask[I] != SM1[I])
14598 return false;
14599 if (NewMask[I] == PoisonMaskElem)
14600 NewMask[I] = SM1[I];
14601 }
14602 // Check if the last undefs actually change the final number of used vector
14603 // registers.
14604 return SM1.size() - LastUndefsCnt > 1 &&
14605 TTI->getNumberOfParts(SI1->getType()) ==
14606             TTI->getNumberOfParts(
14607                 getWidenedType(SI1->getType()->getElementType(),
14608 SM1.size() - LastUndefsCnt));
14609 };
14610 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
14611 // instructions. TODO: We can further optimize this scan if we split the
14612 // instructions into different buckets based on the insert lane.
14613  SmallVector<Instruction *, 16> Visited;
14614  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
14615 assert(*I &&
14616 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
14617 "Worklist not sorted properly!");
14618 BasicBlock *BB = (*I)->getBlock();
14619 // For all instructions in blocks containing gather sequences:
14620 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
14621 if (isDeleted(&In))
14622 continue;
14623 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
14624 !GatherShuffleExtractSeq.contains(&In))
14625 continue;
14626
14627 // Check if we can replace this instruction with any of the
14628 // visited instructions.
14629 bool Replaced = false;
14630 for (Instruction *&V : Visited) {
14631 SmallVector<int> NewMask;
14632 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14633 DT->dominates(V->getParent(), In.getParent())) {
14634 In.replaceAllUsesWith(V);
14635 eraseInstruction(&In);
14636 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
14637 if (!NewMask.empty())
14638 SI->setShuffleMask(NewMask);
14639 Replaced = true;
14640 break;
14641 }
14642 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
14643 GatherShuffleExtractSeq.contains(V) &&
14644 IsIdenticalOrLessDefined(V, &In, NewMask) &&
14645 DT->dominates(In.getParent(), V->getParent())) {
14646 In.moveAfter(V);
14647 V->replaceAllUsesWith(&In);
14648          eraseInstruction(V);
14649          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
14650 if (!NewMask.empty())
14651 SI->setShuffleMask(NewMask);
14652 V = &In;
14653 Replaced = true;
14654 break;
14655 }
14656 }
14657 if (!Replaced) {
14658 assert(!is_contained(Visited, &In));
14659 Visited.push_back(&In);
14660 }
14661 }
14662 }
14663 CSEBlocks.clear();
14664 GatherShuffleExtractSeq.clear();
14665}
14666
14667BoUpSLP::ScheduleData *
14668BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
14669 ScheduleData *Bundle = nullptr;
14670 ScheduleData *PrevInBundle = nullptr;
14671 for (Value *V : VL) {
14672    if (doesNotNeedToBeScheduled(V))
14673      continue;
14674 ScheduleData *BundleMember = getScheduleData(V);
14675 assert(BundleMember &&
14676 "no ScheduleData for bundle member "
14677 "(maybe not in same basic block)");
14678 assert(BundleMember->isSchedulingEntity() &&
14679 "bundle member already part of other bundle");
14680 if (PrevInBundle) {
14681 PrevInBundle->NextInBundle = BundleMember;
14682 } else {
14683 Bundle = BundleMember;
14684 }
14685
14686 // Group the instructions to a bundle.
14687 BundleMember->FirstInBundle = Bundle;
14688 PrevInBundle = BundleMember;
14689 }
14690 assert(Bundle && "Failed to find schedule bundle");
14691 return Bundle;
14692}
14693
14694// Groups the instructions to a bundle (which is then a single scheduling entity)
14695// and schedules instructions until the bundle gets ready.
14696std::optional<BoUpSLP::ScheduleData *>
14697BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
14698 const InstructionsState &S) {
14699 // No need to schedule PHIs, insertelement, extractelement and extractvalue
14700 // instructions.
14701 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
14702      doesNotNeedToBeScheduled(S.OpValue))
14703    return nullptr;
14704
14705 // Initialize the instruction bundle.
14706 Instruction *OldScheduleEnd = ScheduleEnd;
14707 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
14708
14709 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
14710 ScheduleData *Bundle) {
14711 // The scheduling region got new instructions at the lower end (or it is a
14712 // new region for the first bundle). This makes it necessary to
14713 // recalculate all dependencies.
14714 // It is seldom that this needs to be done a second time after adding the
14715 // initial bundle to the region.
14716 if (ScheduleEnd != OldScheduleEnd) {
14717 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
14718 doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
14719 ReSchedule = true;
14720 }
14721 if (Bundle) {
14722 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
14723 << " in block " << BB->getName() << "\n");
14724 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
14725 }
14726
14727 if (ReSchedule) {
14728 resetSchedule();
14729 initialFillReadyList(ReadyInsts);
14730 }
14731
14732 // Now try to schedule the new bundle or (if no bundle) just calculate
14733 // dependencies. As soon as the bundle is "ready" it means that there are no
14734  // cyclic dependencies and we can schedule it. Note that it's important that we
14735 // don't "schedule" the bundle yet (see cancelScheduling).
14736 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14737 !ReadyInsts.empty()) {
14738 ScheduleData *Picked = ReadyInsts.pop_back_val();
14739 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14740 "must be ready to schedule");
14741 schedule(Picked, ReadyInsts);
14742 }
14743 };
14744
14745 // Make sure that the scheduling region contains all
14746 // instructions of the bundle.
14747 for (Value *V : VL) {
14748    if (doesNotNeedToBeScheduled(V))
14749      continue;
14750 if (!extendSchedulingRegion(V, S)) {
14751      // If the scheduling region got new instructions at the lower end (or it
14752      // is a new region for the first bundle), it is necessary to recalculate
14753      // all dependencies.
14754      // Otherwise the compiler may crash trying to incorrectly calculate
14755      // dependencies and emit instructions in the wrong order at the actual
14756      // scheduling.
14757 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
14758 return std::nullopt;
14759 }
14760 }
14761
14762 bool ReSchedule = false;
14763 for (Value *V : VL) {
14764    if (doesNotNeedToBeScheduled(V))
14765      continue;
14766 ScheduleData *BundleMember = getScheduleData(V);
14767 assert(BundleMember &&
14768 "no ScheduleData for bundle member (maybe not in same basic block)");
14769
14770 // Make sure we don't leave the pieces of the bundle in the ready list when
14771 // whole bundle might not be ready.
14772 ReadyInsts.remove(BundleMember);
14773
14774 if (!BundleMember->IsScheduled)
14775 continue;
14776    // A bundle member was scheduled as a single instruction before and now
14777 // needs to be scheduled as part of the bundle. We just get rid of the
14778 // existing schedule.
14779 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
14780 << " was already scheduled\n");
14781 ReSchedule = true;
14782 }
14783
14784 auto *Bundle = buildBundle(VL);
14785 TryScheduleBundleImpl(ReSchedule, Bundle);
14786 if (!Bundle->isReady()) {
14787 cancelScheduling(VL, S.OpValue);
14788 return std::nullopt;
14789 }
14790 return Bundle;
14791}
14792
14793void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
14794 Value *OpValue) {
14795 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
14796      doesNotNeedToBeScheduled(OpValue))
14797    return;
14798
14799 if (doesNotNeedToBeScheduled(OpValue))
14800 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
14801 ScheduleData *Bundle = getScheduleData(OpValue);
14802 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
14803 assert(!Bundle->IsScheduled &&
14804 "Can't cancel bundle which is already scheduled");
14805 assert(Bundle->isSchedulingEntity() &&
14806 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
14807 "tried to unbundle something which is not a bundle");
14808
14809 // Remove the bundle from the ready list.
14810 if (Bundle->isReady())
14811 ReadyInsts.remove(Bundle);
14812
14813 // Un-bundle: make single instructions out of the bundle.
14814 ScheduleData *BundleMember = Bundle;
14815 while (BundleMember) {
14816 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
14817 BundleMember->FirstInBundle = BundleMember;
14818 ScheduleData *Next = BundleMember->NextInBundle;
14819 BundleMember->NextInBundle = nullptr;
14820 BundleMember->TE = nullptr;
14821 if (BundleMember->unscheduledDepsInBundle() == 0) {
14822 ReadyInsts.insert(BundleMember);
14823 }
14824 BundleMember = Next;
14825 }
14826}
14827
14828BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14829 // Allocate a new ScheduleData for the instruction.
14830 if (ChunkPos >= ChunkSize) {
14831 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14832 ChunkPos = 0;
14833 }
14834 return &(ScheduleDataChunks.back()[ChunkPos++]);
14835}
14836
14837bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14838 const InstructionsState &S) {
14839 if (getScheduleData(V, isOneOf(S, V)))
14840 return true;
14841 Instruction *I = dyn_cast<Instruction>(V);
14842 assert(I && "bundle member must be an instruction");
14843 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14845 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14846 "be scheduled");
14847 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14848 ScheduleData *ISD = getScheduleData(I);
14849 if (!ISD)
14850 return false;
14851 assert(isInSchedulingRegion(ISD) &&
14852 "ScheduleData not in scheduling region");
14853 ScheduleData *SD = allocateScheduleDataChunks();
14854 SD->Inst = I;
14855 SD->init(SchedulingRegionID, S.OpValue);
14856 ExtraScheduleDataMap[I][S.OpValue] = SD;
14857 return true;
14858 };
14859 if (CheckScheduleForI(I))
14860 return true;
14861 if (!ScheduleStart) {
14862 // It's the first instruction in the new region.
14863 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
14864 ScheduleStart = I;
14865 ScheduleEnd = I->getNextNode();
14866 if (isOneOf(S, I) != I)
14867 CheckScheduleForI(I);
14868 assert(ScheduleEnd && "tried to vectorize a terminator?");
14869 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
14870 return true;
14871 }
14872 // Search up and down at the same time, because we don't know if the new
14873 // instruction is above or below the existing scheduling region.
14874 // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted
14875 // against the budget. Otherwise debug info could affect codegen.
14876  BasicBlock::reverse_iterator UpIter =
14877      ++ScheduleStart->getIterator().getReverse();
14878 BasicBlock::reverse_iterator UpperEnd = BB->rend();
14879 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14880 BasicBlock::iterator LowerEnd = BB->end();
14881 auto IsAssumeLikeIntr = [](const Instruction &I) {
14882 if (auto *II = dyn_cast<IntrinsicInst>(&I))
14883 return II->isAssumeLikeIntrinsic();
14884 return false;
14885 };
14886 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14887 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14888 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14889 &*DownIter != I) {
14890 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14891 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
14892 return false;
14893 }
14894
14895 ++UpIter;
14896 ++DownIter;
14897
14898 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14899 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14900 }
14901 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14902 assert(I->getParent() == ScheduleStart->getParent() &&
14903 "Instruction is in wrong basic block.");
14904 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
14905 ScheduleStart = I;
14906 if (isOneOf(S, I) != I)
14907 CheckScheduleForI(I);
14908 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
14909 << "\n");
14910 return true;
14911 }
14912 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14913 "Expected to reach top of the basic block or instruction down the "
14914 "lower end.");
14915 assert(I->getParent() == ScheduleEnd->getParent() &&
14916 "Instruction is in wrong basic block.");
14917 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
14918 nullptr);
14919 ScheduleEnd = I->getNextNode();
14920 if (isOneOf(S, I) != I)
14921 CheckScheduleForI(I);
14922 assert(ScheduleEnd && "tried to vectorize a terminator?");
14923 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14924 return true;
14925}
14926
14927void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14928 Instruction *ToI,
14929 ScheduleData *PrevLoadStore,
14930 ScheduleData *NextLoadStore) {
14931 ScheduleData *CurrentLoadStore = PrevLoadStore;
14932 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
 14933 // No need to allocate data for non-schedulable instructions.
 14934 if (doesNotNeedToBeScheduled(I))
 14935 continue;
14936 ScheduleData *SD = ScheduleDataMap.lookup(I);
14937 if (!SD) {
14938 SD = allocateScheduleDataChunks();
14939 ScheduleDataMap[I] = SD;
14940 SD->Inst = I;
14941 }
14942 assert(!isInSchedulingRegion(SD) &&
14943 "new ScheduleData already in scheduling region");
14944 SD->init(SchedulingRegionID, I);
14945
14946 if (I->mayReadOrWriteMemory() &&
14947 (!isa<IntrinsicInst>(I) ||
14948 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14949 cast<IntrinsicInst>(I)->getIntrinsicID() !=
14950 Intrinsic::pseudoprobe))) {
14951 // Update the linked list of memory accessing instructions.
14952 if (CurrentLoadStore) {
14953 CurrentLoadStore->NextLoadStore = SD;
14954 } else {
14955 FirstLoadStoreInRegion = SD;
14956 }
14957 CurrentLoadStore = SD;
14958 }
14959
14960 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14961 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14962 RegionHasStackSave = true;
14963 }
14964 if (NextLoadStore) {
14965 if (CurrentLoadStore)
14966 CurrentLoadStore->NextLoadStore = NextLoadStore;
14967 } else {
14968 LastLoadStoreInRegion = CurrentLoadStore;
14969 }
14970}
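// A standalone sketch (not part of this pass) of the side structure built
// above: while walking the region, only nodes that may touch memory are
// threaded onto a NextLoadStore chain, so the memory-dependence scan later
// visits loads/stores only. Names below are illustrative stand-ins.
#if 0
#include <cstdio>
#include <vector>

struct Node {
  int Id = 0;
  bool MayTouchMemory = false;
  Node *NextLoadStore = nullptr;
};

// Link the memory-accessing nodes into a chain and return its head,
// mirroring FirstLoadStoreInRegion / NextLoadStore.
static Node *linkLoadStores(std::vector<Node> &Nodes) {
  Node *Head = nullptr, *Cur = nullptr;
  for (Node &N : Nodes) {
    if (!N.MayTouchMemory)
      continue;
    if (Cur)
      Cur->NextLoadStore = &N;
    else
      Head = &N;
    Cur = &N;
  }
  return Head;
}

int main() {
  std::vector<Node> Nodes(6);
  for (int I = 0; I < 6; ++I) {
    Nodes[I].Id = I;
    Nodes[I].MayTouchMemory = (I % 2 == 0); // nodes 0, 2, 4 touch memory
  }
  for (Node *N = linkLoadStores(Nodes); N; N = N->NextLoadStore)
    std::printf("load/store node %d\n", N->Id);
}
#endif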
14971
14972void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14973 bool InsertInReadyList,
14974 BoUpSLP *SLP) {
14975 assert(SD->isSchedulingEntity());
14976
14978 WorkList.push_back(SD);
14979
14980 while (!WorkList.empty()) {
14981 ScheduleData *SD = WorkList.pop_back_val();
14982 for (ScheduleData *BundleMember = SD; BundleMember;
14983 BundleMember = BundleMember->NextInBundle) {
14984 assert(isInSchedulingRegion(BundleMember));
14985 if (BundleMember->hasValidDependencies())
14986 continue;
14987
14988 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14989 << "\n");
14990 BundleMember->Dependencies = 0;
14991 BundleMember->resetUnscheduledDeps();
14992
14993 // Handle def-use chain dependencies.
14994 if (BundleMember->OpValue != BundleMember->Inst) {
14995 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14996 BundleMember->Dependencies++;
14997 ScheduleData *DestBundle = UseSD->FirstInBundle;
14998 if (!DestBundle->IsScheduled)
14999 BundleMember->incrementUnscheduledDeps(1);
15000 if (!DestBundle->hasValidDependencies())
15001 WorkList.push_back(DestBundle);
15002 }
15003 } else {
15004 for (User *U : BundleMember->Inst->users()) {
15005 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
15006 BundleMember->Dependencies++;
15007 ScheduleData *DestBundle = UseSD->FirstInBundle;
15008 if (!DestBundle->IsScheduled)
15009 BundleMember->incrementUnscheduledDeps(1);
15010 if (!DestBundle->hasValidDependencies())
15011 WorkList.push_back(DestBundle);
15012 }
15013 }
15014 }
15015
15016 auto MakeControlDependent = [&](Instruction *I) {
15017 auto *DepDest = getScheduleData(I);
15018 assert(DepDest && "must be in schedule window");
15019 DepDest->ControlDependencies.push_back(BundleMember);
15020 BundleMember->Dependencies++;
15021 ScheduleData *DestBundle = DepDest->FirstInBundle;
15022 if (!DestBundle->IsScheduled)
15023 BundleMember->incrementUnscheduledDeps(1);
15024 if (!DestBundle->hasValidDependencies())
15025 WorkList.push_back(DestBundle);
15026 };
15027
 15028 // Any instruction which isn't safe to speculate at the beginning of the
 15029 // block is control dependent on any early exit or non-willreturn call
 15030 // which precedes it.
15031 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
15032 for (Instruction *I = BundleMember->Inst->getNextNode();
15033 I != ScheduleEnd; I = I->getNextNode()) {
15034 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
15035 continue;
15036
15037 // Add the dependency
15038 MakeControlDependent(I);
15039
 15040 if (!isGuaranteedToTransferExecutionToSuccessor(I))
 15041 // Everything past here must be control dependent on I.
 15042 break;
15043 }
15044 }
15045
15046 if (RegionHasStackSave) {
 15047 // If we have an inalloca alloca instruction, it needs to be scheduled
 15048 // after any preceding stacksave. We also need to prevent any alloca
 15049 // from reordering above a preceding stackrestore.
15050 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
15051 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
15052 for (Instruction *I = BundleMember->Inst->getNextNode();
15053 I != ScheduleEnd; I = I->getNextNode()) {
15054 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
15055 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
 15056 // Any allocas past here must be control dependent on I, and I
 15057 // must be memory dependent on BundleMember->Inst.
15058 break;
15059
15060 if (!isa<AllocaInst>(I))
15061 continue;
15062
15063 // Add the dependency
15064 MakeControlDependent(I);
15065 }
15066 }
15067
 15068 // In addition to the cases handled just above, we need to prevent
 15069 // allocas and loads/stores from moving below a stacksave or a
 15070 // stackrestore. Avoiding moving allocas below a stackrestore is
 15071 // currently thought to be a conservatism. Moving loads/stores below a
 15072 // stackrestore can lead to incorrect code.
15073 if (isa<AllocaInst>(BundleMember->Inst) ||
15074 BundleMember->Inst->mayReadOrWriteMemory()) {
15075 for (Instruction *I = BundleMember->Inst->getNextNode();
15076 I != ScheduleEnd; I = I->getNextNode()) {
15077 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
15078 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
15079 continue;
15080
15081 // Add the dependency
15082 MakeControlDependent(I);
15083 break;
15084 }
15085 }
15086 }
15087
15088 // Handle the memory dependencies (if any).
15089 ScheduleData *DepDest = BundleMember->NextLoadStore;
15090 if (!DepDest)
15091 continue;
15092 Instruction *SrcInst = BundleMember->Inst;
15093 assert(SrcInst->mayReadOrWriteMemory() &&
 15094 "NextLoadStore list for a non-memory-affecting bundle?");
15095 MemoryLocation SrcLoc = getLocation(SrcInst);
15096 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
15097 unsigned NumAliased = 0;
15098 unsigned DistToSrc = 1;
15099
15100 for (; DepDest; DepDest = DepDest->NextLoadStore) {
15101 assert(isInSchedulingRegion(DepDest));
15102
15103 // We have two limits to reduce the complexity:
15104 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
15105 // SLP->isAliased (which is the expensive part in this loop).
15106 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
15107 // the whole loop (even if the loop is fast, it's quadratic).
15108 // It's important for the loop break condition (see below) to
15109 // check this limit even between two read-only instructions.
15110 if (DistToSrc >= MaxMemDepDistance ||
15111 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
15112 (NumAliased >= AliasedCheckLimit ||
15113 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
15114
15115 // We increment the counter only if the locations are aliased
15116 // (instead of counting all alias checks). This gives a better
15117 // balance between reduced runtime and accurate dependencies.
15118 NumAliased++;
15119
15120 DepDest->MemoryDependencies.push_back(BundleMember);
15121 BundleMember->Dependencies++;
15122 ScheduleData *DestBundle = DepDest->FirstInBundle;
15123 if (!DestBundle->IsScheduled) {
15124 BundleMember->incrementUnscheduledDeps(1);
15125 }
15126 if (!DestBundle->hasValidDependencies()) {
15127 WorkList.push_back(DestBundle);
15128 }
15129 }
15130
15131 // Example, explaining the loop break condition: Let's assume our
15132 // starting instruction is i0 and MaxMemDepDistance = 3.
15133 //
15134 // +--------v--v--v
15135 // i0,i1,i2,i3,i4,i5,i6,i7,i8
15136 // +--------^--^--^
15137 //
15138 // MaxMemDepDistance let us stop alias-checking at i3 and we add
15139 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
15140 // Previously we already added dependencies from i3 to i6,i7,i8
15141 // (because of MaxMemDepDistance). As we added a dependency from
15142 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
15143 // and we can abort this loop at i6.
15144 if (DistToSrc >= 2 * MaxMemDepDistance)
15145 break;
15146 DistToSrc++;
15147 }
15148 }
15149 if (InsertInReadyList && SD->isReady()) {
15150 ReadyInsts.insert(SD);
15151 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
15152 << "\n");
15153 }
15154 }
15155}
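// A standalone sketch (not part of this pass) of the two budgets used in the
// memory-dependence scan above: once DistToSrc reaches MaxMemDepDistance or
// the alias-check budget is spent, a dependence is recorded conservatively
// without an alias query, and the scan stops at twice that distance because
// later dependences are already implied transitively. Numbers and names are
// illustrative only.
#if 0
#include <cstdio>
#include <vector>

static int countDeps(const std::vector<bool> &Aliases, unsigned AliasLimit,
                     unsigned MaxDist) {
  int Deps = 0;
  unsigned NumAliased = 0, DistToSrc = 1;
  for (bool MayAlias : Aliases) {
    if (DistToSrc >= MaxDist || NumAliased >= AliasLimit || MayAlias) {
      ++NumAliased; // counted only when a dependence is actually recorded
      ++Deps;
    }
    if (DistToSrc >= 2 * MaxDist)
      break;
    ++DistToSrc;
  }
  return Deps;
}

int main() {
  // Ten later memory ops; only op #2 truly aliases the source instruction.
  std::vector<bool> Aliases(10, false);
  Aliases[2] = true;
  std::printf("dependences recorded: %d\n", countDeps(Aliases, 16, 4)); // 6
}
#endif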
15156
15157void BoUpSLP::BlockScheduling::resetSchedule() {
15158 assert(ScheduleStart &&
15159 "tried to reset schedule on block which has not been scheduled");
15160 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
15161 doForAllOpcodes(I, [&](ScheduleData *SD) {
15162 assert(isInSchedulingRegion(SD) &&
15163 "ScheduleData not in scheduling region");
15164 SD->IsScheduled = false;
15165 SD->resetUnscheduledDeps();
15166 });
15167 }
15168 ReadyInsts.clear();
15169}
15170
15171void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
15172 if (!BS->ScheduleStart)
15173 return;
15174
15175 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
15176
15177 // A key point - if we got here, pre-scheduling was able to find a valid
15178 // scheduling of the sub-graph of the scheduling window which consists
15179 // of all vector bundles and their transitive users. As such, we do not
15180 // need to reschedule anything *outside of* that subgraph.
15181
15182 BS->resetSchedule();
15183
15184 // For the real scheduling we use a more sophisticated ready-list: it is
15185 // sorted by the original instruction location. This lets the final schedule
15186 // be as close as possible to the original instruction order.
15187 // WARNING: If changing this order causes a correctness issue, that means
15188 // there is some missing dependence edge in the schedule data graph.
15189 struct ScheduleDataCompare {
15190 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
15191 return SD2->SchedulingPriority < SD1->SchedulingPriority;
15192 }
15193 };
15194 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
15195
15196 // Ensure that all dependency data is updated (for nodes in the sub-graph)
15197 // and fill the ready-list with initial instructions.
15198 int Idx = 0;
15199 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
15200 I = I->getNextNode()) {
15201 BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
15202 TreeEntry *SDTE = getTreeEntry(SD->Inst);
15203 (void)SDTE;
 15204 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
 15205 SD->isPartOfBundle() ==
15206 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
15207 "scheduler and vectorizer bundle mismatch");
15208 SD->FirstInBundle->SchedulingPriority = Idx++;
15209
15210 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
15211 BS->calculateDependencies(SD, false, this);
15212 });
15213 }
15214 BS->initialFillReadyList(ReadyInsts);
15215
15216 Instruction *LastScheduledInst = BS->ScheduleEnd;
15217
15218 // Do the "real" scheduling.
15219 while (!ReadyInsts.empty()) {
15220 ScheduleData *Picked = *ReadyInsts.begin();
15221 ReadyInsts.erase(ReadyInsts.begin());
15222
15223 // Move the scheduled instruction(s) to their dedicated places, if not
15224 // there yet.
15225 for (ScheduleData *BundleMember = Picked; BundleMember;
15226 BundleMember = BundleMember->NextInBundle) {
15227 Instruction *PickedInst = BundleMember->Inst;
15228 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
15229 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
15230 LastScheduledInst = PickedInst;
15231 }
15232
15233 BS->schedule(Picked, ReadyInsts);
15234 }
15235
15236 // Check that we didn't break any of our invariants.
15237#ifdef EXPENSIVE_CHECKS
15238 BS->verify();
15239#endif
15240
15241#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
15242 // Check that all schedulable entities got scheduled
15243 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
15244 BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
15245 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
15246 assert(SD->IsScheduled && "must be scheduled at this point");
15247 }
15248 });
15249 }
15250#endif
15251
15252 // Avoid duplicate scheduling of the block.
15253 BS->ScheduleStart = nullptr;
15254}
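// A standalone sketch (not part of this pass) of the ordered ready list used
// above: bundles are kept in a std::set keyed on their original position so
// *begin() is always the next bundle to place. The real comparator orders by
// descending SchedulingPriority because bundles are moved bottom-up; this
// sketch uses ascending order purely for readability, and all names are
// stand-ins.
#if 0
#include <cstdio>
#include <set>

struct Bundle {
  int SchedulingPriority;
  const char *Name;
};

struct ByPriority {
  bool operator()(const Bundle *A, const Bundle *B) const {
    return A->SchedulingPriority < B->SchedulingPriority;
  }
};

int main() {
  Bundle A{2, "add"}, B{0, "load"}, C{1, "mul"};
  std::set<Bundle *, ByPriority> Ready = {&A, &B, &C};
  while (!Ready.empty()) {
    Bundle *Picked = *Ready.begin(); // lowest priority value first
    Ready.erase(Ready.begin());
    std::printf("emit %s (priority %d)\n", Picked->Name,
                Picked->SchedulingPriority);
  }
}
#endif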
15255
15256unsigned BoUpSLP::getVectorElementSize(Value *V) {
 15257 // If V is a store, just return the width of the stored value (or value
15258 // truncated just before storing) without traversing the expression tree.
15259 // This is the common case.
15260 if (auto *Store = dyn_cast<StoreInst>(V))
15261 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
15262
15263 if (auto *IEI = dyn_cast<InsertElementInst>(V))
15264 return getVectorElementSize(IEI->getOperand(1));
15265
15266 auto E = InstrElementSize.find(V);
15267 if (E != InstrElementSize.end())
15268 return E->second;
15269
15270 // If V is not a store, we can traverse the expression tree to find loads
15271 // that feed it. The type of the loaded value may indicate a more suitable
15272 // width than V's type. We want to base the vector element size on the width
15273 // of memory operations where possible.
 15274 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
 15275 SmallPtrSet<Instruction *, 16> Visited;
 15276 if (auto *I = dyn_cast<Instruction>(V)) {
15277 Worklist.emplace_back(I, I->getParent(), 0);
15278 Visited.insert(I);
15279 }
15280
15281 // Traverse the expression tree in bottom-up order looking for loads. If we
15282 // encounter an instruction we don't yet handle, we give up.
15283 auto Width = 0u;
15284 Value *FirstNonBool = nullptr;
15285 while (!Worklist.empty()) {
15286 auto [I, Parent, Level] = Worklist.pop_back_val();
15287
15288 // We should only be looking at scalar instructions here. If the current
15289 // instruction has a vector type, skip.
15290 auto *Ty = I->getType();
15291 if (isa<VectorType>(Ty))
15292 continue;
15293 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
15294 FirstNonBool = I;
15295 if (Level > RecursionMaxDepth)
15296 continue;
15297
 15298 // If the current instruction is a load, update Width to reflect the
 15299 // width of the loaded value.
15300 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
15301 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
15302
15303 // Otherwise, we need to visit the operands of the instruction. We only
15304 // handle the interesting cases from buildTree here. If an operand is an
15305 // instruction we haven't yet visited and from the same basic block as the
 15306 // user, or the user is a PHI node, we add it to the worklist.
 15307 if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
 15308 BinaryOperator, UnaryOperator>(I)) {
 15309 for (Use &U : I->operands()) {
15310 if (auto *J = dyn_cast<Instruction>(U.get()))
15311 if (Visited.insert(J).second &&
15312 (isa<PHINode>(I) || J->getParent() == Parent)) {
15313 Worklist.emplace_back(J, J->getParent(), Level + 1);
15314 continue;
15315 }
15316 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
15317 FirstNonBool = U.get();
15318 }
15319 } else {
15320 break;
15321 }
15322 }
15323
15324 // If we didn't encounter a memory access in the expression tree, or if we
15325 // gave up for some reason, just return the width of V. Otherwise, return the
15326 // maximum width we found.
15327 if (!Width) {
15328 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
15329 V = FirstNonBool;
15330 Width = DL->getTypeSizeInBits(V->getType());
15331 }
15332
15333 for (Instruction *I : Visited)
15334 InstrElementSize[I] = Width;
15335
15336 return Width;
15337}
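// A standalone sketch (not part of this pass) of the bottom-up traversal
// above: walk the operands of the root with a worklist, remember the widest
// load encountered, and fall back to the root's own width if no load feeds
// it. The toy Expr type and all names are illustrative stand-ins.
#if 0
#include <algorithm>
#include <cstdio>
#include <unordered_set>
#include <vector>

struct Expr {
  bool IsLoad = false;
  unsigned Bits = 32;
  std::vector<const Expr *> Ops;
};

static unsigned elementSize(const Expr *Root) {
  unsigned Width = 0;
  std::vector<const Expr *> Worklist{Root};
  std::unordered_set<const Expr *> Visited{Root};
  while (!Worklist.empty()) {
    const Expr *E = Worklist.back();
    Worklist.pop_back();
    if (E->IsLoad)
      Width = std::max(Width, E->Bits);
    for (const Expr *Op : E->Ops)
      if (Visited.insert(Op).second)
        Worklist.push_back(Op);
  }
  return Width ? Width : Root->Bits;
}

int main() {
  Expr L8{true, 8, {}}, L16{true, 16, {}};
  Expr Add{false, 32, {&L8, &L16}};
  std::printf("element size = %u bits\n", elementSize(&Add)); // 16
}
#endif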
15338
15339bool BoUpSLP::collectValuesToDemote(
15340 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
 15341 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
 15342 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
15343 bool IsTruncRoot) const {
15344 // We can always demote constants.
15345 if (all_of(E.Scalars, IsaPred<Constant>))
15346 return true;
15347
15348 unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
15349 if (OrigBitWidth == BitWidth) {
15350 MaxDepthLevel = 1;
15351 return true;
15352 }
15353
 15354 // If the value is not a vectorized instruction in the expression, is not used
 15355 // by an insertelement instruction, and is not used in multiple vector nodes, it
 15356 // cannot be demoted.
15357 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
15358 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15359 });
15360 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
15361 if (MultiNodeScalars.contains(V))
15362 return false;
 15363 // For the last shuffle of sext/zext with many uses, we need to check the extra
 15364 // bit for unsigned values; otherwise we may get incorrect casting for reused
 15365 // scalars.
15366 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
15367 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
15368 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15369 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
15370 return true;
15371 }
15372 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15373 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
15374 if (IsSignedNode)
15375 ++BitWidth1;
15376 if (auto *I = dyn_cast<Instruction>(V)) {
15377 APInt Mask = DB->getDemandedBits(I);
15378 unsigned BitWidth2 =
15379 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
15380 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
15381 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
15382 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
15383 break;
15384 BitWidth2 *= 2;
15385 }
15386 BitWidth1 = std::min(BitWidth1, BitWidth2);
15387 }
15388 BitWidth = std::max(BitWidth, BitWidth1);
15389 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
15390 };
15391 using namespace std::placeholders;
15392 auto FinalAnalysis = [&]() {
15393 if (!IsProfitableToDemote)
15394 return false;
15395 bool Res = all_of(
15396 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
15397 // Demote gathers.
15398 if (Res && E.isGather()) {
15399 // Check possible extractelement instructions bases and final vector
15400 // length.
15401 SmallPtrSet<Value *, 4> UniqueBases;
15402 for (Value *V : E.Scalars) {
15403 auto *EE = dyn_cast<ExtractElementInst>(V);
15404 if (!EE)
15405 continue;
15406 UniqueBases.insert(EE->getVectorOperand());
15407 }
15408 const unsigned VF = E.Scalars.size();
15409 Type *OrigScalarTy = E.Scalars.front()->getType();
15410 if (UniqueBases.size() <= 2 ||
 15411 TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
 15412 TTI->getNumberOfParts(getWidenedType(
 15413 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
15414 ToDemote.push_back(E.Idx);
15415 }
15416 return Res;
15417 };
15418 if (E.isGather() || !Visited.insert(&E).second ||
15419 any_of(E.Scalars, [&](Value *V) {
15420 return all_of(V->users(), [&](User *U) {
15421 return isa<InsertElementInst>(U) && !getTreeEntry(U);
15422 });
15423 }))
15424 return FinalAnalysis();
15425
15426 if (any_of(E.Scalars, [&](Value *V) {
15427 return !all_of(V->users(), [=](User *U) {
15428 return getTreeEntry(U) ||
15429 (UserIgnoreList && UserIgnoreList->contains(U)) ||
15430 (!isa<CmpInst>(U) && U->getType()->isSized() &&
15431 !U->getType()->isScalableTy() &&
15432 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
15433 }) && !IsPotentiallyTruncated(V, BitWidth);
15434 }))
15435 return false;
15436
15437 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
15438 bool &NeedToExit) {
15439 NeedToExit = false;
15440 unsigned InitLevel = MaxDepthLevel;
15441 for (const TreeEntry *Op : Operands) {
15442 unsigned Level = InitLevel;
15443 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
15444 ToDemote, Visited, Level, IsProfitableToDemote,
15445 IsTruncRoot)) {
15446 if (!IsProfitableToDemote)
15447 return false;
15448 NeedToExit = true;
15449 if (!FinalAnalysis())
15450 return false;
15451 continue;
15452 }
15453 MaxDepthLevel = std::max(MaxDepthLevel, Level);
15454 }
15455 return true;
15456 };
15457 auto AttemptCheckBitwidth =
15458 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
15459 // Try all bitwidth < OrigBitWidth.
15460 NeedToExit = false;
15461 unsigned BestFailBitwidth = 0;
15462 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
15463 if (Checker(BitWidth, OrigBitWidth))
15464 return true;
15465 if (BestFailBitwidth == 0 && FinalAnalysis())
15466 BestFailBitwidth = BitWidth;
15467 }
15468 if (BitWidth >= OrigBitWidth) {
15469 if (BestFailBitwidth == 0) {
15470 BitWidth = OrigBitWidth;
15471 return false;
15472 }
15473 MaxDepthLevel = 1;
15474 BitWidth = BestFailBitwidth;
15475 NeedToExit = true;
15476 return true;
15477 }
15478 return false;
15479 };
15480 auto TryProcessInstruction =
 15481 [&](unsigned &BitWidth,
 15482 ArrayRef<const TreeEntry *> Operands = {},
 15483 function_ref<bool(unsigned, unsigned)> Checker = {}) {
15484 if (Operands.empty()) {
15485 if (!IsTruncRoot)
15486 MaxDepthLevel = 1;
15487 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15488 std::ref(BitWidth)));
15489 } else {
15490 // Several vectorized uses? Check if we can truncate it, otherwise -
15491 // exit.
15492 if (E.UserTreeIndices.size() > 1 &&
15493 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15494 std::ref(BitWidth))))
15495 return false;
15496 bool NeedToExit = false;
15497 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15498 return false;
15499 if (NeedToExit)
15500 return true;
15501 if (!ProcessOperands(Operands, NeedToExit))
15502 return false;
15503 if (NeedToExit)
15504 return true;
15505 }
15506
15507 ++MaxDepthLevel;
15508 // Record the entry that we can demote.
15509 ToDemote.push_back(E.Idx);
15510 return IsProfitableToDemote;
15511 };
15512 switch (E.getOpcode()) {
15513
15514 // We can always demote truncations and extensions. Since truncations can
15515 // seed additional demotion, we save the truncated value.
15516 case Instruction::Trunc:
15517 if (IsProfitableToDemoteRoot)
15518 IsProfitableToDemote = true;
15519 return TryProcessInstruction(BitWidth);
15520 case Instruction::ZExt:
15521 case Instruction::SExt:
15522 IsProfitableToDemote = true;
15523 return TryProcessInstruction(BitWidth);
15524
15525 // We can demote certain binary operations if we can demote both of their
15526 // operands.
15527 case Instruction::Add:
15528 case Instruction::Sub:
15529 case Instruction::Mul:
15530 case Instruction::And:
15531 case Instruction::Or:
15532 case Instruction::Xor: {
15533 return TryProcessInstruction(
15534 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
15535 }
15536 case Instruction::Shl: {
 15537 // If we are truncating the result of this SHL, and if it's a shift of an
 15538 // in-range amount, we can always perform a SHL in a smaller type.
15539 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
15540 return all_of(E.Scalars, [&](Value *V) {
15541 auto *I = cast<Instruction>(V);
15542 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15543 return AmtKnownBits.getMaxValue().ult(BitWidth);
15544 });
15545 };
15546 return TryProcessInstruction(
15547 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
15548 }
15549 case Instruction::LShr: {
15550 // If this is a truncate of a logical shr, we can truncate it to a smaller
15551 // lshr iff we know that the bits we would otherwise be shifting in are
15552 // already zeros.
15553 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15554 return all_of(E.Scalars, [&](Value *V) {
15555 auto *I = cast<Instruction>(V);
15556 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15557 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15558 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15559 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
15560 SimplifyQuery(*DL));
15561 });
15562 };
15563 return TryProcessInstruction(
15564 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15565 LShrChecker);
15566 }
15567 case Instruction::AShr: {
 15568 // If this is a truncate of an arithmetic shr, we can truncate it to a
 15569 // smaller ashr iff we know that all the bits from the sign bit of the
 15570 // original type down to the sign bit of the truncated type are the same.
15571 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15572 return all_of(E.Scalars, [&](Value *V) {
15573 auto *I = cast<Instruction>(V);
15574 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15575 unsigned ShiftedBits = OrigBitWidth - BitWidth;
15576 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15577 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15578 nullptr, DT);
15579 });
15580 };
15581 return TryProcessInstruction(
15582 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15583 AShrChecker);
15584 }
15585 case Instruction::UDiv:
15586 case Instruction::URem: {
15587 // UDiv and URem can be truncated if all the truncated bits are zero.
15588 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15589 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15590 return all_of(E.Scalars, [&](Value *V) {
15591 auto *I = cast<Instruction>(V);
15592 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15593 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
15594 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15595 });
15596 };
15597 return TryProcessInstruction(
15598 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
15599 }
15600
15601 // We can demote selects if we can demote their true and false values.
15602 case Instruction::Select: {
15603 return TryProcessInstruction(
15604 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
15605 }
15606
15607 // We can demote phis if we can demote all their incoming operands. Note that
15608 // we don't need to worry about cycles since we ensure single use above.
15609 case Instruction::PHI: {
15610 const unsigned NumOps = E.getNumOperands();
 15611 SmallVector<const TreeEntry *> Ops(NumOps);
 15612 transform(seq<unsigned>(0, NumOps), Ops.begin(),
15613 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
15614
15615 return TryProcessInstruction(BitWidth, Ops);
15616 }
15617
15618 case Instruction::Call: {
15619 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
15620 if (!IC)
15621 break;
 15622 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
 15623 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
15624 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
15625 break;
15626 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
15627 function_ref<bool(unsigned, unsigned)> CallChecker;
15628 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15629 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15630 return all_of(E.Scalars, [&](Value *V) {
15631 auto *I = cast<Instruction>(V);
15632 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15633 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15634 return MaskedValueIsZero(I->getOperand(0), Mask,
15635 SimplifyQuery(*DL)) &&
15636 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15637 }
15638 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
15639 "Expected min/max intrinsics only.");
15640 unsigned SignBits = OrigBitWidth - BitWidth;
15641 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
15642 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15643 nullptr, DT);
15644 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
15645 nullptr, DT);
15646 return SignBits <= Op0SignBits &&
15647 ((SignBits != Op0SignBits &&
15648 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
15649 MaskedValueIsZero(I->getOperand(0), Mask,
15650 SimplifyQuery(*DL))) &&
15651 SignBits <= Op1SignBits &&
15652 ((SignBits != Op1SignBits &&
15653 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
15654 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
15655 });
15656 };
15657 if (ID != Intrinsic::abs) {
15658 Operands.push_back(getOperandEntry(&E, 1));
15659 CallChecker = CompChecker;
15660 }
15661 InstructionCost BestCost =
15662 std::numeric_limits<InstructionCost::CostType>::max();
15663 unsigned BestBitWidth = BitWidth;
15664 unsigned VF = E.Scalars.size();
15665 // Choose the best bitwidth based on cost estimations.
15666 auto Checker = [&](unsigned BitWidth, unsigned) {
15667 unsigned MinBW = PowerOf2Ceil(BitWidth);
15668 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
15669 auto VecCallCosts = getVectorCallCosts(
15670 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
15671 TTI, TLI, ArgTys);
15672 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
15673 if (Cost < BestCost) {
15674 BestCost = Cost;
15675 BestBitWidth = BitWidth;
15676 }
15677 return false;
15678 };
15679 [[maybe_unused]] bool NeedToExit;
15680 (void)AttemptCheckBitwidth(Checker, NeedToExit);
15681 BitWidth = BestBitWidth;
15682 return TryProcessInstruction(BitWidth, Operands, CallChecker);
15683 }
15684
15685 // Otherwise, conservatively give up.
15686 default:
15687 break;
15688 }
15689 MaxDepthLevel = 1;
15690 return FinalAnalysis();
15691}
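// A worked standalone example (not part of this pass) of the LShr rule used
// above: an expression can be narrowed to "Bits" bits when the shift amount
// is known to be < Bits and every bit at position >= Bits of the shifted
// value is already known zero. Known-zero bits are modelled as a plain mask
// here and Bits must be in (0, 32); all names are illustrative stand-ins.
#if 0
#include <cstdint>
#include <cstdio>

static bool canNarrowLshr(uint32_t KnownZeroMask, uint32_t ShiftAmt,
                          unsigned Bits) {
  uint32_t HighBits = ~0u << Bits; // bits that would be truncated away
  bool HighKnownZero = (KnownZeroMask & HighBits) == HighBits;
  return ShiftAmt < Bits && HighKnownZero;
}

int main() {
  // Value whose top 24 bits are known zero (e.g. it was zero-extended from i8).
  uint32_t KnownZero = 0xFFFFFF00u;
  std::printf("narrow lshr to i16: %s\n",
              canNarrowLshr(KnownZero, 3, 16) ? "yes" : "no"); // yes
  std::printf("narrow lshr to i4:  %s\n",
              canNarrowLshr(KnownZero, 3, 4) ? "yes" : "no"); // no
}
#endif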
15692
15693static RecurKind getRdxKind(Value *V);
15694
15695void BoUpSLP::computeMinimumValueSizes() {
 15696 // We only attempt to truncate integer expressions.
15697 bool IsStoreOrInsertElt =
15698 VectorizableTree.front()->getOpcode() == Instruction::Store ||
15699 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15700 if ((IsStoreOrInsertElt || UserIgnoreList) &&
15701 ExtraBitWidthNodes.size() <= 1 &&
15702 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15703 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15704 return;
15705
15706 unsigned NodeIdx = 0;
15707 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
15708 NodeIdx = 1;
15709
15710 // Ensure the roots of the vectorizable tree don't form a cycle.
15711 if (VectorizableTree[NodeIdx]->isGather() ||
15712 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
15713 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15714 [NodeIdx](const EdgeInfo &EI) {
15715 return EI.UserTE->Idx >
15716 static_cast<int>(NodeIdx);
15717 })))
15718 return;
15719
 15720 // If the first value node for store/insertelement is sext/zext/trunc, skip it
 15721 // and resize to the final type.
15722 bool IsTruncRoot = false;
15723 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15724 SmallVector<unsigned> RootDemotes;
15725 if (NodeIdx != 0 &&
15726 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15727 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15728 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
15729 IsTruncRoot = true;
15730 RootDemotes.push_back(NodeIdx);
15731 IsProfitableToDemoteRoot = true;
15732 ++NodeIdx;
15733 }
15734
 15735 // The reduction has already been analyzed and is not profitable - exit.
15736 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
15737 return;
15738
15739 SmallVector<unsigned> ToDemote;
15740 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
15741 bool IsProfitableToDemoteRoot, unsigned Opcode,
15742 unsigned Limit, bool IsTruncRoot,
15743 bool IsSignedCmp) -> unsigned {
15744 ToDemote.clear();
 15745 // If the root is a trunc and the next node is a gather/buildvector, keep the
 15746 // trunc in scalars, which is free in most cases.
15747 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
15748 E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
15749 all_of(E.Scalars, [&](Value *V) {
15750 return V->hasOneUse() || isa<Constant>(V) ||
15751 (!V->hasNUsesOrMore(UsesLimit) &&
15752 none_of(V->users(), [&](User *U) {
15753 const TreeEntry *TE = getTreeEntry(U);
15754 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15755 if (TE == UserTE || !TE)
15756 return false;
15757 unsigned UserTESz = DL->getTypeSizeInBits(
15758 UserTE->Scalars.front()->getType());
15759 auto It = MinBWs.find(TE);
15760 if (It != MinBWs.end() && It->second.first > UserTESz)
15761 return true;
15762 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
15763 }));
15764 })) {
15765 ToDemote.push_back(E.Idx);
15766 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15767 auto It = MinBWs.find(UserTE);
15768 if (It != MinBWs.end())
15769 return It->second.first;
15770 unsigned MaxBitWidth =
15771 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
15772 MaxBitWidth = bit_ceil(MaxBitWidth);
15773 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15774 MaxBitWidth = 8;
15775 return MaxBitWidth;
15776 }
15777
15778 unsigned VF = E.getVectorFactor();
15779 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
15780 if (!TreeRootIT || !Opcode)
15781 return 0u;
15782
15783 if (any_of(E.Scalars,
15784 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
15785 return 0u;
15786
15787 unsigned NumParts = TTI->getNumberOfParts(getWidenedType(TreeRootIT, VF));
15788
15789 // The maximum bit width required to represent all the values that can be
15790 // demoted without loss of precision. It would be safe to truncate the roots
15791 // of the expression to this width.
15792 unsigned MaxBitWidth = 1u;
15793
15794 // True if the roots can be zero-extended back to their original type,
15795 // rather than sign-extended. We know that if the leading bits are not
15796 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
15797 // True.
15798 // Determine if the sign bit of all the roots is known to be zero. If not,
15799 // IsKnownPositive is set to False.
15800 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
15801 KnownBits Known = computeKnownBits(R, *DL);
15802 return Known.isNonNegative();
15803 });
15804
15805 // We first check if all the bits of the roots are demanded. If they're not,
15806 // we can truncate the roots to this narrower type.
15807 for (Value *Root : E.Scalars) {
15808 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
15809 TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
15810 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15811 // If we can't prove that the sign bit is zero, we must add one to the
15812 // maximum bit width to account for the unknown sign bit. This preserves
15813 // the existing sign bit so we can safely sign-extend the root back to the
15814 // original type. Otherwise, if we know the sign bit is zero, we will
15815 // zero-extend the root instead.
15816 //
15817 // FIXME: This is somewhat suboptimal, as there will be cases where adding
15818 // one to the maximum bit width will yield a larger-than-necessary
15819 // type. In general, we need to add an extra bit only if we can't
15820 // prove that the upper bit of the original type is equal to the
15821 // upper bit of the proposed smaller type. If these two bits are
15822 // the same (either zero or one) we know that sign-extending from
15823 // the smaller type will result in the same value. Here, since we
15824 // can't yet prove this, we are just making the proposed smaller
15825 // type larger to ensure correctness.
15826 if (!IsKnownPositive)
15827 ++BitWidth1;
15828
15829 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
15830 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15831 MaxBitWidth =
15832 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
15833 }
15834
15835 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15836 MaxBitWidth = 8;
15837
 15838 // If the original type is large but the reduced type does not improve register
 15839 // usage - ignore it.
15840 if (NumParts > 1 &&
 15841 NumParts ==
 15842 TTI->getNumberOfParts(getWidenedType(
 15843 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
15844 return 0u;
15845
15846 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15847 Opcode == Instruction::SExt ||
15848 Opcode == Instruction::ZExt || NumParts > 1;
15849 // Conservatively determine if we can actually truncate the roots of the
15850 // expression. Collect the values that can be demoted in ToDemote and
15851 // additional roots that require investigating in Roots.
 15852 DenseSet<const TreeEntry *> Visited;
 15853 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15854 bool NeedToDemote = IsProfitableToDemote;
15855
15856 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
15857 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
15858 IsTruncRoot) ||
15859 (MaxDepthLevel <= Limit &&
15860 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15861 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15862 DL->getTypeSizeInBits(TreeRootIT) /
15863 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
15864 ->getOperand(0)
15865 ->getType()) >
15866 2)))))
15867 return 0u;
15868 // Round MaxBitWidth up to the next power-of-two.
15869 MaxBitWidth = bit_ceil(MaxBitWidth);
15870
15871 return MaxBitWidth;
15872 };
15873
15874 // If we can truncate the root, we must collect additional values that might
15875 // be demoted as a result. That is, those seeded by truncations we will
15876 // modify.
15877 // Add reduction ops sizes, if any.
15878 if (UserIgnoreList &&
15879 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
15880 for (Value *V : *UserIgnoreList) {
15881 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15882 auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
15883 unsigned BitWidth1 = NumTypeBits - NumSignBits;
 15884 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
 15885 ++BitWidth1;
15886 unsigned BitWidth2 = BitWidth1;
 15887 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
 15888 auto Mask = DB->getDemandedBits(cast<Instruction>(V));
15889 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15890 }
15891 ReductionBitWidth =
15892 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15893 }
15894 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15895 ReductionBitWidth = 8;
15896
15897 ReductionBitWidth = bit_ceil(ReductionBitWidth);
15898 }
15899 bool IsTopRoot = NodeIdx == 0;
15900 while (NodeIdx < VectorizableTree.size() &&
15901 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15902 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15903 RootDemotes.push_back(NodeIdx);
15904 ++NodeIdx;
15905 IsTruncRoot = true;
15906 }
15907 bool IsSignedCmp = false;
15908 while (NodeIdx < VectorizableTree.size()) {
15909 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15910 unsigned Limit = 2;
15911 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15912 if (IsTopRoot &&
15913 ReductionBitWidth ==
15914 DL->getTypeSizeInBits(
15915 VectorizableTree.front()->Scalars.front()->getType()))
15916 Limit = 3;
15917 unsigned MaxBitWidth = ComputeMaxBitWidth(
15918 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
15919 Limit, IsTruncRoot, IsSignedCmp);
15920 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15921 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15922 ReductionBitWidth = bit_ceil(MaxBitWidth);
15923 else if (MaxBitWidth == 0)
15924 ReductionBitWidth = 0;
15925 }
15926
15927 for (unsigned Idx : RootDemotes) {
15928 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
15929 uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
15930 if (OrigBitWidth > MaxBitWidth) {
15931 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
15932 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
15933 }
15934 return false;
15935 }))
15936 ToDemote.push_back(Idx);
15937 }
15938 RootDemotes.clear();
15939 IsTopRoot = false;
15940 IsProfitableToDemoteRoot = true;
15941
15942 if (ExtraBitWidthNodes.empty()) {
15943 NodeIdx = VectorizableTree.size();
15944 } else {
15945 unsigned NewIdx = 0;
15946 do {
15947 NewIdx = *ExtraBitWidthNodes.begin();
15948 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
15949 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15950 NodeIdx = NewIdx;
15951 IsTruncRoot =
15952 NodeIdx < VectorizableTree.size() &&
15953 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15954 [](const EdgeInfo &EI) {
15955 return EI.EdgeIdx == 0 &&
15956 EI.UserTE->getOpcode() == Instruction::Trunc &&
15957 !EI.UserTE->isAltShuffle();
15958 });
15959 IsSignedCmp =
15960 NodeIdx < VectorizableTree.size() &&
15961 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15962 [&](const EdgeInfo &EI) {
15963 return EI.UserTE->getOpcode() == Instruction::ICmp &&
15964 any_of(EI.UserTE->Scalars, [&](Value *V) {
15965 auto *IC = dyn_cast<ICmpInst>(V);
15966 return IC &&
15967 (IC->isSigned() ||
15968 !isKnownNonNegative(IC->getOperand(0),
15969 SimplifyQuery(*DL)) ||
15970 !isKnownNonNegative(IC->getOperand(1),
15971 SimplifyQuery(*DL)));
15972 });
15973 });
15974 }
15975
 15976 // If the maximum bit width we compute is less than the width of the roots'
 15977 // type, we can proceed with the narrowing. Otherwise, do nothing.
15978 if (MaxBitWidth == 0 ||
15979 MaxBitWidth >=
15980 cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
15981 if (UserIgnoreList)
15982 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
15983 continue;
15984 }
15985
 15986 // Finally, map the values we can demote to the maximum bit width we
 15987 // computed.
15988 for (unsigned Idx : ToDemote) {
15989 TreeEntry *TE = VectorizableTree[Idx].get();
15990 if (MinBWs.contains(TE))
15991 continue;
15992 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
15993 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15994 });
15995 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
15996 }
15997 }
15998}
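// A standalone sketch (not part of this pass, needs C++20 <bit>) of the
// per-root width computation above: take the smaller of the sign-bit based
// width (plus one when the value may be negative) and the demanded-bits
// width, clamp widths between 2 and 7 up to 8, and round up to a power of
// two. All names are illustrative stand-ins.
#if 0
#include <bit>
#include <cstdio>

static unsigned rootWidth(unsigned TypeBits, unsigned SignBits,
                          unsigned DemandedBits, bool KnownNonNegative) {
  unsigned Width1 = TypeBits - SignBits + (KnownNonNegative ? 0 : 1);
  unsigned Width = Width1 < DemandedBits ? Width1 : DemandedBits;
  if (Width > 1 && Width < 8)
    Width = 8;
  return std::bit_ceil(Width);
}

int main() {
  // An i32 root with 22 redundant sign bits and only 12 demanded bits.
  std::printf("min width = %u bits\n", rootWidth(32, 22, 12, false)); // 16
}
#endif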
15999PreservedAnalyses SLPVectorizerPass::run(Function &F,
16000 FunctionAnalysisManager &AM) {
 16001 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
16002 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
16003 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
16004 auto *AA = &AM.getResult<AAManager>(F);
16005 auto *LI = &AM.getResult<LoopAnalysis>(F);
16006 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
16007 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
 16008 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
 16009 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 16010
16011 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
16012 if (!Changed)
16013 return PreservedAnalyses::all();
16014
 16015 PreservedAnalyses PA;
 16016 PA.preserveSet<CFGAnalyses>();
 16017 return PA;
16018}
16019
16020bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
 16021 TargetTransformInfo *TTI_,
16022 TargetLibraryInfo *TLI_, AAResults *AA_,
16023 LoopInfo *LI_, DominatorTree *DT_,
16024 AssumptionCache *AC_, DemandedBits *DB_,
 16025 OptimizationRemarkEmitter *ORE_) {
 16026 if (!RunSLPVectorization)
 16027 return false;
16028 SE = SE_;
16029 TTI = TTI_;
16030 TLI = TLI_;
16031 AA = AA_;
16032 LI = LI_;
16033 DT = DT_;
16034 AC = AC_;
16035 DB = DB_;
16036 DL = &F.getDataLayout();
16037
16038 Stores.clear();
16039 GEPs.clear();
16040 bool Changed = false;
16041
16042 // If the target claims to have no vector registers don't attempt
16043 // vectorization.
16045 LLVM_DEBUG(
16046 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
16047 return false;
16048 }
16049
16050 // Don't vectorize when the attribute NoImplicitFloat is used.
16051 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
16052 return false;
16053
16054 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
16055
16056 // Use the bottom up slp vectorizer to construct chains that start with
16057 // store instructions.
16058 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
16059
16060 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
16061 // delete instructions.
16062
16063 // Update DFS numbers now so that we can use them for ordering.
16064 DT->updateDFSNumbers();
16065
16066 // Scan the blocks in the function in post order.
16067 for (auto *BB : post_order(&F.getEntryBlock())) {
16068 // Start new block - clear the list of reduction roots.
16069 R.clearReductionData();
16070 collectSeedInstructions(BB);
16071
16072 // Vectorize trees that end at stores.
16073 if (!Stores.empty()) {
16074 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
16075 << " underlying objects.\n");
16076 Changed |= vectorizeStoreChains(R);
16077 }
16078
16079 // Vectorize trees that end at reductions.
16080 Changed |= vectorizeChainsInBlock(BB, R);
16081
16082 // Vectorize the index computations of getelementptr instructions. This
16083 // is primarily intended to catch gather-like idioms ending at
16084 // non-consecutive loads.
16085 if (!GEPs.empty()) {
16086 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
16087 << " underlying objects.\n");
16088 Changed |= vectorizeGEPIndices(BB, R);
16089 }
16090 }
16091
16092 if (Changed) {
16093 R.optimizeGatherSequence();
16094 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
16095 }
16096 return Changed;
16097}
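// A toy standalone illustration (not part of this pass) of the seed
// collection step above: stores are bucketed by their underlying base object
// per block, and each bucket is later examined for vectorizable chains.
// Strings stand in for base pointers; all names are illustrative.
#if 0
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct ToyStore {
  std::string Base;
  int Offset;
};

int main() {
  std::vector<ToyStore> Seeds = {
      {"A", 0}, {"B", 4}, {"A", 4}, {"A", 8}, {"B", 0}};
  std::map<std::string, std::vector<int>> Buckets;
  for (const ToyStore &S : Seeds)
    Buckets[S.Base].push_back(S.Offset);
  std::printf("found stores for %zu underlying objects\n", Buckets.size());
  for (auto &[Base, Offsets] : Buckets)
    std::printf("  base %s: %zu stores\n", Base.c_str(), Offsets.size());
}
#endif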
16098
16099std::optional<bool>
16100SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
16101 unsigned Idx, unsigned MinVF,
16102 unsigned &Size) {
16103 Size = 0;
16104 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
16105 << "\n");
16106 const unsigned Sz = R.getVectorElementSize(Chain[0]);
16107 unsigned VF = Chain.size();
16108
16109 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
16110 // Check if vectorizing with a non-power-of-2 VF should be considered. At
16111 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
16112 // all vector lanes are used.
16113 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
16114 return false;
16115 }
16116
16117 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
16118 << "\n");
16119
16120 SetVector<Value *> ValOps;
16121 for (Value *V : Chain)
16122 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
16123 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
16124 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
16125 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
16126 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
16127 bool IsPowerOf2 =
16128 isPowerOf2_32(ValOps.size()) ||
16129 (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
16130 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
16131 (!S.MainOp->isSafeToRemove() ||
16132 any_of(ValOps.getArrayRef(),
16133 [&](Value *V) {
16134 return !isa<ExtractElementInst>(V) &&
16135 (V->getNumUses() > Chain.size() ||
16136 any_of(V->users(), [&](User *U) {
16137 return !Stores.contains(U);
16138 }));
16139 }))) ||
16140 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
16141 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
16142 return false;
16143 }
16144 }
16145 if (R.isLoadCombineCandidate(Chain))
16146 return true;
16147 R.buildTree(Chain);
 16148 // Check if the tree is tiny and the store itself or its value is not vectorized.
16149 if (R.isTreeTinyAndNotFullyVectorizable()) {
16150 if (R.isGathered(Chain.front()) ||
16151 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
16152 return std::nullopt;
16153 Size = R.getTreeSize();
16154 return false;
16155 }
16156 R.reorderTopToBottom();
16157 R.reorderBottomToTop();
16158 R.buildExternalUses();
16159
16160 R.computeMinimumValueSizes();
16161 R.transformNodes();
16162
16163 Size = R.getTreeSize();
16164 if (S.getOpcode() == Instruction::Load)
16165 Size = 2; // cut off masked gather small trees
16166 InstructionCost Cost = R.getTreeCost();
16167
16168 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
16169 if (Cost < -SLPCostThreshold) {
16170 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
16171
16172 using namespace ore;
16173
16174 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
16175 cast<StoreInst>(Chain[0]))
16176 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
16177 << " and with tree size "
16178 << NV("TreeSize", R.getTreeSize()));
16179
16180 R.vectorizeTree();
16181 return true;
16182 }
16183
16184 return false;
16185}
16186
16187/// Checks if the quadratic mean deviation is less than 90% of the mean size.
16188static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
16189 bool First) {
16190 unsigned Num = 0;
16191 uint64_t Sum = std::accumulate(
16192 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
16193 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
16194 unsigned Size = First ? Val.first : Val.second;
16195 if (Size == 1)
16196 return V;
16197 ++Num;
16198 return V + Size;
16199 });
16200 if (Num == 0)
16201 return true;
16202 uint64_t Mean = Sum / Num;
16203 if (Mean == 0)
16204 return true;
16205 uint64_t Dev = std::accumulate(
16206 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
16207 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
16208 unsigned P = First ? Val.first : Val.second;
16209 if (P == 1)
16210 return V;
16211 return V + (P - Mean) * (P - Mean);
16212 }) /
16213 Num;
16214 return Dev * 81 / (Mean * Mean) == 0;
16215}
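// A standalone reproduction (not part of this pass) of the arithmetic in
// checkTreeSizes(): sizes equal to 1 are ignored and the chain is accepted
// when Dev * 81 / (Mean * Mean) == 0, i.e. when the variance of the recorded
// tree sizes stays below Mean^2 / 81. All names are illustrative stand-ins.
#if 0
#include <cstdint>
#include <cstdio>
#include <vector>

static bool sizesAreUniform(const std::vector<unsigned> &Sizes) {
  uint64_t Sum = 0, Num = 0;
  for (unsigned S : Sizes)
    if (S != 1) {
      Sum += S;
      ++Num;
    }
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = 0;
  for (unsigned S : Sizes)
    if (S != 1)
      Dev += (S - Mean) * (S - Mean);
  Dev /= Num;
  return Dev * 81 / (Mean * Mean) == 0;
}

int main() {
  std::printf("{4,4,5,4}:   %d\n", sizesAreUniform({4, 4, 5, 4}));   // 1
  std::printf("{2,16,2,16}: %d\n", sizesAreUniform({2, 16, 2, 16})); // 0
}
#endif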
16216
16217bool SLPVectorizerPass::vectorizeStores(
16218 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
16219 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
16220 &Visited) {
16221 // We may run into multiple chains that merge into a single chain. We mark the
16222 // stores that we vectorized so that we don't visit the same store twice.
16223 BoUpSLP::ValueSet VectorizedStores;
16224 bool Changed = false;
16225
16226 struct StoreDistCompare {
16227 bool operator()(const std::pair<unsigned, int> &Op1,
16228 const std::pair<unsigned, int> &Op2) const {
16229 return Op1.second < Op2.second;
16230 }
16231 };
16232 // A set of pairs (index of store in Stores array ref, Distance of the store
16233 // address relative to base store address in units).
16234 using StoreIndexToDistSet =
16235 std::set<std::pair<unsigned, int>, StoreDistCompare>;
16236 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
 16237 int PrevDist = -1;
 16238 BoUpSLP::ValueList Operands;
 16239 // Collect the chain into a list.
16240 for (auto [Idx, Data] : enumerate(Set)) {
16241 if (Operands.empty() || Data.second - PrevDist == 1) {
16242 Operands.push_back(Stores[Data.first]);
16243 PrevDist = Data.second;
16244 if (Idx != Set.size() - 1)
16245 continue;
16246 }
16247 auto E = make_scope_exit([&, &DataVar = Data]() {
16248 Operands.clear();
16249 Operands.push_back(Stores[DataVar.first]);
16250 PrevDist = DataVar.second;
16251 });
16252
16253 if (Operands.size() <= 1 ||
16254 !Visited
16255 .insert({Operands.front(),
16256 cast<StoreInst>(Operands.front())->getValueOperand(),
16257 Operands.back(),
16258 cast<StoreInst>(Operands.back())->getValueOperand(),
16259 Operands.size()})
16260 .second)
16261 continue;
16262
16263 unsigned MaxVecRegSize = R.getMaxVecRegSize();
16264 unsigned EltSize = R.getVectorElementSize(Operands[0]);
16265 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
16266
16267 unsigned MaxVF =
16268 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
16269 unsigned MaxRegVF = MaxVF;
16270 auto *Store = cast<StoreInst>(Operands[0]);
16271 Type *StoreTy = Store->getValueOperand()->getType();
16272 Type *ValueTy = StoreTy;
16273 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
16274 ValueTy = Trunc->getSrcTy();
16275 if (ValueTy == StoreTy &&
16276 R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
16277 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
16278 unsigned MinVF = std::max<unsigned>(
 16279 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
 16280 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
16281 ValueTy)));
16282
16283 if (MaxVF < MinVF) {
16284 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
16285 << ") < "
16286 << "MinVF (" << MinVF << ")\n");
16287 continue;
16288 }
16289
16290 unsigned NonPowerOf2VF = 0;
 16291 if (VectorizeNonPowerOf2) {
 16292 // First try vectorizing with a non-power-of-2 VF. At the moment, only
16293 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
16294 // lanes are used.
16295 unsigned CandVF = Operands.size();
16296 if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
16297 NonPowerOf2VF = CandVF;
16298 }
16299
16300 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
16301 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
16302 unsigned Size = MinVF;
16303 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
16304 VF = Size > MaxVF ? NonPowerOf2VF : Size;
16305 Size *= 2;
16306 });
16307 unsigned End = Operands.size();
16308 unsigned Repeat = 0;
16309 constexpr unsigned MaxAttempts = 4;
 16310 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
 16311 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
16312 P.first = P.second = 1;
16313 });
 16314 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
 16315 auto IsNotVectorized = [](bool First,
16316 const std::pair<unsigned, unsigned> &P) {
16317 return First ? P.first > 0 : P.second > 0;
16318 };
16319 auto IsVectorized = [](bool First,
16320 const std::pair<unsigned, unsigned> &P) {
16321 return First ? P.first == 0 : P.second == 0;
16322 };
16323 auto VFIsProfitable = [](bool First, unsigned Size,
16324 const std::pair<unsigned, unsigned> &P) {
16325 return First ? Size >= P.first : Size >= P.second;
16326 };
16327 auto FirstSizeSame = [](unsigned Size,
16328 const std::pair<unsigned, unsigned> &P) {
16329 return Size == P.first;
16330 };
16331 while (true) {
16332 ++Repeat;
16333 bool RepeatChanged = false;
16334 bool AnyProfitableGraph = false;
16335 for (unsigned Size : CandidateVFs) {
16336 AnyProfitableGraph = false;
16337 unsigned StartIdx = std::distance(
16338 RangeSizes.begin(),
16339 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
16340 std::placeholders::_1)));
16341 while (StartIdx < End) {
16342 unsigned EndIdx =
16343 std::distance(RangeSizes.begin(),
16344 find_if(RangeSizes.drop_front(StartIdx),
16345 std::bind(IsVectorized, Size >= MaxRegVF,
16346 std::placeholders::_1)));
16347 unsigned Sz = EndIdx >= End ? End : EndIdx;
16348 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
16349 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
16350 Size >= MaxRegVF)) {
16351 ++Cnt;
16352 continue;
16353 }
 16354 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
 16355 assert(all_of(Slice,
16356 [&](Value *V) {
16357 return cast<StoreInst>(V)
16358 ->getValueOperand()
16359 ->getType() ==
16360 cast<StoreInst>(Slice.front())
16361 ->getValueOperand()
16362 ->getType();
16363 }) &&
16364 "Expected all operands of same type.");
16365 if (!NonSchedulable.empty()) {
16366 auto [NonSchedSizeMax, NonSchedSizeMin] =
16367 NonSchedulable.lookup(Slice.front());
16368 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
16369 Cnt += NonSchedSizeMax;
16370 continue;
16371 }
16372 }
16373 unsigned TreeSize;
16374 std::optional<bool> Res =
16375 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
16376 if (!Res) {
16377 NonSchedulable
16378 .try_emplace(Slice.front(), std::make_pair(Size, Size))
16379 .first->getSecond()
16380 .second = Size;
16381 } else if (*Res) {
16382 // Mark the vectorized stores so that we don't vectorize them
16383 // again.
16384 VectorizedStores.insert(Slice.begin(), Slice.end());
16387 AnyProfitableGraph = RepeatChanged = Changed = true;
16388 // If we vectorized initial block, no need to try to vectorize
16389 // it again.
16390 for_each(RangeSizes.slice(Cnt, Size),
16391 [](std::pair<unsigned, unsigned> &P) {
16392 P.first = P.second = 0;
16393 });
16394 if (Cnt < StartIdx + MinVF) {
16395 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
16396 [](std::pair<unsigned, unsigned> &P) {
16397 P.first = P.second = 0;
16398 });
16399 StartIdx = Cnt + Size;
16400 }
16401 if (Cnt > Sz - Size - MinVF) {
16402 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
16403 [](std::pair<unsigned, unsigned> &P) {
16404 P.first = P.second = 0;
16405 });
16406 if (Sz == End)
16407 End = Cnt;
16408 Sz = Cnt;
16409 }
16410 Cnt += Size;
16411 continue;
16412 }
16413 if (Size > 2 && Res &&
16414 !all_of(RangeSizes.slice(Cnt, Size),
16415 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
16416 std::placeholders::_1))) {
16417 Cnt += Size;
16418 continue;
16419 }
 16420 // For the very big VFs, check that we're not rebuilding the same
 16421 // trees, just with a larger number of elements.
16422 if (Size > MaxRegVF && TreeSize > 1 &&
16423 all_of(RangeSizes.slice(Cnt, Size),
16424 std::bind(FirstSizeSame, TreeSize,
16425 std::placeholders::_1))) {
16426 Cnt += Size;
16427 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
16428 ++Cnt;
16429 continue;
16430 }
16431 if (TreeSize > 1)
16432 for_each(RangeSizes.slice(Cnt, Size),
16433 [&](std::pair<unsigned, unsigned> &P) {
16434 if (Size >= MaxRegVF)
16435 P.second = std::max(P.second, TreeSize);
16436 else
16437 P.first = std::max(P.first, TreeSize);
16438 });
16439 ++Cnt;
16440 AnyProfitableGraph = true;
16441 }
16442 if (StartIdx >= End)
16443 break;
16444 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
16445 AnyProfitableGraph = true;
16446 StartIdx = std::distance(
16447 RangeSizes.begin(),
16448 find_if(RangeSizes.drop_front(Sz),
16449 std::bind(IsNotVectorized, Size >= MaxRegVF,
16450 std::placeholders::_1)));
16451 }
16452 if (!AnyProfitableGraph && Size >= MaxRegVF)
16453 break;
16454 }
16455 // All values vectorized - exit.
16456 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
16457 return P.first == 0 && P.second == 0;
16458 }))
16459 break;
16460 // Check if we tried all attempts or there is no need for the last attempts at all.
16461 if (Repeat >= MaxAttempts ||
16462 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
16463 break;
16464 constexpr unsigned StoresLimit = 64;
16465 const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
16466 Operands.size(),
16467 static_cast<unsigned>(
16468 End -
16469 std::distance(
16470 RangeSizes.begin(),
16471 find_if(RangeSizes, std::bind(IsNotVectorized, true,
16472 std::placeholders::_1))) +
16473 1)));
16474 unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
16475 if (VF > MaxTotalNum || VF >= StoresLimit)
16476 break;
16477 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
16478 if (P.first != 0)
16479 P.first = std::max(P.second, P.first);
16480 });
16481 // Last attempt to vectorize the max number of elements, if all previous
16482 // attempts were unsuccessful because of cost issues.
16483 CandidateVFs.clear();
16484 CandidateVFs.push_back(VF);
16485 }
16486 }
16487 };
16488
16489 // Stores pair (first: index of the store into Stores array ref, address of
16490 // which taken as base, second: sorted set of pairs {index, dist}, which are
16491 // indices of stores in the set and their store location distances relative to
16492 // the base address).
16493
16494 // Need to store the index of the very first store separately, since the set
16495 // may be reordered after the insertion and the first store may be moved. This
16496 // container allows to reduce number of calls of getPointersDiff() function.
16497 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
16498 // Inserts the specified store SI with the given index Idx to the set of the
16499 // stores. If a store with the same distance is already in the set - stop the
16500 // insertion and try to vectorize the stores found so far. If some stores from
16501 // this sequence were not vectorized - try to vectorize them together with the
16502 // new store later. This logic is applied only to the stores that come before
16503 // the previous store with the same distance.
16504 // Example:
16505 // 1. store x, %p
16506 // 2. store y, %p+1
16507 // 3. store z, %p+2
16508 // 4. store a, %p
16509 // 5. store b, %p+3
16510 // - Scan the stores from last to first. The very first bunch of stores is
16511 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
16512 // vector).
16513 // - The next store in the list - #1 - has the same distance from store #5 as
16514 // the store #4.
16515 // - Try to vectorize sequence of stores 4,2,3,5.
16516 // - If all these stores are vectorized - just drop them.
16517 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
16518 // - Start new stores sequence.
16519 // The new bunch of stores is {1, {1, 0}}.
16520 // - Add the stores from the previous sequence that were not vectorized.
16521 // Here we consider the stores in reverse order, rather than the order they are
16522 // used in the IR (Stores are reversed already, see vectorizeStoreChains()).
16523 // Store #3 can be added -> it comes after store #4 with the same distance as
16524 // store #1.
16525 // Store #5 cannot be added - it comes before store #4.
16526 // This logic improves compile time: we assume that the stores after the
16527 // previous store with the same distance most likely have memory
16528 // dependencies, so there is no need to waste compile time trying to vectorize them.
16529 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
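// As an illustrative sketch (hypothetical IR, not taken from a test case), the
// "dist" values above are element distances as reported by getPointersDiff():
//   store i32 %x, ptr %p
//   %p1 = getelementptr inbounds i32, ptr %p, i64 1
//   store i32 %y, ptr %p1
// Here the second store would be recorded at distance 1 relative to the base
// store, and a later store to %p itself at distance 0.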
16530 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
16531 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16532 std::optional<int> Diff = getPointersDiff(
16533 Stores[Set.first]->getValueOperand()->getType(),
16534 Stores[Set.first]->getPointerOperand(),
16535 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
16536 /*StrictCheck=*/true);
16537 if (!Diff)
16538 continue;
16539 auto It = Set.second.find(std::make_pair(Idx, *Diff));
16540 if (It == Set.second.end()) {
16541 Set.second.emplace(Idx, *Diff);
16542 return;
16543 }
16544 // Try to vectorize the first found set to avoid duplicate analysis.
16545 TryToVectorize(Set.second);
16546 StoreIndexToDistSet PrevSet;
16547 PrevSet.swap(Set.second);
16548 Set.first = Idx;
16549 Set.second.emplace(Idx, 0);
16550 // Insert stores that followed previous match to try to vectorize them
16551 // with this store.
16552 unsigned StartIdx = It->first + 1;
16553 SmallBitVector UsedStores(Idx - StartIdx);
16554 // Distances to previously found dup store (or this store, since they
16555 // store to the same addresses).
16556 SmallVector<int> Dists(Idx - StartIdx, 0);
16557 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
16558 // Do not try to vectorize sequences, we already tried.
16559 if (Pair.first <= It->first ||
16560 VectorizedStores.contains(Stores[Pair.first]))
16561 break;
16562 unsigned BI = Pair.first - StartIdx;
16563 UsedStores.set(BI);
16564 Dists[BI] = Pair.second - It->second;
16565 }
16566 for (unsigned I = StartIdx; I < Idx; ++I) {
16567 unsigned BI = I - StartIdx;
16568 if (UsedStores.test(BI))
16569 Set.second.emplace(I, Dists[BI]);
16570 }
16571 return;
16572 }
16573 auto &Res = SortedStores.emplace_back();
16574 Res.first = Idx;
16575 Res.second.emplace(Idx, 0);
16576 };
16577 Type *PrevValTy = nullptr;
16578 for (auto [I, SI] : enumerate(Stores)) {
16579 if (R.isDeleted(SI))
16580 continue;
16581 if (!PrevValTy)
16582 PrevValTy = SI->getValueOperand()->getType();
16583 // Check that we do not try to vectorize stores of different types.
16584 if (PrevValTy != SI->getValueOperand()->getType()) {
16585 for (auto &Set : SortedStores)
16586 TryToVectorize(Set.second);
16587 SortedStores.clear();
16588 PrevValTy = SI->getValueOperand()->getType();
16589 }
16590 FillStoresSet(I, SI);
16591 }
16592
16593 // Final vectorization attempt.
16594 for (auto &Set : SortedStores)
16595 TryToVectorize(Set.second);
16596
16597 return Changed;
16598}
16599
16600void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
16601 // Initialize the collections. We will make a single pass over the block.
16602 Stores.clear();
16603 GEPs.clear();
16604
16605 // Visit the store and getelementptr instructions in BB and organize them in
16606 // Stores and GEPs according to the underlying objects of their pointer
16607 // operands.
16608 for (Instruction &I : *BB) {
16609 // Ignore store instructions that are volatile or have a pointer operand
16610 // that doesn't point to a scalar type.
16611 if (auto *SI = dyn_cast<StoreInst>(&I)) {
16612 if (!SI->isSimple())
16613 continue;
16614 if (!isValidElementType(SI->getValueOperand()->getType()))
16615 continue;
16616 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
16617 }
16618
16619 // Ignore getelementptr instructions that have more than one index, a
16620 // constant index, or a pointer operand that doesn't point to a scalar
16621 // type.
16622 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
16623 if (GEP->getNumIndices() != 1)
16624 continue;
16625 Value *Idx = GEP->idx_begin()->get();
16626 if (isa<Constant>(Idx))
16627 continue;
16628 if (!isValidElementType(Idx->getType()))
16629 continue;
16630 if (GEP->getType()->isVectorTy())
16631 continue;
16632 GEPs[GEP->getPointerOperand()].push_back(GEP);
16633 }
16634 }
16635}
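// Illustrative sketch (hypothetical IR): both stores below share the same
// underlying object, so collectSeedInstructions() puts them into the same
// Stores bucket, keyed by getUnderlyingObject() of their pointer operands:
//   %base = alloca [4 x i32]
//   store i32 %a, ptr %base
//   %g = getelementptr inbounds i32, ptr %base, i64 1
//   store i32 %b, ptr %g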
16636
16637bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
16638 bool MaxVFOnly) {
16639 if (VL.size() < 2)
16640 return false;
16641
16642 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
16643 << VL.size() << ".\n");
16644
16645 // Check that all of the parts are instructions of the same type;
16646 // we permit an alternate opcode via InstructionsState.
16647 InstructionsState S = getSameOpcode(VL, *TLI);
16648 if (!S.getOpcode())
16649 return false;
16650
16651 Instruction *I0 = cast<Instruction>(S.OpValue);
16652 // Make sure invalid types (including vector type) are rejected before
16653 // determining vectorization factor for scalar instructions.
16654 for (Value *V : VL) {
16655 Type *Ty = V->getType();
16656 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
16657 // NOTE: the following will give user internal llvm type name, which may
16658 // not be useful.
16659 R.getORE()->emit([&]() {
16660 std::string TypeStr;
16661 llvm::raw_string_ostream rso(TypeStr);
16662 Ty->print(rso);
16663 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
16664 << "Cannot SLP vectorize list: type "
16665 << TypeStr + " is unsupported by vectorizer";
16666 });
16667 return false;
16668 }
16669 }
16670
16671 unsigned Sz = R.getVectorElementSize(I0);
16672 unsigned MinVF = R.getMinVF(Sz);
16673 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
16674 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
16675 if (MaxVF < 2) {
16676 R.getORE()->emit([&]() {
16677 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
16678 << "Cannot SLP vectorize list: vectorization factor "
16679 << "less than 2 is not supported";
16680 });
16681 return false;
16682 }
16683
16684 bool Changed = false;
16685 bool CandidateFound = false;
16686 InstructionCost MinCost = SLPCostThreshold.getValue();
16687 Type *ScalarTy = VL[0]->getType();
16688 if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
16689 ScalarTy = IE->getOperand(1)->getType();
16690
16691 unsigned NextInst = 0, MaxInst = VL.size();
16692 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
16693 // No actual vectorization should happen if the number of parts is the same
16694 // as the provided vectorization factor (i.e. the scalar type is used for
16695 // vector code during codegen).
16696 auto *VecTy = getWidenedType(ScalarTy, VF);
16697 if (TTI->getNumberOfParts(VecTy) == VF)
16698 continue;
16699 for (unsigned I = NextInst; I < MaxInst; ++I) {
16700 unsigned ActualVF = std::min(MaxInst - I, VF);
16701
16702 if (!isPowerOf2_32(ActualVF))
16703 continue;
16704
16705 if (MaxVFOnly && ActualVF < MaxVF)
16706 break;
16707 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16708 break;
16709
16710 ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
16711 // Check that a previous iteration of this loop did not delete the Value.
16712 if (llvm::any_of(Ops, [&R](Value *V) {
16713 auto *I = dyn_cast<Instruction>(V);
16714 return I && R.isDeleted(I);
16715 }))
16716 continue;
16717
16718 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
16719 << "\n");
16720
16721 R.buildTree(Ops);
16722 if (R.isTreeTinyAndNotFullyVectorizable())
16723 continue;
16724 R.reorderTopToBottom();
16725 R.reorderBottomToTop(
16726 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
16727 !R.doesRootHaveInTreeUses());
16728 R.buildExternalUses();
16729
16730 R.computeMinimumValueSizes();
16731 R.transformNodes();
16732 InstructionCost Cost = R.getTreeCost();
16733 CandidateFound = true;
16734 MinCost = std::min(MinCost, Cost);
16735
16736 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16737 << " for VF=" << ActualVF << "\n");
16738 if (Cost < -SLPCostThreshold) {
16739 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
16740 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
16741 cast<Instruction>(Ops[0]))
16742 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
16743 << " and with tree size "
16744 << ore::NV("TreeSize", R.getTreeSize()));
16745
16746 R.vectorizeTree();
16747 // Move to the next bundle.
16748 I += VF - 1;
16749 NextInst = I + 1;
16750 Changed = true;
16751 }
16752 }
16753 }
16754
16755 if (!Changed && CandidateFound) {
16756 R.getORE()->emit([&]() {
16757 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
16758 << "List vectorization was possible but not beneficial with cost "
16759 << ore::NV("Cost", MinCost) << " >= "
16760 << ore::NV("Threshold", -SLPCostThreshold);
16761 });
16762 } else if (!Changed) {
16763 R.getORE()->emit([&]() {
16764 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
16765 << "Cannot SLP vectorize list: vectorization was impossible"
16766 << " with available vectorization factors";
16767 });
16768 }
16769 return Changed;
16770}
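// Illustrative walk-through of the VF search above (hypothetical sizes,
// assuming the target's maximum VF does not cap it further): for
// VL.size() == 7 and MinVF == 2, MaxVF becomes bit_floor(7) == 4, so the loop
// first scans the list for power-of-two bundles of four operations and then
// retries whatever remains unvectorized with VF == 2 before giving up.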
16771
16772bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
16773 if (!I)
16774 return false;
16775
16776 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
16777 return false;
16778
16779 Value *P = I->getParent();
16780
16781 // Vectorize in current basic block only.
16782 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
16783 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
16784 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
16785 return false;
16786
16787 // First collect all possible candidates
16788 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
16789 Candidates.emplace_back(Op0, Op1);
16790
16791 auto *A = dyn_cast<BinaryOperator>(Op0);
16792 auto *B = dyn_cast<BinaryOperator>(Op1);
16793 // Try to skip B.
16794 if (A && B && B->hasOneUse()) {
16795 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
16796 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
16797 if (B0 && B0->getParent() == P)
16798 Candidates.emplace_back(A, B0);
16799 if (B1 && B1->getParent() == P)
16800 Candidates.emplace_back(A, B1);
16801 }
16802 // Try to skip A.
16803 if (B && A && A->hasOneUse()) {
16804 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
16805 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
16806 if (A0 && A0->getParent() == P)
16807 Candidates.emplace_back(A0, B);
16808 if (A1 && A1->getParent() == P)
16809 Candidates.emplace_back(A1, B);
16810 }
16811
16812 if (Candidates.size() == 1)
16813 return tryToVectorizeList({Op0, Op1}, R);
16814
16815 // We have multiple options. Try to pick the single best.
16816 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
16817 if (!BestCandidate)
16818 return false;
16819 return tryToVectorizeList(
16820 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
16821}
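// Illustrative sketch (hypothetical IR, all in one basic block) of the
// candidate pairs built above: for
//   %p = fmul float %a0, %b0
//   %q = fmul float %a1, %b1
//   %x = fmul float %a2, %b2
//   %y = fadd float %p, %q
//   %r = fadd float %x, %y
// the candidates for %r are (%x, %y) and, because %y has a single use and its
// operands are binary operators, also (%x, %p) and (%x, %q); findBestRootPair()
// then selects the pair that looks most profitable to vectorize.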
16822
16823namespace {
16824
16825/// Model horizontal reductions.
16826///
16827/// A horizontal reduction is a tree of reduction instructions that has values
16828/// that can be put into a vector as its leaves. For example:
16829///
16830/// mul mul mul mul
16831/// \ / \ /
16832/// + +
16833/// \ /
16834/// +
16835/// This tree has "mul" as its leaf values and "+" as its reduction
16836/// instructions. A reduction can feed into a store or a binary operation
16837/// feeding a phi.
16838/// ...
16839/// \ /
16840/// +
16841/// |
16842/// phi +=
16843///
16844/// Or:
16845/// ...
16846/// \ /
16847/// +
16848/// |
16849/// *p =
16850///
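/// As an illustrative sketch (hypothetical IR), the first diagram corresponds
/// to a chain like:
///   %m0 = mul i32 %a0, %b0
///   %m1 = mul i32 %a1, %b1
///   %m2 = mul i32 %a2, %b2
///   %m3 = mul i32 %a3, %b3
///   %s0 = add i32 %m0, %m1
///   %s1 = add i32 %m2, %m3
///   %r  = add i32 %s0, %s1
/// where the "mul" instructions are the leaves and the "add" instructions are
/// the reduction operations.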
16851class HorizontalReduction {
16852 using ReductionOpsType = SmallVector<Value *, 16>;
16853 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
16854 ReductionOpsListType ReductionOps;
16855 /// List of possibly reduced values.
16856 SmallVector<SmallVector<Value *>> ReducedVals;
16857 /// Maps reduced value to the corresponding reduction operation.
16858 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
16859 WeakTrackingVH ReductionRoot;
16860 /// The type of reduction operation.
16861 RecurKind RdxKind;
16862 /// Checks if the optimization of original scalar identity operations on
16863 /// matched horizontal reductions is enabled and allowed.
16864 bool IsSupportedHorRdxIdentityOp = false;
16865
16866 static bool isCmpSelMinMax(Instruction *I) {
16867 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
16868 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
16869 }
16870
16871 // And/or are potentially poison-safe logical patterns like:
16872 // select x, y, false
16873 // select x, true, y
16874 static bool isBoolLogicOp(Instruction *I) {
16875 return isa<SelectInst>(I) &&
16876 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
16877 }
16878
16879 /// Checks if instruction is associative and can be vectorized.
16880 static bool isVectorizable(RecurKind Kind, Instruction *I) {
16881 if (Kind == RecurKind::None)
16882 return false;
16883
16884 // Integer ops that map to select instructions or intrinsics are fine.
16885 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
16886 isBoolLogicOp(I))
16887 return true;
16888
16889 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16890 // FP min/max are associative except for NaN and -0.0. We do not
16891 // have to rule out -0.0 here because the intrinsic semantics do not
16892 // specify a fixed result for it.
16893 return I->getFastMathFlags().noNaNs();
16894 }
16895
16896 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16897 return true;
16898
16899 return I->isAssociative();
16900 }
16901
16902 static Value *getRdxOperand(Instruction *I, unsigned Index) {
16903 // Poison-safe 'or' takes the form: select X, true, Y
16904 // To make that work with the normal operand processing, we skip the
16905 // true value operand.
16906 // TODO: Change the code and data structures to handle this without a hack.
16907 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
16908 return I->getOperand(2);
16909 return I->getOperand(Index);
16910 }
16911
16912 /// Creates reduction operation with the current opcode.
16913 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
16914 Value *RHS, const Twine &Name, bool UseSelect) {
16915 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
16916 switch (Kind) {
16917 case RecurKind::Or:
16918 if (UseSelect &&
16919 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16920 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
16921 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16922 Name);
16923 case RecurKind::And:
16924 if (UseSelect &&
16925 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16926 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
16927 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16928 Name);
16929 case RecurKind::Add:
16930 case RecurKind::Mul:
16931 case RecurKind::Xor:
16932 case RecurKind::FAdd:
16933 case RecurKind::FMul:
16934 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16935 Name);
16936 case RecurKind::FMax:
16937 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
16938 case RecurKind::FMin:
16939 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
16940 case RecurKind::FMaximum:
16941 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
16942 case RecurKind::FMinimum:
16943 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
16944 case RecurKind::SMax:
16945 if (UseSelect) {
16946 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
16947 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16948 }
16949 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
16950 case RecurKind::SMin:
16951 if (UseSelect) {
16952 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
16953 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16954 }
16955 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
16956 case RecurKind::UMax:
16957 if (UseSelect) {
16958 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
16959 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16960 }
16961 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
16962 case RecurKind::UMin:
16963 if (UseSelect) {
16964 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
16965 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16966 }
16967 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
16968 default:
16969 llvm_unreachable("Unknown reduction operation.");
16970 }
16971 }
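  // Illustration of the two forms emitted above (hypothetical operands): for
  // RecurKind::UMin, UseSelect=true produces
  //   %c = icmp ult i32 %lhs, %rhs
  //   %r = select i1 %c, i32 %lhs, i32 %rhs
  // while UseSelect=false produces the intrinsic form
  //   %r = call i32 @llvm.umin.i32(i32 %lhs, i32 %rhs)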
16972
16973 /// Creates reduction operation with the current opcode with the IR flags
16974 /// from \p ReductionOps, dropping nuw/nsw flags.
16975 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
16976 Value *RHS, const Twine &Name,
16977 const ReductionOpsListType &ReductionOps) {
16978 bool UseSelect = ReductionOps.size() == 2 ||
16979 // Logical or/and.
16980 (ReductionOps.size() == 1 &&
16981 any_of(ReductionOps.front(), IsaPred<SelectInst>));
16982 assert((!UseSelect || ReductionOps.size() != 2 ||
16983 isa<SelectInst>(ReductionOps[1][0])) &&
16984 "Expected cmp + select pairs for reduction");
16985 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
16986 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
16987 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
16988 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
16989 /*IncludeWrapFlags=*/false);
16990 propagateIRFlags(Op, ReductionOps[1], nullptr,
16991 /*IncludeWrapFlags=*/false);
16992 return Op;
16993 }
16994 }
16995 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
16996 return Op;
16997 }
16998
16999public:
17000 static RecurKind getRdxKind(Value *V) {
17001 auto *I = dyn_cast<Instruction>(V);
17002 if (!I)
17003 return RecurKind::None;
17004 if (match(I, m_Add(m_Value(), m_Value())))
17005 return RecurKind::Add;
17006 if (match(I, m_Mul(m_Value(), m_Value())))
17007 return RecurKind::Mul;
17008 if (match(I, m_And(m_Value(), m_Value())) ||
17009 match(I, m_LogicalAnd(m_Value(), m_Value())))
17010 return RecurKind::And;
17011 if (match(I, m_Or(m_Value(), m_Value())) ||
17012 match(I, m_LogicalOr(m_Value(), m_Value())))
17013 return RecurKind::Or;
17014 if (match(I, m_Xor(m_Value(), m_Value())))
17015 return RecurKind::Xor;
17016 if (match(I, m_FAdd(m_Value(), m_Value())))
17017 return RecurKind::FAdd;
17018 if (match(I, m_FMul(m_Value(), m_Value())))
17019 return RecurKind::FMul;
17020
17021 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
17022 return RecurKind::FMax;
17023 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
17024 return RecurKind::FMin;
17025
17026 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
17027 return RecurKind::FMaximum;
17028 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
17029 return RecurKind::FMinimum;
17030 // This matches either cmp+select or intrinsics. SLP is expected to handle
17031 // either form.
17032 // TODO: If we are canonicalizing to intrinsics, we can remove several
17033 // special-case paths that deal with selects.
17034 if (match(I, m_SMax(m_Value(), m_Value())))
17035 return RecurKind::SMax;
17036 if (match(I, m_SMin(m_Value(), m_Value())))
17037 return RecurKind::SMin;
17038 if (match(I, m_UMax(m_Value(), m_Value())))
17039 return RecurKind::UMax;
17040 if (match(I, m_UMin(m_Value(), m_Value())))
17041 return RecurKind::UMin;
17042
17043 if (auto *Select = dyn_cast<SelectInst>(I)) {
17044 // Try harder: look for min/max pattern based on instructions producing
17045 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
17046 // During the intermediate stages of SLP, it's very common to have
17047 // pattern like this (since optimizeGatherSequence is run only once
17048 // at the end):
17049 // %1 = extractelement <2 x i32> %a, i32 0
17050 // %2 = extractelement <2 x i32> %a, i32 1
17051 // %cond = icmp sgt i32 %1, %2
17052 // %3 = extractelement <2 x i32> %a, i32 0
17053 // %4 = extractelement <2 x i32> %a, i32 1
17054 // %select = select i1 %cond, i32 %3, i32 %4
17055 CmpInst::Predicate Pred;
17056 Instruction *L1;
17057 Instruction *L2;
17058
17059 Value *LHS = Select->getTrueValue();
17060 Value *RHS = Select->getFalseValue();
17061 Value *Cond = Select->getCondition();
17062
17063 // TODO: Support inverse predicates.
17064 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
17065 if (!isa<ExtractElementInst>(RHS) ||
17066 !L2->isIdenticalTo(cast<Instruction>(RHS)))
17067 return RecurKind::None;
17068 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
17069 if (!isa<ExtractElementInst>(LHS) ||
17070 !L1->isIdenticalTo(cast<Instruction>(LHS)))
17071 return RecurKind::None;
17072 } else {
17073 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
17074 return RecurKind::None;
17075 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
17076 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
17077 !L2->isIdenticalTo(cast<Instruction>(RHS)))
17078 return RecurKind::None;
17079 }
17080
17081 switch (Pred) {
17082 default:
17083 return RecurKind::None;
17084 case CmpInst::ICMP_SGT:
17085 case CmpInst::ICMP_SGE:
17086 return RecurKind::SMax;
17087 case CmpInst::ICMP_SLT:
17088 case CmpInst::ICMP_SLE:
17089 return RecurKind::SMin;
17090 case CmpInst::ICMP_UGT:
17091 case CmpInst::ICMP_UGE:
17092 return RecurKind::UMax;
17093 case CmpInst::ICMP_ULT:
17094 case CmpInst::ICMP_ULE:
17095 return RecurKind::UMin;
17096 }
17097 }
17098 return RecurKind::None;
17099 }
17100
17101 /// Get the index of the first operand.
17102 static unsigned getFirstOperandIndex(Instruction *I) {
17103 return isCmpSelMinMax(I) ? 1 : 0;
17104 }
17105
17106private:
17107 /// Total number of operands in the reduction operation.
17108 static unsigned getNumberOfOperands(Instruction *I) {
17109 return isCmpSelMinMax(I) ? 3 : 2;
17110 }
17111
17112 /// Checks if the instruction is in basic block \p BB.
17113 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
17114 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
17115 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
17116 auto *Sel = cast<SelectInst>(I);
17117 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
17118 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
17119 }
17120 return I->getParent() == BB;
17121 }
17122
17123 /// Expected number of uses for reduction operations/reduced values.
17124 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
17125 if (IsCmpSelMinMax) {
17126 // SelectInst must be used twice while the condition op must have single
17127 // use only.
17128 if (auto *Sel = dyn_cast<SelectInst>(I))
17129 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
17130 return I->hasNUses(2);
17131 }
17132
17133 // Arithmetic reduction operation must be used once only.
17134 return I->hasOneUse();
17135 }
17136
17137 /// Initializes the list of reduction operations.
17138 void initReductionOps(Instruction *I) {
17139 if (isCmpSelMinMax(I))
17140 ReductionOps.assign(2, ReductionOpsType());
17141 else
17142 ReductionOps.assign(1, ReductionOpsType());
17143 }
17144
17145 /// Add all reduction operations for the reduction instruction \p I.
17146 void addReductionOps(Instruction *I) {
17147 if (isCmpSelMinMax(I)) {
17148 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
17149 ReductionOps[1].emplace_back(I);
17150 } else {
17151 ReductionOps[0].emplace_back(I);
17152 }
17153 }
17154
17155 static bool isGoodForReduction(ArrayRef<Value *> Data) {
17156 int Sz = Data.size();
17157 auto *I = dyn_cast<Instruction>(Data.front());
17158 return Sz > 1 || isConstant(Data.front()) ||
17159 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
17160 }
17161
17162public:
17163 HorizontalReduction() = default;
17164
17165 /// Try to find a reduction tree.
17166 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
17167 ScalarEvolution &SE, const DataLayout &DL,
17168 const TargetLibraryInfo &TLI) {
17169 RdxKind = HorizontalReduction::getRdxKind(Root);
17170 if (!isVectorizable(RdxKind, Root))
17171 return false;
17172
17173 // Analyze "regular" integer/FP types for reductions - no target-specific
17174 // types or pointers.
17175 Type *Ty = Root->getType();
17176 if (!isValidElementType(Ty) || Ty->isPointerTy())
17177 return false;
17178
17179 // Though the ultimate reduction may have multiple uses, its condition must
17180 // have only single use.
17181 if (auto *Sel = dyn_cast<SelectInst>(Root))
17182 if (!Sel->getCondition()->hasOneUse())
17183 return false;
17184
17185 ReductionRoot = Root;
17186
17187 // Iterate through all the operands of the possible reduction tree and
17188 // gather all the reduced values, sorting them by their value id.
17189 BasicBlock *BB = Root->getParent();
17190 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
17191 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
17192 1, std::make_pair(Root, 0));
17193 // Checks if the operands of the \p TreeN instruction are also reduction
17194 // operations or should be treated as reduced values or an extra argument,
17195 // which is not part of the reduction.
17196 auto CheckOperands = [&](Instruction *TreeN,
17197 SmallVectorImpl<Value *> &PossibleReducedVals,
17198 SmallVectorImpl<Instruction *> &ReductionOps,
17199 unsigned Level) {
17200 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
17201 getNumberOfOperands(TreeN)))) {
17202 Value *EdgeVal = getRdxOperand(TreeN, I);
17203 ReducedValsToOps[EdgeVal].push_back(TreeN);
17204 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
17205 // If the edge is not an instruction, or it differs from the main
17206 // reduction opcode or has too many uses, treat it as a possible reduced value.
17207 // Also, do not try to reduce constant values if the operation is not
17208 // foldable.
17209 if (!EdgeInst || Level > RecursionMaxDepth ||
17210 getRdxKind(EdgeInst) != RdxKind ||
17211 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
17212 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
17213 !isVectorizable(RdxKind, EdgeInst) ||
17214 (R.isAnalyzedReductionRoot(EdgeInst) &&
17215 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
17216 PossibleReducedVals.push_back(EdgeVal);
17217 continue;
17218 }
17219 ReductionOps.push_back(EdgeInst);
17220 }
17221 };
17222 // Try to regroup reduced values so that it gets more profitable to try to
17223 // reduce them. Values are grouped by their value ids, instructions by
17224 // instruction op id and/or alternate op id, plus extra analysis is done for
17225 // loads (grouping them by the distance between pointers) and cmp
17226 // instructions (grouping them by the predicate).
17227 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
17228 PossibleReducedVals;
17229 initReductionOps(Root);
17230 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
17231 SmallSet<size_t, 2> LoadKeyUsed;
17232
17233 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
17234 Key = hash_combine(hash_value(LI->getParent()), Key);
17235 Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
17236 if (LoadKeyUsed.contains(Key)) {
17237 auto LIt = LoadsMap.find(Ptr);
17238 if (LIt != LoadsMap.end()) {
17239 for (LoadInst *RLI : LIt->second) {
17240 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
17241 LI->getType(), LI->getPointerOperand(), DL, SE,
17242 /*StrictCheck=*/true))
17243 return hash_value(RLI->getPointerOperand());
17244 }
17245 for (LoadInst *RLI : LIt->second) {
17246 if (arePointersCompatible(RLI->getPointerOperand(),
17247 LI->getPointerOperand(), TLI)) {
17248 hash_code SubKey = hash_value(RLI->getPointerOperand());
17249 return SubKey;
17250 }
17251 }
17252 if (LIt->second.size() > 2) {
17253 hash_code SubKey =
17254 hash_value(LIt->second.back()->getPointerOperand());
17255 return SubKey;
17256 }
17257 }
17258 }
17259 LoadKeyUsed.insert(Key);
17260 LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
17261 return hash_value(LI->getPointerOperand());
17262 };
17263
17264 while (!Worklist.empty()) {
17265 auto [TreeN, Level] = Worklist.pop_back_val();
17266 SmallVector<Value *> PossibleRedVals;
17267 SmallVector<Instruction *> PossibleReductionOps;
17268 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
17269 addReductionOps(TreeN);
17270 // Add reduction values. The values are sorted for better vectorization
17271 // results.
17272 for (Value *V : PossibleRedVals) {
17273 size_t Key, Idx;
17274 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
17275 /*AllowAlternate=*/false);
17276 ++PossibleReducedVals[Key][Idx]
17277 .insert(std::make_pair(V, 0))
17278 .first->second;
17279 }
17280 for (Instruction *I : reverse(PossibleReductionOps))
17281 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
17282 }
17283 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
17284 // Sort values by the total number of value kinds to start the reduction
17285 // from the longest possible sequences of reduced values.
17286 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
17287 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
17288 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
17289 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
17290 It != E; ++It) {
17291 PossibleRedValsVect.emplace_back();
17292 auto RedValsVect = It->second.takeVector();
17293 stable_sort(RedValsVect, llvm::less_second());
17294 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
17295 PossibleRedValsVect.back().append(Data.second, Data.first);
17296 }
17297 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
17298 return P1.size() > P2.size();
17299 });
17300 int NewIdx = -1;
17301 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
17302 if (NewIdx < 0 ||
17303 (!isGoodForReduction(Data) &&
17304 (!isa<LoadInst>(Data.front()) ||
17305 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
17306 getUnderlyingObject(
17307 cast<LoadInst>(Data.front())->getPointerOperand()) !=
17308 getUnderlyingObject(
17309 cast<LoadInst>(ReducedVals[NewIdx].front())
17310 ->getPointerOperand())))) {
17311 NewIdx = ReducedVals.size();
17312 ReducedVals.emplace_back();
17313 }
17314 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
17315 }
17316 }
17317 // Sort the reduced values by number of same/alternate opcode and/or pointer
17318 // operand.
17319 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
17320 return P1.size() > P2.size();
17321 });
17322 return true;
17323 }
17324
17325 /// Attempt to vectorize the tree found by matchAssociativeReduction.
17326 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
17327 const TargetLibraryInfo &TLI) {
17328 constexpr int ReductionLimit = 4;
17329 constexpr unsigned RegMaxNumber = 4;
17330 constexpr unsigned RedValsMaxNumber = 128;
17331 // If there are a sufficient number of reduction values, reduce
17332 // to a nearby power-of-2. We can safely generate oversized
17333 // vectors and rely on the backend to split them to legal sizes.
17334 unsigned NumReducedVals =
17335 std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
17336 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
17337 if (!isGoodForReduction(Vals))
17338 return Num;
17339 return Num + Vals.size();
17340 });
17341 if (NumReducedVals < ReductionLimit &&
17342 (!AllowHorRdxIdenityOptimization ||
17343 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
17344 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
17345 }))) {
17346 for (ReductionOpsType &RdxOps : ReductionOps)
17347 for (Value *RdxOp : RdxOps)
17348 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17349 return nullptr;
17350 }
17351
17352 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
17353 TargetFolder(DL));
17354 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
17355
17356 // Track the reduced values in case if they are replaced by extractelement
17357 // because of the vectorization.
17358 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
17359 ReducedVals.front().size());
17360 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
17361
17362 // The compare instruction of a min/max is the insertion point for new
17363 // instructions and may be replaced with a new compare instruction.
17364 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
17365 assert(isa<SelectInst>(RdxRootInst) &&
17366 "Expected min/max reduction to have select root instruction");
17367 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
17368 assert(isa<Instruction>(ScalarCond) &&
17369 "Expected min/max reduction to have compare condition");
17370 return cast<Instruction>(ScalarCond);
17371 };
17372
17373 // Return new VectorizedTree, based on previous value.
17374 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
17375 if (VectorizedTree) {
17376 // Update the final value in the reduction.
17377 Builder.SetCurrentDebugLocation(
17378 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
17379 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
17380 (isGuaranteedNotToBePoison(Res) &&
17381 !isGuaranteedNotToBePoison(VectorizedTree))) {
17382 auto It = ReducedValsToOps.find(Res);
17383 if (It != ReducedValsToOps.end() &&
17384 any_of(It->getSecond(),
17385 [](Instruction *I) { return isBoolLogicOp(I); }))
17386 std::swap(VectorizedTree, Res);
17387 }
17388
17389 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
17390 ReductionOps);
17391 }
17392 // Initialize the final value in the reduction.
17393 return Res;
17394 };
17395 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
17396 return isBoolLogicOp(cast<Instruction>(V));
17397 });
17398 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
17399 ReductionOps.front().size());
17400 for (ReductionOpsType &RdxOps : ReductionOps)
17401 for (Value *RdxOp : RdxOps) {
17402 if (!RdxOp)
17403 continue;
17404 IgnoreList.insert(RdxOp);
17405 }
17406 // Intersect the fast-math-flags from all reduction operations.
17407 FastMathFlags RdxFMF;
17408 RdxFMF.set();
17409 for (Value *U : IgnoreList)
17410 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
17411 RdxFMF &= FPMO->getFastMathFlags();
17412 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
17413
17414 // Need to track reduced vals, they may be changed during vectorization of
17415 // subvectors.
17416 for (ArrayRef<Value *> Candidates : ReducedVals)
17417 for (Value *V : Candidates)
17418 TrackedVals.try_emplace(V, V);
17419
17420 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
17421 // List of the values that were reduced in other trees as part of gather
17422 // nodes and thus requiring extract if fully vectorized in other trees.
17423 SmallPtrSet<Value *, 4> RequiredExtract;
17424 Value *VectorizedTree = nullptr;
17425 bool CheckForReusedReductionOps = false;
17426 // Try to vectorize elements based on their type.
17427 SmallVector<InstructionsState> States;
17428 for (ArrayRef<Value *> RV : ReducedVals)
17429 States.push_back(getSameOpcode(RV, TLI));
17430 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
17431 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
17432 InstructionsState S = States[I];
17433 SmallVector<Value *> Candidates;
17434 Candidates.reserve(2 * OrigReducedVals.size());
17435 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
17436 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
17437 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
17438 // Check if the reduction value was not overridden by the extractelement
17439 // instruction because of the vectorization and exclude it, if it is not
17440 // compatible with other values.
17441 // Also check if the instruction was folded to constant/other value.
17442 auto *Inst = dyn_cast<Instruction>(RdxVal);
17443 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
17444 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
17445 (S.getOpcode() && !Inst))
17446 continue;
17447 Candidates.push_back(RdxVal);
17448 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
17449 }
17450 bool ShuffledExtracts = false;
17451 // Try to handle shuffled extractelements.
17452 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
17453 I + 1 < E) {
17454 SmallVector<Value *> CommonCandidates(Candidates);
17455 for (Value *RV : ReducedVals[I + 1]) {
17456 Value *RdxVal = TrackedVals.find(RV)->second;
17457 // Check if the reduction value was not overridden by the
17458 // extractelement instruction because of the vectorization and
17459 // exclude it, if it is not compatible with other values.
17460 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
17461 if (!Inst)
17462 continue;
17463 CommonCandidates.push_back(RdxVal);
17464 TrackedToOrig.try_emplace(RdxVal, RV);
17465 }
17466 SmallVector<int> Mask;
17467 if (isFixedVectorShuffle(CommonCandidates, Mask)) {
17468 ++I;
17469 Candidates.swap(CommonCandidates);
17470 ShuffledExtracts = true;
17471 }
17472 }
17473
17474 // Emit code for constant values.
17475 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
17476 allConstant(Candidates)) {
17477 Value *Res = Candidates.front();
17478 ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
17479 for (Value *VC : ArrayRef(Candidates).drop_front()) {
17480 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
17481 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
17482 if (auto *ResI = dyn_cast<Instruction>(Res))
17483 V.analyzedReductionRoot(ResI);
17484 }
17485 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17486 continue;
17487 }
17488
17489 unsigned NumReducedVals = Candidates.size();
17490 if (NumReducedVals < ReductionLimit &&
17491 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
17492 !isSplat(Candidates)))
17493 continue;
17494
17495 // Check if we support repeated scalar values processing (optimization of
17496 // original scalar identity operations on matched horizontal reductions).
17497 IsSupportedHorRdxIdentityOp =
17498 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
17499 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17500 // Gather same values.
17501 MapVector<Value *, unsigned> SameValuesCounter;
17502 if (IsSupportedHorRdxIdentityOp)
17503 for (Value *V : Candidates)
17504 ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
17505 // Used to check if the reduced values are used the same number of times. In this
17506 // case the compiler may produce better code. E.g. if reduced values are
17507 // aabbccdd (8 x values), then the first node of the tree will have a node
17508 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
17509 // Plus, the final reduction will be performed on <8 x aabbccdd>.
17510 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
17511 // x abcd) * 2.
17512 // Currently it only handles add/fadd/xor. and/or/min/max do not require
17513 // this analysis, other operations may require an extra estimation of
17514 // the profitability.
17515 bool SameScaleFactor = false;
17516 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17517 SameValuesCounter.size() != Candidates.size();
17518 if (OptReusedScalars) {
17519 SameScaleFactor =
17520 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17521 RdxKind == RecurKind::Xor) &&
17522 all_of(drop_begin(SameValuesCounter),
17523 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
17524 return P.second == SameValuesCounter.front().second;
17525 });
17526 Candidates.resize(SameValuesCounter.size());
17527 transform(SameValuesCounter, Candidates.begin(),
17528 [](const auto &P) { return P.first; });
17529 NumReducedVals = Candidates.size();
17530 // Have a reduction of the same element.
17531 if (NumReducedVals == 1) {
17532 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
17533 unsigned Cnt = SameValuesCounter.lookup(OrigV);
17534 Value *RedVal =
17535 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
17536 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17537 VectorizedVals.try_emplace(OrigV, Cnt);
17538 continue;
17539 }
17540 }
17541
17542 unsigned MaxVecRegSize = V.getMaxVecRegSize();
17543 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
17544 unsigned MaxElts =
17545 RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
17546
17547 unsigned ReduxWidth = std::min<unsigned>(
17548 llvm::bit_floor(NumReducedVals),
17549 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
17550 RegMaxNumber * RedValsMaxNumber));
17551 unsigned Start = 0;
17552 unsigned Pos = Start;
17553 // Restarts vectorization attempt with lower vector factor.
17554 unsigned PrevReduxWidth = ReduxWidth;
17555 bool CheckForReusedReductionOpsLocal = false;
17556 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17557 &CheckForReusedReductionOpsLocal,
17558 &PrevReduxWidth, &V,
17559 &IgnoreList](bool IgnoreVL = false) {
17560 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
17561 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
17562 // Check if any of the reduction ops are gathered. If so, it is worth
17563 // trying again with a smaller number of reduction ops.
17564 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17565 }
17566 ++Pos;
17567 if (Pos < NumReducedVals - ReduxWidth + 1)
17568 return IsAnyRedOpGathered;
17569 Pos = Start;
17570 ReduxWidth /= 2;
17571 return IsAnyRedOpGathered;
17572 };
17573 bool AnyVectorized = false;
17574 while (Pos < NumReducedVals - ReduxWidth + 1 &&
17575 ReduxWidth >= ReductionLimit) {
17576 // Dependency in tree of the reduction ops - drop this attempt, try
17577 // later.
17578 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17579 Start == 0) {
17580 CheckForReusedReductionOps = true;
17581 break;
17582 }
17583 PrevReduxWidth = ReduxWidth;
17584 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
17585 // Being analyzed already - skip.
17586 if (V.areAnalyzedReductionVals(VL)) {
17587 (void)AdjustReducedVals(/*IgnoreVL=*/true);
17588 continue;
17589 }
17590 // Early exit if any of the reduction values were deleted during
17591 // previous vectorization attempts.
17592 if (any_of(VL, [&V](Value *RedVal) {
17593 auto *RedValI = dyn_cast<Instruction>(RedVal);
17594 if (!RedValI)
17595 return false;
17596 return V.isDeleted(RedValI);
17597 }))
17598 break;
17599 V.buildTree(VL, IgnoreList);
17600 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
17601 if (!AdjustReducedVals())
17602 V.analyzedReductionVals(VL);
17603 continue;
17604 }
17605 if (V.isLoadCombineReductionCandidate(RdxKind)) {
17606 if (!AdjustReducedVals())
17607 V.analyzedReductionVals(VL);
17608 continue;
17609 }
17610 V.reorderTopToBottom();
17611 // No need to reorder the root node at all.
17612 V.reorderBottomToTop(/*IgnoreReorder=*/true);
17613 // Keep extracted other reduction values, if they are used in the
17614 // vectorization trees.
17615 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues;
17616 // The reduction root is used as the insertion point for new
17617 // instructions, so set it as externally used to prevent it from being
17618 // deleted.
17619 LocalExternallyUsedValues[ReductionRoot];
17620 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
17621 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
17622 continue;
17623 for (Value *V : ReducedVals[Cnt])
17624 if (isa<Instruction>(V))
17625 LocalExternallyUsedValues[TrackedVals[V]];
17626 }
17627 if (!IsSupportedHorRdxIdentityOp) {
17628 // Number of uses of the candidates in the vector of values.
17629 assert(SameValuesCounter.empty() &&
17630 "Reused values counter map is not empty");
17631 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17632 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17633 continue;
17634 Value *V = Candidates[Cnt];
17635 Value *OrigV = TrackedToOrig.find(V)->second;
17636 ++SameValuesCounter[OrigV];
17637 }
17638 }
17639 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
17640 // Gather externally used values.
17641 SmallPtrSet<Value *, 4> Visited;
17642 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17643 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17644 continue;
17645 Value *RdxVal = Candidates[Cnt];
17646 if (!Visited.insert(RdxVal).second)
17647 continue;
17648 // Check if the scalar was vectorized as part of the vectorization
17649 // tree but not the top node.
17650 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
17651 LocalExternallyUsedValues[RdxVal];
17652 continue;
17653 }
17654 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17655 unsigned NumOps =
17656 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
17657 if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
17658 LocalExternallyUsedValues[RdxVal];
17659 }
17660 // Do not need the list of reused scalars in regular mode anymore.
17661 if (!IsSupportedHorRdxIdentityOp)
17662 SameValuesCounter.clear();
17663 for (Value *RdxVal : VL)
17664 if (RequiredExtract.contains(RdxVal))
17665 LocalExternallyUsedValues[RdxVal];
17666 V.buildExternalUses(LocalExternallyUsedValues);
17667
17668 V.computeMinimumValueSizes();
17669 V.transformNodes();
17670
17671 // Estimate cost.
17672 InstructionCost TreeCost = V.getTreeCost(VL);
17673 InstructionCost ReductionCost =
17674 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
17675 InstructionCost Cost = TreeCost + ReductionCost;
17676 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
17677 << " for reduction\n");
17678 if (!Cost.isValid())
17679 break;
17680 if (Cost >= -SLPCostThreshold) {
17681 V.getORE()->emit([&]() {
17682 return OptimizationRemarkMissed(
17683 SV_NAME, "HorSLPNotBeneficial",
17684 ReducedValsToOps.find(VL[0])->second.front())
17685 << "Vectorizing horizontal reduction is possible "
17686 << "but not beneficial with cost " << ore::NV("Cost", Cost)
17687 << " and threshold "
17688 << ore::NV("Threshold", -SLPCostThreshold);
17689 });
17690 if (!AdjustReducedVals())
17691 V.analyzedReductionVals(VL);
17692 continue;
17693 }
17694
17695 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
17696 << Cost << ". (HorRdx)\n");
17697 V.getORE()->emit([&]() {
17698 return OptimizationRemark(
17699 SV_NAME, "VectorizedHorizontalReduction",
17700 ReducedValsToOps.find(VL[0])->second.front())
17701 << "Vectorized horizontal reduction with cost "
17702 << ore::NV("Cost", Cost) << " and with tree size "
17703 << ore::NV("TreeSize", V.getTreeSize());
17704 });
17705
17706 Builder.setFastMathFlags(RdxFMF);
17707
17708 // Emit a reduction. If the root is a select (min/max idiom), the insert
17709 // point is the compare condition of that select.
17710 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
17711 Instruction *InsertPt = RdxRootInst;
17712 if (IsCmpSelMinMax)
17713 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17714
17715 // Vectorize a tree.
17716 Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
17717 ReplacedExternals, InsertPt);
17718
17719 Builder.SetInsertPoint(InsertPt);
17720
17721 // To prevent poison from leaking across what used to be sequential,
17722 // safe, scalar boolean logic operations, the reduction operand must be
17723 // frozen.
17724 if ((isBoolLogicOp(RdxRootInst) ||
17725 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17726 !isGuaranteedNotToBePoison(VectorizedRoot))
17727 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
17728
17729 // Emit code to correctly handle reused reduced values, if required.
17730 if (OptReusedScalars && !SameScaleFactor) {
17731 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
17732 SameValuesCounter, TrackedToOrig);
17733 }
17734
17735 Value *ReducedSubTree =
17736 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
17737 if (ReducedSubTree->getType() != VL.front()->getType()) {
17738 assert(ReducedSubTree->getType() != VL.front()->getType() &&
17739 "Expected different reduction type.");
17740 ReducedSubTree =
17741 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
17742 V.isSignedMinBitwidthRootNode());
17743 }
17744
17745 // Improved analysis for add/fadd/xor reductions with same scale factor
17746 // for all operands of reductions. We can emit scalar ops for them
17747 // instead.
17748 if (OptReusedScalars && SameScaleFactor)
17749 ReducedSubTree = emitScaleForReusedOps(
17750 ReducedSubTree, Builder, SameValuesCounter.front().second);
17751
17752 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17753 // Count vectorized reduced values to exclude them from final reduction.
17754 for (Value *RdxVal : VL) {
17755 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17756 if (IsSupportedHorRdxIdentityOp) {
17757 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17758 continue;
17759 }
17760 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17761 if (!V.isVectorized(RdxVal))
17762 RequiredExtract.insert(RdxVal);
17763 }
17764 Pos += ReduxWidth;
17765 Start = Pos;
17766 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
17767 AnyVectorized = true;
17768 }
17769 if (OptReusedScalars && !AnyVectorized) {
17770 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
17771 Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
17772 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17773 Value *OrigV = TrackedToOrig.find(P.first)->second;
17774 VectorizedVals.try_emplace(OrigV, P.second);
17775 }
17776 continue;
17777 }
17778 }
17779 if (VectorizedTree) {
17780 // Reorder operands of bool logical op in the natural order to avoid
17781 // possible problem with poison propagation. If not possible to reorder
17782 // (both operands are originally RHS), emit an extra freeze instruction
17783 // for the LHS operand.
17784 // I.e., if we have original code like this:
17785 // RedOp1 = select i1 ?, i1 LHS, i1 false
17786 // RedOp2 = select i1 RHS, i1 ?, i1 false
17787
17788 // Then, we swap LHS/RHS to create a new op that matches the poison
17789 // semantics of the original code.
17790
17791 // If we have original code like this and both values could be poison:
17792 // RedOp1 = select i1 ?, i1 LHS, i1 false
17793 // RedOp2 = select i1 ?, i1 RHS, i1 false
17794
17795 // Then, we must freeze LHS in the new op.
17796 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
17797 Instruction *RedOp1,
17798 Instruction *RedOp2,
17799 bool InitStep) {
17800 if (!AnyBoolLogicOp)
17801 return;
17802 if (isBoolLogicOp(RedOp1) &&
17803 ((!InitStep && LHS == VectorizedTree) ||
17804 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
17805 return;
17806 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
17807 getRdxOperand(RedOp2, 0) == RHS ||
17808 isGuaranteedNotToBePoison(RHS))) {
17809 std::swap(LHS, RHS);
17810 return;
17811 }
17812 if (LHS != VectorizedTree)
17813 LHS = Builder.CreateFreeze(LHS);
17814 };
17815 // Finish the reduction.
17816 // Need to add extra arguments and possible reduction values that were not
17817 // vectorized.
17818 // Try to avoid dependencies between the scalar remainders after
17819 // reductions.
17820 auto FinalGen =
17821 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
17822 bool InitStep) {
17823 unsigned Sz = InstVals.size();
17824 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
17825 Sz % 2);
17826 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
17827 Instruction *RedOp = InstVals[I + 1].first;
17828 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
17829 Value *RdxVal1 = InstVals[I].second;
17830 Value *StableRdxVal1 = RdxVal1;
17831 auto It1 = TrackedVals.find(RdxVal1);
17832 if (It1 != TrackedVals.end())
17833 StableRdxVal1 = It1->second;
17834 Value *RdxVal2 = InstVals[I + 1].second;
17835 Value *StableRdxVal2 = RdxVal2;
17836 auto It2 = TrackedVals.find(RdxVal2);
17837 if (It2 != TrackedVals.end())
17838 StableRdxVal2 = It2->second;
17839 // To prevent poison from leaking across what used to be
17840 // sequential, safe, scalar boolean logic operations, the
17841 // reduction operand must be frozen.
17842 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
17843 RedOp, InitStep);
17844 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17845 StableRdxVal2, "op.rdx", ReductionOps);
17846 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
17847 }
17848 if (Sz % 2 == 1)
17849 ExtraReds[Sz / 2] = InstVals.back();
17850 return ExtraReds;
17851 };
17852 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
17853 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
17854 VectorizedTree);
17855 SmallPtrSet<Value *, 4> Visited;
17856 for (ArrayRef<Value *> Candidates : ReducedVals) {
17857 for (Value *RdxVal : Candidates) {
17858 if (!Visited.insert(RdxVal).second)
17859 continue;
17860 unsigned NumOps = VectorizedVals.lookup(RdxVal);
17861 for (Instruction *RedOp :
17862 ArrayRef(ReducedValsToOps.find(RdxVal)->second)
17863 .drop_back(NumOps))
17864 ExtraReductions.emplace_back(RedOp, RdxVal);
17865 }
17866 }
17867 // Iterate through all not-vectorized reduction values/extra arguments.
17868 bool InitStep = true;
17869 while (ExtraReductions.size() > 1) {
17870 SmallVector<std::pair<Instruction *, Value *>> NewReds =
17871 FinalGen(ExtraReductions, InitStep);
17872 ExtraReductions.swap(NewReds);
17873 InitStep = false;
17874 }
17875 VectorizedTree = ExtraReductions.front().second;
17876
17877 ReductionRoot->replaceAllUsesWith(VectorizedTree);
17878
17879 // The original scalar reduction is expected to have no remaining
17880 // uses outside the reduction tree itself. Assert that we got this
17881 // correct, replace internal uses with undef, and mark for eventual
17882 // deletion.
17883#ifndef NDEBUG
17884 SmallSet<Value *, 4> IgnoreSet;
17885 for (ArrayRef<Value *> RdxOps : ReductionOps)
17886 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
17887#endif
17888 for (ArrayRef<Value *> RdxOps : ReductionOps) {
17889 for (Value *Ignore : RdxOps) {
17890 if (!Ignore)
17891 continue;
17892#ifndef NDEBUG
17893 for (auto *U : Ignore->users()) {
17894 assert(IgnoreSet.count(U) &&
17895 "All users must be in the reduction ops list.");
17896 }
17897#endif
17898 if (!Ignore->use_empty()) {
17899 Value *P = PoisonValue::get(Ignore->getType());
17900 Ignore->replaceAllUsesWith(P);
17901 }
17902 }
17903 V.removeInstructionsAndOperands(RdxOps);
17904 }
17905 } else if (!CheckForReusedReductionOps) {
17906 for (ReductionOpsType &RdxOps : ReductionOps)
17907 for (Value *RdxOp : RdxOps)
17908 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17909 }
17910 return VectorizedTree;
17911 }
17912
17913private:
17914 /// Calculate the cost of a reduction.
17915 InstructionCost getReductionCost(TargetTransformInfo *TTI,
17916 ArrayRef<Value *> ReducedVals,
17917 bool IsCmpSelMinMax, unsigned ReduxWidth,
17918 FastMathFlags FMF) {
17919 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17920 Type *ScalarTy = ReducedVals.front()->getType();
17921 FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
17922 InstructionCost VectorCost = 0, ScalarCost;
17923 // If all of the reduced values are constant, the vector cost is 0, since
17924 // the reduction value can be calculated at compile time.
17925 bool AllConsts = allConstant(ReducedVals);
17926 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
17927 InstructionCost Cost;
17928 // Scalar cost is repeated for N-1 elements.
17929 int Cnt = ReducedVals.size();
17930 for (Value *RdxVal : ReducedVals) {
17931 if (Cnt == 1)
17932 break;
17933 --Cnt;
17934 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
17935 Cost += GenCostFn();
17936 continue;
17937 }
17938 InstructionCost ScalarCost = 0;
17939 for (User *U : RdxVal->users()) {
17940 auto *RdxOp = cast<Instruction>(U);
17941 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17942 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
17943 continue;
17944 }
17945 ScalarCost = InstructionCost::getInvalid();
17946 break;
17947 }
17948 if (ScalarCost.isValid())
17949 Cost += ScalarCost;
17950 else
17951 Cost += GenCostFn();
17952 }
17953 return Cost;
17954 };
17955 switch (RdxKind) {
17956 case RecurKind::Add:
17957 case RecurKind::Mul:
17958 case RecurKind::Or:
17959 case RecurKind::And:
17960 case RecurKind::Xor:
17961 case RecurKind::FAdd:
17962 case RecurKind::FMul: {
17963 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
17964 if (!AllConsts)
17965 VectorCost =
17966 TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
17967 ScalarCost = EvaluateScalarCost([&]() {
17968 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
17969 });
17970 break;
17971 }
17972 case RecurKind::FMax:
17973 case RecurKind::FMin:
17974 case RecurKind::FMaximum:
17975 case RecurKind::FMinimum:
17976 case RecurKind::SMax:
17977 case RecurKind::SMin:
17978 case RecurKind::UMax:
17979 case RecurKind::UMin: {
17980 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
17981 if (!AllConsts)
17982 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
17983 ScalarCost = EvaluateScalarCost([&]() {
17984 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
17985 return TTI->getIntrinsicInstrCost(ICA, CostKind);
17986 });
17987 break;
17988 }
17989 default:
17990 llvm_unreachable("Expected arithmetic or min/max reduction operation");
17991 }
17992
17993 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
17994 << " for reduction of " << shortBundleName(ReducedVals)
17995 << " (It is a splitting reduction)\n");
17996 return VectorCost - ScalarCost;
17997 }
17998
17999 /// Emit a horizontal reduction of the vectorized value.
18000 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
18001 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
18002 assert(VectorizedValue && "Need to have a vectorized tree node");
18003 assert(isPowerOf2_32(ReduxWidth) &&
18004 "We only handle power-of-two reductions for now");
18005 assert(RdxKind != RecurKind::FMulAdd &&
18006 "A call to the llvm.fmuladd intrinsic is not handled yet");
18007
18008 ++NumVectorInstructions;
18009 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
18010 }
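// Illustrative sketch (hypothetical value names) of the IR
// createSimpleTargetReduction is expected to produce for an integer add
// reduction of a <4 x i32> vectorized value:
//   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec.val)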
18011
18012 /// Emits optimized code for unique scalar value reused \p Cnt times.
18013 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
18014 unsigned Cnt) {
18015 assert(IsSupportedHorRdxIdentityOp &&
18016 "The optimization of matched scalar identity horizontal reductions "
18017 "must be supported.");
18018 if (Cnt == 1)
18019 return VectorizedValue;
18020 switch (RdxKind) {
18021 case RecurKind::Add: {
18022 // res = mul vv, n
18023 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
18024 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
18025 << VectorizedValue << ". (HorRdx)\n");
18026 return Builder.CreateMul(VectorizedValue, Scale);
18027 }
18028 case RecurKind::Xor: {
18029 // res = n % 2 ? vv : 0
18030 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
18031 << ". (HorRdx)\n");
18032 if (Cnt % 2 == 0)
18033 return Constant::getNullValue(VectorizedValue->getType());
18034 return VectorizedValue;
18035 }
18036 case RecurKind::FAdd: {
18037 // res = fmul v, n
18038 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
18039 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
18040 << VectorizedValue << ". (HorRdx)\n");
18041 return Builder.CreateFMul(VectorizedValue, Scale);
18042 }
18043 case RecurKind::And:
18044 case RecurKind::Or:
18045 case RecurKind::SMax:
18046 case RecurKind::SMin:
18047 case RecurKind::UMax:
18048 case RecurKind::UMin:
18049 case RecurKind::FMax:
18050 case RecurKind::FMin:
18051 case RecurKind::FMaximum:
18052 case RecurKind::FMinimum:
18053 // res = vv
18054 return VectorizedValue;
18055 case RecurKind::Mul:
18056 case RecurKind::FMul:
18057 case RecurKind::FMulAdd:
18058 case RecurKind::IAnyOf:
18059 case RecurKind::FAnyOf:
18060 case RecurKind::None:
18061 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
18062 }
18063 return nullptr;
18064 }
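// Illustrative examples for emitScaleForReusedOps (hypothetical values):
// for RecurKind::Add with Cnt == 4, the repeated scalar x + x + x + x is
// emitted as
//   %res = mul i32 %x, 4
// for RecurKind::Xor an even Cnt folds to 0 (x ^ x == 0), and min/max or
// and/or kinds simply return the vectorized value unchanged.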
18065
18066 /// Emits actual operation for the scalar identity values, found during
18067 /// horizontal reduction analysis.
18068 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
18069 BoUpSLP &R,
18070 const MapVector<Value *, unsigned> &SameValuesCounter,
18071 const DenseMap<Value *, Value *> &TrackedToOrig) {
18072 assert(IsSupportedHorRdxIdentityOp &&
18073 "The optimization of matched scalar identity horizontal reductions "
18074 "must be supported.");
18075 ArrayRef<Value *> VL = R.getRootNodeScalars();
18076 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
18077 if (VTy->getElementType() != VL.front()->getType()) {
18078 VectorizedValue = Builder.CreateIntCast(
18079 VectorizedValue,
18080 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
18081 R.isSignedMinBitwidthRootNode());
18082 }
18083 switch (RdxKind) {
18084 case RecurKind::Add: {
18085 // root = mul prev_root, <1, 1, n, 1>
18086 SmallVector<Constant *> Vals;
18087 for (Value *V : VL) {
18088 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
18089 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
18090 }
18091 auto *Scale = ConstantVector::get(Vals);
18092 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
18093 << VectorizedValue << ". (HorRdx)\n");
18094 return Builder.CreateMul(VectorizedValue, Scale);
18095 }
18096 case RecurKind::And:
18097 case RecurKind::Or:
18098 // No need for multiple or/and(s).
18099 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
18100 << ". (HorRdx)\n");
18101 return VectorizedValue;
18102 case RecurKind::SMax:
18103 case RecurKind::SMin:
18104 case RecurKind::UMax:
18105 case RecurKind::UMin:
18106 case RecurKind::FMax:
18107 case RecurKind::FMin:
18108 case RecurKind::FMaximum:
18109 case RecurKind::FMinimum:
18110 // No need for multiple min/max(s) of the same value.
18111 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
18112 << ". (HorRdx)\n");
18113 return VectorizedValue;
18114 case RecurKind::Xor: {
18115 // Replace values with even number of repeats with 0, since
18116 // x xor x = 0.
18117 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
18118 // 7>, if the 4th and 6th elements have an even number of repeats.
18119 SmallVector<int> Mask(
18120 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
18121 PoisonMaskElem);
18122 std::iota(Mask.begin(), Mask.end(), 0);
18123 bool NeedShuffle = false;
18124 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
18125 Value *V = VL[I];
18126 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
18127 if (Cnt % 2 == 0) {
18128 Mask[I] = VF;
18129 NeedShuffle = true;
18130 }
18131 }
18132 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
18133 : Mask) dbgs()
18134 << I << " ";
18135 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
18136 if (NeedShuffle)
18137 VectorizedValue = Builder.CreateShuffleVector(
18138 VectorizedValue,
18139 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
18140 return VectorizedValue;
18141 }
18142 case RecurKind::FAdd: {
18143 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
18144 SmallVector<Constant *> Vals;
18145 for (Value *V : VL) {
18146 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
18147 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
18148 }
18149 auto *Scale = ConstantVector::get(Vals);
18150 return Builder.CreateFMul(VectorizedValue, Scale);
18151 }
18152 case RecurKind::Mul:
18153 case RecurKind::FMul:
18154 case RecurKind::FMulAdd:
18155 case RecurKind::IAnyOf:
18156 case RecurKind::FAnyOf:
18157 case RecurKind::None:
18158 llvm_unreachable("Unexpected reduction kind for reused scalars.");
18159 }
18160 return nullptr;
18161 }
18162};
18163} // end anonymous namespace
18164
18165/// Gets recurrence kind from the specified value.
18166 static RecurKind getRdxKind(Value *V) {
18167 return HorizontalReduction::getRdxKind(V);
18168}
18169static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
18170 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
18171 return cast<FixedVectorType>(IE->getType())->getNumElements();
18172
18173 unsigned AggregateSize = 1;
18174 auto *IV = cast<InsertValueInst>(InsertInst);
18175 Type *CurrentType = IV->getType();
18176 do {
18177 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
18178 for (auto *Elt : ST->elements())
18179 if (Elt != ST->getElementType(0)) // check homogeneity
18180 return std::nullopt;
18181 AggregateSize *= ST->getNumElements();
18182 CurrentType = ST->getElementType(0);
18183 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
18184 AggregateSize *= AT->getNumElements();
18185 CurrentType = AT->getElementType();
18186 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
18187 AggregateSize *= VT->getNumElements();
18188 return AggregateSize;
18189 } else if (CurrentType->isSingleValueType()) {
18190 return AggregateSize;
18191 } else {
18192 return std::nullopt;
18193 }
18194 } while (true);
18195}
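// Illustrative results (hypothetical types): a homogeneous aggregate such as
// {<2 x float>, <2 x float>} yields an aggregate size of 4 (2 struct fields
// times 2 vector lanes), while a non-homogeneous struct like {float, i32}
// yields std::nullopt.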
18196
18197static void findBuildAggregate_rec(Instruction *LastInsertInst,
18198 TargetTransformInfo *TTI,
18199 SmallVectorImpl<Value *> &BuildVectorOpds,
18200 SmallVectorImpl<Value *> &InsertElts,
18201 unsigned OperandOffset) {
18202 do {
18203 Value *InsertedOperand = LastInsertInst->getOperand(1);
18204 std::optional<unsigned> OperandIndex =
18205 getElementIndex(LastInsertInst, OperandOffset);
18206 if (!OperandIndex)
18207 return;
18208 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
18209 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
18210 BuildVectorOpds, InsertElts, *OperandIndex);
18211
18212 } else {
18213 BuildVectorOpds[*OperandIndex] = InsertedOperand;
18214 InsertElts[*OperandIndex] = LastInsertInst;
18215 }
18216 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
18217 } while (LastInsertInst != nullptr &&
18218 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
18219 LastInsertInst->hasOneUse());
18220}
18221
18222/// Recognize construction of vectors like
18223/// %ra = insertelement <4 x float> poison, float %s0, i32 0
18224/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
18225/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
18226/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
18227/// starting from the last insertelement or insertvalue instruction.
18228///
18229/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
18230/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
18231/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
18232///
18233/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
18234///
18235/// \return true if it matches.
18236static bool findBuildAggregate(Instruction *LastInsertInst,
18237 TargetTransformInfo *TTI,
18238 SmallVectorImpl<Value *> &BuildVectorOpds,
18239 SmallVectorImpl<Value *> &InsertElts) {
18240
18241 assert((isa<InsertElementInst>(LastInsertInst) ||
18242 isa<InsertValueInst>(LastInsertInst)) &&
18243 "Expected insertelement or insertvalue instruction!");
18244
18245 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
18246 "Expected empty result vectors!");
18247
18248 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
18249 if (!AggregateSize)
18250 return false;
18251 BuildVectorOpds.resize(*AggregateSize);
18252 InsertElts.resize(*AggregateSize);
18253
18254 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
18255 llvm::erase(BuildVectorOpds, nullptr);
18256 llvm::erase(InsertElts, nullptr);
18257 if (BuildVectorOpds.size() >= 2)
18258 return true;
18259
18260 return false;
18261}
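// Illustrative result of findBuildAggregate on the insertelement chain shown
// in the comment above: BuildVectorOpds ends up as {%s0, %s1, %s2, %s3} and
// InsertElts as {%ra, %rb, %rc, %rd}, i.e. one scalar operand and one insert
// instruction per lane.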
18262
18263/// Try and get a reduction instruction from a phi node.
18264///
18265/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
18266/// if they come from either \p ParentBB or a containing loop latch.
18267///
18268/// \returns A candidate reduction value if possible, or \code nullptr \endcode
18269/// if not possible.
18270 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
18271 BasicBlock *ParentBB, LoopInfo *LI) {
18272 // There are situations where the reduction value is not dominated by the
18273 // reduction phi. Vectorizing such cases has been reported to cause
18274 // miscompiles. See PR25787.
18275 auto DominatedReduxValue = [&](Value *R) {
18276 return isa<Instruction>(R) &&
18277 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
18278 };
18279
18280 Instruction *Rdx = nullptr;
18281
18282 // Return the incoming value if it comes from the same BB as the phi node.
18283 if (P->getIncomingBlock(0) == ParentBB) {
18284 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18285 } else if (P->getIncomingBlock(1) == ParentBB) {
18286 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18287 }
18288
18289 if (Rdx && DominatedReduxValue(Rdx))
18290 return Rdx;
18291
18292 // Otherwise, check whether we have a loop latch to look at.
18293 Loop *BBL = LI->getLoopFor(ParentBB);
18294 if (!BBL)
18295 return nullptr;
18296 BasicBlock *BBLatch = BBL->getLoopLatch();
18297 if (!BBLatch)
18298 return nullptr;
18299
18300 // There is a loop latch, return the incoming value if it comes from
18301 // that. This reduction pattern occasionally turns up.
18302 if (P->getIncomingBlock(0) == BBLatch) {
18303 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18304 } else if (P->getIncomingBlock(1) == BBLatch) {
18305 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18306 }
18307
18308 if (Rdx && DominatedReduxValue(Rdx))
18309 return Rdx;
18310
18311 return nullptr;
18312}
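// Illustrative reduction-phi pattern handled above (hypothetical IR):
//   loop:
//     %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
//     %x = load i32, ptr %p
//     %sum.next = add i32 %sum, %x
// For the phi %sum, getReductionInstr returns %sum.next, since it comes from
// the same block (or the loop latch) and is dominated by the phi's block.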
18313
18314static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
18315 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
18316 return true;
18317 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
18318 return true;
18319 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
18320 return true;
18321 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
18322 return true;
18323 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
18324 return true;
18325 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
18326 return true;
18327 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
18328 return true;
18329 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
18330 return true;
18331 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
18332 return true;
18333 return false;
18334}
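// Illustrative matches for matchRdxBop (hypothetical IR): both a plain binop
//   %a = fadd fast float %x, %y
// and a min/max intrinsic such as
//   %m = call float @llvm.maxnum.f32(float %x, float %y)
// bind V0/V1 to %x/%y; anything else is rejected.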
18335
18336/// We could have an initial reduction that is not an add.
18337/// r *= v1 + v2 + v3 + v4
18338/// In such a case start looking for a tree rooted in the first '+'.
18339/// \Returns the new root if found, which may be nullptr if not an instruction.
18340 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
18341 Instruction *Root) {
18342 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
18343 isa<IntrinsicInst>(Root)) &&
18344 "Expected binop, select, or intrinsic for reduction matching");
18345 Value *LHS =
18346 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
18347 Value *RHS =
18348 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
18349 if (LHS == Phi)
18350 return dyn_cast<Instruction>(RHS);
18351 if (RHS == Phi)
18352 return dyn_cast<Instruction>(LHS);
18353 return nullptr;
18354}
18355
18356/// \p Returns the first operand of \p I that does not match \p Phi. If
18357 /// the operand is not an instruction, it returns nullptr.
18358 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
18359 Value *Op0 = nullptr;
18360 Value *Op1 = nullptr;
18361 if (!matchRdxBop(I, Op0, Op1))
18362 return nullptr;
18363 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
18364}
18365
18366/// \Returns true if \p I is a candidate instruction for reduction vectorization.
18367 static bool isReductionCandidate(Instruction *I) {
18368 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
18369 Value *B0 = nullptr, *B1 = nullptr;
18370 bool IsBinop = matchRdxBop(I, B0, B1);
18371 return IsBinop || IsSelect;
18372}
18373
18374bool SLPVectorizerPass::vectorizeHorReduction(
18375 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
18376 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
18377 if (!ShouldVectorizeHor)
18378 return false;
18379 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
18380
18381 if (Root->getParent() != BB || isa<PHINode>(Root))
18382 return false;
18383
18384 // If we can find a secondary reduction root, use that instead.
18385 auto SelectRoot = [&]() {
18386 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
18387 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
18388 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
18389 return NewRoot;
18390 return Root;
18391 };
18392
18393 // Start the analysis from the Root instruction. If a horizontal reduction is
18394 // found, try to vectorize it. If it is not a horizontal reduction or
18395 // vectorization is not possible or not effective, and currently analyzed
18396 // instruction is a binary operation, try to vectorize the operands, using
18397 // pre-order DFS traversal order. If the operands were not vectorized, repeat
18398 // the same procedure considering each operand as a possible root of the
18399 // horizontal reduction.
18400 // Interrupt the process if the Root instruction itself was vectorized or all
18401 // sub-trees no higher than RecursionMaxDepth were analyzed/vectorized.
18402 // If a horizontal reduction was not matched or vectorized, we collect
18403 // instructions for possible later vectorization attempts.
18404 std::queue<std::pair<Instruction *, unsigned>> Stack;
18405 Stack.emplace(SelectRoot(), 0);
18406 SmallPtrSet<Value *, 8> VisitedInstrs;
18407 bool Res = false;
18408 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
18409 if (R.isAnalyzedReductionRoot(Inst))
18410 return nullptr;
18411 if (!isReductionCandidate(Inst))
18412 return nullptr;
18413 HorizontalReduction HorRdx;
18414 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
18415 return nullptr;
18416 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
18417 };
18418 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
18419 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
18420 FutureSeed = getNonPhiOperand(Root, P);
18421 if (!FutureSeed)
18422 return false;
18423 }
18424 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
18425 // analysis is done separately.
18426 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
18427 PostponedInsts.push_back(FutureSeed);
18428 return true;
18429 };
18430
18431 while (!Stack.empty()) {
18432 Instruction *Inst;
18433 unsigned Level;
18434 std::tie(Inst, Level) = Stack.front();
18435 Stack.pop();
18436 // Do not try to analyze instruction that has already been vectorized.
18437 // This may happen when we vectorize instruction operands on a previous
18438 // iteration while stack was populated before that happened.
18439 if (R.isDeleted(Inst))
18440 continue;
18441 if (Value *VectorizedV = TryToReduce(Inst)) {
18442 Res = true;
18443 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
18444 // Try to find another reduction.
18445 Stack.emplace(I, Level);
18446 continue;
18447 }
18448 if (R.isDeleted(Inst))
18449 continue;
18450 } else {
18451 // We could not vectorize `Inst` so try to use it as a future seed.
18452 if (!TryAppendToPostponedInsts(Inst)) {
18453 assert(Stack.empty() && "Expected empty stack");
18454 break;
18455 }
18456 }
18457
18458 // Try to vectorize operands.
18459 // Continue analysis for the instruction from the same basic block only to
18460 // save compile time.
18461 if (++Level < RecursionMaxDepth)
18462 for (auto *Op : Inst->operand_values())
18463 if (VisitedInstrs.insert(Op).second)
18464 if (auto *I = dyn_cast<Instruction>(Op))
18465 // Do not try to vectorize CmpInst operands, this is done
18466 // separately.
18467 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
18468 !R.isDeleted(I) && I->getParent() == BB)
18469 Stack.emplace(I, Level);
18470 }
18471 return Res;
18472}
18473
18474bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
18475 BasicBlock *BB, BoUpSLP &R,
18476 TargetTransformInfo *TTI) {
18477 SmallVector<WeakTrackingVH> PostponedInsts;
18478 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
18479 Res |= tryToVectorize(PostponedInsts, R);
18480 return Res;
18481}
18482
18483bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
18484 BoUpSLP &R) {
18485 bool Res = false;
18486 for (Value *V : Insts)
18487 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
18488 Res |= tryToVectorize(Inst, R);
18489 return Res;
18490}
18491
18492bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18493 BasicBlock *BB, BoUpSLP &R,
18494 bool MaxVFOnly) {
18495 if (!R.canMapToVector(IVI->getType()))
18496 return false;
18497
18498 SmallVector<Value *, 16> BuildVectorOpds;
18499 SmallVector<Value *, 16> BuildVectorInsts;
18500 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
18501 return false;
18502
18503 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
18504 R.getORE()->emit([&]() {
18505 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
18506 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
18507 "trying reduction first.";
18508 });
18509 return false;
18510 }
18511 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
18512 // Aggregate value is unlikely to be processed in a vector register.
18513 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
18514}
18515
18516bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
18517 BasicBlock *BB, BoUpSLP &R,
18518 bool MaxVFOnly) {
18519 SmallVector<Value *, 16> BuildVectorInsts;
18520 SmallVector<Value *, 16> BuildVectorOpds;
18521 SmallVector<int> Mask;
18522 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
18523 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18524 isFixedVectorShuffle(BuildVectorOpds, Mask)))
18525 return false;
18526
18527 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
18528 R.getORE()->emit([&]() {
18529 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
18530 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
18531 "trying reduction first.";
18532 });
18533 return false;
18534 }
18535 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18536 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
18537}
18538
18539template <typename T>
18540 static bool tryToVectorizeSequence(
18541 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
18542 function_ref<bool(T *, T *)> AreCompatible,
18543 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
18544 bool MaxVFOnly, BoUpSLP &R) {
18545 bool Changed = false;
18546 // Sort by type, parent, operands.
18547 stable_sort(Incoming, Comparator);
18548
18549 // Try to vectorize elements based on their type.
18550 SmallVector<T *> Candidates;
18551 SmallVector<T *> VL;
18552 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
18553 VL.clear()) {
18554 // Look for the next elements with the same type, parent and operand
18555 // kinds.
18556 auto *I = dyn_cast<Instruction>(*IncIt);
18557 if (!I || R.isDeleted(I)) {
18558 ++IncIt;
18559 continue;
18560 }
18561 auto *SameTypeIt = IncIt;
18562 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
18563 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18564 AreCompatible(*SameTypeIt, *IncIt))) {
18565 auto *I = dyn_cast<Instruction>(*SameTypeIt);
18566 ++SameTypeIt;
18567 if (I && !R.isDeleted(I))
18568 VL.push_back(cast<T>(I));
18569 }
18570
18571 // Try to vectorize them.
18572 unsigned NumElts = VL.size();
18573 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
18574 << NumElts << ")\n");
18575 // The vectorization is a 3-stage attempt:
18576 // 1. Try to vectorize instructions with the same/alternate opcodes with the
18577 // size of the maximal register at first.
18578 // 2. Try to vectorize the remaining instructions with the same type, if
18579 // possible. This may give better vectorization results than trying to
18580 // vectorize only instructions with the same/alternate opcodes.
18581 // 3. Finally, try to vectorize all instructions with the
18582 // same/alternate ops only; this may result in some extra final
18583 // vectorization.
18584 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
18585 // Success, start over because instructions might have been changed.
18586 Changed = true;
18587 VL.swap(Candidates);
18588 Candidates.clear();
18589 for (T *V : VL) {
18590 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18591 Candidates.push_back(V);
18592 }
18593 } else {
18594 /// \Returns the minimum number of elements that we will attempt to
18595 /// vectorize.
18596 auto GetMinNumElements = [&R](Value *V) {
18597 unsigned EltSize = R.getVectorElementSize(V);
18598 return std::max(2U, R.getMaxVecRegSize() / EltSize);
18599 };
18600 if (NumElts < GetMinNumElements(*IncIt) &&
18601 (Candidates.empty() ||
18602 Candidates.front()->getType() == (*IncIt)->getType())) {
18603 for (T *V : VL) {
18604 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18605 Candidates.push_back(V);
18606 }
18607 }
18608 }
18609 // Final attempt to vectorize instructions with the same types.
18610 if (Candidates.size() > 1 &&
18611 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18612 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
18613 // Success, start over because instructions might have been changed.
18614 Changed = true;
18615 } else if (MaxVFOnly) {
18616 // Try to vectorize using small vectors.
18617 SmallVector<T *> VL;
18618 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
18619 VL.clear()) {
18620 auto *I = dyn_cast<Instruction>(*It);
18621 if (!I || R.isDeleted(I)) {
18622 ++It;
18623 continue;
18624 }
18625 auto *SameTypeIt = It;
18626 while (SameTypeIt != End &&
18627 (!isa<Instruction>(*SameTypeIt) ||
18628 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18629 AreCompatible(*SameTypeIt, *It))) {
18630 auto *I = dyn_cast<Instruction>(*SameTypeIt);
18631 ++SameTypeIt;
18632 if (I && !R.isDeleted(I))
18633 VL.push_back(cast<T>(I));
18634 }
18635 unsigned NumElts = VL.size();
18636 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
18637 /*MaxVFOnly=*/false))
18638 Changed = true;
18639 It = SameTypeIt;
18640 }
18641 }
18642 Candidates.clear();
18643 }
18644
18645 // Start over at the next instruction of a different type (or the end).
18646 IncIt = SameTypeIt;
18647 }
18648 return Changed;
18649}
18650
18651/// Compare two cmp instructions. If IsCompatibility is true, function returns
18652 /// true if 2 cmps have same/swapped predicates and compatible corresponding
18653/// operands. If IsCompatibility is false, function implements strict weak
18654/// ordering relation between two cmp instructions, returning true if the first
18655/// instruction is "less" than the second, i.e. its predicate is less than the
18656 /// predicate of the second or the operand IDs are less than the operand IDs
18657/// of the second cmp instruction.
18658template <bool IsCompatibility>
18659static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
18660 const DominatorTree &DT) {
18661 assert(isValidElementType(V->getType()) &&
18662 isValidElementType(V2->getType()) &&
18663 "Expected valid element types only.");
18664 if (V == V2)
18665 return IsCompatibility;
18666 auto *CI1 = cast<CmpInst>(V);
18667 auto *CI2 = cast<CmpInst>(V2);
18668 if (CI1->getOperand(0)->getType()->getTypeID() <
18669 CI2->getOperand(0)->getType()->getTypeID())
18670 return !IsCompatibility;
18671 if (CI1->getOperand(0)->getType()->getTypeID() >
18672 CI2->getOperand(0)->getType()->getTypeID())
18673 return false;
18674 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
18675 CI2->getOperand(0)->getType()->getScalarSizeInBits())
18676 return !IsCompatibility;
18677 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
18678 CI2->getOperand(0)->getType()->getScalarSizeInBits())
18679 return false;
18680 CmpInst::Predicate Pred1 = CI1->getPredicate();
18681 CmpInst::Predicate Pred2 = CI2->getPredicate();
18682 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
18683 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
18684 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
18685 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
18686 if (BasePred1 < BasePred2)
18687 return !IsCompatibility;
18688 if (BasePred1 > BasePred2)
18689 return false;
18690 // Compare operands.
18691 bool CI1Preds = Pred1 == BasePred1;
18692 bool CI2Preds = Pred2 == BasePred1;
18693 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
18694 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
18695 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
18696 if (Op1 == Op2)
18697 continue;
18698 if (Op1->getValueID() < Op2->getValueID())
18699 return !IsCompatibility;
18700 if (Op1->getValueID() > Op2->getValueID())
18701 return false;
18702 if (auto *I1 = dyn_cast<Instruction>(Op1))
18703 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
18704 if (IsCompatibility) {
18705 if (I1->getParent() != I2->getParent())
18706 return false;
18707 } else {
18708 // Try to compare nodes with same parent.
18709 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
18710 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
18711 if (!NodeI1)
18712 return NodeI2 != nullptr;
18713 if (!NodeI2)
18714 return false;
18715 assert((NodeI1 == NodeI2) ==
18716 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18717 "Different nodes should have different DFS numbers");
18718 if (NodeI1 != NodeI2)
18719 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18720 }
18721 InstructionsState S = getSameOpcode({I1, I2}, TLI);
18722 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18723 continue;
18724 if (IsCompatibility)
18725 return false;
18726 if (I1->getOpcode() != I2->getOpcode())
18727 return I1->getOpcode() < I2->getOpcode();
18728 }
18729 }
18730 return IsCompatibility;
18731}
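// Illustrative behavior of compareCmp (hypothetical IR): with
//   %c1 = icmp slt i32 %a, %b
//   %c2 = icmp sgt i32 %b, %a
// compareCmp<true> reports the two compares as compatible, because the
// swapped predicate of sgt is slt and the correspondingly reordered operands
// match; compareCmp<false> instead defines the strict weak order used to
// sort the candidate compares.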
18732
18733template <typename ItT>
18734bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
18735 BasicBlock *BB, BoUpSLP &R) {
18736 bool Changed = false;
18737 // Try to find reductions first.
18738 for (CmpInst *I : CmpInsts) {
18739 if (R.isDeleted(I))
18740 continue;
18741 for (Value *Op : I->operands())
18742 if (auto *RootOp = dyn_cast<Instruction>(Op))
18743 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
18744 }
18745 // Try to vectorize operands as vector bundles.
18746 for (CmpInst *I : CmpInsts) {
18747 if (R.isDeleted(I))
18748 continue;
18749 Changed |= tryToVectorize(I, R);
18750 }
18751 // Try to vectorize list of compares.
18752 // Sort by type, compare predicate, etc.
18753 auto CompareSorter = [&](Value *V, Value *V2) {
18754 if (V == V2)
18755 return false;
18756 return compareCmp<false>(V, V2, *TLI, *DT);
18757 };
18758
18759 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
18760 if (V1 == V2)
18761 return true;
18762 return compareCmp<true>(V1, V2, *TLI, *DT);
18763 };
18764
18765 SmallVector<Value *> Vals;
18766 for (Instruction *V : CmpInsts)
18767 if (!R.isDeleted(V) && isValidElementType(V->getType()))
18768 Vals.push_back(V);
18769 if (Vals.size() <= 1)
18770 return Changed;
18771 Changed |= tryToVectorizeSequence<Value>(
18772 Vals, CompareSorter, AreCompatibleCompares,
18773 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18774 // Exclude possible reductions from other blocks.
18775 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
18776 return any_of(V->users(), [V](User *U) {
18777 auto *Select = dyn_cast<SelectInst>(U);
18778 return Select &&
18779 Select->getParent() != cast<Instruction>(V)->getParent();
18780 });
18781 });
18782 if (ArePossiblyReducedInOtherBlock)
18783 return false;
18784 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18785 },
18786 /*MaxVFOnly=*/true, R);
18787 return Changed;
18788}
18789
18790bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18791 BasicBlock *BB, BoUpSLP &R) {
18792 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18793 "This function only accepts Insert instructions");
18794 bool OpsChanged = false;
18795 SmallVector<WeakTrackingVH> PostponedInsts;
18796 for (auto *I : reverse(Instructions)) {
18797 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
18798 if (R.isDeleted(I) || isa<CmpInst>(I))
18799 continue;
18800 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18801 OpsChanged |=
18802 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
18803 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18804 OpsChanged |=
18805 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
18806 }
18807 // pass2 - try to vectorize reductions only
18808 if (R.isDeleted(I))
18809 continue;
18810 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
18811 if (R.isDeleted(I) || isa<CmpInst>(I))
18812 continue;
18813 // pass3 - try to match and vectorize a buildvector sequence.
18814 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18815 OpsChanged |=
18816 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
18817 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18818 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
18819 /*MaxVFOnly=*/false);
18820 }
18821 }
18822 // Now try to vectorize postponed instructions.
18823 OpsChanged |= tryToVectorize(PostponedInsts, R);
18824
18825 Instructions.clear();
18826 return OpsChanged;
18827}
18828
18829bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
18830 bool Changed = false;
18831 SmallVector<Value *, 4> Incoming;
18832 SmallPtrSet<Value *, 16> VisitedInstrs;
18833 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
18834 // node. This allows us to better identify the chains that can be
18835 // vectorized.
18836 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
18837 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
18838 assert(isValidElementType(V1->getType()) &&
18839 isValidElementType(V2->getType()) &&
18840 "Expected vectorizable types only.");
18841 // It is fine to compare type IDs here, since we expect only vectorizable
18842 // types, like ints, floats and pointers; we don't care about other types.
18843 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
18844 return true;
18845 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
18846 return false;
18847 if (V1->getType()->getScalarSizeInBits() <
18848 V2->getType()->getScalarSizeInBits())
18849 return true;
18850 if (V1->getType()->getScalarSizeInBits() >
18851 V2->getType()->getScalarSizeInBits())
18852 return false;
18853 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18854 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18855 if (Opcodes1.size() < Opcodes2.size())
18856 return true;
18857 if (Opcodes1.size() > Opcodes2.size())
18858 return false;
18859 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18860 {
18861 // Instructions come first.
18862 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
18863 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
18864 if (I1 && I2) {
18865 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
18866 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
18867 if (!NodeI1)
18868 return NodeI2 != nullptr;
18869 if (!NodeI2)
18870 return false;
18871 assert((NodeI1 == NodeI2) ==
18872 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18873 "Different nodes should have different DFS numbers");
18874 if (NodeI1 != NodeI2)
18875 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18876 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18877 if (S.getOpcode() && !S.isAltShuffle())
18878 continue;
18879 return I1->getOpcode() < I2->getOpcode();
18880 }
18881 if (I1)
18882 return true;
18883 if (I2)
18884 return false;
18885 }
18886 {
18887 // Non-undef constants come next.
18888 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
18889 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
18890 if (C1 && C2)
18891 continue;
18892 if (C1)
18893 return true;
18894 if (C2)
18895 return false;
18896 }
18897 bool U1 = isa<UndefValue>(Opcodes1[I]);
18898 bool U2 = isa<UndefValue>(Opcodes2[I]);
18899 {
18900 // Non-constant non-instructions come next.
18901 if (!U1 && !U2) {
18902 auto ValID1 = Opcodes1[I]->getValueID();
18903 auto ValID2 = Opcodes2[I]->getValueID();
18904 if (ValID1 == ValID2)
18905 continue;
18906 if (ValID1 < ValID2)
18907 return true;
18908 if (ValID1 > ValID2)
18909 return false;
18910 }
18911 if (!U1)
18912 return true;
18913 if (!U2)
18914 return false;
18915 }
18916 // Undefs come last.
18917 assert(U1 && U2 && "The only thing left should be undef & undef.");
18918 continue;
18919 }
18920 return false;
18921 };
18922 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
18923 if (V1 == V2)
18924 return true;
18925 if (V1->getType() != V2->getType())
18926 return false;
18927 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18928 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18929 if (Opcodes1.size() != Opcodes2.size())
18930 return false;
18931 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18932 // Undefs are compatible with any other value.
18933 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
18934 continue;
18935 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
18936 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
18937 if (R.isDeleted(I1) || R.isDeleted(I2))
18938 return false;
18939 if (I1->getParent() != I2->getParent())
18940 return false;
18941 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18942 if (S.getOpcode())
18943 continue;
18944 return false;
18945 }
18946 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
18947 continue;
18948 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
18949 return false;
18950 }
18951 return true;
18952 };
18953
18954 bool HaveVectorizedPhiNodes = false;
18955 do {
18956 // Collect the incoming values from the PHIs.
18957 Incoming.clear();
18958 for (Instruction &I : *BB) {
18959 auto *P = dyn_cast<PHINode>(&I);
18960 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
18961 break;
18962
18963 // No need to analyze deleted, vectorized and non-vectorizable
18964 // instructions.
18965 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
18966 isValidElementType(P->getType()))
18967 Incoming.push_back(P);
18968 }
18969
18970 if (Incoming.size() <= 1)
18971 break;
18972
18973 // Find the corresponding non-phi nodes for better matching when trying to
18974 // build the tree.
18975 for (Value *V : Incoming) {
18976 SmallVectorImpl<Value *> &Opcodes =
18977 PHIToOpcodes.try_emplace(V).first->getSecond();
18978 if (!Opcodes.empty())
18979 continue;
18980 SmallVector<Value *, 4> Nodes(1, V);
18981 SmallPtrSet<Value *, 4> Visited;
18982 while (!Nodes.empty()) {
18983 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
18984 if (!Visited.insert(PHI).second)
18985 continue;
18986 for (Value *V : PHI->incoming_values()) {
18987 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
18988 Nodes.push_back(PHI1);
18989 continue;
18990 }
18991 Opcodes.emplace_back(V);
18992 }
18993 }
18994 }
18995
18996 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18997 Incoming, PHICompare, AreCompatiblePHIs,
18998 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18999 return tryToVectorizeList(Candidates, R, MaxVFOnly);
19000 },
19001 /*MaxVFOnly=*/true, R);
19002 Changed |= HaveVectorizedPhiNodes;
19003 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
19004 auto *PHI = dyn_cast<PHINode>(P.first);
19005 return !PHI || R.isDeleted(PHI);
19006 }))
19007 PHIToOpcodes.clear();
19008 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
19009 } while (HaveVectorizedPhiNodes);
19010
19011 VisitedInstrs.clear();
19012
19013 InstSetVector PostProcessInserts;
19014 SmallSetVector<CmpInst *, 8> PostProcessCmps;
19015 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
19016 // also vectorizes `PostProcessCmps`.
19017 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
19018 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
19019 if (VectorizeCmps) {
19020 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
19021 PostProcessCmps.clear();
19022 }
19023 PostProcessInserts.clear();
19024 return Changed;
19025 };
19026 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
19027 auto IsInPostProcessInstrs = [&](Instruction *I) {
19028 if (auto *Cmp = dyn_cast<CmpInst>(I))
19029 return PostProcessCmps.contains(Cmp);
19030 return isa<InsertElementInst, InsertValueInst>(I) &&
19031 PostProcessInserts.contains(I);
19032 };
19033 // Returns true if `I` is an instruction without users, such as a terminator,
19034 // a store, or a function call with an ignored return value. The check is
19035 // based on the instruction type, except for CallInst and InvokeInst.
19036 auto HasNoUsers = [](Instruction *I) {
19037 return I->use_empty() &&
19038 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
19039 };
19040 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
19041 // Skip instructions with scalable type. The num of elements is unknown at
19042 // compile-time for scalable type.
19043 if (isa<ScalableVectorType>(It->getType()))
19044 continue;
19045
19046 // Skip instructions marked for deletion.
19047 if (R.isDeleted(&*It))
19048 continue;
19049 // We may go through BB multiple times so skip the ones we have already checked.
19050 if (!VisitedInstrs.insert(&*It).second) {
19051 if (HasNoUsers(&*It) &&
19052 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
19053 // We would like to start over since some instructions are deleted
19054 // and the iterator may become invalid.
19055 Changed = true;
19056 It = BB->begin();
19057 E = BB->end();
19058 }
19059 continue;
19060 }
19061
19062 if (isa<DbgInfoIntrinsic>(It))
19063 continue;
19064
19065 // Try to vectorize reductions that use PHINodes.
19066 if (PHINode *P = dyn_cast<PHINode>(It)) {
19067 // Check that the PHI is a reduction PHI.
19068 if (P->getNumIncomingValues() == 2) {
19069 // Try to match and vectorize a horizontal reduction.
19070 Instruction *Root = getReductionInstr(DT, P, BB, LI);
19071 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
19072 Changed = true;
19073 It = BB->begin();
19074 E = BB->end();
19075 continue;
19076 }
19077 }
19078 // Try to vectorize the incoming values of the PHI, to catch reductions
19079 // that feed into PHIs.
19080 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
19081 // Skip if the incoming block is the current BB for now. Also, bypass
19082 // unreachable IR for efficiency and to avoid crashing.
19083 // TODO: Collect the skipped incoming values and try to vectorize them
19084 // after processing BB.
19085 if (BB == P->getIncomingBlock(I) ||
19086 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
19087 continue;
19088
19089 // Postponed instructions should not be vectorized here, delay their
19090 // vectorization.
19091 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
19092 PI && !IsInPostProcessInstrs(PI)) {
19093 bool Res = vectorizeRootInstruction(nullptr, PI,
19094 P->getIncomingBlock(I), R, TTI);
19095 Changed |= Res;
19096 if (Res && R.isDeleted(P)) {
19097 It = BB->begin();
19098 E = BB->end();
19099 break;
19100 }
19101 }
19102 }
19103 continue;
19104 }
19105
19106 if (HasNoUsers(&*It)) {
19107 bool OpsChanged = false;
19108 auto *SI = dyn_cast<StoreInst>(It);
19109 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
19110 if (SI) {
19111 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
19112 // Try to vectorize the chain feeding the store, if this is the only store
19113 // to the address in the block.
19114 // TODO: This is just a temporary solution to save compile time. Need
19115 // to investigate if we can safely turn on slp-vectorize-hor-store
19116 // instead to allow lookup for reduction chains in all non-vectorized
19117 // stores (need to check side effects and compile time).
19118 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
19119 SI->getValueOperand()->hasOneUse();
19120 }
19121 if (TryToVectorizeRoot) {
19122 for (auto *V : It->operand_values()) {
19123 // Postponed instructions should not be vectorized here, delay their
19124 // vectorization.
19125 if (auto *VI = dyn_cast<Instruction>(V);
19126 VI && !IsInPostProcessInstrs(VI))
19127 // Try to match and vectorize a horizontal reduction.
19128 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
19129 }
19130 }
19131 // Start vectorization of post-process list of instructions from the
19132 // top-tree instructions to try to vectorize as many instructions as
19133 // possible.
19134 OpsChanged |=
19135 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
19136 if (OpsChanged) {
19137 // We would like to start over since some instructions are deleted
19138 // and the iterator may become invalid.
19139 Changed = true;
19140 It = BB->begin();
19141 E = BB->end();
19142 continue;
19143 }
19144 }
19145
19146 if (isa<InsertElementInst, InsertValueInst>(It))
19147 PostProcessInserts.insert(&*It);
19148 else if (isa<CmpInst>(It))
19149 PostProcessCmps.insert(cast<CmpInst>(&*It));
19150 }
19151
19152 return Changed;
19153}
19154
19155bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
19156 auto Changed = false;
19157 for (auto &Entry : GEPs) {
19158 // If the getelementptr list has fewer than two elements, there's nothing
19159 // to do.
19160 if (Entry.second.size() < 2)
19161 continue;
19162
19163 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
19164 << Entry.second.size() << ".\n");
19165
19166 // Process the GEP list in chunks suitable for the target's supported
19167 // vector size. If a vector register can't hold 1 element, we are done. We
19168 // are trying to vectorize the index computations, so the maximum number of
19169 // elements is based on the size of the index expression, rather than the
19170 // size of the GEP itself (the target's pointer size).
19171 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
19172 return !R.isDeleted(GEP);
19173 });
19174 if (It == Entry.second.end())
19175 continue;
19176 unsigned MaxVecRegSize = R.getMaxVecRegSize();
19177 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
19178 if (MaxVecRegSize < EltSize)
19179 continue;
19180
19181 unsigned MaxElts = MaxVecRegSize / EltSize;
19182 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
19183 auto Len = std::min<unsigned>(BE - BI, MaxElts);
19184 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
19185
19186 // Initialize a set of candidate getelementptrs. Note that we use a
19187 // SetVector here to preserve program order. If the index computations
19188 // are vectorizable and begin with loads, we want to minimize the chance
19189 // of having to reorder them later.
19190 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
19191
19192 // Some of the candidates may have already been vectorized after we
19193 // initially collected them or their index was optimized to a constant value.
19194 // If so, they are marked as deleted, so remove them from the set of
19195 // candidates.
19196 Candidates.remove_if([&R](Value *I) {
19197 return R.isDeleted(cast<Instruction>(I)) ||
19198 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
19199 });
19200
19201 // Remove from the set of candidates all pairs of getelementptrs with
19202 // constant differences. Such getelementptrs are likely not good
19203 // candidates for vectorization in a bottom-up phase since one can be
19204 // computed from the other. We also ensure all candidate getelementptr
19205 // indices are unique.
19206 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
19207 auto *GEPI = GEPList[I];
19208 if (!Candidates.count(GEPI))
19209 continue;
19210 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
19211 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
19212 auto *GEPJ = GEPList[J];
19213 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
19214 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
19215 Candidates.remove(GEPI);
19216 Candidates.remove(GEPJ);
19217 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
19218 Candidates.remove(GEPJ);
19219 }
19220 }
19221 }
19222
19223 // We break out of the above computation as soon as we know there are
19224 // fewer than two candidates remaining.
19225 if (Candidates.size() < 2)
19226 continue;
19227
19228 // Add the single, non-constant index of each candidate to the bundle. We
19229 // ensured the indices met these constraints when we originally collected
19230 // the getelementptrs.
19231 SmallVector<Value *, 16> Bundle(Candidates.size());
19232 auto BundleIndex = 0u;
19233 for (auto *V : Candidates) {
19234 auto *GEP = cast<GetElementPtrInst>(V);
19235 auto *GEPIdx = GEP->idx_begin()->get();
19236 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
19237 Bundle[BundleIndex++] = GEPIdx;
19238 }
19239
19240 // Try and vectorize the indices. We are currently only interested in
19241 // gather-like cases of the form:
19242 //
19243 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
19244 //
19245 // where the loads of "a", the loads of "b", and the subtractions can be
19246 // performed in parallel. It's likely that detecting this pattern in a
19247 // bottom-up phase will be simpler and less costly than building a
19248 // full-blown top-down phase beginning at the consecutive loads.
19249 Changed |= tryToVectorizeList(Bundle, R);
19250 }
19251 }
19252 return Changed;
19253}
19254
19255bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
19256 bool Changed = false;
19257 // Sort by type, base pointers and value operands. Value operands must be
19258 // compatible (have the same opcode, same parent), otherwise it is
19259 // definitely not profitable to try to vectorize them.
19260 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
19261 if (V->getValueOperand()->getType()->getTypeID() <
19262 V2->getValueOperand()->getType()->getTypeID())
19263 return true;
19264 if (V->getValueOperand()->getType()->getTypeID() >
19265 V2->getValueOperand()->getType()->getTypeID())
19266 return false;
19267 if (V->getPointerOperandType()->getTypeID() <
19268 V2->getPointerOperandType()->getTypeID())
19269 return true;
19270 if (V->getPointerOperandType()->getTypeID() >
19271 V2->getPointerOperandType()->getTypeID())
19272 return false;
19273 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
19274 V2->getValueOperand()->getType()->getScalarSizeInBits())
19275 return true;
19276 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
19277 V2->getValueOperand()->getType()->getScalarSizeInBits())
19278 return false;
19279 // UndefValues are compatible with all other values.
19280 if (isa<UndefValue>(V->getValueOperand()) ||
19281 isa<UndefValue>(V2->getValueOperand()))
19282 return false;
19283 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
19284 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
19285 DomTreeNodeBase<BasicBlock> *NodeI1 =
19286 DT->getNode(I1->getParent());
19287 DomTreeNodeBase<BasicBlock> *NodeI2 =
19288 DT->getNode(I2->getParent());
19289 assert(NodeI1 && "Should only process reachable instructions");
19290 assert(NodeI2 && "Should only process reachable instructions");
19291 assert((NodeI1 == NodeI2) ==
19292 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
19293 "Different nodes should have different DFS numbers");
19294 if (NodeI1 != NodeI2)
19295 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
19296 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
19297 if (S.getOpcode())
19298 return false;
19299 return I1->getOpcode() < I2->getOpcode();
19300 }
19301 if (isa<Constant>(V->getValueOperand()) &&
19302 isa<Constant>(V2->getValueOperand()))
19303 return false;
19304 return V->getValueOperand()->getValueID() <
19305 V2->getValueOperand()->getValueID();
19306 };
19307
19308 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
19309 if (V1 == V2)
19310 return true;
19311 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
19312 return false;
19313 if (V1->getPointerOperandType() != V2->getPointerOperandType())
19314 return false;
19315 // Undefs are compatible with any other value.
19316 if (isa<UndefValue>(V1->getValueOperand()) ||
19317 isa<UndefValue>(V2->getValueOperand()))
19318 return true;
19319 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
19320 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
19321 if (I1->getParent() != I2->getParent())
19322 return false;
19323 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
19324 return S.getOpcode() > 0;
19325 }
19326 if (isa<Constant>(V1->getValueOperand()) &&
19327 isa<Constant>(V2->getValueOperand()))
19328 return true;
19329 return V1->getValueOperand()->getValueID() ==
19330 V2->getValueOperand()->getValueID();
19331 };
19332
19333 // Attempt to sort and vectorize each of the store-groups.
19334 DenseSet<std::pair<Value *, Value *>> Attempted;
19335 for (auto &Pair : Stores) {
19336 if (Pair.second.size() < 2)
19337 continue;
19338
19339 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
19340 << Pair.second.size() << ".\n");
19341
19342 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
19343 continue;
19344
19345 // Reverse stores to do bottom-to-top analysis. This is important if the
19346 // same addresses are stored to several times; in this case we need to
19347 // follow the store order (reversed to meet the memory dependencies).
19348 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
19349 Pair.second.rend());
19350 Changed |= tryToVectorizeSequence<StoreInst>(
19351 ReversedStores, StoreSorter, AreCompatibleStores,
19352 [&](ArrayRef<StoreInst *> Candidates, bool) {
19353 return vectorizeStores(Candidates, R, Attempted);
19354 },
19355 /*MaxVFOnly=*/false, R);
19356 }
19357 return Changed;
19358}
#define SV_NAME
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:78
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1310
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:180
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:266
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:424
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:187
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:174
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:228
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:232
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
reverse_iterator rend()
Definition: BasicBlock.h:466
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2070
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1965
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2207
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2064
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1323
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1401
unsigned arg_size() const
Definition: InstrTypes.h:1408
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2061
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:530
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:747
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1104
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:787
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:781
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:785
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:909
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:871
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:847
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2281
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:155
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1450
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1399
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:484
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:904
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:226
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:146
value_type & FindAndConstruct(const KeyT &Key)
Definition: DenseMap.h:355
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:680
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Type * getReturnType() const
Definition: DerivedTypes.h:124
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:915
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2262
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1042
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:922
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:508
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2465
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2270
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1812
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2540
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:308
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:217
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1871
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:488
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:845
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1758
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:483
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2371
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2402
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2254
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2499
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:468
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1671
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:166
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2278
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2166
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2201
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1831
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2417
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1592
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1366
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:631
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:282
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:754
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:279
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:280
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:266
An instruction for reading from memory.
Definition: Instructions.h:174
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:259
Value * getPointerOperand()
Definition: Instructions.h:253
bool isSimple() const
Definition: Instructions.h:245
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:209
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:571
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
T & front() const
front - Get the first element.
Definition: ArrayRef.h:363
iterator end() const
Definition: ArrayRef.h:357
iterator begin() const
Definition: ArrayRef.h:356
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:376
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:449
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:155
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1852
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:323
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:361
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:418
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:95
size_t size() const
Definition: SmallVector.h:92
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:587
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:718
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:951
void reserve(size_type N)
Definition: SmallVector.h:677
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:697
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:982
void resize(size_type N)
Definition: SmallVector.h:652
void push_back(const T &Elt)
Definition: SmallVector.h:427
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
An instruction for storing to memory.
Definition: Instructions.h:290
Type * getPointerOperandType() const
Definition: Instructions.h:380
Value * getValueOperand()
Definition: Instructions.h:374
Value * getPointerOperand()
Definition: Instructions.h:377
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
OperandValueKind
Additional information about an operand's possible values.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:230
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:251
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:283
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:343
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1833
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Definition: User.h:73
op_iterator op_begin()
Definition: User.h:234
Value * getOperand(unsigned i) const
Definition: User.h:169
iterator_range< value_op_iterator > operand_values()
Definition: User.h:266
The Vector Function Database.
Definition: VectorUtils.h:30
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:71
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
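The Value use-list API above (hasOneUse, users, replaceAllUsesWith, use_empty) is the machinery a transform leans on when one value is substituted for another. A minimal, self-contained sketch of how these calls compose follows; replaceAndDrop, OldI and NewV are hypothetical names, not symbols from this file.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include <cassert>
using namespace llvm;
// Hypothetical helper: rewrite every use of OldI to NewV and verify the
// use-list is empty afterwards.
static void replaceAndDrop(Instruction *OldI, Value *NewV) {
  if (OldI->hasOneUse()) {
    // A single user is the common fast path in many folds.
  }
  for (User *U : OldI->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      (void)UI; // e.g. inspect or reschedule the user here
  OldI->replaceAllUsesWith(NewV); // every use now refers to NewV
  assert(OldI->use_empty() && "use-list must be empty after RAUW");
}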
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:664
Type * getElementType() const
Definition: DerivedTypes.h:436
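VectorType::get together with the Type queries listed earlier is how a vector type is formed from a scalar element type. A short illustrative sketch; makeVecTy, V and VF are assumptions, not names from this file.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include <cassert>
using namespace llvm;
// Hypothetical helper: build a fixed-width vector of V's scalar type.
static VectorType *makeVecTy(Value *V, unsigned VF) {
  Type *ScalarTy = V->getType()->getScalarType(); // element type if V is a vector
  assert(ScalarTy->isSingleValueType() && "element must be register-legal");
  auto *VecTy = VectorType::get(ScalarTy, ElementCount::getFixed(VF));
  assert(VecTy->getElementCount().getFixedValue() == VF);
  return VecTy;
}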
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
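The score constants above rank how well two values fit into the same lane of a bundle. An illustrative fragment only: LookAheadHeuristics is local to this file, so code like this can live only inside SLPVectorizer.cpp, and TLI, DL, SE, R, V1 and V2 are assumed to be in scope from the enclosing context.
// Build a scorer for 2 lanes with a shallow recursion limit.
LookAheadHeuristics Scorer(TLI, DL, SE, R, /*NumLanes=*/2, /*MaxLevel=*/2);
int S = Scorer.getShallowScore(V1, V2, /*U1=*/nullptr, /*U2=*/nullptr,
                               /*MainAltOps=*/{});
// Higher is better, e.g. consecutive loads outscore a failed match.
if (S > LookAheadHeuristics::ScoreFail) {
  // V1 and V2 look like a good pair for the same lane position.
}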
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads by increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair which has the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
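The entries above form BoUpSLP's public surface. Below is a condensed sketch of the order in which the driver code in this file typically strings these calls together; diagnostics, remarks and several early-exit checks are omitted, and tryVectorizeBundle is a hypothetical wrapper rather than a function in this file.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/InstructionCost.h"
using namespace llvm;
// Sketch only: R is a BoUpSLP built with the analyses listed in its
// constructor; Roots is a seed bundle such as a chain of consecutive stores.
static bool tryVectorizeBundle(BoUpSLP &R, ArrayRef<Value *> Roots) {
  R.buildTree(Roots);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                       // too small to pay off
  R.reorderTopToBottom();               // pick profitable operand orders
  R.reorderBottomToTop();
  R.transformNodes();                   // target-specific node rewrites
  R.buildExternalUses();                // scalars still used outside the tree
  R.computeMinimumValueSizes();         // try to narrow integer types
  InstructionCost Cost = R.getTreeCost();
  if (Cost >= -SLPCostThreshold)        // SLPCostThreshold: -slp-threshold=<n>
    return false;
  R.vectorizeTree();                    // emit vector code for the tree
  return true;
}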
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:105
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:826
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1539
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
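The m_* entries above are PatternMatch combinators used throughout this file to recognize reduction and load-combine shapes. A small self-contained illustration, not taken from this pass; isShlOrOnce is a made-up name.
#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;
// Recognize (X << C) | Y, where the shift has a single use.
static bool isShlOrOnce(Value *V, Value *&X, Value *&Y, const APInt *&C) {
  // m_OneUse restricts the shift to a single user, a common precondition
  // before folding the pattern away.
  return match(V, m_Or(m_OneUse(m_Shl(m_Value(X), m_APInt(C))), m_Value(Y)));
}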
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
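getPointersDiff is one of the primitives behind consecutive-access checks. A hedged sketch of a pairwise check built on the declaration above; isConsecutiveLoadPair is hypothetical, and DL and SE are assumed to come from the caller's analyses.
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;
// Returns true if B loads the element immediately after A in memory.
static bool isConsecutiveLoadPair(LoadInst *A, LoadInst *B,
                                  const DataLayout &DL, ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(A->getType(), A->getPointerOperand(),
                      B->getType(), B->getPointerOperand(), DL, SE,
                      /*StrictCheck=*/true);
  // A distance of exactly one element means B follows A.
  return Diff && *Diff == 1;
}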
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:853
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1715
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:128
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:988
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:540
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
Definition: LoopUtils.cpp:1210
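createSimpleTargetReduction is what collapses a vectorized value back into a single scalar. A minimal sketch assuming an IRBuilder already positioned at the insertion point; emitAddReduction is a made-up wrapper.
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
// Reduce the lanes of VecVal with an integer add reduction.
static Value *emitAddReduction(IRBuilderBase &Builder, Value *VecVal) {
  // RecurKind::Add selects an add-based vector reduction.
  return createSimpleTargetReduction(Builder, VecVal, RecurKind::Add);
}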
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2406
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7133
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1671
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:547
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2065
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:400
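isInstructionTriviallyDead and RecursivelyDeleteTriviallyDeadInstructions (listed above) are the standard cleanup pair once an instruction loses its last use. A short sketch; eraseIfDead is a hypothetical helper, not part of this pass.
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
// Delete I (and operands that become dead as a result) if it has no users
// and no observable side effects. TLI may be null.
static void eraseIfDead(Instruction *I, const TargetLibraryInfo *TLI) {
  if (isInstructionTriviallyDead(I, TLI))
    RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
}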
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1754
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:120
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:419
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1308
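propagateIRFlags (above) and propagateMetadata (listed earlier) are typically applied to a freshly created vector instruction so it carries only the flags and metadata common to the whole scalar bundle. A hedged sketch; copyScalarInfo is a made-up name.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
// VecI replaces the scalar bundle VL; keep only what all scalars agree on.
static void copyScalarInfo(Instruction *VecI, ArrayRef<Value *> VL) {
  propagateIRFlags(VecI, VL);  // intersect wrap/fast-math flags
  propagateMetadata(VecI, VL); // intersect tbaa, alias.scope, fpmath, ...
}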
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
DWARFExpression::Operation Op
auto max_element(R &&Range)
Definition: STLExtras.h:1986
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
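Many entries in this index (all_of, any_of, count, count_if, find_if, enumerate, is_contained, ...) are STLExtras range wrappers. A generic, self-contained illustration with arbitrary data, unrelated to this pass.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
static bool demoRanges() {
  SmallVector<int, 8> Vals = {3, 1, 4, 1, 5};
  bool AllPos = all_of(Vals, [](int V) { return V > 0; });     // every element
  int Ones = count(Vals, 1);                                   // occurrences of 1
  auto It = find_if(Vals, [](int V) { return V % 2 == 0; });   // first even value
  int Sum = 0;
  for (auto [Idx, V] : enumerate(Vals))                        // index/value pairs
    Sum += Idx + V;
  return AllPos && Ones == 2 && It != Vals.end() &&
         is_contained(Vals, 5) && Sum > 0;
}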
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
InstructionCost Cost
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:593
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
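hash_combine and hash_combine_range (above) are generic building blocks for composing a single hash_code from several fields and a range. A minimal illustration; KeyTy is a made-up struct, not a type from this file.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
using namespace llvm;
struct KeyTy {
  unsigned Opcode;
  ArrayRef<int> Mask;
};
// Fold the opcode and the whole mask into one hash value.
static hash_code hashKey(const KeyTy &K) {
  return hash_combine(K.Opcode,
                      hash_combine_range(K.Mask.begin(), K.Mask.end()));
}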
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2228
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:220
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1459
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.