1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
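// Illustrative sketch (hypothetical IR, not taken from this file or its tests):
// given four consecutive scalar stores such as
//   store i32 %a, ptr %p
//   store i32 %b, ptr %p1   ; %p1 = getelementptr inbounds i32, ptr %p, i64 1
//   store i32 %c, ptr %p2   ; %p2 = getelementptr inbounds i32, ptr %p, i64 2
//   store i32 %d, ptr %p3   ; %p3 = getelementptr inbounds i32, ptr %p, i64 3
// the pass builds a tree rooted at the stores from their use-def chains and,
// if the cost model deems it profitable, replaces them with a single
// `store <4 x i32>` fed by a vector built from %a..%d.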
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<int>
117 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
118 cl::desc("Only vectorize if you gain more than this "
119 "number "));
120
122 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
123 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
124 "heuristics and makes vectorization decision via cost modeling."));
125
126static cl::opt<bool>
127ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
128 cl::desc("Attempt to vectorize horizontal reductions"));
129
131 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
132 cl::desc(
133 "Attempt to vectorize horizontal reductions feeding into a store"));
134
135// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
136// even if we match a reduction but do not vectorize in the end.
138 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
139 cl::desc("Allow optimization of original scalar identity operations on "
140 "matched horizontal reductions."));
141
142static cl::opt<int>
144 cl::desc("Attempt to vectorize for this register size in bits"));
145
148 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
149
150/// Limits the size of scheduling regions in a block.
151/// It avoids long compile times for _very_ large blocks where vector
152/// instructions are spread over a wide range.
153/// This limit is way higher than needed by real-world functions.
154static cl::opt<int>
155ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
156 cl::desc("Limit the size of the SLP scheduling region per block"));
157
159 "slp-min-reg-size", cl::init(128), cl::Hidden,
160 cl::desc("Attempt to vectorize for this register size in bits"));
161
163 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
164 cl::desc("Limit the recursion depth when building a vectorizable tree"));
165
167 "slp-min-tree-size", cl::init(3), cl::Hidden,
168 cl::desc("Only vectorize small trees if they are fully vectorizable"));
169
170// The maximum depth that the look-ahead score heuristic will explore.
171// The higher this value, the higher the compilation time overhead.
173 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
174 cl::desc("The maximum look-ahead depth for operand reordering scores"));
175
176// The maximum depth that the look-ahead score heuristic will explore
177// when it is probing among candidates for vectorization tree roots.
178// The higher this value, the higher the compilation time overhead, but unlike
179// the similar limit for operand reordering this is used less frequently, so the
180// impact of a higher value is less noticeable.
182 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
183 cl::desc("The maximum look-ahead depth for searching best rooting option"));
184
186 "slp-min-strided-loads", cl::init(2), cl::Hidden,
187 cl::desc("The minimum number of loads, which should be considered strided, "
188 "if the stride is > 1 or is runtime value"));
189
191 "slp-max-stride", cl::init(8), cl::Hidden,
192 cl::desc("The maximum stride, considered to be profitable."));
193
194static cl::opt<bool>
195 ViewSLPTree("view-slp-tree", cl::Hidden,
196 cl::desc("Display the SLP trees with Graphviz"));
197
199 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
200 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
201
202// Limit the number of alias checks. The limit is chosen so that
203// it has no negative effect on the llvm benchmarks.
204static const unsigned AliasedCheckLimit = 10;
205
206// Limit on the number of uses for potentially transformed instructions/values,
207// used in checks to avoid compile-time explosion.
208static constexpr int UsesLimit = 8;
209
210// Another limit for the alias checks: The maximum distance between load/store
211// instructions where alias checks are done.
212// This limit is useful for very large basic blocks.
213static const unsigned MaxMemDepDistance = 160;
214
215/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
216/// regions to be handled.
217static const int MinScheduleRegionSize = 16;
218
219/// Predicate for the element types that the SLP vectorizer supports.
220///
221/// The most important thing to filter here are types which are invalid in LLVM
222/// vectors. We also filter target-specific types which have absolutely no
223/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
224/// avoids spending time checking the cost model and realizing that they will
225/// be inevitably scalarized.
226static bool isValidElementType(Type *Ty) {
227 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
228 !Ty->isPPC_FP128Ty();
229}
230
231/// \returns True if the value is a constant (but not globals/constant
232/// expressions).
233static bool isConstant(Value *V) {
234 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
235}
236
237/// Checks if \p V is one of vector-like instructions, i.e. undef,
238/// insertelement/extractelement with constant indices for fixed vector type or
239/// extractvalue instruction.
240static bool isVectorLikeInstWithConstOps(Value *V) {
241 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
242 !isa<ExtractValueInst, UndefValue>(V))
243 return false;
244 auto *I = dyn_cast<Instruction>(V);
245 if (!I || isa<ExtractValueInst>(I))
246 return true;
247 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
248 return false;
249 if (isa<ExtractElementInst>(I))
250 return isConstant(I->getOperand(1));
251 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
252 return isConstant(I->getOperand(2));
253}
254
255#if !defined(NDEBUG)
256/// Print a short descriptor of the instruction bundle suitable for debug output.
257static std::string shortBundleName(ArrayRef<Value *> VL) {
258 std::string Result;
259 raw_string_ostream OS(Result);
260 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
261 OS.flush();
262 return Result;
263}
264#endif
265
266/// \returns true if all of the instructions in \p VL are in the same block or
267/// false otherwise.
268static bool allSameBlock(ArrayRef<Value *> VL) {
269 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
270 if (!I0)
271 return false;
272 if (all_of(VL, isVectorLikeInstWithConstOps))
273 return true;
274
275 BasicBlock *BB = I0->getParent();
276 for (int I = 1, E = VL.size(); I < E; I++) {
277 auto *II = dyn_cast<Instruction>(VL[I]);
278 if (!II)
279 return false;
280
281 if (BB != II->getParent())
282 return false;
283 }
284 return true;
285}
286
287/// \returns True if all of the values in \p VL are constants (but not
288/// globals/constant expressions).
289static bool allConstant(ArrayRef<Value *> VL) {
290 // Constant expressions and globals can't be vectorized like normal integer/FP
291 // constants.
292 return all_of(VL, isConstant);
293}
294
295/// \returns True if all of the values in \p VL are identical or some of them
296/// are UndefValue.
297static bool isSplat(ArrayRef<Value *> VL) {
298 Value *FirstNonUndef = nullptr;
299 for (Value *V : VL) {
300 if (isa<UndefValue>(V))
301 continue;
302 if (!FirstNonUndef) {
303 FirstNonUndef = V;
304 continue;
305 }
306 if (V != FirstNonUndef)
307 return false;
308 }
309 return FirstNonUndef != nullptr;
310}
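// Illustrative behavior, derived from the definition above (hypothetical
// values): {%x, undef, %x} -> true; {undef, undef} -> false (no non-undef
// element); {%x, %y} -> false.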
311
312/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
313static bool isCommutative(Instruction *I) {
314 if (auto *Cmp = dyn_cast<CmpInst>(I))
315 return Cmp->isCommutative();
316 if (auto *BO = dyn_cast<BinaryOperator>(I))
317 return BO->isCommutative() ||
318 (BO->getOpcode() == Instruction::Sub &&
319 !BO->hasNUsesOrMore(UsesLimit) &&
320 all_of(
321 BO->uses(),
322 [](const Use &U) {
323 // Commutative, if icmp eq/ne sub, 0
324 ICmpInst::Predicate Pred;
325 if (match(U.getUser(),
326 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
327 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
328 return true;
329 // Commutative, if abs(sub nsw, true) or abs(sub, false).
330 ConstantInt *Flag;
331 return match(U.getUser(),
332 m_Intrinsic<Intrinsic::abs>(
333 m_Specific(U.get()), m_ConstantInt(Flag))) &&
334 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
335 Flag->isOne());
336 })) ||
337 (BO->getOpcode() == Instruction::FSub &&
338 !BO->hasNUsesOrMore(UsesLimit) &&
339 all_of(BO->uses(), [](const Use &U) {
340 return match(U.getUser(),
341 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
342 }));
343 return I->isCommutative();
344}
345
346/// \returns inserting index of InsertElement or InsertValue instruction,
347/// using Offset as base offset for index.
348static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
349 unsigned Offset = 0) {
350 int Index = Offset;
351 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
352 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
353 if (!VT)
354 return std::nullopt;
355 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
356 if (!CI)
357 return std::nullopt;
358 if (CI->getValue().uge(VT->getNumElements()))
359 return std::nullopt;
360 Index *= VT->getNumElements();
361 Index += CI->getZExtValue();
362 return Index;
363 }
364
365 const auto *IV = cast<InsertValueInst>(InsertInst);
366 Type *CurrentType = IV->getType();
367 for (unsigned I : IV->indices()) {
368 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
369 Index *= ST->getNumElements();
370 CurrentType = ST->getElementType(I);
371 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
372 Index *= AT->getNumElements();
373 CurrentType = AT->getElementType();
374 } else {
375 return std::nullopt;
376 }
377 Index += I;
378 }
379 return Index;
380}
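// Worked example for the linearization above (hypothetical IR): for
//   insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0
// the returned index is ((0 * 2) + 1) * 2 + 0 == 2, and for
//   insertelement <4 x i8> %vec, i8 %s, i32 3
// with Offset == 0 the result is 3.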
381
382namespace {
383/// Specifies the way the mask should be analyzed for undefs/poisonous elements
384/// in the shuffle mask.
385enum class UseMask {
386 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
387 ///< check for the mask elements for the first argument (mask
388 ///< indices are in range [0:VF)).
389 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
390 ///< for the mask elements for the second argument (mask indices
391 ///< are in range [VF:2*VF))
392 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
393 ///< future shuffle elements and mark them as ones as being used
394 ///< in future. Non-undef elements are considered as unused since
395 ///< they're already marked as used in the mask.
396};
397} // namespace
398
399/// Prepares a use bitset for the given mask either for the first argument or
400/// for the second.
401static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
402 UseMask MaskArg) {
403 SmallBitVector UseMask(VF, true);
404 for (auto [Idx, Value] : enumerate(Mask)) {
405 if (Value == PoisonMaskElem) {
406 if (MaskArg == UseMask::UndefsAsMask)
407 UseMask.reset(Idx);
408 continue;
409 }
410 if (MaskArg == UseMask::FirstArg && Value < VF)
411 UseMask.reset(Value);
412 else if (MaskArg == UseMask::SecondArg && Value >= VF)
413 UseMask.reset(Value - VF);
414 }
415 return UseMask;
416}
417
418/// Checks if the given value is actually an undefined constant vector.
419/// Also, if the \p UseMask is not empty, tries to check if the non-masked
420/// elements actually mask the insertelement buildvector, if any.
421template <bool IsPoisonOnly = false>
422static SmallBitVector isUndefVector(const Value *V,
423 const SmallBitVector &UseMask = {}) {
424 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
425 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
426 if (isa<T>(V))
427 return Res;
428 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
429 if (!VecTy)
430 return Res.reset();
431 auto *C = dyn_cast<Constant>(V);
432 if (!C) {
433 if (!UseMask.empty()) {
434 const Value *Base = V;
435 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
436 Base = II->getOperand(0);
437 if (isa<T>(II->getOperand(1)))
438 continue;
439 std::optional<unsigned> Idx = getInsertIndex(II);
440 if (!Idx) {
441 Res.reset();
442 return Res;
443 }
444 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
445 Res.reset(*Idx);
446 }
447 // TODO: Add analysis for shuffles here too.
448 if (V == Base) {
449 Res.reset();
450 } else {
451 SmallBitVector SubMask(UseMask.size(), false);
452 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
453 }
454 } else {
455 Res.reset();
456 }
457 return Res;
458 }
459 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
460 if (Constant *Elem = C->getAggregateElement(I))
461 if (!isa<T>(Elem) &&
462 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
463 Res.reset(I);
464 }
465 return Res;
466}
467
468/// Checks if the vector of instructions can be represented as a shuffle, like:
469/// %x0 = extractelement <4 x i8> %x, i32 0
470/// %x3 = extractelement <4 x i8> %x, i32 3
471/// %y1 = extractelement <4 x i8> %y, i32 1
472/// %y2 = extractelement <4 x i8> %y, i32 2
473/// %x0x0 = mul i8 %x0, %x0
474/// %x3x3 = mul i8 %x3, %x3
475/// %y1y1 = mul i8 %y1, %y1
476/// %y2y2 = mul i8 %y2, %y2
477/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
478/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
479/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
480/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
481/// ret <4 x i8> %ins4
482/// can be transformed into:
483/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
484/// i32 6>
485/// %2 = mul <4 x i8> %1, %1
486/// ret <4 x i8> %2
487/// Mask will return the Shuffle Mask equivalent to the extracted elements.
488/// TODO: Can we split off and reuse the shuffle mask detection from
489/// ShuffleVectorInst/getShuffleCost?
490static std::optional<TargetTransformInfo::ShuffleKind>
491isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
492 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
493 if (It == VL.end())
494 return std::nullopt;
495 auto *EI0 = cast<ExtractElementInst>(*It);
496 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
497 return std::nullopt;
498 unsigned Size =
499 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
500 Value *Vec1 = nullptr;
501 Value *Vec2 = nullptr;
502 enum ShuffleMode { Unknown, Select, Permute };
503 ShuffleMode CommonShuffleMode = Unknown;
504 Mask.assign(VL.size(), PoisonMaskElem);
505 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
506 // Undef can be represented as an undef element in a vector.
507 if (isa<UndefValue>(VL[I]))
508 continue;
509 auto *EI = cast<ExtractElementInst>(VL[I]);
510 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
511 return std::nullopt;
512 auto *Vec = EI->getVectorOperand();
513 // We can extractelement from undef or poison vector.
514 if (isUndefVector(Vec).all())
515 continue;
516 // All vector operands must have the same number of vector elements.
517 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
518 return std::nullopt;
519 if (isa<UndefValue>(EI->getIndexOperand()))
520 continue;
521 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
522 if (!Idx)
523 return std::nullopt;
524 // Undefined behavior if Idx is negative or >= Size.
525 if (Idx->getValue().uge(Size))
526 continue;
527 unsigned IntIdx = Idx->getValue().getZExtValue();
528 Mask[I] = IntIdx;
529 // For correct shuffling we have to have at most 2 different vector operands
530 // in all extractelement instructions.
531 if (!Vec1 || Vec1 == Vec) {
532 Vec1 = Vec;
533 } else if (!Vec2 || Vec2 == Vec) {
534 Vec2 = Vec;
535 Mask[I] += Size;
536 } else {
537 return std::nullopt;
538 }
539 if (CommonShuffleMode == Permute)
540 continue;
541 // If the extract index is not the same as the operation number, it is a
542 // permutation.
543 if (IntIdx != I) {
544 CommonShuffleMode = Permute;
545 continue;
546 }
547 CommonShuffleMode = Select;
548 }
549 // If we're not crossing lanes in different vectors, consider it as blending.
550 if (CommonShuffleMode == Select && Vec2)
551 return TargetTransformInfo::SK_Select;
552 // If Vec2 was never used, we have a permutation of a single vector, otherwise
553 // we have permutation of 2 vectors.
554 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
555 : TargetTransformInfo::SK_PermuteSingleSrc;
556}
557
558/// \returns True if Extract{Value,Element} instruction extracts element Idx.
559static std::optional<unsigned> getExtractIndex(Instruction *E) {
560 unsigned Opcode = E->getOpcode();
561 assert((Opcode == Instruction::ExtractElement ||
562 Opcode == Instruction::ExtractValue) &&
563 "Expected extractelement or extractvalue instruction.");
564 if (Opcode == Instruction::ExtractElement) {
565 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
566 if (!CI)
567 return std::nullopt;
568 return CI->getZExtValue();
569 }
570 auto *EI = cast<ExtractValueInst>(E);
571 if (EI->getNumIndices() != 1)
572 return std::nullopt;
573 return *EI->idx_begin();
574}
575
576namespace {
577
578/// Main data required for vectorization of instructions.
579struct InstructionsState {
580 /// The very first instruction in the list with the main opcode.
581 Value *OpValue = nullptr;
582
583 /// The main/alternate instruction.
584 Instruction *MainOp = nullptr;
585 Instruction *AltOp = nullptr;
586
587 /// The main/alternate opcodes for the list of instructions.
588 unsigned getOpcode() const {
589 return MainOp ? MainOp->getOpcode() : 0;
590 }
591
592 unsigned getAltOpcode() const {
593 return AltOp ? AltOp->getOpcode() : 0;
594 }
595
596 /// Some of the instructions in the list have alternate opcodes.
597 bool isAltShuffle() const { return AltOp != MainOp; }
598
599 bool isOpcodeOrAlt(Instruction *I) const {
600 unsigned CheckedOpcode = I->getOpcode();
601 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
602 }
603
604 InstructionsState() = delete;
605 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
606 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
607};
608
609} // end anonymous namespace
610
611/// Chooses the correct key for scheduling data. If \p Op has the same (or
612/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
613/// OpValue.
614static Value *isOneOf(const InstructionsState &S, Value *Op) {
615 auto *I = dyn_cast<Instruction>(Op);
616 if (I && S.isOpcodeOrAlt(I))
617 return Op;
618 return S.OpValue;
619}
620
621/// \returns true if \p Opcode is allowed as part of the main/alternate
622/// instruction for SLP vectorization.
623///
624/// Example of unsupported opcode is SDIV that can potentially cause UB if the
625/// "shuffled out" lane would result in division by zero.
626static bool isValidForAlternation(unsigned Opcode) {
627 if (Instruction::isIntDivRem(Opcode))
628 return false;
629
630 return true;
631}
632
633static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
634 const TargetLibraryInfo &TLI,
635 unsigned BaseIndex = 0);
636
637/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
638/// compatible instructions or constants, or just some other regular values.
639static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
640 Value *Op1, const TargetLibraryInfo &TLI) {
641 return (isConstant(BaseOp0) && isConstant(Op0)) ||
642 (isConstant(BaseOp1) && isConstant(Op1)) ||
643 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
644 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
645 BaseOp0 == Op0 || BaseOp1 == Op1 ||
646 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
647 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
648}
649
650/// \returns true if a compare instruction \p CI has similar "look" and
651/// same predicate as \p BaseCI, "as is" or with its operands and predicate
652/// swapped, false otherwise.
653static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
654 const TargetLibraryInfo &TLI) {
655 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
656 "Assessing comparisons of different types?");
657 CmpInst::Predicate BasePred = BaseCI->getPredicate();
658 CmpInst::Predicate Pred = CI->getPredicate();
659 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
660
661 Value *BaseOp0 = BaseCI->getOperand(0);
662 Value *BaseOp1 = BaseCI->getOperand(1);
663 Value *Op0 = CI->getOperand(0);
664 Value *Op1 = CI->getOperand(1);
665
666 return (BasePred == Pred &&
667 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
668 (BasePred == SwappedPred &&
669 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
670}
671
672/// \returns analysis of the Instructions in \p VL described in
673/// InstructionsState, the Opcode that we suppose the whole list
674/// could be vectorized even if its structure is diverse.
675static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
676 const TargetLibraryInfo &TLI,
677 unsigned BaseIndex) {
678 // Make sure these are all Instructions.
679 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
680 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
681
682 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
683 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
684 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
685 CmpInst::Predicate BasePred =
686 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
687 : CmpInst::BAD_ICMP_PREDICATE;
688 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
689 unsigned AltOpcode = Opcode;
690 unsigned AltIndex = BaseIndex;
691
692 bool SwappedPredsCompatible = [&]() {
693 if (!IsCmpOp)
694 return false;
695 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
696 UniquePreds.insert(BasePred);
697 UniqueNonSwappedPreds.insert(BasePred);
698 for (Value *V : VL) {
699 auto *I = dyn_cast<CmpInst>(V);
700 if (!I)
701 return false;
702 CmpInst::Predicate CurrentPred = I->getPredicate();
703 CmpInst::Predicate SwappedCurrentPred =
704 CmpInst::getSwappedPredicate(CurrentPred);
705 UniqueNonSwappedPreds.insert(CurrentPred);
706 if (!UniquePreds.contains(CurrentPred) &&
707 !UniquePreds.contains(SwappedCurrentPred))
708 UniquePreds.insert(CurrentPred);
709 }
710 // If the total number of predicates is > 2, but only 2 remain once swapped
711 // predicates are treated as compatible, consider swappable predicates as
712 // compatible opcodes rather than as alternates.
713 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
714 }();
715 // Check for one alternate opcode from another BinaryOperator.
716 // TODO - generalize to support all operators (types, calls etc.).
717 auto *IBase = cast<Instruction>(VL[BaseIndex]);
718 Intrinsic::ID BaseID = 0;
719 SmallVector<VFInfo> BaseMappings;
720 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
721 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
722 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
723 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
724 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
725 }
726 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
727 auto *I = cast<Instruction>(VL[Cnt]);
728 unsigned InstOpcode = I->getOpcode();
729 if (IsBinOp && isa<BinaryOperator>(I)) {
730 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
731 continue;
732 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
733 isValidForAlternation(Opcode)) {
734 AltOpcode = InstOpcode;
735 AltIndex = Cnt;
736 continue;
737 }
738 } else if (IsCastOp && isa<CastInst>(I)) {
739 Value *Op0 = IBase->getOperand(0);
740 Type *Ty0 = Op0->getType();
741 Value *Op1 = I->getOperand(0);
742 Type *Ty1 = Op1->getType();
743 if (Ty0 == Ty1) {
744 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
745 continue;
746 if (Opcode == AltOpcode) {
747 assert(isValidForAlternation(Opcode) &&
748 isValidForAlternation(InstOpcode) &&
749 "Cast isn't safe for alternation, logic needs to be updated!");
750 AltOpcode = InstOpcode;
751 AltIndex = Cnt;
752 continue;
753 }
754 }
755 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
756 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
757 Type *Ty0 = BaseInst->getOperand(0)->getType();
758 Type *Ty1 = Inst->getOperand(0)->getType();
759 if (Ty0 == Ty1) {
760 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
761 // Check for compatible operands. If the corresponding operands are not
762 // compatible - need to perform alternate vectorization.
763 CmpInst::Predicate CurrentPred = Inst->getPredicate();
764 CmpInst::Predicate SwappedCurrentPred =
765 CmpInst::getSwappedPredicate(CurrentPred);
766
767 if ((E == 2 || SwappedPredsCompatible) &&
768 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
769 continue;
770
771 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
772 continue;
773 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
774 if (AltIndex != BaseIndex) {
775 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
776 continue;
777 } else if (BasePred != CurrentPred) {
778 assert(
779 isValidForAlternation(InstOpcode) &&
780 "CmpInst isn't safe for alternation, logic needs to be updated!");
781 AltIndex = Cnt;
782 continue;
783 }
784 CmpInst::Predicate AltPred = AltInst->getPredicate();
785 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
786 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
787 continue;
788 }
789 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
790 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
791 if (Gep->getNumOperands() != 2 ||
792 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
793 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
794 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
795 if (!isVectorLikeInstWithConstOps(EI))
796 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
797 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
798 auto *BaseLI = cast<LoadInst>(IBase);
799 if (!LI->isSimple() || !BaseLI->isSimple())
800 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
801 } else if (auto *Call = dyn_cast<CallInst>(I)) {
802 auto *CallBase = cast<CallInst>(IBase);
803 if (Call->getCalledFunction() != CallBase->getCalledFunction())
804 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
805 if (Call->hasOperandBundles() &&
806 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
807 Call->op_begin() + Call->getBundleOperandsEndIndex(),
808 CallBase->op_begin() +
809 CallBase->getBundleOperandsStartIndex()))
810 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
811 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
812 if (ID != BaseID)
813 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
814 if (!ID) {
815 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
816 if (Mappings.size() != BaseMappings.size() ||
817 Mappings.front().ISA != BaseMappings.front().ISA ||
818 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
819 Mappings.front().VectorName != BaseMappings.front().VectorName ||
820 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
821 Mappings.front().Shape.Parameters !=
822 BaseMappings.front().Shape.Parameters)
823 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
824 }
825 }
826 continue;
827 }
828 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
829 }
830
831 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
832 cast<Instruction>(VL[AltIndex]));
833}
834
835/// \returns true if all of the values in \p VL have the same type or false
836/// otherwise.
837static bool allSameType(ArrayRef<Value *> VL) {
838 Type *Ty = VL.front()->getType();
839 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
840}
841
842/// \returns True if in-tree use also needs extract. This refers to
843/// possible scalar operand in vectorized instruction.
844static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
845 TargetLibraryInfo *TLI) {
846 unsigned Opcode = UserInst->getOpcode();
847 switch (Opcode) {
848 case Instruction::Load: {
849 LoadInst *LI = cast<LoadInst>(UserInst);
850 return (LI->getPointerOperand() == Scalar);
851 }
852 case Instruction::Store: {
853 StoreInst *SI = cast<StoreInst>(UserInst);
854 return (SI->getPointerOperand() == Scalar);
855 }
856 case Instruction::Call: {
857 CallInst *CI = cast<CallInst>(UserInst);
858 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
859 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
860 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
861 Arg.value().get() == Scalar;
862 });
863 }
864 default:
865 return false;
866 }
867}
868
869/// \returns the AA location that is being accessed by the instruction.
870static MemoryLocation getLocation(Instruction *I) {
871 if (StoreInst *SI = dyn_cast<StoreInst>(I))
872 return MemoryLocation::get(SI);
873 if (LoadInst *LI = dyn_cast<LoadInst>(I))
874 return MemoryLocation::get(LI);
875 return MemoryLocation();
876}
877
878/// \returns True if the instruction is not a volatile or atomic load/store.
879static bool isSimple(Instruction *I) {
880 if (LoadInst *LI = dyn_cast<LoadInst>(I))
881 return LI->isSimple();
882 if (StoreInst *SI = dyn_cast<StoreInst>(I))
883 return SI->isSimple();
884 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
885 return !MI->isVolatile();
886 return true;
887}
888
889/// Shuffles \p Mask in accordance with the given \p SubMask.
890/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
891/// one but two input vectors.
892static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
893 bool ExtendingManyInputs = false) {
894 if (SubMask.empty())
895 return;
896 assert(
897 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
898 // Check if input scalars were extended to match the size of other node.
899 (SubMask.size() == Mask.size() &&
900 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
901 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
902 "SubMask with many inputs support must be larger than the mask.");
903 if (Mask.empty()) {
904 Mask.append(SubMask.begin(), SubMask.end());
905 return;
906 }
907 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
908 int TermValue = std::min(Mask.size(), SubMask.size());
909 for (int I = 0, E = SubMask.size(); I < E; ++I) {
910 if (SubMask[I] == PoisonMaskElem ||
911 (!ExtendingManyInputs &&
912 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
913 continue;
914 NewMask[I] = Mask[SubMask[I]];
915 }
916 Mask.swap(NewMask);
917}
918
919/// Order may have elements assigned the special value (size), which is out of
920/// bounds. Such indices only appear at positions which correspond to undef
921/// values (see canReuseExtract for details) and are used to prevent undef
922/// values from affecting operand ordering.
923/// The first loop below simply finds all unused indices and then the next loop
924/// nest assigns these indices to the undef-value positions.
925/// As an example below Order has two undef positions and they have assigned
926/// values 3 and 7 respectively:
927/// before: 6 9 5 4 9 2 1 0
928/// after: 6 3 5 4 7 2 1 0
929static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
930 const unsigned Sz = Order.size();
931 SmallBitVector UnusedIndices(Sz, /*t=*/true);
932 SmallBitVector MaskedIndices(Sz);
933 for (unsigned I = 0; I < Sz; ++I) {
934 if (Order[I] < Sz)
935 UnusedIndices.reset(Order[I]);
936 else
937 MaskedIndices.set(I);
938 }
939 if (MaskedIndices.none())
940 return;
941 assert(UnusedIndices.count() == MaskedIndices.count() &&
942 "Non-synced masked/available indices.");
943 int Idx = UnusedIndices.find_first();
944 int MIdx = MaskedIndices.find_first();
945 while (MIdx >= 0) {
946 assert(Idx >= 0 && "Indices must be synced.");
947 Order[MIdx] = Idx;
948 Idx = UnusedIndices.find_next(Idx);
949 MIdx = MaskedIndices.find_next(MIdx);
950 }
951}
952
953namespace llvm {
954
955static void inversePermutation(ArrayRef<unsigned> Indices,
956 SmallVectorImpl<int> &Mask) {
957 Mask.clear();
958 const unsigned E = Indices.size();
959 Mask.resize(E, PoisonMaskElem);
960 for (unsigned I = 0; I < E; ++I)
961 Mask[Indices[I]] = I;
962}
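// Small example (hypothetical input): Indices = {2, 0, 1} produces
// Mask = {1, 2, 0}, i.e. Mask[Indices[I]] == I for every I.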
963
964/// Reorders the list of scalars in accordance with the given \p Mask.
965static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
966 ArrayRef<int> Mask) {
967 assert(!Mask.empty() && "Expected non-empty mask.");
968 SmallVector<Value *> Prev(Scalars.size(),
969 UndefValue::get(Scalars.front()->getType()));
970 Prev.swap(Scalars);
971 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
972 if (Mask[I] != PoisonMaskElem)
973 Scalars[Mask[I]] = Prev[I];
974}
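// Small example (hypothetical input): Scalars = {a, b, c} with Mask = {2, 0, 1}
// becomes {b, c, a}, since each element Prev[I] is moved to position Mask[I].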
975
976/// Checks if the provided value does not require scheduling. It does not
977/// require scheduling if this is not an instruction or it is an instruction
978/// that does not read/write memory and all operands are either not instructions
979/// or phi nodes or instructions from different blocks.
980static bool areAllOperandsNonInsts(Value *V) {
981 auto *I = dyn_cast<Instruction>(V);
982 if (!I)
983 return true;
984 return !mayHaveNonDefUseDependency(*I) &&
985 all_of(I->operands(), [I](Value *V) {
986 auto *IO = dyn_cast<Instruction>(V);
987 if (!IO)
988 return true;
989 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
990 });
991}
992
993/// Checks if the provided value does not require scheduling. It does not
994/// require scheduling if this is not an instruction or it is an instruction
995/// that does not read/write memory and all users are phi nodes or instructions
996/// from the different blocks.
997static bool isUsedOutsideBlock(Value *V) {
998 auto *I = dyn_cast<Instruction>(V);
999 if (!I)
1000 return true;
1001 // Limits the number of uses to save compile time.
1002 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1003 all_of(I->users(), [I](User *U) {
1004 auto *IU = dyn_cast<Instruction>(U);
1005 if (!IU)
1006 return true;
1007 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1008 });
1009}
1010
1011/// Checks if the specified value does not require scheduling. It does not
1012/// require scheduling if all operands and all users do not need to be scheduled
1013/// in the current basic block.
1014static bool doesNotNeedToBeScheduled(Value *V) {
1015 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1016}
1017
1018/// Checks if the specified array of instructions does not require scheduling.
1019/// It is so if all either instructions have operands that do not require
1020/// scheduling or their users do not require scheduling since they are phis or
1021/// in other basic blocks.
1022static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1023 return !VL.empty() &&
1024 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1025}
1026
1027namespace slpvectorizer {
1028
1029/// Bottom Up SLP Vectorizer.
1030class BoUpSLP {
1031 struct TreeEntry;
1032 struct ScheduleData;
1033 class ShuffleCostEstimator;
1034 class ShuffleInstructionBuilder;
1035
1036public:
1037 /// Tracks the state we can represent the loads in the given sequence.
1038 enum class LoadsState {
1039 Gather,
1040 Vectorize,
1043 };
1044
1044
1045 using ValueList = SmallVector<Value *, 8>;
1046 using InstrList = SmallVector<Instruction *, 16>;
1047 using ValueSet = SmallPtrSet<Value *, 16>;
1048 using StoreList = SmallVector<StoreInst *, 8>;
1049 using ExtraValueToDebugLocsMap =
1050 MapVector<Value *, SmallVector<Instruction *, 2>>;
1051 using OrdersType = SmallVector<unsigned, 4>;
1052
1053 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1054 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1055 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1056 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1057 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1058 AC(AC), DB(DB), DL(DL), ORE(ORE),
1059 Builder(Se->getContext(), TargetFolder(*DL)) {
1060 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1061 // Use the vector register size specified by the target unless overridden
1062 // by a command-line option.
1063 // TODO: It would be better to limit the vectorization factor based on
1064 // data type rather than just register size. For example, x86 AVX has
1065 // 256-bit registers, but it does not support integer operations
1066 // at that width (that requires AVX2).
1067 if (MaxVectorRegSizeOption.getNumOccurrences())
1068 MaxVecRegSize = MaxVectorRegSizeOption;
1069 else
1070 MaxVecRegSize =
1071 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1072 .getFixedValue();
1073
1074 if (MinVectorRegSizeOption.getNumOccurrences())
1075 MinVecRegSize = MinVectorRegSizeOption;
1076 else
1077 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1078 }
1079
1080 /// Vectorize the tree that starts with the elements in \p VL.
1081 /// Returns the vectorized root.
1082 Value *vectorizeTree();
1083
1084 /// Vectorize the tree but with the list of externally used values \p
1085 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1086 /// generated extractelement instructions.
1087 /// \param ReplacedExternals contains the list of replaced external values
1088 /// {scalar, replace} after emitting extractelement for external uses.
1089 Value *
1090 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1091 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1092 Instruction *ReductionRoot = nullptr);
1093
1094 /// \returns the cost incurred by unwanted spills and fills, caused by
1095 /// holding live values over call sites.
1096 InstructionCost getSpillCost() const;
1097
1098 /// \returns the vectorization cost of the subtree that starts at \p VL.
1099 /// A negative number means that this is profitable.
1100 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1101
1102 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1103 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1104 void buildTree(ArrayRef<Value *> Roots,
1105 const SmallDenseSet<Value *> &UserIgnoreLst);
1106
1107 /// Construct a vectorizable tree that starts at \p Roots.
1108 void buildTree(ArrayRef<Value *> Roots);
1109
1110 /// Returns whether the root node has in-tree uses.
1111 bool doesRootHaveInTreeUses() const {
1112 return !VectorizableTree.empty() &&
1113 !VectorizableTree.front()->UserTreeIndices.empty();
1114 }
1115
1116 /// Return the scalars of the root node.
1117 ArrayRef<Value *> getRootNodeScalars() const {
1118 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1119 return VectorizableTree.front()->Scalars;
1120 }
1121
1122 /// Builds external uses of the vectorized scalars, i.e. the list of
1123 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1124 /// ExternallyUsedValues contains additional list of external uses to handle
1125 /// vectorization of reductions.
1126 void
1127 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1128
1129 /// Transforms graph nodes to target specific representations, if profitable.
1130 void transformNodes();
1131
1132 /// Clear the internal data structures that are created by 'buildTree'.
1133 void deleteTree() {
1134 VectorizableTree.clear();
1135 ScalarToTreeEntry.clear();
1136 MultiNodeScalars.clear();
1137 MustGather.clear();
1138 NonScheduledFirst.clear();
1139 EntryToLastInstruction.clear();
1140 ExternalUses.clear();
1141 ExternalUsesAsGEPs.clear();
1142 for (auto &Iter : BlocksSchedules) {
1143 BlockScheduling *BS = Iter.second.get();
1144 BS->clear();
1145 }
1146 MinBWs.clear();
1147 ReductionBitWidth = 0;
1148 CastMaxMinBWSizes.reset();
1149 ExtraBitWidthNodes.clear();
1150 InstrElementSize.clear();
1151 UserIgnoreList = nullptr;
1152 PostponedGathers.clear();
1153 ValueToGatherNodes.clear();
1154 }
1155
1156 unsigned getTreeSize() const { return VectorizableTree.size(); }
1157
1158 /// Perform LICM and CSE on the newly generated gather sequences.
1159 void optimizeGatherSequence();
1160
1161 /// Checks if the specified gather tree entry \p TE can be represented as a
1162 /// shuffled vector entry + (possibly) permutation with other gathers. It
1163 /// implements the checks only for possibly ordered scalars (Loads,
1164 /// ExtractElement, ExtractValue), which can be part of the graph.
1165 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1166
1167 /// Sort loads into increasing pointers offsets to allow greater clustering.
1168 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1169
1170 /// Gets reordering data for the given tree entry. If the entry is vectorized
1171 /// - just return ReorderIndices, otherwise check if the scalars can be
1172 /// reordered and return the most optimal order.
1173 /// \return std::nullopt if ordering is not important, empty order, if
1174 /// identity order is important, or the actual order.
1175 /// \param TopToBottom If true, include the order of vectorized stores and
1176 /// insertelement nodes, otherwise skip them.
1177 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1178 bool TopToBottom);
1179
1180 /// Reorders the current graph to the most profitable order starting from the
1181 /// root node to the leaf nodes. The best order is chosen only from the nodes
1182 /// of the same size (vectorization factor). Smaller nodes are considered
1183 /// parts of subgraph with smaller VF and they are reordered independently. We
1184 /// can make it because we still need to extend smaller nodes to the wider VF
1185 /// and we can merge reordering shuffles with the widening shuffles.
1186 void reorderTopToBottom();
1187
1188 /// Reorders the current graph to the most profitable order starting from
1189 /// leaves to the root. It allows rotating small subgraphs and reducing the
1190 /// number of reshuffles if the leaf nodes use the same order. In this case we
1191 /// can merge the orders and just shuffle the user node instead of shuffling its
1192 /// operands. Plus, even if the leaf nodes have different orders, it allows
1193 /// sinking reordering in the graph closer to the root node and merging it later
1194 /// during analysis.
1195 void reorderBottomToTop(bool IgnoreReorder = false);
1196
1197 /// \return The vector element size in bits to use when vectorizing the
1198 /// expression tree ending at \p V. If V is a store, the size is the width of
1199 /// the stored value. Otherwise, the size is the width of the largest loaded
1200 /// value reaching V. This method is used by the vectorizer to calculate
1201 /// vectorization factors.
1202 unsigned getVectorElementSize(Value *V);
1203
1204 /// Compute the minimum type sizes required to represent the entries in a
1205 /// vectorizable tree.
1206 void computeMinimumValueSizes();
1207
1208 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1209 unsigned getMaxVecRegSize() const {
1210 return MaxVecRegSize;
1211 }
1212
1213 // \returns minimum vector register size as set by cl::opt.
1214 unsigned getMinVecRegSize() const {
1215 return MinVecRegSize;
1216 }
1217
1218 unsigned getMinVF(unsigned Sz) const {
1219 return std::max(2U, getMinVecRegSize() / Sz);
1220 }
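 // Example (assuming the default slp-min-reg-size of 128): for 32-bit
 // elements, getMinVF(32) returns std::max(2u, 128 / 32) == 4.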
1221
1222 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1223 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1224 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1225 return MaxVF ? MaxVF : UINT_MAX;
1226 }
1227
1228 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1229 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1230 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1231 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1232 ///
1233 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1234 unsigned canMapToVector(Type *T) const;
1235
1236 /// \returns True if the VectorizableTree is both tiny and not fully
1237 /// vectorizable. We do not vectorize such trees.
1238 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1239
1240 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1241 /// can be load combined in the backend. Load combining may not be allowed in
1242 /// the IR optimizer, so we do not want to alter the pattern. For example,
1243 /// partially transforming a scalar bswap() pattern into vector code is
1244 /// effectively impossible for the backend to undo.
1245 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1246 /// may not be necessary.
1247 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1248
1249 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1250 /// can be load combined in the backend. Load combining may not be allowed in
1251 /// the IR optimizer, so we do not want to alter the pattern. For example,
1252 /// partially transforming a scalar bswap() pattern into vector code is
1253 /// effectively impossible for the backend to undo.
1254 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1255 /// may not be necessary.
1256 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1257
1258 /// Checks if the given array of loads can be represented as a vectorized,
1259 /// scatter or just simple gather.
1260 /// \param VL list of loads.
1261 /// \param VL0 main load value.
1262 /// \param Order returned order of load instructions.
1263 /// \param PointerOps returned list of pointer operands.
1264 /// \param TryRecursiveCheck used to check if long masked gather can be
1265 /// represented as a series of loads/insert subvector, if profitable.
1266 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1267 SmallVectorImpl<unsigned> &Order,
1268 SmallVectorImpl<Value *> &PointerOps,
1269 bool TryRecursiveCheck = true) const;
1270
1270
1271 OptimizationRemarkEmitter *getORE() { return ORE; }
1272
1273 /// This structure holds any data we need about the edges being traversed
1274 /// during buildTree_rec(). We keep track of:
1275 /// (i) the user TreeEntry index, and
1276 /// (ii) the index of the edge.
1277 struct EdgeInfo {
1278 EdgeInfo() = default;
1279 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1280 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1281 /// The user TreeEntry.
1282 TreeEntry *UserTE = nullptr;
1283 /// The operand index of the use.
1284 unsigned EdgeIdx = UINT_MAX;
1285#ifndef NDEBUG
1286 friend inline raw_ostream &operator<<(raw_ostream &OS,
1287 const BoUpSLP::EdgeInfo &EI) {
1288 EI.dump(OS);
1289 return OS;
1290 }
1291 /// Debug print.
1292 void dump(raw_ostream &OS) const {
1293 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1294 << " EdgeIdx:" << EdgeIdx << "}";
1295 }
1296 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1297#endif
1298 bool operator == (const EdgeInfo &Other) const {
1299 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1300 }
1301 };
1302
1303 /// A helper class used for scoring candidates for two consecutive lanes.
1304 class LookAheadHeuristics {
1305 const TargetLibraryInfo &TLI;
1306 const DataLayout &DL;
1307 ScalarEvolution &SE;
1308 const BoUpSLP &R;
1309 int NumLanes; // Total number of lanes (aka vectorization factor).
1310 int MaxLevel; // The maximum recursion depth for accumulating score.
1311
1312 public:
1313 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1314 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1315 int MaxLevel)
1316 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1317 MaxLevel(MaxLevel) {}
1318
1319 // The hard-coded scores listed here are not very important, though it shall
1320 // be higher for better matches to improve the resulting cost. When
1321 // computing the scores of matching one sub-tree with another, we are
1322 // basically counting the number of values that are matching. So even if all
1323 // scores are set to 1, we would still get a decent matching result.
1324 // However, sometimes we have to break ties. For example we may have to
1325 // choose between matching loads vs matching opcodes. This is what these
1326 // scores are helping us with: they provide the order of preference. Also,
1327 // this is important if the scalar is externally used or used in another
1328 // tree entry node in the different lane.
1329
1330 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1331 static const int ScoreConsecutiveLoads = 4;
1332 /// The same load multiple times. This should have a better score than
1333 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1334 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1335 /// a vector load and 1.0 for a broadcast.
1336 static const int ScoreSplatLoads = 3;
1337 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1338 static const int ScoreReversedLoads = 3;
1339 /// A load candidate for masked gather.
1340 static const int ScoreMaskedGatherCandidate = 1;
1341 /// ExtractElementInst from same vector and consecutive indexes.
1342 static const int ScoreConsecutiveExtracts = 4;
1343 /// ExtractElementInst from same vector and reversed indices.
1344 static const int ScoreReversedExtracts = 3;
1345 /// Constants.
1346 static const int ScoreConstants = 2;
1347 /// Instructions with the same opcode.
1348 static const int ScoreSameOpcode = 2;
1349 /// Instructions with alt opcodes (e.g, add + sub).
1350 static const int ScoreAltOpcodes = 1;
1351 /// Identical instructions (a.k.a. splat or broadcast).
1352 static const int ScoreSplat = 1;
1353 /// Matching with an undef is preferable to failing.
1354 static const int ScoreUndef = 1;
1355 /// Score for failing to find a decent match.
1356 static const int ScoreFail = 0;
1357 /// Score if all users are vectorized.
1358 static const int ScoreAllUserVectorized = 1;
1359
1360 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1361 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1362 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1363 /// MainAltOps.
1364 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1365 ArrayRef<Value *> MainAltOps) const {
1366 if (!isValidElementType(V1->getType()) ||
1367 !isValidElementType(V2->getType()))
1368 return LookAheadHeuristics::ScoreFail;
1369
1370 if (V1 == V2) {
1371 if (isa<LoadInst>(V1)) {
1372 // Returns true if the users of V1 and V2 won't need to be extracted.
1373 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1374 // Bail out if we have too many uses to save compilation time.
1375 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1376 return false;
1377
1378 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1379 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1380 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1381 });
1382 };
1383 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1384 };
1385 // A broadcast of a load can be cheaper on some targets.
1386 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1387 ElementCount::getFixed(NumLanes)) &&
1388 ((int)V1->getNumUses() == NumLanes ||
1389 AllUsersAreInternal(V1, V2)))
1390 return LookAheadHeuristics::ScoreSplatLoads;
1391 }
1392 return LookAheadHeuristics::ScoreSplat;
1393 }
1394
1395 auto *LI1 = dyn_cast<LoadInst>(V1);
1396 auto *LI2 = dyn_cast<LoadInst>(V2);
1397 if (LI1 && LI2) {
1398 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1399 !LI2->isSimple())
1400 return LookAheadHeuristics::ScoreFail;
1401
1402 std::optional<int> Dist = getPointersDiff(
1403 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1404 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1405 if (!Dist || *Dist == 0) {
1406 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1407 getUnderlyingObject(LI2->getPointerOperand()) &&
1408 R.TTI->isLegalMaskedGather(
1409 FixedVectorType::get(LI1->getType(), NumLanes),
1410 LI1->getAlign()))
1411 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1412 return LookAheadHeuristics::ScoreFail;
1413 }
1414 // The distance is too large - still may be profitable to use masked
1415 // loads/gathers.
1416 if (std::abs(*Dist) > NumLanes / 2)
1417 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1418 // This still will detect consecutive loads, but we might have "holes"
1419 // in some cases. It is ok for non-power-2 vectorization and may produce
1420 // better results. It should not affect current vectorization.
1421 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1422 : LookAheadHeuristics::ScoreReversedLoads;
1423 }
1424
1425 auto *C1 = dyn_cast<Constant>(V1);
1426 auto *C2 = dyn_cast<Constant>(V2);
1427 if (C1 && C2)
1428 return LookAheadHeuristics::ScoreConstants;
1429
1430 // Extracts from consecutive indexes of the same vector better score as
1431 // the extracts could be optimized away.
1432 Value *EV1;
1433 ConstantInt *Ex1Idx;
1434 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1435 // Undefs are always profitable for extractelements.
1436 // Compiler can easily combine poison and extractelement <non-poison> or
1437 // undef and extractelement <poison>. But combining undef +
1438 // extractelement <non-poison-but-may-produce-poison> requires some
1439 // extra operations.
1440 if (isa<UndefValue>(V2))
1441 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1442 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1443 : LookAheadHeuristics::ScoreSameOpcode;
1444 Value *EV2 = nullptr;
1445 ConstantInt *Ex2Idx = nullptr;
1446 if (match(V2,
1447 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1448 m_Undef())))) {
1449 // Undefs are always profitable for extractelements.
1450 if (!Ex2Idx)
1451 return LookAheadHeuristics::ScoreUndef;
1452 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1453 return LookAheadHeuristics::ScoreUndef;
1454 if (EV2 == EV1) {
1455 int Idx1 = Ex1Idx->getZExtValue();
1456 int Idx2 = Ex2Idx->getZExtValue();
1457 int Dist = Idx2 - Idx1;
1458 // The distance is too large - still may be profitable to use
1459 // shuffles.
1460 if (std::abs(Dist) == 0)
1461 return LookAheadHeuristics::ScoreSplat;
1462 if (std::abs(Dist) > NumLanes / 2)
1463 return LookAheadHeuristics::ScoreSameOpcode;
1464 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1465 : LookAheadHeuristics::ScoreReversedExtracts;
1466 }
1467 return LookAheadHeuristics::ScoreAltOpcodes;
1468 }
1469 return LookAheadHeuristics::ScoreFail;
1470 }
1471
1472 auto *I1 = dyn_cast<Instruction>(V1);
1473 auto *I2 = dyn_cast<Instruction>(V2);
1474 if (I1 && I2) {
1475 if (I1->getParent() != I2->getParent())
1476 return LookAheadHeuristics::ScoreFail;
1477 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1478 Ops.push_back(I1);
1479 Ops.push_back(I2);
1480 InstructionsState S = getSameOpcode(Ops, TLI);
1481 // Note: Only consider instructions with <= 2 operands to avoid
1482 // complexity explosion.
1483 if (S.getOpcode() &&
1484 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1485 !S.isAltShuffle()) &&
1486 all_of(Ops, [&S](Value *V) {
1487 return cast<Instruction>(V)->getNumOperands() ==
1488 S.MainOp->getNumOperands();
1489 }))
1490 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1491 : LookAheadHeuristics::ScoreSameOpcode;
1492 }
1493
1494 if (isa<UndefValue>(V2))
1495 return LookAheadHeuristics::ScoreUndef;
1496
1497 return LookAheadHeuristics::ScoreFail;
1498 }
1499
1500 /// Go through the operands of \p LHS and \p RHS recursively until
1501 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1502 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1503 /// of \p U1 and \p U2), except at the beginning of the recursion where
1504 /// these are set to nullptr.
1505 ///
1506 /// For example:
1507 /// \verbatim
1508 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1509 /// \ / \ / \ / \ /
1510 /// + + + +
1511 /// G1 G2 G3 G4
1512 /// \endverbatim
1513 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1514 /// each level recursively, accumulating the score. It starts from matching
1515 /// the additions at level 0, then moves on to the loads (level 1). The
1516 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1517 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1518 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1519 /// Please note that the order of the operands does not matter, as we
1520 /// evaluate the score of all profitable combinations of operands. In
1521 /// other words the score of G1 and G4 is the same as G1 and G2. This
1522 /// heuristic is based on ideas described in:
1523 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1524 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1525 /// Luís F. W. Góes
1526 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1527 Instruction *U2, int CurrLevel,
1528 ArrayRef<Value *> MainAltOps) const {
1529
1530 // Get the shallow score of V1 and V2.
1531 int ShallowScoreAtThisLevel =
1532 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1533
1534 // If reached MaxLevel,
1535 // or if V1 and V2 are not instructions,
1536 // or if they are SPLAT,
1537 // or if they are not consecutive,
1538 // or if profitable to vectorize loads or extractelements, early return
1539 // the current cost.
1540 auto *I1 = dyn_cast<Instruction>(LHS);
1541 auto *I2 = dyn_cast<Instruction>(RHS);
1542 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1543 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1544 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1545 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1546 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1547 ShallowScoreAtThisLevel))
1548 return ShallowScoreAtThisLevel;
1549 assert(I1 && I2 && "Should have early exited.");
1550
1551 // Contains the I2 operand indexes that got matched with I1 operands.
1552 SmallSet<unsigned, 4> Op2Used;
1553
1554 // Recursion towards the operands of I1 and I2. We are trying all possible
1555 // operand pairs, and keeping track of the best score.
1556 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1557 OpIdx1 != NumOperands1; ++OpIdx1) {
1558 // Try to pair op1I with the best operand of I2.
1559 int MaxTmpScore = 0;
1560 unsigned MaxOpIdx2 = 0;
1561 bool FoundBest = false;
1562 // If I2 is commutative try all combinations.
1563 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1564 unsigned ToIdx = isCommutative(I2)
1565 ? I2->getNumOperands()
1566 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1567 assert(FromIdx <= ToIdx && "Bad index");
1568 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1569 // Skip operands already paired with OpIdx1.
1570 if (Op2Used.count(OpIdx2))
1571 continue;
1572 // Recursively calculate the cost at each level
1573 int TmpScore =
1574 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1575 I1, I2, CurrLevel + 1, std::nullopt);
1576 // Look for the best score.
1577 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1578 TmpScore > MaxTmpScore) {
1579 MaxTmpScore = TmpScore;
1580 MaxOpIdx2 = OpIdx2;
1581 FoundBest = true;
1582 }
1583 }
1584 if (FoundBest) {
1585 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1586 Op2Used.insert(MaxOpIdx2);
1587 ShallowScoreAtThisLevel += MaxTmpScore;
1588 }
1589 }
1590 return ShallowScoreAtThisLevel;
1591 }
1592 };
1593 /// A helper data structure to hold the operands of a vector of instructions.
1594 /// This supports a fixed vector length for all operand vectors.
1595 class VLOperands {
1596 /// For each operand we need (i) the value, and (ii) the opcode that it
1597 /// would be attached to if the expression was in a left-linearized form.
1598 /// This is required to avoid illegal operand reordering.
1599 /// For example:
1600 /// \verbatim
1601 /// 0 Op1
1602 /// |/
1603 /// Op1 Op2 Linearized + Op2
1604 /// \ / ----------> |/
1605 /// - -
1606 ///
1607 /// Op1 - Op2 (0 + Op1) - Op2
1608 /// \endverbatim
1609 ///
1610 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1611 ///
1612 /// Another way to think of this is to track all the operations across the
1613 /// path from the operand all the way to the root of the tree and to
1614 /// calculate the operation that corresponds to this path. For example, the
1615 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1616 /// corresponding operation is a '-' (which matches the one in the
1617 /// linearized tree, as shown above).
1618 ///
1619 /// For lack of a better term, we refer to this operation as Accumulated
1620 /// Path Operation (APO).
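/// For example (illustrative), given the two lanes {A[0] + B[0], A[1] - B[1]},
/// the first operand of each lane gets APO == false, while the RHS operand of
/// the subtraction gets APO == true; during reordering, operands may only be
/// swapped with other operands of the same APO class.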
1621 struct OperandData {
1622 OperandData() = default;
1623 OperandData(Value *V, bool APO, bool IsUsed)
1624 : V(V), APO(APO), IsUsed(IsUsed) {}
1625 /// The operand value.
1626 Value *V = nullptr;
1627 /// TreeEntries only allow a single opcode, or an alternate sequence of
1628 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1629 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1630 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1631 /// (e.g., Add/Mul)
1632 bool APO = false;
1633 /// Helper data for the reordering function.
1634 bool IsUsed = false;
1635 };
1636
1637 /// During operand reordering, we are trying to select the operand at a lane
1638 /// that best matches the operand at the neighboring lane. Our
1639 /// selection is based on the type of value we are looking for. For example,
1640 /// if the neighboring lane has a load, we need to look for a load that is
1641 /// accessing a consecutive address. These strategies are summarized in the
1642 /// 'ReorderingMode' enumerator.
1643 enum class ReorderingMode {
1644 Load, ///< Matching loads to consecutive memory addresses
1645 Opcode, ///< Matching instructions based on opcode (same or alternate)
1646 Constant, ///< Matching constants
1647 Splat, ///< Matching the same instruction multiple times (broadcast)
1648 Failed, ///< We failed to create a vectorizable group
1649 };
1650
1651 using OperandDataVec = SmallVector<OperandData, 2>;
1652
1653 /// A vector of operand vectors.
1654 SmallVector<OperandDataVec, 4> OpsVec;
1655
1656 const TargetLibraryInfo &TLI;
1657 const DataLayout &DL;
1658 ScalarEvolution &SE;
1659 const BoUpSLP &R;
1660
1661 /// \returns the operand data at \p OpIdx and \p Lane.
1662 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1663 return OpsVec[OpIdx][Lane];
1664 }
1665
1666 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1667 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1668 return OpsVec[OpIdx][Lane];
1669 }
1670
1671 /// Clears the used flag for all entries.
1672 void clearUsed() {
1673 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1674 OpIdx != NumOperands; ++OpIdx)
1675 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1676 ++Lane)
1677 OpsVec[OpIdx][Lane].IsUsed = false;
1678 }
1679
1680 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1681 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1682 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1683 }
1684
1685 /// \param Lane lane of the operands under analysis.
1686 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1687 /// the best candidate.
1688 /// \param Idx operand index of the current candidate value.
1689 /// \returns The additional score due to possible broadcasting of the
1690 /// elements in the lane. It is more profitable to have power-of-2 unique
1691 /// elements in the lane, since they will be vectorized with higher probability
1692 /// after removing duplicates. Currently the SLP vectorizer supports only
1693 /// vectorization of a power-of-2 number of unique scalars.
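/// For example (illustrative): if choosing the candidate leaves 4 unique
/// values in the lane (PowerOf2Ceil(4) - 4 == 0) while keeping the current
/// operand leaves 3 (PowerOf2Ceil(3) - 3 == 1), the result is 1 - 0 == +1,
/// rewarding the candidate that completes a power-of-2 set of unique scalars.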
1694 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1695 Value *IdxLaneV = getData(Idx, Lane).V;
1696 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1697 return 0;
1698 SmallPtrSet<Value *, 4> Uniques;
1699 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1700 if (Ln == Lane)
1701 continue;
1702 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1703 if (!isa<Instruction>(OpIdxLnV))
1704 return 0;
1705 Uniques.insert(OpIdxLnV);
1706 }
1707 int UniquesCount = Uniques.size();
1708 int UniquesCntWithIdxLaneV =
1709 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1710 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1711 int UniquesCntWithOpIdxLaneV =
1712 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1713 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1714 return 0;
1715 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1716 UniquesCntWithOpIdxLaneV) -
1717 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1718 }
1719
1720 /// \param Lane lane of the operands under analysis.
1721 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1722 /// the best candidate.
1723 /// \param Idx operand index of the current candidate value.
1724 /// \returns The additional score for the scalar which users are all
1725 /// vectorized.
1726 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1727 Value *IdxLaneV = getData(Idx, Lane).V;
1728 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1729 // Do not care about number of uses for vector-like instructions
1730 // (extractelement/extractvalue with constant indices), they are extracts
1731 // themselves and already externally used. Vectorization of such
1732 // instructions does not add extra extractelement instruction, just may
1733 // remove it.
1734 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1735 isVectorLikeInstWithConstOps(OpIdxLaneV))
1736 return LookAheadHeuristics::ScoreAllUserVectorized;
1737 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1738 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1739 return 0;
1740 return R.areAllUsersVectorized(IdxLaneI)
1741 ? LookAheadHeuristics::ScoreAllUserVectorized
1742 : 0;
1743 }
1744
1745 /// Score scaling factor for fully compatible instructions but with
1746 /// different number of external uses. Allows better selection of the
1747 /// instructions with less external uses.
1748 static const int ScoreScaleFactor = 10;
1749
1750 /// \returns the look-ahead score, which tells us how much the sub-trees
1751 /// rooted at \p LHS and \p RHS match; the more they match, the higher the
1752 /// score. This helps break ties in an informed way when we cannot decide on
1753 /// the order of the operands by just considering the immediate
1754 /// predecessors.
1755 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1756 int Lane, unsigned OpIdx, unsigned Idx,
1757 bool &IsUsed) {
1758 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1759 LookAheadMaxDepth);
1760 // Keep track of the instruction stack as we recurse into the operands
1761 // during the look-ahead score exploration.
1762 int Score =
1763 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1764 /*CurrLevel=*/1, MainAltOps);
1765 if (Score) {
1766 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1767 if (Score <= -SplatScore) {
1768 // Set the minimum score for splat-like sequence to avoid setting
1769 // failed state.
1770 Score = 1;
1771 } else {
1772 Score += SplatScore;
1773 // Scale score to see the difference between different operands
1774 // and similar operands but all vectorized/not all vectorized
1775 // uses. It does not affect actual selection of the best
1776 // compatible operand in general, just allows to select the
1777 // operand with all vectorized uses.
1778 Score *= ScoreScaleFactor;
1779 Score += getExternalUseScore(Lane, OpIdx, Idx);
1780 IsUsed = true;
1781 }
1782 }
1783 return Score;
1784 }
1785
1786 /// Best defined scores per lanes between the passes. Used to choose the
1787 /// best operand (with the highest score) between the passes.
1788 /// The key - {Operand Index, Lane}.
1789 /// The value - the best score between the passes for the lane and the
1790 /// operand.
1791 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1792 BestScoresPerLanes;
1793
1794 // Search all operands in Ops[*][Lane] for the one that matches best
1795 // Ops[OpIdx][LastLane] and return its operand index.
1796 // If no good match can be found, return std::nullopt.
1797 std::optional<unsigned>
1798 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1799 ArrayRef<ReorderingMode> ReorderingModes,
1800 ArrayRef<Value *> MainAltOps) {
1801 unsigned NumOperands = getNumOperands();
1802
1803 // The operand of the previous lane at OpIdx.
1804 Value *OpLastLane = getData(OpIdx, LastLane).V;
1805
1806 // Our strategy mode for OpIdx.
1807 ReorderingMode RMode = ReorderingModes[OpIdx];
1808 if (RMode == ReorderingMode::Failed)
1809 return std::nullopt;
1810
1811 // The linearized opcode of the operand at OpIdx, Lane.
1812 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1813
1814 // The best operand index and its score.
1815 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1816 // are using the score to differentiate between the two.
1817 struct BestOpData {
1818 std::optional<unsigned> Idx;
1819 unsigned Score = 0;
1820 } BestOp;
1821 BestOp.Score =
1822 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1823 .first->second;
1824
1825 // Track if the operand must be marked as used. If the operand is set to
1826 // Score 1 explicitly (because of non-power-of-2 unique scalars, we may
1827 // want to reestimate the operands again on the following iterations).
1828 bool IsUsed =
1829 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1830 // Iterate through all unused operands and look for the best.
1831 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1832 // Get the operand at Idx and Lane.
1833 OperandData &OpData = getData(Idx, Lane);
1834 Value *Op = OpData.V;
1835 bool OpAPO = OpData.APO;
1836
1837 // Skip already selected operands.
1838 if (OpData.IsUsed)
1839 continue;
1840
1841 // Skip if we are trying to move the operand to a position with a
1842 // different opcode in the linearized tree form. This would break the
1843 // semantics.
1844 if (OpAPO != OpIdxAPO)
1845 continue;
1846
1847 // Look for an operand that matches the current mode.
1848 switch (RMode) {
1849 case ReorderingMode::Load:
1850 case ReorderingMode::Constant:
1851 case ReorderingMode::Opcode: {
1852 bool LeftToRight = Lane > LastLane;
1853 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1854 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1855 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1856 OpIdx, Idx, IsUsed);
1857 if (Score > static_cast<int>(BestOp.Score)) {
1858 BestOp.Idx = Idx;
1859 BestOp.Score = Score;
1860 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1861 }
1862 break;
1863 }
1864 case ReorderingMode::Splat:
1865 if (Op == OpLastLane)
1866 BestOp.Idx = Idx;
1867 break;
1868 case ReorderingMode::Failed:
1869 llvm_unreachable("Not expected Failed reordering mode.");
1870 }
1871 }
1872
1873 if (BestOp.Idx) {
1874 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1875 return BestOp.Idx;
1876 }
1877 // If we could not find a good match return std::nullopt.
1878 return std::nullopt;
1879 }
1880
1881 /// Helper for reorder().
1882 /// \returns the lane that we should start reordering from. This is the one
1883 /// with the fewest operands that can freely move about, or the one that is
1884 /// least profitable to reorder because it already has the most optimal set of operands.
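/// For example, a lane whose operation is non-commutative (e.g., a
/// subtraction) has operands that cannot be swapped at all, which makes it a
/// natural lane to start the greedy reordering in reorder() from.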
1885 unsigned getBestLaneToStartReordering() const {
1886 unsigned Min = UINT_MAX;
1887 unsigned SameOpNumber = 0;
1888 // std::pair<unsigned, unsigned> is used to implement a simple voting
1889 // algorithm and choose the lane with the least number of operands that
1890 // can freely move about, or that is least profitable because it already has the
1891 // most optimal set of operands. The first unsigned is a counter for
1892 // voting, the second unsigned is the counter of lanes with instructions
1893 // with same/alternate opcodes and same parent basic block.
1894 SmallDenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
1895 // Try to be closer to the original results, if we have multiple lanes
1896 // with same cost. If 2 lanes have the same cost, use the one with the
1897 // lowest index.
1898 for (int I = getNumLanes(); I > 0; --I) {
1899 unsigned Lane = I - 1;
1900 OperandsOrderData NumFreeOpsHash =
1901 getMaxNumOperandsThatCanBeReordered(Lane);
1902 // Compare the number of operands that can move and choose the one with
1903 // the least number.
1904 if (NumFreeOpsHash.NumOfAPOs < Min) {
1905 Min = NumFreeOpsHash.NumOfAPOs;
1906 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1907 HashMap.clear();
1908 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1909 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1910 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1911 // Select the most optimal lane in terms of number of operands that
1912 // should be moved around.
1913 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1914 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1915 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1916 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1917 auto *It = HashMap.find(NumFreeOpsHash.Hash);
1918 if (It == HashMap.end())
1919 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1920 else
1921 ++It->second.first;
1922 }
1923 }
1924 // Select the lane with the minimum counter.
1925 unsigned BestLane = 0;
1926 unsigned CntMin = UINT_MAX;
1927 for (const auto &Data : reverse(HashMap)) {
1928 if (Data.second.first < CntMin) {
1929 CntMin = Data.second.first;
1930 BestLane = Data.second.second;
1931 }
1932 }
1933 return BestLane;
1934 }
1935
1936 /// Data structure that helps to reorder operands.
1937 struct OperandsOrderData {
1938 /// The best number of operands with the same APOs, which can be
1939 /// reordered.
1940 unsigned NumOfAPOs = UINT_MAX;
1941 /// Number of operands with the same/alternate instruction opcode and
1942 /// parent.
1943 unsigned NumOpsWithSameOpcodeParent = 0;
1944 /// Hash for the actual operands ordering.
1945 /// Used to count operands, actually their position id and opcode
1946 /// value. It is used in the voting mechanism to find the lane with the
1947 /// least number of operands that can freely move about, or that is least profitable
1948 /// because it already has the most optimal set of operands. Can be
1949 /// replaced with SmallVector<unsigned> instead but hash code is faster
1950 /// and requires less memory.
1951 unsigned Hash = 0;
1952 };
1953 /// \returns the maximum number of operands that are allowed to be reordered
1954 /// for \p Lane and the number of compatible instructions (with the same
1955 /// parent/opcode). This is used as a heuristic for selecting the first lane
1956 /// to start operand reordering.
1957 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1958 unsigned CntTrue = 0;
1959 unsigned NumOperands = getNumOperands();
1960 // Operands with the same APO can be reordered. We therefore need to count
1961 // how many of them we have for each APO, like this: Cnt[APO] = x.
1962 // Since we only have two APOs, namely true and false, we can avoid using
1963 // a map. Instead we can simply count the number of operands that
1964 // correspond to one of them (in this case the 'true' APO), and calculate
1965 // the other by subtracting it from the total number of operands.
1966 // Operands with the same instruction opcode and parent are more
1967 // profitable since we don't need to move them in many cases, with a high
1968 // probability such lane already can be vectorized effectively.
1969 bool AllUndefs = true;
1970 unsigned NumOpsWithSameOpcodeParent = 0;
1971 Instruction *OpcodeI = nullptr;
1972 BasicBlock *Parent = nullptr;
1973 unsigned Hash = 0;
1974 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1975 const OperandData &OpData = getData(OpIdx, Lane);
1976 if (OpData.APO)
1977 ++CntTrue;
1978 // Use Boyer-Moore majority voting for finding the majority opcode and
1979 // the number of times it occurs.
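// For example (illustrative), the per-lane opcode sequence {add, add, sub, add}
// (all in the same parent block) ends with 'add' as the majority candidate and
// NumOpsWithSameOpcodeParent == 2.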
1980 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1981 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
1982 I->getParent() != Parent) {
1983 if (NumOpsWithSameOpcodeParent == 0) {
1984 NumOpsWithSameOpcodeParent = 1;
1985 OpcodeI = I;
1986 Parent = I->getParent();
1987 } else {
1988 --NumOpsWithSameOpcodeParent;
1989 }
1990 } else {
1991 ++NumOpsWithSameOpcodeParent;
1992 }
1993 }
1994 Hash = hash_combine(
1995 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1996 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1997 }
1998 if (AllUndefs)
1999 return {};
2000 OperandsOrderData Data;
2001 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2002 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2003 Data.Hash = Hash;
2004 return Data;
2005 }
2006
2007 /// Go through the instructions in VL and append their operands.
2008 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2009 assert(!VL.empty() && "Bad VL");
2010 assert((empty() || VL.size() == getNumLanes()) &&
2011 "Expected same number of lanes");
2012 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2013 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2014 constexpr unsigned IntrinsicNumOperands = 2;
2015 if (isa<IntrinsicInst>(VL[0]))
2016 NumOperands = IntrinsicNumOperands;
2017 OpsVec.resize(NumOperands);
2018 unsigned NumLanes = VL.size();
2019 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2020 OpsVec[OpIdx].resize(NumLanes);
2021 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2022 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2023 // Our tree has just 3 nodes: the root and two operands.
2024 // It is therefore trivial to get the APO. We only need to check the
2025 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2026 // RHS operand. The LHS operand of both add and sub is never attached
2027 // to an inverse operation in the linearized form, therefore its APO
2028 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2029
2030 // Since operand reordering is performed on groups of commutative
2031 // operations or alternating sequences (e.g., +, -), we can safely
2032 // tell the inverse operations by checking commutativity.
2033 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2034 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2035 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2036 APO, false};
2037 }
2038 }
2039 }
2040
2041 /// \returns the number of operands.
2042 unsigned getNumOperands() const { return OpsVec.size(); }
2043
2044 /// \returns the number of lanes.
2045 unsigned getNumLanes() const { return OpsVec[0].size(); }
2046
2047 /// \returns the operand value at \p OpIdx and \p Lane.
2048 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2049 return getData(OpIdx, Lane).V;
2050 }
2051
2052 /// \returns true if the data structure is empty.
2053 bool empty() const { return OpsVec.empty(); }
2054
2055 /// Clears the data.
2056 void clear() { OpsVec.clear(); }
2057
2058 /// \returns true if there are enough operands identical to \p Op to fill
2059 /// the whole vector.
2060 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
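/// For example (illustrative), with lanes {X + Y0, X + Y1, X + Y2, X + Y3},
/// X can be broadcast for operand 0 because an unused copy of X with the same
/// APO is found in every other lane.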
2061 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2062 bool OpAPO = getData(OpIdx, Lane).APO;
2063 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2064 if (Ln == Lane)
2065 continue;
2066 // This is set to true if we found a candidate for broadcast at Lane.
2067 bool FoundCandidate = false;
2068 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2069 OperandData &Data = getData(OpI, Ln);
2070 if (Data.APO != OpAPO || Data.IsUsed)
2071 continue;
2072 if (Data.V == Op) {
2073 FoundCandidate = true;
2074 Data.IsUsed = true;
2075 break;
2076 }
2077 }
2078 if (!FoundCandidate)
2079 return false;
2080 }
2081 return true;
2082 }
2083
2084 public:
2085 /// Initialize with all the operands of the instruction vector \p RootVL.
2086 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2087 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
2088 // Append all the operands of RootVL.
2089 appendOperandsOfVL(RootVL);
2090 }
2091
2092 /// \returns a value vector with the operands across all lanes for the
2093 /// operand at \p OpIdx.
2094 ValueList getVL(unsigned OpIdx) const {
2095 ValueList OpVL(OpsVec[OpIdx].size());
2096 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2097 "Expected same num of lanes across all operands");
2098 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2099 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2100 return OpVL;
2101 }
2102
2103 // Performs operand reordering for 2 or more operands.
2104 // The operands are stored in OpsVec[OpIdx][Lane] and are reordered in
2105 // place.
2106 void reorder() {
2107 unsigned NumOperands = getNumOperands();
2108 unsigned NumLanes = getNumLanes();
2109 // Each operand has its own mode. We are using this mode to help us select
2110 // the instructions for each lane, so that they match best with the ones
2111 // we have selected so far.
2112 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2113
2114 // This is a greedy single-pass algorithm. We are going over each lane
2115 // once and deciding on the best order right away with no back-tracking.
2116 // However, in order to increase its effectiveness, we start with the lane
2117 // that has operands that can move the least. For example, given the
2118 // following lanes:
2119 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2120 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2121 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2122 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2123 // we will start at Lane 1, since the operands of the subtraction cannot
2124 // be reordered. Then we will visit the rest of the lanes in a circular
2125 // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
2126
2127 // Find the first lane that we will start our search from.
2128 unsigned FirstLane = getBestLaneToStartReordering();
2129
2130 // Initialize the modes.
2131 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2132 Value *OpLane0 = getValue(OpIdx, FirstLane);
2133 // Keep track if we have instructions with all the same opcode on one
2134 // side.
2135 if (isa<LoadInst>(OpLane0))
2136 ReorderingModes[OpIdx] = ReorderingMode::Load;
2137 else if (isa<Instruction>(OpLane0)) {
2138 // Check if OpLane0 should be broadcast.
2139 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2140 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2141 else
2142 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2143 }
2144 else if (isa<Constant>(OpLane0))
2145 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2146 else if (isa<Argument>(OpLane0))
2147 // Our best hope is a Splat. It may save some cost in some cases.
2148 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2149 else
2150 // NOTE: This should be unreachable.
2151 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2152 }
2153
2154 // Check that we don't have the same operands. There is no need to reorder
2155 // if the operands are just a perfect or a shuffled diamond match. Do not
2156 // skip reordering for possible broadcasts or a non-power-of-2 number of
2157 // scalars (just for now).
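// For example (illustrative), with lanes {A + B, B + A, C + D, D + C} the
// second operand vector only permutes values already present in the first
// one, so this is a shuffled diamond match and reordering is skipped.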
2158 auto &&SkipReordering = [this]() {
2159 SmallPtrSet<Value *, 4> UniqueValues;
2160 ArrayRef<OperandData> Op0 = OpsVec.front();
2161 for (const OperandData &Data : Op0)
2162 UniqueValues.insert(Data.V);
2163 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2164 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2165 return !UniqueValues.contains(Data.V);
2166 }))
2167 return false;
2168 }
2169 // TODO: Check if we can remove the check for a non-power-of-2 number of
2170 // scalars after full support of non-power-of-2 vectorization.
2171 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2172 };
2173
2174 // If the initial strategy fails for any of the operand indexes, then we
2175 // perform reordering again in a second pass. This helps avoid assigning
2176 // high priority to the failed strategy, and should improve reordering for
2177 // the non-failed operand indexes.
2178 for (int Pass = 0; Pass != 2; ++Pass) {
2179 // Check if there is no need to reorder operands since they are a perfect or
2180 // shuffled diamond match.
2181 // Need to do it to avoid extra external use cost counting for
2182 // shuffled matches, which may cause regressions.
2183 if (SkipReordering())
2184 break;
2185 // Skip the second pass if the first pass did not fail.
2186 bool StrategyFailed = false;
2187 // Mark all operand data as free to use.
2188 clearUsed();
2189 // We keep the original operand order for the FirstLane, so reorder the
2190 // rest of the lanes. We are visiting the nodes in a circular fashion,
2191 // using FirstLane as the center point and increasing the radius
2192 // distance.
2193 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2194 for (unsigned I = 0; I < NumOperands; ++I)
2195 MainAltOps[I].push_back(getData(I, FirstLane).V);
2196
2197 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2198 // Visit the lane on the right and then the lane on the left.
2199 for (int Direction : {+1, -1}) {
2200 int Lane = FirstLane + Direction * Distance;
2201 if (Lane < 0 || Lane >= (int)NumLanes)
2202 continue;
2203 int LastLane = Lane - Direction;
2204 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2205 "Out of bounds");
2206 // Look for a good match for each operand.
2207 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2208 // Search for the operand that best matches Ops[OpIdx][LastLane].
2209 std::optional<unsigned> BestIdx = getBestOperand(
2210 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2211 // By not selecting a value, we allow the operands that follow to
2212 // select a better matching value. We will get a non-null value in
2213 // the next run of getBestOperand().
2214 if (BestIdx) {
2215 // Swap the current operand with the one returned by
2216 // getBestOperand().
2217 swap(OpIdx, *BestIdx, Lane);
2218 } else {
2219 // We failed to find a best operand, set mode to 'Failed'.
2220 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2221 // Enable the second pass.
2222 StrategyFailed = true;
2223 }
2224 // Try to get the alternate opcode and follow it during analysis.
2225 if (MainAltOps[OpIdx].size() != 2) {
2226 OperandData &AltOp = getData(OpIdx, Lane);
2227 InstructionsState OpS =
2228 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2229 if (OpS.getOpcode() && OpS.isAltShuffle())
2230 MainAltOps[OpIdx].push_back(AltOp.V);
2231 }
2232 }
2233 }
2234 }
2235 // Skip second pass if the strategy did not fail.
2236 if (!StrategyFailed)
2237 break;
2238 }
2239 }
2240
2241#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2242 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2243 switch (RMode) {
2244 case ReorderingMode::Load:
2245 return "Load";
2246 case ReorderingMode::Opcode:
2247 return "Opcode";
2248 case ReorderingMode::Constant:
2249 return "Constant";
2250 case ReorderingMode::Splat:
2251 return "Splat";
2252 case ReorderingMode::Failed:
2253 return "Failed";
2254 }
2255 llvm_unreachable("Unimplemented Reordering Type");
2256 }
2257
2258 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2259 raw_ostream &OS) {
2260 return OS << getModeStr(RMode);
2261 }
2262
2263 /// Debug print.
2264 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2265 printMode(RMode, dbgs());
2266 }
2267
2268 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2269 return printMode(RMode, OS);
2270 }
2271
2272 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2273 const unsigned Indent = 2;
2274 unsigned Cnt = 0;
2275 for (const OperandDataVec &OpDataVec : OpsVec) {
2276 OS << "Operand " << Cnt++ << "\n";
2277 for (const OperandData &OpData : OpDataVec) {
2278 OS.indent(Indent) << "{";
2279 if (Value *V = OpData.V)
2280 OS << *V;
2281 else
2282 OS << "null";
2283 OS << ", APO:" << OpData.APO << "}\n";
2284 }
2285 OS << "\n";
2286 }
2287 return OS;
2288 }
2289
2290 /// Debug print.
2291 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2292#endif
2293 };
2294
2295 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2296 /// for the pair with the highest score, deemed to have the best chance to
2297 /// form the root of a profitable tree to vectorize. Return std::nullopt if
2298 /// no candidate scored above LookAheadHeuristics::ScoreFail.
2299 /// \param Limit Lower limit of the score considered to be good enough.
2300 std::optional<int>
2301 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2302 int Limit = LookAheadHeuristics::ScoreFail) const {
2303 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2304 RootLookAheadMaxDepth);
2305 int BestScore = Limit;
2306 std::optional<int> Index;
2307 for (int I : seq<int>(0, Candidates.size())) {
2308 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2309 Candidates[I].second,
2310 /*U1=*/nullptr, /*U2=*/nullptr,
2311 /*Level=*/1, std::nullopt);
2312 if (Score > BestScore) {
2313 BestScore = Score;
2314 Index = I;
2315 }
2316 }
2317 return Index;
2318 }
2319
2320 /// Checks if the instruction is marked for deletion.
2321 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2322
2323 /// Removes an instruction from its block and eventually deletes it.
2324 /// It's like Instruction::eraseFromParent() except that the actual deletion
2325 /// is delayed until BoUpSLP is destructed.
2326 void eraseInstruction(Instruction *I) {
2327 DeletedInstructions.insert(I);
2328 }
2329
2330 /// Checks if the instruction was already analyzed for being possible
2331 /// reduction root.
2332 bool isAnalyzedReductionRoot(Instruction *I) const {
2333 return AnalyzedReductionsRoots.count(I);
2334 }
2335 /// Register given instruction as already analyzed for being possible
2336 /// reduction root.
2337 void analyzedReductionRoot(Instruction *I) {
2338 AnalyzedReductionsRoots.insert(I);
2339 }
2340 /// Checks if the provided list of reduced values was checked already for
2341 /// vectorization.
2342 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2343 return AnalyzedReductionVals.contains(hash_value(VL));
2344 }
2345 /// Adds the list of reduced values to list of already checked values for the
2346 /// vectorization.
2347 void analyzedReductionVals(ArrayRef<Value *> VL) {
2348 AnalyzedReductionVals.insert(hash_value(VL));
2349 }
2350 /// Clear the list of the analyzed reduction root instructions.
2351 void clearReductionData() {
2352 AnalyzedReductionsRoots.clear();
2353 AnalyzedReductionVals.clear();
2354 AnalyzedMinBWVals.clear();
2355 }
2356 /// Checks if the given value is gathered in one of the nodes.
2357 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2358 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2359 }
2360 /// Checks if the given value is gathered in one of the nodes.
2361 bool isGathered(const Value *V) const {
2362 return MustGather.contains(V);
2363 }
2364 /// Checks if the specified value was not scheduled.
2365 bool isNotScheduled(const Value *V) const {
2366 return NonScheduledFirst.contains(V);
2367 }
2368
2369 /// Check if the value is vectorized in the tree.
2370 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2371
2372 ~BoUpSLP();
2373
2374private:
2375 /// Determine if a node \p E can be demoted to a smaller type with a
2376 /// truncation. We collect the entries that will be demoted in ToDemote.
2377 /// \param E Node for analysis
2378 /// \param ToDemote indices of the nodes to be demoted.
2379 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2380 unsigned &BitWidth,
2381 SmallVectorImpl<unsigned> &ToDemote,
2383 unsigned &MaxDepthLevel,
2384 bool &IsProfitableToDemote,
2385 bool IsTruncRoot) const;
2386
2387 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2388 /// reordering (i.e. the operands can be reordered because they have only one
2389 /// user and are reorderable).
2390 /// \param ReorderableGathers List of all gather nodes that require reordering
2391 /// (e.g., gather of extractelements or partially vectorizable loads).
2392 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2393 /// reordering, subset of \p NonVectorized.
2394 bool
2395 canReorderOperands(TreeEntry *UserTE,
2396 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2397 ArrayRef<TreeEntry *> ReorderableGathers,
2398 SmallVectorImpl<TreeEntry *> &GatherOps);
2399
2400 /// Checks if the given \p TE is a gather node with clustered reused scalars
2401 /// and reorders it per given \p Mask.
2402 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2403
2404 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2405 /// if any. If it is not vectorized (gather node), returns nullptr.
2406 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2407 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2408 TreeEntry *TE = nullptr;
2409 const auto *It = find_if(VL, [&](Value *V) {
2410 TE = getTreeEntry(V);
2411 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2412 return true;
2413 auto It = MultiNodeScalars.find(V);
2414 if (It != MultiNodeScalars.end()) {
2415 for (TreeEntry *E : It->second) {
2416 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2417 TE = E;
2418 return true;
2419 }
2420 }
2421 }
2422 return false;
2423 });
2424 if (It != VL.end()) {
2425 assert(TE->isSame(VL) && "Expected same scalars.");
2426 return TE;
2427 }
2428 return nullptr;
2429 }
2430
2431 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2432 /// if any. If it is not vectorized (gather node), returns nullptr.
2433 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2434 unsigned OpIdx) const {
2435 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2436 const_cast<TreeEntry *>(UserTE), OpIdx);
2437 }
2438
2439 /// Checks if all users of \p I are the part of the vectorization tree.
2440 bool areAllUsersVectorized(
2441 Instruction *I,
2442 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2443
2444 /// Return information about the vector formed for the specified index
2445 /// of a vector of (the same) instruction.
2447
2448 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2449 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2450
2451 /// \returns Cast context for the given graph node.
2452 TargetTransformInfo::CastContextHint
2453 getCastContextHint(const TreeEntry &TE) const;
2454
2455 /// \returns the cost of the vectorizable entry.
2456 InstructionCost getEntryCost(const TreeEntry *E,
2457 ArrayRef<Value *> VectorizedVals,
2458 SmallPtrSetImpl<Value *> &CheckedExtracts);
2459
2460 /// This is the recursive part of buildTree.
2461 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2462 const EdgeInfo &EI);
2463
2464 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2465 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2466 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2467 /// returns false, setting \p CurrentOrder to either an empty vector or a
2468 /// non-identity permutation that allows reusing extract instructions.
2469 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2470 /// extract order.
2471 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2472 SmallVectorImpl<unsigned> &CurrentOrder,
2473 bool ResizeAllowed = false) const;
2474
2475 /// Vectorize a single entry in the tree.
2476 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2477 /// avoid issues with def-use order.
2478 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2479
2480 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2481 /// \p E.
2482 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2483 /// avoid issues with def-use order.
2484 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2485
2486 /// Create a new vector from a list of scalar values. Produces a sequence
2487 /// which exploits values reused across lanes, and arranges the inserts
2488 /// for ease of later optimization.
2489 template <typename BVTy, typename ResTy, typename... Args>
2490 ResTy processBuildVector(const TreeEntry *E, Args &...Params);
2491
2492 /// Create a new vector from a list of scalar values. Produces a sequence
2493 /// which exploits values reused across lanes, and arranges the inserts
2494 /// for ease of later optimization.
2495 Value *createBuildVector(const TreeEntry *E);
2496
2497 /// Returns the instruction in the bundle, which can be used as a base point
2498 /// for scheduling. Usually it is the last instruction in the bundle, except
2499 /// for the case when all operands are external (in this case, it is the first
2500 /// instruction in the list).
2501 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2502
2503 /// Tries to find extractelement instructions with constant indices from fixed
2504 /// vector type and gather such instructions into a bunch, which is highly
2505 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
2506 /// was successful, the matched scalars are replaced by poison values in \p VL
2507 /// for future analysis.
2508 std::optional<TargetTransformInfo::ShuffleKind>
2509 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2510 SmallVectorImpl<int> &Mask) const;
2511
2512 /// Tries to find extractelement instructions with constant indices from fixed
2513 /// vector type and gather such instructions into a bunch, which is highly
2514 /// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
2515 /// was successful, the matched scalars are replaced by poison values in \p VL
2516 /// for future analysis.
2518 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2520 unsigned NumParts) const;
2521
2522 /// Checks if the gathered \p VL can be represented as a single register
2523 /// shuffle(s) of previous tree entries.
2524 /// \param TE Tree entry checked for permutation.
2525 /// \param VL List of scalars (a subset of the TE scalar), checked for
2526 /// permutations. Must form single-register vector.
2527 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2528 /// commands to build the mask using the original vector value, without
2529 /// relying on the potential reordering.
2530 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2531 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2532 std::optional<TargetTransformInfo::ShuffleKind>
2533 isGatherShuffledSingleRegisterEntry(
2534 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2535 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2536 bool ForOrder);
2537
2538 /// Checks if the gathered \p VL can be represented as multi-register
2539 /// shuffle(s) of previous tree entries.
2540 /// \param TE Tree entry checked for permutation.
2541 /// \param VL List of scalars (a subset of the TE scalar), checked for
2542 /// permutations.
2543 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2544 /// commands to build the mask using the original vector value, without
2545 /// relying on the potential reordering.
2546 /// \returns per-register series of ShuffleKind, if gathered values can be
2547 /// represented as shuffles of previous tree entries. \p Mask is filled with
2548 /// the shuffle mask (also on per-register base).
2550 isGatherShuffledEntry(
2551 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2553 unsigned NumParts, bool ForOrder = false);
2554
2555 /// \returns the scalarization cost for this list of values. Assuming that
2556 /// this subtree gets vectorized, we may need to extract the values from the
2557 /// roots. This method calculates the cost of extracting the values.
2558 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2559 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
2560
2561 /// Set the Builder insert point to one after the last instruction in
2562 /// the bundle
2563 void setInsertPointAfterBundle(const TreeEntry *E);
2564
2565 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
2566 /// specified, the starting vector value is poison.
2567 Value *gather(ArrayRef<Value *> VL, Value *Root);
2568
2569 /// \returns whether the VectorizableTree is fully vectorizable and will
2570 /// be beneficial even if the tree height is tiny.
2571 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2572
2573 /// Reorder commutative or alt operands to get better probability of
2574 /// generating vectorized code.
2575 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2576 SmallVectorImpl<Value *> &Left,
2577 SmallVectorImpl<Value *> &Right,
2578 const BoUpSLP &R);
2579
2580 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2581 /// users of \p TE and collects the stores. It returns the map from the store
2582 /// pointers to the collected stores.
2583 DenseMap<Value *, SmallVector<StoreInst *>>
2584 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2585
2586 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2587 /// stores in \p StoresVec can form a vector instruction. If so it returns
2588 /// true and populates \p ReorderIndices with the shuffle indices of the
2589 /// stores when compared to the sorted vector.
2590 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2591 OrdersType &ReorderIndices) const;
2592
2593 /// Iterates through the users of \p TE, looking for scalar stores that can be
2594 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2595 /// their order and builds an order index vector for each store bundle. It
2596 /// returns all these order vectors found.
2597 /// We run this after the tree has formed, otherwise we may come across user
2598 /// instructions that are not yet in the tree.
2600 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2601
2602 struct TreeEntry {
2603 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2604 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2605
2606 /// \returns Common mask for reorder indices and reused scalars.
2607 SmallVector<int> getCommonMask() const {
2608 SmallVector<int> Mask;
2609 inversePermutation(ReorderIndices, Mask);
2610 ::addMask(Mask, ReuseShuffleIndices);
2611 return Mask;
2612 }
2613
2614 /// \returns true if the scalars in VL are equal to this entry.
2615 bool isSame(ArrayRef<Value *> VL) const {
2616 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2617 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2618 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2619 return VL.size() == Mask.size() &&
2620 std::equal(VL.begin(), VL.end(), Mask.begin(),
2621 [Scalars](Value *V, int Idx) {
2622 return (isa<UndefValue>(V) &&
2623 Idx == PoisonMaskElem) ||
2624 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2625 });
2626 };
2627 if (!ReorderIndices.empty()) {
2628 // TODO: implement matching if the nodes are just reordered, still can
2629 // treat the vector as the same if the list of scalars matches VL
2630 // directly, without reordering.
2631 SmallVector<int> Mask;
2632 inversePermutation(ReorderIndices, Mask);
2633 if (VL.size() == Scalars.size())
2634 return IsSame(Scalars, Mask);
2635 if (VL.size() == ReuseShuffleIndices.size()) {
2636 ::addMask(Mask, ReuseShuffleIndices);
2637 return IsSame(Scalars, Mask);
2638 }
2639 return false;
2640 }
2641 return IsSame(Scalars, ReuseShuffleIndices);
2642 }
2643
2644 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2645 return State == TreeEntry::NeedToGather &&
2646 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2647 UserTreeIndices.front().UserTE == UserEI.UserTE;
2648 }
2649
2650 /// \returns true if current entry has same operands as \p TE.
2651 bool hasEqualOperands(const TreeEntry &TE) const {
2652 if (TE.getNumOperands() != getNumOperands())
2653 return false;
2654 SmallBitVector Used(getNumOperands());
2655 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2656 unsigned PrevCount = Used.count();
2657 for (unsigned K = 0; K < E; ++K) {
2658 if (Used.test(K))
2659 continue;
2660 if (getOperand(K) == TE.getOperand(I)) {
2661 Used.set(K);
2662 break;
2663 }
2664 }
2665 // Check if we actually found the matching operand.
2666 if (PrevCount == Used.count())
2667 return false;
2668 }
2669 return true;
2670 }
2671
2672 /// \return Final vectorization factor for the node. Defined by the total
2673 /// number of vectorized scalars, including those used several times in the
2674 /// entry and counted in the \a ReuseShuffleIndices, if any.
2675 unsigned getVectorFactor() const {
2676 if (!ReuseShuffleIndices.empty())
2677 return ReuseShuffleIndices.size();
2678 return Scalars.size();
2679 };
2680
2681 /// A vector of scalars.
2682 ValueList Scalars;
2683
2684 /// The Scalars are vectorized into this value. It is initialized to Null.
2685 WeakTrackingVH VectorizedValue = nullptr;
2686
2687 /// New vector phi instructions emitted for the vectorized phi nodes.
2688 PHINode *PHI = nullptr;
2689
2690 /// Do we need to gather this sequence or vectorize it
2691 /// (either with vector instruction or with scatter/gather
2692 /// intrinsics for store/load)?
2693 enum EntryState {
2694 Vectorize,
2695 ScatterVectorize,
2696 StridedVectorize,
2697 NeedToGather
2698 };
2699 EntryState State;
2700
2701 /// Does this sequence require some shuffling?
2702 SmallVector<int, 4> ReuseShuffleIndices;
2703
2704 /// Does this entry require reordering?
2705 SmallVector<unsigned, 4> ReorderIndices;
2706
2707 /// Points back to the VectorizableTree.
2708 ///
2709 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2710 /// to be a pointer and needs to be able to initialize the child iterator.
2711 /// Thus we need a reference back to the container to translate the indices
2712 /// to entries.
2713 VecTreeTy &Container;
2714
2715 /// The TreeEntry index containing the user of this entry. We can actually
2716 /// have multiple users so the data structure is not truly a tree.
2717 SmallVector<EdgeInfo, 1> UserTreeIndices;
2718
2719 /// The index of this treeEntry in VectorizableTree.
2720 int Idx = -1;
2721
2722 private:
2723 /// The operands of each instruction in each lane Operands[op_index][lane].
2724 /// Note: This helps avoid the replication of the code that performs the
2725 /// reordering of operands during buildTree_rec() and vectorizeTree().
2727
2728 /// The main/alternate instruction.
2729 Instruction *MainOp = nullptr;
2730 Instruction *AltOp = nullptr;
2731
2732 public:
2733 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2734 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2735 if (Operands.size() < OpIdx + 1)
2736 Operands.resize(OpIdx + 1);
2737 assert(Operands[OpIdx].empty() && "Already resized?");
2738 assert(OpVL.size() <= Scalars.size() &&
2739 "Number of operands is greater than the number of scalars.");
2740 Operands[OpIdx].resize(OpVL.size());
2741 copy(OpVL, Operands[OpIdx].begin());
2742 }
2743
2744 /// Set the operands of this bundle in their original order.
2745 void setOperandsInOrder() {
2746 assert(Operands.empty() && "Already initialized?");
2747 auto *I0 = cast<Instruction>(Scalars[0]);
2748 Operands.resize(I0->getNumOperands());
2749 unsigned NumLanes = Scalars.size();
2750 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2751 OpIdx != NumOperands; ++OpIdx) {
2752 Operands[OpIdx].resize(NumLanes);
2753 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2754 auto *I = cast<Instruction>(Scalars[Lane]);
2755 assert(I->getNumOperands() == NumOperands &&
2756 "Expected same number of operands");
2757 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2758 }
2759 }
2760 }
2761
2762 /// Reorders operands of the node to the given mask \p Mask.
2763 void reorderOperands(ArrayRef<int> Mask) {
2764 for (ValueList &Operand : Operands)
2765 reorderScalars(Operand, Mask);
2766 }
2767
2768 /// \returns the \p OpIdx operand of this TreeEntry.
2769 ValueList &getOperand(unsigned OpIdx) {
2770 assert(OpIdx < Operands.size() && "Off bounds");
2771 return Operands[OpIdx];
2772 }
2773
2774 /// \returns the \p OpIdx operand of this TreeEntry.
2775 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2776 assert(OpIdx < Operands.size() && "Off bounds");
2777 return Operands[OpIdx];
2778 }
2779
2780 /// \returns the number of operands.
2781 unsigned getNumOperands() const { return Operands.size(); }
2782
2783 /// \return the single \p OpIdx operand.
2784 Value *getSingleOperand(unsigned OpIdx) const {
2785 assert(OpIdx < Operands.size() && "Off bounds");
2786 assert(!Operands[OpIdx].empty() && "No operand available");
2787 return Operands[OpIdx][0];
2788 }
2789
2790 /// Some of the instructions in the list have alternate opcodes.
2791 bool isAltShuffle() const { return MainOp != AltOp; }
2792
2793 bool isOpcodeOrAlt(Instruction *I) const {
2794 unsigned CheckedOpcode = I->getOpcode();
2795 return (getOpcode() == CheckedOpcode ||
2796 getAltOpcode() == CheckedOpcode);
2797 }
2798
2799 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2800 /// alternate) opcode as the main operation, the key is \p Op. Otherwise the
2801 /// key is the main operation.
2802 Value *isOneOf(Value *Op) const {
2803 auto *I = dyn_cast<Instruction>(Op);
2804 if (I && isOpcodeOrAlt(I))
2805 return Op;
2806 return MainOp;
2807 }
2808
2809 void setOperations(const InstructionsState &S) {
2810 MainOp = S.MainOp;
2811 AltOp = S.AltOp;
2812 }
2813
2814 Instruction *getMainOp() const {
2815 return MainOp;
2816 }
2817
2818 Instruction *getAltOp() const {
2819 return AltOp;
2820 }
2821
2822 /// The main/alternate opcodes for the list of instructions.
2823 unsigned getOpcode() const {
2824 return MainOp ? MainOp->getOpcode() : 0;
2825 }
2826
2827 unsigned getAltOpcode() const {
2828 return AltOp ? AltOp->getOpcode() : 0;
2829 }
2830
2831 /// When ReuseShuffleIndices is empty it just returns the position of \p
2832 /// V within the vector of Scalars. Otherwise, tries to remap it via its reuse index.
2833 int findLaneForValue(Value *V) const {
2834 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2835 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2836 if (!ReorderIndices.empty())
2837 FoundLane = ReorderIndices[FoundLane];
2838 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2839 if (!ReuseShuffleIndices.empty()) {
2840 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2841 find(ReuseShuffleIndices, FoundLane));
2842 }
2843 return FoundLane;
2844 }
2845
2846 /// Build a shuffle mask for graph entry which represents a merge of main
2847 /// and alternate operations.
2848 void
2849 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2850 SmallVectorImpl<int> &Mask,
2851 SmallVectorImpl<Value *> *OpScalars = nullptr,
2852 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2853
2854 /// Return true if this is a non-power-of-2 node.
2855 bool isNonPowOf2Vec() const {
2856 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
2857 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
2858 "Reshuffling not supported with non-power-of-2 vectors yet.");
2859 return IsNonPowerOf2;
2860 }
2861
2862#ifndef NDEBUG
2863 /// Debug printer.
2864 LLVM_DUMP_METHOD void dump() const {
2865 dbgs() << Idx << ".\n";
2866 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2867 dbgs() << "Operand " << OpI << ":\n";
2868 for (const Value *V : Operands[OpI])
2869 dbgs().indent(2) << *V << "\n";
2870 }
2871 dbgs() << "Scalars: \n";
2872 for (Value *V : Scalars)
2873 dbgs().indent(2) << *V << "\n";
2874 dbgs() << "State: ";
2875 switch (State) {
2876 case Vectorize:
2877 dbgs() << "Vectorize\n";
2878 break;
2879 case ScatterVectorize:
2880 dbgs() << "ScatterVectorize\n";
2881 break;
2882 case StridedVectorize:
2883 dbgs() << "StridedVectorize\n";
2884 break;
2885 case NeedToGather:
2886 dbgs() << "NeedToGather\n";
2887 break;
2888 }
2889 dbgs() << "MainOp: ";
2890 if (MainOp)
2891 dbgs() << *MainOp << "\n";
2892 else
2893 dbgs() << "NULL\n";
2894 dbgs() << "AltOp: ";
2895 if (AltOp)
2896 dbgs() << *AltOp << "\n";
2897 else
2898 dbgs() << "NULL\n";
2899 dbgs() << "VectorizedValue: ";
2900 if (VectorizedValue)
2901 dbgs() << *VectorizedValue << "\n";
2902 else
2903 dbgs() << "NULL\n";
2904 dbgs() << "ReuseShuffleIndices: ";
2905 if (ReuseShuffleIndices.empty())
2906 dbgs() << "Empty";
2907 else
2908 for (int ReuseIdx : ReuseShuffleIndices)
2909 dbgs() << ReuseIdx << ", ";
2910 dbgs() << "\n";
2911 dbgs() << "ReorderIndices: ";
2912 for (unsigned ReorderIdx : ReorderIndices)
2913 dbgs() << ReorderIdx << ", ";
2914 dbgs() << "\n";
2915 dbgs() << "UserTreeIndices: ";
2916 for (const auto &EInfo : UserTreeIndices)
2917 dbgs() << EInfo << ", ";
2918 dbgs() << "\n";
2919 }
2920#endif
2921 };
2922
2923#ifndef NDEBUG
2924 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2925 InstructionCost VecCost, InstructionCost ScalarCost,
2926 StringRef Banner) const {
2927 dbgs() << "SLP: " << Banner << ":\n";
2928 E->dump();
2929 dbgs() << "SLP: Costs:\n";
2930 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2931 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2932 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2933 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2934 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2935 }
2936#endif
2937
2938 /// Create a new VectorizableTree entry.
2939 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2940 std::optional<ScheduleData *> Bundle,
2941 const InstructionsState &S,
2942 const EdgeInfo &UserTreeIdx,
2943 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2944 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2945 TreeEntry::EntryState EntryState =
2946 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2947 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2948 ReuseShuffleIndices, ReorderIndices);
2949 }
2950
2951 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2952 TreeEntry::EntryState EntryState,
2953 std::optional<ScheduleData *> Bundle,
2954 const InstructionsState &S,
2955 const EdgeInfo &UserTreeIdx,
2956 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2957 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2958 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2959 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2960 "Need to vectorize gather entry?");
2961 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2962 TreeEntry *Last = VectorizableTree.back().get();
2963 Last->Idx = VectorizableTree.size() - 1;
2964 Last->State = EntryState;
2965 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2966 ReuseShuffleIndices.end());
2967 if (ReorderIndices.empty()) {
2968 Last->Scalars.assign(VL.begin(), VL.end());
2969 Last->setOperations(S);
2970 } else {
2971 // Reorder scalars and build final mask.
2972 Last->Scalars.assign(VL.size(), nullptr);
2973 transform(ReorderIndices, Last->Scalars.begin(),
2974 [VL](unsigned Idx) -> Value * {
2975 if (Idx >= VL.size())
2976 return UndefValue::get(VL.front()->getType());
2977 return VL[Idx];
2978 });
2979 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
2980 Last->setOperations(S);
2981 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2982 }
2983 if (Last->State != TreeEntry::NeedToGather) {
2984 for (Value *V : VL) {
2985 const TreeEntry *TE = getTreeEntry(V);
2986 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
2987 "Scalar already in tree!");
2988 if (TE) {
2989 if (TE != Last)
2990 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
2991 continue;
2992 }
2993 ScalarToTreeEntry[V] = Last;
2994 }
2995 // Update the scheduler bundle to point to this TreeEntry.
2996 ScheduleData *BundleMember = *Bundle;
2997 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2998 isVectorLikeInstWithConstOps(S.MainOp) ||
2999 doesNotNeedToSchedule(VL)) &&
3000 "Bundle and VL out of sync");
3001 if (BundleMember) {
3002 for (Value *V : VL) {
3003 if (doesNotNeedToBeScheduled(V))
3004 continue;
3005 if (!BundleMember)
3006 continue;
3007 BundleMember->TE = Last;
3008 BundleMember = BundleMember->NextInBundle;
3009 }
3010 }
3011 assert(!BundleMember && "Bundle and VL out of sync");
3012 } else {
3013 // Build a map for gathered scalars to the nodes where they are used.
3014 bool AllConstsOrCasts = true;
3015 for (Value *V : VL)
3016 if (!isConstant(V)) {
3017 auto *I = dyn_cast<CastInst>(V);
3018 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3019 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3020 }
3021 if (AllConstsOrCasts)
3022 CastMaxMinBWSizes =
3023 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3024 MustGather.insert(VL.begin(), VL.end());
3025 }
3026
3027 if (UserTreeIdx.UserTE) {
3028 Last->UserTreeIndices.push_back(UserTreeIdx);
3029 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3030 "Reordering isn't implemented for non-power-of-2 nodes yet");
3031 }
3032 return Last;
3033 }
3034
3035 /// -- Vectorization State --
3036 /// Holds all of the tree entries.
3037 TreeEntry::VecTreeTy VectorizableTree;
3038
3039#ifndef NDEBUG
3040 /// Debug printer.
3041 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3042 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3043 VectorizableTree[Id]->dump();
3044 dbgs() << "\n";
3045 }
3046 }
3047#endif
3048
3049 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3050
3051 const TreeEntry *getTreeEntry(Value *V) const {
3052 return ScalarToTreeEntry.lookup(V);
3053 }
3054
3055 /// Checks that the operand node of an alternate node does not generate a
3056 /// buildvector sequence. If it does, it is likely not worth building an
3057 /// alternate shuffle when the number of buildvector operands plus the
3058 /// alternate instruction exceeds the number of buildvector instructions.
3059 /// \param S the instructions state of the analyzed values.
3060 /// \param VL list of the instructions with alternate opcodes.
3061 bool areAltOperandsProfitable(const InstructionsState &S,
3062 ArrayRef<Value *> VL) const;
3063
3064 /// Checks if the specified list of the instructions/values can be vectorized
3065 /// and fills required data before actual scheduling of the instructions.
3066 TreeEntry::EntryState getScalarsVectorizationState(
3067 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3068 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3069
3070 /// Maps a specific scalar to its tree entry.
3071 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3072
3073 /// Maps scalars that are used in several vectorized nodes to the list of
3074 /// those nodes.
3075 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3076
3077 /// Maps a value to the proposed vectorizable size.
3078 SmallDenseMap<Value *, unsigned> InstrElementSize;
3079
3080 /// A list of scalars that we found that we need to keep as scalars.
3081 ValueSet MustGather;
3082
3083 /// A set of first non-schedulable values.
3084 ValueSet NonScheduledFirst;
3085
3086 /// A map between the vectorized entries and the last instructions in the
3087 /// bundles. The bundles are built in use order, not in the def order of the
3088 /// instructions, so we cannot rely on the last instruction in the bundle also
3089 /// being the last instruction in program order during vectorization; the
3090 /// basic blocks are modified in the process, so these instructions need to be
3091 /// pre-gathered beforehand.
3092 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3093
3094 /// List of gather nodes, depending on other gather/vector nodes, which should
3095 /// be emitted after the vector instruction emission process to correctly
3096 /// handle order of the vector instructions and shuffles.
3097 SetVector<const TreeEntry *> PostponedGathers;
3098
3099 using ValueToGatherNodesMap =
3100 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3101 ValueToGatherNodesMap ValueToGatherNodes;
3102
3103 /// This POD struct describes one external user in the vectorized tree.
3104 struct ExternalUser {
3105 ExternalUser(Value *S, llvm::User *U, int L)
3106 : Scalar(S), User(U), Lane(L) {}
3107
3108 // Which scalar in our function.
3109 Value *Scalar;
3110
3111 // Which user uses the scalar.
3112 llvm::User *User;
3113
3114 // Which lane does the scalar belong to.
3115 int Lane;
3116 };
3117 using UserList = SmallVector<ExternalUser, 16>;
3118
3119 /// Checks if two instructions may access the same memory.
3120 ///
3121 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3122 /// is invariant in the calling loop.
3123 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3124 Instruction *Inst2) {
3125 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3126 return true;
3127 // First check if the result is already in the cache.
3128 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3129 auto It = AliasCache.find(Key);
3130 if (It != AliasCache.end())
3131 return It->second;
3132 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3133 // Store the result in the cache.
3134 AliasCache.try_emplace(Key, Aliased);
3135 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3136 return Aliased;
3137 }
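// Note on the caching above: the result is stored under both key orderings,
// (Inst1, Inst2) and (Inst2, Inst1), so a later query with the operands
// swapped is answered from AliasCache without a second BatchAA lookup. A
// hypothetical usage sketch (names are illustrative only):
//   isAliased(MemoryLocation::get(St), St, Ld); // queries BatchAA once
//   isAliased(MemoryLocation::get(Ld), Ld, St); // hits the cache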
3138
3139 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3140
3141 /// Cache for alias results.
3142 /// TODO: consider moving this to the AliasAnalysis itself.
3143 DenseMap<AliasCacheKey, bool> AliasCache;
3144
3145 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3146 // globally through SLP because we don't perform any action which
3147 // invalidates capture results.
3148 BatchAAResults BatchAA;
3149
3150 /// Temporary store for deleted instructions. Instructions will be deleted
3151 /// eventually when the BoUpSLP is destructed. The deferral is required to
3152 /// ensure that there are no incorrect collisions in the AliasCache, which
3153 /// can happen if a new instruction is allocated at the same address as a
3154 /// previously deleted instruction.
3155 DenseSet<Instruction *> DeletedInstructions;
3156
3157 /// Set of the instructions already analyzed for reductions.
3158 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3159
3160 /// Set of hashes for the list of reduction values already being analyzed.
3161 DenseSet<size_t> AnalyzedReductionVals;
3162
3163 /// Values that have already been analyzed for minimal bitwidth and found to
3164 /// be non-profitable.
3165 DenseSet<Value *> AnalyzedMinBWVals;
3166
3167 /// A list of values that need to be extracted out of the tree.
3168 /// This list holds pairs of (Internal Scalar : External User). External User
3169 /// can be nullptr, it means that this Internal Scalar will be used later,
3170 /// after vectorization.
3171 UserList ExternalUses;
3172
3173 /// A list of GEPs which can be replaced by scalar GEPs instead of
3174 /// extractelement instructions.
3175 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3176
3177 /// Values used only by @llvm.assume calls.
3178 SmallPtrSet<const Value *, 32> EphValues;
3179
3180 /// Holds all of the instructions that we gathered, shuffle instructions and
3181 /// extractelements.
3182 SetVector<Instruction *> GatherShuffleExtractSeq;
3183
3184 /// A list of blocks that we are going to CSE.
3185 DenseSet<BasicBlock *> CSEBlocks;
3186
3187 /// Contains all scheduling relevant data for an instruction.
3188 /// A ScheduleData either represents a single instruction or a member of an
3189 /// instruction bundle (= a group of instructions which is combined into a
3190 /// vector instruction).
3191 struct ScheduleData {
3192 // The initial value for the dependency counters. It means that the
3193 // dependencies are not calculated yet.
3194 enum { InvalidDeps = -1 };
3195
3196 ScheduleData() = default;
3197
3198 void init(int BlockSchedulingRegionID, Value *OpVal) {
3199 FirstInBundle = this;
3200 NextInBundle = nullptr;
3201 NextLoadStore = nullptr;
3202 IsScheduled = false;
3203 SchedulingRegionID = BlockSchedulingRegionID;
3204 clearDependencies();
3205 OpValue = OpVal;
3206 TE = nullptr;
3207 }
3208
3209 /// Verify basic self consistency properties
3210 void verify() {
3211 if (hasValidDependencies()) {
3212 assert(UnscheduledDeps <= Dependencies && "invariant");
3213 } else {
3214 assert(UnscheduledDeps == Dependencies && "invariant");
3215 }
3216
3217 if (IsScheduled) {
3218 assert(isSchedulingEntity() &&
3219 "unexpected scheduled state");
3220 for (const ScheduleData *BundleMember = this; BundleMember;
3221 BundleMember = BundleMember->NextInBundle) {
3222 assert(BundleMember->hasValidDependencies() &&
3223 BundleMember->UnscheduledDeps == 0 &&
3224 "unexpected scheduled state");
3225 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3226 "only bundle is marked scheduled");
3227 }
3228 }
3229
3230 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3231 "all bundle members must be in same basic block");
3232 }
3233
3234 /// Returns true if the dependency information has been calculated.
3235 /// Note that dependency validity can vary between instructions within
3236 /// a single bundle.
3237 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3238
3239 /// Returns true for single instructions and for bundle representatives
3240 /// (= the head of a bundle).
3241 bool isSchedulingEntity() const { return FirstInBundle == this; }
3242
3243 /// Returns true if it represents an instruction bundle and not only a
3244 /// single instruction.
3245 bool isPartOfBundle() const {
3246 return NextInBundle != nullptr || FirstInBundle != this || TE;
3247 }
3248
3249 /// Returns true if it is ready for scheduling, i.e. it has no more
3250 /// unscheduled dependent instructions/bundles.
3251 bool isReady() const {
3252 assert(isSchedulingEntity() &&
3253 "can't consider non-scheduling entity for ready list");
3254 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3255 }
3256
3257 /// Modifies the number of unscheduled dependencies for this instruction,
3258 /// and returns the number of remaining dependencies for the containing
3259 /// bundle.
3260 int incrementUnscheduledDeps(int Incr) {
3261 assert(hasValidDependencies() &&
3262 "increment of unscheduled deps would be meaningless");
3263 UnscheduledDeps += Incr;
3264 return FirstInBundle->unscheduledDepsInBundle();
3265 }
3266
3267 /// Sets the number of unscheduled dependencies to the number of
3268 /// dependencies.
3269 void resetUnscheduledDeps() {
3270 UnscheduledDeps = Dependencies;
3271 }
3272
3273 /// Clears all dependency information.
3274 void clearDependencies() {
3275 Dependencies = InvalidDeps;
3276 resetUnscheduledDeps();
3277 MemoryDependencies.clear();
3278 ControlDependencies.clear();
3279 }
3280
3281 int unscheduledDepsInBundle() const {
3282 assert(isSchedulingEntity() && "only meaningful on the bundle");
3283 int Sum = 0;
3284 for (const ScheduleData *BundleMember = this; BundleMember;
3285 BundleMember = BundleMember->NextInBundle) {
3286 if (BundleMember->UnscheduledDeps == InvalidDeps)
3287 return InvalidDeps;
3288 Sum += BundleMember->UnscheduledDeps;
3289 }
3290 return Sum;
3291 }
3292
3293 void dump(raw_ostream &os) const {
3294 if (!isSchedulingEntity()) {
3295 os << "/ " << *Inst;
3296 } else if (NextInBundle) {
3297 os << '[' << *Inst;
3298 ScheduleData *SD = NextInBundle;
3299 while (SD) {
3300 os << ';' << *SD->Inst;
3301 SD = SD->NextInBundle;
3302 }
3303 os << ']';
3304 } else {
3305 os << *Inst;
3306 }
3307 }
3308
3309 Instruction *Inst = nullptr;
3310
3311 /// Opcode of the current instruction in the schedule data.
3312 Value *OpValue = nullptr;
3313
3314 /// The TreeEntry that this instruction corresponds to.
3315 TreeEntry *TE = nullptr;
3316
3317 /// Points to the head in an instruction bundle (and always to this for
3318 /// single instructions).
3319 ScheduleData *FirstInBundle = nullptr;
3320
3321 /// Singly linked list of all instructions in a bundle. Null if it is a
3322 /// single instruction.
3323 ScheduleData *NextInBundle = nullptr;
3324
3325 /// Singly linked list of all memory instructions (e.g. load, store, call)
3326 /// in the block - until the end of the scheduling region.
3327 ScheduleData *NextLoadStore = nullptr;
3328
3329 /// The dependent memory instructions.
3330 /// This list is derived on demand in calculateDependencies().
3331 SmallVector<ScheduleData *, 4> MemoryDependencies;
3332
3333 /// List of instructions which this instruction could be control dependent
3334 /// on. Allowing such nodes to be scheduled below this one could introduce
3335 /// a runtime fault which didn't exist in the original program.
3336 /// ex: this is a load or udiv following a readonly call which inf loops
3337 SmallVector<ScheduleData *, 4> ControlDependencies;
3338
3339 /// This ScheduleData is in the current scheduling region if this matches
3340 /// the current SchedulingRegionID of BlockScheduling.
3341 int SchedulingRegionID = 0;
3342
3343 /// Used for getting a "good" final ordering of instructions.
3344 int SchedulingPriority = 0;
3345
3346 /// The number of dependencies. Consists of the number of users of the
3347 /// instruction plus the number of dependent memory instructions (if any).
3348 /// This value is calculated on demand.
3349 /// If InvalidDeps, the number of dependencies is not calculated yet.
3350 int Dependencies = InvalidDeps;
3351
3352 /// The number of dependencies minus the number of dependencies of scheduled
3353 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3354 /// for scheduling.
3355 /// Note that this is negative as long as Dependencies is not calculated.
3356 int UnscheduledDeps = InvalidDeps;
3357
3358 /// True if this instruction is scheduled (or considered as scheduled in the
3359 /// dry-run).
3360 bool IsScheduled = false;
3361 };
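// Illustrative walk-through of the counters above (hypothetical numbers): an
// instruction with two in-region users and one dependent memory instruction
// ends up with Dependencies = UnscheduledDeps = 3 once its dependencies are
// calculated. Scheduling each of those dependents calls
// incrementUnscheduledDeps(-1) on this ScheduleData, and the containing
// bundle reports isReady() as soon as unscheduledDepsInBundle() reaches 0.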
3362
3363#ifndef NDEBUG
3364 friend inline raw_ostream &operator<<(raw_ostream &os,
3365 const BoUpSLP::ScheduleData &SD) {
3366 SD.dump(os);
3367 return os;
3368 }
3369#endif
3370
3371 friend struct GraphTraits<BoUpSLP *>;
3372 friend struct DOTGraphTraits<BoUpSLP *>;
3373
3374 /// Contains all scheduling data for a basic block.
3375 /// It does not schedule instructions that are not memory read/write
3376 /// instructions and whose operands are either constants, arguments, phis, or
3377 /// instructions from other blocks, or whose users are phis or live in other
3378 /// blocks. The resulting vector instructions can be placed at the beginning
3379 /// of the basic block without scheduling (if the operands do not need to be
3380 /// scheduled) or at the end of the block (if the users are outside of the
3381 /// block). This saves some compile time and memory used by the compiler.
3382 ///
3383 /// ScheduleData is assigned for each instruction in between the boundaries of
3384 /// the tree entry, even for those which are not part of the graph. It is
3385 /// required to correctly follow the dependencies between the instructions and
3386 /// to schedule them correctly. ScheduleData is not allocated for instructions
3387 /// which do not require scheduling, like phis, nodes with only
3388 /// extractelements/insertelements, or nodes whose instructions have
3389 /// uses/operands outside of the block.
3390 struct BlockScheduling {
3391 BlockScheduling(BasicBlock *BB)
3392 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3393
3394 void clear() {
3395 ReadyInsts.clear();
3396 ScheduleStart = nullptr;
3397 ScheduleEnd = nullptr;
3398 FirstLoadStoreInRegion = nullptr;
3399 LastLoadStoreInRegion = nullptr;
3400 RegionHasStackSave = false;
3401
3402 // Reduce the maximum schedule region size by the size of the
3403 // previous scheduling run.
3404 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3405 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3406 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3407 ScheduleRegionSize = 0;
3408
3409 // Make a new scheduling region, i.e. all existing ScheduleData is not
3410 // in the new region yet.
3411 ++SchedulingRegionID;
3412 }
3413
3414 ScheduleData *getScheduleData(Instruction *I) {
3415 if (BB != I->getParent())
3416 // Avoid lookup if can't possibly be in map.
3417 return nullptr;
3418 ScheduleData *SD = ScheduleDataMap.lookup(I);
3419 if (SD && isInSchedulingRegion(SD))
3420 return SD;
3421 return nullptr;
3422 }
3423
3424 ScheduleData *getScheduleData(Value *V) {
3425 if (auto *I = dyn_cast<Instruction>(V))
3426 return getScheduleData(I);
3427 return nullptr;
3428 }
3429
3430 ScheduleData *getScheduleData(Value *V, Value *Key) {
3431 if (V == Key)
3432 return getScheduleData(V);
3433 auto I = ExtraScheduleDataMap.find(V);
3434 if (I != ExtraScheduleDataMap.end()) {
3435 ScheduleData *SD = I->second.lookup(Key);
3436 if (SD && isInSchedulingRegion(SD))
3437 return SD;
3438 }
3439 return nullptr;
3440 }
3441
3442 bool isInSchedulingRegion(ScheduleData *SD) const {
3443 return SD->SchedulingRegionID == SchedulingRegionID;
3444 }
3445
3446 /// Marks an instruction as scheduled and puts all dependent ready
3447 /// instructions into the ready-list.
3448 template <typename ReadyListType>
3449 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3450 SD->IsScheduled = true;
3451 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3452
3453 for (ScheduleData *BundleMember = SD; BundleMember;
3454 BundleMember = BundleMember->NextInBundle) {
3455 if (BundleMember->Inst != BundleMember->OpValue)
3456 continue;
3457
3458 // Handle the def-use chain dependencies.
3459
3460 // Decrement the unscheduled counter and insert to ready list if ready.
3461 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3462 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3463 if (OpDef && OpDef->hasValidDependencies() &&
3464 OpDef->incrementUnscheduledDeps(-1) == 0) {
3465 // There are no more unscheduled dependencies after
3466 // decrementing, so we can put the dependent instruction
3467 // into the ready list.
3468 ScheduleData *DepBundle = OpDef->FirstInBundle;
3469 assert(!DepBundle->IsScheduled &&
3470 "already scheduled bundle gets ready");
3471 ReadyList.insert(DepBundle);
3472 LLVM_DEBUG(dbgs()
3473 << "SLP: gets ready (def): " << *DepBundle << "\n");
3474 }
3475 });
3476 };
3477
3478 // If BundleMember is a vector bundle, its operands may have been
3479 // reordered during buildTree(). We therefore need to get its operands
3480 // through the TreeEntry.
3481 if (TreeEntry *TE = BundleMember->TE) {
3482 // Need to search for the lane since the tree entry can be reordered.
3483 int Lane = std::distance(TE->Scalars.begin(),
3484 find(TE->Scalars, BundleMember->Inst));
3485 assert(Lane >= 0 && "Lane not set");
3486
3487 // Since vectorization tree is being built recursively this assertion
3488 // ensures that the tree entry has all operands set before reaching
3489 // this code. Couple of exceptions known at the moment are extracts
3490 // where their second (immediate) operand is not added. Since
3491 // immediates do not affect scheduler behavior this is considered
3492 // okay.
3493 auto *In = BundleMember->Inst;
3494 assert(
3495 In &&
3496 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3497 In->getNumOperands() == TE->getNumOperands()) &&
3498 "Missed TreeEntry operands?");
3499 (void)In; // fake use to avoid build failure when assertions disabled
3500
3501 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3502 OpIdx != NumOperands; ++OpIdx)
3503 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3504 DecrUnsched(I);
3505 } else {
3506 // If BundleMember is a stand-alone instruction, no operand reordering
3507 // has taken place, so we directly access its operands.
3508 for (Use &U : BundleMember->Inst->operands())
3509 if (auto *I = dyn_cast<Instruction>(U.get()))
3510 DecrUnsched(I);
3511 }
3512 // Handle the memory dependencies.
3513 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3514 if (MemoryDepSD->hasValidDependencies() &&
3515 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3516 // There are no more unscheduled dependencies after decrementing,
3517 // so we can put the dependent instruction into the ready list.
3518 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3519 assert(!DepBundle->IsScheduled &&
3520 "already scheduled bundle gets ready");
3521 ReadyList.insert(DepBundle);
3522 LLVM_DEBUG(dbgs()
3523 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3524 }
3525 }
3526 // Handle the control dependencies.
3527 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3528 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3529 // There are no more unscheduled dependencies after decrementing,
3530 // so we can put the dependent instruction into the ready list.
3531 ScheduleData *DepBundle = DepSD->FirstInBundle;
3532 assert(!DepBundle->IsScheduled &&
3533 "already scheduled bundle gets ready");
3534 ReadyList.insert(DepBundle);
3535 LLVM_DEBUG(dbgs()
3536 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3537 }
3538 }
3539 }
3540 }
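// Illustration of the propagation above (hypothetical bundle): when a
// two-lane add bundle [%a0; %a1] is scheduled, every load feeding either
// lane has its UnscheduledDeps decremented through DecrUnsched(); a load for
// which this bundle was the last unscheduled dependent has its FirstInBundle
// inserted into ReadyList, and the same happens for bundles that become
// ready through the memory or control dependency loops.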
3541
3542 /// Verify basic self consistency properties of the data structure.
3543 void verify() {
3544 if (!ScheduleStart)
3545 return;
3546
3547 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3548 ScheduleStart->comesBefore(ScheduleEnd) &&
3549 "Not a valid scheduling region?");
3550
3551 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3552 auto *SD = getScheduleData(I);
3553 if (!SD)
3554 continue;
3555 assert(isInSchedulingRegion(SD) &&
3556 "primary schedule data not in window?");
3557 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3558 "entire bundle in window!");
3559 (void)SD;
3560 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3561 }
3562
3563 for (auto *SD : ReadyInsts) {
3564 assert(SD->isSchedulingEntity() && SD->isReady() &&
3565 "item in ready list not ready?");
3566 (void)SD;
3567 }
3568 }
3569
3570 void doForAllOpcodes(Value *V,
3571 function_ref<void(ScheduleData *SD)> Action) {
3572 if (ScheduleData *SD = getScheduleData(V))
3573 Action(SD);
3574 auto I = ExtraScheduleDataMap.find(V);
3575 if (I != ExtraScheduleDataMap.end())
3576 for (auto &P : I->second)
3577 if (isInSchedulingRegion(P.second))
3578 Action(P.second);
3579 }
3580
3581 /// Put all instructions into the ReadyList which are ready for scheduling.
3582 template <typename ReadyListType>
3583 void initialFillReadyList(ReadyListType &ReadyList) {
3584 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3585 doForAllOpcodes(I, [&](ScheduleData *SD) {
3586 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3587 SD->isReady()) {
3588 ReadyList.insert(SD);
3589 LLVM_DEBUG(dbgs()
3590 << "SLP: initially in ready list: " << *SD << "\n");
3591 }
3592 });
3593 }
3594 }
3595
3596 /// Build a bundle from the ScheduleData nodes corresponding to the
3597 /// scalar instruction for each lane.
3598 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3599
3600 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3601 /// cyclic dependencies. This is only a dry-run, no instructions are
3602 /// actually moved at this stage.
3603 /// \returns the scheduling bundle. The returned Optional value is not
3604 /// std::nullopt if \p VL is allowed to be scheduled.
3605 std::optional<ScheduleData *>
3606 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3607 const InstructionsState &S);
3608
3609 /// Un-bundles a group of instructions.
3610 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3611
3612 /// Allocates schedule data chunk.
3613 ScheduleData *allocateScheduleDataChunks();
3614
3615 /// Extends the scheduling region so that V is inside the region.
3616 /// \returns true if the region size is within the limit.
3617 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3618
3619 /// Initialize the ScheduleData structures for new instructions in the
3620 /// scheduling region.
3621 void initScheduleData(Instruction *FromI, Instruction *ToI,
3622 ScheduleData *PrevLoadStore,
3623 ScheduleData *NextLoadStore);
3624
3625 /// Updates the dependency information of a bundle and of all instructions/
3626 /// bundles which depend on the original bundle.
3627 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3628 BoUpSLP *SLP);
3629
3630 /// Sets all instructions in the scheduling region to un-scheduled.
3631 void resetSchedule();
3632
3633 BasicBlock *BB;
3634
3635 /// Simple memory allocation for ScheduleData.
3636 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3637
3638 /// The size of a ScheduleData array in ScheduleDataChunks.
3639 int ChunkSize;
3640
3641 /// The allocator position in the current chunk, which is the last entry
3642 /// of ScheduleDataChunks.
3643 int ChunkPos;
3644
3645 /// Attaches ScheduleData to Instruction.
3646 /// Note that the mapping survives during all vectorization iterations, i.e.
3647 /// ScheduleData structures are recycled.
3648 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3649
3650 /// Attaches ScheduleData to Instruction with the leading key.
3651 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3652 ExtraScheduleDataMap;
3653
3654 /// The ready-list for scheduling (only used for the dry-run).
3655 SetVector<ScheduleData *> ReadyInsts;
3656
3657 /// The first instruction of the scheduling region.
3658 Instruction *ScheduleStart = nullptr;
3659
3660 /// The first instruction _after_ the scheduling region.
3661 Instruction *ScheduleEnd = nullptr;
3662
3663 /// The first memory accessing instruction in the scheduling region
3664 /// (can be null).
3665 ScheduleData *FirstLoadStoreInRegion = nullptr;
3666
3667 /// The last memory accessing instruction in the scheduling region
3668 /// (can be null).
3669 ScheduleData *LastLoadStoreInRegion = nullptr;
3670
3671 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3672 /// region? Used to optimize the dependence calculation for the
3673 /// common case where there isn't.
3674 bool RegionHasStackSave = false;
3675
3676 /// The current size of the scheduling region.
3677 int ScheduleRegionSize = 0;
3678
3679 /// The maximum size allowed for the scheduling region.
3680 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3681
3682 /// The ID of the scheduling region. For a new vectorization iteration this
3683 /// is incremented which "removes" all ScheduleData from the region.
3684 /// Make sure that the initial SchedulingRegionID is greater than the
3685 /// initial SchedulingRegionID in ScheduleData (which is 0).
3686 int SchedulingRegionID = 1;
3687 };
3688
3689 /// Attaches the BlockScheduling structures to basic blocks.
3690 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3691
3692 /// Performs the "real" scheduling. Done before vectorization is actually
3693 /// performed in a basic block.
3694 void scheduleBlock(BlockScheduling *BS);
3695
3696 /// List of users to ignore during scheduling and that don't need extracting.
3697 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3698
3699 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3700 /// sorted SmallVectors of unsigned.
3701 struct OrdersTypeDenseMapInfo {
3702 static OrdersType getEmptyKey() {
3703 OrdersType V;
3704 V.push_back(~1U);
3705 return V;
3706 }
3707
3708 static OrdersType getTombstoneKey() {
3709 OrdersType V;
3710 V.push_back(~2U);
3711 return V;
3712 }
3713
3714 static unsigned getHashValue(const OrdersType &V) {
3715 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3716 }
3717
3718 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3719 return LHS == RHS;
3720 }
3721 };
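// This trait only exists so that an OrdersType (a sorted SmallVector of
// unsigned) can serve as a hash-map key; ~1U and ~2U act as the empty and
// tombstone markers. A hypothetical use, e.g. counting how often each
// candidate order occurs:
//   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrdersUses;
//   ++OrdersUses[CurrentOrder];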
3722
3723 // Analysis and block reference.
3724 Function *F;
3725 ScalarEvolution *SE;
3726 TargetTransformInfo *TTI;
3727 TargetLibraryInfo *TLI;
3728 LoopInfo *LI;
3729 DominatorTree *DT;
3730 AssumptionCache *AC;
3731 DemandedBits *DB;
3732 const DataLayout *DL;
3733 OptimizationRemarkEmitter *ORE;
3734
3735 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3736 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3737
3738 /// Instruction builder to construct the vectorized tree.
3740
3741 /// A map of scalar integer values to the smallest bit width with which they
3742 /// can legally be represented. The values map to (width, signed) pairs,
3743 /// where "width" indicates the minimum bit width and "signed" is True if the
3744 /// value must be signed-extended, rather than zero-extended, back to its
3745 /// original width.
3747
3748 /// Final size of the reduced vector, if the current graph represents the
3749 /// input for the reduction and it was possible to narrow the size of the
3750 /// reduction.
3751 unsigned ReductionBitWidth = 0;
3752
3753 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
3754 /// type sizes, used in the tree.
3755 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3756
3757 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
3758 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
3759 DenseSet<unsigned> ExtraBitWidthNodes;
3760};
3761
3762} // end namespace slpvectorizer
3763
3764template <> struct GraphTraits<BoUpSLP *> {
3765 using TreeEntry = BoUpSLP::TreeEntry;
3766
3767 /// NodeRef has to be a pointer per the GraphWriter.
3768 using NodeRef = TreeEntry *;
3769
3770 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3771
3772 /// Add the VectorizableTree to the index iterator to be able to return
3773 /// TreeEntry pointers.
3774 struct ChildIteratorType
3775 : public iterator_adaptor_base<
3776 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3777 ContainerTy &VectorizableTree;
3778
3779 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3780 ContainerTy &VT)
3781 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3782
3783 NodeRef operator*() { return I->UserTE; }
3784 };
3785
3786 static NodeRef getEntryNode(BoUpSLP &R) {
3787 return R.VectorizableTree[0].get();
3788 }
3789
3790 static ChildIteratorType child_begin(NodeRef N) {
3791 return {N->UserTreeIndices.begin(), N->Container};
3792 }
3793
3794 static ChildIteratorType child_end(NodeRef N) {
3795 return {N->UserTreeIndices.end(), N->Container};
3796 }
3797
3798 /// For the node iterator we just need to turn the TreeEntry iterator into a
3799 /// TreeEntry* iterator so that it dereferences to NodeRef.
3800 class nodes_iterator {
3801 using ItTy = ContainerTy::iterator;
3802 ItTy It;
3803
3804 public:
3805 nodes_iterator(const ItTy &It2) : It(It2) {}
3806 NodeRef operator*() { return It->get(); }
3807 nodes_iterator operator++() {
3808 ++It;
3809 return *this;
3810 }
3811 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3812 };
3813
3814 static nodes_iterator nodes_begin(BoUpSLP *R) {
3815 return nodes_iterator(R->VectorizableTree.begin());
3816 }
3817
3818 static nodes_iterator nodes_end(BoUpSLP *R) {
3819 return nodes_iterator(R->VectorizableTree.end());
3820 }
3821
3822 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3823};
3824
3825template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3826 using TreeEntry = BoUpSLP::TreeEntry;
3827
3828 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3829
3830 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3831 std::string Str;
3832 raw_string_ostream OS(Str);
3833 OS << Entry->Idx << ".\n";
3834 if (isSplat(Entry->Scalars))
3835 OS << "<splat> ";
3836 for (auto *V : Entry->Scalars) {
3837 OS << *V;
3838 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3839 return EU.Scalar == V;
3840 }))
3841 OS << " <extract>";
3842 OS << "\n";
3843 }
3844 return Str;
3845 }
3846
3847 static std::string getNodeAttributes(const TreeEntry *Entry,
3848 const BoUpSLP *) {
3849 if (Entry->State == TreeEntry::NeedToGather)
3850 return "color=red";
3851 if (Entry->State == TreeEntry::ScatterVectorize ||
3852 Entry->State == TreeEntry::StridedVectorize)
3853 return "color=blue";
3854 return "";
3855 }
3856};
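// With these GraphTraits/DOTGraphTraits specializations the SLP tree can be
// rendered through LLVM's generic GraphWriter machinery, for example
// (illustrative call, assuming Tree is a fully built BoUpSLP):
//   ViewGraph(&Tree, "slp-tree");
// Gather nodes are drawn in red and scatter/strided nodes in blue, per
// getNodeAttributes() above.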
3857
3858} // end namespace llvm
3859
3860 BoUpSLP::~BoUpSLP() {
3861 SmallVector<WeakTrackingVH> DeadInsts;
3862 for (auto *I : DeletedInstructions) {
3863 for (Use &U : I->operands()) {
3864 auto *Op = dyn_cast<Instruction>(U.get());
3865 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3866 wouldInstructionBeTriviallyDead(Op, TLI))
3867 DeadInsts.emplace_back(Op);
3868 }
3869 I->dropAllReferences();
3870 }
3871 for (auto *I : DeletedInstructions) {
3872 assert(I->use_empty() &&
3873 "trying to erase instruction with users.");
3874 I->eraseFromParent();
3875 }
3876
3877 // Cleanup any dead scalar code feeding the vectorized instructions.
3878 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3879
3880#ifdef EXPENSIVE_CHECKS
3881 // If we could guarantee that this call is not extremely slow, we could
3882 // remove the ifdef limitation (see PR47712).
3883 assert(!verifyFunction(*F, &dbgs()));
3884#endif
3885}
3886
3887/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3888/// contains the original mask for the scalars reused in the node. The
3889/// procedure transforms this mask in accordance with the given \p Mask.
3890static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3891 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3892 "Expected non-empty mask.");
3893 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3894 Prev.swap(Reuses);
3895 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3896 if (Mask[I] != PoisonMaskElem)
3897 Reuses[Mask[I]] = Prev[I];
3898}
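// Worked example (hypothetical values): with Reuses = {1, 0, 3, 2} and
// Mask = {2, 3, 0, 1}, element I of the old mask moves to position Mask[I],
// giving Reuses = {3, 2, 1, 0}; positions whose Mask entry is PoisonMaskElem
// keep their previous value.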
3899
3900/// Reorders the given \p Order according to the given \p Mask. \p Order is
3901/// the original order of the scalars. The procedure transforms the provided
3902/// order in accordance with the given \p Mask. If the resulting \p Order is
3903/// just an identity order, \p Order is cleared.
3904static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
3905 bool BottomOrder = false) {
3906 assert(!Mask.empty() && "Expected non-empty mask.");
3907 unsigned Sz = Mask.size();
3908 if (BottomOrder) {
3909 SmallVector<unsigned> PrevOrder;
3910 if (Order.empty()) {
3911 PrevOrder.resize(Sz);
3912 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
3913 } else {
3914 PrevOrder.swap(Order);
3915 }
3916 Order.assign(Sz, Sz);
3917 for (unsigned I = 0; I < Sz; ++I)
3918 if (Mask[I] != PoisonMaskElem)
3919 Order[I] = PrevOrder[Mask[I]];
3920 if (all_of(enumerate(Order), [&](const auto &Data) {
3921 return Data.value() == Sz || Data.index() == Data.value();
3922 })) {
3923 Order.clear();
3924 return;
3925 }
3926 fixupOrderingIndices(Order);
3927 return;
3928 }
3929 SmallVector<int> MaskOrder;
3930 if (Order.empty()) {
3931 MaskOrder.resize(Sz);
3932 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3933 } else {
3934 inversePermutation(Order, MaskOrder);
3935 }
3936 reorderReuses(MaskOrder, Mask);
3937 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
3938 Order.clear();
3939 return;
3940 }
3941 Order.assign(Sz, Sz);
3942 for (unsigned I = 0; I < Sz; ++I)
3943 if (MaskOrder[I] != PoisonMaskElem)
3944 Order[MaskOrder[I]] = I;
3945 fixupOrderingIndices(Order);
3946}
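// Worked example (hypothetical values, BottomOrder == false): with an empty
// (identity) Order and Mask = {1, 0, 3, 2}, MaskOrder becomes {1, 0, 3, 2},
// which is not an identity mask, so the final Order is {1, 0, 3, 2}. With an
// identity Mask the result would be an identity order and Order is cleared.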
3947
3948std::optional<BoUpSLP::OrdersType>
3949BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3950 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3951 // Try to find subvector extract/insert patterns and reorder only such
3952 // patterns.
3953 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
3954 Type *ScalarTy = GatheredScalars.front()->getType();
3955 int NumScalars = GatheredScalars.size();
3956 if (!isValidElementType(ScalarTy))
3957 return std::nullopt;
3958 auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
3959 int NumParts = TTI->getNumberOfParts(VecTy);
3960 if (NumParts == 0 || NumParts >= NumScalars)
3961 NumParts = 1;
3962 SmallVector<int> ExtractMask;
3963 SmallVector<int> Mask;
3964 SmallVector<SmallVector<const TreeEntry *>> Entries;
3965 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
3966 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
3967 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
3968 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
3969 /*ForOrder=*/true);
3970 // No shuffled operands - ignore.
3971 if (GatherShuffles.empty() && ExtractShuffles.empty())
3972 return std::nullopt;
3973 OrdersType CurrentOrder(NumScalars, NumScalars);
3974 if (GatherShuffles.size() == 1 &&
3975 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
3976 Entries.front().front()->isSame(TE.Scalars)) {
3977 // Perfect match in the graph, will reuse the previously vectorized
3978 // node. Cost is 0.
3979 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
3980 return CurrentOrder;
3981 }
3982 auto IsSplatMask = [](ArrayRef<int> Mask) {
3983 int SingleElt = PoisonMaskElem;
3984 return all_of(Mask, [&](int I) {
3985 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
3986 SingleElt = I;
3987 return I == PoisonMaskElem || I == SingleElt;
3988 });
3989 };
3990 // Exclusive broadcast mask - ignore.
3991 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
3992 (Entries.size() != 1 ||
3993 Entries.front().front()->ReorderIndices.empty())) ||
3994 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
3995 return std::nullopt;
3996 SmallBitVector ShuffledSubMasks(NumParts);
3997 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
3998 ArrayRef<int> Mask, int PartSz, int NumParts,
3999 function_ref<unsigned(unsigned)> GetVF) {
4000 for (int I : seq<int>(0, NumParts)) {
4001 if (ShuffledSubMasks.test(I))
4002 continue;
4003 const int VF = GetVF(I);
4004 if (VF == 0)
4005 continue;
4006 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
4007 // Shuffle of at least 2 vectors - ignore.
4008 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4009 std::fill(Slice.begin(), Slice.end(), NumScalars);
4010 ShuffledSubMasks.set(I);
4011 continue;
4012 }
4013 // Try to include as many elements from the mask as possible.
4014 int FirstMin = INT_MAX;
4015 bool SecondVecFound = false;
4016 for (int K : seq<int>(0, PartSz)) {
4017 int Idx = Mask[I * PartSz + K];
4018 if (Idx == PoisonMaskElem) {
4019 Value *V = GatheredScalars[I * PartSz + K];
4020 if (isConstant(V) && !isa<PoisonValue>(V)) {
4021 SecondVecFound = true;
4022 break;
4023 }
4024 continue;
4025 }
4026 if (Idx < VF) {
4027 if (FirstMin > Idx)
4028 FirstMin = Idx;
4029 } else {
4030 SecondVecFound = true;
4031 break;
4032 }
4033 }
4034 FirstMin = (FirstMin / PartSz) * PartSz;
4035 // Shuffle of at least 2 vectors - ignore.
4036 if (SecondVecFound) {
4037 std::fill(Slice.begin(), Slice.end(), NumScalars);
4038 ShuffledSubMasks.set(I);
4039 continue;
4040 }
4041 for (int K : seq<int>(0, PartSz)) {
4042 int Idx = Mask[I * PartSz + K];
4043 if (Idx == PoisonMaskElem)
4044 continue;
4045 Idx -= FirstMin;
4046 if (Idx >= PartSz) {
4047 SecondVecFound = true;
4048 break;
4049 }
4050 if (CurrentOrder[I * PartSz + Idx] >
4051 static_cast<unsigned>(I * PartSz + K) &&
4052 CurrentOrder[I * PartSz + Idx] !=
4053 static_cast<unsigned>(I * PartSz + Idx))
4054 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4055 }
4056 // Shuffle of at least 2 vectors - ignore.
4057 if (SecondVecFound) {
4058 std::fill(Slice.begin(), Slice.end(), NumScalars);
4059 ShuffledSubMasks.set(I);
4060 continue;
4061 }
4062 }
4063 };
4064 int PartSz = NumScalars / NumParts;
4065 if (!ExtractShuffles.empty())
4066 TransformMaskToOrder(
4067 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4068 if (!ExtractShuffles[I])
4069 return 0U;
4070 unsigned VF = 0;
4071 for (unsigned Idx : seq<unsigned>(0, PartSz)) {
4072 int K = I * PartSz + Idx;
4073 if (ExtractMask[K] == PoisonMaskElem)
4074 continue;
4075 if (!TE.ReuseShuffleIndices.empty())
4076 K = TE.ReuseShuffleIndices[K];
4077 if (!TE.ReorderIndices.empty())
4078 K = std::distance(TE.ReorderIndices.begin(),
4079 find(TE.ReorderIndices, K));
4080 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4081 if (!EI)
4082 continue;
4083 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4084 ->getElementCount()
4085 .getKnownMinValue());
4086 }
4087 return VF;
4088 });
4089 // Check special corner case - single shuffle of the same entry.
4090 if (GatherShuffles.size() == 1 && NumParts != 1) {
4091 if (ShuffledSubMasks.any())
4092 return std::nullopt;
4093 PartSz = NumScalars;
4094 NumParts = 1;
4095 }
4096 if (!Entries.empty())
4097 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4098 if (!GatherShuffles[I])
4099 return 0U;
4100 return std::max(Entries[I].front()->getVectorFactor(),
4101 Entries[I].back()->getVectorFactor());
4102 });
4103 int NumUndefs =
4104 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4105 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4106 return std::nullopt;
4107 return std::move(CurrentOrder);
4108}
4109
4110static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4111 const TargetLibraryInfo &TLI,
4112 bool CompareOpcodes = true) {
4113 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4114 return false;
4115 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4116 if (!GEP1)
4117 return false;
4118 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4119 if (!GEP2)
4120 return false;
4121 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4122 ((isConstant(GEP1->getOperand(1)) &&
4123 isConstant(GEP2->getOperand(1))) ||
4124 !CompareOpcodes ||
4125 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4126 .getOpcode());
4127}
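// Illustrative example (hypothetical IR): the pointers
//   %p0 = getelementptr i32, ptr %base, i64 4
//   %p1 = getelementptr i32, ptr %base, i64 7
// share the underlying object %base, are single-index GEPs, and have
// constant indices, so they are compatible; a pair whose indices are
// unrelated instructions with different opcodes would not be.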
4128
4129/// Calculates minimal alignment as a common alignment.
4130template <typename T>
4131 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4132 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4133 for (Value *V : VL.drop_front())
4134 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4135 return CommonAlignment;
4136}
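// Illustrative use (hypothetical loads): for a bundle of LoadInsts aligned
// to 16, 4 and 8 bytes,
//   Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
// yields Align(4), the weakest guarantee that holds for every access.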
4137
4138/// Check if \p Order represents reverse order.
4139 static bool isReverseOrder(ArrayRef<unsigned> Order) {
4140 unsigned Sz = Order.size();
4141 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4142 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4143 });
4144}
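// Illustrative example (hypothetical values): for 4 scalars, Order =
// {3, 2, 1, 0} is a reverse order, and so is {4, 2, 1, 0} because entries
// equal to Order.size() are treated as "don't care"; {0, 1, 2, 3} is not.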
4145
4146/// Checks if the provided list of pointers \p Pointers represents the strided
4147/// pointers for type ElemTy. If they are not, std::nullopt is returned.
4148 /// Otherwise, if \p Inst is not specified, a just-initialized optional value
4149 /// is returned to show that the pointers represent strided pointers. If
4150 /// \p Inst is specified, the runtime stride is materialized before \p Inst.
4151/// \returns std::nullopt if the pointers are not pointers with the runtime
4152/// stride, nullptr or actual stride value, otherwise.
4153static std::optional<Value *>
4154 calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4155 const DataLayout &DL, ScalarEvolution &SE,
4156 SmallVectorImpl<unsigned> &SortedIndices,
4157 Instruction *Inst = nullptr) {
4158 SmallVector<const SCEV *> SCEVs;
4159 const SCEV *PtrSCEVLowest = nullptr;
4160 const SCEV *PtrSCEVHighest = nullptr;
4161 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4162 // addresses).
4163 for (Value *Ptr : PointerOps) {
4164 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4165 if (!PtrSCEV)
4166 return std::nullopt;
4167 SCEVs.push_back(PtrSCEV);
4168 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4169 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4170 continue;
4171 }
4172 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4173 if (isa<SCEVCouldNotCompute>(Diff))
4174 return std::nullopt;
4175 if (Diff->isNonConstantNegative()) {
4176 PtrSCEVLowest = PtrSCEV;
4177 continue;
4178 }
4179 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4180 if (isa<SCEVCouldNotCompute>(Diff1))
4181 return std::nullopt;
4182 if (Diff1->isNonConstantNegative()) {
4183 PtrSCEVHighest = PtrSCEV;
4184 continue;
4185 }
4186 }
4187 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4188 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4189 if (isa<SCEVCouldNotCompute>(Dist))
4190 return std::nullopt;
4191 int Size = DL.getTypeStoreSize(ElemTy);
4192 auto TryGetStride = [&](const SCEV *Dist,
4193 const SCEV *Multiplier) -> const SCEV * {
4194 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4195 if (M->getOperand(0) == Multiplier)
4196 return M->getOperand(1);
4197 if (M->getOperand(1) == Multiplier)
4198 return M->getOperand(0);
4199 return nullptr;
4200 }
4201 if (Multiplier == Dist)
4202 return SE.getConstant(Dist->getType(), 1);
4203 return SE.getUDivExactExpr(Dist, Multiplier);
4204 };
4205 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4206 const SCEV *Stride = nullptr;
4207 if (Size != 1 || SCEVs.size() > 2) {
4208 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4209 Stride = TryGetStride(Dist, Sz);
4210 if (!Stride)
4211 return std::nullopt;
4212 }
4213 if (!Stride || isa<SCEVConstant>(Stride))
4214 return std::nullopt;
4215 // Iterate through all pointers and check if all distances are
4216 // unique multiple of Stride.
4217 using DistOrdPair = std::pair<int64_t, int>;
4218 auto Compare = llvm::less_first();
4219 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4220 int Cnt = 0;
4221 bool IsConsecutive = true;
4222 for (const SCEV *PtrSCEV : SCEVs) {
4223 unsigned Dist = 0;
4224 if (PtrSCEV != PtrSCEVLowest) {
4225 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4226 const SCEV *Coeff = TryGetStride(Diff, Stride);
4227 if (!Coeff)
4228 return std::nullopt;
4229 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4230 if (!SC || isa<SCEVCouldNotCompute>(SC))
4231 return std::nullopt;
4232 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4233 SE.getMulExpr(Stride, SC)))
4234 ->isZero())
4235 return std::nullopt;
4236 Dist = SC->getAPInt().getZExtValue();
4237 }
4238 // If the strides are not the same or repeated, we can't vectorize.
4239 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4240 return std::nullopt;
4241 auto Res = Offsets.emplace(Dist, Cnt);
4242 if (!Res.second)
4243 return std::nullopt;
4244 // Consecutive order if the inserted element is the last one.
4245 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4246 ++Cnt;
4247 }
4248 if (Offsets.size() != SCEVs.size())
4249 return std::nullopt;
4250 SortedIndices.clear();
4251 if (!IsConsecutive) {
4252 // Fill SortedIndices array only if it is non-consecutive.
4253 SortedIndices.resize(PointerOps.size());
4254 Cnt = 0;
4255 for (const std::pair<int64_t, int> &Pair : Offsets) {
4256 SortedIndices[Cnt] = Pair.second;
4257 ++Cnt;
4258 }
4259 }
4260 if (!Inst)
4261 return nullptr;
4262 SCEVExpander Expander(SE, DL, "strided-load-vec");
4263 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4264}
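// Illustrative example (hypothetical pointers): for an i8 element type and
// pointer operands %p, %p + %s, %p + 2 * %s, %p + 3 * %s with a runtime %s,
// Dist is 3 * %s, the SCEV %s is recognized as the stride, SortedIndices is
// left empty because the order is already consecutive, and, when an
// insertion point is provided, %s is materialized there via SCEVExpander.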
4265
4266 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4267 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4268 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4269 // Check that a vectorized load would load the same memory as a scalar
4270 // load. For example, we don't want to vectorize loads that are smaller
4271 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4272 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4273 // from such a struct, we read/write packed bits disagreeing with the
4274 // unvectorized version.
4275 Type *ScalarTy = VL0->getType();
4276
4277 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4278 return LoadsState::Gather;
4279
4280 // Make sure all loads in the bundle are simple - we can't vectorize
4281 // atomic or volatile loads.
4282 PointerOps.clear();
4283 const unsigned Sz = VL.size();
4284 PointerOps.resize(Sz);
4285 auto *POIter = PointerOps.begin();
4286 for (Value *V : VL) {
4287 auto *L = cast<LoadInst>(V);
4288 if (!L->isSimple())
4289 return LoadsState::Gather;
4290 *POIter = L->getPointerOperand();
4291 ++POIter;
4292 }
4293
4294 Order.clear();
4295 auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
4296 // Check the order of pointer operands or that all pointers are the same.
4297 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4298 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4299 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4300 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4301 "supported with VectorizeNonPowerOf2");
4302 return LoadsState::Gather;
4303 }
4304
4305 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4306 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4307 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4308 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4309 return LoadsState::StridedVectorize;
4310 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4311 return arePointersCompatible(P, PointerOps.front(), *TLI);
4312 })) {
4313 if (IsSorted) {
4314 Value *Ptr0;
4315 Value *PtrN;
4316 if (Order.empty()) {
4317 Ptr0 = PointerOps.front();
4318 PtrN = PointerOps.back();
4319 } else {
4320 Ptr0 = PointerOps[Order.front()];
4321 PtrN = PointerOps[Order.back()];
4322 }
4323 std::optional<int> Diff =
4324 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4325 // Check that the sorted loads are consecutive.
4326 if (static_cast<unsigned>(*Diff) == Sz - 1)
4327 return LoadsState::Vectorize;
4328 // Simple check if not a strided access - clear order.
4329 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4330 // Try to generate strided load node if:
4331 // 1. Target with strided load support is detected.
4332 // 2. The number of loads is greater than MinProfitableStridedLoads,
4333 // or the potential stride <= MaxProfitableLoadStride and the
4334 // potential stride is power-of-2 (to avoid perf regressions for the very
4335 // small number of loads) and max distance > number of loads, or potential
4336 // stride is -1.
4337 // 3. The loads are ordered, or number of unordered loads <=
4338 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4339 // (this check is to avoid extra costs for very expensive shuffles).
4340 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4341 (static_cast<unsigned>(std::abs(*Diff)) <=
4342 MaxProfitableLoadStride &&
4343 isPowerOf2_32(std::abs(*Diff)))) &&
4344 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4345 *Diff == -(static_cast<int>(Sz) - 1))) {
4346 int Stride = *Diff / static_cast<int>(Sz - 1);
4347 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4348 Align Alignment =
4349 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4350 ->getAlign();
4351 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4352 // Iterate through all pointers and check if all distances are
4353 // unique multiple of Dist.
4354 SmallSet<int, 4> Dists;
4355 for (Value *Ptr : PointerOps) {
4356 int Dist = 0;
4357 if (Ptr == PtrN)
4358 Dist = *Diff;
4359 else if (Ptr != Ptr0)
4360 Dist =
4361 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4362 // If the strides are not the same or repeated, we can't
4363 // vectorize.
4364 if (((Dist / Stride) * Stride) != Dist ||
4365 !Dists.insert(Dist).second)
4366 break;
4367 }
4368 if (Dists.size() == Sz)
4369 return LoadsState::StridedVectorize;
4370 }
4371 }
4372 }
4373 }
4374 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4375 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4376 unsigned MinVF = getMinVF(Sz);
4377 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4378 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4379 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4380 unsigned VectorizedCnt = 0;
4381 SmallVector<LoadsState> States;
4382 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4383 Cnt += VF, ++VectorizedCnt) {
4384 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4385 SmallVector<unsigned> Order;
4386 SmallVector<Value *> PointerOps;
4387 LoadsState LS =
4388 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4389 /*TryRecursiveCheck=*/false);
4390 // Check that the sorted loads are consecutive.
4391 if (LS == LoadsState::Gather)
4392 break;
4393 // If a reorder is needed - consider it as a high-cost masked gather for now.
4394 if ((LS == LoadsState::Vectorize ||
4395 LS == LoadsState::StridedVectorize) &&
4396 !Order.empty() && !isReverseOrder(Order))
4397 LS = LoadsState::ScatterVectorize;
4398 States.push_back(LS);
4399 }
4400 // Can be vectorized later as a series of loads/insertelements.
4401 if (VectorizedCnt == VL.size() / VF) {
4402 // Compare masked gather cost and loads + insertsubvector costs.
4403 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4404 InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
4405 Instruction::Load, VecTy,
4406 cast<LoadInst>(VL0)->getPointerOperand(),
4407 /*VariableMask=*/false, CommonAlignment, CostKind);
4408 InstructionCost VecLdCost = 0;
4409 auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
4410 for (auto [I, LS] : enumerate(States)) {
4411 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4412 switch (LS) {
4413 case LoadsState::Vectorize:
4414 VecLdCost += TTI.getMemoryOpCost(
4415 Instruction::Load, SubVecTy, LI0->getAlign(),
4416 LI0->getPointerAddressSpace(), CostKind,
4417 TTI::OperandValueInfo());
4418 break;
4419 case LoadsState::StridedVectorize:
4420 VecLdCost += TTI.getStridedMemoryOpCost(
4421 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4422 /*VariableMask=*/false, CommonAlignment, CostKind);
4423 break;
4424 case LoadsState::ScatterVectorize:
4425 VecLdCost += TTI.getGatherScatterOpCost(
4426 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4427 /*VariableMask=*/false, CommonAlignment, CostKind);
4428 break;
4429 case LoadsState::Gather:
4430 llvm_unreachable(
4431 "Expected only consecutive, strided or masked gather loads.");
4432 }
4433 SmallVector<int> ShuffleMask(VL.size());
4434 for (int Idx : seq<int>(0, VL.size()))
4435 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4436 VecLdCost +=
4437 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
4438 ShuffleMask, CostKind, I * VF, SubVecTy);
4439 }
4440 // If masked gather cost is higher - better to vectorize, so
4441 // consider it as a gather node. It will be better estimated
4442 // later.
4443 if (MaskedGatherCost > VecLdCost)
4444 return true;
4445 }
4446 }
4447 return false;
4448 };
4449 // TODO: need to improve analysis of the pointers, if not all of them are
4450 // GEPs or have > 2 operands, we end up with a gather node, which just
4451 // increases the cost.
4452 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4453 bool ProfitableGatherPointers =
4454 L && Sz > 2 &&
4455 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4456 return L->isLoopInvariant(V);
4457 })) <= Sz / 2;
4458 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4459 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4460 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4461 (GEP && GEP->getNumOperands() == 2 &&
4462 isa<Constant, Instruction>(GEP->getOperand(1)));
4463 })) {
4464 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4465 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4466 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4467 // Check if potential masked gather can be represented as series
4468 // of loads + insertsubvectors.
4469 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4470 // If masked gather cost is higher - better to vectorize, so
4471 // consider it as a gather node. It will be better estimated
4472 // later.
4473 return LoadsState::Gather;
4474 }
4475 return LoadsState::ScatterVectorize;
4476 }
4477 }
4478 }
4479
4480 return LoadsState::Gather;
4481}
4482
4483 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4484 const DataLayout &DL, ScalarEvolution &SE,
4485 SmallVectorImpl<unsigned> &SortedIndices) {
4486 assert(llvm::all_of(
4487 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4488 "Expected list of pointer operands.");
4489 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4490 // Ptr into, sort and return the sorted indices with values next to one
4491 // another.
4492 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4493 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4494
4495 unsigned Cnt = 1;
4496 for (Value *Ptr : VL.drop_front()) {
4497 bool Found = any_of(Bases, [&](auto &Base) {
4498 std::optional<int> Diff =
4499 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4500 /*StrictCheck=*/true);
4501 if (!Diff)
4502 return false;
4503
4504 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4505 return true;
4506 });
4507
4508 if (!Found) {
4509 // If we haven't found enough to usefully cluster, return early.
4510 if (Bases.size() > VL.size() / 2 - 1)
4511 return false;
4512
4513 // Not found already - add a new Base
4514 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4515 }
4516 }
4517
4518 // For each of the bases sort the pointers by Offset and check if any of the
4519 // bases become consecutively allocated.
4520 bool AnyConsecutive = false;
4521 for (auto &Base : Bases) {
4522 auto &Vec = Base.second;
4523 if (Vec.size() > 1) {
4524 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4525 const std::tuple<Value *, int, unsigned> &Y) {
4526 return std::get<1>(X) < std::get<1>(Y);
4527 });
4528 int InitialOffset = std::get<1>(Vec[0]);
4529 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4530 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4531 });
4532 }
4533 }
4534
4535 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4536 SortedIndices.clear();
4537 if (!AnyConsecutive)
4538 return false;
4539
4540 for (auto &Base : Bases) {
4541 for (auto &T : Base.second)
4542 SortedIndices.push_back(std::get<2>(T));
4543 }
4544
4545 assert(SortedIndices.size() == VL.size() &&
4546 "Expected SortedIndices to be the size of VL");
4547 return true;
4548}
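// A minimal standalone sketch of the clustering idea in clusterSortPtrAccesses
// above, assuming the per-pointer base and offset are already known as plain
// integers (the real code derives them via getPointersDiff and SCEV, and also
// bails out early when too many distinct bases appear). Everything below is
// illustrative, standard C++ only, not LLVM API.
#include <algorithm>
#include <map>
#include <utility>
#include <vector>

// Input: one (Base, Offset) pair per pointer, in the original order.
// Output: the original indices grouped by base and sorted by offset within
// each base, or an empty vector if no base has a consecutive run.
static std::vector<unsigned>
clusterByBaseAndOffset(const std::vector<std::pair<int, int>> &BaseAndOffset) {
  std::map<int, std::vector<std::pair<int, unsigned>>> Bases;
  for (unsigned I = 0; I < BaseAndOffset.size(); ++I)
    Bases[BaseAndOffset[I].first].push_back({BaseAndOffset[I].second, I});

  bool AnyConsecutive = false;
  for (auto &Entry : Bases) {
    auto &Vec = Entry.second;
    std::stable_sort(Vec.begin(), Vec.end());
    // A base qualifies if its offsets form InitialOffset, InitialOffset+1, ...
    bool Consecutive = true;
    for (unsigned I = 1; I < Vec.size(); ++I)
      Consecutive &= Vec[I].first == Vec[0].first + static_cast<int>(I);
    AnyConsecutive |= Vec.size() > 1 && Consecutive;
  }
  if (!AnyConsecutive)
    return {};

  std::vector<unsigned> SortedIndices;
  for (const auto &Entry : Bases)
    for (const auto &OffsetAndIdx : Entry.second)
      SortedIndices.push_back(OffsetAndIdx.second);
  return SortedIndices;
}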
4549
4550std::optional<BoUpSLP::OrdersType>
4551BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4552 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4553 Type *ScalarTy = TE.Scalars[0]->getType();
4554
4556 Ptrs.reserve(TE.Scalars.size());
4557 for (Value *V : TE.Scalars) {
4558 auto *L = dyn_cast<LoadInst>(V);
4559 if (!L || !L->isSimple())
4560 return std::nullopt;
4561 Ptrs.push_back(L->getPointerOperand());
4562 }
4563
4564 BoUpSLP::OrdersType Order;
4565 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4566 return std::move(Order);
4567 return std::nullopt;
4568}
4569
4570/// Check if two insertelement instructions are from the same buildvector.
4573 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4574 // Instructions must be from the same basic blocks.
4575 if (VU->getParent() != V->getParent())
4576 return false;
4577 // Checks if 2 insertelements are from the same buildvector.
4578 if (VU->getType() != V->getType())
4579 return false;
4580 // Multiple used inserts are separate nodes.
4581 if (!VU->hasOneUse() && !V->hasOneUse())
4582 return false;
4583 auto *IE1 = VU;
4584 auto *IE2 = V;
4585 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4586 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4587 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4588 return false;
4589 // Go through the vector operand of insertelement instructions trying to find
4590 // either VU as the original vector for IE2 or V as the original vector for
4591 // IE1.
4592 SmallBitVector ReusedIdx(
4593 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4594 bool IsReusedIdx = false;
4595 do {
4596 if (IE2 == VU && !IE1)
4597 return VU->hasOneUse();
4598 if (IE1 == V && !IE2)
4599 return V->hasOneUse();
4600 if (IE1 && IE1 != V) {
4601 unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4602 IsReusedIdx |= ReusedIdx.test(Idx1);
4603 ReusedIdx.set(Idx1);
4604 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4605 IE1 = nullptr;
4606 else
4607 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4608 }
4609 if (IE2 && IE2 != VU) {
4610 unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4611 IsReusedIdx |= ReusedIdx.test(Idx2);
4612 ReusedIdx.set(Idx2);
4613 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4614 IE2 = nullptr;
4615 else
4616 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4617 }
4618 } while (!IsReusedIdx && (IE1 || IE2));
4619 return false;
4620}
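// Standalone sketch of the chain walk in areTwoInsertFromSameBuildVector
// above: a buildvector is modeled as a linked chain of inserts, each writing
// one lane. Two inserts belong to the same buildvector if walking down either
// chain reaches the other insert without any lane index being written twice.
// The one-use checks of the real code are omitted, and the struct below is
// illustrative, not an LLVM type.
#include <set>

struct InsertNode {
  unsigned Lane = 0;                // Lane written by this insert.
  const InsertNode *Base = nullptr; // Vector operand (previous insert), if any.
};

static bool fromSameBuildVector(const InsertNode *VU, const InsertNode *V) {
  if (VU == V)
    return true; // Trivially the same chain.
  std::set<unsigned> UsedLanes;
  const InsertNode *IE1 = VU, *IE2 = V;
  while (IE1 || IE2) {
    if (IE2 == VU && !IE1)
      return true; // Walking V's chain reached VU.
    if (IE1 == V && !IE2)
      return true; // Walking VU's chain reached V.
    if (IE1 && IE1 != V) {
      if (!UsedLanes.insert(IE1->Lane).second)
        return false; // A lane is written twice - separate buildvectors.
      IE1 = IE1->Base;
    }
    if (IE2 && IE2 != VU) {
      if (!UsedLanes.insert(IE2->Lane).second)
        return false;
      IE2 = IE2->Base;
    }
  }
  return false;
}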
4621
4622std::optional<BoUpSLP::OrdersType>
4623BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4624 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4625 if (TE.isNonPowOf2Vec())
4626 return std::nullopt;
4627
4628 // No need to reorder if we need to shuffle reuses; the node still needs to
4629 // be shuffled.
4630 if (!TE.ReuseShuffleIndices.empty()) {
4631 if (isSplat(TE.Scalars))
4632 return std::nullopt;
4633 // Check if reuse shuffle indices can be improved by reordering.
4634 // For this, check that the reuse mask is "clustered", i.e. each scalar value
4635 // is used once in each submask of size <number_of_scalars>.
4636 // Example: 4 scalar values.
4637 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4638 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4639 // element 3 is used twice in the second submask.
4640 unsigned Sz = TE.Scalars.size();
4641 if (TE.State == TreeEntry::NeedToGather) {
4642 if (std::optional<OrdersType> CurrentOrder =
4644 SmallVector<int> Mask;
4645 fixupOrderingIndices(*CurrentOrder);
4646 inversePermutation(*CurrentOrder, Mask);
4647 ::addMask(Mask, TE.ReuseShuffleIndices);
4648 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4649 unsigned Sz = TE.Scalars.size();
4650 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4651 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4652 if (Idx != PoisonMaskElem)
4653 Res[Idx + K * Sz] = I + K * Sz;
4654 }
4655 return std::move(Res);
4656 }
4657 }
4658 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4660 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4661 return std::nullopt;
4662 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4663 Sz)) {
4664 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4665 if (TE.ReorderIndices.empty())
4666 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4667 else
4668 inversePermutation(TE.ReorderIndices, ReorderMask);
4669 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4670 unsigned VF = ReorderMask.size();
4671 OrdersType ResOrder(VF, VF);
4672 unsigned NumParts = VF / Sz;
4673 SmallBitVector UsedVals(NumParts);
4674 for (unsigned I = 0; I < VF; I += Sz) {
4675 int Val = PoisonMaskElem;
4676 unsigned UndefCnt = 0;
4677 if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
4678 [&](int Idx) {
4679 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4680 Val = Idx;
4681 if (Idx == PoisonMaskElem)
4682 ++UndefCnt;
4683 return Idx != PoisonMaskElem && Idx != Val;
4684 }) ||
4685 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4686 UndefCnt > Sz / 2)
4687 return std::nullopt;
4688 UsedVals.set(Val);
4689 for (unsigned K = 0; K < NumParts; ++K)
4690 ResOrder[Val + Sz * K] = I + K;
4691 }
4692 return std::move(ResOrder);
4693 }
4694 unsigned VF = TE.getVectorFactor();
4695 // Try to build the correct order for extractelement instructions.
4696 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4697 TE.ReuseShuffleIndices.end());
4698 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4699 all_of(TE.Scalars, [Sz](Value *V) {
4700 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4701 return Idx && *Idx < Sz;
4702 })) {
4703 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4704 if (TE.ReorderIndices.empty())
4705 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4706 else
4707 inversePermutation(TE.ReorderIndices, ReorderMask);
4708 for (unsigned I = 0; I < VF; ++I) {
4709 int &Idx = ReusedMask[I];
4710 if (Idx == PoisonMaskElem)
4711 continue;
4712 Value *V = TE.Scalars[ReorderMask[Idx]];
4713 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4714 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4715 }
4716 }
4717 // Build an order of VF size; the reuse shuffles need to be reordered, as
4718 // they are always of VF size.
4719 OrdersType ResOrder(VF);
4720 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4721 auto *It = ResOrder.begin();
4722 for (unsigned K = 0; K < VF; K += Sz) {
4723 OrdersType CurrentOrder(TE.ReorderIndices);
4724 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4725 if (SubMask.front() == PoisonMaskElem)
4726 std::iota(SubMask.begin(), SubMask.end(), 0);
4727 reorderOrder(CurrentOrder, SubMask);
4728 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4729 std::advance(It, Sz);
4730 }
4731 if (TE.State == TreeEntry::NeedToGather &&
4732 all_of(enumerate(ResOrder),
4733 [](const auto &Data) { return Data.index() == Data.value(); }))
4734 return std::nullopt; // No need to reorder.
4735 return std::move(ResOrder);
4736 }
4737 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4738 any_of(TE.UserTreeIndices,
4739 [](const EdgeInfo &EI) {
4740 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4741 }) &&
4742 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
4743 return std::nullopt;
4744 if ((TE.State == TreeEntry::Vectorize ||
4745 TE.State == TreeEntry::StridedVectorize) &&
4746 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4747 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4748 !TE.isAltShuffle())
4749 return TE.ReorderIndices;
4750 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4751 auto PHICompare = [&](unsigned I1, unsigned I2) {
4752 Value *V1 = TE.Scalars[I1];
4753 Value *V2 = TE.Scalars[I2];
4754 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4755 return false;
4756 if (V1->getNumUses() < V2->getNumUses())
4757 return true;
4758 if (V1->getNumUses() > V2->getNumUses())
4759 return false;
4760 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4761 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4762 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4763 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4765 IE1, IE2,
4766 [](InsertElementInst *II) { return II->getOperand(0); }))
4767 return I1 < I2;
4768 return getInsertIndex(IE1) < getInsertIndex(IE2);
4769 }
4770 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4771 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4772 if (EE1->getOperand(0) != EE2->getOperand(0))
4773 return I1 < I2;
4774 return getInsertIndex(EE1) < getInsertIndex(EE2);
4775 }
4776 return I1 < I2;
4777 };
4778 auto IsIdentityOrder = [](const OrdersType &Order) {
4779 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4780 if (Idx != Order[Idx])
4781 return false;
4782 return true;
4783 };
4784 if (!TE.ReorderIndices.empty())
4785 return TE.ReorderIndices;
4787 SmallVector<unsigned> Phis(TE.Scalars.size());
4788 std::iota(Phis.begin(), Phis.end(), 0);
4789 OrdersType ResOrder(TE.Scalars.size());
4790 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4791 PhiToId[Id] = Id;
4792 stable_sort(Phis, PHICompare);
4793 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4794 ResOrder[Id] = PhiToId[Phis[Id]];
4795 if (IsIdentityOrder(ResOrder))
4796 return std::nullopt; // No need to reorder.
4797 return std::move(ResOrder);
4798 }
4799 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4800 allSameType(TE.Scalars)) {
4801 // TODO: add analysis of other gather nodes with extractelement
4802 // instructions and other values/instructions, not only undefs.
4803 if ((TE.getOpcode() == Instruction::ExtractElement ||
4804 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4805 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4806 all_of(TE.Scalars, [](Value *V) {
4807 auto *EE = dyn_cast<ExtractElementInst>(V);
4808 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4809 })) {
4810 // Check that gather of extractelements can be represented as
4811 // just a shuffle of a single vector.
4812 OrdersType CurrentOrder;
4813 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4814 /*ResizeAllowed=*/true);
4815 if (Reuse || !CurrentOrder.empty())
4816 return std::move(CurrentOrder);
4817 }
4818 // If the gather node is <undef, v, .., poison> and
4819 // insertelement poison, v, 0 [+ permute]
4820 // is cheaper than
4821 // insertelement poison, v, n - try to reorder.
4822 // If rotating the whole graph, exclude the permute cost, since the whole
4823 // graph might be transformed.
4824 int Sz = TE.Scalars.size();
4825 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4826 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4827 const auto *It =
4828 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4829 if (It == TE.Scalars.begin())
4830 return OrdersType();
4831 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4832 if (It != TE.Scalars.end()) {
4833 OrdersType Order(Sz, Sz);
4834 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4835 Order[Idx] = 0;
4836 fixupOrderingIndices(Order);
4837 SmallVector<int> Mask;
4838 inversePermutation(Order, Mask);
4839 InstructionCost PermuteCost =
4840 TopToBottom
4841 ? 0
4843 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4844 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4845 PoisonValue::get(Ty), *It);
4846 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4847 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4848 PoisonValue::get(Ty), *It);
4849 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4850 OrdersType Order(Sz, Sz);
4851 Order[Idx] = 0;
4852 return std::move(Order);
4853 }
4854 }
4855 }
4856 if (isSplat(TE.Scalars))
4857 return std::nullopt;
4858 if (TE.Scalars.size() >= 4)
4859 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4860 return Order;
4861 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4862 return CurrentOrder;
4863 }
4864 return std::nullopt;
4865}
4866
4867/// Checks if the given mask is a "clustered" mask with the same clusters of
4868/// size \p Sz, which are not identity submasks.
4870 unsigned Sz) {
4871 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4872 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
4873 return false;
4874 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4875 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4876 if (Cluster != FirstCluster)
4877 return false;
4878 }
4879 return true;
4880}
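// Standalone sketch of isRepeatedNonIdentityClusteredMask above: the mask is
// split into clusters of size Sz; it qualifies only if the first cluster is
// not the identity {0, 1, ..., Sz-1} and every other cluster repeats the first
// one exactly. Poison/undef elements are ignored for brevity; std::vector
// stands in for ArrayRef.
#include <vector>

static bool isRepeatedNonIdentityClustered(const std::vector<int> &Mask,
                                           unsigned Sz) {
  bool Identity = true;
  for (unsigned I = 0; I < Sz; ++I)
    Identity &= Mask[I] == static_cast<int>(I);
  if (Identity)
    return false;
  for (unsigned I = Sz; I + Sz <= Mask.size(); I += Sz)
    for (unsigned J = 0; J < Sz; ++J)
      if (Mask[I + J] != Mask[J])
        return false; // This cluster differs from the first one.
  return true;
}
// Example: {1, 0, 1, 0} with Sz = 2 -> true; {0, 1, 0, 1} -> false (identity).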
4881
4882void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4883 // Reorder reuses mask.
4884 reorderReuses(TE.ReuseShuffleIndices, Mask);
4885 const unsigned Sz = TE.Scalars.size();
4886 // For vectorized nodes and non-clustered reuses nothing else needs to be done.
4887 if (TE.State != TreeEntry::NeedToGather ||
4889 Sz) ||
4890 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4891 return;
4892 SmallVector<int> NewMask;
4893 inversePermutation(TE.ReorderIndices, NewMask);
4894 addMask(NewMask, TE.ReuseShuffleIndices);
4895 // Clear reorder since it is going to be applied to the new mask.
4896 TE.ReorderIndices.clear();
4897 // Try to improve gathered nodes with clustered reuses, if possible.
4898 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4899 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4900 inversePermutation(NewOrder, NewMask);
4901 reorderScalars(TE.Scalars, NewMask);
4902 // Fill the reuses mask with the identity submasks.
4903 for (auto *It = TE.ReuseShuffleIndices.begin(),
4904 *End = TE.ReuseShuffleIndices.end();
4905 It != End; std::advance(It, Sz))
4906 std::iota(It, std::next(It, Sz), 0);
4907}
4908
4910 ArrayRef<unsigned> SecondaryOrder) {
4911 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
4912 "Expected same size of orders");
4913 unsigned Sz = Order.size();
4914 SmallBitVector UsedIndices(Sz);
4915 for (unsigned Idx : seq<unsigned>(0, Sz)) {
4916 if (Order[Idx] != Sz)
4917 UsedIndices.set(Order[Idx]);
4918 }
4919 if (SecondaryOrder.empty()) {
4920 for (unsigned Idx : seq<unsigned>(0, Sz))
4921 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
4922 Order[Idx] = Idx;
4923 } else {
4924 for (unsigned Idx : seq<unsigned>(0, Sz))
4925 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
4926 !UsedIndices.test(SecondaryOrder[Idx]))
4927 Order[Idx] = SecondaryOrder[Idx];
4928 }
4929}
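// Standalone sketch of combineOrders above: an order is a permutation in
// which the value Sz (== Order.size()) marks an unset slot. Unset slots are
// filled either with the identity position or with the corresponding entry of
// a secondary order, as long as the chosen value is not already used.
// Illustrative standard C++ only.
#include <vector>

static void combineOrdersSketch(std::vector<unsigned> &Order,
                                const std::vector<unsigned> &Secondary) {
  const unsigned Sz = static_cast<unsigned>(Order.size());
  std::vector<bool> Used(Sz, false);
  for (unsigned Idx = 0; Idx < Sz; ++Idx)
    if (Order[Idx] != Sz)
      Used[Order[Idx]] = true;
  for (unsigned Idx = 0; Idx < Sz; ++Idx) {
    if (Order[Idx] != Sz)
      continue; // Already set.
    if (Secondary.empty()) {
      if (!Used[Idx])
        Order[Idx] = Idx; // Fall back to the identity position.
    } else if (Secondary[Idx] != Sz && !Used[Secondary[Idx]]) {
      Order[Idx] = Secondary[Idx]; // Borrow from the secondary order.
    }
  }
}
// Example: Order = {2, 4, 0, 4} with Sz = 4 and no secondary order becomes
// {2, 1, 0, 3}.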
4930
4932 // Maps VF to the graph nodes.
4934 // ExtractElement gather nodes which can be vectorized and need to handle
4935 // their ordering.
4937
4938 // Phi nodes can have a preferred ordering based on their result users.
4940
4941 // AltShuffles can also have a preferred ordering that leads to fewer
4942 // instructions, e.g., the addsub instruction in x86.
4943 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4944
4945 // Maps a TreeEntry to the reorder indices of external users.
4947 ExternalUserReorderMap;
4948 // Find all reorderable nodes with the given VF.
4949 // Currently these are vectorized stores, loads, extracts + some gathering
4950 // of extracts.
4951 for_each(VectorizableTree, [&, &TTIRef = *TTI](
4952 const std::unique_ptr<TreeEntry> &TE) {
4953 // Look for external users that will probably be vectorized.
4954 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4955 findExternalStoreUsersReorderIndices(TE.get());
4956 if (!ExternalUserReorderIndices.empty()) {
4957 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4958 ExternalUserReorderMap.try_emplace(TE.get(),
4959 std::move(ExternalUserReorderIndices));
4960 }
4961
4962 // Patterns like [fadd,fsub] can be combined into a single instruction in
4963 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4964 // to take into account their order when looking for the most used order.
4965 if (TE->isAltShuffle()) {
4966 VectorType *VecTy =
4967 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
4968 unsigned Opcode0 = TE->getOpcode();
4969 unsigned Opcode1 = TE->getAltOpcode();
4970 // The opcode mask selects between the two opcodes.
4971 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4972 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4973 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4974 OpcodeMask.set(Lane);
4975 // If this pattern is supported by the target then we consider the order.
4976 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4977 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4978 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
4979 }
4980 // TODO: Check the reverse order too.
4981 }
4982
4983 if (std::optional<OrdersType> CurrentOrder =
4984 getReorderingData(*TE, /*TopToBottom=*/true)) {
4985 // Do not include ordering for nodes used in the alt opcode vectorization;
4986 // it is better to reorder them during the bottom-to-top stage. If we follow
4987 // the order here, it causes reordering of the whole graph, though actually
4988 // it is profitable just to reorder the subgraph that starts from the
4989 // alternate opcode vectorization node. Such nodes already end up with a
4990 // shuffle instruction, and it is enough to change this shuffle rather than
4991 // rotate the scalars for the whole graph.
4992 unsigned Cnt = 0;
4993 const TreeEntry *UserTE = TE.get();
4994 while (UserTE && Cnt < RecursionMaxDepth) {
4995 if (UserTE->UserTreeIndices.size() != 1)
4996 break;
4997 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
4998 return EI.UserTE->State == TreeEntry::Vectorize &&
4999 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5000 }))
5001 return;
5002 UserTE = UserTE->UserTreeIndices.back().UserTE;
5003 ++Cnt;
5004 }
5005 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5006 if (!(TE->State == TreeEntry::Vectorize ||
5007 TE->State == TreeEntry::StridedVectorize) ||
5008 !TE->ReuseShuffleIndices.empty())
5009 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5010 if (TE->State == TreeEntry::Vectorize &&
5011 TE->getOpcode() == Instruction::PHI)
5012 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5013 }
5014 });
5015
5016 // Reorder the graph nodes according to their vectorization factor.
5017 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5018 VF /= 2) {
5019 auto It = VFToOrderedEntries.find(VF);
5020 if (It == VFToOrderedEntries.end())
5021 continue;
5022 // Try to find the most profitable order. We are just looking for the most
5023 // used order and reorder scalar elements in the nodes according to this
5024 // most used order.
5025 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5026 // All operands are reordered and used only in this node - propagate the
5027 // most used order to the user node.
5030 OrdersUses;
5032 for (const TreeEntry *OpTE : OrderedEntries) {
5033 // No need to reorder these nodes; we still need to extend and to use a
5034 // shuffle, just merging the reordering shuffle and the reuse shuffle.
5035 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5036 continue;
5037 // Count number of orders uses.
5038 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5039 &PhisToOrders]() -> const OrdersType & {
5040 if (OpTE->State == TreeEntry::NeedToGather ||
5041 !OpTE->ReuseShuffleIndices.empty()) {
5042 auto It = GathersToOrders.find(OpTE);
5043 if (It != GathersToOrders.end())
5044 return It->second;
5045 }
5046 if (OpTE->isAltShuffle()) {
5047 auto It = AltShufflesToOrders.find(OpTE);
5048 if (It != AltShufflesToOrders.end())
5049 return It->second;
5050 }
5051 if (OpTE->State == TreeEntry::Vectorize &&
5052 OpTE->getOpcode() == Instruction::PHI) {
5053 auto It = PhisToOrders.find(OpTE);
5054 if (It != PhisToOrders.end())
5055 return It->second;
5056 }
5057 return OpTE->ReorderIndices;
5058 }();
5059 // First consider the order of the external scalar users.
5060 auto It = ExternalUserReorderMap.find(OpTE);
5061 if (It != ExternalUserReorderMap.end()) {
5062 const auto &ExternalUserReorderIndices = It->second;
5063 // If the OpTE vector factor != number of scalars, use the natural order;
5064 // it is an attempt to reorder a node with reused scalars but with
5065 // external uses.
5066 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5067 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5068 ExternalUserReorderIndices.size();
5069 } else {
5070 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5071 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5072 }
5073 // No other useful reorder data in this entry.
5074 if (Order.empty())
5075 continue;
5076 }
5077 // Stores actually store the mask, not the order; we need to invert it.
5078 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5079 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5080 SmallVector<int> Mask;
5081 inversePermutation(Order, Mask);
5082 unsigned E = Order.size();
5083 OrdersType CurrentOrder(E, E);
5084 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5085 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5086 });
5087 fixupOrderingIndices(CurrentOrder);
5088 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5089 } else {
5090 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5091 }
5092 }
5093 if (OrdersUses.empty())
5094 continue;
5095 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5096 const unsigned Sz = Order.size();
5097 for (unsigned Idx : seq<unsigned>(0, Sz))
5098 if (Idx != Order[Idx] && Order[Idx] != Sz)
5099 return false;
5100 return true;
5101 };
5102 // Choose the most used order.
5103 unsigned IdentityCnt = 0;
5104 unsigned FilledIdentityCnt = 0;
5105 OrdersType IdentityOrder(VF, VF);
5106 for (auto &Pair : OrdersUses) {
5107 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5108 if (!Pair.first.empty())
5109 FilledIdentityCnt += Pair.second;
5110 IdentityCnt += Pair.second;
5111 combineOrders(IdentityOrder, Pair.first);
5112 }
5113 }
5114 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5115 unsigned Cnt = IdentityCnt;
5116 for (auto &Pair : OrdersUses) {
5117 // Prefer the identity order. But if a filled identity order (non-empty)
5118 // was found with the same number of uses as the new candidate order, we
5119 // can choose this candidate order.
5120 if (Cnt < Pair.second ||
5121 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5122 Cnt == Pair.second && !BestOrder.empty() &&
5123 IsIdentityOrder(BestOrder))) {
5124 combineOrders(Pair.first, BestOrder);
5125 BestOrder = Pair.first;
5126 Cnt = Pair.second;
5127 } else {
5128 combineOrders(BestOrder, Pair.first);
5129 }
5130 }
5131 // Set order of the user node.
5132 if (IsIdentityOrder(BestOrder))
5133 continue;
5134 fixupOrderingIndices(BestOrder);
5135 SmallVector<int> Mask;
5136 inversePermutation(BestOrder, Mask);
5137 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5138 unsigned E = BestOrder.size();
5139 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5140 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5141 });
5142 // Do an actual reordering, if profitable.
5143 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5144 // Just do the reordering for the nodes with the given VF.
5145 if (TE->Scalars.size() != VF) {
5146 if (TE->ReuseShuffleIndices.size() == VF) {
5147 // Need to reorder the reuses masks of the operands with smaller VF to
5148 // be able to find the match between the graph nodes and scalar
5149 // operands of the given node during vectorization/cost estimation.
5150 assert(all_of(TE->UserTreeIndices,
5151 [VF, &TE](const EdgeInfo &EI) {
5152 return EI.UserTE->Scalars.size() == VF ||
5153 EI.UserTE->Scalars.size() ==
5154 TE->Scalars.size();
5155 }) &&
5156 "All users must be of VF size.");
5157 // Update ordering of the operands with the smaller VF than the given
5158 // one.
5159 reorderNodeWithReuses(*TE, Mask);
5160 }
5161 continue;
5162 }
5163 if ((TE->State == TreeEntry::Vectorize ||
5164 TE->State == TreeEntry::StridedVectorize) &&
5166 InsertElementInst>(TE->getMainOp()) &&
5167 !TE->isAltShuffle()) {
5168 // Build correct orders for extract{element,value}, loads and
5169 // stores.
5170 reorderOrder(TE->ReorderIndices, Mask);
5171 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5172 TE->reorderOperands(Mask);
5173 } else {
5174 // Reorder the node and its operands.
5175 TE->reorderOperands(Mask);
5176 assert(TE->ReorderIndices.empty() &&
5177 "Expected empty reorder sequence.");
5178 reorderScalars(TE->Scalars, Mask);
5179 }
5180 if (!TE->ReuseShuffleIndices.empty()) {
5181 // Apply reversed order to keep the original ordering of the reused
5182 // elements to avoid extra reorder indices shuffling.
5183 OrdersType CurrentOrder;
5184 reorderOrder(CurrentOrder, MaskOrder);
5185 SmallVector<int> NewReuses;
5186 inversePermutation(CurrentOrder, NewReuses);
5187 addMask(NewReuses, TE->ReuseShuffleIndices);
5188 TE->ReuseShuffleIndices.swap(NewReuses);
5189 }
5190 }
5191 }
5192}
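// A simplified standalone sketch of the "vote for the most used order" step
// in reorderTopToBottom above: each candidate order has a use count,
// identity-like candidates (including the empty order) are pooled into one
// bucket, and a non-identity order wins only if it is used strictly more
// often. The merging of partially filled orders (combineOrders) and the
// filled-identity tie-break are omitted; std::map stands in for the DenseMap
// keyed by OrdersType.
#include <map>
#include <vector>

using OrderSketch = std::vector<unsigned>;

static OrderSketch
pickMostUsedOrder(const std::map<OrderSketch, unsigned> &OrdersUses,
                  unsigned VF) {
  auto IsIdentity = [](const OrderSketch &O) {
    for (unsigned Idx = 0; Idx < O.size(); ++Idx)
      if (O[Idx] != Idx && O[Idx] != O.size())
        return false;
    return true;
  };
  unsigned IdentityCnt = 0;
  for (const auto &P : OrdersUses)
    if (P.first.empty() || IsIdentity(P.first))
      IdentityCnt += P.second;
  OrderSketch Best(VF);
  for (unsigned I = 0; I < VF; ++I)
    Best[I] = I;
  unsigned BestCnt = IdentityCnt;
  for (const auto &P : OrdersUses) {
    if (P.first.empty() || IsIdentity(P.first))
      continue;
    if (P.second > BestCnt) {
      Best = P.first;
      BestCnt = P.second;
    }
  }
  // Returning the identity order means "no reordering is worth doing".
  return Best;
}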
5193
5194bool BoUpSLP::canReorderOperands(
5195 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5196 ArrayRef<TreeEntry *> ReorderableGathers,
5197 SmallVectorImpl<TreeEntry *> &GatherOps) {
5198 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5199 if (UserTE->isNonPowOf2Vec())
5200 return false;
5201
5202 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5203 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5204 return OpData.first == I &&
5205 (OpData.second->State == TreeEntry::Vectorize ||
5206 OpData.second->State == TreeEntry::StridedVectorize);
5207 }))
5208 continue;
5209 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5210 // Do not reorder if operand node is used by many user nodes.
5211 if (any_of(TE->UserTreeIndices,
5212 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5213 return false;
5214 // Add the node to the list of the ordered nodes with the identity
5215 // order.
5216 Edges.emplace_back(I, TE);
5217 // Add ScatterVectorize nodes to the list of operands, where just
5218 // reordering of the scalars is required. Similar to the gathers, so
5219 // simply add to the list of gathered ops.
5220 // If there are reused scalars, process this node as a regular vectorize
5221 // node, just reorder reuses mask.
5222 if (TE->State != TreeEntry::Vectorize &&
5223 TE->State != TreeEntry::StridedVectorize &&
5224 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5225 GatherOps.push_back(TE);
5226 continue;
5227 }
5228 TreeEntry *Gather = nullptr;
5229 if (count_if(ReorderableGathers,
5230 [&Gather, UserTE, I](TreeEntry *TE) {
5231 assert(TE->State != TreeEntry::Vectorize &&
5232 TE->State != TreeEntry::StridedVectorize &&
5233 "Only non-vectorized nodes are expected.");
5234 if (any_of(TE->UserTreeIndices,
5235 [UserTE, I](const EdgeInfo &EI) {
5236 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5237 })) {
5238 assert(TE->isSame(UserTE->getOperand(I)) &&
5239 "Operand entry does not match operands.");
5240 Gather = TE;
5241 return true;
5242 }
5243 return false;
5244 }) > 1 &&
5245 !allConstant(UserTE->getOperand(I)))
5246 return false;
5247 if (Gather)
5248 GatherOps.push_back(Gather);
5249 }
5250 return true;
5251}
5252
5253void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5254 SetVector<TreeEntry *> OrderedEntries;
5255 DenseSet<const TreeEntry *> GathersToOrders;
5256 // Find all reorderable leaf nodes with the given VF.
5257 // Currently these are vectorized loads, extracts without alternate
5258 // operands + some gathering of extracts.
5259 SmallVector<TreeEntry *> NonVectorized;
5260 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5261 if (TE->State != TreeEntry::Vectorize &&
5262 TE->State != TreeEntry::StridedVectorize)
5263 NonVectorized.push_back(TE.get());
5264 if (std::optional<OrdersType> CurrentOrder =
5265 getReorderingData(*TE, /*TopToBottom=*/false)) {
5266 OrderedEntries.insert(TE.get());
5267 if (!(TE->State == TreeEntry::Vectorize ||
5268 TE->State == TreeEntry::StridedVectorize) ||
5269 !TE->ReuseShuffleIndices.empty())
5270 GathersToOrders.insert(TE.get());
5271 }
5272 }
5273
5274 // 1. Propagate order to the graph nodes, which use only reordered nodes.
5275 // I.e., if the node has operands that are reordered, try to make at least
5276 // one operand order the natural order and reorder the others + reorder the
5277 // user node itself.
5279 while (!OrderedEntries.empty()) {
5280 // 1. Filter out only reordered nodes.
5281 // 2. If the entry has multiple uses - skip it and jump to the next node.
5283 SmallVector<TreeEntry *> Filtered;
5284 for (TreeEntry *TE : OrderedEntries) {
5285 if (!(TE->State == TreeEntry::Vectorize ||
5286 TE->State == TreeEntry::StridedVectorize ||
5287 (TE->State == TreeEntry::NeedToGather &&
5288 GathersToOrders.contains(TE))) ||
5289 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5290 !all_of(drop_begin(TE->UserTreeIndices),
5291 [TE](const EdgeInfo &EI) {
5292 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5293 }) ||
5294 !Visited.insert(TE).second) {
5295 Filtered.push_back(TE);
5296 continue;
5297 }
5298 // Build a map between user nodes and their operand order to speed up the
5299 // search. The graph currently does not provide this dependency directly.
5300 for (EdgeInfo &EI : TE->UserTreeIndices) {
5301 TreeEntry *UserTE = EI.UserTE;
5302 auto It = Users.find(UserTE);
5303 if (It == Users.end())
5304 It = Users.insert({UserTE, {}}).first;
5305 It->second.emplace_back(EI.EdgeIdx, TE);
5306 }
5307 }
5308 // Erase filtered entries.
5309 for (TreeEntry *TE : Filtered)
5310 OrderedEntries.remove(TE);
5312 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5313 UsersVec(Users.begin(), Users.end());
5314 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5315 return Data1.first->Idx > Data2.first->Idx;
5316 });
5317 for (auto &Data : UsersVec) {
5318 // Check that operands are used only in the User node.
5319 SmallVector<TreeEntry *> GatherOps;
5320 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5321 GatherOps)) {
5322 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5323 OrderedEntries.remove(Op.second);
5324 continue;
5325 }
5326 // All operands are reordered and used only in this node - propagate the
5327 // most used order to the user node.
5330 OrdersUses;
5331 // Do the analysis for each tree entry only once; otherwise the order of
5332 // the same node may be considered several times, though it might not be
5333 // profitable.
5336 for (const auto &Op : Data.second) {
5337 TreeEntry *OpTE = Op.second;
5338 if (!VisitedOps.insert(OpTE).second)
5339 continue;
5340 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5341 continue;
5342 const auto Order = [&]() -> const OrdersType {
5343 if (OpTE->State == TreeEntry::NeedToGather ||
5344 !OpTE->ReuseShuffleIndices.empty())
5345 return getReorderingData(*OpTE, /*TopToBottom=*/false)
5346 .value_or(OrdersType(1));
5347 return OpTE->ReorderIndices;
5348 }();
5349 // The order is partially ordered, skip it in favor of fully non-ordered
5350 // orders.
5351 if (Order.size() == 1)
5352 continue;
5353 unsigned NumOps = count_if(
5354 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5355 return P.second == OpTE;
5356 });
5357 // Stores actually store the mask, not the order; we need to invert it.
5358 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5359 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5360 SmallVector<int> Mask;
5361 inversePermutation(Order, Mask);
5362 unsigned E = Order.size();
5363 OrdersType CurrentOrder(E, E);
5364 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5365 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5366 });
5367 fixupOrderingIndices(CurrentOrder);
5368 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5369 NumOps;
5370 } else {
5371 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5372 }
5373 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5374 const auto AllowsReordering = [&](const TreeEntry *TE) {
5375 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5376 if (TE->isNonPowOf2Vec())
5377 return false;
5378 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5379 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5380 (IgnoreReorder && TE->Idx == 0))
5381 return true;
5382 if (TE->State == TreeEntry::NeedToGather) {
5383 if (GathersToOrders.contains(TE))
5384 return !getReorderingData(*TE, /*TopToBottom=*/false)
5385 .value_or(OrdersType(1))
5386 .empty();
5387 return true;
5388 }
5389 return false;
5390 };
5391 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5392 TreeEntry *UserTE = EI.UserTE;
5393 if (!VisitedUsers.insert(UserTE).second)
5394 continue;
5395 // May reorder user node if it requires reordering, has reused
5396 // scalars, is an alternate op vectorize node or its op nodes require
5397 // reordering.
5398 if (AllowsReordering(UserTE))
5399 continue;
5400 // Check if users allow reordering.
5401 // Currently look up just 1 level of operands to avoid an increase in
5402 // compile time.
5403 // It is profitable to reorder if definitely more operands allow
5404 // reordering than those that keep the natural order.
5406 if (static_cast<unsigned>(count_if(
5407 Ops, [UserTE, &AllowsReordering](
5408 const std::pair<unsigned, TreeEntry *> &Op) {
5409 return AllowsReordering(Op.second) &&
5410 all_of(Op.second->UserTreeIndices,
5411 [UserTE](const EdgeInfo &EI) {
5412 return EI.UserTE == UserTE;
5413 });
5414 })) <= Ops.size() / 2)
5415 ++Res.first->second;
5416 }
5417 }
5418 if (OrdersUses.empty()) {
5419 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5420 OrderedEntries.remove(Op.second);
5421 continue;
5422 }
5423 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5424 const unsigned Sz = Order.size();
5425 for (unsigned Idx : seq<unsigned>(0, Sz))
5426 if (Idx != Order[Idx] && Order[Idx] != Sz)
5427 return false;
5428 return true;
5429 };
5430 // Choose the most used order.
5431 unsigned IdentityCnt = 0;
5432 unsigned VF = Data.second.front().second->getVectorFactor();
5433 OrdersType IdentityOrder(VF, VF);
5434 for (auto &Pair : OrdersUses) {
5435 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5436 IdentityCnt += Pair.second;
5437 combineOrders(IdentityOrder, Pair.first);
5438 }
5439 }
5440 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5441 unsigned Cnt = IdentityCnt;
5442 for (auto &Pair : OrdersUses) {
5443 // Prefer the identity order. But if a filled identity order (non-empty)
5444 // was found with the same number of uses as the new candidate order, we
5445 // can choose this candidate order.
5446 if (Cnt < Pair.second) {
5447 combineOrders(Pair.first, BestOrder);
5448 BestOrder = Pair.first;
5449 Cnt = Pair.second;
5450 } else {
5451 combineOrders(BestOrder, Pair.first);
5452 }
5453 }
5454 // Set order of the user node.
5455 if (IsIdentityOrder(BestOrder)) {
5456 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5457 OrderedEntries.remove(Op.second);
5458 continue;
5459 }
5460 fixupOrderingIndices(BestOrder);
5461 // Erase operands from OrderedEntries list and adjust their orders.
5462 VisitedOps.clear();
5463 SmallVector<int> Mask;
5464 inversePermutation(BestOrder, Mask);
5465 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5466 unsigned E = BestOrder.size();
5467 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5468 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5469 });
5470 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5471 TreeEntry *TE = Op.second;
5472 OrderedEntries.remove(TE);
5473 if (!VisitedOps.insert(TE).second)
5474 continue;
5475 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5476 reorderNodeWithReuses(*TE, Mask);
5477 continue;
5478 }
5479 // Gathers are processed separately.
5480 if (TE->State != TreeEntry::Vectorize &&
5481 TE->State != TreeEntry::StridedVectorize &&
5482 (TE->State != TreeEntry::ScatterVectorize ||
5483 TE->ReorderIndices.empty()))
5484 continue;
5485 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5486 TE->ReorderIndices.empty()) &&
5487 "Non-matching sizes of user/operand entries.");
5488 reorderOrder(TE->ReorderIndices, Mask);
5489 if (IgnoreReorder && TE == VectorizableTree.front().get())
5490 IgnoreReorder = false;
5491 }
5492 // For gathers we just need to reorder their scalars.
5493 for (TreeEntry *Gather : GatherOps) {
5494 assert(Gather->ReorderIndices.empty() &&
5495 "Unexpected reordering of gathers.");
5496 if (!Gather->ReuseShuffleIndices.empty()) {
5497 // Just reorder reuses indices.
5498 reorderReuses(Gather->ReuseShuffleIndices, Mask);
5499 continue;
5500 }
5501 reorderScalars(Gather->Scalars, Mask);
5502 OrderedEntries.remove(Gather);
5503 }
5504 // Reorder operands of the user node and set the ordering for the user
5505 // node itself.
5506 if (Data.first->State != TreeEntry::Vectorize ||
5507 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5508 Data.first->getMainOp()) ||
5509 Data.first->isAltShuffle())
5510 Data.first->reorderOperands(Mask);
5511 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5512 Data.first->isAltShuffle() ||
5513 Data.first->State == TreeEntry::StridedVectorize) {
5514 reorderScalars(Data.first->Scalars, Mask);
5515 reorderOrder(Data.first->ReorderIndices, MaskOrder,
5516 /*BottomOrder=*/true);
5517 if (Data.first->ReuseShuffleIndices.empty() &&
5518 !Data.first->ReorderIndices.empty() &&
5519 !Data.first->isAltShuffle()) {
5520 // Insert user node to the list to try to sink reordering deeper in
5521 // the graph.
5522 OrderedEntries.insert(Data.first);
5523 }
5524 } else {
5525 reorderOrder(Data.first->ReorderIndices, Mask);
5526 }
5527 }
5528 }
5529 // If the reordering is unnecessary, just remove the reorder.
5530 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5531 VectorizableTree.front()->ReuseShuffleIndices.empty())
5532 VectorizableTree.front()->ReorderIndices.clear();
5533}
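// Both reordering passes above repeatedly convert an "order" (for each result
// position, which source element should land there) into a shuffle "mask"
// (for each source element, which position it moves to) and then apply that
// mask. A standalone sketch of this inverse-permutation relation; -1 stands
// in for a poison lane, and the names are illustrative rather than the actual
// LLVM helpers (inversePermutation, reorderScalars).
#include <vector>

static std::vector<int>
inversePermutationSketch(const std::vector<unsigned> &Order) {
  std::vector<int> Mask(Order.size(), -1);
  for (unsigned I = 0; I < Order.size(); ++I)
    Mask[Order[I]] = static_cast<int>(I); // The mask is the inverse of Order.
  return Mask;
}

template <typename T>
static std::vector<T> applyMaskSketch(const std::vector<T> &Scalars,
                                      const std::vector<int> &Mask) {
  std::vector<T> Result(Scalars.size());
  for (unsigned I = 0; I < Scalars.size(); ++I)
    if (Mask[I] != -1)
      Result[Mask[I]] = Scalars[I]; // Scalar I moves to position Mask[I].
  return Result;
}
// With Order = {2, 0, 1}: Mask = {1, 2, 0}, and applying it to {a, b, c}
// yields {c, a, b}, i.e. Result[I] == Scalars[Order[I]].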
5534
5536 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5537 DenseMap<Value *, unsigned> ScalarToExtUses;
5538 // Collect the values that we need to extract from the tree.
5539 for (auto &TEPtr : VectorizableTree) {
5540 TreeEntry *Entry = TEPtr.get();
5541
5542 // No need to handle users of gathered values.
5543 if (Entry->State == TreeEntry::NeedToGather)
5544 continue;
5545
5546 // For each lane:
5547 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5548 Value *Scalar = Entry->Scalars[Lane];
5549 if (!isa<Instruction>(Scalar))
5550 continue;
5551 // Are all uses already replaced? If so, no need to do it again.
5552 auto It = ScalarToExtUses.find(Scalar);
5553 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5554 continue;
5555
5556 // Check if the scalar is externally used as an extra arg.
5557 const auto *ExtI = ExternallyUsedValues.find(Scalar);
5558 if (ExtI != ExternallyUsedValues.end()) {
5559 int FoundLane = Entry->findLaneForValue(Scalar);
5560 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5561 << FoundLane << " from " << *Scalar << ".\n");
5562 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
5563 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5564 continue;
5565 }
5566 for (User *U : Scalar->users()) {
5567 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5568
5569 Instruction *UserInst = dyn_cast<Instruction>(U);
5570 if (!UserInst || isDeleted(UserInst))
5571 continue;
5572
5573 // Ignore users in the user ignore list.
5574 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5575 continue;
5576
5577 // Skip in-tree scalars that become vectors
5578 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5579 // Some in-tree scalars will remain as scalars in vectorized
5580 // instructions. If that is the case, the one in FoundLane will
5581 // be used.
5582 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5584 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5585 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5586 << ".\n");
5587 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
5588 continue;
5589 }
5590 U = nullptr;
5591 if (It != ScalarToExtUses.end()) {
5592 ExternalUses[It->second].User = nullptr;
5593 break;
5594 }
5595 }
5596
5597 int FoundLane = Entry->findLaneForValue(Scalar);
5598 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5599 << " from lane " << FoundLane << " from " << *Scalar
5600 << ".\n");
5601 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
5602 ExternalUses.emplace_back(Scalar, U, FoundLane);
5603 if (!U)
5604 break;
5605 }
5606 }
5607 }
5608}
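// Standalone sketch of the external-use scan in buildExternalUses above: for
// every scalar placed in the tree, any user that is not itself part of the
// tree (and not on an ignore list) is recorded as (scalar, user, lane) so an
// extract can be emitted for it after vectorization. Strings stand in for IR
// values and a set stands in for getTreeEntry(); illustrative only.
#include <map>
#include <set>
#include <string>
#include <vector>

struct ExternalUseSketch {
  std::string Scalar;
  std::string User;
  int Lane;
};

static std::vector<ExternalUseSketch>
collectExternalUses(const std::vector<std::string> &TreeScalars,
                    const std::multimap<std::string, std::string> &Users,
                    const std::set<std::string> &IgnoreList) {
  const std::set<std::string> InTree(TreeScalars.begin(), TreeScalars.end());
  std::vector<ExternalUseSketch> ExternalUses;
  for (int Lane = 0, E = static_cast<int>(TreeScalars.size()); Lane != E;
       ++Lane) {
    const std::string &Scalar = TreeScalars[Lane];
    auto Range = Users.equal_range(Scalar);
    for (auto It = Range.first; It != Range.second; ++It) {
      const std::string &U = It->second;
      // In-tree users become part of the vectorized code; ignored users are
      // skipped as well, mirroring the UserIgnoreList check above.
      if (InTree.count(U) || IgnoreList.count(U))
        continue;
      ExternalUses.push_back({Scalar, U, Lane});
    }
  }
  return ExternalUses;
}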
5609
5611BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5613 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5614 Value *V = TE->Scalars[Lane];
5615 // To save compilation time we don't visit if we have too many users.
5616 if (V->hasNUsesOrMore(UsesLimit))
5617 break;
5618
5619 // Collect stores per pointer object.
5620 for (User *U : V->users()) {
5621 auto *SI = dyn_cast<StoreInst>(U);
5622 if (SI == nullptr || !SI->isSimple() ||
5623 !isValidElementType(SI->getValueOperand()->getType()))
5624 continue;
5625 // Skip the store if it is already part of the tree.
5626 if (getTreeEntry(U))
5627 continue;
5628
5629 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5630 auto &StoresVec = PtrToStoresMap[Ptr];
5631 // For now just keep one store per pointer object per lane.
5632 // TODO: Extend this to support multiple stores per pointer per lane
5633 if (StoresVec.size() > Lane)
5634 continue;
5635 // Skip if in different BBs.
5636 if (!StoresVec.empty() &&
5637 SI->getParent() != StoresVec.back()->getParent())
5638 continue;
5639 // Make sure that the stores are of the same type.
5640 if (!StoresVec.empty() &&
5641 SI->getValueOperand()->getType() !=
5642 StoresVec.back()->getValueOperand()->getType())
5643 continue;
5644 StoresVec.push_back(SI);
5645 }
5646 }
5647 return PtrToStoresMap;
5648}
5649
5650bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5651 OrdersType &ReorderIndices) const {
5652 // We check whether the stores in StoresVec can form a vector by sorting them
5653 // and checking whether they are consecutive.
5654
5655 // To avoid calling getPointersDiff() while sorting we create a vector of
5656 // pairs {store, offset from first} and sort this instead.
5657 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5658 StoreInst *S0 = StoresVec[0];
5659 StoreOffsetVec[0] = {S0, 0};
5660 Type *S0Ty = S0->getValueOperand()->getType();
5661 Value *S0Ptr = S0->getPointerOperand();
5662 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5663 StoreInst *SI = StoresVec[Idx];
5664 std::optional<int> Diff =
5665 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5666 SI->getPointerOperand(), *DL, *SE,
5667 /*StrictCheck=*/true);
5668 // We failed to compare the pointers so just abandon this StoresVec.
5669 if (!Diff)
5670 return false;
5671 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5672 }
5673
5674 // Sort the vector based on the pointers. We create a copy because we may
5675 // need the original later for calculating the reorder (shuffle) indices.
5676 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5677 const std::pair<StoreInst *, int> &Pair2) {
5678 int Offset1 = Pair1.second;
5679 int Offset2 = Pair2.second;
5680 return Offset1 < Offset2;
5681 });
5682
5683 // Check if the stores are consecutive by checking if their difference is 1.
5684 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5685 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5686 return false;
5687
5688 // Calculate the shuffle indices according to their offset against the sorted
5689 // StoreOffsetVec.
5690 ReorderIndices.reserve(StoresVec.size());
5691 for (StoreInst *SI : StoresVec) {
5692 unsigned Idx = find_if(StoreOffsetVec,
5693 [SI](const std::pair<StoreInst *, int> &Pair) {
5694 return Pair.first == SI;
5695 }) -
5696 StoreOffsetVec.begin();
5697 ReorderIndices.push_back(Idx);
5698 }
5699 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
5700 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
5701 // same convention here.
5702 auto IsIdentityOrder = [](const OrdersType &Order) {
5703 for (unsigned Idx : seq<unsigned>(0, Order.size()))
5704 if (Idx != Order[Idx])
5705 return false;
5706 return true;
5707 };
5708 if (IsIdentityOrder(ReorderIndices))
5709 ReorderIndices.clear();
5710
5711 return true;
5712}
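// Standalone sketch of the consecutiveness check in canFormVector above:
// stores are modeled as (id, offset-from-first) pairs. They can form a vector
// only if the sorted offsets increase by exactly one, and the reorder indices
// record where each original store lands in the sorted sequence. Illustrative
// standard C++ only.
#include <algorithm>
#include <utility>
#include <vector>

static bool canFormVectorSketch(std::vector<std::pair<int, int>> StoreOffsets,
                                std::vector<unsigned> &ReorderIndices) {
  const std::vector<std::pair<int, int>> Original = StoreOffsets;
  std::stable_sort(
      StoreOffsets.begin(), StoreOffsets.end(),
      [](const auto &A, const auto &B) { return A.second < B.second; });
  // Consecutive iff each sorted offset is the previous one plus one.
  for (unsigned I = 1; I < StoreOffsets.size(); ++I)
    if (StoreOffsets[I].second != StoreOffsets[I - 1].second + 1)
      return false;
  // ReorderIndices[I] = position of the I-th original store after sorting.
  ReorderIndices.clear();
  for (const auto &S : Original) {
    auto It = std::find(StoreOffsets.begin(), StoreOffsets.end(), S);
    ReorderIndices.push_back(static_cast<unsigned>(It - StoreOffsets.begin()));
  }
  return true;
}
// Example: per-store offsets {0, 2, 1, 3} give ReorderIndices {0, 2, 1, 3}:
// the store with offset 2 lands in position 2 of the sorted vector.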
5713
5714#ifndef NDEBUG
5716 for (unsigned Idx : Order)
5717 dbgs() << Idx << ", ";
5718 dbgs() << "\n";
5719}
5720#endif
5721
5723BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
5724 unsigned NumLanes = TE->Scalars.size();
5725
5727 collectUserStores(TE);
5728
5729 // Holds the reorder indices for each candidate store vector that is a user of
5730 // the current TreeEntry.
5731 SmallVector<OrdersType, 1> ExternalReorderIndices;
5732
5733 // Now inspect the stores collected per pointer and look for vectorization
5734 // candidates. For each candidate calculate the reorder index vector and push
5735 // it into `ExternalReorderIndices`
5736 for (const auto &Pair : PtrToStoresMap) {
5737 auto &StoresVec = Pair.second;
5738 // If we have fewer than NumLanes stores, then we can't form a vector.
5739 if (StoresVec.size() != NumLanes)
5740 continue;
5741
5742 // If the stores are not consecutive then abandon this StoresVec.
5743 OrdersType ReorderIndices;
5744 if (!canFormVector(StoresVec, ReorderIndices))
5745 continue;
5746
5747 // We now know that the scalars in StoresVec can form a vector instruction,
5748 // so set the reorder indices.
5749 ExternalReorderIndices.push_back(ReorderIndices);
5750 }
5751 return ExternalReorderIndices;
5752}
5753
5755 const SmallDenseSet<Value *> &UserIgnoreLst) {
5756 deleteTree();
5757 UserIgnoreList = &UserIgnoreLst;
5758 if (!allSameType(Roots))
5759 return;
5760 buildTree_rec(Roots, 0, EdgeInfo());
5761}
5762
5764 deleteTree();
5765 if (!allSameType(Roots))
5766 return;
5767 buildTree_rec(Roots, 0, EdgeInfo());
5768}
5769
5770/// \return true if the specified list of values has only one instruction that
5771/// requires scheduling, false otherwise.
5772#ifndef NDEBUG
5774 Value *NeedsScheduling = nullptr;
5775 for (Value *V : VL) {
5777 continue;
5778 if (!NeedsScheduling) {
5779 NeedsScheduling = V;
5780 continue;
5781 }
5782 return false;
5783 }
5784 return NeedsScheduling;
5785}
5786#endif
5787
5788 /// Generates a key/subkey pair for the given value to provide effective
5789 /// sorting of the values and better detection of vectorizable value sequences.
5790 /// The keys/subkeys can be used for better sorting of the values themselves
5791 /// (keys) and within value subgroups (subkeys).
5792static std::pair<size_t, size_t> generateKeySubkey(
5793 Value *V, const TargetLibraryInfo *TLI,
5794 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
5795 bool AllowAlternate) {
5796 hash_code Key = hash_value(V->getValueID() + 2);
5797 hash_code SubKey = hash_value(0);
5798 // Sort the loads by the distance between the pointers.
5799 if (auto *LI = dyn_cast<LoadInst>(V)) {
5800 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
5801 if (LI->isSimple())
5802 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
5803 else
5804 Key = SubKey = hash_value(LI);
5805 } else if (isVectorLikeInstWithConstOps(V)) {
5806 // Sort extracts by the vector operands.
5807 if (isa<ExtractElementInst, UndefValue>(V))
5808 Key = hash_value(Value::UndefValueVal + 1);
5809 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
5810 if (!isUndefVector(EI->getVectorOperand()).all() &&
5811 !isa<UndefValue>(EI->getIndexOperand()))
5812 SubKey = hash_value(EI->getVectorOperand());
5813 }
5814 } else if (auto *I = dyn_cast<Instruction>(V)) {
5815 // Sort other instructions just by the opcodes except for CMPInst.
5816 // For CMP also sort by the predicate kind.
5817 if ((isa<BinaryOperator, CastInst>(I)) &&
5818 isValidForAlternation(I->getOpcode())) {
5819 if (AllowAlternate)
5820 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
5821 else
5822 Key = hash_combine(hash_value(I->getOpcode()), Key);
5823 SubKey = hash_combine(
5824 hash_value(I->getOpcode()), hash_value(I->getType()),
5825 hash_value(isa<BinaryOperator>(I)
5826 ? I->getType()
5827 : cast<CastInst>(I)->getOperand(0)->getType()));
5828 // For casts, look through the only operand to improve compile time.
5829 if (isa<CastInst>(I)) {
5830 std::pair<size_t, size_t> OpVals =
5831 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
5832 /*AllowAlternate=*/true);
5833 Key = hash_combine(OpVals.first, Key);
5834 SubKey = hash_combine(OpVals.first, SubKey);
5835 }
5836 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
5837 CmpInst::Predicate Pred = CI->getPredicate();
5838 if (CI->isCommutative())
5839 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
5841 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
5842 hash_value(SwapPred),
5843 hash_value(CI->getOperand(0)->getType()));
5844 } else if (auto *Call = dyn_cast<CallInst>(I)) {
5847 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
5848 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
5849 SubKey = hash_combine(hash_value(I->getOpcode()),
5850 hash_value(Call->getCalledFunction()));
5851 } else {
5852 Key = hash_combine(hash_value(Call), Key);
5853 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
5854 }
5855 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
5856 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
5857 hash_value(Op.Tag), SubKey);
5858 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
5859 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5860 SubKey = hash_value(Gep->getPointerOperand());
5861 else
5862 SubKey = hash_value(Gep);
5863 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
5864 !isa<ConstantInt>(I->getOperand(1))) {
5865 // Do not try to vectorize instructions with potentially high cost.
5866 SubKey = hash_value(I);
5867 } else {
5868 SubKey = hash_value(I->getOpcode());
5869 }
5870 Key = hash_combine(hash_value(I->getParent()), Key);
5871 }
5872 return std::make_pair(Key, SubKey);
5873}
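// Standalone sketch of the key/subkey idea in generateKeySubkey above: a
// coarse key groups values that may belong to the same candidate bundle
// (e.g. same opcode and parent block), and a finer subkey splits each group
// further (e.g. by operand type or pointer base). Hashing strings stands in
// for hashing IR properties; all names here are illustrative.
#include <cstddef>
#include <functional>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct ValueDesc {
  std::string Opcode;    // e.g. "add", "load".
  std::string Block;     // Parent basic block name.
  std::string OperandTy; // Type of the first operand.
};

static std::pair<std::size_t, std::size_t> keySubkey(const ValueDesc &V) {
  std::hash<std::string> H;
  std::size_t Key = H(V.Opcode) ^ (H(V.Block) << 1);
  std::size_t SubKey = H(V.OperandTy);
  return {Key, SubKey};
}

// Values with the same key land in one group and are then split by subkey,
// which is the kind of two-level grouping the pair above enables.
static std::map<std::pair<std::size_t, std::size_t>, std::vector<ValueDesc>>
bucketValues(const std::vector<ValueDesc> &Values) {
  std::map<std::pair<std::size_t, std::size_t>, std::vector<ValueDesc>> Buckets;
  for (const ValueDesc &V : Values)
    Buckets[keySubkey(V)].push_back(V);
  return Buckets;
}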
5874
5875/// Checks if the specified instruction \p I is an alternate operation for
5876/// the given \p MainOp and \p AltOp instructions.
5877static bool isAlternateInstruction(const Instruction *I,
5878 const Instruction *MainOp,
5879 const Instruction *AltOp,
5880 const TargetLibraryInfo &TLI);
5881
5882bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
5883 ArrayRef<Value *> VL) const {
5884 unsigned Opcode0 = S.getOpcode();
5885 unsigned Opcode1 = S.getAltOpcode();
5886 // The opcode mask selects between the two opcodes.
5887 SmallBitVector OpcodeMask(VL.size(), false);
5888 for (unsigned Lane : seq<unsigned>(0, VL.size()))
5889 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
5890 OpcodeMask.set(Lane);
5891 // If this pattern is supported by the target then consider it profitable.
5892 if (TTI->isLegalAltInstr(FixedVectorType::get(S.MainOp->getType(), VL.size()),
5893 Opcode0, Opcode1, OpcodeMask))
5894 return true;
5896 for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
5897 Operands.emplace_back();
5898 // Prepare the operand vector.
5899 for (Value *V : VL)
5900 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
5901 }
5902 if (Operands.size() == 2) {
5903 // Try to find the best operand candidates.
5904 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
5906 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
5907 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
5908 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
5909 std::optional<int> Res = findBestRootPair(Candidates);
5910 switch (Res.value_or(0)) {
5911 case 0:
5912 break;
5913 case 1:
5914 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
5915 break;
5916 case 2:
5917 std::swap(Operands[0][I], Operands[1][I]);
5918 break;
5919 default:
5920 llvm_unreachable("Unexpected index.");
5921 }
5922 }
5923 }
5924 DenseSet<unsigned> UniqueOpcodes;
5925 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
5926 unsigned NonInstCnt = 0;
5927 // Estimate the number of instructions required for the vectorized node and
5928 // for the buildvector node.
5929 unsigned UndefCnt = 0;
5930 // Count the number of extra shuffles required for vector nodes.
5931 unsigned ExtraShuffleInsts = 0;
5932 // Check that the operands do not contain the same values and form either a
5933 // perfect diamond match or a shuffled match.
5934 if (Operands.size() == 2) {
5935 // Do not count same operands twice.
5936 if (Operands.front() == Operands.back()) {
5937 Operands.erase(Operands.begin());
5938 } else if (!allConstant(Operands.front()) &&
5939 all_of(Operands.front(), [&](Value *V) {
5940 return is_contained(Operands.back(), V);
5941 })) {
5942 Operands.erase(Operands.begin());
5943 ++ExtraShuffleInsts;
5944 }
5945 }
5946 const Loop *L = LI->getLoopFor(S.MainOp->getParent());
5947 // Vectorize the node if:
5948 // 1. At least a single operand is constant or a splat.
5949 // 2. Operands have many loop invariants (the instructions are not loop
5950 // invariants).
5951 // 3. At least a single unique operand is supposed to be vectorized.
5952 return none_of(Operands,
5953 [&](ArrayRef<Value *> Op) {
5954 if (allConstant(Op) ||
5955 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
5956 getSameOpcode(Op, *TLI).MainOp))
5957 return false;
5959 for (Value *V : Op) {
5960 if (isa<Constant, ExtractElementInst>(V) ||
5961 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
5962 if (isa<UndefValue>(V))
5963 ++UndefCnt;
5964 continue;
5965 }
5966 auto Res = Uniques.try_emplace(V, 0);
5967 // Found first duplicate - need to add shuffle.
5968 if (!Res.second && Res.first->second == 1)
5969 ++ExtraShuffleInsts;
5970 ++Res.first->getSecond();
5971 if (auto *I = dyn_cast<Instruction>(V))
5972 UniqueOpcodes.insert(I->getOpcode());
5973 else if (Res.second)
5974 ++NonInstCnt;
5975 }
5976 return none_of(Uniques, [&](const auto &P) {
5977 return P.first->hasNUsesOrMore(P.second + 1) &&
5978 none_of(P.first->users(), [&](User *U) {
5979 return getTreeEntry(U) || Uniques.contains(U);
5980 });
5981 });
5982 }) ||
5983 // Do not vectorize the node if the estimated number of vector instructions
5984 // is more than the estimated number of buildvector instructions. The number
5985 // of vector operands is the number of vector instructions + the number of
5986 // vector instructions for the operands (buildvectors). The number of
5987 // buildvector instructions is just number_of_operands * number_of_scalars.
5988 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
5989 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
5990 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
5991}
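// Standalone sketch of the opcode-mask construction used above (both here and
// in reorderTopToBottom): each lane is marked when it executes the alternate
// opcode, producing the per-lane bit mask that the target hook
// (isLegalAltInstr in the real code) is queried with. std::vector<bool>
// stands in for SmallBitVector; illustrative only.
#include <string>
#include <vector>

static std::vector<bool>
buildAltOpcodeMask(const std::vector<std::string> &LaneOpcodes,
                   const std::string &AltOpcode) {
  std::vector<bool> OpcodeMask(LaneOpcodes.size(), false);
  for (unsigned Lane = 0; Lane < LaneOpcodes.size(); ++Lane)
    if (LaneOpcodes[Lane] == AltOpcode)
      OpcodeMask[Lane] = true; // This lane executes the alternate opcode.
  return OpcodeMask;
}
// Example: {"fadd", "fsub", "fadd", "fsub"} with AltOpcode "fsub" gives
// {false, true, false, true} - the pattern x86 can lower to addsub.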
5992
5993BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
5994 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
5995 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
5996 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
5997
5998 unsigned ShuffleOrOp =
5999 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
6000 auto *VL0 = cast<Instruction>(S.OpValue);
6001 switch (ShuffleOrOp) {
6002 case Instruction::PHI: {
6003 // Check for terminator values (e.g. invoke).
6004 for (Value *V : VL)
6005 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
6006 Instruction *Term = dyn_cast<Instruction>(Incoming);
6007 if (Term && Term->isTerminator()) {
6009 << "SLP: Need to swizzle PHINodes (terminator use).\n");
6010 return TreeEntry::NeedToGather;
6011 }
6012 }
6013
6014 return TreeEntry::Vectorize;
6015 }
6016 case Instruction::ExtractValue:
6017 case Instruction::ExtractElement: {
6018 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6019 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6020 if (!isPowerOf2_32(VL.size()))
6021 return TreeEntry::NeedToGather;
6022 if (Reuse || !CurrentOrder.empty())
6023 return TreeEntry::Vectorize;
6024 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6025 return TreeEntry::NeedToGather;
6026 }
6027 case Instruction::InsertElement: {
6028 // Check that we have a buildvector and not a shuffle of 2 or more
6029 // different vectors.
6030 ValueSet SourceVectors;
6031 for (Value *V : VL) {
6032 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
6033 assert(getInsertIndex(V) != std::nullopt &&
6034 "Non-constant or undef index?");
6035 }
6036
6037 if (count_if(VL, [&SourceVectors](Value *V) {
6038 return !SourceVectors.contains(V);
6039 }) >= 2) {
6040 // Found 2nd source vector - cancel.
6041 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6042 "different source vectors.\n");
6043 return TreeEntry::NeedToGather;
6044 }
6045
6046 return TreeEntry::Vectorize;
6047 }
6048 case Instruction::Load: {
6049 // Check that a vectorized load would load the same memory as a scalar
6050 // load. For example, we don't want to vectorize loads that are smaller
6051 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6052 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6053 // from such a struct, we read/write packed bits disagreeing with the
6054 // unvectorized version.
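// Hypothetical IR sketch of the issue described above: an i2 scalar has a
// 2-bit type size but an 8-bit alloc size, so
//   %x0 = load i2, ptr %p0      ; reads (part of) byte 0
//   %x1 = load i2, ptr %p1      ; reads (part of) byte 1
// while a single `load <4 x i2>` from %p0 would read just 8 bits, i.e.
// different memory than the scalar code.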
6055 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
6056 case LoadsState::Vectorize:
6057 return TreeEntry::Vectorize;
6058 case LoadsState::ScatterVectorize:
6059 return TreeEntry::ScatterVectorize;
6060 case LoadsState::StridedVectorize:
6061 return TreeEntry::StridedVectorize;
6062 case LoadsState::Gather:
6063#ifndef NDEBUG
6064 Type *ScalarTy = VL0->getType();
6065 if (DL->getTypeSizeInBits(ScalarTy) !=
6066 DL->getTypeAllocSizeInBits(ScalarTy))
6067 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6068 else if (any_of(VL,
6069 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6070 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6071 else
6072 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6073#endif // NDEBUG
6074 return TreeEntry::NeedToGather;
6075 }
6076 llvm_unreachable("Unexpected state of loads");
6077 }
6078 case Instruction::ZExt:
6079 case Instruction::SExt:
6080 case Instruction::FPToUI:
6081 case Instruction::FPToSI:
6082 case Instruction::FPExt:
6083 case Instruction::PtrToInt:
6084 case Instruction::IntToPtr:
6085 case Instruction::SIToFP:
6086 case Instruction::UIToFP:
6087 case Instruction::Trunc:
6088 case Instruction::FPTrunc:
6089 case Instruction::BitCast: {
6090 Type *SrcTy = VL0->getOperand(0)->getType();
6091 for (Value *V : VL) {
6092 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6093 if (Ty != SrcTy || !isValidElementType(Ty)) {
6094 LLVM_DEBUG(
6095 dbgs() << "SLP: Gathering casts with different src types.\n");
6096 return TreeEntry::NeedToGather;
6097 }
6098 }
6099 return TreeEntry::Vectorize;
6100 }
6101 case Instruction::ICmp:
6102 case Instruction::FCmp: {
6103 // Check that all of the compares have the same predicate.
6104 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6105 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
6106 Type *ComparedTy = VL0->getOperand(0)->getType();
6107 for (Value *V : VL) {
6108 CmpInst *Cmp = cast<CmpInst>(V);
6109 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6110 Cmp->getOperand(0)->getType() != ComparedTy) {
6111 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6112 return TreeEntry::NeedToGather;
6113 }
6114 }
6115 return TreeEntry::Vectorize;
6116 }
6117 case Instruction::Select:
6118 case Instruction::FNeg:
6119 case Instruction::Add:
6120 case Instruction::FAdd:
6121 case Instruction::Sub:
6122 case Instruction::FSub:
6123 case Instruction::Mul:
6124 case Instruction::FMul:
6125 case Instruction::UDiv:
6126 case Instruction::SDiv:
6127 case Instruction::FDiv:
6128 case Instruction::URem:
6129 case Instruction::SRem:
6130 case Instruction::FRem:
6131 case Instruction::Shl:
6132 case Instruction::LShr:
6133 case Instruction::AShr:
6134 case Instruction::And:
6135 case Instruction::Or:
6136 case Instruction::Xor:
6137 return TreeEntry::Vectorize;
6138 case Instruction::GetElementPtr: {
6139 // We don't combine GEPs with complicated (nested) indexing.
6140 for (Value *V : VL) {
6141 auto *I = dyn_cast<GetElementPtrInst>(V);
6142 if (!I)
6143 continue;
6144 if (I->getNumOperands() != 2) {
6145 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6146 return TreeEntry::NeedToGather;
6147 }
6148 }
6149
6150 // We can't combine several GEPs into one vector if they operate on
6151 // different types.
6152 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6153 for (Value *V : VL) {
6154 auto *GEP = dyn_cast<GEPOperator>(V);
6155 if (!GEP)
6156 continue;
6157 Type *CurTy = GEP->getSourceElementType();
6158 if (Ty0 != CurTy) {
6159 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6160 return TreeEntry::NeedToGather;
6161 }
6162 }
6163
6164 // We don't combine GEPs with non-constant indexes.
6165 Type *Ty1 = VL0->getOperand(1)->getType();
6166 for (Value *V : VL) {
6167 auto *I = dyn_cast<GetElementPtrInst>(V);
6168 if (!I)
6169 continue;
6170 auto *Op = I->getOperand(1);
6171 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6172 (Op->getType() != Ty1 &&
6173 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6174 Op->getType()->getScalarSizeInBits() >
6175 DL->getIndexSizeInBits(
6176 V->getType()->getPointerAddressSpace())))) {
6177 LLVM_DEBUG(
6178 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6179 return TreeEntry::NeedToGather;
6180 }
6181 }
6182
6183 return TreeEntry::Vectorize;
6184 }
6185 case Instruction::Store: {
6186 // Check if the stores are consecutive or if we need to swizzle them.
6187 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6188 // Avoid types that are padded when being allocated as scalars, while
6189 // being packed together in a vector (such as i1).
6190 if (DL->getTypeSizeInBits(ScalarTy) !=
6191 DL->getTypeAllocSizeInBits(ScalarTy)) {
6192 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6193 return TreeEntry::NeedToGather;
6194 }
6195 // Make sure all stores in the bundle are simple - we can't vectorize
6196 // atomic or volatile stores.
6197 for (Value *V : VL) {
6198 auto *SI = cast<StoreInst>(V);
6199 if (!SI->isSimple()) {
6200 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6201 return TreeEntry::NeedToGather;
6202 }
6203 PointerOps.push_back(SI->getPointerOperand());
6204 }
6205
6206 // Check the order of pointer operands.
6207 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
6208 Value *Ptr0;
6209 Value *PtrN;
6210 if (CurrentOrder.empty()) {
6211 Ptr0 = PointerOps.front();
6212 PtrN = PointerOps.back();
6213 } else {
6214 Ptr0 = PointerOps[CurrentOrder.front()];
6215 PtrN = PointerOps[CurrentOrder.back()];
6216 }
6217 std::optional<int> Dist =
6218 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6219 // Check that the sorted pointer operands are consecutive.
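// E.g. (illustrative): for four consecutive i32 stores, the distance in
// elements between the first and the last sorted pointer is 3 == VL.size() - 1.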
6220 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6221 return TreeEntry::Vectorize;
6222 }
6223
6224 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6225 return TreeEntry::NeedToGather;
6226 }
6227 case Instruction::Call: {
6228 // Check if the calls are all to the same vectorizable intrinsic or
6229 // library function.
6230 CallInst *CI = cast<CallInst>(VL0);
6231 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6232
6233 VFShape Shape = VFShape::get(
6234 CI->getFunctionType(),
6235 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
6236 false /*HasGlobalPred*/);
6237 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6238
6239 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6240 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6241 return TreeEntry::NeedToGather;
6242 }
6243 Function *F = CI->getCalledFunction();
6244 unsigned NumArgs = CI->arg_size();
6245 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6246 for (unsigned J = 0; J != NumArgs; ++J)
6247 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
6248 ScalarArgs[J] = CI->getArgOperand(J);
6249 for (Value *V : VL) {
6250 CallInst *CI2 = dyn_cast<CallInst>(V);
6251 if (!CI2 || CI2->getCalledFunction() != F ||
6252 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
6253 (VecFunc &&
6254 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6255 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
6256 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6257 << "\n");
6258 return TreeEntry::NeedToGather;
6259 }
6260 // Some intrinsics have scalar arguments and should be same in order for
6261 // them to be vectorized.
6262 for (unsigned J = 0; J != NumArgs; ++J) {
6263 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
6264 Value *A1J = CI2->getArgOperand(J);
6265 if (ScalarArgs[J] != A1J) {
6266 LLVM_DEBUG(dbgs()
6267 << "SLP: mismatched arguments in call:" << *CI
6268 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6269 return TreeEntry::NeedToGather;
6270 }
6271 }
6272 }
6273 // Verify that the bundle operands are identical between the two calls.
6274 if (CI->hasOperandBundles() &&
6275 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6276 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6277 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6278 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6279 << "!=" << *V << '\n');
6280 return TreeEntry::NeedToGather;
6281 }
6282 }
6283
6284 return TreeEntry::Vectorize;
6285 }
6286 case Instruction::ShuffleVector: {
6287 // If this is not an alternate sequence of opcode like add-sub
6288 // then do not vectorize this instruction.
6289 if (!S.isAltShuffle()) {
6290 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6291 return TreeEntry::NeedToGather;
6292 }
6293 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6294 LLVM_DEBUG(
6295 dbgs()
6296 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6297 "the whole alt sequence is not profitable.\n");
6298 return TreeEntry::NeedToGather;
6299 }
6300
6301 return TreeEntry::Vectorize;
6302 }
6303 default:
6304 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6305 return TreeEntry::NeedToGather;
6306 }
6307}
6308
6309void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6310 const EdgeInfo &UserTreeIdx) {
6311 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6312
6313 SmallVector<int> ReuseShuffleIndicies;
6314 SmallVector<Value *> UniqueValues;
6315 SmallVector<Value *> NonUniqueValueVL;
6316 auto TryToFindDuplicates = [&](const InstructionsState &S,
6317 bool DoNotFail = false) {
6318 // Check that every instruction appears once in this bundle.
6319 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6320 for (Value *V : VL) {
6321 if (isConstant(V)) {
6322 ReuseShuffleIndicies.emplace_back(
6323 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6324 UniqueValues.emplace_back(V);
6325 continue;
6326 }
6327 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6328 ReuseShuffleIndicies.emplace_back(Res.first->second);
6329 if (Res.second)
6330 UniqueValues.emplace_back(V);
6331 }
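// E.g. (illustrative): for VL = {a, b, a, c} this produces
// UniqueValues = {a, b, c} and ReuseShuffleIndicies = {0, 1, 0, 2}.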
6332 size_t NumUniqueScalarValues = UniqueValues.size();
6333 if (NumUniqueScalarValues == VL.size()) {
6334 ReuseShuffleIndicies.clear();
6335 } else {
6336 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6337 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6338 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6339 "for nodes with padding.\n");
6340 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6341 return false;
6342 }
6343 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6344 if (NumUniqueScalarValues <= 1 ||
6345 (UniquePositions.size() == 1 && all_of(UniqueValues,
6346 [](Value *V) {
6347 return isa<UndefValue>(V) ||
6348 !isConstant(V);
6349 })) ||
6350 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6351 if (DoNotFail && UniquePositions.size() > 1 &&
6352 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6353 all_of(UniqueValues, [=](Value *V) {
6354 return isa<ExtractElementInst>(V) ||
6355 areAllUsersVectorized(cast<Instruction>(V),
6356 UserIgnoreList);
6357 })) {
6358 unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6359 if (PWSz == VL.size()) {
6360 ReuseShuffleIndicies.clear();
6361 } else {
6362 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6363 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6364 UniqueValues.back());
6365 VL = NonUniqueValueVL;
6366 }
6367 return true;
6368 }
6369 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6370 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6371 return false;
6372 }
6373 VL = UniqueValues;
6374 }
6375 return true;
6376 };
6377
6378 InstructionsState S = getSameOpcode(VL, *TLI);
6379
6380 // Don't vectorize ephemeral values.
6381 if (!EphValues.empty()) {
6382 for (Value *V : VL) {
6383 if (EphValues.count(V)) {
6384 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6385 << ") is ephemeral.\n");
6386 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6387 return;
6388 }
6389 }
6390 }
6391
6392 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6393 // a load), in which case peek through to include it in the tree, without
6394 // ballooning over-budget.
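// E.g. (illustrative): at the depth limit, a 4-wide (or wider) bundle of
// extended loads such as
//   %e0 = sext i8 %l0 to i32   ; %l0 = load i8, ptr %p0
// still matches the load / extend-of-load exception below and is included.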
6395 if (Depth >= RecursionMaxDepth &&
6396 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6397 VL.size() >= 4 &&
6398 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6399 return match(I,
6400 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
6401 cast<Instruction>(I)->getOpcode() ==
6402 cast<Instruction>(S.MainOp)->getOpcode();
6403 })))) {
6404 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6405 if (TryToFindDuplicates(S))
6406 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6407 ReuseShuffleIndicies);
6408 return;
6409 }
6410
6411 // Don't handle scalable vectors
6412 if (S.getOpcode() == Instruction::ExtractElement &&
6413 isa<ScalableVectorType>(
6414 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6415 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6416 if (TryToFindDuplicates(S))
6417 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6418 ReuseShuffleIndicies);
6419 return;
6420 }
6421
6422 // Don't handle vectors.
6423 if (S.OpValue->getType()->isVectorTy() &&
6424 !isa<InsertElementInst>(S.OpValue)) {
6425 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6426 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6427 return;
6428 }
6429
6430 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6431 if (SI->getValueOperand()->getType()->isVectorTy()) {
6432 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6433 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6434 return;
6435 }
6436
6437 // If all of the operands are identical or constant we have a simple solution.
6438 // If we deal with insert/extract instructions, they all must have constant
6439 // indices, otherwise we should gather them, not try to vectorize.
6440 // If alternate op node with 2 elements with gathered operands - do not
6441 // vectorize.
6442 auto &&NotProfitableForVectorization = [&S, this,
6443 Depth](ArrayRef<Value *> VL) {
6444 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6445 return false;
6446 if (VectorizableTree.size() < MinTreeSize)
6447 return false;
6448 if (Depth >= RecursionMaxDepth - 1)
6449 return true;
6450 // Check if all operands are extracts, part of vector node or can build a
6451 // regular vectorize node.
6452 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6453 for (Value *V : VL) {
6454 auto *I = cast<Instruction>(V);
6455 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6456 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6457 }));
6458 }
6459 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6460 if ((IsCommutative &&
6461 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6462 (!IsCommutative &&
6463 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6464 return true;
6465 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6466 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6467 auto *I1 = cast<Instruction>(VL.front());
6468 auto *I2 = cast<Instruction>(VL.back());
6469 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6470 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6471 I2->getOperand(Op));
6472 if (static_cast<unsigned>(count_if(
6473 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6474 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6475 })) >= S.MainOp->getNumOperands() / 2)
6476 return false;
6477 if (S.MainOp->getNumOperands() > 2)
6478 return true;
6479 if (IsCommutative) {
6480 // Check permuted operands.
6481 Candidates.clear();
6482 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6483 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6484 I2->getOperand((Op + 1) % E));
6485 if (any_of(
6486 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6487 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6488 }))
6489 return false;
6490 }
6491 return true;
6492 };
6493 SmallVector<unsigned> SortedIndices;
6494 BasicBlock *BB = nullptr;
6495 bool IsScatterVectorizeUserTE =
6496 UserTreeIdx.UserTE &&
6497 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6498 bool AreAllSameInsts =
6499 (S.getOpcode() && allSameBlock(VL)) ||
6500 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6501 VL.size() > 2 &&
6502 all_of(VL,
6503 [&BB](Value *V) {
6504 auto *I = dyn_cast<GetElementPtrInst>(V);
6505 if (!I)
6506 return doesNotNeedToBeScheduled(V);
6507 if (!BB)
6508 BB = I->getParent();
6509 return BB == I->getParent() && I->getNumOperands() == 2;
6510 }) &&
6511 BB &&
6512 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6513 SortedIndices));
6514 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6515 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6516 S.OpValue) &&
6517 !all_of(VL, isVectorLikeInstWithConstOps)) ||
6518 NotProfitableForVectorization(VL)) {
6519 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6520 if (TryToFindDuplicates(S))
6521 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6522 ReuseShuffleIndicies);
6523 return;
6524 }
6525
6526 // We now know that this is a vector of instructions of the same type from
6527 // the same block.
6528
6529 // Check if this is a duplicate of another entry.
6530 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6531 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6532 if (!E->isSame(VL)) {
6533 auto It = MultiNodeScalars.find(S.OpValue);
6534 if (It != MultiNodeScalars.end()) {
6535 auto *TEIt = find_if(It->getSecond(),
6536 [&](TreeEntry *ME) { return ME->isSame(VL); });
6537 if (TEIt != It->getSecond().end())
6538 E = *TEIt;
6539 else
6540 E = nullptr;
6541 } else {
6542 E = nullptr;
6543 }
6544 }
6545 if (!E) {
6546 if (!doesNotNeedToBeScheduled(S.OpValue)) {
6547 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6548 if (TryToFindDuplicates(S))
6549 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6550 ReuseShuffleIndicies);
6551 return;
6552 }
6553 } else {
6554 // Record the reuse of the tree node. FIXME, currently this is only used
6555 // to properly draw the graph rather than for the actual vectorization.
6556 E->UserTreeIndices.push_back(UserTreeIdx);
6557 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6558 << ".\n");
6559 return;
6560 }
6561 }
6562
6563 // Check that none of the instructions in the bundle are already in the tree.
6564 for (Value *V : VL) {
6565 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6566 doesNotNeedToBeScheduled(V))
6567 continue;
6568 if (getTreeEntry(V)) {
6569 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6570 << ") is already in tree.\n");
6571 if (TryToFindDuplicates(S))
6572 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6573 ReuseShuffleIndicies);
6574 return;
6575 }
6576 }
6577
6578 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6579 if (UserIgnoreList && !UserIgnoreList->empty()) {
6580 for (Value *V : VL) {
6581 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6582 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6583 if (TryToFindDuplicates(S))
6584 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6585 ReuseShuffleIndicies);
6586 return;
6587 }
6588 }
6589 }
6590
6591 // Special processing for sorted pointers for ScatterVectorize node with
6592 // constant indices only.
6593 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6594 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6595 !(S.getOpcode() && allSameBlock(VL))) {
6596 assert(S.OpValue->getType()->isPointerTy() &&
6597 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6598 "Expected pointers only.");
6599 // Reset S to make it GetElementPtr kind of node.
6600 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
6601 assert(It != VL.end() && "Expected at least one GEP.");
6602 S = getSameOpcode(*It, *TLI);
6603 }
6604
6605 // Check that all of the users of the scalars that we want to vectorize are
6606 // schedulable.
6607 auto *VL0 = cast<Instruction>(S.OpValue);
6608 BB = VL0->getParent();
6609
6610 if (!DT->isReachableFromEntry(BB)) {
6611 // Don't go into unreachable blocks. They may contain instructions with
6612 // dependency cycles which confuse the final scheduling.
6613 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6614 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6615 return;
6616 }
6617
6618 // Don't go into catchswitch blocks, which can happen with PHIs.
6619 // Such blocks can only have PHIs and the catchswitch. There is no
6620 // place to insert a shuffle if we need to, so just avoid that issue.
6621 if (isa<CatchSwitchInst>(BB->getTerminator())) {
6622 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
6623 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6624 return;
6625 }
6626
6627 // Check that every instruction appears once in this bundle.
6628 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
6629 return;
6630
6631 // Perform specific checks for each particular instruction kind.
6632 OrdersType CurrentOrder;
6633 SmallVector<Value *> PointerOps;
6634 TreeEntry::EntryState State = getScalarsVectorizationState(
6635 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6636 if (State == TreeEntry::NeedToGather) {
6637 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6638 ReuseShuffleIndicies);
6639 return;
6640 }
6641
6642 auto &BSRef = BlocksSchedules[BB];
6643 if (!BSRef)
6644 BSRef = std::make_unique<BlockScheduling>(BB);
6645
6646 BlockScheduling &BS = *BSRef;
6647
6648 std::optional<ScheduleData *> Bundle =
6649 BS.tryScheduleBundle(UniqueValues, this, S);
6650#ifdef EXPENSIVE_CHECKS
6651 // Make sure we didn't break any internal invariants
6652 BS.verify();
6653#endif
6654 if (!Bundle) {
6655 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
6656 assert((!BS.getScheduleData(VL0) ||
6657 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6658 "tryScheduleBundle should cancelScheduling on failure");
6659 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6660 ReuseShuffleIndicies);
6661 NonScheduledFirst.insert(VL.front());
6662 return;
6663 }
6664 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
6665
6666 unsigned ShuffleOrOp = S.isAltShuffle() ?
6667 (unsigned) Instruction::ShuffleVector : S.getOpcode();
6668 switch (ShuffleOrOp) {
6669 case Instruction::PHI: {
6670 auto *PH = cast<PHINode>(VL0);
6671
6672 TreeEntry *TE =
6673 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
6674 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
6675
6676 // Keeps the reordered operands to avoid code duplication.
6677 SmallVector<ValueList, 2> OperandsVec;
6678 for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
6679 if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
6680 ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));
6681 TE->setOperand(I, Operands);
6682 OperandsVec.push_back(Operands);
6683 continue;
6684 }
6685 ValueList Operands;
6686 // Prepare the operand vector.
6687 for (Value *V : VL)
6688 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
6689 PH->getIncomingBlock(I)));
6690 TE->setOperand(I, Operands);
6691 OperandsVec.push_back(Operands);
6692 }
6693 for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
6694 buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
6695 return;
6696 }
6697 case Instruction::ExtractValue:
6698 case Instruction::ExtractElement: {
6699 if (CurrentOrder.empty()) {
6700 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
6701 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6702 ReuseShuffleIndicies);
6703 // This is a special case, as it does not gather, but at the same time
6704 // we are not extending buildTree_rec() towards the operands.
6705 ValueList Op0;
6706 Op0.assign(VL.size(), VL0->getOperand(0));
6707 VectorizableTree.back()->setOperand(0, Op0);
6708 return;
6709 }
6710 LLVM_DEBUG({
6711 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
6712 "with order";
6713 for (unsigned Idx : CurrentOrder)
6714 dbgs() << " " << Idx;
6715 dbgs() << "\n";
6716 });
6717 fixupOrderingIndices(CurrentOrder);
6718 // Insert new order with initial value 0, if it does not exist,
6719 // otherwise return the iterator to the existing one.
6720 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6721 ReuseShuffleIndicies, CurrentOrder);
6722 // This is a special case, as it does not gather, but at the same time
6723 // we are not extending buildTree_rec() towards the operands.
6724 ValueList Op0;
6725 Op0.assign(VL.size(), VL0->getOperand(0));
6726 VectorizableTree.back()->setOperand(0, Op0);
6727 return;
6728 }
6729 case Instruction::InsertElement: {
6730 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
6731
6732 auto OrdCompare = [](const std::pair<int, int> &P1,
6733 const std::pair<int, int> &P2) {
6734 return P1.first > P2.first;
6735 };
6736 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
6737 decltype(OrdCompare)>
6738 Indices(OrdCompare);
6739 for (int I = 0, E = VL.size(); I < E; ++I) {
6740 unsigned Idx = *getInsertIndex(VL[I]);
6741 Indices.emplace(Idx, I);
6742 }
6743 OrdersType CurrentOrder(VL.size(), VL.size());
6744 bool IsIdentity = true;
6745 for (int I = 0, E = VL.size(); I < E; ++I) {
6746 CurrentOrder[Indices.top().second] = I;
6747 IsIdentity &= Indices.top().second == I;
6748 Indices.pop();
6749 }
6750 if (IsIdentity)
6751 CurrentOrder.clear();
6752 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6753 std::nullopt, CurrentOrder);
6754 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
6755
6756 constexpr int NumOps = 2;
6757 ValueList VectorOperands[NumOps];
6758 for (int I = 0; I < NumOps; ++I) {
6759 for (Value *V : VL)
6760 VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
6761
6762 TE->setOperand(I, VectorOperands[I]);
6763 }
6764 buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
6765 return;
6766 }
6767 case Instruction::Load: {
6768 // Check that a vectorized load would load the same memory as a scalar
6769 // load. For example, we don't want to vectorize loads that are smaller
6770 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6771 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6772 // from such a struct, we read/write packed bits disagreeing with the
6773 // unvectorized version.
6774 TreeEntry *TE = nullptr;
6775 fixupOrderingIndices(CurrentOrder);
6776 switch (State) {
6777 case TreeEntry::Vectorize:
6778 if (CurrentOrder.empty()) {
6779 // Original loads are consecutive and do not require reordering.
6780 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6781 ReuseShuffleIndicies);
6782 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
6783 } else {
6784 // Need to reorder.
6785 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6786 ReuseShuffleIndicies, CurrentOrder);
6787 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
6788 }
6789 TE->setOperandsInOrder();
6790 break;
6791 case TreeEntry::StridedVectorize:
6792 // Vectorizing non-consecutive loads with a constant stride as strided loads.
6793 if (CurrentOrder.empty()) {
6794 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6795 UserTreeIdx, ReuseShuffleIndicies);
6796 } else {
6797 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6798 UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
6799 }
6800 TE->setOperandsInOrder();
6801 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
6802 break;
6803 case TreeEntry::ScatterVectorize:
6804 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
6805 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6806 UserTreeIdx, ReuseShuffleIndicies);
6807 TE->setOperandsInOrder();
6808 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
6809 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
6810 break;
6811 case TreeEntry::NeedToGather:
6812 llvm_unreachable("Unexpected loads state.");
6813 }
6814 return;
6815 }
6816 case Instruction::ZExt:
6817 case Instruction::SExt:
6818 case Instruction::FPToUI:
6819 case Instruction::FPToSI:
6820 case Instruction::FPExt:
6821 case Instruction::PtrToInt:
6822 case Instruction::IntToPtr:
6823 case Instruction::SIToFP:
6824 case Instruction::UIToFP:
6825 case Instruction::Trunc:
6826 case Instruction::FPTrunc:
6827 case Instruction::BitCast: {
6828 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
6829 std::make_pair(std::numeric_limits<unsigned>::min(),
6830 std::numeric_limits<unsigned>::max()));
6831 if (ShuffleOrOp == Instruction::ZExt ||
6832 ShuffleOrOp == Instruction::SExt) {
6833 CastMaxMinBWSizes = std::make_pair(
6834 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
6835 PrevMaxBW),
6836 std::min<unsigned>(
6837 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
6838 PrevMinBW));
6839 } else if (ShuffleOrOp == Instruction::Trunc) {
6840 CastMaxMinBWSizes = std::make_pair(
6841 std::max<unsigned>(
6842 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
6843 PrevMaxBW),
6844 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
6845 PrevMinBW));
6846 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
6847 } else if (ShuffleOrOp == Instruction::SIToFP ||
6848 ShuffleOrOp == Instruction::UIToFP) {
6849 unsigned NumSignBits =
6850 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
6851 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
6852 APInt Mask = DB->getDemandedBits(OpI);
6853 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
6854 }
6855 if (NumSignBits * 2 >=
6856 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6857 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
6858 }
6859 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6860 ReuseShuffleIndicies);
6861 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
6862
6863 TE->setOperandsInOrder();
6864 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6865 ValueList Operands;
6866 // Prepare the operand vector.
6867 for (Value *V : VL)
6868 Operands.push_back(cast<Instruction>(V)->getOperand(I));
6869
6870 buildTree_rec(Operands, Depth + 1, {TE, I});
6871 }
6872 return;
6873 }
6874 case Instruction::ICmp:
6875 case Instruction::FCmp: {
6876 // Check that all of the compares have the same predicate.
6877 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6878 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6879 ReuseShuffleIndicies);
6880 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
6881
6882 ValueList Left, Right;
6883 if (cast<CmpInst>(VL0)->isCommutative()) {
6884 // Commutative predicate - collect + sort operands of the instructions
6885 // so that each side is more likely to have the same opcode.
6886 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
6887 "Commutative Predicate mismatch");
6888 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
6889 } else {
6890 // Collect operands - commute if it uses the swapped predicate.
6891 for (Value *V : VL) {
6892 auto *Cmp = cast<CmpInst>(V);
6893 Value *LHS = Cmp->getOperand(0);
6894 Value *RHS = Cmp->getOperand(1);
6895 if (Cmp->getPredicate() != P0)
6896 std::swap(LHS, RHS);
6897 Left.push_back(LHS);
6898 Right.push_back(RHS);
6899 }
6900 }
6901 TE->setOperand(0, Left);
6902 TE->setOperand(1, Right);
6903 buildTree_rec(Left, Depth + 1, {TE, 0});
6904 buildTree_rec(Right, Depth + 1, {TE, 1});
6905 if (ShuffleOrOp == Instruction::ICmp) {
6906 unsigned NumSignBits0 =
6907 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
6908 if (NumSignBits0 * 2 >=
6909 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6910 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
6911 unsigned NumSignBits1 =
6912 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
6913 if (NumSignBits1 * 2 >=
6914 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
6915 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
6916 }
6917 return;
6918 }
6919 case Instruction::Select:
6920 case Instruction::FNeg:
6921 case Instruction::Add:
6922 case Instruction::FAdd:
6923 case Instruction::Sub:
6924 case Instruction::FSub:
6925 case Instruction::Mul:
6926 case Instruction::FMul:
6927 case Instruction::UDiv:
6928 case Instruction::SDiv:
6929 case Instruction::FDiv:
6930 case Instruction::URem:
6931 case Instruction::SRem:
6932 case Instruction::FRem:
6933 case Instruction::Shl:
6934 case Instruction::LShr:
6935 case Instruction::AShr:
6936 case Instruction::And:
6937 case Instruction::Or:
6938 case Instruction::Xor: {
6939 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6940 ReuseShuffleIndicies);
6941 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
6942
6943 // Sort operands of the instructions so that each side is more likely to
6944 // have the same opcode.
6945 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
6946 ValueList Left, Right;
6947 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
6948 TE->setOperand(0, Left);
6949 TE->setOperand(1, Right);
6950 buildTree_rec(Left, Depth + 1, {TE, 0});
6951 buildTree_rec(Right, Depth + 1, {TE, 1});
6952 return;
6953 }
6954
6955 TE->setOperandsInOrder();
6956 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6957 ValueList Operands;
6958 // Prepare the operand vector.
6959 for (Value *V : VL)
6960 Operands.push_back(cast<Instruction>(V)->getOperand(I));
6961
6962 buildTree_rec(Operands, Depth + 1, {TE, I});
6963 }
6964 return;
6965 }
6966 case Instruction::GetElementPtr: {
6967 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6968 ReuseShuffleIndicies);
6969 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
6970 SmallVector<ValueList, 2> Operands(2);
6971 // Prepare the operand vector for pointer operands.
6972 for (Value *V : VL) {
6973 auto *GEP = dyn_cast<GetElementPtrInst>(V);
6974 if (!GEP) {
6975 Operands.front().push_back(V);
6976 continue;
6977 }
6978 Operands.front().push_back(GEP->getPointerOperand());
6979 }
6980 TE->setOperand(0, Operands.front());
6981 // Need to cast all indices to the same type before vectorization to
6982 // avoid crash.
6983 // Required to be able to find correct matches between different gather
6984 // nodes and reuse the vectorized values rather than trying to gather them
6985 // again.
6986 int IndexIdx = 1;
6987 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
6988 Type *Ty = all_of(VL,
6989 [VL0Ty, IndexIdx](Value *V) {
6990 auto *GEP = dyn_cast<GetElementPtrInst>(V);
6991 if (!GEP)
6992 return true;
6993 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
6994 })
6995 ? VL0Ty
6996 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
6997 ->getPointerOperandType()
6998 ->getScalarType());
6999 // Prepare the operand vector.
7000 for (Value *V : VL) {
7001 auto *I = dyn_cast<GetElementPtrInst>(V);
7002 if (!I) {
7003 Operands.back().push_back(
7004 ConstantInt::get(Ty, 0, /*isSigned=*/false));
7005 continue;
7006 }
7007 auto *Op = I->getOperand(IndexIdx);
7008 auto *CI = dyn_cast<ConstantInt>(Op);
7009 if (!CI)
7010 Operands.back().push_back(Op);
7011 else
7012 Operands.back().push_back(ConstantFoldIntegerCast(
7013 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7014 }
7015 TE->setOperand(IndexIdx, Operands.back());
7016
7017 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7018 buildTree_rec(Operands[I], Depth + 1, {TE, I});
7019 return;
7020 }
7021 case Instruction::Store: {
7022 // Check if the stores are consecutive or if we need to swizzle them.
7023 ValueList Operands(VL.size());
7024 auto *OIter = Operands.begin();
7025 for (Value *V : VL) {
7026 auto *SI = cast<StoreInst>(V);
7027 *OIter = SI->getValueOperand();
7028 ++OIter;
7029 }
7030 // Check that the sorted pointer operands are consecutive.
7031 if (CurrentOrder.empty()) {
7032 // Original stores are consecutive and do not require reordering.
7033 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7034 ReuseShuffleIndicies);
7035 TE->setOperandsInOrder();
7036 buildTree_rec(Operands, Depth + 1, {TE, 0});
7037 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7038 } else {
7039 fixupOrderingIndices(CurrentOrder);
7040 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7041 ReuseShuffleIndicies, CurrentOrder);
7042 TE->setOperandsInOrder();
7043 buildTree_rec(Operands, Depth + 1, {TE, 0});
7044 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7045 }
7046 return;
7047 }
7048 case Instruction::Call: {
7049 // Check if the calls are all to the same vectorizable intrinsic or
7050 // library function.
7051 CallInst *CI = cast<CallInst>(VL0);
7052 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7053
7054 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7055 ReuseShuffleIndicies);
7056 // Sort operands of the instructions so that each side is more likely to
7057 // have the same opcode.
7058 if (isCommutative(VL0)) {
7059 ValueList Left, Right;
7060 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7061 TE->setOperand(0, Left);
7062 TE->setOperand(1, Right);
7063 SmallVector<ValueList> Operands;
7064 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7065 Operands.emplace_back();
7066 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7067 continue;
7068 for (Value *V : VL) {
7069 auto *CI2 = cast<CallInst>(V);
7070 Operands.back().push_back(CI2->getArgOperand(I));
7071 }
7072 TE->setOperand(I, Operands.back());
7073 }
7074 buildTree_rec(Left, Depth + 1, {TE, 0});
7075 buildTree_rec(Right, Depth + 1, {TE, 1});
7076 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7077 if (Operands[I - 2].empty())
7078 continue;
7079 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7080 }
7081 return;
7082 }
7083 TE->setOperandsInOrder();
7084 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7085 // For scalar operands no need to create an entry since no need to
7086 // vectorize it.
7087 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7088 continue;
7089 ValueList Operands;
7090 // Prepare the operand vector.
7091 for (Value *V : VL) {
7092 auto *CI2 = cast<CallInst>(V);
7093 Operands.push_back(CI2->getArgOperand(I));
7094 }
7095 buildTree_rec(Operands, Depth + 1, {TE, I});
7096 }
7097 return;
7098 }
7099 case Instruction::ShuffleVector: {
7100 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7101 ReuseShuffleIndicies);
7102 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7103
7104 // Reorder operands if reordering would enable vectorization.
7105 auto *CI = dyn_cast<CmpInst>(VL0);
7106 if (isa<BinaryOperator>(VL0) || CI) {
7107 ValueList Left, Right;
7108 if (!CI || all_of(VL, [](Value *V) {
7109 return cast<CmpInst>(V)->isCommutative();
7110 })) {
7111 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7112 } else {
7113 auto *MainCI = cast<CmpInst>(S.MainOp);
7114 auto *AltCI = cast<CmpInst>(S.AltOp);
7115 CmpInst::Predicate MainP = MainCI->getPredicate();
7116 CmpInst::Predicate AltP = AltCI->getPredicate();
7117 assert(MainP != AltP &&
7118 "Expected different main/alternate predicates.");
7119 // Collect operands - commute if it uses the swapped predicate or
7120 // alternate operation.
7121 for (Value *V : VL) {
7122 auto *Cmp = cast<CmpInst>(V);
7123 Value *LHS = Cmp->getOperand(0);
7124 Value *RHS = Cmp->getOperand(1);
7125
7126 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7127 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7128 std::swap(LHS, RHS);
7129 } else {
7130 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7131 std::swap(LHS, RHS);
7132 }
7133 Left.push_back(LHS);
7134 Right.push_back(RHS);
7135 }
7136 }
7137 TE->setOperand(0, Left);
7138 TE->setOperand(1, Right);
7139 buildTree_rec(Left, Depth + 1, {TE, 0});
7140 buildTree_rec(Right, Depth + 1, {TE, 1});
7141 return;
7142 }
7143
7144 TE->setOperandsInOrder();
7145 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7146 ValueList Operands;
7147 // Prepare the operand vector.
7148 for (Value *V : VL)
7149 Operands.push_back(cast<Instruction>(V)->getOperand(I));
7150
7151 buildTree_rec(Operands, Depth + 1, {TE, I});
7152 }
7153 return;
7154 }
7155 default:
7156 break;
7157 }
7158 llvm_unreachable("Unexpected vectorization of the instructions.");
7159}
7160
7161 unsigned BoUpSLP::canMapToVector(Type *T) const {
7162 unsigned N = 1;
7163 Type *EltTy = T;
7164
7165 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7166 if (auto *ST = dyn_cast<StructType>(EltTy)) {
7167 // Check that struct is homogeneous.
7168 for (const auto *Ty : ST->elements())
7169 if (Ty != *ST->element_begin())
7170 return 0;
7171 N *= ST->getNumElements();
7172 EltTy = *ST->element_begin();
7173 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7174 N *= AT->getNumElements();
7175 EltTy = AT->getElementType();
7176 } else {
7177 auto *VT = cast<FixedVectorType>(EltTy);
7178 N *= VT->getNumElements();
7179 EltTy = VT->getElementType();
7180 }
7181 }
7182
7183 if (!isValidElementType(EltTy))
7184 return 0;
7185 uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
7186 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7187 VTSize != DL->getTypeStoreSizeInBits(T))
7188 return 0;
7189 return N;
7190}
7191
7192bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7193 SmallVectorImpl<unsigned> &CurrentOrder,
7194 bool ResizeAllowed) const {
7195 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7196 assert(It != VL.end() && "Expected at least one extract instruction.");
7197 auto *E0 = cast<Instruction>(*It);
7198 assert(
7199 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7200 "Invalid opcode");
7201 // Check if all of the extracts come from the same vector and from the
7202 // correct offset.
7203 Value *Vec = E0->getOperand(0);
7204
7205 CurrentOrder.clear();
7206
7207 // We have to extract from a vector/aggregate with the same number of elements.
7208 unsigned NElts;
7209 if (E0->getOpcode() == Instruction::ExtractValue) {
7210 NElts = canMapToVector(Vec->getType());
7211 if (!NElts)
7212 return false;
7213 // Check if load can be rewritten as load of vector.
7214 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7215 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7216 return false;
7217 } else {
7218 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7219 }
7220
7221 unsigned E = VL.size();
7222 if (!ResizeAllowed && NElts != E)
7223 return false;
7224 SmallVector<int> Indices(E, PoisonMaskElem);
7225 unsigned MinIdx = NElts, MaxIdx = 0;
7226 for (auto [I, V] : enumerate(VL)) {
7227 auto *Inst = dyn_cast<Instruction>(V);
7228 if (!Inst)
7229 continue;
7230 if (Inst->getOperand(0) != Vec)
7231 return false;
7232 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7233 if (isa<UndefValue>(EE->getIndexOperand()))
7234 continue;
7235 std::optional<unsigned> Idx = getExtractIndex(Inst);
7236 if (!Idx)
7237 return false;
7238 const unsigned ExtIdx = *Idx;
7239 if (ExtIdx >= NElts)
7240 continue;
7241 Indices[I] = ExtIdx;
7242 if (MinIdx > ExtIdx)
7243 MinIdx = ExtIdx;
7244 if (MaxIdx < ExtIdx)
7245 MaxIdx = ExtIdx;
7246 }
7247 if (MaxIdx - MinIdx + 1 > E)
7248 return false;
7249 if (MaxIdx + 1 <= E)
7250 MinIdx = 0;
7251
7252 // Check that all of the indices extract from the correct offset.
7253 bool ShouldKeepOrder = true;
7254 // Assign to all items the initial value E so we can check if the extract
7255 // instruction index was used already.
7256 // Also, later we can check that all the indices are used and we have a
7257 // consecutive access in the extract instructions, by checking that no
7258 // element of CurrentOrder still has the value E.
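// E.g. (illustrative): extracts with indices <2, 3, 0, 1> from a 4-element
// vector produce CurrentOrder = {2, 3, 0, 1}; this is not the identity
// order, so the function returns false but leaves the order for the caller.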
7259 CurrentOrder.assign(E, E);
7260 for (unsigned I = 0; I < E; ++I) {
7261 if (Indices[I] == PoisonMaskElem)
7262 continue;
7263 const unsigned ExtIdx = Indices[I] - MinIdx;
7264 if (CurrentOrder[ExtIdx] != E) {
7265 CurrentOrder.clear();
7266 return false;
7267 }
7268 ShouldKeepOrder &= ExtIdx == I;
7269 CurrentOrder[ExtIdx] = I;
7270 }
7271 if (ShouldKeepOrder)
7272 CurrentOrder.clear();
7273
7274 return ShouldKeepOrder;
7275}
7276
7277bool BoUpSLP::areAllUsersVectorized(
7278 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7279 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7280 all_of(I->users(), [this](User *U) {
7281 return ScalarToTreeEntry.contains(U) ||
7282 isVectorLikeInstWithConstOps(U) ||
7283 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7284 });
7285}
7286
7287static std::pair<InstructionCost, InstructionCost>
7288 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7289 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7290 ArrayRef<Type *> ArgTys) {
7291 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7292
7293 // Calculate the cost of the scalar and vector calls.
7294 FastMathFlags FMF;
7295 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7296 FMF = FPCI->getFastMathFlags();
7297 SmallVector<const Value *> Arguments(CI->args());
7298 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7299 dyn_cast<IntrinsicInst>(CI));
7300 auto IntrinsicCost =
7301 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
7302
7303 auto Shape = VFShape::get(CI->getFunctionType(),
7304 ElementCount::getFixed(VecTy->getNumElements()),
7305 false /*HasGlobalPred*/);
7306 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7307 auto LibCost = IntrinsicCost;
7308 if (!CI->isNoBuiltin() && VecFunc) {
7309 // Calculate the cost of the vector library call.
7310 // If the corresponding vector call is cheaper, return its cost.
7311 LibCost =
7312 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7313 }
7314 return {IntrinsicCost, LibCost};
7315}
7316
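// E.g. (illustrative): for an alternate add/sub node with Scalars =
// {add, sub, add, sub}, no reordering and no reused scalars, the mask built
// below is <0, 5, 2, 7>: even lanes come from the "main" vector and odd
// lanes from the "alternate" vector (offset by Sz = 4).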
7317void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7318 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7319 SmallVectorImpl<Value *> *OpScalars,
7320 SmallVectorImpl<Value *> *AltScalars) const {
7321 unsigned Sz = Scalars.size();
7322 Mask.assign(Sz, PoisonMaskElem);
7323 SmallVector<int> OrderMask;
7324 if (!ReorderIndices.empty())
7325 inversePermutation(ReorderIndices, OrderMask);
7326 for (unsigned I = 0; I < Sz; ++I) {
7327 unsigned Idx = I;
7328 if (!ReorderIndices.empty())
7329 Idx = OrderMask[I];
7330 auto *OpInst = cast<Instruction>(Scalars[Idx]);
7331 if (IsAltOp(OpInst)) {
7332 Mask[I] = Sz + Idx;
7333 if (AltScalars)
7334 AltScalars->push_back(OpInst);
7335 } else {
7336 Mask[I] = Idx;
7337 if (OpScalars)
7338 OpScalars->push_back(OpInst);
7339 }
7340 }
7341 if (!ReuseShuffleIndices.empty()) {
7342 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7343 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7344 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7345 });
7346 Mask.swap(NewMask);
7347 }
7348}
7349
7350 static bool isAlternateInstruction(const Instruction *I,
7351 const Instruction *MainOp,
7352 const Instruction *AltOp,
7353 const TargetLibraryInfo &TLI) {
7354 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7355 auto *AltCI = cast<CmpInst>(AltOp);
7356 CmpInst::Predicate MainP = MainCI->getPredicate();
7357 CmpInst::Predicate AltP = AltCI->getPredicate();
7358 assert(MainP != AltP && "Expected different main/alternate predicates.");
7359 auto *CI = cast<CmpInst>(I);
7360 if (isCmpSameOrSwapped(MainCI, CI, TLI))
7361 return false;
7362 if (isCmpSameOrSwapped(AltCI, CI, TLI))
7363 return true;
7364 CmpInst::Predicate P = CI->getPredicate();
7365 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7366
7367 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7368 "CmpInst expected to match either main or alternate predicate or "
7369 "their swap.");
7370 (void)AltP;
7371 return MainP != P && MainP != SwappedP;
7372 }
7373 return I->getOpcode() == AltOp->getOpcode();
7374}
7375
7376TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7377 assert(!Ops.empty());
7378 const auto *Op0 = Ops.front();
7379
7380 const bool IsConstant = all_of(Ops, [](Value *V) {
7381 // TODO: We should allow undef elements here
7382 return isConstant(V) && !isa<UndefValue>(V);
7383 });
7384 const bool IsUniform = all_of(Ops, [=](Value *V) {
7385 // TODO: We should allow undef elements here
7386 return V == Op0;
7387 });
7388 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7389 // TODO: We should allow undef elements here
7390 if (auto *CI = dyn_cast<ConstantInt>(V))
7391 return CI->getValue().isPowerOf2();
7392 return false;
7393 });
7394 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7395 // TODO: We should allow undef elements here
7396 if (auto *CI = dyn_cast<ConstantInt>(V))
7397 return CI->getValue().isNegatedPowerOf2();
7398 return false;
7399 });
7400
7401 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7402 if (IsConstant && IsUniform)
7403 VK = TTI::OK_UniformConstantValue;
7404 else if (IsConstant)
7405 VK = TTI::OK_NonUniformConstantValue;
7406 else if (IsUniform)
7407 VK = TTI::OK_UniformValue;
7408
7409 TTI::OperandValueProperties VP = TTI::OP_None;
7410 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7411 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7412
7413 return {VK, VP};
7414}
7415
7416namespace {
7417/// The base class for shuffle instruction emission and shuffle cost estimation.
7418class BaseShuffleAnalysis {
7419protected:
7420 /// Checks if the mask is an identity mask.
7421 /// \param IsStrict if true, the function returns false if the mask size does
7422 /// not match the vector size.
7423 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7424 bool IsStrict) {
7425 int Limit = Mask.size();
7426 int VF = VecTy->getNumElements();
7427 int Index = -1;
7428 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7429 return true;
7430 if (!IsStrict) {
7431 // Consider extract subvector starting from index 0.
7432 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7433 Index == 0)
7434 return true;
7435 // All VF-size submasks are identity (e.g.
7436 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7437 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7438 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7439 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7440 ShuffleVectorInst::isIdentityMask(Slice, VF);
7441 }))
7442 return true;
7443 }
7444 return false;
7445 }
7446
7447 /// Tries to combine 2 different masks into single one.
7448 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7449 /// change the size of the vector, \p LocalVF is the original size of the
7450 /// shuffled vector.
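/// E.g. (illustrative): with LocalVF = 4, Mask = <1, 0, 3, 2> and
/// ExtMask = <2, 3, 0, 1>, the combined mask is <3, 2, 1, 0>.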
7451 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7452 ArrayRef<int> ExtMask) {
7453 unsigned VF = Mask.size();
7454 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7455 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7456 if (ExtMask[I] == PoisonMaskElem)
7457 continue;
7458 int MaskedIdx = Mask[ExtMask[I] % VF];
7459 NewMask[I] =
7460 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7461 }
7462 Mask.swap(NewMask);
7463 }
7464
7465 /// Looks through shuffles trying to reduce final number of shuffles in the
7466 /// code. The function looks through the previously emitted shuffle
7467 /// instructions and properly marks indices in the mask as undef.
7468 /// For example, given the code
7469 /// \code
7470 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7471 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7472 /// \endcode
7473 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7474 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7475 /// <0, 1, 2, 3> for the shuffle.
7476 /// If 2 operands are of different size, the smallest one will be resized and
7477 /// the mask recalculated properly.
7478 /// For example, given the code
7479 /// \code
7480 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7481 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7482 /// \endcode
7483 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
7484 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7485 /// <0, 1, 2, 3> for the shuffle.
7486 /// So, it tries to transform permutations to simple vector merge, if
7487 /// possible.
7488 /// \param V The input vector which must be shuffled using the given \p Mask.
7489 /// If the better candidate is found, \p V is set to this best candidate
7490 /// vector.
7491 /// \param Mask The input mask for the shuffle. If the best candidate is found
7492 /// during looking-through-shuffles attempt, it is updated accordingly.
7493 /// \param SinglePermute true if the shuffle operation is originally a
7494 /// single-value-permutation. In this case the look-through-shuffles procedure
7495 /// may look for resizing shuffles as the best candidates.
7496 /// \return true if the shuffle results in the non-resizing identity shuffle
7497 /// (and thus can be ignored), false - otherwise.
7498 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7499 bool SinglePermute) {
7500 Value *Op = V;
7501 ShuffleVectorInst *IdentityOp = nullptr;
7502 SmallVector<int> IdentityMask;
7503 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7504 // Exit if not a fixed vector type or changing size shuffle.
7505 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7506 if (!SVTy)
7507 break;
7508 // Remember the identity or broadcast mask, if it is not a resizing
7509 // shuffle. If no better candidates are found, this Op and Mask will be
7510 // used in the final shuffle.
7511 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7512 if (!IdentityOp || !SinglePermute ||
7513 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7514 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7515 IdentityMask.size()))) {
7516 IdentityOp = SV;
7517 // Store the current mask in IdentityMask so that we do not lose
7518 // this info if IdentityOp is later selected as the best candidate for the
7519 // permutation.
7520 IdentityMask.assign(Mask);
7521 }
7522 }
7523 // Remember the broadcast mask. If no better candidates are found, this Op
7524 // and Mask will be used in the final shuffle.
7525 // Zero splat can be used as identity too, since it might be used with
7526 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7527 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
7528 // expensive, and the analysis finds out that the source vector is just a
7529 // broadcast, this original mask can be transformed to the identity mask <0,
7530 // 1, 2, 3>.
7531 // \code
7532 // %0 = shuffle %v, poison, zeroinitalizer
7533 // %res = shuffle %0, poison, <3, 1, 2, 0>
7534 // \endcode
7535 // may be transformed to
7536 // \code
7537 // %0 = shuffle %v, poison, zeroinitalizer
7538 // %res = shuffle %0, poison, <0, 1, 2, 3>
7539 // \endcode
7540 if (SV->isZeroEltSplat()) {
7541 IdentityOp = SV;
7542 IdentityMask.assign(Mask);
7543 }
7544 int LocalVF = Mask.size();
7545 if (auto *SVOpTy =
7546 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7547 LocalVF = SVOpTy->getNumElements();
7548 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7549 for (auto [Idx, I] : enumerate(Mask)) {
7550 if (I == PoisonMaskElem ||
7551 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7552 continue;
7553 ExtMask[Idx] = SV->getMaskValue(I);
7554 }
7555 bool IsOp1Undef =
7556 isUndefVector(SV->getOperand(0),
7557 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7558 .all();
7559 bool IsOp2Undef =
7560 isUndefVector(SV->getOperand(1),
7561 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7562 .all();
7563 if (!IsOp1Undef && !IsOp2Undef) {
7564 // Update mask and mark undef elems.
7565 for (int &I : Mask) {
7566 if (I == PoisonMaskElem)
7567 continue;
7568 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
7569 PoisonMaskElem)
7570 I = PoisonMaskElem;
7571 }
7572 break;
7573 }
7574 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7575 SV->getShuffleMask().end());
7576 combineMasks(LocalVF, ShuffleMask, Mask);
7577 Mask.swap(ShuffleMask);
7578 if (IsOp2Undef)
7579 Op = SV->getOperand(0);
7580 else
7581 Op = SV->getOperand(1);
7582 }
7583 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
7584 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7585 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
7586 if (IdentityOp) {
7587 V = IdentityOp;
7588 assert(Mask.size() == IdentityMask.size() &&
7589 "Expected masks of same sizes.");
7590 // Clear known poison elements.
7591 for (auto [I, Idx] : enumerate(Mask))
7592 if (Idx == PoisonMaskElem)
7593 IdentityMask[I] = PoisonMaskElem;
7594 Mask.swap(IdentityMask);
7595 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7596 return SinglePermute &&
7597 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
7598 /*IsStrict=*/true) ||
7599 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7600 Shuffle->isZeroEltSplat() &&
7601                  ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
7602       }
7603 V = Op;
7604 return false;
7605 }
7606 V = Op;
7607 return true;
7608 }
7609
7610   /// Smart shuffle instruction emission, walks through shuffle trees and
7611 /// tries to find the best matching vector for the actual shuffle
7612 /// instruction.
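  /// For example (illustrative values), with two 4-wide operands a request
  /// mask <0, 5, 2, 7> is first split into <0, -, 2, -> for the first operand
  /// and <-, 1, -, 3> for the second one; each half is then refined by
  /// peekThroughShuffles() before the final shuffle is emitted.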
7613 template <typename T, typename ShuffleBuilderTy>
7614 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7615 ShuffleBuilderTy &Builder) {
7616 assert(V1 && "Expected at least one vector value.");
7617 if (V2)
7618 Builder.resizeToMatch(V1, V2);
7619 int VF = Mask.size();
7620 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
7621 VF = FTy->getNumElements();
7622 if (V2 &&
7623 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
7624 // Peek through shuffles.
7625 Value *Op1 = V1;
7626 Value *Op2 = V2;
7627 int VF =
7628 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7629 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7630 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7631 for (int I = 0, E = Mask.size(); I < E; ++I) {
7632 if (Mask[I] < VF)
7633 CombinedMask1[I] = Mask[I];
7634 else
7635 CombinedMask2[I] = Mask[I] - VF;
7636 }
7637 Value *PrevOp1;
7638 Value *PrevOp2;
7639 do {
7640 PrevOp1 = Op1;
7641 PrevOp2 = Op2;
7642 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
7643 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
7644 // Check if we have 2 resizing shuffles - need to peek through operands
7645 // again.
7646 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7647 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7648 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7649 for (auto [Idx, I] : enumerate(CombinedMask1)) {
7650 if (I == PoisonMaskElem)
7651 continue;
7652 ExtMask1[Idx] = SV1->getMaskValue(I);
7653 }
7654 SmallBitVector UseMask1 = buildUseMask(
7655 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7656 ->getNumElements(),
7657 ExtMask1, UseMask::SecondArg);
7658 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7659 for (auto [Idx, I] : enumerate(CombinedMask2)) {
7660 if (I == PoisonMaskElem)
7661 continue;
7662 ExtMask2[Idx] = SV2->getMaskValue(I);
7663 }
7664 SmallBitVector UseMask2 = buildUseMask(
7665 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7666 ->getNumElements(),
7667 ExtMask2, UseMask::SecondArg);
7668 if (SV1->getOperand(0)->getType() ==
7669 SV2->getOperand(0)->getType() &&
7670 SV1->getOperand(0)->getType() != SV1->getType() &&
7671 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
7672 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
7673 Op1 = SV1->getOperand(0);
7674 Op2 = SV2->getOperand(0);
7675 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7676 SV1->getShuffleMask().end());
7677 int LocalVF = ShuffleMask1.size();
7678 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
7679 LocalVF = FTy->getNumElements();
7680 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7681 CombinedMask1.swap(ShuffleMask1);
7682 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7683 SV2->getShuffleMask().end());
7684 LocalVF = ShuffleMask2.size();
7685 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
7686 LocalVF = FTy->getNumElements();
7687 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7688 CombinedMask2.swap(ShuffleMask2);
7689 }
7690 }
7691 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
7692 Builder.resizeToMatch(Op1, Op2);
7693 VF = std::max(cast<VectorType>(Op1->getType())
7694 ->getElementCount()
7695 .getKnownMinValue(),
7696 cast<VectorType>(Op2->getType())
7697 ->getElementCount()
7698 .getKnownMinValue());
7699 for (int I = 0, E = Mask.size(); I < E; ++I) {
7700 if (CombinedMask2[I] != PoisonMaskElem) {
7701 assert(CombinedMask1[I] == PoisonMaskElem &&
7702 "Expected undefined mask element");
7703 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
7704 }
7705 }
7706 if (Op1 == Op2 &&
7707 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
7708 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
7709 isa<ShuffleVectorInst>(Op1) &&
7710 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7711 ArrayRef(CombinedMask1))))
7712 return Builder.createIdentity(Op1);
7713 return Builder.createShuffleVector(
7714 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
7715 CombinedMask1);
7716 }
7717 if (isa<PoisonValue>(V1))
7718 return Builder.createPoison(
7719 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
7720 SmallVector<int> NewMask(Mask.begin(), Mask.end());
7721 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
7722 assert(V1 && "Expected non-null value after looking through shuffles.");
7723
7724 if (!IsIdentity)
7725 return Builder.createShuffleVector(V1, NewMask);
7726 return Builder.createIdentity(V1);
7727 }
7728};
7729} // namespace
7730
7731/// Returns the cost of the shuffle instructions with the given \p Kind, vector
7732/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
7733/// subvector pattern.
7734static InstructionCost
7735 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
7736                VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
7737                TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
7738                int Index = 0, VectorType *SubTp = nullptr,
7739 ArrayRef<const Value *> Args = std::nullopt) {
7740 if (Kind != TTI::SK_PermuteTwoSrc)
7741 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7742 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7743 int NumSubElts;
7744 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
7745 Mask, NumSrcElts, NumSubElts, Index)) {
7746 if (Index + NumSubElts > NumSrcElts &&
7747 Index + NumSrcElts <= static_cast<int>(Mask.size()))
7748 return TTI.getShuffleCost(
7749           TTI::SK_InsertSubvector,
7750           FixedVectorType::get(Tp->getElementType(), Mask.size()), Mask,
7751           TTI::TCK_RecipThroughput, Index, Tp);
7752   }
7753 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7754}
7755
7756/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
7757static std::pair<InstructionCost, InstructionCost>
7758 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
7759             Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
7760 Type *ScalarTy, VectorType *VecTy) {
7761 InstructionCost ScalarCost = 0;
7762 InstructionCost VecCost = 0;
7763 // Here we differentiate two cases: (1) when Ptrs represent a regular
7764 // vectorization tree node (as they are pointer arguments of scattered
7765 // loads) or (2) when Ptrs are the arguments of loads or stores being
7766   // vectorized as a plain wide unit-stride load/store since all the
7767 // loads/stores are known to be from/to adjacent locations.
7768 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7769 // Case 2: estimate costs for pointer related costs when vectorizing to
7770 // a wide load/store.
7771 // Scalar cost is estimated as a set of pointers with known relationship
7772 // between them.
7773 // For vector code we will use BasePtr as argument for the wide load/store
7774     // but we also need to account for all the instructions which are going to
7775 // stay in vectorized code due to uses outside of these scalar
7776 // loads/stores.
7777 ScalarCost = TTI.getPointersChainCost(
7778 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7779 CostKind);
7780
7781 SmallVector<const Value *> PtrsRetainedInVecCode;
7782 for (Value *V : Ptrs) {
7783 if (V == BasePtr) {
7784 PtrsRetainedInVecCode.push_back(V);
7785 continue;
7786 }
7787 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7788       // For simplicity, assume Ptr stays in the vectorized code if it's not a
7789       // GEP instruction. We don't care, since its cost is considered free.
7790 // TODO: We should check for any uses outside of vectorizable tree
7791 // rather than just single use.
7792 if (!Ptr || !Ptr->hasOneUse())
7793 PtrsRetainedInVecCode.push_back(V);
7794 }
7795
7796 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
7797 // If all pointers stay in vectorized code then we don't have
7798 // any savings on that.
7799 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
7800 }
7801 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
7802 TTI::PointersChainInfo::getKnownStride(),
7803 VecTy, CostKind);
7804 } else {
7805 // Case 1: Ptrs are the arguments of loads that we are going to transform
7806 // into masked gather load intrinsic.
7807 // All the scalar GEPs will be removed as a result of vectorization.
7808 // For any external uses of some lanes extract element instructions will
7809 // be generated (which cost is estimated separately).
7810 TTI::PointersChainInfo PtrsInfo =
7811 all_of(Ptrs,
7812 [](const Value *V) {
7813 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7814 return Ptr && !Ptr->hasAllConstantIndices();
7815 })
7816 ? TTI::PointersChainInfo::getUnknownStride()
7817 : TTI::PointersChainInfo::getKnownStride();
7818
7819 ScalarCost =
7820 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
7821 if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
7822 SmallVector<const Value *> Indices(BaseGEP->indices());
7823 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
7824 BaseGEP->getPointerOperand(), Indices, VecTy,
7825 CostKind);
7826 }
7827 }
7828
7829 return std::make_pair(ScalarCost, VecCost);
7830}
7831
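/// Transforms vectorizable tree nodes into more profitable forms, e.g. a
/// consecutive-load node with a reverse order is turned into a strided load
/// with stride -1 when the target supports it and the cost model agrees (see
/// below).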
7832 void BoUpSLP::transformNodes() {
7833   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7834   for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7835 TreeEntry &E = *TE.get();
7836 switch (E.getOpcode()) {
7837 case Instruction::Load: {
7838 Type *ScalarTy = E.getMainOp()->getType();
7839 auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
7840 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
7841 // Check if profitable to represent consecutive load + reverse as strided
7842 // load with stride -1.
7843 if (isReverseOrder(E.ReorderIndices) &&
7844 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
7845 SmallVector<int> Mask;
7846 inversePermutation(E.ReorderIndices, Mask);
7847 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
7848 InstructionCost OriginalVecCost =
7849 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
7850                                  BaseLI->getPointerAddressSpace(), CostKind,
7851                                  TTI::OperandValueInfo()) +
7852             ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
7853         InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
7854             Instruction::Load, VecTy, BaseLI->getPointerOperand(),
7855 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
7856 if (StridedCost < OriginalVecCost)
7857 // Strided load is more profitable than consecutive load + reverse -
7858 // transform the node to strided load.
7859 E.State = TreeEntry::StridedVectorize;
7860 }
7861 break;
7862 }
7863 default:
7864 break;
7865 }
7866 }
7867}
7868
7869 /// Merges shuffle masks and emits the final shuffle instruction, if required.
7870 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
7871 /// emission: the actual shuffle instruction is generated only if it is really
7872 /// required; otherwise its emission is delayed till the end of the process, to
7873 /// reduce the number of emitted instructions and of subsequent
7874 /// analyses/transformations.
7875class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7876 bool IsFinalized = false;
7877 SmallVector<int> CommonMask;
7878   SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
7879   const TargetTransformInfo &TTI;
7880   InstructionCost Cost = 0;
7881 SmallDenseSet<Value *> VectorizedVals;
7882 BoUpSLP &R;
7883 SmallPtrSetImpl<Value *> &CheckedExtracts;
7884 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7885   /// While set, we are still trying to estimate the cost for the same nodes
7886   /// and can delay the actual cost estimation (virtual shuffle instruction
7887   /// emission). This may help to better estimate the cost if the same nodes
7888   /// must be permuted and allows moving most of the long shuffle cost
7889   /// estimation to TTI.
7889 bool SameNodesEstimated = true;
7890
7891 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
7892 if (Ty->getScalarType()->isPointerTy()) {
7893       Constant *Res = ConstantExpr::getIntToPtr(
7894           ConstantInt::getAllOnesValue(
7895               IntegerType::get(Ty->getContext(),
7896                                DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
7897 Ty->getScalarType());
7898 if (auto *VTy = dyn_cast<VectorType>(Ty))
7899 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
7900 return Res;
7901 }
7902 return Constant::getAllOnesValue(Ty);
7903 }
7904
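  /// Estimates the cost of gathering (building a vector from) the scalars in
  /// \p VL, trying to re-vectorize groups of consecutive loads and to
  /// recognize splats, and taking the cheapest of the found alternatives.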
7905 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
7906 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
7907 return TTI::TCC_Free;
7908 auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
7909 InstructionCost GatherCost = 0;
7910 SmallVector<Value *> Gathers(VL.begin(), VL.end());
7911 // Improve gather cost for gather of loads, if we can group some of the
7912 // loads into vector loads.
7913 InstructionsState S = getSameOpcode(VL, *R.TLI);
7914 const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
7915 unsigned MinVF = R.getMinVF(2 * Sz);
7916 if (VL.size() > 2 &&
7917 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
7918 (InVectors.empty() &&
7919 any_of(seq<unsigned>(0, VL.size() / MinVF),
7920 [&](unsigned Idx) {
7921 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
7922 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
7923 return S.getOpcode() == Instruction::Load &&
7924 !S.isAltShuffle();
7925 }))) &&
7926 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
7927 !isSplat(Gathers)) {
7928 InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
7929 SetVector<Value *> VectorizedLoads;
7930       SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
7931       SmallVector<unsigned> ScatterVectorized;
7932 unsigned StartIdx = 0;
7933 unsigned VF = VL.size() / 2;
7934 for (; VF >= MinVF; VF /= 2) {
7935 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
7936 Cnt += VF) {
7937 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7938 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
7939 InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
7940 if (SliceS.getOpcode() != Instruction::Load ||
7941 SliceS.isAltShuffle())
7942 continue;
7943 }
7944 if (!VectorizedLoads.count(Slice.front()) &&
7945 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
7946 SmallVector<Value *> PointerOps;
7947 OrdersType CurrentOrder;
7948 LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
7949 CurrentOrder, PointerOps);
7950 switch (LS) {
7951             case LoadsState::Vectorize:
7952             case LoadsState::ScatterVectorize:
7953             case LoadsState::StridedVectorize:
7954               // Mark the vectorized loads so that we don't vectorize them
7955 // again.
7956 // TODO: better handling of loads with reorders.
7957 if (((LS == LoadsState::Vectorize ||
7958                         LS == LoadsState::StridedVectorize) &&
7959                        CurrentOrder.empty()) ||
7960                       (LS == LoadsState::StridedVectorize &&
7961                        isReverseOrder(CurrentOrder)))
7962 VectorizedStarts.emplace_back(Cnt, LS);
7963 else
7964 ScatterVectorized.push_back(Cnt);
7965 VectorizedLoads.insert(Slice.begin(), Slice.end());
7966 // If we vectorized initial block, no need to try to vectorize
7967 // it again.
7968 if (Cnt == StartIdx)
7969 StartIdx += VF;
7970 break;
7971 case LoadsState::Gather:
7972 break;
7973 }
7974 }
7975 }
7976 // Check if the whole array was vectorized already - exit.
7977 if (StartIdx >= VL.size())
7978 break;
7979 // Found vectorizable parts - exit.
7980 if (!VectorizedLoads.empty())
7981 break;
7982 }
7983 if (!VectorizedLoads.empty()) {
7984 unsigned NumParts = TTI.getNumberOfParts(VecTy);
7985 bool NeedInsertSubvectorAnalysis =
7986 !NumParts || (VL.size() / VF) > NumParts;
7987 // Get the cost for gathered loads.
7988 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
7989 if (VectorizedLoads.contains(VL[I]))
7990 continue;
7991 GatherCost +=
7992 getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
7993 }
7994 // Exclude potentially vectorized loads from list of gathered
7995 // scalars.
7996 Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
7997 // The cost for vectorized loads.
7998 InstructionCost ScalarsCost = 0;
7999 for (Value *V : VectorizedLoads) {
8000 auto *LI = cast<LoadInst>(V);
8001 ScalarsCost +=
8002 TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
8003 LI->getAlign(), LI->getPointerAddressSpace(),
8004 CostKind, TTI::OperandValueInfo(), LI);
8005 }
8006 auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
8007 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
8008 auto *LI = cast<LoadInst>(VL[P.first]);
8009 Align Alignment = LI->getAlign();
8010 GatherCost +=
8011 P.second == LoadsState::Vectorize
8012 ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
8013 LI->getPointerAddressSpace(), CostKind,
8014                                       TTI::OperandValueInfo(), LI)
8015                   : TTI.getStridedMemoryOpCost(
8016                         Instruction::Load, LoadTy, LI->getPointerOperand(),
8017 /*VariableMask=*/false, Alignment, CostKind, LI);
8018 // Estimate GEP cost.
8019 SmallVector<Value *> PointerOps(VF);
8020 for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
8021 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8022 auto [ScalarGEPCost, VectorGEPCost] =
8023 getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
8024 Instruction::Load, CostKind, LI->getType(), LoadTy);
8025 GatherCost += VectorGEPCost - ScalarGEPCost;
8026 }
8027 for (unsigned P : ScatterVectorized) {
8028 auto *LI0 = cast<LoadInst>(VL[P]);
8029 ArrayRef<Value *> Slice = VL.slice(P, VF);
8030 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8031 GatherCost += TTI.getGatherScatterOpCost(
8032 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8033 /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
8034 // Estimate GEP cost.
8035 SmallVector<Value *> PointerOps(VF);
8036 for (auto [I, V] : enumerate(Slice))
8037 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8038 OrdersType Order;
8039 if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
8040 Order)) {
8041 // TODO: improve checks if GEPs can be vectorized.
8042 Value *Ptr0 = PointerOps.front();
8043 Type *ScalarTy = Ptr0->getType();
8044 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
8045 auto [ScalarGEPCost, VectorGEPCost] =
8046 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
8047 CostKind, ScalarTy, VecTy);
8048 GatherCost += VectorGEPCost - ScalarGEPCost;
8049 if (!Order.empty()) {
8050 SmallVector<int> Mask;
8051 inversePermutation(Order, Mask);
8052                 GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8053                                                VecTy, Mask, CostKind);
8054 }
8055 } else {
8056 GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true);
8057 }
8058 }
8059 if (NeedInsertSubvectorAnalysis) {
8060 // Add the cost for the subvectors insert.
8061 SmallVector<int> ShuffleMask(VL.size());
8062 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8063 for (unsigned Idx : seq<unsigned>(0, E))
8064 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8065 GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
8066 ShuffleMask, CostKind, I, LoadTy);
8067 }
8068 }
8069 GatherCost -= ScalarsCost;
8070 }
8071 GatherCost = std::min(BaseCost, GatherCost);
8072 } else if (!Root && isSplat(VL)) {
8073 // Found the broadcasting of the single scalar, calculate the cost as
8074 // the broadcast.
8075 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8076 assert(It != VL.end() && "Expected at least one non-undef value.");
8077 // Add broadcast for non-identity shuffle only.
8078 bool NeedShuffle =
8079 count(VL, *It) > 1 &&
8080 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
8081 if (!NeedShuffle)
8082 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
8083 CostKind, std::distance(VL.begin(), It),
8084 PoisonValue::get(VecTy), *It);
8085
8086 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8087 transform(VL, ShuffleMask.begin(), [](Value *V) {
8088 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8089 });
8090       InstructionCost InsertCost = TTI.getVectorInstrCost(
8091           Instruction::InsertElement, VecTy, CostKind, 0,
8092 PoisonValue::get(VecTy), *It);
8093 return InsertCost +
8094              TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
8095                                 ShuffleMask, CostKind, /*Index=*/0,
8096 /*SubTp=*/nullptr, /*Args=*/*It);
8097 }
8098 return GatherCost +
8099 (all_of(Gathers, IsaPred<UndefValue>)
8100                 ? TTI::TCC_Free
8101                 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
8102 };
8103
8104 /// Compute the cost of creating a vector containing the extracted values from
8105 /// \p VL.
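  /// The mask is analyzed in chunks of roughly one vector register; a chunk
  /// that is an identity of a single source register is treated as free,
  /// otherwise the cost of a per-register (or wider, if more than two source
  /// registers are involved) shuffle is added.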
8106   InstructionCost
8107   computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8108 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8109 unsigned NumParts) {
8110 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8111 unsigned NumElts =
8112 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8113 auto *EE = dyn_cast<ExtractElementInst>(V);
8114 if (!EE)
8115 return Sz;
8116 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8117 if (!VecTy)
8118 return Sz;
8119 return std::max(Sz, VecTy->getNumElements());
8120 });
8121 unsigned NumSrcRegs = TTI.getNumberOfParts(
8122 FixedVectorType::get(VL.front()->getType(), NumElts));
8123 if (NumSrcRegs == 0)
8124 NumSrcRegs = 1;
8125 // FIXME: this must be moved to TTI for better estimation.
8126 unsigned EltsPerVector = PowerOf2Ceil(std::max(
8127 divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
8128 auto CheckPerRegistersShuffle =
8129 [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
8130 DenseSet<int> RegIndices;
8131       // Check if we are trying to permute the same single/two input vectors.
8132       TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8133       int FirstRegId = -1;
8134 for (int &I : Mask) {
8135 if (I == PoisonMaskElem)
8136 continue;
8137 int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
8138 if (FirstRegId < 0)
8139 FirstRegId = RegId;
8140 RegIndices.insert(RegId);
8141 if (RegIndices.size() > 2)
8142 return std::nullopt;
8143 if (RegIndices.size() == 2)
8144 ShuffleKind = TTI::SK_PermuteTwoSrc;
8145 I = (I % NumElts) % EltsPerVector +
8146 (RegId == FirstRegId ? 0 : EltsPerVector);
8147 }
8148 return ShuffleKind;
8149 };
8150     InstructionCost Cost = 0;
8151
8152 // Process extracts in blocks of EltsPerVector to check if the source vector
8153 // operand can be re-used directly. If not, add the cost of creating a
8154 // shuffle to extract the values into a vector register.
8155 for (unsigned Part = 0; Part < NumParts; ++Part) {
8156 if (!ShuffleKinds[Part])
8157 continue;
8158 ArrayRef<int> MaskSlice =
8159 Mask.slice(Part * EltsPerVector,
8160 (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8161 ? Mask.size() % EltsPerVector
8162 : EltsPerVector);
8163 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8164 copy(MaskSlice, SubMask.begin());
8165 std::optional<TTI::ShuffleKind> RegShuffleKind =
8166 CheckPerRegistersShuffle(SubMask);
8167 if (!RegShuffleKind) {
8168         Cost += ::getShuffleCost(
8169             TTI, *ShuffleKinds[Part],
8170 FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
8171 continue;
8172 }
8173 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8174 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
8175         Cost += ::getShuffleCost(
8176             TTI, *RegShuffleKind,
8177 FixedVectorType::get(VL.front()->getType(), EltsPerVector),
8178 SubMask);
8179 }
8180 }
8181 return Cost;
8182 }
8183   /// Transforms the mask \p CommonMask per the given \p Mask so that it is
8184   /// properly set up after the shuffle emission.
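  /// E.g. after a shuffle has been emitted for the common mask <2, poison, 1>,
  /// the common mask becomes <0, poison, 2>: every defined lane now refers to
  /// the corresponding lane of the just-emitted shuffle itself.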
8185 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8186 ArrayRef<int> Mask) {
8187 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8188 if (Mask[Idx] != PoisonMaskElem)
8189 CommonMask[Idx] = Idx;
8190 }
8191 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
8192 /// mask \p Mask, register number \p Part, that includes \p SliceSize
8193 /// elements.
8194 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8195 ArrayRef<int> Mask, unsigned Part,
8196 unsigned SliceSize) {
8197 if (SameNodesEstimated) {
8198 // Delay the cost estimation if the same nodes are reshuffling.
8199 // If we already requested the cost of reshuffling of E1 and E2 before, no
8200 // need to estimate another cost with the sub-Mask, instead include this
8201 // sub-Mask into the CommonMask to estimate it later and avoid double cost
8202 // estimation.
8203 if ((InVectors.size() == 2 &&
8204 InVectors.front().get<const TreeEntry *>() == &E1 &&
8205 InVectors.back().get<const TreeEntry *>() == E2) ||
8206 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8207 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
8208 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8209 "Expected all poisoned elements.");
8210 ArrayRef<int> SubMask =
8211 ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
8212 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8213 return;
8214 }
8215 // Found non-matching nodes - need to estimate the cost for the matched
8216 // and transform mask.
8217 Cost += createShuffle(InVectors.front(),
8218 InVectors.size() == 1 ? nullptr : InVectors.back(),
8219 CommonMask);
8220 transformMaskAfterShuffle(CommonMask, CommonMask);
8221 }
8222 SameNodesEstimated = false;
8223 if (!E2 && InVectors.size() == 1) {
8224 unsigned VF = E1.getVectorFactor();
8225 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8226 VF = std::max(VF,
8227 cast<FixedVectorType>(V1->getType())->getNumElements());
8228 } else {
8229 const auto *E = InVectors.front().get<const TreeEntry *>();
8230 VF = std::max(VF, E->getVectorFactor());
8231 }
8232 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8233 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8234 CommonMask[Idx] = Mask[Idx] + VF;
8235 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8236 transformMaskAfterShuffle(CommonMask, CommonMask);
8237 } else {
8238 Cost += createShuffle(&E1, E2, Mask);
8239 transformMaskAfterShuffle(CommonMask, Mask);
8240 }
8241 }
8242
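  /// Adaptor that plugs into BaseShuffleAnalysis::createShuffle() and maps its
  /// shuffle-emission callbacks onto TTI cost queries; identity shuffles,
  /// poison vectors and plain operand reuse are modeled as free.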
8243 class ShuffleCostBuilder {
8244 const TargetTransformInfo &TTI;
8245
8246 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8247 int Index = -1;
8248 return Mask.empty() ||
8249 (VF == Mask.size() &&
8250               ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
8251              (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
8252               Index == 0);
8253 }
8254
8255 public:
8256 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8257 ~ShuffleCostBuilder() = default;
8258 InstructionCost createShuffleVector(Value *V1, Value *,
8259 ArrayRef<int> Mask) const {
8260 // Empty mask or identity mask are free.
8261 unsigned VF =
8262 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8263 if (isEmptyOrIdentity(Mask, VF))
8264 return TTI::TCC_Free;
8265 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8266 cast<VectorType>(V1->getType()), Mask);
8267 }
8268 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8269 // Empty mask or identity mask are free.
8270 unsigned VF =
8271 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8272 if (isEmptyOrIdentity(Mask, VF))
8273 return TTI::TCC_Free;
8274       return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8275                               cast<VectorType>(V1->getType()), Mask);
8276 }
8277 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8278 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8279 return TTI::TCC_Free;
8280 }
8281 void resizeToMatch(Value *&, Value *&) const {}
8282 };
8283
8284   /// Smart shuffle instruction emission, walks through shuffle trees and
8285 /// tries to find the best matching vector for the actual shuffle
8286 /// instruction.
8287   InstructionCost
8288   createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8289                 const PointerUnion<Value *, const TreeEntry *> &P2,
8290                 ArrayRef<int> Mask) {
8291 ShuffleCostBuilder Builder(TTI);
8292 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8293 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8294 unsigned CommonVF = Mask.size();
8295 if (!V1 && !V2 && !P2.isNull()) {
8296 // Shuffle 2 entry nodes.
8297 const TreeEntry *E = P1.get<const TreeEntry *>();
8298 unsigned VF = E->getVectorFactor();
8299 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8300 CommonVF = std::max(VF, E2->getVectorFactor());
8301 assert(all_of(Mask,
8302 [=](int Idx) {
8303 return Idx < 2 * static_cast<int>(CommonVF);
8304 }) &&
8305 "All elements in mask must be less than 2 * CommonVF.");
8306 if (E->Scalars.size() == E2->Scalars.size()) {
8307 SmallVector<int> EMask = E->getCommonMask();
8308 SmallVector<int> E2Mask = E2->getCommonMask();
8309 if (!EMask.empty() || !E2Mask.empty()) {
8310 for (int &Idx : CommonMask) {
8311 if (Idx == PoisonMaskElem)
8312 continue;
8313 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8314 Idx = EMask[Idx];
8315 else if (Idx >= static_cast<int>(CommonVF))
8316 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8317 E->Scalars.size();
8318 }
8319 }
8320 CommonVF = E->Scalars.size();
8321 }
8322       V1 = Constant::getNullValue(
8323           FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
8324 V2 = getAllOnesValue(
8325 *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
8326 } else if (!V1 && P2.isNull()) {
8327 // Shuffle single entry node.
8328 const TreeEntry *E = P1.get<const TreeEntry *>();
8329 unsigned VF = E->getVectorFactor();
8330 CommonVF = VF;
8331 assert(
8332 all_of(Mask,
8333 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8334 "All elements in mask must be less than CommonVF.");
8335 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8336 SmallVector<int> EMask = E->getCommonMask();
8337 assert(!EMask.empty() && "Expected non-empty common mask.");
8338 for (int &Idx : CommonMask) {
8339 if (Idx != PoisonMaskElem)
8340 Idx = EMask[Idx];
8341 }
8342 CommonVF = E->Scalars.size();
8343 }
8344       V1 = Constant::getNullValue(
8345           FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
8346 // Not identity/broadcast? Try to see if the original vector is better.
8347 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8348 CommonVF == CommonMask.size() &&
8349 any_of(enumerate(CommonMask),
8350 [](const auto &&P) {
8351 return P.value() != PoisonMaskElem &&
8352 static_cast<unsigned>(P.value()) != P.index();
8353 }) &&
8354 any_of(CommonMask,
8355 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8356 SmallVector<int> ReorderMask;
8357 inversePermutation(E->ReorderIndices, ReorderMask);
8358 ::addMask(CommonMask, ReorderMask);
8359 }
8360 } else if (V1 && P2.isNull()) {
8361 // Shuffle single vector.
8362 CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
8363 assert(
8364 all_of(Mask,
8365 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8366 "All elements in mask must be less than CommonVF.");
8367 } else if (V1 && !V2) {
8368 // Shuffle vector and tree node.
8369 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8370 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8371 CommonVF = std::max(VF, E2->getVectorFactor());
8372 assert(all_of(Mask,
8373 [=](int Idx) {
8374 return Idx < 2 * static_cast<int>(CommonVF);
8375 }) &&
8376 "All elements in mask must be less than 2 * CommonVF.");
8377 if (E2->Scalars.size() == VF && VF != CommonVF) {
8378 SmallVector<int> E2Mask = E2->getCommonMask();
8379 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8380 for (int &Idx : CommonMask) {
8381 if (Idx == PoisonMaskElem)
8382 continue;
8383 if (Idx >= static_cast<int>(CommonVF))
8384 Idx = E2Mask[Idx - CommonVF] + VF;
8385 }
8386 CommonVF = VF;
8387 }
8388       V1 = Constant::getNullValue(
8389           FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
8390 V2 = getAllOnesValue(
8391 *R.DL,
8392 FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
8393 } else if (!V1 && V2) {
8394 // Shuffle vector and tree node.
8395 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8396 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8397 CommonVF = std::max(VF, E1->getVectorFactor());
8398 assert(all_of(Mask,
8399 [=](int Idx) {
8400 return Idx < 2 * static_cast<int>(CommonVF);
8401 }) &&
8402 "All elements in mask must be less than 2 * CommonVF.");
8403 if (E1->Scalars.size() == VF && VF != CommonVF) {
8404 SmallVector<int> E1Mask = E1->getCommonMask();
8405 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8406 for (int &Idx : CommonMask) {
8407 if (Idx == PoisonMaskElem)
8408 continue;
8409 if (Idx >= static_cast<int>(CommonVF))
8410 Idx = E1Mask[Idx - CommonVF] + VF;
8411 else
8412 Idx = E1Mask[Idx];
8413 }
8414 CommonVF = VF;
8415 }
8416       V1 = Constant::getNullValue(
8417           FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
8418 V2 = getAllOnesValue(
8419 *R.DL,
8420 FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
8421 } else {
8422 assert(V1 && V2 && "Expected both vectors.");
8423 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8424 CommonVF =
8425 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8426 assert(all_of(Mask,
8427 [=](int Idx) {
8428 return Idx < 2 * static_cast<int>(CommonVF);
8429 }) &&
8430 "All elements in mask must be less than 2 * CommonVF.");
8431 if (V1->getType() != V2->getType()) {
8432         V1 = Constant::getNullValue(FixedVectorType::get(
8433             cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
8434 V2 = getAllOnesValue(
8435 *R.DL, FixedVectorType::get(
8436 cast<FixedVectorType>(V1->getType())->getElementType(),
8437 CommonVF));
8438 }
8439 }
8440     InVectors.front() = Constant::getNullValue(FixedVectorType::get(
8441         cast<FixedVectorType>(V1->getType())->getElementType(),
8442 CommonMask.size()));
8443 if (InVectors.size() == 2)
8444 InVectors.pop_back();
8445 return BaseShuffleAnalysis::createShuffle<InstructionCost>(
8446 V1, V2, CommonMask, Builder);
8447 }
8448
8449public:
8450   ShuffleCostEstimator(TargetTransformInfo &TTI,
8451                        ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8452 SmallPtrSetImpl<Value *> &CheckedExtracts)
8453 : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
8454 R(R), CheckedExtracts(CheckedExtracts) {}
8455 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8456 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8457 unsigned NumParts, bool &UseVecBaseAsInput) {
8458 UseVecBaseAsInput = false;
8459 if (Mask.empty())
8460 return nullptr;
8461 Value *VecBase = nullptr;
8462 ArrayRef<Value *> VL = E->Scalars;
8463 // If the resulting type is scalarized, do not adjust the cost.
8464 if (NumParts == VL.size())
8465 return nullptr;
8466 // Check if it can be considered reused if same extractelements were
8467 // vectorized already.
8468 bool PrevNodeFound = any_of(
8469 ArrayRef(R.VectorizableTree).take_front(E->Idx),
8470 [&](const std::unique_ptr<TreeEntry> &TE) {
8471 return ((!TE->isAltShuffle() &&
8472 TE->getOpcode() == Instruction::ExtractElement) ||
8473 TE->State == TreeEntry::NeedToGather) &&
8474 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8475 return VL.size() > Data.index() &&
8476 (Mask[Data.index()] == PoisonMaskElem ||
8477 isa<UndefValue>(VL[Data.index()]) ||
8478 Data.value() == VL[Data.index()]);
8479 });
8480 });
8481 SmallPtrSet<Value *, 4> UniqueBases;
8482 unsigned SliceSize = VL.size() / NumParts;
8483 for (unsigned Part = 0; Part < NumParts; ++Part) {
8484 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8485 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
8486 // Ignore non-extractelement scalars.
8487 if (isa<UndefValue>(V) ||
8488 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8489 continue;
8490 // If all users of instruction are going to be vectorized and this
8491 // instruction itself is not going to be vectorized, consider this
8492 // instruction as dead and remove its cost from the final cost of the
8493 // vectorized tree.
8494 // Also, avoid adjusting the cost for extractelements with multiple uses
8495 // in different graph entries.
8496 auto *EE = cast<ExtractElementInst>(V);
8497 VecBase = EE->getVectorOperand();
8498 UniqueBases.insert(VecBase);
8499 const TreeEntry *VE = R.getTreeEntry(V);
8500 if (!CheckedExtracts.insert(V).second ||
8501 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8502 any_of(EE->users(),
8503 [&](User *U) {
8504 return isa<GetElementPtrInst>(U) &&
8505 !R.areAllUsersVectorized(cast<Instruction>(U),
8506 &VectorizedVals);
8507 }) ||
8508 (VE && VE != E))
8509 continue;
8510 std::optional<unsigned> EEIdx = getExtractIndex(EE);
8511 if (!EEIdx)
8512 continue;
8513 unsigned Idx = *EEIdx;
8514 // Take credit for instruction that will become dead.
8515 if (EE->hasOneUse() || !PrevNodeFound) {
8516 Instruction *Ext = EE->user_back();
8517 if (isa<SExtInst, ZExtInst>(Ext) &&
8518 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8519 // Use getExtractWithExtendCost() to calculate the cost of
8520 // extractelement/ext pair.
8521 Cost -=
8522 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
8523 EE->getVectorOperandType(), Idx);
8524 // Add back the cost of s|zext which is subtracted separately.
8525             Cost += TTI.getCastInstrCost(
8526                 Ext->getOpcode(), Ext->getType(), EE->getType(),
8527 TTI::getCastContextHint(Ext), CostKind, Ext);
8528 continue;
8529 }
8530 }
8531 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
8532 CostKind, Idx);
8533 }
8534 }
8535     // Check that the gather of extractelements can be represented as just a
8536     // shuffle of a single vector or of two vectors the scalars are extracted
8537     // from, i.e. we found a bunch of extractelement instructions that must be
8538     // gathered into a vector and can be represented as a permutation of the
8539     // elements of one or two input vectors.
8540     // This is skipped if the same extractelements were already vectorized.
8541 if (!PrevNodeFound)
8542 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8543 InVectors.assign(1, E);
8544 CommonMask.assign(Mask.begin(), Mask.end());
8545 transformMaskAfterShuffle(CommonMask, CommonMask);
8546 SameNodesEstimated = false;
8547 if (NumParts != 1 && UniqueBases.size() != 1) {
8548 UseVecBaseAsInput = true;
8549 VecBase = Constant::getNullValue(
8550 FixedVectorType::get(VL.front()->getType(), CommonMask.size()));
8551 }
8552 return VecBase;
8553 }
8554 /// Checks if the specified entry \p E needs to be delayed because of its
8555 /// dependency nodes.
8556 std::optional<InstructionCost>
8557 needToDelay(const TreeEntry *,
8558               ArrayRef<SmallVector<const TreeEntry *>>) const {
8559     // No need to delay the cost estimation during analysis.
8560 return std::nullopt;
8561 }
8562 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
8563 if (&E1 == &E2) {
8564 assert(all_of(Mask,
8565 [&](int Idx) {
8566 return Idx < static_cast<int>(E1.getVectorFactor());
8567 }) &&
8568 "Expected single vector shuffle mask.");
8569 add(E1, Mask);
8570 return;
8571 }
8572 if (InVectors.empty()) {
8573 CommonMask.assign(Mask.begin(), Mask.end());
8574 InVectors.assign({&E1, &E2});
8575 return;
8576 }
8577 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8578 auto *MaskVecTy =
8579 FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
8580 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8581 if (NumParts == 0 || NumParts >= Mask.size())
8582 NumParts = 1;
8583 unsigned SliceSize = Mask.size() / NumParts;
8584 const auto *It =
8585 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8586 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8587 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8588 }
8589 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
8590 if (InVectors.empty()) {
8591 CommonMask.assign(Mask.begin(), Mask.end());
8592 InVectors.assign(1, &E1);
8593 return;
8594 }
8595 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8596 auto *MaskVecTy =
8597 FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
8598 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8599 if (NumParts == 0 || NumParts >= Mask.size())
8600 NumParts = 1;
8601 unsigned SliceSize = Mask.size() / NumParts;
8602 const auto *It =
8603 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8604 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8605 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
8606 if (!SameNodesEstimated && InVectors.size() == 1)
8607 InVectors.emplace_back(&E1);
8608 }
8609 /// Adds 2 input vectors and the mask for their shuffling.
8610 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
8611 // May come only for shuffling of 2 vectors with extractelements, already
8612 // handled in adjustExtracts.
8613 assert(InVectors.size() == 1 &&
8614 all_of(enumerate(CommonMask),
8615 [&](auto P) {
8616 if (P.value() == PoisonMaskElem)
8617 return Mask[P.index()] == PoisonMaskElem;
8618 auto *EI =
8619 cast<ExtractElementInst>(InVectors.front()
8620 .get<const TreeEntry *>()
8621 ->Scalars[P.index()]);
8622 return EI->getVectorOperand() == V1 ||
8623 EI->getVectorOperand() == V2;
8624 }) &&
8625 "Expected extractelement vectors.");
8626 }
8627 /// Adds another one input vector and the mask for the shuffling.
8628 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
8629 if (InVectors.empty()) {
8630 assert(CommonMask.empty() && !ForExtracts &&
8631 "Expected empty input mask/vectors.");
8632 CommonMask.assign(Mask.begin(), Mask.end());
8633 InVectors.assign(1, V1);
8634 return;
8635 }
8636 if (ForExtracts) {
8637 // No need to add vectors here, already handled them in adjustExtracts.
8638 assert(InVectors.size() == 1 &&
8639 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
8640 all_of(enumerate(CommonMask),
8641 [&](auto P) {
8642 Value *Scalar = InVectors.front()
8643 .get<const TreeEntry *>()
8644 ->Scalars[P.index()];
8645 if (P.value() == PoisonMaskElem)
8646 return P.value() == Mask[P.index()] ||
8647 isa<UndefValue>(Scalar);
8648 if (isa<Constant>(V1))
8649 return true;
8650 auto *EI = cast<ExtractElementInst>(Scalar);
8651 return EI->getVectorOperand() == V1;
8652 }) &&
8653 "Expected only tree entry for extractelement vectors.");
8654 return;
8655 }
8656 assert(!InVectors.empty() && !CommonMask.empty() &&
8657 "Expected only tree entries from extracts/reused buildvectors.");
8658 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8659 if (InVectors.size() == 2) {
8660 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
8661 transformMaskAfterShuffle(CommonMask, CommonMask);
8662 VF = std::max<unsigned>(VF, CommonMask.size());
8663 } else if (const auto *InTE =
8664 InVectors.front().dyn_cast<const TreeEntry *>()) {
8665 VF = std::max(VF, InTE->getVectorFactor());
8666 } else {
8667 VF = std::max(
8668 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
8669 ->getNumElements());
8670 }
8671 InVectors.push_back(V1);
8672 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8673 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8674 CommonMask[Idx] = Mask[Idx] + VF;
8675 }
8676 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
8677 Value *Root = nullptr) {
8678 Cost += getBuildVectorCost(VL, Root);
8679 if (!Root) {
8680 // FIXME: Need to find a way to avoid use of getNullValue here.
8681       SmallVector<Constant *> Vals;
8682       unsigned VF = VL.size();
8683 if (MaskVF != 0)
8684 VF = std::min(VF, MaskVF);
8685 for (Value *V : VL.take_front(VF)) {
8686 if (isa<UndefValue>(V)) {
8687 Vals.push_back(cast<Constant>(V));
8688 continue;
8689 }
8690 Vals.push_back(Constant::getNullValue(V->getType()));
8691 }
8692 return ConstantVector::get(Vals);
8693 }
8694     return ConstantVector::getSplat(
8695         ElementCount::getFixed(
8696             cast<FixedVectorType>(Root->getType())->getNumElements()),
8697 getAllOnesValue(*R.DL, VL.front()->getType()));
8698 }
8699   }
8700   /// Finalize emission of the shuffles.
8701   InstructionCost
8702 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
8703 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
8704 IsFinalized = true;
8705 if (Action) {
8706 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
8707 if (InVectors.size() == 2)
8708 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
8709 else
8710 Cost += createShuffle(Vec, nullptr, CommonMask);
8711 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8712 if (CommonMask[Idx] != PoisonMaskElem)
8713 CommonMask[Idx] = Idx;
8714 assert(VF > 0 &&
8715 "Expected vector length for the final value before action.");
8716 Value *V = Vec.get<Value *>();
8717 Action(V, CommonMask);
8718 InVectors.front() = V;
8719 }
8720 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
8721 if (CommonMask.empty()) {
8722 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
8723 return Cost;
8724 }
8725 return Cost +
8726 createShuffle(InVectors.front(),
8727 InVectors.size() == 2 ? InVectors.back() : nullptr,
8728 CommonMask);
8729 }
8730
8731   ~ShuffleCostEstimator() {
8732     assert((IsFinalized || CommonMask.empty()) &&
8733 "Shuffle construction must be finalized.");
8734 }
8735};
8736
8737const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
8738 unsigned Idx) const {
8739 Value *Op = E->getOperand(Idx).front();
8740 if (const TreeEntry *TE = getTreeEntry(Op)) {
8741 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8742 return EI.EdgeIdx == Idx && EI.UserTE == E;
8743 }) != TE->UserTreeIndices.end())
8744 return TE;
8745 auto MIt = MultiNodeScalars.find(Op);
8746 if (MIt != MultiNodeScalars.end()) {
8747 for (const TreeEntry *TE : MIt->second) {
8748 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8749 return EI.EdgeIdx == Idx && EI.UserTE == E;
8750 }) != TE->UserTreeIndices.end())
8751 return TE;
8752 }
8753 }
8754 }
8755 const auto *It =
8756 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8757 return TE->State == TreeEntry::NeedToGather &&
8758 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8759 return EI.EdgeIdx == Idx && EI.UserTE == E;
8760 }) != TE->UserTreeIndices.end();
8761 });
8762 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
8763 return It->get();
8764}
8765
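/// Returns the cast context hint used when querying TTI for the cast costs of
/// \p TE: GatherScatter for scatter/strided nodes, Normal or Reversed for
/// vectorized loads depending on their order, None otherwise.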
8766TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
8767 if (TE.State == TreeEntry::ScatterVectorize ||
8768 TE.State == TreeEntry::StridedVectorize)
8769     return TTI::CastContextHint::GatherScatter;
8770   if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
8771       !TE.isAltShuffle()) {
8772     if (TE.ReorderIndices.empty())
8773       return TTI::CastContextHint::Normal;
8774     SmallVector<int> Mask;
8775     inversePermutation(TE.ReorderIndices, Mask);
8776     if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
8777       return TTI::CastContextHint::Reversed;
8778   }
8779   return TTI::CastContextHint::None;
8780 }
8781
8782/// Builds the arguments types vector for the given call instruction with the
8783/// given \p ID for the specified vector factor.
8784 static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
8785                                                   const Intrinsic::ID ID,
8786 const unsigned VF,
8787 unsigned MinBW) {
8788 SmallVector<Type *> ArgTys;
8789 for (auto [Idx, Arg] : enumerate(CI->args())) {
8790     if (ID != Intrinsic::not_intrinsic) {
8791       if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
8792         ArgTys.push_back(Arg->getType());
8793 continue;
8794 }
8795 if (MinBW > 0) {
8796         ArgTys.push_back(FixedVectorType::get(
8797             IntegerType::get(CI->getContext(), MinBW), VF));
8798 continue;
8799 }
8800 }
8801 ArgTys.push_back(FixedVectorType::get(Arg->getType(), VF));
8802 }
8803 return ArgTys;
8804}
8805
8806 InstructionCost
8807 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
8808 SmallPtrSetImpl<Value *> &CheckedExtracts) {
8809 ArrayRef<Value *> VL = E->Scalars;
8810
8811 Type *ScalarTy = VL[0]->getType();
8812 if (E->State != TreeEntry::NeedToGather) {
8813 if (auto *SI = dyn_cast<StoreInst>(VL[0]))
8814 ScalarTy = SI->getValueOperand()->getType();
8815 else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
8816 ScalarTy = CI->getOperand(0)->getType();
8817 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
8818 ScalarTy = IE->getOperand(1)->getType();
8819 }
8820 if (!isValidElementType(ScalarTy))
8821     return InstructionCost::getInvalid();
8822   auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
8823   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8824
8825 // If we have computed a smaller type for the expression, update VecTy so
8826 // that the costs will be accurate.
8827 auto It = MinBWs.find(E);
8828 Type *OrigScalarTy = ScalarTy;
8829 if (It != MinBWs.end()) {
8830 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
8831 VecTy = FixedVectorType::get(ScalarTy, VL.size());
8832 }
8833 unsigned EntryVF = E->getVectorFactor();
8834 auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
8835
8836 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
8837 if (E->State == TreeEntry::NeedToGather) {
8838 if (allConstant(VL))
8839 return 0;
8840 if (isa<InsertElementInst>(VL[0]))
8841       return InstructionCost::getInvalid();
8842     return processBuildVector<ShuffleCostEstimator, InstructionCost>(
8843 E, *TTI, VectorizedVals, *this, CheckedExtracts);
8844 }
8845 InstructionCost CommonCost = 0;
8846   SmallVector<int> Mask;
8847   bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
8848 if (!E->ReorderIndices.empty() &&
8849 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
8850 SmallVector<int> NewMask;
8851 if (E->getOpcode() == Instruction::Store) {
8852 // For stores the order is actually a mask.
8853 NewMask.resize(E->ReorderIndices.size());
8854 copy(E->ReorderIndices, NewMask.begin());
8855 } else {
8856 inversePermutation(E->ReorderIndices, NewMask);
8857 }
8858 ::addMask(Mask, NewMask);
8859 }
8860 if (NeedToShuffleReuses)
8861 ::addMask(Mask, E->ReuseShuffleIndices);
8862 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
8863 CommonCost =
8864 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
8865 assert((E->State == TreeEntry::Vectorize ||
8866 E->State == TreeEntry::ScatterVectorize ||
8867 E->State == TreeEntry::StridedVectorize) &&
8868 "Unhandled state");
8869 assert(E->getOpcode() &&
8870 ((allSameType(VL) && allSameBlock(VL)) ||
8871 (E->getOpcode() == Instruction::GetElementPtr &&
8872 E->getMainOp()->getType()->isPointerTy())) &&
8873 "Invalid VL");
8874 Instruction *VL0 = E->getMainOp();
8875 unsigned ShuffleOrOp =
8876 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
8877 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
8878 const unsigned Sz = UniqueValues.size();
8879 SmallBitVector UsedScalars(Sz, false);
8880 for (unsigned I = 0; I < Sz; ++I) {
8881 if (getTreeEntry(UniqueValues[I]) == E)
8882 continue;
8883 UsedScalars.set(I);
8884 }
8885 auto GetCastContextHint = [&](Value *V) {
8886 if (const TreeEntry *OpTE = getTreeEntry(V))
8887 return getCastContextHint(*OpTE);
8888 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
8889 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
8890       return TTI::CastContextHint::GatherScatter;
8891     return TTI::CastContextHint::None;
8892   };
8893 auto GetCostDiff =
8894 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
8895           function_ref<InstructionCost(InstructionCost)> VectorCost) {
8896         // Calculate the cost of this instruction.
8897 InstructionCost ScalarCost = 0;
8898 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
8899         // For some of the instructions there is no need to calculate the cost
8900         // for each particular instruction; we can use the cost of a single
8901         // instruction times the total number of scalar instructions.
8902 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
8903 } else {
8904 for (unsigned I = 0; I < Sz; ++I) {
8905 if (UsedScalars.test(I))
8906 continue;
8907 ScalarCost += ScalarEltCost(I);
8908 }
8909 }
8910
8911 InstructionCost VecCost = VectorCost(CommonCost);
8912 // Check if the current node must be resized, if the parent node is not
8913 // resized.
8914 if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
8915 const EdgeInfo &EI = E->UserTreeIndices.front();
8916 if ((EI.UserTE->getOpcode() != Instruction::Select ||
8917 EI.EdgeIdx != 0) &&
8918 It != MinBWs.end()) {
8919 auto UserBWIt = MinBWs.find(EI.UserTE);
8920 Type *UserScalarTy =
8921 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
8922 if (UserBWIt != MinBWs.end())
8923 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
8924 UserBWIt->second.first);
8925 if (ScalarTy != UserScalarTy) {
8926 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
8927 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
8928 unsigned VecOpcode;
8929 auto *UserVecTy =
8930 FixedVectorType::get(UserScalarTy, E->getVectorFactor());
8931 if (BWSz > SrcBWSz)
8932 VecOpcode = Instruction::Trunc;
8933 else
8934 VecOpcode =
8935 It->second.second ? Instruction::SExt : Instruction::ZExt;
8936 TTI::CastContextHint CCH = GetCastContextHint(VL0);
8937 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
8938 CostKind);
8939 }
8940 }
8941 }
8942 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
8943 ScalarCost, "Calculated costs for Tree"));
8944 return VecCost - ScalarCost;
8945 };
8946 // Calculate cost difference from vectorizing set of GEPs.
8947 // Negative value means vectorizing is profitable.
8948 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
8949 assert((E->State == TreeEntry::Vectorize ||
8950 E->State == TreeEntry::StridedVectorize) &&
8951 "Entry state expected to be Vectorize or StridedVectorize here.");
8952 InstructionCost ScalarCost = 0;
8953 InstructionCost VecCost = 0;
8954 std::tie(ScalarCost, VecCost) = getGEPCosts(
8955 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
8956 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
8957 "Calculated GEPs cost for Tree"));
8958
8959 return VecCost - ScalarCost;
8960 };
8961
8962 switch (ShuffleOrOp) {
8963 case Instruction::PHI: {
8964 // Count reused scalars.
8965 InstructionCost ScalarCost = 0;
8966     SmallPtrSet<const TreeEntry *, 4> CountedOps;
8967     for (Value *V : UniqueValues) {
8968 auto *PHI = dyn_cast<PHINode>(V);
8969 if (!PHI)
8970 continue;
8971
8972 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
8973 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
8974 Value *Op = PHI->getIncomingValue(I);
8975 Operands[I] = Op;
8976 }
8977 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
8978 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
8979 if (!OpTE->ReuseShuffleIndices.empty())
8980 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
8981 OpTE->Scalars.size());
8982 }
8983
8984 return CommonCost - ScalarCost;
8985 }
8986 case Instruction::ExtractValue:
8987 case Instruction::ExtractElement: {
8988 auto GetScalarCost = [&](unsigned Idx) {
8989 auto *I = cast<Instruction>(UniqueValues[Idx]);
8990 VectorType *SrcVecTy;
8991 if (ShuffleOrOp == Instruction::ExtractElement) {
8992 auto *EE = cast<ExtractElementInst>(I);
8993 SrcVecTy = EE->getVectorOperandType();
8994 } else {
8995 auto *EV = cast<ExtractValueInst>(I);
8996 Type *AggregateTy = EV->getAggregateOperand()->getType();
8997 unsigned NumElts;
8998 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
8999 NumElts = ATy->getNumElements();
9000 else
9001 NumElts = AggregateTy->getStructNumElements();
9002 SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts);
9003 }
9004 if (I->hasOneUse()) {
9005 Instruction *Ext = I->user_back();
9006 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9007 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9008 // Use getExtractWithExtendCost() to calculate the cost of
9009 // extractelement/ext pair.
9010           InstructionCost Cost = TTI->getExtractWithExtendCost(
9011               Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
9012           // Subtract the cost of s|zext which is subtracted separately.
9013           Cost -= TTI->getCastInstrCost(
9014               Ext->getOpcode(), Ext->getType(), I->getType(),
9015               TTI::getCastContextHint(Ext), CostKind, Ext);
9016           return Cost;
9017 }
9018 }
9019 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
9020                                      CostKind, *getExtractIndex(I));
9021     };
9022 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9023 return GetCostDiff(GetScalarCost, GetVectorCost);
9024 }
9025 case Instruction::InsertElement: {
9026 assert(E->ReuseShuffleIndices.empty() &&
9027 "Unique insertelements only are expected.");
9028 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
9029 unsigned const NumElts = SrcVecTy->getNumElements();
9030 unsigned const NumScalars = VL.size();
9031
9032 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
9033
9034 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9035 unsigned OffsetBeg = *getInsertIndex(VL.front());
9036 unsigned OffsetEnd = OffsetBeg;
9037 InsertMask[OffsetBeg] = 0;
9038 for (auto [I, V] : enumerate(VL.drop_front())) {
9039 unsigned Idx = *getInsertIndex(V);
9040 if (OffsetBeg > Idx)
9041 OffsetBeg = Idx;
9042 else if (OffsetEnd < Idx)
9043 OffsetEnd = Idx;
9044 InsertMask[Idx] = I + 1;
9045 }
9046 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
9047 if (NumOfParts > 0)
9048 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9049 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9050 VecScalarsSz;
9051 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9052 unsigned InsertVecSz = std::min<unsigned>(
9053 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
9054 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9055 bool IsWholeSubvector =
9056 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9057 // Check if we can safely insert a subvector. If it is not possible, just
9058 // generate a whole-sized vector and shuffle the source vector and the new
9059 // subvector.
9060 if (OffsetBeg + InsertVecSz > VecSz) {
9061 // Align OffsetBeg to generate correct mask.
9062 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
9063 InsertVecSz = VecSz;
9064 }
9065
9066 APInt DemandedElts = APInt::getZero(NumElts);
9067 // TODO: Add support for Instruction::InsertValue.
9068     SmallVector<int> Mask;
9069     if (!E->ReorderIndices.empty()) {
9070 inversePermutation(E->ReorderIndices, Mask);
9071 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
9072 } else {
9073 Mask.assign(VecSz, PoisonMaskElem);
9074 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9075 }
9076 bool IsIdentity = true;
9077 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9078 Mask.swap(PrevMask);
9079 for (unsigned I = 0; I < NumScalars; ++I) {
9080 unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
9081 DemandedElts.setBit(InsertIdx);
9082 IsIdentity &= InsertIdx - OffsetBeg == I;
9083 Mask[InsertIdx - OffsetBeg] = I;
9084 }
9085 assert(Offset < NumElts && "Failed to find vector index offset");
9086
9087 InstructionCost Cost = 0;
9088 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
9089 /*Insert*/ true, /*Extract*/ false,
9090 CostKind);
9091
9092 // First cost - resize to the actual vector size if this is not an identity
9093 // shuffle or the vector needs to be shifted.
9094 // Do not calculate the cost if the actual size is the register size and
9095 // we can merge this shuffle with the following SK_Select.
9096 auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz);
9097 if (!IsIdentity)
9098 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
9099 InsertVecTy, Mask);
9100 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9101 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9102 }));
9103 // Second cost - permutation with the subvector, if some elements are from
9104 // the initial vector or we are inserting a subvector.
9105 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9106 // subvector of ActualVecTy.
9107 SmallBitVector InMask =
9108 isUndefVector(FirstInsert->getOperand(0),
9109 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9110 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9111 if (InsertVecSz != VecSz) {
9112 auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz);
9113 Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
9114 std::nullopt, CostKind, OffsetBeg - Offset,
9115 InsertVecTy);
9116 } else {
9117 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9118 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
9119 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9120 I <= End; ++I)
9121 if (Mask[I] != PoisonMaskElem)
9122 Mask[I] = I + VecSz;
9123 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9124 Mask[I] =
9125 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
9126 Cost +=
9127 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
9128 }
9129 }
9130 return Cost;
9131 }
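// For the cast opcodes below the vector opcode may differ from the scalar
// one when minimum-bitwidth analysis (MinBWs) has demoted the source and/or
// destination type: equal bit widths degrade to a no-op bitcast, narrowing
// becomes a trunc, and widening becomes sext/zext depending on the recorded
// signedness.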
9132 case Instruction::ZExt:
9133 case Instruction::SExt:
9134 case Instruction::FPToUI:
9135 case Instruction::FPToSI:
9136 case Instruction::FPExt:
9137 case Instruction::PtrToInt:
9138 case Instruction::IntToPtr:
9139 case Instruction::SIToFP:
9140 case Instruction::UIToFP:
9141 case Instruction::Trunc:
9142 case Instruction::FPTrunc:
9143 case Instruction::BitCast: {
9144 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9145 Type *SrcScalarTy = VL0->getOperand(0)->getType();
9146 auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9147 unsigned Opcode = ShuffleOrOp;
9148 unsigned VecOpcode = Opcode;
9149 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9150 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9151 // Check if the values are candidates to demote.
9152 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
9153 if (SrcIt != MinBWs.end()) {
9154 SrcBWSz = SrcIt->second.first;
9155 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
9156 SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9157 }
9158 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9159 if (BWSz == SrcBWSz) {
9160 VecOpcode = Instruction::BitCast;
9161 } else if (BWSz < SrcBWSz) {
9162 VecOpcode = Instruction::Trunc;
9163 } else if (It != MinBWs.end()) {
9164 assert(BWSz > SrcBWSz && "Invalid cast!");
9165 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9166 } else if (SrcIt != MinBWs.end()) {
9167 assert(BWSz > SrcBWSz && "Invalid cast!");
9168 VecOpcode =
9169 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9170 }
9171 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9172 !SrcIt->second.second) {
9173 VecOpcode = Instruction::UIToFP;
9174 }
9175 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9176 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9177 return TTI->getCastInstrCost(Opcode, VL0->getType(),
9178 VL0->getOperand(0)->getType(),
9179 TTI::getCastContextHint(VI), CostKind, VI);
9180 };
9181 auto GetVectorCost = [=](InstructionCost CommonCost) {
9182 // Do not count cost here if minimum bitwidth is in effect and it is just
9183 // a bitcast (here it is just a noop).
9184 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9185 return CommonCost;
9186 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9187 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
9188 return CommonCost +
9189 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
9190 VecOpcode == Opcode ? VI : nullptr);
9191 };
9192 return GetCostDiff(GetScalarCost, GetVectorCost);
9193 }
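// The compare/select case below tracks a single (possibly swapped) predicate
// for the whole bundle and, for select bundles, also prices the equivalent
// min/max intrinsic via canConvertToMinOrMaxIntrinsic, taking whichever
// vector form is cheaper.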
9194 case Instruction::FCmp:
9195 case Instruction::ICmp:
9196 case Instruction::Select: {
9197 CmpInst::Predicate VecPred, SwappedVecPred;
9198 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
9199 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
9200 match(VL0, MatchCmp))
9201 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
9202 else
9203 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9204 ? CmpInst::BAD_FCMP_PREDICATE
9205 : CmpInst::BAD_ICMP_PREDICATE;
9206 auto GetScalarCost = [&](unsigned Idx) {
9207 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9208 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9209 ? CmpInst::BAD_FCMP_PREDICATE
9210 : CmpInst::BAD_ICMP_PREDICATE;
9211 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
9212 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
9213 !match(VI, MatchCmp)) ||
9214 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9215 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9216 ? CmpInst::BAD_FCMP_PREDICATE
9217 : CmpInst::BAD_ICMP_PREDICATE;
9218
9219 return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy,
9220 Builder.getInt1Ty(), CurrentPred, CostKind,
9221 VI);
9222 };
9223 auto GetVectorCost = [&](InstructionCost CommonCost) {
9224 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9225
9226 InstructionCost VecCost = TTI->getCmpSelInstrCost(
9227 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9228 // Check if it is possible and profitable to use min/max for selects
9229 // in VL.
9230 //
9231 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
9232 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
9233 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
9234 {VecTy, VecTy});
9235 InstructionCost IntrinsicCost =
9236 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9237 // If the selects are the only uses of the compares, they will be
9238 // dead and we can adjust the cost by removing their cost.
9239 if (IntrinsicAndUse.second)
9240 IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
9241 MaskTy, VecPred, CostKind);
9242 VecCost = std::min(VecCost, IntrinsicCost);
9243 }
9244 return VecCost + CommonCost;
9245 };
9246 return GetCostDiff(GetScalarCost, GetVectorCost);
9247 }
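// Binary and unary arithmetic below is priced with operand-value information
// from TTI::getOperandInfo so targets can report cheaper costs for constant
// or otherwise special operands.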
9248 case Instruction::FNeg:
9249 case Instruction::Add:
9250 case Instruction::FAdd:
9251 case Instruction::Sub:
9252 case Instruction::FSub:
9253 case Instruction::Mul:
9254 case Instruction::FMul:
9255 case Instruction::UDiv:
9256 case Instruction::SDiv:
9257 case Instruction::FDiv:
9258 case Instruction::URem:
9259 case Instruction::SRem:
9260 case Instruction::FRem:
9261 case Instruction::Shl:
9262 case Instruction::LShr:
9263 case Instruction::AShr:
9264 case Instruction::And:
9265 case Instruction::Or:
9266 case Instruction::Xor: {
9267 auto GetScalarCost = [&](unsigned Idx) {
9268 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9269 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9270 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
9271 TTI::OperandValueInfo Op2Info =
9272 TTI::getOperandInfo(VI->getOperand(OpIdx));
9273 SmallVector<const Value *> Operands(VI->operand_values());
9274 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
9275 Op1Info, Op2Info, Operands, VI);
9276 };
9277 auto GetVectorCost = [=](InstructionCost CommonCost) {
9278 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9279 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
9280 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
9281 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
9282 Op2Info, std::nullopt, nullptr, TLI) +
9283 CommonCost;
9284 };
9285 return GetCostDiff(GetScalarCost, GetVectorCost);
9286 }
9287 case Instruction::GetElementPtr: {
9288 return CommonCost + GetGEPCostDiff(VL, VL0);
9289 }
9290 case Instruction::Load: {
9291 auto GetScalarCost = [&](unsigned Idx) {
9292 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
9293 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
9294 VI->getAlign(), VI->getPointerAddressSpace(),
9295 CostKind, TTI::OperandValueInfo(), VI);
9296 };
9297 auto *LI0 = cast<LoadInst>(VL0);
9298 auto GetVectorCost = [&](InstructionCost CommonCost) {
9299 InstructionCost VecLdCost;
9300 if (E->State == TreeEntry::Vectorize) {
9301 VecLdCost = TTI->getMemoryOpCost(
9302 Instruction::Load, VecTy, LI0->getAlign(),
9303 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
9304 } else if (E->State == TreeEntry::StridedVectorize) {
9305 Align CommonAlignment =
9306 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9307 VecLdCost = TTI->getStridedMemoryOpCost(
9308 Instruction::Load, VecTy, LI0->getPointerOperand(),
9309 /*VariableMask=*/false, CommonAlignment, CostKind);
9310 } else {
9311 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9312 Align CommonAlignment =
9313 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9314 VecLdCost = TTI->getGatherScatterOpCost(
9315 Instruction::Load, VecTy, LI0->getPointerOperand(),
9316 /*VariableMask=*/false, CommonAlignment, CostKind);
9317 }
9318 return VecLdCost + CommonCost;
9319 };
9320
9321 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9322 // If this node generates a masked gather load, then it is not a terminal
9323 // node; hence the address operand cost is estimated separately.
9324 if (E->State == TreeEntry::ScatterVectorize)
9325 return Cost;
9326
9327 // Estimate cost of GEPs since this tree node is a terminator.
9328 SmallVector<Value *> PointerOps(VL.size());
9329 for (auto [I, V] : enumerate(VL))
9330 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
9331 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9332 }
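// The Store case mirrors the Load case: scalar store costs versus one vector
// store, with the pointer operands collected in ReorderIndices order so the
// GEP cost diff is computed against the base store of the bundle.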
9333 case Instruction::Store: {
9334 bool IsReorder = !E->ReorderIndices.empty();
9335 auto GetScalarCost = [=](unsigned Idx) {
9336 auto *VI = cast<StoreInst>(VL[Idx]);
9337 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
9338 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
9339 VI->getAlign(), VI->getPointerAddressSpace(),
9340 CostKind, OpInfo, VI);
9341 };
9342 auto *BaseSI =
9343 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9344 auto GetVectorCost = [=](InstructionCost CommonCost) {
9345 // We know that we can merge the stores. Calculate the cost.
9346 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
9347 return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
9348 BaseSI->getPointerAddressSpace(), CostKind,
9349 OpInfo) +
9350 CommonCost;
9351 };
9352 SmallVector<Value *> PointerOps(VL.size());
9353 for (auto [I, V] : enumerate(VL)) {
9354 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9355 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
9356 }
9357
9358 return GetCostDiff(GetScalarCost, GetVectorCost) +
9359 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9360 }
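// Calls are priced either as a vector intrinsic or as a vector library call;
// getVectorCallCosts returns both variants and the cheaper one is used.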
9361 case Instruction::Call: {
9362 auto GetScalarCost = [&](unsigned Idx) {
9363 auto *CI = cast<CallInst>(UniqueValues[Idx]);
9364 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9365 if (ID != Intrinsic::not_intrinsic) {
9366 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9367 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9368 }
9369 return TTI->getCallInstrCost(CI->getCalledFunction(),
9370 CI->getFunctionType()->getReturnType(),
9371 CI->getFunctionType()->params(), CostKind);
9372 };
9373 auto GetVectorCost = [=](InstructionCost CommonCost) {
9374 auto *CI = cast<CallInst>(VL0);
9376 SmallVector<Type *> ArgTys =
9378 It != MinBWs.end() ? It->second.first : 0);
9379 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9380 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9381 };
9382 return GetCostDiff(GetScalarCost, GetVectorCost);
9383 }
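// The ShuffleVector (alternate opcode) case prices two vector instructions
// (main and alternate opcode) plus the blending shuffle built by
// buildAltOpShuffleMask, unless an earlier node with identical operands (a
// "diamond match") already provides the vector operands. If the target
// reports the opcode pair as a legal alternate instruction, the cheaper of
// the two estimates is returned.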
9384 case Instruction::ShuffleVector: {
9385 assert(E->isAltShuffle() &&
9386 ((Instruction::isBinaryOp(E->getOpcode()) &&
9387 Instruction::isBinaryOp(E->getAltOpcode())) ||
9388 (Instruction::isCast(E->getOpcode()) &&
9389 Instruction::isCast(E->getAltOpcode())) ||
9390 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9391 "Invalid Shuffle Vector Operand");
9392 // Try to find the previous shuffle node with the same operands and same
9393 // main/alternate ops.
9394 auto TryFindNodeWithEqualOperands = [=]() {
9395 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9396 if (TE.get() == E)
9397 break;
9398 if (TE->isAltShuffle() &&
9399 ((TE->getOpcode() == E->getOpcode() &&
9400 TE->getAltOpcode() == E->getAltOpcode()) ||
9401 (TE->getOpcode() == E->getAltOpcode() &&
9402 TE->getAltOpcode() == E->getOpcode())) &&
9403 TE->hasEqualOperands(*E))
9404 return true;
9405 }
9406 return false;
9407 };
9408 auto GetScalarCost = [&](unsigned Idx) {
9409 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9410 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9411 (void)E;
9412 return TTI->getInstructionCost(VI, CostKind);
9413 };
9414 // Need to clear CommonCost since the final shuffle cost is included in
9415 // the vector cost.
9416 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9417 // VecCost is equal to sum of the cost of creating 2 vectors
9418 // and the cost of creating shuffle.
9419 InstructionCost VecCost = 0;
9420 if (TryFindNodeWithEqualOperands()) {
9421 LLVM_DEBUG({
9422 dbgs() << "SLP: diamond match for alternate node found.\n";
9423 E->dump();
9424 });
9425 // No need to add new vector costs here since we're going to reuse
9426 // the same main/alternate vector ops, just with different shuffling.
9427 } else if (Instruction::isBinaryOp(E->getOpcode())) {
9428 VecCost =
9429 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
9430 VecCost +=
9431 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
9432 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9433 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9434 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9435 CI0->getPredicate(), CostKind, VL0);
9436 VecCost += TTIRef.getCmpSelInstrCost(
9437 E->getOpcode(), VecTy, MaskTy,
9438 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
9439 E->getAltOp());
9440 } else {
9441 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9442 auto *SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9443 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9444 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9445 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9446 unsigned SrcBWSz =
9447 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9448 if (SrcIt != MinBWs.end()) {
9449 SrcBWSz = SrcIt->second.first;
9450 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
9451 SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9452 }
9453 if (BWSz <= SrcBWSz) {
9454 if (BWSz < SrcBWSz)
9455 VecCost =
9456 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9457 TTI::CastContextHint::None, CostKind);
9458 LLVM_DEBUG({
9459 dbgs()
9460 << "SLP: alternate extension, which should be truncated.\n";
9461 E->dump();
9462 });
9463 return VecCost;
9464 }
9465 }
9466 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9467 TTI::CastContextHint::None, CostKind);
9468 VecCost +=
9469 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9470 TTI::CastContextHint::None, CostKind);
9471 }
9472 SmallVector<int> Mask;
9473 E->buildAltOpShuffleMask(
9474 [E](Instruction *I) {
9475 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9476 return I->getOpcode() == E->getAltOpcode();
9477 },
9478 Mask);
9480 FinalVecTy, Mask);
9481 // Patterns like [fadd,fsub] can be combined into a single instruction
9482 // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9483 // need to take into account their order when looking for the most used
9484 // order.
9485 unsigned Opcode0 = E->getOpcode();
9486 unsigned Opcode1 = E->getAltOpcode();
9487 // The opcode mask selects between the two opcodes.
9488 SmallBitVector OpcodeMask(E->Scalars.size(), false);
9489 for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
9490 if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
9491 OpcodeMask.set(Lane);
9492 // If this pattern is supported by the target then we consider the
9493 // order.
9494 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9495 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9496 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9497 return AltVecCost < VecCost ? AltVecCost : VecCost;
9498 }
9499 // TODO: Check the reverse order too.
9500 return VecCost;
9501 };
9502 return GetCostDiff(GetScalarCost, GetVectorCost);
9503 }
9504 default:
9505 llvm_unreachable("Unknown instruction");
9506 }
9507}
9508
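// A "tiny" tree (smaller than MinTreeSize) is only worth vectorizing in a few
// shapes. isFullyVectorizableTinyTree below accepts a single vectorizable
// node, a vectorizable root whose second node is a cheap gather (constants,
// splats, shuffles of extractelements, or a gathered load), and similar
// reduction-only forms.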
9509bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9510 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
9511 << VectorizableTree.size() << " is fully vectorizable.\n");
9512
9513 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
9514 SmallVector<int> Mask;
9515 return TE->State == TreeEntry::NeedToGather &&
9516 !any_of(TE->Scalars,
9517 [this](Value *V) { return EphValues.contains(V); }) &&
9518 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
9519 TE->Scalars.size() < Limit ||
9520 ((TE->getOpcode() == Instruction::ExtractElement ||
9521 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
9522 isFixedVectorShuffle(TE->Scalars, Mask)) ||
9523 (TE->State == TreeEntry::NeedToGather &&
9524 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
9525 };
9526
9527 // We only handle trees of heights 1 and 2.
9528 if (VectorizableTree.size() == 1 &&
9529 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9530 (ForReduction &&
9531 AreVectorizableGathers(VectorizableTree[0].get(),
9532 VectorizableTree[0]->Scalars.size()) &&
9533 VectorizableTree[0]->getVectorFactor() > 2)))
9534 return true;
9535
9536 if (VectorizableTree.size() != 2)
9537 return false;
9538
9539 // Handle splat and all-constants stores. Also try to vectorize tiny trees
9540 // whose second node is a gather with fewer scalar operands than the initial
9541 // tree element (it may be profitable to shuffle the second gather), or whose
9542 // scalars are extractelements that form a shuffle.
9544 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9545 AreVectorizableGathers(VectorizableTree[1].get(),
9546 VectorizableTree[0]->Scalars.size()))
9547 return true;
9548
9549 // Gathering cost would be too much for tiny trees.
9550 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9551 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9552 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9553 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9554 return false;
9555
9556 return true;
9557}
9558
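// The helpers below recognize a scalar "load combine" pattern: an or/shl chain
// over zero-extended loads that the backend can usually fold into a single
// wide load (e.g., roughly, or(zext(load i8), shl(zext(load i8), 8), ...)).
// Vectorizing such a tree tends to be a pessimization, so callers bail out
// when the combined load width is a legal integer type.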
9559static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
9560 TargetTransformInfo *TTI,
9561 bool MustMatchOrInst) {
9562 // Look past the root to find a source value. Arbitrarily follow the
9563 // path through operand 0 of any 'or'. Also, peek through optional
9564 // shift-left-by-multiple-of-8-bits.
9565 Value *ZextLoad = Root;
9566 const APInt *ShAmtC;
9567 bool FoundOr = false;
9568 while (!isa<ConstantExpr>(ZextLoad) &&
9569 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
9570 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
9571 ShAmtC->urem(8) == 0))) {
9572 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9573 ZextLoad = BinOp->getOperand(0);
9574 if (BinOp->getOpcode() == Instruction::Or)
9575 FoundOr = true;
9576 }
9577 // Check if the input is an extended load of the required or/shift expression.
9578 Value *Load;
9579 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9580 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
9581 return false;
9582
9583 // Require that the total load bit width is a legal integer type.
9584 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
9585 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
9586 Type *SrcTy = Load->getType();
9587 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
9588 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
9589 return false;
9590
9591 // Everything matched - assume that we can fold the whole sequence using
9592 // load combining.
9593 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
9594 << *(cast<Instruction>(Root)) << "\n");
9595
9596 return true;
9597}
9598
9599bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
9600 if (RdxKind != RecurKind::Or)
9601 return false;
9602
9603 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9604 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9605 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
9606 /* MatchOr */ false);
9607}
9608
9609bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
9610 // Peek through a final sequence of stores and check if all operations are
9611 // likely to be load-combined.
9612 unsigned NumElts = Stores.size();
9613 for (Value *Scalar : Stores) {
9614 Value *X;
9615 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
9616 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
9617 return false;
9618 }
9619 return true;
9620}
9621
9622bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
9623 // No need to vectorize inserts of gathered values.
9624 if (VectorizableTree.size() == 2 &&
9625 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9626 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9627 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9628 !(isSplat(VectorizableTree[1]->Scalars) ||
9629 allConstant(VectorizableTree[1]->Scalars))))
9630 return true;
9631
9632 // If the graph includes only PHI nodes and gathers, it is definitely not
9633 // profitable for the vectorization; we can skip it if the cost threshold is
9634 // the default. The cost of vectorized PHI nodes is almost always 0 plus the
9635 // cost of gathers/buildvectors.
9636 constexpr int Limit = 4;
9637 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
9638 !VectorizableTree.empty() &&
9639 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9640 return (TE->State == TreeEntry::NeedToGather &&
9641 TE->getOpcode() != Instruction::ExtractElement &&
9642 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9643 TE->getOpcode() == Instruction::PHI;
9644 }))
9645 return true;
9646
9647 // We can vectorize the tree if its size is greater than or equal to the
9648 // minimum size specified by the MinTreeSize command line option.
9649 if (VectorizableTree.size() >= MinTreeSize)
9650 return false;
9651
9652 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
9653 // can vectorize it if we can prove it fully vectorizable.
9654 if (isFullyVectorizableTinyTree(ForReduction))
9655 return false;
9656
9657 // Check if any of the gather node forms an insertelement buildvector
9658 // somewhere.
9659 bool IsAllowedSingleBVNode =
9660 VectorizableTree.size() > 1 ||
9661 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9662 !VectorizableTree.front()->isAltShuffle() &&
9663 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9664 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9665 allSameBlock(VectorizableTree.front()->Scalars));
9666 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9667 return TE->State == TreeEntry::NeedToGather &&
9668 all_of(TE->Scalars, [&](Value *V) {
9669 return isa<ExtractElementInst, UndefValue>(V) ||
9670 (IsAllowedSingleBVNode &&
9671 !V->hasNUsesOrMore(UsesLimit) &&
9672 any_of(V->users(), IsaPred<InsertElementInst>));
9673 });
9674 }))
9675 return false;
9676
9677 assert(VectorizableTree.empty()
9678 ? ExternalUses.empty()
9679 : true && "We shouldn't have any external users");
9680
9681 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
9682 // vectorizable.
9683 return true;
9684}
9685
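// getSpillCost below walks the vectorized scalars in dominance order and, for
// every call between two of them that is not a cheap intrinsic, charges the
// target's cost of keeping the currently live bundle values in vector
// registers across the call.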
9686InstructionCost BoUpSLP::getSpillCost() const {
9687 // Walk from the bottom of the tree to the top, tracking which values are
9688 // live. When we see a call instruction that is not part of our tree,
9689 // query TTI to see if there is a cost to keeping values live over it
9690 // (for example, if spills and fills are required).
9691 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9692 InstructionCost Cost = 0;
9693
9694 SmallPtrSet<Instruction *, 4> LiveValues;
9695 Instruction *PrevInst = nullptr;
9696
9697 // The entries in VectorizableTree are not necessarily ordered by their
9698 // position in basic blocks. Collect them and order them by dominance so later
9699 // instructions are guaranteed to be visited first. For instructions in
9700 // different basic blocks, we only scan to the beginning of the block, so
9701 // their order does not matter, as long as all instructions in a basic block
9702 // are grouped together. Using dominance ensures a deterministic order.
9703 SmallVector<Instruction *, 16> OrderedScalars;
9704 for (const auto &TEPtr : VectorizableTree) {
9705 if (TEPtr->State != TreeEntry::Vectorize)
9706 continue;
9707 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9708 if (!Inst)
9709 continue;
9710 OrderedScalars.push_back(Inst);
9711 }
9712 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
9713 auto *NodeA = DT->getNode(A->getParent());
9714 auto *NodeB = DT->getNode(B->getParent());
9715 assert(NodeA && "Should only process reachable instructions");
9716 assert(NodeB && "Should only process reachable instructions");
9717 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9718 "Different nodes should have different DFS numbers");
9719 if (NodeA != NodeB)
9720 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9721 return B->comesBefore(A);
9722 });
9723
9724 for (Instruction *Inst : OrderedScalars) {
9725 if (!PrevInst) {
9726 PrevInst = Inst;
9727 continue;
9728 }
9729
9730 // Update LiveValues.
9731 LiveValues.erase(PrevInst);
9732 for (auto &J : PrevInst->operands()) {
9733 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
9734 LiveValues.insert(cast<Instruction>(&*J));
9735 }
9736
9737 LLVM_DEBUG({
9738 dbgs() << "SLP: #LV: " << LiveValues.size();
9739 for (auto *X : LiveValues)
9740 dbgs() << " " << X->getName();
9741 dbgs() << ", Looking at ";
9742 Inst->dump();
9743 });
9744
9745 // Now find the sequence of instructions between PrevInst and Inst.
9746 unsigned NumCalls = 0;
9747 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
9748 PrevInstIt =
9749 PrevInst->getIterator().getReverse();
9750 while (InstIt != PrevInstIt) {
9751 if (PrevInstIt == PrevInst->getParent()->rend()) {
9752 PrevInstIt = Inst->getParent()->rbegin();
9753 continue;
9754 }
9755
9756 auto NoCallIntrinsic = [this](Instruction *I) {
9757 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
9758 if (II->isAssumeLikeIntrinsic())
9759 return true;
9760 FastMathFlags FMF;
9761 SmallVector<Type *, 4> Tys;
9762 for (auto &ArgOp : II->args())
9763 Tys.push_back(ArgOp->getType());
9764 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
9765 FMF = FPMO->getFastMathFlags();
9766 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
9767 FMF);
9768 InstructionCost IntrCost =
9769 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
9770 InstructionCost CallCost = TTI->getCallInstrCost(
9771 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
9772 if (IntrCost < CallCost)
9773 return true;
9774 }
9775 return false;
9776 };
9777
9778 // Debug information does not impact spill cost.
9779 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
9780 &*PrevInstIt != PrevInst)
9781 NumCalls++;
9782
9783 ++PrevInstIt;
9784 }
9785
9786 if (NumCalls) {
9787 SmallVector<Type *, 4> V;
9788 for (auto *II : LiveValues) {
9789 auto *ScalarTy = II->getType();
9790 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
9791 ScalarTy = VectorTy->getElementType();
9792 V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
9793 }
9794 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
9795 }
9796
9797 PrevInst = Inst;
9798 }
9799
9800 return Cost;
9801}
9802
9803/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
9804/// the buildvector sequence.
9805static bool isFirstInsertElement(const InsertElementInst *IE1,
9806 const InsertElementInst *IE2) {
9807 if (IE1 == IE2)
9808 return false;
9809 const auto *I1 = IE1;
9810 const auto *I2 = IE2;
9811 const InsertElementInst *PrevI1;
9812 const InsertElementInst *PrevI2;
9813 unsigned Idx1 = *getInsertIndex(IE1);
9814 unsigned Idx2 = *getInsertIndex(IE2);
9815 do {
9816 if (I2 == IE1)
9817 return true;
9818 if (I1 == IE2)
9819 return false;
9820 PrevI1 = I1;
9821 PrevI2 = I2;
9822 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
9823 getInsertIndex(I1).value_or(Idx2) != Idx2)
9824 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
9825 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
9826 getInsertIndex(I2).value_or(Idx1) != Idx1)
9827 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
9828 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
9829 llvm_unreachable("Two different buildvectors not expected.");
9830}
9831
9832namespace {
9833/// Returns incoming Value *, if the requested type is Value * too, or a default
9834/// value, otherwise.
9835struct ValueSelect {
9836 template <typename U>
9837 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
9838 return V;
9839 }
9840 template <typename U>
9841 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
9842 return U();
9843 }
9844};
9845} // namespace
9846
9847/// Does the analysis of the provided shuffle masks and performs the requested
9848/// actions on the vectors with the given shuffle masks. It tries to do it in
9849/// several steps.
9850/// 1. If the Base vector is not an undef vector, resize the very first mask to
9851/// have a common VF and perform the action for 2 input vectors (including the
9852/// non-undef Base). Other shuffle masks are combined with the result of the
9853/// first stage and processed as a shuffle of 2 elements.
9854/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
9855/// the action only for 1 vector with the given mask, if it is not the identity
9856/// mask.
9857/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
9858/// vectors, combining the masks properly between the steps.
9859template <typename T>
9860static T *performExtractsShuffleAction(
9861 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
9862 function_ref<unsigned(T *)> GetVF,
9863 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
9865 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
9866 SmallVector<int> Mask(ShuffleMask.begin()->second);
9867 auto VMIt = std::next(ShuffleMask.begin());
9868 T *Prev = nullptr;
9869 SmallBitVector UseMask =
9870 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
9871 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
9872 if (!IsBaseUndef.all()) {
9873 // Base is not undef, need to combine it with the next subvectors.
9874 std::pair<T *, bool> Res =
9875 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
9876 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
9877 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
9878 if (Mask[Idx] == PoisonMaskElem)
9879 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
9880 else
9881 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
9882 }
9883 auto *V = ValueSelect::get<T *>(Base);
9884 (void)V;
9885 assert((!V || GetVF(V) == Mask.size()) &&
9886 "Expected base vector of VF number of elements.");
9887 Prev = Action(Mask, {nullptr, Res.first});
9888 } else if (ShuffleMask.size() == 1) {
9889 // Base is undef and only 1 vector is shuffled - perform the action only for
9890 // a single vector, if the mask is not the identity mask.
9891 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
9892 /*ForSingleMask=*/true);
9893 if (Res.second)
9894 // Identity mask is found.
9895 Prev = Res.first;
9896 else
9897 Prev = Action(Mask, {ShuffleMask.begin()->first});
9898 } else {
9899 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
9900 // shuffles step by step, combining shuffle between the steps.
9901 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
9902 unsigned Vec2VF = GetVF(VMIt->first);
9903 if (Vec1VF == Vec2VF) {
9904 // No need to resize the input vectors since they are of the same size, we
9905 // can shuffle them directly.
9906 ArrayRef<int> SecMask = VMIt->second;
9907 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9908 if (SecMask[I] != PoisonMaskElem) {
9909 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9910 Mask[I] = SecMask[I] + Vec1VF;
9911 }
9912 }
9913 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
9914 } else {
9915 // Vectors of different sizes - resize and reshuffle.
9916 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
9917 /*ForSingleMask=*/false);
9918 std::pair<T *, bool> Res2 =
9919 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9920 ArrayRef<int> SecMask = VMIt->second;
9921 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9922 if (Mask[I] != PoisonMaskElem) {
9923 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9924 if (Res1.second)
9925 Mask[I] = I;
9926 } else if (SecMask[I] != PoisonMaskElem) {
9927 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9928 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
9929 }
9930 }
9931 Prev = Action(Mask, {Res1.first, Res2.first});
9932 }
9933 VMIt = std::next(VMIt);
9934 }
9935 bool IsBaseNotUndef = !IsBaseUndef.all();
9936 (void)IsBaseNotUndef;
9937 // Perform requested actions for the remaining masks/vectors.
9938 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
9939 // Shuffle other input vectors, if any.
9940 std::pair<T *, bool> Res =
9941 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9942 ArrayRef<int> SecMask = VMIt->second;
9943 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9944 if (SecMask[I] != PoisonMaskElem) {
9945 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
9946 "Multiple uses of scalars.");
9947 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
9948 } else if (Mask[I] != PoisonMaskElem) {
9949 Mask[I] = I;
9950 }
9951 }
9952 Prev = Action(Mask, {Prev, Res.first});
9953 }
9954 return Prev;
9955}
9956
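// getTreeCost below sums the per-entry costs, then adds the cost of extracting
// externally used scalars (or re-extending them when the tree was narrowed via
// MinBWs), the spill cost across calls, and the shuffles feeding external
// insertelement users, crediting back the scalarization overhead those users
// already cover.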
9957InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
9958 InstructionCost Cost = 0;
9959 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
9960 << VectorizableTree.size() << ".\n");
9961
9962 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
9963
9964 SmallPtrSet<Value *, 4> CheckedExtracts;
9965 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
9966 TreeEntry &TE = *VectorizableTree[I];
9967 if (TE.State == TreeEntry::NeedToGather) {
9968 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
9969 E && E->getVectorFactor() == TE.getVectorFactor() &&
9970 E->isSame(TE.Scalars)) {
9971 // Some gather nodes might be absolutely the same as some vectorizable
9972 // nodes after reordering; we need to handle that case here.
9973 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
9974 << shortBundleName(TE.Scalars) << ".\n"
9975 << "SLP: Current total cost = " << Cost << "\n");
9976 continue;
9977 }
9978 }
9979
9980 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
9981 Cost += C;
9982 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
9983 << shortBundleName(TE.Scalars) << ".\n"
9984 << "SLP: Current total cost = " << Cost << "\n");
9985 }
9986
9987 SmallPtrSet<Value *, 16> ExtractCostCalculated;
9988 InstructionCost ExtractCost = 0;
9989 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
9990 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
9991 SmallVector<APInt> DemandedElts;
9992 SmallDenseSet<Value *, 4> UsedInserts;
9994 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
9995 for (ExternalUser &EU : ExternalUses) {
9996 // We only add extract cost once for the same scalar.
9997 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
9998 !ExtractCostCalculated.insert(EU.Scalar).second)
9999 continue;
10000
10001 // Uses by ephemeral values are free (because the ephemeral value will be
10002 // removed prior to code generation, and so the extraction will be
10003 // removed as well).
10004 if (EphValues.count(EU.User))
10005 continue;
10006
10007 // No extract cost for vector "scalar"
10008 if (isa<FixedVectorType>(EU.Scalar->getType()))
10009 continue;
10010
10011 // If the found user is an insertelement, do not calculate the extract cost
10012 // but try to detect it as a final shuffled/identity match.
10013 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
10014 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
10015 if (!UsedInserts.insert(VU).second)
10016 continue;
10017 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
10018 if (InsertIdx) {
10019 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10020 auto *It = find_if(
10021 FirstUsers,
10022 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10023 return areTwoInsertFromSameBuildVector(
10024 VU, cast<InsertElementInst>(Pair.first),
10025 [this](InsertElementInst *II) -> Value * {
10026 Value *Op0 = II->getOperand(0);
10027 if (getTreeEntry(II) && !getTreeEntry(Op0))
10028 return nullptr;
10029 return Op0;
10030 });
10031 });
10032 int VecId = -1;
10033 if (It == FirstUsers.end()) {
10034 (void)ShuffleMasks.emplace_back();
10035 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10036 if (Mask.empty())
10037 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10038 // Find the insertvector, vectorized in tree, if any.
10039 Value *Base = VU;
10040 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10041 if (IEBase != EU.User &&
10042 (!IEBase->hasOneUse() ||
10043 getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10044 break;
10045 // Build the mask for the vectorized insertelement instructions.
10046 if (const TreeEntry *E = getTreeEntry(IEBase)) {
10047 VU = IEBase;
10048 do {
10049 IEBase = cast<InsertElementInst>(Base);
10050 int Idx = *getInsertIndex(IEBase);
10051 assert(Mask[Idx] == PoisonMaskElem &&
10052 "InsertElementInstruction used already.");
10053 Mask[Idx] = Idx;
10054 Base = IEBase->getOperand(0);
10055 } while (E == getTreeEntry(Base));
10056 break;
10057 }
10058 Base = cast<InsertElementInst>(Base)->getOperand(0);
10059 }
10060 FirstUsers.emplace_back(VU, ScalarTE);
10061 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10062 VecId = FirstUsers.size() - 1;
10063 auto It = MinBWs.find(ScalarTE);
10064 if (It != MinBWs.end() &&
10065 VectorCasts
10066 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10067 .second) {
10068 unsigned BWSz = It->second.first;
10069 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10070 unsigned VecOpcode;
10071 if (DstBWSz < BWSz)
10072 VecOpcode = Instruction::Trunc;
10073 else
10074 VecOpcode =
10075 It->second.second ? Instruction::SExt : Instruction::ZExt;
10078 VecOpcode, FTy,
10080 IntegerType::get(FTy->getContext(), BWSz),
10081 FTy->getNumElements()),
10083 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10084 << " for extending externally used vector with "
10085 "non-equal minimum bitwidth.\n");
10086 Cost += C;
10087 }
10088 } else {
10089 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10090 It->first = VU;
10091 VecId = std::distance(FirstUsers.begin(), It);
10092 }
10093 int InIdx = *InsertIdx;
10094 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10095 if (Mask.empty())
10096 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10097 Mask[InIdx] = EU.Lane;
10098 DemandedElts[VecId].setBit(InIdx);
10099 continue;
10100 }
10101 }
10102 }
10103 // Leave the GEPs as is; they are free in most cases and it is better to
10104 // keep them as GEPs.
10105 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10106 if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10107 if (!ValueToExtUses) {
10108 ValueToExtUses.emplace();
10109 for_each(enumerate(ExternalUses), [&](const auto &P) {
10110 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10111 });
10112 }
10113 // Can use original GEP, if no operands vectorized or they are marked as
10114 // externally used already.
10115 bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10116 if (!getTreeEntry(V))
10117 return true;
10118 auto It = ValueToExtUses->find(V);
10119 if (It != ValueToExtUses->end()) {
10120 // Replace all uses to avoid compiler crash.
10121 ExternalUses[It->second].User = nullptr;
10122 return true;
10123 }
10124 return false;
10125 });
10126 if (CanBeUsedAsGEP) {
10127 ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10128 ExternalUsesAsGEPs.insert(EU.Scalar);
10129 continue;
10130 }
10131 }
10132
10133 // If we plan to rewrite the tree in a smaller type, we will need to sign
10134 // extend the extracted value back to the original type. Here, we account
10135 // for the extract and the added cost of the sign extend if needed.
10136 auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
10137 auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10138 if (It != MinBWs.end()) {
10139 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10140 unsigned Extend =
10141 It->second.second ? Instruction::SExt : Instruction::ZExt;
10142 VecTy = FixedVectorType::get(MinTy, BundleWidth);
10143 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10144 VecTy, EU.Lane);
10145 } else {
10146 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10147 CostKind, EU.Lane);
10148 }
10149 }
10150 // Add reduced value cost, if resized.
10151 if (!VectorizedVals.empty()) {
10152 const TreeEntry &Root = *VectorizableTree.front().get();
10153 auto BWIt = MinBWs.find(&Root);
10154 if (BWIt != MinBWs.end()) {
10155 Type *DstTy = Root.Scalars.front()->getType();
10156 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10157 unsigned SrcSz =
10158 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10159 if (OriginalSz != SrcSz) {
10160 unsigned Opcode = Instruction::Trunc;
10161 if (OriginalSz > SrcSz)
10162 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10163 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10164 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10167 }
10168 }
10169 }
10170
10171 InstructionCost SpillCost = getSpillCost();
10172 Cost += SpillCost + ExtractCost;
10173 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10174 bool) {
10175 InstructionCost C = 0;
10176 unsigned VF = Mask.size();
10177 unsigned VecVF = TE->getVectorFactor();
10178 if (VF != VecVF &&
10179 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10180 !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
10181 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10182 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10183 OrigMask.begin());
10184 C = TTI->getShuffleCost(
10185 TTI::SK_PermuteSingleSrc,
10186 FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
10187 LLVM_DEBUG(
10188 dbgs() << "SLP: Adding cost " << C
10189 << " for final shuffle of insertelement external users.\n";
10190 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10191 Cost += C;
10192 return std::make_pair(TE, true);
10193 }
10194 return std::make_pair(TE, false);
10195 };
10196 // Calculate the cost of the reshuffled vectors, if any.
10197 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10198 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10199 auto Vector = ShuffleMasks[I].takeVector();
10200 unsigned VF = 0;
10201 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10202 ArrayRef<const TreeEntry *> TEs) {
10203 assert((TEs.size() == 1 || TEs.size() == 2) &&
10204 "Expected exactly 1 or 2 tree entries.");
10205 if (TEs.size() == 1) {
10206 if (VF == 0)
10207 VF = TEs.front()->getVectorFactor();
10208 auto *FTy =
10209 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10210 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
10211 !all_of(enumerate(Mask), [=](const auto &Data) {
10212 return Data.value() == PoisonMaskElem ||
10213 (Data.index() < VF &&
10214 static_cast<int>(Data.index()) == Data.value());
10215 })) {
10218 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10219 << " for final shuffle of insertelement "
10220 "external users.\n";
10221 TEs.front()->dump();
10222 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10223 Cost += C;
10224 }
10225 } else {
10226 if (VF == 0) {
10227 if (TEs.front() &&
10228 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10229 VF = TEs.front()->getVectorFactor();
10230 else
10231 VF = Mask.size();
10232 }
10233 auto *FTy =
10234 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10237 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10238 << " for final shuffle of vector node and external "
10239 "insertelement users.\n";
10240 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10241 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10242 Cost += C;
10243 }
10244 VF = Mask.size();
10245 return TEs.back();
10246 };
10247 (void)performExtractsShuffleAction<const TreeEntry>(
10248 MutableArrayRef(Vector.data(), Vector.size()), Base,
10249 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10250 EstimateShufflesCost);
10251 InstructionCost InsertCost = TTI->getScalarizationOverhead(
10252 cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10253 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
10254 Cost -= InsertCost;
10255 }
10256
10257 // Add the cost for reduced value resize (if required).
10258 if (ReductionBitWidth != 0) {
10259 assert(UserIgnoreList && "Expected reduction tree.");
10260 const TreeEntry &E = *VectorizableTree.front().get();
10261 auto It = MinBWs.find(&E);
10262 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10263 unsigned SrcSize = It->second.first;
10264 unsigned DstSize = ReductionBitWidth;
10265 unsigned Opcode = Instruction::Trunc;
10266 if (SrcSize < DstSize)
10267 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10268 auto *SrcVecTy =
10269 FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor());
10270 auto *DstVecTy =
10271 FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor());
10272 TTI::CastContextHint CCH = getCastContextHint(E);
10273 InstructionCost CastCost;
10274 switch (E.getOpcode()) {
10275 case Instruction::SExt:
10276 case Instruction::ZExt:
10277 case Instruction::Trunc: {
10278 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10279 CCH = getCastContextHint(*OpTE);
10280 break;
10281 }
10282 default:
10283 break;
10284 }
10285 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
10287 Cost += CastCost;
10288 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10289 << " for final resize for reduction from " << SrcVecTy
10290 << " to " << DstVecTy << "\n";
10291 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10292 }
10293 }
10294
10295#ifndef NDEBUG
10296 SmallString<256> Str;
10297 {
10298 raw_svector_ostream OS(Str);
10299 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10300 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10301 << "SLP: Total Cost = " << Cost << ".\n";
10302 }
10303 LLVM_DEBUG(dbgs() << Str);
10304 if (ViewSLPTree)
10305 ViewGraph(this, "SLP" + F->getName(), false, Str);
10306#endif
10307
10308 return Cost;
10309}
10310
10311/// Tries to find extractelement instructions with constant indices from a
10312/// fixed vector type and gather such instructions into a bunch, which is
10313/// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
10314/// attempt was successful, the matched scalars are replaced by poison values
10315/// in \p VL for future analysis.
10316std::optional<TTI::ShuffleKind>
10317BoUpSLP::tryToGatherSingleRegisterExtractElements(
10318 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10319 // Scan list of gathered scalars for extractelements that can be represented
10320 // as shuffles.
10322 SmallVector<int> UndefVectorExtracts;
10323 for (int I = 0, E = VL.size(); I < E; ++I) {
10324 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10325 if (!EI) {
10326 if (isa<UndefValue>(VL[I]))
10327 UndefVectorExtracts.push_back(I);
10328 continue;
10329 }
10330 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10331 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10332 continue;
10333 std::optional<unsigned> Idx = getExtractIndex(EI);
10334 // Undefined index.
10335 if (!Idx) {
10336 UndefVectorExtracts.push_back(I);
10337 continue;
10338 }
10339 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10340 ExtractMask.reset(*Idx);
10341 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
10342 UndefVectorExtracts.push_back(I);
10343 continue;
10344 }
10345 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10346 }
10347 // Sort the vector operands by the maximum number of uses in extractelements.
10349 for (const auto &Data : VectorOpToIdx)
10350 VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
10351 .push_back(Data.first);
10352 for (auto &Data : VFToVector) {
10353 stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
10354 return VectorOpToIdx.find(V1)->second.size() >
10355 VectorOpToIdx.find(V2)->second.size();
10356 });
10357 }
10358 // Find the best pair of the vectors with the same number of elements or a
10359 // single vector.
10360 const int UndefSz = UndefVectorExtracts.size();
10361 unsigned SingleMax = 0;
10362 Value *SingleVec = nullptr;
10363 unsigned PairMax = 0;
10364 std::pair<Value *, Value *> PairVec(nullptr, nullptr);
10365 for (auto &Data : VFToVector) {
10366 Value *V1 = Data.second.front();
10367 if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
10368 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10369 SingleVec = V1;
10370 }
10371 Value *V2 = nullptr;
10372 if (Data.second.size() > 1)
10373 V2 = *std::next(Data.second.begin());
10374 if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
10375 UndefSz) {
10376 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
10377 PairVec = std::make_pair(V1, V2);
10378 }
10379 }
10380 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10381 return std::nullopt;
10382 // Check if better to perform a shuffle of 2 vectors or just of a single
10383 // vector.
10384 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10385 SmallVector<Value *> GatheredExtracts(
10386 VL.size(), PoisonValue::get(VL.front()->getType()));
10387 if (SingleMax >= PairMax && SingleMax) {
10388 for (int Idx : VectorOpToIdx[SingleVec])
10389 std::swap(GatheredExtracts[Idx], VL[Idx]);
10390 } else {
10391 for (Value *V : {PairVec.first, PairVec.second})
10392 for (int Idx : VectorOpToIdx[V])
10393 std::swap(GatheredExtracts[Idx], VL[Idx]);
10394 }
10395 // Add extracts from undefs too.
10396 for (int Idx : UndefVectorExtracts)
10397 std::swap(GatheredExtracts[Idx], VL[Idx]);
10398 // Check that the gather of extractelements can be represented as just a
10399 // shuffle of one or two vectors the scalars are extracted from.
10400 std::optional<TTI::ShuffleKind> Res =
10401 isFixedVectorShuffle(GatheredExtracts, Mask);
10402 if (!Res) {
10403 // TODO: try to check other subsets if possible.
10404 // Restore the original VL if attempt was not successful.
10405 copy(SavedVL, VL.begin());
10406 return std::nullopt;
10407 }
10408 // Restore unused scalars from mask, if some of the extractelements were not
10409 // selected for shuffle.
10410 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10411 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
10412 isa<UndefValue>(GatheredExtracts[I])) {
10413 std::swap(VL[I], GatheredExtracts[I]);
10414 continue;
10415 }
10416 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10417 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10418 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10419 is_contained(UndefVectorExtracts, I))
10420 continue;
10421 }
10422 return Res;
10423}
10424
10425/// Tries to find extractelement instructions with constant indices from a
10426/// fixed vector type and gather such instructions into a bunch, which is
10427/// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
10428/// attempt was successful, the matched scalars are replaced by poison values
10429/// in \p VL for future analysis.
10430SmallVector<std::optional<TTI::ShuffleKind>>
10431BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10432 SmallVectorImpl<int> &Mask,
10433 unsigned NumParts) const {
10434 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
10435 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10436 Mask.assign(VL.size(), PoisonMaskElem);
10437 unsigned SliceSize = VL.size() / NumParts;
10438 for (unsigned Part = 0; Part < NumParts; ++Part) {
10439 // Scan list of gathered scalars for extractelements that can be represented
10440 // as shuffles.
10441 MutableArrayRef<Value *> SubVL =
10442 MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
10443 SmallVector<int> SubMask;
10444 std::optional<TTI::ShuffleKind> Res =
10445 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10446 ShufflesRes[Part] = Res;
10447 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
10448 }
10449 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
10450 return Res.has_value();
10451 }))
10452 ShufflesRes.clear();
10453 return ShufflesRes;
10454}
10455
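// isGatherShuffledSingleRegisterEntry below checks whether a gather node can
// be materialized as a shuffle of one or two already-built tree entries: for
// each non-constant scalar it collects the tree entries that produce it,
// intersects those sets across the scalars, and verifies that the reused
// vectors dominate the point where the gather would be emitted.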
10456std::optional<TargetTransformInfo::ShuffleKind>
10457BoUpSLP::isGatherShuffledSingleRegisterEntry(
10458 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10459 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10460 Entries.clear();
10461 // TODO: currently checking only for Scalars in the tree entry, need to count
10462 // reused elements too for better cost estimation.
10463 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10464 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10465 const BasicBlock *TEInsertBlock = nullptr;
10466 // Main node of PHI entries keeps the correct order of operands/incoming
10467 // blocks.
10468 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10469 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10470 TEInsertPt = TEInsertBlock->getTerminator();
10471 } else {
10472 TEInsertBlock = TEInsertPt->getParent();
10473 }
10474 if (!DT->isReachableFromEntry(TEInsertBlock))
10475 return std::nullopt;
10476 auto *NodeUI = DT->getNode(TEInsertBlock);
10477 assert(NodeUI && "Should only process reachable instructions");
10478 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10479 auto CheckOrdering = [&](const Instruction *InsertPt) {
10480 // Argument InsertPt is an instruction where vector code for some other
10481 // tree entry (one that shares one or more scalars with TE) is going to be
10482 // generated. This lambda returns true if the insertion point of vector code
10483 // for the TE dominates that point (otherwise the dependency is the other
10484 // way around). The other node is not necessarily a gather node. Gather
10485 // nodes are not scheduled and their vector code is inserted before their
10486 // first user. If the user is a PHI, that point is at the end of a
10487 // predecessor block. Otherwise it is the last instruction among the scalars
10488 // of the user node. So, instead of checking the dependency between the
10489 // instructions themselves, we check the dependency between their insertion
10490 // points for vector code (since each scalar instruction ends up as a lane
10491 // of a vector instruction).
10492 const BasicBlock *InsertBlock = InsertPt->getParent();
10493 auto *NodeEUI = DT->getNode(InsertBlock);
10494 if (!NodeEUI)
10495 return false;
10496 assert((NodeUI == NodeEUI) ==
10497 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10498 "Different nodes should have different DFS numbers");
10499 // Check the order of the gather nodes users.
10500 if (TEInsertPt->getParent() != InsertBlock &&
10501 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
10502 return false;
10503 if (TEInsertPt->getParent() == InsertBlock &&
10504 TEInsertPt->comesBefore(InsertPt))
10505 return false;
10506 return true;
10507 };
10508 // Find all tree entries used by the gathered values. If no common entries
10509 // found - not a shuffle.
10510 // Here we build a set of tree nodes for each gathered value and try to
10511 // find the intersection between these sets. If we have at least one common
10512 // tree node for each gathered value - we have just a permutation of a
10513 // single vector. If we have 2 different sets, we're in a situation where we
10514 // have a permutation of 2 input vectors.
10515 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10516 DenseMap<Value *, int> UsedValuesEntry;
10517 for (Value *V : VL) {
10518 if (isConstant(V))
10519 continue;
10520 // Build a list of tree entries where V is used.
10521 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10522 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10523 if (TEPtr == TE)
10524 continue;
10525 assert(any_of(TEPtr->Scalars,
10526 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10527 "Must contain at least single gathered value.");
10528 assert(TEPtr->UserTreeIndices.size() == 1 &&
10529 "Expected only single user of a gather node.");
10530 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10531
10532 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10533 const Instruction *InsertPt =
10534 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
10535 : &getLastInstructionInBundle(UseEI.UserTE);
10536 if (TEInsertPt == InsertPt) {
10537 // If 2 gathers are operands of the same entry (regardless of whether
10538 // the user is a PHI or not), compare the operand indices and use the
10539 // earlier one as the base.
10540 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10541 continue;
10542 // If the user instruction is used for some reason in different
10543 // vectorized nodes - make it depend on index.
10544 if (TEUseEI.UserTE != UseEI.UserTE &&
10545 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10546 continue;
10547 }
10548
10549 // Check if the user node of the TE comes after user node of TEPtr,
10550 // otherwise TEPtr depends on TE.
10551 if ((TEInsertBlock != InsertPt->getParent() ||
10552 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10553 !CheckOrdering(InsertPt))
10554 continue;
10555 VToTEs.insert(TEPtr);
10556 }
10557 if (const TreeEntry *VTE = getTreeEntry(V)) {
10558 if (ForOrder) {
10559 if (VTE->State != TreeEntry::Vectorize) {
10560 auto It = MultiNodeScalars.find(V);
10561 if (It == MultiNodeScalars.end())
10562 continue;
10563 VTE = *It->getSecond().begin();
10564 // Iterate through all vectorized nodes.
10565 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
10566 return MTE->State == TreeEntry::Vectorize;
10567 });
10568 if (MIt == It->getSecond().end())
10569 continue;
10570 VTE = *MIt;
10571 }
10572 }
10573 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10574 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10575 continue;
10576 VToTEs.insert(VTE);
10577 }
10578 if (VToTEs.empty())
10579 continue;
10580 if (UsedTEs.empty()) {
10581 // The first iteration, just insert the list of nodes to vector.
10582 UsedTEs.push_back(VToTEs);
10583 UsedValuesEntry.try_emplace(V, 0);
10584 } else {
10585 // Need to check if there are any previously used tree nodes which use V.
10586 // If there are no such nodes, consider that we have another input
10587 // vector.
10588 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
10589 unsigned Idx = 0;
10590 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
10591 // Do we have a non-empty intersection of previously listed tree entries
10592 // and tree entries using current V?
10593 set_intersect(VToTEs, Set);
10594 if (!VToTEs.empty()) {
10595 // Yes, write the new subset and continue analysis for the next
10596 // scalar.
10597 Set.swap(VToTEs);
10598 break;
10599 }
10600 VToTEs = SavedVToTEs;
10601 ++Idx;
10602 }
10603 // No non-empty intersection found - need to add a second set of possible
10604 // source vectors.
10605 if (Idx == UsedTEs.size()) {
10606 // If the number of input vectors is greater than 2 - not a permutation,
10607 // fall back to the regular gather.
10608 // TODO: support multiple reshuffled nodes.
10609 if (UsedTEs.size() == 2)
10610 continue;
10611 UsedTEs.push_back(SavedVToTEs);
10612 Idx = UsedTEs.size() - 1;
10613 }
10614 UsedValuesEntry.try_emplace(V, Idx);
10615 }
10616 }
10617
10618 if (UsedTEs.empty()) {
10619 Entries.clear();
10620 return std::nullopt;
10621 }
10622
10623 unsigned VF = 0;
10624 if (UsedTEs.size() == 1) {
10625 // Keep the order to avoid non-determinism.
10626 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
10627 UsedTEs.front().end());
10628 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10629 return TE1->Idx < TE2->Idx;
10630 });
10631 // Try to find the perfect match in another gather node at first.
10632 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
10633 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
10634 });
10635 if (It != FirstEntries.end() &&
10636 ((*It)->getVectorFactor() == VL.size() ||
10637 ((*It)->getVectorFactor() == TE->Scalars.size() &&
10638 TE->ReuseShuffleIndices.size() == VL.size() &&
10639 (*It)->isSame(TE->Scalars)))) {
10640 Entries.push_back(*It);
10641 if ((*It)->getVectorFactor() == VL.size()) {
10642 std::iota(std::next(Mask.begin(), Part * VL.size()),
10643 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
10644 } else {
10645 SmallVector<int> CommonMask = TE->getCommonMask();
10646 copy(CommonMask, Mask.begin());
10647 }
10648 // Clear undef scalars.
10649 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10650 if (isa<PoisonValue>(VL[I]))
10651 Mask[I] = PoisonMaskElem;
10652 return TargetTransformInfo::SK_PermuteSingleSrc;
10653 }
10654 // No perfect match, just shuffle, so choose the first tree node from the
10655 // tree.
10656 Entries.push_back(FirstEntries.front());
10657 } else {
10658 // Try to find nodes with the same vector factor.
10659 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
10660 // Keep the order of tree nodes to avoid non-determinism.
10661 DenseMap<unsigned, const TreeEntry *> VFToTE;
10662 for (const TreeEntry *TE : UsedTEs.front()) {
10663 unsigned VF = TE->getVectorFactor();
10664 auto It = VFToTE.find(VF);
10665 if (It != VFToTE.end()) {
10666 if (It->second->Idx > TE->Idx)
10667 It->getSecond() = TE;
10668 continue;
10669 }
10670 VFToTE.try_emplace(VF, TE);
10671 }
10672 // Same, keep the order to avoid non-determinism.
10673 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
10674 UsedTEs.back().end());
10675 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10676 return TE1->Idx < TE2->Idx;
10677 });
10678 for (const TreeEntry *TE : SecondEntries) {
10679 auto It = VFToTE.find(TE->getVectorFactor());
10680 if (It != VFToTE.end()) {
10681 VF = It->first;
10682 Entries.push_back(It->second);
10683 Entries.push_back(TE);
10684 break;
10685 }
10686 }
10687 // No 2 source vectors with the same vector factor - just choose 2 with max
10688 // index.
10689 if (Entries.empty()) {
10690 Entries.push_back(*llvm::max_element(
10691 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
10692 return TE1->Idx < TE2->Idx;
10693 }));
10694 Entries.push_back(SecondEntries.front());
10695 VF = std::max(Entries.front()->getVectorFactor(),
10696 Entries.back()->getVectorFactor());
10697 }
10698 }
10699
10700 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
10701 // Checks if the 2 PHIs are compatible, i.e. have a high chance of being
10702 // vectorized together.
10703 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
10704 auto *PHI = cast<PHINode>(V);
10705 auto *PHI1 = cast<PHINode>(V1);
10706 // Check that all incoming values are compatible/from the same parent (if
10707 // they are instructions).
10708 // The incoming values are compatible if they are all constants, or
10709 // instructions with the same/alternate opcodes from the same basic block.
10710 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
10711 Value *In = PHI->getIncomingValue(I);
10712 Value *In1 = PHI1->getIncomingValue(I);
10713 if (isConstant(In) && isConstant(In1))
10714 continue;
10715 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
10716 return false;
10717 if (cast<Instruction>(In)->getParent() !=
10718 cast<Instruction>(In1)->getParent())
10719 return false;
10720 }
10721 return true;
10722 };
10723 // Check if the value can be ignored during analysis for shuffled gathers.
10724 // We suppose it is better to ignore instructions which do not form splats,
10725 // are not vectorized/not extractelements (these instructions will be handled
10726 // by extractelements processing) or may form a vector node in the future.
10727 auto MightBeIgnored = [=](Value *V) {
10728 auto *I = dyn_cast<Instruction>(V);
10729 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
10730 !isVectorLikeInstWithConstOps(I) &&
10731 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
10732 };
10733 // Check that the neighbor instruction may form a full vector node with the
10734 // current instruction V. It is possible if they have the same/alternate
10735 // opcode and the same parent basic block.
10736 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
10737 Value *V1 = VL[Idx];
10738 bool UsedInSameVTE = false;
10739 auto It = UsedValuesEntry.find(V1);
10740 if (It != UsedValuesEntry.end())
10741 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
10742 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
10743 getSameOpcode({V, V1}, *TLI).getOpcode() &&
10744 cast<Instruction>(V)->getParent() ==
10745 cast<Instruction>(V1)->getParent() &&
10746 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
10747 };
10748 // Build a shuffle mask for better cost estimation and vector emission.
10749 SmallBitVector UsedIdxs(Entries.size());
10750 SmallVector<std::pair<unsigned, int>> EntryLanes;
10751 for (int I = 0, E = VL.size(); I < E; ++I) {
10752 Value *V = VL[I];
10753 auto It = UsedValuesEntry.find(V);
10754 if (It == UsedValuesEntry.end())
10755 continue;
10756 // Do not try to shuffle scalars if they are constants, or instructions
10757 // that can be vectorized as a result of the subsequent buildvector
10758 // vectorization.
10759 if (isConstant(V) || (MightBeIgnored(V) &&
10760 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
10761 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
10762 continue;
10763 unsigned Idx = It->second;
10764 EntryLanes.emplace_back(Idx, I);
10765 UsedIdxs.set(Idx);
10766 }
10767 // Iterate through all shuffled scalars and select entries, which can be used
10768 // for final shuffle.
10769 SmallVector<const TreeEntry *> TempEntries;
10770 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
10771 if (!UsedIdxs.test(I))
10772 continue;
10773 // Fix the entry number for the given scalar. If it is the first entry, set
10774 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
10775 // These indices are used when calculating final shuffle mask as the vector
10776 // offset.
10777 for (std::pair<unsigned, int> &Pair : EntryLanes)
10778 if (Pair.first == I)
10779 Pair.first = TempEntries.size();
10780 TempEntries.push_back(Entries[I]);
10781 }
10782 Entries.swap(TempEntries);
10783 if (EntryLanes.size() == Entries.size() &&
10784 !VL.equals(ArrayRef(TE->Scalars)
10785 .slice(Part * VL.size(),
10786 std::min<int>(VL.size(), TE->Scalars.size())))) {
10787 // We may have only 1 or 2 entries here. If the number of scalars is equal
10788 // to the number of entries, no need to do the analysis, it is not very
10789 // profitable. Since VL is not the same as TE->Scalars, it means we already
10790 // have some shuffles before. Cut off the non-profitable case.
10791 Entries.clear();
10792 return std::nullopt;
10793 }
10794 // Build the final mask, check for the identity shuffle, if possible.
10795 bool IsIdentity = Entries.size() == 1;
10796 // Pair.first is the offset to the vector, while Pair.second is the index of
10797 // scalar in the list.
10798 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
10799 unsigned Idx = Part * VL.size() + Pair.second;
10800 Mask[Idx] =
10801 Pair.first * VF +
10802 (ForOrder ? std::distance(
10803 Entries[Pair.first]->Scalars.begin(),
10804 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
10805 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
10806 IsIdentity &= Mask[Idx] == Pair.second;
10807 }
10808 switch (Entries.size()) {
10809 case 1:
10810 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
10811 return TargetTransformInfo::SK_PermuteSingleSrc;
10812 break;
10813 case 2:
10814 if (EntryLanes.size() > 2 || VL.size() <= 2)
10815 return TargetTransformInfo::SK_PermuteTwoSrc;
10816 break;
10817 default:
10818 break;
10819 }
10820 Entries.clear();
10821 // Clear the corresponding mask elements.
10822 std::fill(std::next(Mask.begin(), Part * VL.size()),
10823 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
10824 return std::nullopt;
10825}
10826
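// Editorial summary of the next function, derived from its body: the gathered
// scalars of node TE are processed in register-sized slices (NumParts); each
// slice is handed to isGatherShuffledSingleRegisterEntry, which fills the
// corresponding part of Mask and records the source tree entries, so the
// caller can model the gather as one or two permutes per register instead of
// a sequence of scalar inserts.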
10827 SmallVector<std::optional<TTI::ShuffleKind>>
10828 BoUpSLP::isGatherShuffledEntry(
10829 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
10830 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
10831 bool ForOrder) {
10832 assert(NumParts > 0 && NumParts < VL.size() &&
10833 "Expected positive number of registers.");
10834 Entries.clear();
10835 // No need to check for the topmost gather node.
10836 if (TE == VectorizableTree.front().get())
10837 return {};
10838 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
10839 if (TE->isNonPowOf2Vec())
10840 return {};
10841 Mask.assign(VL.size(), PoisonMaskElem);
10842 assert(TE->UserTreeIndices.size() == 1 &&
10843 "Expected only single user of the gather node.");
10844 assert(VL.size() % NumParts == 0 &&
10845 "Number of scalars must be divisible by NumParts.");
10846 unsigned SliceSize = VL.size() / NumParts;
10847 SmallVector<std::optional<TTI::ShuffleKind>> Res;
10848 for (unsigned Part = 0; Part < NumParts; ++Part) {
10849 ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
10850 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
10851 std::optional<TTI::ShuffleKind> SubRes =
10852 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
10853 ForOrder);
10854 if (!SubRes)
10855 SubEntries.clear();
10856 Res.push_back(SubRes);
10857 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
10858 SubEntries.front()->getVectorFactor() == VL.size() &&
10859 (SubEntries.front()->isSame(TE->Scalars) ||
10860 SubEntries.front()->isSame(VL))) {
10861 SmallVector<const TreeEntry *> LocalSubEntries;
10862 LocalSubEntries.swap(SubEntries);
10863 Entries.clear();
10864 Res.clear();
10865 std::iota(Mask.begin(), Mask.end(), 0);
10866 // Clear undef scalars.
10867 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10868 if (isa<PoisonValue>(VL[I]))
10869 Mask[I] = PoisonMaskElem;
10870 Entries.emplace_back(1, LocalSubEntries.front());
10871 Res.assign(1, TTI::SK_PermuteSingleSrc);
10872 return Res;
10873 }
10874 }
10875 if (all_of(Res,
10876 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
10877 Entries.clear();
10878 return {};
10879 }
10880 return Res;
10881}
10882
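// Editorial summary of the next function, derived from its body: it models
// the cost of materializing the scalars in VL as a vector. Unique
// non-constant elements are charged as insertelement instructions (or as a
// single scalarization overhead when the source is poison), and if any
// non-constant element repeats, the cost of an extra shuffle over the
// deduplicated vector is added.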
10883InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
10884 bool ForPoisonSrc) const {
10885 // Find the type of the operands in VL.
10886 Type *ScalarTy = VL[0]->getType();
10887 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
10888 ScalarTy = SI->getValueOperand()->getType();
10889 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
10890 bool DuplicateNonConst = false;
10891 // Find the cost of inserting/extracting values from the vector.
10892 // Check if the same elements are inserted several times and count them as
10893 // shuffle candidates.
10894 APInt ShuffledElements = APInt::getZero(VL.size());
10895 DenseMap<Value *, unsigned> UniqueElements;
10896 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10897 InstructionCost Cost;
10898 auto EstimateInsertCost = [&](unsigned I, Value *V) {
10899 if (!ForPoisonSrc)
10900 Cost +=
10901 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
10902 I, Constant::getNullValue(VecTy), V);
10903 };
10904 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10905 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
10906 Value *V = VL[I];
10907 // No need to shuffle duplicates for constants.
10908 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
10909 ShuffledElements.setBit(I);
10910 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
10911 continue;
10912 }
10913
10914 auto Res = UniqueElements.try_emplace(V, I);
10915 if (Res.second) {
10916 EstimateInsertCost(I, V);
10917 ShuffleMask[I] = I;
10918 continue;
10919 }
10920
10921 DuplicateNonConst = true;
10922 ShuffledElements.setBit(I);
10923 ShuffleMask[I] = Res.first->second;
10924 }
10925 if (ForPoisonSrc)
10926 Cost =
10927 TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
10928 /*Extract*/ false, CostKind);
10929 if (DuplicateNonConst)
10930 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
10931 VecTy, ShuffleMask);
10932 return Cost;
10933}
10934
10935// Perform operand reordering on the instructions in VL and return the reordered
10936// operands in Left and Right.
10937void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
10938 SmallVectorImpl<Value *> &Left,
10939 SmallVectorImpl<Value *> &Right,
10940 const BoUpSLP &R) {
10941 if (VL.empty())
10942 return;
10943 VLOperands Ops(VL, R);
10944 // Reorder the operands in place.
10945 Ops.reorder();
10946 Left = Ops.getVL(0);
10947 Right = Ops.getVL(1);
10948}
10949
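// Editorial summary of the next function, derived from its body: it returns
// (and caches) the last instruction of bundle E, used as the reference point
// for placing the vector code. For scheduled bundles the answer comes from
// the block's ScheduleData chain; otherwise the last (or, for entries that do
// not need scheduling, sometimes the first) scalar in program order is
// located via dominator-tree DFS numbers across blocks.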
10950Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
10951 auto &Res = EntryToLastInstruction.FindAndConstruct(E);
10952 if (Res.second)
10953 return *Res.second;
10954 // Get the basic block this bundle is in. All instructions in the bundle
10955 // should be in this block (except for extractelement-like instructions with
10956 // constant indices).
10957 auto *Front = E->getMainOp();
10958 auto *BB = Front->getParent();
10959 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
10960 if (E->getOpcode() == Instruction::GetElementPtr &&
10961 !isa<GetElementPtrInst>(V))
10962 return true;
10963 auto *I = cast<Instruction>(V);
10964 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
10965 isVectorLikeInstWithConstOps(I);
10966 }));
10967
10968 auto FindLastInst = [&]() {
10969 Instruction *LastInst = Front;
10970 for (Value *V : E->Scalars) {
10971 auto *I = dyn_cast<Instruction>(V);
10972 if (!I)
10973 continue;
10974 if (LastInst->getParent() == I->getParent()) {
10975 if (LastInst->comesBefore(I))
10976 LastInst = I;
10977 continue;
10978 }
10979 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10980 !isa<GetElementPtrInst>(I)) ||
10981 (isVectorLikeInstWithConstOps(LastInst) &&
10983 "Expected vector-like or non-GEP in GEP node insts only.");
10984 if (!DT->isReachableFromEntry(LastInst->getParent())) {
10985 LastInst = I;
10986 continue;
10987 }
10988 if (!DT->isReachableFromEntry(I->getParent()))
10989 continue;
10990 auto *NodeA = DT->getNode(LastInst->getParent());
10991 auto *NodeB = DT->getNode(I->getParent());
10992 assert(NodeA && "Should only process reachable instructions");
10993 assert(NodeB && "Should only process reachable instructions");
10994 assert((NodeA == NodeB) ==
10995 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10996 "Different nodes should have different DFS numbers");
10997 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
10998 LastInst = I;
10999 }
11000 BB = LastInst->getParent();
11001 return LastInst;
11002 };
11003
11004 auto FindFirstInst = [&]() {
11005 Instruction *FirstInst = Front;
11006 for (Value *V : E->Scalars) {
11007 auto *I = dyn_cast<Instruction>(V);
11008 if (!I)
11009 continue;
11010 if (FirstInst->getParent() == I->getParent()) {
11011 if (I->comesBefore(FirstInst))
11012 FirstInst = I;
11013 continue;
11014 }
11015 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11016 !isa<GetElementPtrInst>(I)) ||
11017 (isVectorLikeInstWithConstOps(FirstInst) &&
11019 "Expected vector-like or non-GEP in GEP node insts only.");
11020 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
11021 FirstInst = I;
11022 continue;
11023 }
11024 if (!DT->isReachableFromEntry(I->getParent()))
11025 continue;
11026 auto *NodeA = DT->getNode(FirstInst->getParent());
11027 auto *NodeB = DT->getNode(I->getParent());
11028 assert(NodeA && "Should only process reachable instructions");
11029 assert(NodeB && "Should only process reachable instructions");
11030 assert((NodeA == NodeB) ==
11031 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11032 "Different nodes should have different DFS numbers");
11033 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11034 FirstInst = I;
11035 }
11036 return FirstInst;
11037 };
11038
11039 // Set the insert point to the beginning of the basic block if the entry
11040 // should not be scheduled.
11041 if (doesNotNeedToSchedule(E->Scalars) ||
11042 (E->State != TreeEntry::NeedToGather &&
11043 all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
11044 if ((E->getOpcode() == Instruction::GetElementPtr &&
11045 any_of(E->Scalars,
11046 [](Value *V) {
11047 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11048 })) ||
11049 all_of(E->Scalars,
11050 [](Value *V) {
11051 return !isVectorLikeInstWithConstOps(V) &&
11052 isUsedOutsideBlock(V);
11053 }) ||
11054 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
11055 all_of(E->Scalars, [](Value *V) {
11056 return isa<ExtractElementInst, UndefValue>(V) ||
11057 areAllOperandsNonInsts(V);
11058 })))
11059 Res.second = FindLastInst();
11060 else
11061 Res.second = FindFirstInst();
11062 return *Res.second;
11063 }
11064
11065 // Find the last instruction. The common case should be that BB has been
11066 // scheduled, and the last instruction is VL.back(). So we start with
11067 // VL.back() and iterate over schedule data until we reach the end of the
11068 // bundle. The end of the bundle is marked by null ScheduleData.
11069 if (BlocksSchedules.count(BB)) {
11070 Value *V = E->isOneOf(E->Scalars.back());
11071 if (doesNotNeedToBeScheduled(V))
11072 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11073 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11074 if (Bundle && Bundle->isPartOfBundle())
11075 for (; Bundle; Bundle = Bundle->NextInBundle)
11076 if (Bundle->OpValue == Bundle->Inst)
11077 Res.second = Bundle->Inst;
11078 }
11079
11080 // LastInst can still be null at this point if there's either not an entry
11081 // for BB in BlocksSchedules or there's no ScheduleData available for
11082 // VL.back(). This can be the case if buildTree_rec aborts for various
11083 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11084 // size is reached, etc.). ScheduleData is initialized in the scheduling
11085 // "dry-run".
11086 //
11087 // If this happens, we can still find the last instruction by brute force. We
11088 // iterate forwards from Front (inclusive) until we either see all
11089 // instructions in the bundle or reach the end of the block. If Front is the
11090 // last instruction in program order, LastInst will be set to Front, and we
11091 // will visit all the remaining instructions in the block.
11092 //
11093 // One of the reasons we exit early from buildTree_rec is to place an upper
11094 // bound on compile-time. Thus, taking an additional compile-time hit here is
11095 // not ideal. However, this should be exceedingly rare since it requires that
11096 // we both exit early from buildTree_rec and that the bundle be out-of-order
11097 // (causing us to iterate all the way to the end of the block).
11098 if (!Res.second)
11099 Res.second = FindLastInst();
11100 assert(Res.second && "Failed to find last instruction in bundle");
11101 return *Res.second;
11102}
11103
11104void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11105 auto *Front = E->getMainOp();
11106 Instruction *LastInst = &getLastInstructionInBundle(E);
11107 assert(LastInst && "Failed to find last instruction in bundle");
11108 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11109 // If the instruction is PHI, set the insert point after all the PHIs.
11110 bool IsPHI = isa<PHINode>(LastInst);
11111 if (IsPHI)
11112 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11113 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
11114 doesNotNeedToSchedule(E->Scalars))) {
11115 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11116 } else {
11117 // Set the insertion point after the last instruction in the bundle. Set the
11118 // debug location to Front.
11119 Builder.SetInsertPoint(
11120 LastInst->getParent(),
11121 LastInst->getNextNonDebugInstruction()->getIterator());
11122 }
11123 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11124}
11125
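// Editorial summary of the next function, derived from its body: it
// materializes the scalars in VL as a vector by emitting a chain of
// insertelement instructions into Root (or a fresh poison vector). Constants
// are inserted first; scalars that live in the current loop or are already
// part of the tree are postponed to the end of the chain so that
// loop-invariant parts of the sequence can still be hoisted.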
11126Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
11127 // List of instructions/lanes from current block and/or the blocks which are
11128 // part of the current loop. These instructions will be inserted at the end to
11129 // make it possible to optimize loops and hoist invariant instructions out of
11130 // the loop body with better chances for success.
11131 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
11132 SmallSet<int, 4> PostponedIndices;
11133 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11134 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11135 SmallPtrSet<BasicBlock *, 4> Visited;
11136 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11137 InsertBB = InsertBB->getSinglePredecessor();
11138 return InsertBB && InsertBB == InstBB;
11139 };
11140 for (int I = 0, E = VL.size(); I < E; ++I) {
11141 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11142 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11143 getTreeEntry(Inst) ||
11144 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11145 PostponedIndices.insert(I).second)
11146 PostponedInsts.emplace_back(Inst, I);
11147 }
11148
11149 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11150 Type *Ty) {
11151 Value *Scalar = V;
11152 if (cast<VectorType>(Vec->getType())->getElementType() != Ty) {
11153 assert(V->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11154 "Expected integer types only.");
11155 Vec = Builder.CreateIntCast(
11156 Vec,
11157 VectorType::get(Ty,
11158 cast<VectorType>(Vec->getType())->getElementCount()),
11159 !isKnownNonNegative(Vec, SimplifyQuery(*DL)));
11160 }
11161
11162 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11163 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11164 if (!InsElt)
11165 return Vec;
11166 GatherShuffleExtractSeq.insert(InsElt);
11167 CSEBlocks.insert(InsElt->getParent());
11168 // Add to our 'need-to-extract' list.
11169 if (isa<Instruction>(V)) {
11170 if (TreeEntry *Entry = getTreeEntry(V)) {
11171 // Find which lane we need to extract.
11172 User *UserOp = nullptr;
11173 if (Scalar != V) {
11174 if (auto *SI = dyn_cast<Instruction>(Scalar))
11175 UserOp = SI;
11176 } else {
11177 UserOp = InsElt;
11178 }
11179 if (UserOp) {
11180 unsigned FoundLane = Entry->findLaneForValue(V);
11181 ExternalUses.emplace_back(V, UserOp, FoundLane);
11182 }
11183 }
11184 }
11185 return Vec;
11186 };
11187 Value *Val0 =
11188 isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
11189 Type *ScalarTy = Val0->getType();
11190 FixedVectorType *VecTy = FixedVectorType::get(ScalarTy, VL.size());
11191 Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11192 SmallVector<int> NonConsts;
11193 // Insert constant values at first.
11194 for (int I = 0, E = VL.size(); I < E; ++I) {
11195 if (PostponedIndices.contains(I))
11196 continue;
11197 if (!isConstant(VL[I])) {
11198 NonConsts.push_back(I);
11199 continue;
11200 }
11201 if (Root) {
11202 if (!isa<UndefValue>(VL[I])) {
11203 NonConsts.push_back(I);
11204 continue;
11205 }
11206 if (isa<PoisonValue>(VL[I]))
11207 continue;
11208 if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11209 if (SV->getMaskValue(I) == PoisonMaskElem)
11210 continue;
11211 }
11212 }
11213 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11214 }
11215 // Insert non-constant values.
11216 for (int I : NonConsts)
11217 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11218 // Append instructions, which are/may be part of the loop, in the end to make
11219 // it possible to hoist non-loop-based instructions.
11220 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11221 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11222
11223 return Vec;
11224}
11225
11226/// Merges shuffle masks and emits final shuffle instruction, if required. It
11227/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
11228/// when the actual shuffle instruction is generated only if this is actually
11229/// required. Otherwise, the shuffle instruction emission is delayed till the
11230/// end of the process, to reduce the number of emitted instructions and further
11231/// analysis/transformations.
11232/// The class also will look through the previously emitted shuffle instructions
11233/// and properly mark indices in mask as undef.
11234/// For example, given the code
11235/// \code
11236/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11237/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11238/// \endcode
11239 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
11240/// look through %s1 and %s2 and emit
11241/// \code
11242/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11243/// \endcode
11244/// instead.
11245/// If 2 operands are of different size, the smallest one will be resized and
11246/// the mask recalculated properly.
11247/// For example, given the code
11248/// \code
11249/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11250/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11251/// \endcode
11252 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
11253/// look through %s1 and %s2 and emit
11254/// \code
11255/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11256/// \endcode
11257/// instead.
11258class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11259 bool IsFinalized = false;
11260 /// Combined mask for all applied operands and masks. It is built during
11261 /// analysis and actual emission of shuffle vector instructions.
11262 SmallVector<int> CommonMask;
11263 /// List of operands for the shuffle vector instruction. It holds at most 2
11264 /// operands; if a 3rd one is going to be added, the first 2 are combined into
11265 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
11266 /// resulting shuffle and the second operand is set to be the newly added
11267 /// operand. The \p CommonMask is transformed in the proper way after that.
11268 SmallVector<Value *, 2> InVectors;
11269 IRBuilderBase &Builder;
11270 BoUpSLP &R;
11271
11272 class ShuffleIRBuilder {
11273 IRBuilderBase &Builder;
11274 /// Holds all of the instructions that we gathered.
11275 SetVector<Instruction *> &GatherShuffleExtractSeq;
11276 /// A list of blocks that we are going to CSE.
11277 DenseSet<BasicBlock *> &CSEBlocks;
11278 /// Data layout.
11279 const DataLayout &DL;
11280
11281 public:
11282 ShuffleIRBuilder(IRBuilderBase &Builder,
11283 SetVector<Instruction *> &GatherShuffleExtractSeq,
11284 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11285 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11286 CSEBlocks(CSEBlocks), DL(DL) {}
11287 ~ShuffleIRBuilder() = default;
11288 /// Creates shufflevector for the 2 operands with the given mask.
11289 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11290 if (V1->getType() != V2->getType()) {
11291 assert(V1->getType()->isIntOrIntVectorTy() &&
11292 V2->getType()->isIntOrIntVectorTy() &&
11293 "Expected integer vector types only.");
11294 if (V1->getType() != V2->getType()) {
11295 if (cast<VectorType>(V2->getType())
11296 ->getElementType()
11297 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
11298 ->getElementType()
11299 ->getIntegerBitWidth())
11300 V2 = Builder.CreateIntCast(
11301 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
11302 else
11303 V1 = Builder.CreateIntCast(
11304 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
11305 }
11306 }
11307 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11308 if (auto *I = dyn_cast<Instruction>(Vec)) {
11309 GatherShuffleExtractSeq.insert(I);
11310 CSEBlocks.insert(I->getParent());
11311 }
11312 return Vec;
11313 }
11314 /// Creates permutation of the single vector operand with the given mask, if
11315 /// it is not identity mask.
11316 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11317 if (Mask.empty())
11318 return V1;
11319 unsigned VF = Mask.size();
11320 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
11321 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
11322 return V1;
11323 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
11324 if (auto *I = dyn_cast<Instruction>(Vec)) {
11325 GatherShuffleExtractSeq.insert(I);
11326 CSEBlocks.insert(I->getParent());
11327 }
11328 return Vec;
11329 }
11330 Value *createIdentity(Value *V) { return V; }
11331 Value *createPoison(Type *Ty, unsigned VF) {
11332 return PoisonValue::get(FixedVectorType::get(Ty, VF));
11333 }
11334 /// Resizes 2 input vectors to match their sizes, if they are not equal
11335 /// yet. The smallest vector is resized to the size of the larger vector.
11336 void resizeToMatch(Value *&V1, Value *&V2) {
11337 if (V1->getType() == V2->getType())
11338 return;
11339 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
11340 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11341 int VF = std::max(V1VF, V2VF);
11342 int MinVF = std::min(V1VF, V2VF);
11343 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11344 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
11345 0);
11346 Value *&Op = MinVF == V1VF ? V1 : V2;
11347 Op = Builder.CreateShuffleVector(Op, IdentityMask);
11348 if (auto *I = dyn_cast<Instruction>(Op)) {
11349 GatherShuffleExtractSeq.insert(I);
11350 CSEBlocks.insert(I->getParent());
11351 }
11352 if (MinVF == V1VF)
11353 V1 = Op;
11354 else
11355 V2 = Op;
11356 }
11357 };
11358
11359 /// Smart shuffle instruction emission, walks through shuffles trees and
11360 /// tries to find the best matching vector for the actual shuffle
11361 /// instruction.
11362 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11363 assert(V1 && "Expected at least one vector value.");
11364 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11365 R.CSEBlocks, *R.DL);
11366 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11367 ShuffleBuilder);
11368 }
11369
11370 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
11371 /// shuffle emission.
11372 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11373 ArrayRef<int> Mask) {
11374 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11375 if (Mask[Idx] != PoisonMaskElem)
11376 CommonMask[Idx] = Idx;
11377 }
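  // For example, if CommonMask is {2, 0, P, 1} (P = poison) and the shuffle
  // just emitted used Mask = {0, P, 2, 3}, the lanes written by that shuffle
  // now map to themselves, so CommonMask becomes {0, 0, 2, 3} and lane 1
  // keeps its previous mapping.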
11378
11379public:
11380 ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
11381 : Builder(Builder), R(R) {}
11382
11383 /// Adjusts extractelements after reusing them.
11384 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11385 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11386 unsigned NumParts, bool &UseVecBaseAsInput) {
11387 UseVecBaseAsInput = false;
11388 SmallPtrSet<Value *, 4> UniqueBases;
11389 Value *VecBase = nullptr;
11390 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11391 int Idx = Mask[I];
11392 if (Idx == PoisonMaskElem)
11393 continue;
11394 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
11395 VecBase = EI->getVectorOperand();
11396 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
11397 VecBase = TE->VectorizedValue;
11398 assert(VecBase && "Expected vectorized value.");
11399 UniqueBases.insert(VecBase);
11400 // If the only use is vectorized - we can delete the extractelement
11401 // itself.
11402 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
11403 any_of(EI->users(), [&](User *U) {
11404 const TreeEntry *UTE = R.getTreeEntry(U);
11405 return !UTE || R.MultiNodeScalars.contains(U) ||
11406 (isa<GetElementPtrInst>(U) &&
11407 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
11408 count_if(R.VectorizableTree,
11409 [&](const std::unique_ptr<TreeEntry> &TE) {
11410 return any_of(TE->UserTreeIndices,
11411 [&](const EdgeInfo &Edge) {
11412 return Edge.UserTE == UTE;
11413 }) &&
11414 is_contained(TE->Scalars, EI);
11415 }) != 1;
11416 }))
11417 continue;
11418 R.eraseInstruction(EI);
11419 }
11420 if (NumParts == 1 || UniqueBases.size() == 1)
11421 return VecBase;
11422 UseVecBaseAsInput = true;
11423 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11424 for (auto [I, Idx] : enumerate(Mask))
11425 if (Idx != PoisonMaskElem)
11426 Idx = I;
11427 };
11428 // Perform a multi-register vector shuffle, joining the parts into a single
11429 // virtual long vector.
11430 // Need to shuffle each part independently and then insert all these parts
11431 // into a long virtual vector register, forming the original vector.
11432 Value *Vec = nullptr;
11433 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11434 unsigned SliceSize = E->Scalars.size() / NumParts;
11435 for (unsigned Part = 0; Part < NumParts; ++Part) {
11437 ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
11438 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
11439 constexpr int MaxBases = 2;
11440 SmallVector<Value *, MaxBases> Bases(MaxBases);
11441#ifndef NDEBUG
11442 int PrevSize = 0;
11443#endif // NDEBUG
11444 for (const auto [I, V]: enumerate(VL)) {
11445 if (SubMask[I] == PoisonMaskElem)
11446 continue;
11447 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11448 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11449 VecOp = TE->VectorizedValue;
11450 assert(VecOp && "Expected vectorized value.");
11451 const int Size =
11452 cast<FixedVectorType>(VecOp->getType())->getNumElements();
11453#ifndef NDEBUG
11454 assert((PrevSize == Size || PrevSize == 0) &&
11455 "Expected vectors of the same size.");
11456 PrevSize = Size;
11457#endif // NDEBUG
11458 Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
11459 }
11460 if (!Bases.front())
11461 continue;
11462 Value *SubVec;
11463 if (Bases.back()) {
11464 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11465 TransformToIdentity(SubMask);
11466 } else {
11467 SubVec = Bases.front();
11468 }
11469 if (!Vec) {
11470 Vec = SubVec;
11471 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11472 [&](unsigned P) {
11473 ArrayRef<int> SubMask =
11474 Mask.slice(P * SliceSize, SliceSize);
11475 return all_of(SubMask, [](int Idx) {
11476 return Idx == PoisonMaskElem;
11477 });
11478 })) &&
11479 "Expected first part or all previous parts masked.");
11480 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11481 } else {
11482 unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11483 if (Vec->getType() != SubVec->getType()) {
11484 unsigned SubVecVF =
11485 cast<FixedVectorType>(SubVec->getType())->getNumElements();
11486 VF = std::max(VF, SubVecVF);
11487 }
11488 // Adjust SubMask.
11489 for (int &Idx : SubMask)
11490 if (Idx != PoisonMaskElem)
11491 Idx += VF;
11492 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11493 Vec = createShuffle(Vec, SubVec, VecMask);
11494 TransformToIdentity(VecMask);
11495 }
11496 }
11497 copy(VecMask, Mask.begin());
11498 return Vec;
11499 }
11500 /// Checks if the specified entry \p E needs to be delayed because of its
11501 /// dependency nodes.
11502 std::optional<Value *>
11503 needToDelay(const TreeEntry *E,
11504 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
11505 // No need to delay emission if all deps are ready.
11506 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
11507 return all_of(
11508 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
11509 }))
11510 return std::nullopt;
11511 // Postpone gather emission, will be emitted after the end of the
11512 // process to keep correct order.
11513 auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
11514 E->getVectorFactor());
11515 return Builder.CreateAlignedLoad(
11516 VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
11517 MaybeAlign());
11518 }
11519 /// Adds 2 input vectors (in form of tree entries) and the mask for their
11520 /// shuffling.
11521 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
11522 add(E1.VectorizedValue, E2.VectorizedValue, Mask);
11523 }
11524 /// Adds single input vector (in form of tree entry) and the mask for its
11525 /// shuffling.
11526 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
11527 add(E1.VectorizedValue, Mask);
11528 }
11529 /// Adds 2 input vectors and the mask for their shuffling.
11530 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
11531 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
11532 if (InVectors.empty()) {
11533 InVectors.push_back(V1);
11534 InVectors.push_back(V2);
11535 CommonMask.assign(Mask.begin(), Mask.end());
11536 return;
11537 }
11538 Value *Vec = InVectors.front();
11539 if (InVectors.size() == 2) {
11540 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11541 transformMaskAfterShuffle(CommonMask, CommonMask);
11542 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
11543 Mask.size()) {
11544 Vec = createShuffle(Vec, nullptr, CommonMask);
11545 transformMaskAfterShuffle(CommonMask, CommonMask);
11546 }
11547 V1 = createShuffle(V1, V2, Mask);
11548 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11549 if (Mask[Idx] != PoisonMaskElem)
11550 CommonMask[Idx] = Idx + Sz;
11551 InVectors.front() = Vec;
11552 if (InVectors.size() == 2)
11553 InVectors.back() = V1;
11554 else
11555 InVectors.push_back(V1);
11556 }
11557 /// Adds another input vector and the mask for the shuffling.
11558 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
11559 if (InVectors.empty()) {
11560 if (!isa<FixedVectorType>(V1->getType())) {
11561 V1 = createShuffle(V1, nullptr, CommonMask);
11562 CommonMask.assign(Mask.size(), PoisonMaskElem);
11563 transformMaskAfterShuffle(CommonMask, Mask);
11564 }
11565 InVectors.push_back(V1);
11566 CommonMask.assign(Mask.begin(), Mask.end());
11567 return;
11568 }
11569 const auto *It = find(InVectors, V1);
11570 if (It == InVectors.end()) {
11571 if (InVectors.size() == 2 ||
11572 InVectors.front()->getType() != V1->getType() ||
11573 !isa<FixedVectorType>(V1->getType())) {
11574 Value *V = InVectors.front();
11575 if (InVectors.size() == 2) {
11576 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11577 transformMaskAfterShuffle(CommonMask, CommonMask);
11578 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11579 CommonMask.size()) {
11580 V = createShuffle(InVectors.front(), nullptr, CommonMask);
11581 transformMaskAfterShuffle(CommonMask, CommonMask);
11582 }
11583 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11584 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
11585 CommonMask[Idx] =
11586 V->getType() != V1->getType()
11587 ? Idx + Sz
11588 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
11589 ->getNumElements();
11590 if (V->getType() != V1->getType())
11591 V1 = createShuffle(V1, nullptr, Mask);
11592 InVectors.front() = V;
11593 if (InVectors.size() == 2)
11594 InVectors.back() = V1;
11595 else
11596 InVectors.push_back(V1);
11597 return;
11598 }
11599 // Check if second vector is required if the used elements are already
11600 // used from the first one.
11601 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11602 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
11603 InVectors.push_back(V1);
11604 break;
11605 }
11606 }
11607 int VF = CommonMask.size();
11608 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
11609 VF = FTy->getNumElements();
11610 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11611 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
11612 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
11613 }
11614 /// Adds another input vector and the order for its shuffling.
11615 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
11616 SmallVector<int> NewMask;
11617 inversePermutation(Order, NewMask);
11618 add(V1, NewMask);
11619 }
11620 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
11621 Value *Root = nullptr) {
11622 return R.gather(VL, Root);
11623 }
11624 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
11625 /// Finalize emission of the shuffles.
11627 /// \param Action the action (if any) to be performed before the final
11628 /// application of the \p ExtMask mask.
11628 Value *
11629 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
11630 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
11631 IsFinalized = true;
11632 if (Action) {
11633 Value *Vec = InVectors.front();
11634 if (InVectors.size() == 2) {
11635 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11636 InVectors.pop_back();
11637 } else {
11638 Vec = createShuffle(Vec, nullptr, CommonMask);
11639 }
11640 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11641 if (CommonMask[Idx] != PoisonMaskElem)
11642 CommonMask[Idx] = Idx;
11643 assert(VF > 0 &&
11644 "Expected vector length for the final value before action.");
11645 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11646 if (VecVF < VF) {
11647 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
11648 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11649 Vec = createShuffle(Vec, nullptr, ResizeMask);
11650 }
11651 Action(Vec, CommonMask);
11652 InVectors.front() = Vec;
11653 }
11654 if (!ExtMask.empty()) {
11655 if (CommonMask.empty()) {
11656 CommonMask.assign(ExtMask.begin(), ExtMask.end());
11657 } else {
11658 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11659 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11660 if (ExtMask[I] == PoisonMaskElem)
11661 continue;
11662 NewMask[I] = CommonMask[ExtMask[I]];
11663 }
11664 CommonMask.swap(NewMask);
11665 }
11666 }
11667 if (CommonMask.empty()) {
11668 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11669 return InVectors.front();
11670 }
11671 if (InVectors.size() == 2)
11672 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11673 return createShuffle(InVectors.front(), nullptr, CommonMask);
11674 }
11675
11676 ~ShuffleInstructionBuilder() {
11677 assert((IsFinalized || CommonMask.empty()) &&
11678 "Shuffle construction must be finalized.");
11679 }
11680};
11681
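// Editorial summary of the next function, derived from its body: it produces
// the vector value for operand NodeIdx of tree entry E. If the operand
// scalars match an already-built tree entry (including multi-node scalars),
// that entry's vector is reused and reshuffled to the requested vectorization
// factor; otherwise the corresponding operand gather node is vectorized.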
11682Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
11683 bool PostponedPHIs) {
11684 ValueList &VL = E->getOperand(NodeIdx);
11685 const unsigned VF = VL.size();
11686 InstructionsState S = getSameOpcode(VL, *TLI);
11687 // Special processing for GEPs bundle, which may include non-gep values.
11688 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11689 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11690 if (It != VL.end())
11691 S = getSameOpcode(*It, *TLI);
11692 }
11693 if (S.getOpcode()) {
11694 auto CheckSameVE = [&](const TreeEntry *VE) {
11695 return VE->isSame(VL) &&
11696 (any_of(VE->UserTreeIndices,
11697 [E, NodeIdx](const EdgeInfo &EI) {
11698 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11699 }) ||
11700 any_of(VectorizableTree,
11701 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
11702 return TE->isOperandGatherNode({E, NodeIdx}) &&
11703 VE->isSame(TE->Scalars);
11704 }));
11705 };
11706 TreeEntry *VE = getTreeEntry(S.OpValue);
11707 bool IsSameVE = VE && CheckSameVE(VE);
11708 if (!IsSameVE) {
11709 auto It = MultiNodeScalars.find(S.OpValue);
11710 if (It != MultiNodeScalars.end()) {
11711 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
11712 return TE != VE && CheckSameVE(TE);
11713 });
11714 if (I != It->getSecond().end()) {
11715 VE = *I;
11716 IsSameVE = true;
11717 }
11718 }
11719 }
11720 if (IsSameVE) {
11721 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
11722 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
11723 ShuffleBuilder.add(V, Mask);
11724 return ShuffleBuilder.finalize(std::nullopt);
11725 };
11726 Value *V = vectorizeTree(VE, PostponedPHIs);
11727 if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
11728 if (!VE->ReuseShuffleIndices.empty()) {
11729 // Reshuffle to get only unique values.
11730 // If some of the scalars are duplicated in the vectorization
11731 // tree entry, we do not vectorize them but instead generate a
11732 // mask for the reuses. But if there are several users of the
11733 // same entry, they may have different vectorization factors.
11734 // This is especially important for PHI nodes. In this case, we
11735 // need to adapt the resulting instruction for the user
11736 // vectorization factor and have to reshuffle it again to take
11737 // only unique elements of the vector. Without this code the
11738 // function incorrectly returns reduced vector instruction with
11739 // the same elements, not with the unique ones.
11740
11741 // block:
11742 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
11743 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
11744 // ... (use %2)
11745 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
11746 // br %block
11747 SmallVector<int> Mask(VF, PoisonMaskElem);
11748 for (auto [I, V] : enumerate(VL)) {
11749 if (isa<PoisonValue>(V))
11750 continue;
11751 Mask[I] = VE->findLaneForValue(V);
11752 }
11753 V = FinalShuffle(V, Mask);
11754 } else {
11755 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
11756 "Expected vectorization factor less "
11757 "than original vector size.");
11758 SmallVector<int> UniformMask(VF, 0);
11759 std::iota(UniformMask.begin(), UniformMask.end(), 0);
11760 V = FinalShuffle(V, UniformMask);
11761 }
11762 }
11763 // Need to update the operand gather node, if the operand is actually not a
11764 // vectorized node but a buildvector/gather node which matches one of
11765 // the vectorized nodes.
11766 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
11767 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11768 }) == VE->UserTreeIndices.end()) {
11769 auto *It = find_if(
11770 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11771 return TE->State == TreeEntry::NeedToGather &&
11772 TE->UserTreeIndices.front().UserTE == E &&
11773 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
11774 });
11775 assert(It != VectorizableTree.end() && "Expected gather node operand.");
11776 (*It)->VectorizedValue = V;
11777 }
11778 return V;
11779 }
11780 }
11781
11782 // Find the corresponding gather entry and vectorize it.
11783 // This allows us to be more accurate with tree/graph transformations and
11784 // checks the correctness of the transformations in many cases.
11785 auto *I = find_if(VectorizableTree,
11786 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
11787 return TE->isOperandGatherNode({E, NodeIdx});
11788 });
11789 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
11790 assert(I->get()->UserTreeIndices.size() == 1 &&
11791 "Expected only single user for the gather node.");
11792 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
11793 return vectorizeTree(I->get(), PostponedPHIs);
11794}
11795
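// Editorial summary of the next function, derived from its body: a common
// driver for building a gather/buildvector node E, parameterized by the
// shuffle builder type BVTy. It tries to reuse extractelement source vectors
// and previously vectorized tree entries, falls back to packing the remaining
// scalars (with splat/undef handling), and may postpone emission until the
// entries it depends on have been vectorized.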
11796template <typename BVTy, typename ResTy, typename... Args>
11797ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
11798 assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
11799 unsigned VF = E->getVectorFactor();
11800
11801 bool NeedFreeze = false;
11802 SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
11803 E->ReuseShuffleIndices.end());
11804 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
11805 // Build a mask out of the reorder indices and reorder scalars per this
11806 // mask.
11807 SmallVector<int> ReorderMask;
11808 inversePermutation(E->ReorderIndices, ReorderMask);
11809 if (!ReorderMask.empty())
11810 reorderScalars(GatheredScalars, ReorderMask);
11811 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
11812 unsigned I, unsigned SliceSize) {
11813 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
11814 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
11815 }))
11816 return false;
11817 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
11818 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
11819 if (UserTE->getNumOperands() != 2)
11820 return false;
11821 auto *It =
11822 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
11823 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
11824 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
11825 }) != TE->UserTreeIndices.end();
11826 });
11827 if (It == VectorizableTree.end())
11828 return false;
11829 int Idx;
11830 if ((Mask.size() < InputVF &&
11831 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
11832 Idx == 0) ||
11833 (Mask.size() == InputVF &&
11834 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
11835 std::iota(std::next(Mask.begin(), I * SliceSize),
11836 std::next(Mask.begin(), (I + 1) * SliceSize), 0);
11837 } else {
11838 unsigned IVal =
11839 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
11840 std::fill(std::next(Mask.begin(), I * SliceSize),
11841 std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
11842 }
11843 return true;
11844 };
11845 BVTy ShuffleBuilder(Params...);
11846 ResTy Res = ResTy();
11847 SmallVector<int> Mask;
11848 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
11849 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
11850 Value *ExtractVecBase = nullptr;
11851 bool UseVecBaseAsInput = false;
11852 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
11853 SmallVector<SmallVector<const TreeEntry *>> Entries;
11854 Type *ScalarTy = GatheredScalars.front()->getType();
11855 auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
11856 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11857 if (NumParts == 0 || NumParts >= GatheredScalars.size())
11858 NumParts = 1;
11859 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
11860 // Check for gathered extracts.
11861 bool Resized = false;
11862 ExtractShuffles =
11863 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
11864 if (!ExtractShuffles.empty()) {
11865 SmallVector<const TreeEntry *> ExtractEntries;
11866 for (auto [Idx, I] : enumerate(ExtractMask)) {
11867 if (I == PoisonMaskElem)
11868 continue;
11869 if (const auto *TE = getTreeEntry(
11870 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
11871 ExtractEntries.push_back(TE);
11872 }
11873 if (std::optional<ResTy> Delayed =
11874 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
11875 // Delay emission of gathers which are not ready yet.
11876 PostponedGathers.insert(E);
11877 // Postpone gather emission, will be emitted after the end of the
11878 // process to keep correct order.
11879 return *Delayed;
11880 }
11881 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
11882 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
11883 ExtractVecBase = VecBase;
11884 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
11885 if (VF == VecBaseTy->getNumElements() &&
11886 GatheredScalars.size() != VF) {
11887 Resized = true;
11888 GatheredScalars.append(VF - GatheredScalars.size(),
11889 PoisonValue::get(ScalarTy));
11890 }
11891 }
11892 }
11893 // Gather extracts after we check for full matched gathers only.
11894 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
11895 E->isAltShuffle() ||
11896 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
11897 isSplat(E->Scalars) ||
11898 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
11899 GatherShuffles =
11900 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
11901 }
11902 if (!GatherShuffles.empty()) {
11903 if (std::optional<ResTy> Delayed =
11904 ShuffleBuilder.needToDelay(E, Entries)) {
11905 // Delay emission of gathers which are not ready yet.
11906 PostponedGathers.insert(E);
11907 // Postpone gather emission, will be emitted after the end of the
11908 // process to keep correct order.
11909 return *Delayed;
11910 }
11911 if (GatherShuffles.size() == 1 &&
11912 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
11913 Entries.front().front()->isSame(E->Scalars)) {
11914 // Perfect match in the graph, will reuse the previously vectorized
11915 // node. Cost is 0.
11916 LLVM_DEBUG(
11917 dbgs()
11918 << "SLP: perfect diamond match for gather bundle "
11919 << shortBundleName(E->Scalars) << ".\n");
11920 // Restore the mask for previous partially matched values.
11921 Mask.resize(E->Scalars.size());
11922 const TreeEntry *FrontTE = Entries.front().front();
11923 if (FrontTE->ReorderIndices.empty() &&
11924 ((FrontTE->ReuseShuffleIndices.empty() &&
11925 E->Scalars.size() == FrontTE->Scalars.size()) ||
11926 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
11927 std::iota(Mask.begin(), Mask.end(), 0);
11928 } else {
11929 for (auto [I, V] : enumerate(E->Scalars)) {
11930 if (isa<PoisonValue>(V)) {
11931 Mask[I] = PoisonMaskElem;
11932 continue;
11933 }
11934 Mask[I] = FrontTE->findLaneForValue(V);
11935 }
11936 }
11937 ShuffleBuilder.add(*FrontTE, Mask);
11938 Res = ShuffleBuilder.finalize(E->getCommonMask());
11939 return Res;
11940 }
11941 if (!Resized) {
11942 if (GatheredScalars.size() != VF &&
11943 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
11944 return any_of(TEs, [&](const TreeEntry *TE) {
11945 return TE->getVectorFactor() == VF;
11946 });
11947 }))
11948 GatheredScalars.append(VF - GatheredScalars.size(),
11949 PoisonValue::get(ScalarTy));
11950 }
11951 // Remove shuffled elements from list of gathers.
11952 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11953 if (Mask[I] != PoisonMaskElem)
11954 GatheredScalars[I] = PoisonValue::get(ScalarTy);
11955 }
11956 }
11957 }
11958 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
11959 SmallVectorImpl<int> &ReuseMask,
11960 bool IsRootPoison) {
11961 // For splats we can emit broadcasts instead of gathers, so try to find
11962 // such sequences.
11963 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
11964 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
11965 Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
11966 SmallVector<int> UndefPos;
11967 DenseMap<Value *, unsigned> UniquePositions;
11968 // Gather unique non-const values and all constant values.
11969 // For repeated values, just shuffle them.
11970 int NumNonConsts = 0;
11971 int SinglePos = 0;
11972 for (auto [I, V] : enumerate(Scalars)) {
11973 if (isa<UndefValue>(V)) {
11974 if (!isa<PoisonValue>(V)) {
11975 ReuseMask[I] = I;
11976 UndefPos.push_back(I);
11977 }
11978 continue;
11979 }
11980 if (isConstant(V)) {
11981 ReuseMask[I] = I;
11982 continue;
11983 }
11984 ++NumNonConsts;
11985 SinglePos = I;
11986 Value *OrigV = V;
11987 Scalars[I] = PoisonValue::get(ScalarTy);
11988 if (IsSplat) {
11989 Scalars.front() = OrigV;
11990 ReuseMask[I] = 0;
11991 } else {
11992 const auto Res = UniquePositions.try_emplace(OrigV, I);
11993 Scalars[Res.first->second] = OrigV;
11994 ReuseMask[I] = Res.first->second;
11995 }
11996 }
11997 if (NumNonConsts == 1) {
11998 // Restore single insert element.
11999 if (IsSplat) {
12000 ReuseMask.assign(VF, PoisonMaskElem);
12001 std::swap(Scalars.front(), Scalars[SinglePos]);
12002 if (!UndefPos.empty() && UndefPos.front() == 0)
12003 Scalars.front() = UndefValue::get(ScalarTy);
12004 }
12005 ReuseMask[SinglePos] = SinglePos;
12006 } else if (!UndefPos.empty() && IsSplat) {
12007 // For undef values, try to replace them with the simple broadcast.
12008 // We can do it if the broadcasted value is guaranteed to be
12009 // non-poisonous, or by freezing the incoming scalar value first.
12010 auto *It = find_if(Scalars, [this, E](Value *V) {
12011 return !isa<UndefValue>(V) &&
12012 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12013 (E->UserTreeIndices.size() == 1 &&
12014 any_of(V->uses(), [E](const Use &U) {
12015 // Check if the value is already used in the same operation in
12016 // one of the nodes.
12017 return E->UserTreeIndices.front().EdgeIdx !=
12018 U.getOperandNo() &&
12019 is_contained(
12020 E->UserTreeIndices.front().UserTE->Scalars,
12021 U.getUser());
12022 })));
12023 });
12024 if (It != Scalars.end()) {
12025 // Replace undefs by the non-poisoned scalars and emit broadcast.
12026 int Pos = std::distance(Scalars.begin(), It);
12027 for (int I : UndefPos) {
12028 // Set the undef position to the non-poisoned scalar.
12029 ReuseMask[I] = Pos;
12030 // Replace the undef with poison; in the mask it is already replaced
12031 // by the non-poisoned scalar.
12032 if (I != Pos)
12033 Scalars[I] = PoisonValue::get(ScalarTy);
12034 }
12035 } else {
12036 // Replace undefs by the poisons, emit broadcast and then emit
12037 // freeze.
12038 for (int I : UndefPos) {
12039 ReuseMask[I] = PoisonMaskElem;
12040 if (isa<UndefValue>(Scalars[I]))
12041 Scalars[I] = PoisonValue::get(ScalarTy);
12042 }
12043 NeedFreeze = true;
12044 }
12045 }
12046 };
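// Three emission strategies follow: (1) reuse the matched extract/gather
// shuffles and patch in the remaining scalars, (2) gather the unique scalars
// into a fresh build vector, or (3) if everything is constant, emit a single
// constant vector.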
12047 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12048 bool IsNonPoisoned = true;
12049 bool IsUsedInExpr = true;
12050 Value *Vec1 = nullptr;
12051 if (!ExtractShuffles.empty()) {
12052 // A gather of extractelements can be represented as just a shuffle of
12053 // the one or two vectors the scalars are extracted from.
12054 // Find the input vectors.
12055 Value *Vec2 = nullptr;
12056 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12057 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12058 ExtractMask[I] = PoisonMaskElem;
12059 }
12060 if (UseVecBaseAsInput) {
12061 Vec1 = ExtractVecBase;
12062 } else {
12063 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12064 if (ExtractMask[I] == PoisonMaskElem)
12065 continue;
12066 if (isa<UndefValue>(E->Scalars[I]))
12067 continue;
12068 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12069 Value *VecOp = EI->getVectorOperand();
12070 if (const auto *TE = getTreeEntry(VecOp))
12071 if (TE->VectorizedValue)
12072 VecOp = TE->VectorizedValue;
12073 if (!Vec1) {
12074 Vec1 = VecOp;
12075 } else if (Vec1 != VecOp) {
12076 assert((!Vec2 || Vec2 == VecOp) &&
12077 "Expected only 1 or 2 vectors shuffle.");
12078 Vec2 = VecOp;
12079 }
12080 }
12081 }
12082 if (Vec2) {
12083 IsUsedInExpr = false;
12084 IsNonPoisoned &=
12085 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12086 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12087 } else if (Vec1) {
12088 IsUsedInExpr &= FindReusedSplat(
12089 ExtractMask,
12090 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12091 ExtractMask.size());
12092 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12093 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12094 } else {
12095 IsUsedInExpr = false;
12096 ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
12097 ScalarTy, GatheredScalars.size())),
12098 ExtractMask, /*ForExtracts=*/true);
12099 }
12100 }
12101 if (!GatherShuffles.empty()) {
12102 unsigned SliceSize = E->Scalars.size() / NumParts;
12103 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12104 for (const auto [I, TEs] : enumerate(Entries)) {
12105 if (TEs.empty()) {
12106 assert(!GatherShuffles[I] &&
12107 "No shuffles with empty entries list expected.");
12108 continue;
12109 }
12110 assert((TEs.size() == 1 || TEs.size() == 2) &&
12111 "Expected shuffle of 1 or 2 entries.");
12112 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
12113 VecMask.assign(VecMask.size(), PoisonMaskElem);
12114 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12115 if (TEs.size() == 1) {
12116 IsUsedInExpr &= FindReusedSplat(
12117 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12118 ShuffleBuilder.add(*TEs.front(), VecMask);
12119 if (TEs.front()->VectorizedValue)
12120 IsNonPoisoned &=
12121 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12122 } else {
12123 IsUsedInExpr = false;
12124 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12125 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12126 IsNonPoisoned &=
12127 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12128 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12129 }
12130 }
12131 }
12132 // Try to figure out best way to combine values: build a shuffle and insert
12133 // elements or just build several shuffles.
12134 // Insert non-constant scalars.
12135 SmallVector<Value *> NonConstants(GatheredScalars);
12136 int EMSz = ExtractMask.size();
12137 int MSz = Mask.size();
12138 // Try to build a constant vector and shuffle with it only if currently we
12139 // have a single permutation and more than one scalar constant.
12140 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12141 bool IsIdentityShuffle =
12142 ((UseVecBaseAsInput ||
12143 all_of(ExtractShuffles,
12144 [](const std::optional<TTI::ShuffleKind> &SK) {
12145 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12146 TTI::SK_PermuteSingleSrc;
12147 })) &&
12148 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12149 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
12150 (!GatherShuffles.empty() &&
12151 all_of(GatherShuffles,
12152 [](const std::optional<TTI::ShuffleKind> &SK) {
12153 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12154 TTI::SK_PermuteSingleSrc;
12155 }) &&
12156 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12157 ShuffleVectorInst::isIdentityMask(Mask, MSz));
12158 bool EnoughConstsForShuffle =
12159 IsSingleShuffle &&
12160 (none_of(GatheredScalars,
12161 [](Value *V) {
12162 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12163 }) ||
12164 any_of(GatheredScalars,
12165 [](Value *V) {
12166 return isa<Constant>(V) && !isa<UndefValue>(V);
12167 })) &&
12168 (!IsIdentityShuffle ||
12169 (GatheredScalars.size() == 2 &&
12170 any_of(GatheredScalars,
12171 [](Value *V) { return !isa<UndefValue>(V); })) ||
12172 count_if(GatheredScalars, [](Value *V) {
12173 return isa<Constant>(V) && !isa<PoisonValue>(V);
12174 }) > 1);
12175 // The NonConstants array contains just the non-constant values; GatheredScalars
12176 // contains only constants, used to build the final vector and then shuffle.
12177 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12178 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
12179 NonConstants[I] = PoisonValue::get(ScalarTy);
12180 else
12181 GatheredScalars[I] = PoisonValue::get(ScalarTy);
12182 }
12183 // Generate constants for final shuffle and build a mask for them.
12184 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12185 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12186 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12187 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12188 ShuffleBuilder.add(BV, BVMask);
12189 }
12190 if (all_of(NonConstants, [=](Value *V) {
12191 return isa<PoisonValue>(V) ||
12192 (IsSingleShuffle && ((IsIdentityShuffle &&
12193 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12194 }))
12195 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12196 else
12197 Res = ShuffleBuilder.finalize(
12198 E->ReuseShuffleIndices, E->Scalars.size(),
12199 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12200 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12201 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12202 });
12203 } else if (!allConstant(GatheredScalars)) {
12204 // Gather unique scalars and all constants.
12205 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12206 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12207 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12208 ShuffleBuilder.add(BV, ReuseMask);
12209 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12210 } else {
12211 // Gather all constants.
12212 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12213 for (auto [I, V] : enumerate(E->Scalars)) {
12214 if (!isa<PoisonValue>(V))
12215 Mask[I] = I;
12216 }
12217 Value *BV = ShuffleBuilder.gather(E->Scalars);
12218 ShuffleBuilder.add(BV, Mask);
12219 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12220 }
12221
12222 if (NeedFreeze)
12223 Res = ShuffleBuilder.createFreeze(Res);
12224 return Res;
12225}
12226
12227Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
12228 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
12229 *this);
12230}
12231
12232Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12233 IRBuilderBase::InsertPointGuard Guard(Builder);
12234
12235 if (E->VectorizedValue &&
12236 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12237 E->isAltShuffle())) {
12238 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12239 return E->VectorizedValue;
12240 }
12241
12242 if (E->State == TreeEntry::NeedToGather) {
12243 // Set insert point for non-reduction initial nodes.
12244 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12245 setInsertPointAfterBundle(E);
12246 Value *Vec = createBuildVector(E);
12247 E->VectorizedValue = Vec;
12248 return Vec;
12249 }
12250
12251 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
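// FinalShuffle applies the entry's reordering (or, for stores, the explicit
// reorder mask) and its reuse shuffle indices to a freshly built vector value.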
12252 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12253 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
12254 if (E->getOpcode() == Instruction::Store) {
12255 ArrayRef<int> Mask =
12256 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12257 E->ReorderIndices.size());
12258 ShuffleBuilder.add(V, Mask);
12259 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12260 ShuffleBuilder.addOrdered(V, std::nullopt);
12261 } else {
12262 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12263 }
12264 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12265 };
12266
12267 assert((E->State == TreeEntry::Vectorize ||
12268 E->State == TreeEntry::ScatterVectorize ||
12269 E->State == TreeEntry::StridedVectorize) &&
12270 "Unhandled state");
12271 unsigned ShuffleOrOp =
12272 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12273 Instruction *VL0 = E->getMainOp();
12274 Type *ScalarTy = VL0->getType();
12275 if (auto *Store = dyn_cast<StoreInst>(VL0))
12276 ScalarTy = Store->getValueOperand()->getType();
12277 else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
12278 ScalarTy = IE->getOperand(1)->getType();
12279 auto It = MinBWs.find(E);
12280 if (It != MinBWs.end())
12281 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
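// GetOperandSignedness reports whether a (possibly bitwidth-minimized) operand
// must be extended as signed when cast back to the expected vector type:
// either MinBWs recorded it as signed, or one of its scalars may be negative.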
12282 auto GetOperandSignedness = [&](unsigned Idx) {
12283 const TreeEntry *OpE = getOperandEntry(E, Idx);
12284 bool IsSigned = false;
12285 auto It = MinBWs.find(OpE);
12286 if (It != MinBWs.end())
12287 IsSigned = It->second.second;
12288 else
12289 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
12290 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12291 });
12292 return IsSigned;
12293 };
12294 auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
12295 switch (ShuffleOrOp) {
12296 case Instruction::PHI: {
12297 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12298 E != VectorizableTree.front().get() ||
12299 !E->UserTreeIndices.empty()) &&
12300 "PHI reordering is free.");
12301 if (PostponedPHIs && E->VectorizedValue)
12302 return E->VectorizedValue;
12303 auto *PH = cast<PHINode>(VL0);
12304 Builder.SetInsertPoint(PH->getParent(),
12305 PH->getParent()->getFirstNonPHIIt());
12306 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12307 if (PostponedPHIs || !E->VectorizedValue) {
12308 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
12309 E->PHI = NewPhi;
12310 Value *V = NewPhi;
12311
12312 // Adjust insertion point once all PHI's have been generated.
12313 Builder.SetInsertPoint(PH->getParent(),
12314 PH->getParent()->getFirstInsertionPt());
12315 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12316
12317 V = FinalShuffle(V, E, VecTy);
12318
12319 E->VectorizedValue = V;
12320 if (PostponedPHIs)
12321 return V;
12322 }
12323 PHINode *NewPhi = cast<PHINode>(E->PHI);
12324 // If the phi node is fully emitted, exit.
12325 if (NewPhi->getNumIncomingValues() != 0)
12326 return NewPhi;
12327
12328 // PHINodes may have multiple entries from the same block. We want to
12329 // visit every block once.
12330 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12331
12332 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12333 ValueList Operands;
12334 BasicBlock *IBB = PH->getIncomingBlock(I);
12335
12336 // Stop emission if all incoming values are generated.
12337 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12338 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12339 return NewPhi;
12340 }
12341
12342 if (!VisitedBBs.insert(IBB).second) {
12343 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
12344 continue;
12345 }
12346
12347 Builder.SetInsertPoint(IBB->getTerminator());
12348 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12349 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
12350 if (VecTy != Vec->getType()) {
12351 assert((It != MinBWs.end() ||
12352 getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
12353 MinBWs.contains(getOperandEntry(E, I))) &&
12354 "Expected item in MinBWs.");
12355 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
12356 }
12357 NewPhi->addIncoming(Vec, IBB);
12358 }
12359
12360 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12361 "Invalid number of incoming values");
12362 return NewPhi;
12363 }
12364
12365 case Instruction::ExtractElement: {
12366 Value *V = E->getSingleOperand(0);
12367 if (const TreeEntry *TE = getTreeEntry(V))
12368 V = TE->VectorizedValue;
12369 setInsertPointAfterBundle(E);
12370 V = FinalShuffle(V, E, VecTy);
12371 E->VectorizedValue = V;
12372 return V;
12373 }
12374 case Instruction::ExtractValue: {
12375 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12376 Builder.SetInsertPoint(LI);
12377 Value *Ptr = LI->getPointerOperand();
12378 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
12379 Value *NewV = propagateMetadata(V, E->Scalars);
12380 NewV = FinalShuffle(NewV, E, VecTy);
12381 E->VectorizedValue = NewV;
12382 return NewV;
12383 }
12384 case Instruction::InsertElement: {
12385 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12386 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
12387 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
12388 ArrayRef<Value *> Op = E->getOperand(1);
12389 Type *ScalarTy = Op.front()->getType();
12390 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
12391 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12392 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
12393 assert(Res.first > 0 && "Expected item in MinBWs.");
12394 V = Builder.CreateIntCast(
12395 V,
12396 FixedVectorType::get(
12397 ScalarTy,
12398 cast<FixedVectorType>(V->getType())->getNumElements()),
12399 Res.second);
12400 }
12401
12402 // Create InsertVector shuffle if necessary
12403 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
12404 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12405 }));
12406 const unsigned NumElts =
12407 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12408 const unsigned NumScalars = E->Scalars.size();
12409
12410 unsigned Offset = *getInsertIndex(VL0);
12411 assert(Offset < NumElts && "Failed to find vector index offset");
12412
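// The masks built below place the NumScalars vectorized values at their insert
// positions inside the wider NumElts destination vector and merge them with
// the original buildvector base where that base is not undef/poison.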
12413 // Create shuffle to resize vector
12414 SmallVector<int> Mask;
12415 if (!E->ReorderIndices.empty()) {
12416 inversePermutation(E->ReorderIndices, Mask);
12417 Mask.append(NumElts - NumScalars, PoisonMaskElem);
12418 } else {
12419 Mask.assign(NumElts, PoisonMaskElem);
12420 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
12421 }
12422 // Create InsertVector shuffle if necessary
12423 bool IsIdentity = true;
12424 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12425 Mask.swap(PrevMask);
12426 for (unsigned I = 0; I < NumScalars; ++I) {
12427 Value *Scalar = E->Scalars[PrevMask[I]];
12428 unsigned InsertIdx = *getInsertIndex(Scalar);
12429 IsIdentity &= InsertIdx - Offset == I;
12430 Mask[InsertIdx - Offset] = I;
12431 }
12432 if (!IsIdentity || NumElts != NumScalars) {
12433 Value *V2 = nullptr;
12434 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12435 SmallVector<int> InsertMask(Mask);
12436 if (NumElts != NumScalars && Offset == 0) {
12437 // Follow all insert element instructions from the current buildvector
12438 // sequence.
12439 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
12440 do {
12441 std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
12442 if (!InsertIdx)
12443 break;
12444 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12445 InsertMask[*InsertIdx] = *InsertIdx;
12446 if (!Ins->hasOneUse())
12447 break;
12448 Ins = dyn_cast_or_null<InsertElementInst>(
12449 Ins->getUniqueUndroppableUser());
12450 } while (Ins);
12451 SmallBitVector UseMask =
12452 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12453 SmallBitVector IsFirstPoison =
12454 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12455 SmallBitVector IsFirstUndef =
12456 isUndefVector(FirstInsert->getOperand(0), UseMask);
12457 if (!IsFirstPoison.all()) {
12458 unsigned Idx = 0;
12459 for (unsigned I = 0; I < NumElts; I++) {
12460 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
12461 IsFirstUndef.test(I)) {
12462 if (IsVNonPoisonous) {
12463 InsertMask[I] = I < NumScalars ? I : 0;
12464 continue;
12465 }
12466 if (!V2)
12467 V2 = UndefValue::get(V->getType());
12468 if (Idx >= NumScalars)
12469 Idx = NumScalars - 1;
12470 InsertMask[I] = NumScalars + Idx;
12471 ++Idx;
12472 } else if (InsertMask[I] != PoisonMaskElem &&
12473 Mask[I] == PoisonMaskElem) {
12474 InsertMask[I] = PoisonMaskElem;
12475 }
12476 }
12477 } else {
12478 InsertMask = Mask;
12479 }
12480 }
12481 if (!V2)
12482 V2 = PoisonValue::get(V->getType());
12483 V = Builder.CreateShuffleVector(V, V2, InsertMask);
12484 if (auto *I = dyn_cast<Instruction>(V)) {
12485 GatherShuffleExtractSeq.insert(I);
12486 CSEBlocks.insert(I->getParent());
12487 }
12488 }
12489
12490 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
12491 for (unsigned I = 0; I < NumElts; I++) {
12492 if (Mask[I] != PoisonMaskElem)
12493 InsertMask[Offset + I] = I;
12494 }
12495 SmallBitVector UseMask =
12496 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12497 SmallBitVector IsFirstUndef =
12498 isUndefVector(FirstInsert->getOperand(0), UseMask);
12499 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
12500 NumElts != NumScalars) {
12501 if (IsFirstUndef.all()) {
12502 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
12503 SmallBitVector IsFirstPoison =
12504 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12505 if (!IsFirstPoison.all()) {
12506 for (unsigned I = 0; I < NumElts; I++) {
12507 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
12508 InsertMask[I] = I + NumElts;
12509 }
12510 }
12511 V = Builder.CreateShuffleVector(
12512 V,
12513 IsFirstPoison.all() ? PoisonValue::get(V->getType())
12514 : FirstInsert->getOperand(0),
12515 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
12516 if (auto *I = dyn_cast<Instruction>(V)) {
12517 GatherShuffleExtractSeq.insert(I);
12518 CSEBlocks.insert(I->getParent());
12519 }
12520 }
12521 } else {
12522 SmallBitVector IsFirstPoison =
12523 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12524 for (unsigned I = 0; I < NumElts; I++) {
12525 if (InsertMask[I] == PoisonMaskElem)
12526 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
12527 else
12528 InsertMask[I] += NumElts;
12529 }
12530 V = Builder.CreateShuffleVector(
12531 FirstInsert->getOperand(0), V, InsertMask,
12532 cast<Instruction>(E->Scalars.back())->getName());
12533 if (auto *I = dyn_cast<Instruction>(V)) {
12534 GatherShuffleExtractSeq.insert(I);
12535 CSEBlocks.insert(I->getParent());
12536 }
12537 }
12538 }
12539
12540 ++NumVectorInstructions;
12541 E->VectorizedValue = V;
12542 return V;
12543 }
12544 case Instruction::ZExt:
12545 case Instruction::SExt:
12546 case Instruction::FPToUI:
12547 case Instruction::FPToSI:
12548 case Instruction::FPExt:
12549 case Instruction::PtrToInt:
12550 case Instruction::IntToPtr:
12551 case Instruction::SIToFP:
12552 case Instruction::UIToFP:
12553 case Instruction::Trunc:
12554 case Instruction::FPTrunc:
12555 case Instruction::BitCast: {
12556 setInsertPointAfterBundle(E);
12557
12558 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
12559 if (E->VectorizedValue) {
12560 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12561 return E->VectorizedValue;
12562 }
12563
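// If either side of the cast was bitwidth-minimized (MinBWs), re-derive the
// vector cast opcode from the effective source and destination widths: equal
// widths become a bitcast, narrowing becomes a trunc, and widening becomes a
// sext or zext depending on the recorded signedness.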
12564 auto *CI = cast<CastInst>(VL0);
12565 Instruction::CastOps VecOpcode = CI->getOpcode();
12566 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
12567 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
12568 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
12569 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
12570 SrcScalarTy != CI->getOperand(0)->getType())) {
12571 // Check if the values are candidates to demote.
12572 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
12573 if (SrcIt != MinBWs.end())
12574 SrcBWSz = SrcIt->second.first;
12575 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
12576 if (BWSz == SrcBWSz) {
12577 VecOpcode = Instruction::BitCast;
12578 } else if (BWSz < SrcBWSz) {
12579 VecOpcode = Instruction::Trunc;
12580 } else if (It != MinBWs.end()) {
12581 assert(BWSz > SrcBWSz && "Invalid cast!");
12582 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12583 } else if (SrcIt != MinBWs.end()) {
12584 assert(BWSz > SrcBWSz && "Invalid cast!");
12585 VecOpcode =
12586 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12587 }
12588 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
12589 !SrcIt->second.second) {
12590 VecOpcode = Instruction::UIToFP;
12591 }
12592 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12593 ? InVec
12594 : Builder.CreateCast(VecOpcode, InVec, VecTy);
12595 V = FinalShuffle(V, E, VecTy);
12596
12597 E->VectorizedValue = V;
12598 ++NumVectorInstructions;
12599 return V;
12600 }
12601 case Instruction::FCmp:
12602 case Instruction::ICmp: {
12603 setInsertPointAfterBundle(E);
12604
12605 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
12606 if (E->VectorizedValue) {
12607 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12608 return E->VectorizedValue;
12609 }
12610 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
12611 if (E->VectorizedValue) {
12612 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12613 return E->VectorizedValue;
12614 }
12615 if (L->getType() != R->getType()) {
12616 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12617 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12618 MinBWs.contains(getOperandEntry(E, 0)) ||
12619 MinBWs.contains(getOperandEntry(E, 1))) &&
12620 "Expected item in MinBWs.");
12621 if (cast<VectorType>(L->getType())
12622 ->getElementType()
12623 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
12624 ->getElementType()
12625 ->getIntegerBitWidth()) {
12626 Type *CastTy = R->getType();
12627 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
12628 } else {
12629 Type *CastTy = L->getType();
12630 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
12631 }
12632 }
12633
12634 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
12635 Value *V = Builder.CreateCmp(P0, L, R);
12636 propagateIRFlags(V, E->Scalars, VL0);
12637 // Do not cast for cmps.
12638 VecTy = cast<FixedVectorType>(V->getType());
12639 V = FinalShuffle(V, E, VecTy);
12640
12641 E->VectorizedValue = V;
12642 ++NumVectorInstructions;
12643 return V;
12644 }
12645 case Instruction::Select: {
12646 setInsertPointAfterBundle(E);
12647
12648 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
12649 if (E->VectorizedValue) {
12650 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12651 return E->VectorizedValue;
12652 }
12653 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
12654 if (E->VectorizedValue) {
12655 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12656 return E->VectorizedValue;
12657 }
12658 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
12659 if (E->VectorizedValue) {
12660 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12661 return E->VectorizedValue;
12662 }
12663 if (True->getType() != VecTy || False->getType() != VecTy) {
12664 assert((It != MinBWs.end() ||
12665 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12666 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
12667 MinBWs.contains(getOperandEntry(E, 1)) ||
12668 MinBWs.contains(getOperandEntry(E, 2))) &&
12669 "Expected item in MinBWs.");
12670 if (True->getType() != VecTy)
12671 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
12672 if (False->getType() != VecTy)
12673 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
12674 }
12675
12676 Value *V = Builder.CreateSelect(Cond, True, False);
12677 V = FinalShuffle(V, E, VecTy);
12678
12679 E->VectorizedValue = V;
12680 ++NumVectorInstructions;
12681 return V;
12682 }
12683 case Instruction::FNeg: {
12684 setInsertPointAfterBundle(E);
12685
12686 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
12687
12688 if (E->VectorizedValue) {
12689 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12690 return E->VectorizedValue;
12691 }
12692
12693 Value *V = Builder.CreateUnOp(
12694 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
12695 propagateIRFlags(V, E->Scalars, VL0);
12696 if (auto *I = dyn_cast<Instruction>(V))
12697 V = propagateMetadata(I, E->Scalars);
12698
12699 V = FinalShuffle(V, E, VecTy);
12700
12701 E->VectorizedValue = V;
12702 ++NumVectorInstructions;
12703
12704 return V;
12705 }
12706 case Instruction::Add:
12707 case Instruction::FAdd:
12708 case Instruction::Sub:
12709 case Instruction::FSub:
12710 case Instruction::Mul:
12711 case Instruction::FMul:
12712 case Instruction::UDiv:
12713 case Instruction::SDiv:
12714 case Instruction::FDiv:
12715 case Instruction::URem:
12716 case Instruction::SRem:
12717 case Instruction::FRem:
12718 case Instruction::Shl:
12719 case Instruction::LShr:
12720 case Instruction::AShr:
12721 case Instruction::And:
12722 case Instruction::Or:
12723 case Instruction::Xor: {
12724 setInsertPointAfterBundle(E);
12725
12726 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
12727 if (E->VectorizedValue) {
12728 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12729 return E->VectorizedValue;
12730 }
12731 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
12732 if (E->VectorizedValue) {
12733 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12734 return E->VectorizedValue;
12735 }
12736 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
12737 assert((It != MinBWs.end() ||
12738 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12739 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12740 MinBWs.contains(getOperandEntry(E, 0)) ||
12741 MinBWs.contains(getOperandEntry(E, 1))) &&
12742 "Expected item in MinBWs.");
12743 if (LHS->getType() != VecTy)
12744 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
12745 if (RHS->getType() != VecTy)
12746 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
12747 }
12748
12749 Value *V = Builder.CreateBinOp(
12750 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
12751 RHS);
12752 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
12753 if (auto *I = dyn_cast<Instruction>(V)) {
12754 V = propagateMetadata(I, E->Scalars);
12755 // Drop nuw flags for abs(sub(commutative), true).
12756 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
12757 any_of(E->Scalars, [](Value *V) {
12758 return isCommutative(cast<Instruction>(V));
12759 }))
12760 I->setHasNoUnsignedWrap(/*b=*/false);
12761 }
12762
12763 V = FinalShuffle(V, E, VecTy);
12764
12765 E->VectorizedValue = V;
12766 ++NumVectorInstructions;
12767
12768 return V;
12769 }
12770 case Instruction::Load: {
12771 // Loads are inserted at the head of the tree because we don't want to
12772 // sink them all the way down past store instructions.
12773 setInsertPointAfterBundle(E);
12774
12775 LoadInst *LI = cast<LoadInst>(VL0);
12776 Instruction *NewLI;
12777 Value *PO = LI->getPointerOperand();
12778 if (E->State == TreeEntry::Vectorize) {
12779 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
12780 } else if (E->State == TreeEntry::StridedVectorize) {
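// Strided loads: if the pointer difference between the first and last scalar
// load is known, a constant stride is used; otherwise a runtime stride is
// computed. The whole bundle is then lowered to a single
// llvm.experimental.vp.strided.load call with an all-ones mask, schematically:
//   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
//            ptr align 4 %base, i64 %stride, <4 x i1> <all ones>, i32 4)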
12781 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
12782 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
12783 PO = IsReverseOrder ? PtrN : Ptr0;
12784 std::optional<int> Diff = getPointersDiff(
12785 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
12786 Type *StrideTy = DL->getIndexType(PO->getType());
12787 Value *StrideVal;
12788 if (Diff) {
12789 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
12790 StrideVal =
12791 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
12792 DL->getTypeAllocSize(ScalarTy));
12793 } else {
12794 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
12795 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
12796 return cast<LoadInst>(V)->getPointerOperand();
12797 });
12798 OrdersType Order;
12799 std::optional<Value *> Stride =
12800 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
12801 &*Builder.GetInsertPoint());
12802 Value *NewStride =
12803 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
12804 StrideVal = Builder.CreateMul(
12805 NewStride,
12806 ConstantInt::get(
12807 StrideTy,
12808 (IsReverseOrder ? -1 : 1) *
12809 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
12810 }
12811 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
12812 auto *Inst = Builder.CreateIntrinsic(
12813 Intrinsic::experimental_vp_strided_load,
12814 {VecTy, PO->getType(), StrideTy},
12815 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
12816 Builder.getInt32(E->Scalars.size())});
12817 Inst->addParamAttr(
12818 /*ArgNo=*/0,
12819 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
12820 NewLI = Inst;
12821 } else {
12822 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
12823 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
12824 if (E->VectorizedValue) {
12825 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12826 return E->VectorizedValue;
12827 }
12828 // Use the minimum alignment of the gathered loads.
12829 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
12830 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
12831 }
12832 Value *V = propagateMetadata(NewLI, E->Scalars);
12833
12834 V = FinalShuffle(V, E, VecTy);
12835 E->VectorizedValue = V;
12836 ++NumVectorInstructions;
12837 return V;
12838 }
12839 case Instruction::Store: {
12840 auto *SI = cast<StoreInst>(VL0);
12841
12842 setInsertPointAfterBundle(E);
12843
12844 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
12845 if (VecValue->getType() != VecTy)
12846 VecValue =
12847 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
12848 VecValue = FinalShuffle(VecValue, E, VecTy);
12849
12850 Value *Ptr = SI->getPointerOperand();
12851 StoreInst *ST =
12852 Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
12853
12854 Value *V = propagateMetadata(ST, E->Scalars);
12855
12856 E->VectorizedValue = V;
12857 ++NumVectorInstructions;
12858 return V;
12859 }
12860 case Instruction::GetElementPtr: {
12861 auto *GEP0 = cast<GetElementPtrInst>(VL0);
12862 setInsertPointAfterBundle(E);
12863
12864 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
12865 if (E->VectorizedValue) {
12866 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12867 return E->VectorizedValue;
12868 }
12869
12870 SmallVector<Value *> OpVecs;
12871 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
12872 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
12873 if (E->VectorizedValue) {
12874 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12875 return E->VectorizedValue;
12876 }
12877 OpVecs.push_back(OpVec);
12878 }
12879
12880 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
12881 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
12882 SmallVector<Value *> GEPs;
12883 for (Value *V : E->Scalars) {
12884 if (isa<GetElementPtrInst>(V))
12885 GEPs.push_back(V);
12886 }
12887 V = propagateMetadata(I, GEPs);
12888 }
12889
12890 V = FinalShuffle(V, E, VecTy);
12891
12892 E->VectorizedValue = V;
12893 ++NumVectorInstructions;
12894
12895 return V;
12896 }
12897 case Instruction::Call: {
12898 CallInst *CI = cast<CallInst>(VL0);
12899 setInsertPointAfterBundle(E);
12900
12901 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12902
12903 SmallVector<Type *> ArgTys =
12904 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
12905 It != MinBWs.end() ? It->second.first : 0);
12906 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
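// getVectorCallCosts returns {intrinsic cost, library-call cost}; prefer the
// vector intrinsic when it is not more expensive, otherwise fall back to a
// vectorized library function found via VFDatabase below.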
12907 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
12908 VecCallCosts.first <= VecCallCosts.second;
12909
12910 Value *ScalarArg = nullptr;
12911 SmallVector<Value *> OpVecs;
12912 SmallVector<Type *, 2> TysForDecl;
12913 // Add return type if intrinsic is overloaded on it.
12914 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
12915 TysForDecl.push_back(VecTy);
12916 auto *CEI = cast<CallInst>(VL0);
12917 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
12918 ValueList OpVL;
12919 // Some intrinsics have scalar arguments. This argument should not be
12920 // vectorized.
12921 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
12922 ScalarArg = CEI->getArgOperand(I);
12923 // If we decided to reduce the bitwidth of the abs intrinsic, its second
12924 // argument must be set to false (do not return poison if the value is the signed minimum).
12925 if (ID == Intrinsic::abs && It != MinBWs.end() &&
12926 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
12927 ScalarArg = Builder.getFalse();
12928 OpVecs.push_back(ScalarArg);
12929 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
12930 TysForDecl.push_back(ScalarArg->getType());
12931 continue;
12932 }
12933
12934 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
12935 if (E->VectorizedValue) {
12936 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12937 return E->VectorizedValue;
12938 }
12939 ScalarArg = CEI->getArgOperand(I);
12940 if (cast<VectorType>(OpVec->getType())->getElementType() !=
12941 ScalarArg->getType() &&
12942 It == MinBWs.end()) {
12943 auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
12944 VecTy->getNumElements());
12945 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
12946 } else if (It != MinBWs.end()) {
12947 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
12948 }
12949 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
12950 OpVecs.push_back(OpVec);
12951 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
12952 TysForDecl.push_back(OpVec->getType());
12953 }
12954
12955 Function *CF;
12956 if (!UseIntrinsic) {
12957 VFShape Shape =
12958 VFShape::get(CI->getFunctionType(),
12959 ElementCount::getFixed(
12960 static_cast<unsigned>(VecTy->getNumElements())),
12961 false /*HasGlobalPred*/);
12962 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
12963 } else {
12964 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
12965 }
12966
12967 SmallVector<OperandBundleDef, 1> OpBundles;
12968 CI->getOperandBundlesAsDefs(OpBundles);
12969 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
12970
12971 propagateIRFlags(V, E->Scalars, VL0);
12972 V = FinalShuffle(V, E, VecTy);
12973
12974 E->VectorizedValue = V;
12975 ++NumVectorInstructions;
12976 return V;
12977 }
12978 case Instruction::ShuffleVector: {
12979 assert(E->isAltShuffle() &&
12980 ((Instruction::isBinaryOp(E->getOpcode()) &&
12981 Instruction::isBinaryOp(E->getAltOpcode())) ||
12982 (Instruction::isCast(E->getOpcode()) &&
12983 Instruction::isCast(E->getAltOpcode())) ||
12984 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
12985 "Invalid Shuffle Vector Operand");
12986
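// Alternate nodes (e.g. a bundle mixing add and sub) are emitted as two
// full-width vector operations, one with the main and one with the alternate
// opcode, followed by a shufflevector that selects the correct result per lane.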
12987 Value *LHS = nullptr, *RHS = nullptr;
12988 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
12989 setInsertPointAfterBundle(E);
12990 LHS = vectorizeOperand(E, 0, PostponedPHIs);
12991 if (E->VectorizedValue) {
12992 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12993 return E->VectorizedValue;
12994 }
12995 RHS = vectorizeOperand(E, 1, PostponedPHIs);
12996 } else {
12997 setInsertPointAfterBundle(E);
12998 LHS = vectorizeOperand(E, 0, PostponedPHIs);
12999 }
13000 if (E->VectorizedValue) {
13001 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13002 return E->VectorizedValue;
13003 }
13004 if (LHS && RHS &&
13005 ((Instruction::isBinaryOp(E->getOpcode()) &&
13006 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13007 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
13008 assert((It != MinBWs.end() ||
13009 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13010 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13011 MinBWs.contains(getOperandEntry(E, 0)) ||
13012 MinBWs.contains(getOperandEntry(E, 1))) &&
13013 "Expected item in MinBWs.");
13014 Type *CastTy = VecTy;
13015 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
13016 if (cast<VectorType>(LHS->getType())
13017 ->getElementType()
13018 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
13019 ->getElementType()
13020 ->getIntegerBitWidth())
13021 CastTy = RHS->getType();
13022 else
13023 CastTy = LHS->getType();
13024 }
13025 if (LHS->getType() != CastTy)
13026 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
13027 if (RHS->getType() != CastTy)
13028 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
13029 }
13030
13031 Value *V0, *V1;
13032 if (Instruction::isBinaryOp(E->getOpcode())) {
13033 V0 = Builder.CreateBinOp(
13034 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13035 V1 = Builder.CreateBinOp(
13036 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13037 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13038 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
13039 auto *AltCI = cast<CmpInst>(E->getAltOp());
13040 CmpInst::Predicate AltPred = AltCI->getPredicate();
13041 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
13042 } else {
13043 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13044 unsigned SrcBWSz = DL->getTypeSizeInBits(
13045 cast<VectorType>(LHS->getType())->getElementType());
13046 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13047 if (BWSz <= SrcBWSz) {
13048 if (BWSz < SrcBWSz)
13049 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
13050 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13051 if (auto *I = dyn_cast<Instruction>(LHS))
13052 LHS = propagateMetadata(I, E->Scalars);
13053 E->VectorizedValue = LHS;
13054 ++NumVectorInstructions;
13055 return LHS;
13056 }
13057 }
13058 V0 = Builder.CreateCast(
13059 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
13060 V1 = Builder.CreateCast(
13061 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
13062 }
13063 // Add V0 and V1 to later analysis to try to find and remove matching
13064 // instruction, if any.
13065 for (Value *V : {V0, V1}) {
13066 if (auto *I = dyn_cast<Instruction>(V)) {
13067 GatherShuffleExtractSeq.insert(I);
13068 CSEBlocks.insert(I->getParent());
13069 }
13070 }
13071
13072 // Create shuffle to take alternate operations from the vector.
13073 // Also, gather up main and alt scalar ops to propagate IR flags to
13074 // each vector operation.
13075 ValueList OpScalars, AltScalars;
13076 SmallVector<int> Mask;
13077 E->buildAltOpShuffleMask(
13078 [E, this](Instruction *I) {
13079 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13080 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
13081 *TLI);
13082 },
13083 Mask, &OpScalars, &AltScalars);
13084
13085 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
13086 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
13087 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13088 // Drop nuw flags for abs(sub(commutative), true).
13089 if (auto *I = dyn_cast<Instruction>(Vec);
13090 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
13091 any_of(E->Scalars, [](Value *V) {
13092 auto *IV = cast<Instruction>(V);
13093 return IV->getOpcode() == Instruction::Sub &&
13094 isCommutative(cast<Instruction>(IV));
13095 }))
13096 I->setHasNoUnsignedWrap(/*b=*/false);
13097 };
13098 DropNuwFlag(V0, E->getOpcode());
13099 DropNuwFlag(V1, E->getAltOpcode());
13100
13101 Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
13102 if (auto *I = dyn_cast<Instruction>(V)) {
13103 V = propagateMetadata(I, E->Scalars);
13104 GatherShuffleExtractSeq.insert(I);
13105 CSEBlocks.insert(I->getParent());
13106 }
13107
13108 E->VectorizedValue = V;
13109 ++NumVectorInstructions;
13110
13111 return V;
13112 }
13113 default:
13114 llvm_unreachable("unknown inst");
13115 }
13116 return nullptr;
13117}
13118
13119 Value *BoUpSLP::vectorizeTree() {
13120 ExtraValueToDebugLocsMap ExternallyUsedValues;
13121 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13122 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13123}
13124
13125namespace {
13126/// Data type for handling buildvector sequences with the reused scalars from
13127/// other tree entries.
13128struct ShuffledInsertData {
13129 /// List of insertelements to be replaced by shuffles.
13130 SmallVector<InsertElementInst *> InsertElements;
13131 /// The parent vectors and shuffle mask for the given list of inserts.
13132 MapVector<Value *, SmallVector<int>> ValueMasks;
13133};
13134} // namespace
13135
13136 Value *BoUpSLP::vectorizeTree(
13137 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13138 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13139 Instruction *ReductionRoot) {
13140 // All blocks must be scheduled before any instructions are inserted.
13141 for (auto &BSIter : BlocksSchedules) {
13142 scheduleBlock(BSIter.second.get());
13143 }
13144 // Clear the Entry-to-LastInstruction table. It can be invalidated by scheduling,
13145 // so we need to rebuild it.
13146 EntryToLastInstruction.clear();
13147
13148 if (ReductionRoot)
13149 Builder.SetInsertPoint(ReductionRoot->getParent(),
13150 ReductionRoot->getIterator());
13151 else
13152 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13153
13154 // Postpone emission of PHI operands to avoid cyclic dependency issues.
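// First pass: create vector PHIs with no incoming values (PostponedPHIs);
// the loop below then revisits the PHI entries and fills in their operands
// once all other vector values exist.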
13155 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13156 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13157 if (TE->State == TreeEntry::Vectorize &&
13158 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13159 TE->VectorizedValue)
13160 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
13161 // Run through the list of postponed gathers and emit them, replacing the temp
13162 // emitted allocas with actual vector instructions.
13163 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13164 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13165 for (const TreeEntry *E : PostponedNodes) {
13166 auto *TE = const_cast<TreeEntry *>(E);
13167 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
13168 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13169 TE->UserTreeIndices.front().EdgeIdx)))
13170 // Found gather node which is absolutely the same as one of the
13171 // vectorized nodes. It may happen after reordering.
13172 continue;
13173 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13174 TE->VectorizedValue = nullptr;
13175 auto *UserI =
13176 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13177 // If the user is a PHI node, its vector code has to be inserted right before
13178 // the block terminator. Since the node was delayed, there were some unresolved
13179 // dependencies at the moment the stub instruction was emitted. If any of these
13180 // dependencies turns out to be an operand of another PHI coming from this same
13181 // block, the position of the stub instruction becomes invalid. This is because
13182 // the source vector that is supposed to feed this gather node was inserted at
13183 // the end of the block [after the stub instruction]. So we need to adjust the
13184 // insertion point again, to the end of the block.
13185 if (isa<PHINode>(UserI)) {
13186 // Insert before all users.
13187 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13188 for (User *U : PrevVec->users()) {
13189 if (U == UserI)
13190 continue;
13191 auto *UI = dyn_cast<Instruction>(U);
13192 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
13193 continue;
13194 if (UI->comesBefore(InsertPt))
13195 InsertPt = UI;
13196 }
13197 Builder.SetInsertPoint(InsertPt);
13198 } else {
13199 Builder.SetInsertPoint(PrevVec);
13200 }
13201 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13202 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
13203 if (Vec->getType() != PrevVec->getType()) {
13204 assert(Vec->getType()->isIntOrIntVectorTy() &&
13205 PrevVec->getType()->isIntOrIntVectorTy() &&
13206 "Expected integer vector types only.");
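// The stub was created with a possibly different (minimized) integer type;
// scan the scalars' owning tree entries, their multi-node copies, the gather
// nodes and, as a last resort, the user node to decide whether the correcting
// cast has to be signed.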
13207 std::optional<bool> IsSigned;
13208 for (Value *V : TE->Scalars) {
13209 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13210 auto It = MinBWs.find(BaseTE);
13211 if (It != MinBWs.end()) {
13212 IsSigned = IsSigned.value_or(false) || It->second.second;
13213 if (*IsSigned)
13214 break;
13215 }
13216 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
13217 auto It = MinBWs.find(MNTE);
13218 if (It != MinBWs.end()) {
13219 IsSigned = IsSigned.value_or(false) || It->second.second;
13220 if (*IsSigned)
13221 break;
13222 }
13223 }
13224 if (IsSigned.value_or(false))
13225 break;
13226 // Scan through gather nodes.
13227 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13228 auto It = MinBWs.find(BVE);
13229 if (It != MinBWs.end()) {
13230 IsSigned = IsSigned.value_or(false) || It->second.second;
13231 if (*IsSigned)
13232 break;
13233 }
13234 }
13235 if (IsSigned.value_or(false))
13236 break;
13237 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
13238 IsSigned =
13239 IsSigned.value_or(false) ||
13240 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
13241 continue;
13242 }
13243 if (IsSigned.value_or(false))
13244 break;
13245 }
13246 }
13247 if (IsSigned.value_or(false)) {
13248 // Final attempt - check user node.
13249 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
13250 if (It != MinBWs.end())
13251 IsSigned = It->second.second;
13252 }
13253 assert(IsSigned &&
13254 "Expected user node or perfect diamond match in MinBWs.");
13255 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
13256 }
13257 PrevVec->replaceAllUsesWith(Vec);
13258 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
13259 // Replace the stub vector node, if it was already used for one of the
13260 // buildvector nodes.
13261 auto It = PostponedValues.find(PrevVec);
13262 if (It != PostponedValues.end()) {
13263 for (TreeEntry *VTE : It->getSecond())
13264 VTE->VectorizedValue = Vec;
13265 }
13266 eraseInstruction(PrevVec);
13267 }
13268
13269 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13270 << " values .\n");
13271
13272 SmallVector<ShuffledInsertData> ShuffledInserts;
13273 // Maps vector instruction to original insertelement instruction
13274 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13275 // Maps extract Scalar to the corresponding extractelement instruction in the
13276 // basic block. Only one extractelement per block should be emitted.
13277 DenseMap<Value *,
13278 SmallDenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13279 ScalarToEEs;
13280 SmallDenseSet<Value *, 4> UsedInserts;
13281 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13282 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13283 // Extract all of the elements with the external uses.
13284 for (const auto &ExternalUse : ExternalUses) {
13285 Value *Scalar = ExternalUse.Scalar;
13286 llvm::User *User = ExternalUse.User;
13287
13288 // Skip users that we already RAUW. This happens when one instruction
13289 // has multiple uses of the same value.
13290 if (User && !is_contained(Scalar->users(), User))
13291 continue;
13292 TreeEntry *E = getTreeEntry(Scalar);
13293 assert(E && "Invalid scalar");
13294 assert(E->State != TreeEntry::NeedToGather &&
13295 "Extracting from a gather list");
13296 // Non-instruction pointers are not deleted, just skip them.
13297 if (E->getOpcode() == Instruction::GetElementPtr &&
13298 !isa<GetElementPtrInst>(Scalar))
13299 continue;
13300
13301 Value *Vec = E->VectorizedValue;
13302 assert(Vec && "Can't find vectorizable value");
13303
13304 Value *Lane = Builder.getInt32(ExternalUse.Lane);
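// ExtractAndExtendIfNeeded creates (or reuses, at most one per basic block) an
// extractelement for the scalar's lane of the vectorized value and, if the
// tree entry was bitwidth-minimized, casts the extracted element back to the
// original scalar type.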
13305 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13306 if (Scalar->getType() != Vec->getType()) {
13307 Value *Ex = nullptr;
13308 Value *ExV = nullptr;
13309 auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
13310 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
13311 auto It = ScalarToEEs.find(Scalar);
13312 if (It != ScalarToEEs.end()) {
13313 // No need to emit many extracts, just move the only one in the
13314 // current block.
13315 auto EEIt = It->second.find(Builder.GetInsertBlock());
13316 if (EEIt != It->second.end()) {
13317 Instruction *I = EEIt->second.first;
13318 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13319 Builder.GetInsertPoint()->comesBefore(I)) {
13320 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
13321 Builder.GetInsertPoint());
13322 if (auto *CI = EEIt->second.second)
13323 CI->moveAfter(I);
13324 }
13325 Ex = I;
13326 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13327 }
13328 }
13329 if (!Ex) {
13330 // "Reuse" the existing extract to improve final codegen.
13331 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13332 Value *V = ES->getVectorOperand();
13333 if (const TreeEntry *ETE = getTreeEntry(V))
13334 V = ETE->VectorizedValue;
13335 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
13336 } else if (ReplaceGEP) {
13337 // Leave the GEPs as is, they are free in most cases and better to
13338 // keep them as GEPs.
13339 auto *CloneGEP = GEP->clone();
13340 if (isa<Instruction>(Vec))
13341 CloneGEP->insertBefore(*Builder.GetInsertBlock(),
13342 Builder.GetInsertPoint());
13343 else
13344 CloneGEP->insertBefore(GEP);
13345 if (GEP->hasName())
13346 CloneGEP->takeName(GEP);
13347 Ex = CloneGEP;
13348 } else {
13349 Ex = Builder.CreateExtractElement(Vec, Lane);
13350 }
13351 // If necessary, sign-extend or zero-extend ScalarRoot
13352 // to the larger type.
13353 ExV = Ex;
13354 if (Scalar->getType() != Ex->getType())
13355 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
13356 MinBWs.find(E)->second.second);
13357 if (auto *I = dyn_cast<Instruction>(Ex))
13358 ScalarToEEs[Scalar].try_emplace(
13359 Builder.GetInsertBlock(),
13360 std::make_pair(I, cast<Instruction>(ExV)));
13361 }
13362 // The then branch of the previous if may produce constants, since 0
13363 // operand might be a constant.
13364 if (auto *ExI = dyn_cast<Instruction>(Ex)) {
13365 GatherShuffleExtractSeq.insert(ExI);
13366 CSEBlocks.insert(ExI->getParent());
13367 }
13368 return ExV;
13369 }
13370 assert(isa<FixedVectorType>(Scalar->getType()) &&
13371 isa<InsertElementInst>(Scalar) &&
13372 "In-tree scalar of vector type is not insertelement?");
13373 auto *IE = cast<InsertElementInst>(Scalar);
13374 VectorToInsertElement.try_emplace(Vec, IE);
13375 return Vec;
13376 };
13377 // If User == nullptr, the Scalar remains as scalar in vectorized
13378 // instructions or is used as extra arg. Generate ExtractElement instruction
13379 // and update the record for this scalar in ExternallyUsedValues.
13380 if (!User) {
13381 if (!ScalarsWithNullptrUser.insert(Scalar).second)
13382 continue;
13383 assert((ExternallyUsedValues.count(Scalar) ||
13384 any_of(Scalar->users(),
13385 [&](llvm::User *U) {
13386 if (ExternalUsesAsGEPs.contains(U))
13387 return true;
13388 TreeEntry *UseEntry = getTreeEntry(U);
13389 return UseEntry &&
13390 (UseEntry->State == TreeEntry::Vectorize ||
13391 UseEntry->State ==
13392 TreeEntry::StridedVectorize) &&
13393 (E->State == TreeEntry::Vectorize ||
13394 E->State == TreeEntry::StridedVectorize) &&
13395 doesInTreeUserNeedToExtract(
13396 Scalar,
13397 cast<Instruction>(UseEntry->Scalars.front()),
13398 TLI);
13399 })) &&
13400 "Scalar with nullptr User must be registered in "
13401 "ExternallyUsedValues map or remain as scalar in vectorized "
13402 "instructions");
13403 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13404 if (auto *PHI = dyn_cast<PHINode>(VecI))
13405 Builder.SetInsertPoint(PHI->getParent(),
13406 PHI->getParent()->getFirstNonPHIIt());
13407 else
13408 Builder.SetInsertPoint(VecI->getParent(),
13409 std::next(VecI->getIterator()));
13410 } else {
13411 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13412 }
13413 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13414 // Required to update internally referenced instructions.
13415 Scalar->replaceAllUsesWith(NewInst);
13416 ReplacedExternals.emplace_back(Scalar, NewInst);
13417 continue;
13418 }
13419
13420 if (auto *VU = dyn_cast<InsertElementInst>(User)) {
13421 // Skip if the scalar is another vector op or Vec is not an instruction.
13422 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13423 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
13424 if (!UsedInserts.insert(VU).second)
13425 continue;
13426 // Need to use original vector, if the root is truncated.
13427 auto BWIt = MinBWs.find(E);
13428 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13429 auto *ScalarTy = FTy->getElementType();
13430 auto Key = std::make_pair(Vec, ScalarTy);
13431 auto VecIt = VectorCasts.find(Key);
13432 if (VecIt == VectorCasts.end()) {
13433 IRBuilderBase::InsertPointGuard Guard(Builder);
13434 if (auto *IVec = dyn_cast<Instruction>(Vec))
13435 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
13436 Vec = Builder.CreateIntCast(
13437 Vec,
13438 FixedVectorType::get(
13439 ScalarTy,
13440 cast<FixedVectorType>(Vec->getType())->getNumElements()),
13441 BWIt->second.second);
13442 VectorCasts.try_emplace(Key, Vec);
13443 } else {
13444 Vec = VecIt->second;
13445 }
13446 }
13447
13448 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
13449 if (InsertIdx) {
13450 auto *It =
13451 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
13452 // Checks if 2 insertelements are from the same buildvector.
13453 InsertElementInst *VecInsert = Data.InsertElements.front();
13454 return areTwoInsertFromSameBuildVector(
13455 VU, VecInsert,
13456 [](InsertElementInst *II) { return II->getOperand(0); });
13457 });
13458 unsigned Idx = *InsertIdx;
13459 if (It == ShuffledInserts.end()) {
13460 (void)ShuffledInserts.emplace_back();
13461 It = std::next(ShuffledInserts.begin(),
13462 ShuffledInserts.size() - 1);
13463 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13464 if (Mask.empty())
13465 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13466 // Find the insertvector, vectorized in tree, if any.
13467 Value *Base = VU;
13468 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
13469 if (IEBase != User &&
13470 (!IEBase->hasOneUse() ||
13471 getInsertIndex(IEBase).value_or(Idx) == Idx))
13472 break;
13473 // Build the mask for the vectorized insertelement instructions.
13474 if (const TreeEntry *E = getTreeEntry(IEBase)) {
13475 do {
13476 IEBase = cast<InsertElementInst>(Base);
13477 int IEIdx = *getInsertIndex(IEBase);
13478 assert(Mask[IEIdx] == PoisonMaskElem &&
13479 "InsertElementInstruction used already.");
13480 Mask[IEIdx] = IEIdx;
13481 Base = IEBase->getOperand(0);
13482 } while (E == getTreeEntry(Base));
13483 break;
13484 }
13485 Base = cast<InsertElementInst>(Base)->getOperand(0);
13486 // After vectorization the def-use chain has changed, so we need
13487 // to look through the original insertelement instructions if they
13488 // got replaced by vector instructions.
13489 auto It = VectorToInsertElement.find(Base);
13490 if (It != VectorToInsertElement.end())
13491 Base = It->second;
13492 }
13493 }
13494 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13495 if (Mask.empty())
13496 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13497 Mask[Idx] = ExternalUse.Lane;
13498 It->InsertElements.push_back(cast<InsertElementInst>(User));
13499 continue;
13500 }
13501 }
13502 }
13503 }
13504
13505 // Generate extracts for out-of-tree users.
13506 // Find the insertion point for the extractelement lane.
13507 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13508 if (PHINode *PH = dyn_cast<PHINode>(User)) {
13509 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13510 if (PH->getIncomingValue(I) == Scalar) {
13511 Instruction *IncomingTerminator =
13512 PH->getIncomingBlock(I)->getTerminator();
13513 if (isa<CatchSwitchInst>(IncomingTerminator)) {
13514 Builder.SetInsertPoint(VecI->getParent(),
13515 std::next(VecI->getIterator()));
13516 } else {
13517 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
13518 }
13519 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13520 PH->setOperand(I, NewInst);
13521 }
13522 }
13523 } else {
13524 Builder.SetInsertPoint(cast<Instruction>(User));
13525 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13526 User->replaceUsesOfWith(Scalar, NewInst);
13527 }
13528 } else {
13529 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13530 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13531 User->replaceUsesOfWith(Scalar, NewInst);
13532 }
13533
13534 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
13535 }
13536
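 // Builds a shuffle of V1/V2 for the combined Mask: indices below VF select
 // from V1, the remaining indices select from V2 (rebased by subtracting VF).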
13537 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
13538 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
13539 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
13540 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
13541 for (int I = 0, E = Mask.size(); I < E; ++I) {
13542 if (Mask[I] < VF)
13543 CombinedMask1[I] = Mask[I];
13544 else
13545 CombinedMask2[I] = Mask[I] - VF;
13546 }
13547 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
13548 ShuffleBuilder.add(V1, CombinedMask1);
13549 if (V2)
13550 ShuffleBuilder.add(V2, CombinedMask2);
13551 return ShuffleBuilder.finalize(std::nullopt);
13552 };
13553
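 // Resizes Vec to the vector factor implied by Mask, if they differ. The
 // returned flag is true when Mask has already been applied to the result.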
13554 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
13555 bool ForSingleMask) {
13556 unsigned VF = Mask.size();
13557 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
13558 if (VF != VecVF) {
13559 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
13560 Vec = CreateShuffle(Vec, nullptr, Mask);
13561 return std::make_pair(Vec, true);
13562 }
13563 if (!ForSingleMask) {
13564 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
13565 for (unsigned I = 0; I < VF; ++I) {
13566 if (Mask[I] != PoisonMaskElem)
13567 ResizeMask[Mask[I]] = Mask[I];
13568 }
13569 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
13570 }
13571 }
13572
13573 return std::make_pair(Vec, false);
13574 };
13575 // Perform shuffling of the vectorized tree entries for better handling of
13576 // external extracts.
13577 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
13578 // Find the first and the last instruction in the list of insertelements.
13579 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
13580 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
13581 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
13582 Builder.SetInsertPoint(LastInsert);
13583 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
13584 Value *NewInst = performExtractsShuffleAction<Value>(
13585 MutableArrayRef(Vector.data(), Vector.size()),
13586 FirstInsert->getOperand(0),
13587 [](Value *Vec) {
13588 return cast<VectorType>(Vec->getType())
13589 ->getElementCount()
13590 .getKnownMinValue();
13591 },
13592 ResizeToVF,
13593 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
13594 ArrayRef<Value *> Vals) {
13595 assert((Vals.size() == 1 || Vals.size() == 2) &&
13596 "Expected exactly 1 or 2 input values.");
13597 if (Vals.size() == 1) {
13598 // Do not create shuffle if the mask is a simple identity
13599 // non-resizing mask.
13600 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
13601 ->getNumElements() ||
13602 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
13603 return CreateShuffle(Vals.front(), nullptr, Mask);
13604 return Vals.front();
13605 }
13606 return CreateShuffle(Vals.front() ? Vals.front()
13607 : FirstInsert->getOperand(0),
13608 Vals.back(), Mask);
13609 });
13610 auto It = ShuffledInserts[I].InsertElements.rbegin();
13611 // Rebuild buildvector chain.
13612 InsertElementInst *II = nullptr;
13613 if (It != ShuffledInserts[I].InsertElements.rend())
13614 II = *It;
13615 SmallVector<Instruction *> Inserts;
13616 while (It != ShuffledInserts[I].InsertElements.rend()) {
13617 assert(II && "Must be an insertelement instruction.");
13618 if (*It == II)
13619 ++It;
13620 else
13621 Inserts.push_back(cast<Instruction>(II));
13622 II = dyn_cast<InsertElementInst>(II->getOperand(0));
13623 }
13624 for (Instruction *II : reverse(Inserts)) {
13625 II->replaceUsesOfWith(II->getOperand(0), NewInst);
13626 if (auto *NewI = dyn_cast<Instruction>(NewInst))
13627 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
13628 II->moveAfter(NewI);
13629 NewInst = II;
13630 }
13631 LastInsert->replaceAllUsesWith(NewInst);
13632 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
13633 IE->replaceUsesOfWith(IE->getOperand(0),
13634 PoisonValue::get(IE->getOperand(0)->getType()));
13635 IE->replaceUsesOfWith(IE->getOperand(1),
13636 PoisonValue::get(IE->getOperand(1)->getType()));
13637 eraseInstruction(IE);
13638 }
13639 CSEBlocks.insert(LastInsert->getParent());
13640 }
13641
13642 SmallVector<Instruction *> RemovedInsts;
13643 // For each vectorized value:
13644 for (auto &TEPtr : VectorizableTree) {
13645 TreeEntry *Entry = TEPtr.get();
13646
13647 // No need to handle users of gathered values.
13648 if (Entry->State == TreeEntry::NeedToGather)
13649 continue;
13650
13651 assert(Entry->VectorizedValue && "Can't find vectorizable value");
13652
13653 // For each lane:
13654 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
13655 Value *Scalar = Entry->Scalars[Lane];
13656
13657 if (Entry->getOpcode() == Instruction::GetElementPtr &&
13658 !isa<GetElementPtrInst>(Scalar))
13659 continue;
13660#ifndef NDEBUG
13661 Type *Ty = Scalar->getType();
13662 if (!Ty->isVoidTy()) {
13663 for (User *U : Scalar->users()) {
13664 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
13665
13666 // It is legal to delete users in the ignorelist.
13667 assert((getTreeEntry(U) ||
13668 (UserIgnoreList && UserIgnoreList->contains(U)) ||
13669 (isa_and_nonnull<Instruction>(U) &&
13670 isDeleted(cast<Instruction>(U)))) &&
13671 "Deleting out-of-tree value");
13672 }
13673 }
13674#endif
13675 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
13676 eraseInstruction(cast<Instruction>(Scalar));
13677 // Retain to-be-deleted instructions for some debug-info
13678 // bookkeeping. NOTE: eraseInstruction only marks the instruction for
13679 // deletion - instructions are not deleted until later.
13680 RemovedInsts.push_back(cast<Instruction>(Scalar));
13681 }
13682 }
13683
13684 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
13685 // new vector instruction.
13686 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
13687 V->mergeDIAssignID(RemovedInsts);
13688
13689 Builder.ClearInsertionPoint();
13690 InstrElementSize.clear();
13691
13692 const TreeEntry &RootTE = *VectorizableTree.front().get();
13693 Value *Vec = RootTE.VectorizedValue;
13694 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
13695 It != MinBWs.end() &&
13696 ReductionBitWidth != It->second.first) {
13697 IRBuilder<>::InsertPointGuard Guard(Builder);
13698 Builder.SetInsertPoint(ReductionRoot->getParent(),
13699 ReductionRoot->getIterator());
13700 Vec = Builder.CreateIntCast(
13701 Vec,
13702 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
13703 cast<VectorType>(Vec->getType())->getElementCount()),
13704 It->second.second);
13705 }
13706 return Vec;
13707}
13708
13709void BoUpSLP::optimizeGatherSequence() {
13710 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
13711 << " gather sequence instructions.\n");
13712 // LICM InsertElementInst sequences.
13713 for (Instruction *I : GatherShuffleExtractSeq) {
13714 if (isDeleted(I))
13715 continue;
13716
13717 // Check if this block is inside a loop.
13718 Loop *L = LI->getLoopFor(I->getParent());
13719 if (!L)
13720 continue;
13721
13722 // Check if it has a preheader.
13723 BasicBlock *PreHeader = L->getLoopPreheader();
13724 if (!PreHeader)
13725 continue;
13726
13727 // If the vector or the element that we insert into it are
13728 // instructions that are defined in this basic block then we can't
13729 // hoist this instruction.
13730 if (any_of(I->operands(), [L](Value *V) {
13731 auto *OpI = dyn_cast<Instruction>(V);
13732 return OpI && L->contains(OpI);
13733 }))
13734 continue;
13735
13736 // We can hoist this instruction. Move it to the pre-header.
13737 I->moveBefore(PreHeader->getTerminator());
13738 CSEBlocks.insert(PreHeader);
13739 }
13740
13741 // Make a list of all reachable blocks in our CSE queue.
13742 SmallVector<const DomTreeNode *, 8> CSEWorkList;
13743 CSEWorkList.reserve(CSEBlocks.size());
13744 for (BasicBlock *BB : CSEBlocks)
13745 if (DomTreeNode *N = DT->getNode(BB)) {
13746 assert(DT->isReachableFromEntry(N));
13747 CSEWorkList.push_back(N);
13748 }
13749
13750 // Sort blocks by domination. This ensures we visit a block after all blocks
13751 // dominating it are visited.
13752 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
13753 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
13754 "Different nodes should have different DFS numbers");
13755 return A->getDFSNumIn() < B->getDFSNumIn();
13756 });
13757
13758 // Less defined shuffles can be replaced by the more defined copies.
13759 // Between two shuffles one is less defined if it has the same vector operands
13760 // and its mask indices are the same as in the first one or undefs. E.g.
13761 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
13762 // poison, <0, 0, 0, 0>.
13763 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
13764 SmallVectorImpl<int> &NewMask) {
13765 if (I1->getType() != I2->getType())
13766 return false;
13767 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
13768 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
13769 if (!SI1 || !SI2)
13770 return I1->isIdenticalTo(I2);
13771 if (SI1->isIdenticalTo(SI2))
13772 return true;
13773 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
13774 if (SI1->getOperand(I) != SI2->getOperand(I))
13775 return false;
13776 // Check if the second instruction is more defined than the first one.
13777 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
13778 ArrayRef<int> SM1 = SI1->getShuffleMask();
13779 // Count trailing undefs in the mask to check the final number of used
13780 // registers.
13781 unsigned LastUndefsCnt = 0;
13782 for (int I = 0, E = NewMask.size(); I < E; ++I) {
13783 if (SM1[I] == PoisonMaskElem)
13784 ++LastUndefsCnt;
13785 else
13786 LastUndefsCnt = 0;
13787 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
13788 NewMask[I] != SM1[I])
13789 return false;
13790 if (NewMask[I] == PoisonMaskElem)
13791 NewMask[I] = SM1[I];
13792 }
13793 // Check if the last undefs actually change the final number of used vector
13794 // registers.
13795 return SM1.size() - LastUndefsCnt > 1 &&
13796 TTI->getNumberOfParts(SI1->getType()) ==
13797 TTI->getNumberOfParts(
13798 FixedVectorType::get(SI1->getType()->getElementType(),
13799 SM1.size() - LastUndefsCnt));
13800 };
13801 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
13802 // instructions. TODO: We can further optimize this scan if we split the
13803 // instructions into different buckets based on the insert lane.
13804 SmallVector<Instruction *, 16> Visited;
13805 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
13806 assert(*I &&
13807 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
13808 "Worklist not sorted properly!");
13809 BasicBlock *BB = (*I)->getBlock();
13810 // For all instructions in blocks containing gather sequences:
13811 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
13812 if (isDeleted(&In))
13813 continue;
13814 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
13815 !GatherShuffleExtractSeq.contains(&In))
13816 continue;
13817
13818 // Check if we can replace this instruction with any of the
13819 // visited instructions.
13820 bool Replaced = false;
13821 for (Instruction *&V : Visited) {
13822 SmallVector<int> NewMask;
13823 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
13824 DT->dominates(V->getParent(), In.getParent())) {
13825 In.replaceAllUsesWith(V);
13826 eraseInstruction(&In);
13827 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
13828 if (!NewMask.empty())
13829 SI->setShuffleMask(NewMask);
13830 Replaced = true;
13831 break;
13832 }
13833 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
13834 GatherShuffleExtractSeq.contains(V) &&
13835 IsIdenticalOrLessDefined(V, &In, NewMask) &&
13836 DT->dominates(In.getParent(), V->getParent())) {
13837 In.moveAfter(V);
13838 V->replaceAllUsesWith(&In);
13839 eraseInstruction(V);
13840 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
13841 if (!NewMask.empty())
13842 SI->setShuffleMask(NewMask);
13843 V = &In;
13844 Replaced = true;
13845 break;
13846 }
13847 }
13848 if (!Replaced) {
13849 assert(!is_contained(Visited, &In));
13850 Visited.push_back(&In);
13851 }
13852 }
13853 }
13854 CSEBlocks.clear();
13855 GatherShuffleExtractSeq.clear();
13856}
13857
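// Chains the ScheduleData of the given scalars into a single bundle; the first
// member becomes the bundle's scheduling entity (FirstInBundle of all members).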
13858BoUpSLP::ScheduleData *
13859BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
13860 ScheduleData *Bundle = nullptr;
13861 ScheduleData *PrevInBundle = nullptr;
13862 for (Value *V : VL) {
13863 if (doesNotNeedToBeScheduled(V))
13864 continue;
13865 ScheduleData *BundleMember = getScheduleData(V);
13866 assert(BundleMember &&
13867 "no ScheduleData for bundle member "
13868 "(maybe not in same basic block)");
13869 assert(BundleMember->isSchedulingEntity() &&
13870 "bundle member already part of other bundle");
13871 if (PrevInBundle) {
13872 PrevInBundle->NextInBundle = BundleMember;
13873 } else {
13874 Bundle = BundleMember;
13875 }
13876
13877 // Group the instructions into a bundle.
13878 BundleMember->FirstInBundle = Bundle;
13879 PrevInBundle = BundleMember;
13880 }
13881 assert(Bundle && "Failed to find schedule bundle");
13882 return Bundle;
13883}
13884
13885// Groups the instructions into a bundle (which is then a single scheduling entity)
13886// and schedules instructions until the bundle gets ready.
13887std::optional<BoUpSLP::ScheduleData *>
13888BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
13889 const InstructionsState &S) {
13890 // No need to schedule PHIs, insertelement, extractelement and extractvalue
13891 // instructions.
13892 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
13893 doesNotNeedToSchedule(VL))
13894 return nullptr;
13895
13896 // Initialize the instruction bundle.
13897 Instruction *OldScheduleEnd = ScheduleEnd;
13898 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
13899
13900 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
13901 ScheduleData *Bundle) {
13902 // The scheduling region got new instructions at the lower end (or it is a
13903 // new region for the first bundle). This makes it necessary to
13904 // recalculate all dependencies.
13905 // It is seldom that this needs to be done a second time after adding the
13906 // initial bundle to the region.
13907 if (ScheduleEnd != OldScheduleEnd) {
13908 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
13909 doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
13910 ReSchedule = true;
13911 }
13912 if (Bundle) {
13913 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
13914 << " in block " << BB->getName() << "\n");
13915 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
13916 }
13917
13918 if (ReSchedule) {
13919 resetSchedule();
13920 initialFillReadyList(ReadyInsts);
13921 }
13922
13923 // Now try to schedule the new bundle or (if no bundle) just calculate
13924 // dependencies. As soon as the bundle is "ready" it means that there are no
13925 // cyclic dependencies and we can schedule it. Note that it's important that we
13926 // don't "schedule" the bundle yet (see cancelScheduling).
13927 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
13928 !ReadyInsts.empty()) {
13929 ScheduleData *Picked = ReadyInsts.pop_back_val();
13930 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
13931 "must be ready to schedule");
13932 schedule(Picked, ReadyInsts);
13933 }
13934 };
13935
13936 // Make sure that the scheduling region contains all
13937 // instructions of the bundle.
13938 for (Value *V : VL) {
13939 if (doesNotNeedToBeScheduled(V))
13940 continue;
13941 if (!extendSchedulingRegion(V, S)) {
13942 // The scheduling region may have got new instructions at the lower end
13943 // (or it is a new region for the first bundle), which makes it necessary
13944 // to recalculate all dependencies before giving up.
13945 // Otherwise the compiler may crash trying to incorrectly calculate
13946 // dependencies and emit instructions in the wrong order at the actual
13947 // scheduling.
13948 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
13949 return std::nullopt;
13950 }
13951 }
13952
13953 bool ReSchedule = false;
13954 for (Value *V : VL) {
13955 if (doesNotNeedToBeScheduled(V))
13956 continue;
13957 ScheduleData *BundleMember = getScheduleData(V);
13958 assert(BundleMember &&
13959 "no ScheduleData for bundle member (maybe not in same basic block)");
13960
13961 // Make sure we don't leave the pieces of the bundle in the ready list when
13962 // the whole bundle might not be ready.
13963 ReadyInsts.remove(BundleMember);
13964
13965 if (!BundleMember->IsScheduled)
13966 continue;
13967 // A bundle member was scheduled as a single instruction before and now
13968 // needs to be scheduled as part of the bundle. We just get rid of the
13969 // existing schedule.
13970 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
13971 << " was already scheduled\n");
13972 ReSchedule = true;
13973 }
13974
13975 auto *Bundle = buildBundle(VL);
13976 TryScheduleBundleImpl(ReSchedule, Bundle);
13977 if (!Bundle->isReady()) {
13978 cancelScheduling(VL, S.OpValue);
13979 return std::nullopt;
13980 }
13981 return Bundle;
13982}
13983
13984void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
13985 Value *OpValue) {
13986 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
13987 doesNotNeedToSchedule(VL))
13988 return;
13989
13990 if (doesNotNeedToBeScheduled(OpValue))
13991 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
13992 ScheduleData *Bundle = getScheduleData(OpValue);
13993 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
13994 assert(!Bundle->IsScheduled &&
13995 "Can't cancel bundle which is already scheduled");
13996 assert(Bundle->isSchedulingEntity() &&
13997 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
13998 "tried to unbundle something which is not a bundle");
13999
14000 // Remove the bundle from the ready list.
14001 if (Bundle->isReady())
14002 ReadyInsts.remove(Bundle);
14003
14004 // Un-bundle: make single instructions out of the bundle.
14005 ScheduleData *BundleMember = Bundle;
14006 while (BundleMember) {
14007 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
14008 BundleMember->FirstInBundle = BundleMember;
14009 ScheduleData *Next = BundleMember->NextInBundle;
14010 BundleMember->NextInBundle = nullptr;
14011 BundleMember->TE = nullptr;
14012 if (BundleMember->unscheduledDepsInBundle() == 0) {
14013 ReadyInsts.insert(BundleMember);
14014 }
14015 BundleMember = Next;
14016 }
14017}
14018
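// ScheduleData objects are allocated in fixed-size chunks to amortize
// allocation overhead; this returns the next free slot, growing if needed.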
14019BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14020 // Allocate a new ScheduleData for the instruction.
14021 if (ChunkPos >= ChunkSize) {
14022 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14023 ChunkPos = 0;
14024 }
14025 return &(ScheduleDataChunks.back()[ChunkPos++]);
14026}
14027
14028bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14029 const InstructionsState &S) {
14030 if (getScheduleData(V, isOneOf(S, V)))
14031 return true;
14032 Instruction *I = dyn_cast<Instruction>(V);
14033 assert(I && "bundle member must be an instruction");
14034 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14035 !doesNotNeedToBeScheduled(I) &&
14036 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14037 "be scheduled");
14038 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14039 ScheduleData *ISD = getScheduleData(I);
14040 if (!ISD)
14041 return false;
14042 assert(isInSchedulingRegion(ISD) &&
14043 "ScheduleData not in scheduling region");
14044 ScheduleData *SD = allocateScheduleDataChunks();
14045 SD->Inst = I;
14046 SD->init(SchedulingRegionID, S.OpValue);
14047 ExtraScheduleDataMap[I][S.OpValue] = SD;
14048 return true;
14049 };
14050 if (CheckScheduleForI(I))
14051 return true;
14052 if (!ScheduleStart) {
14053 // It's the first instruction in the new region.
14054 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
14055 ScheduleStart = I;
14056 ScheduleEnd = I->getNextNode();
14057 if (isOneOf(S, I) != I)
14058 CheckScheduleForI(I);
14059 assert(ScheduleEnd && "tried to vectorize a terminator?");
14060 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
14061 return true;
14062 }
14063 // Search up and down at the same time, because we don't know if the new
14064 // instruction is above or below the existing scheduling region.
14065 // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not
14066 // counted against the budget. Otherwise debug info could affect codegen.
14067 BasicBlock::reverse_iterator UpIter =
14068 ++ScheduleStart->getIterator().getReverse();
14069 BasicBlock::reverse_iterator UpperEnd = BB->rend();
14070 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14071 BasicBlock::iterator LowerEnd = BB->end();
14072 auto IsAssumeLikeIntr = [](const Instruction &I) {
14073 if (auto *II = dyn_cast<IntrinsicInst>(&I))
14074 return II->isAssumeLikeIntrinsic();
14075 return false;
14076 };
14077 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14078 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14079 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14080 &*DownIter != I) {
14081 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14082 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
14083 return false;
14084 }
14085
14086 ++UpIter;
14087 ++DownIter;
14088
14089 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14090 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14091 }
14092 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14093 assert(I->getParent() == ScheduleStart->getParent() &&
14094 "Instruction is in wrong basic block.");
14095 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
14096 ScheduleStart = I;
14097 if (isOneOf(S, I) != I)
14098 CheckScheduleForI(I);
14099 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
14100 << "\n");
14101 return true;
14102 }
14103 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14104 "Expected to reach top of the basic block or instruction down the "
14105 "lower end.");
14106 assert(I->getParent() == ScheduleEnd->getParent() &&
14107 "Instruction is in wrong basic block.");
14108 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
14109 nullptr);
14110 ScheduleEnd = I->getNextNode();
14111 if (isOneOf(S, I) != I)
14112 CheckScheduleForI(I);
14113 assert(ScheduleEnd && "tried to vectorize a terminator?");
14114 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14115 return true;
14116}
14117
14118void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14119 Instruction *ToI,
14120 ScheduleData *PrevLoadStore,
14121 ScheduleData *NextLoadStore) {
14122 ScheduleData *CurrentLoadStore = PrevLoadStore;
14123 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14124 // No need to allocate data for non-schedulable instructions.
14125 if (doesNotNeedToBeScheduled(I))
14126 continue;
14127 ScheduleData *SD = ScheduleDataMap.lookup(I);
14128 if (!SD) {
14129 SD = allocateScheduleDataChunks();
14130 ScheduleDataMap[I] = SD;
14131 SD->Inst = I;
14132 }
14133 assert(!isInSchedulingRegion(SD) &&
14134 "new ScheduleData already in scheduling region");
14135 SD->init(SchedulingRegionID, I);
14136
14137 if (I->mayReadOrWriteMemory() &&
14138 (!isa<IntrinsicInst>(I) ||
14139 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14140 cast<IntrinsicInst>(I)->getIntrinsicID() !=
14141 Intrinsic::pseudoprobe))) {
14142 // Update the linked list of memory accessing instructions.
14143 if (CurrentLoadStore) {
14144 CurrentLoadStore->NextLoadStore = SD;
14145 } else {
14146 FirstLoadStoreInRegion = SD;
14147 }
14148 CurrentLoadStore = SD;
14149 }
14150
14151 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14152 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14153 RegionHasStackSave = true;
14154 }
14155 if (NextLoadStore) {
14156 if (CurrentLoadStore)
14157 CurrentLoadStore->NextLoadStore = NextLoadStore;
14158 } else {
14159 LastLoadStoreInRegion = CurrentLoadStore;
14160 }
14161}
14162
14163void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14164 bool InsertInReadyList,
14165 BoUpSLP *SLP) {
14166 assert(SD->isSchedulingEntity());
14167
14168 SmallVector<ScheduleData *, 10> WorkList;
14169 WorkList.push_back(SD);
14170
14171 while (!WorkList.empty()) {
14172 ScheduleData *SD = WorkList.pop_back_val();
14173 for (ScheduleData *BundleMember = SD; BundleMember;
14174 BundleMember = BundleMember->NextInBundle) {
14175 assert(isInSchedulingRegion(BundleMember));
14176 if (BundleMember->hasValidDependencies())
14177 continue;
14178
14179 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14180 << "\n");
14181 BundleMember->Dependencies = 0;
14182 BundleMember->resetUnscheduledDeps();
14183
14184 // Handle def-use chain dependencies.
14185 if (BundleMember->OpValue != BundleMember->Inst) {
14186 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14187 BundleMember->Dependencies++;
14188 ScheduleData *DestBundle = UseSD->FirstInBundle;
14189 if (!DestBundle->IsScheduled)
14190 BundleMember->incrementUnscheduledDeps(1);
14191 if (!DestBundle->hasValidDependencies())
14192 WorkList.push_back(DestBundle);
14193 }
14194 } else {
14195 for (User *U : BundleMember->Inst->users()) {
14196 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14197 BundleMember->Dependencies++;
14198 ScheduleData *DestBundle = UseSD->FirstInBundle;
14199 if (!DestBundle->IsScheduled)
14200 BundleMember->incrementUnscheduledDeps(1);
14201 if (!DestBundle->hasValidDependencies())
14202 WorkList.push_back(DestBundle);
14203 }
14204 }
14205 }
14206
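      // Adds a scheduling dependency between BundleMember and I so that I is
      // not reordered above BundleMember->Inst when the block is scheduled.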
14207 auto MakeControlDependent = [&](Instruction *I) {
14208 auto *DepDest = getScheduleData(I);
14209 assert(DepDest && "must be in schedule window");
14210 DepDest->ControlDependencies.push_back(BundleMember);
14211 BundleMember->Dependencies++;
14212 ScheduleData *DestBundle = DepDest->FirstInBundle;
14213 if (!DestBundle->IsScheduled)
14214 BundleMember->incrementUnscheduledDeps(1);
14215 if (!DestBundle->hasValidDependencies())
14216 WorkList.push_back(DestBundle);
14217 };
14218
14219 // Any instruction which isn't safe to speculate at the beginning of the
14220 // block is control dependent on any early exit or non-willreturn call
14221 // which precedes it.
14222 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
14223 for (Instruction *I = BundleMember->Inst->getNextNode();
14224 I != ScheduleEnd; I = I->getNextNode()) {
14225 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
14226 continue;
14227
14228 // Add the dependency
14229 MakeControlDependent(I);
14230
14231 if (!isGuaranteedToTransferExecutionToSuccessor(I))
14232 // Everything past here must be control dependent on I.
14233 break;
14234 }
14235 }
14236
14237 if (RegionHasStackSave) {
14238 // If we have an inalloca alloca instruction, it needs to be scheduled
14239 // after any preceding stacksave. We also need to prevent any alloca
14240 // from reordering above a preceding stackrestore.
14241 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14242 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14243 for (Instruction *I = BundleMember->Inst->getNextNode();
14244 I != ScheduleEnd; I = I->getNextNode()) {
14245 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14246 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14247 // Any allocas past here must be control dependent on I, and I
14248 // must be memory dependent on BundleMember->Inst.
14249 break;
14250
14251 if (!isa<AllocaInst>(I))
14252 continue;
14253
14254 // Add the dependency
14255 MakeControlDependent(I);
14256 }
14257 }
14258
14259 // In addition to the cases handled just above, we need to prevent
14260 // allocas and loads/stores from moving below a stacksave or a
14261 // stackrestore. Avoiding moving allocas below a stackrestore is currently
14262 // thought to be conservative. Moving loads/stores below a stackrestore
14263 // can lead to incorrect code.
14264 if (isa<AllocaInst>(BundleMember->Inst) ||
14265 BundleMember->Inst->mayReadOrWriteMemory()) {
14266 for (Instruction *I = BundleMember->Inst->getNextNode();
14267 I != ScheduleEnd; I = I->getNextNode()) {
14268 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14269 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14270 continue;
14271
14272 // Add the dependency
14273 MakeControlDependent(I);
14274 break;
14275 }
14276 }
14277 }
14278
14279 // Handle the memory dependencies (if any).
14280 ScheduleData *DepDest = BundleMember->NextLoadStore;
14281 if (!DepDest)
14282 continue;
14283 Instruction *SrcInst = BundleMember->Inst;
14284 assert(SrcInst->mayReadOrWriteMemory() &&
14285 "NextLoadStore list for non memory effecting bundle?");
14286 MemoryLocation SrcLoc = getLocation(SrcInst);
14287 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14288 unsigned NumAliased = 0;
14289 unsigned DistToSrc = 1;
14290
14291 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14292 assert(isInSchedulingRegion(DepDest));
14293
14294 // We have two limits to reduce the complexity:
14295 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14296 // SLP->isAliased (which is the expensive part in this loop).
14297 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14298 // the whole loop (even if the loop is fast, it's quadratic).
14299 // It's important for the loop break condition (see below) to
14300 // check this limit even between two read-only instructions.
14301 if (DistToSrc >= MaxMemDepDistance ||
14302 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14303 (NumAliased >= AliasedCheckLimit ||
14304 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14305
14306 // We increment the counter only if the locations are aliased
14307 // (instead of counting all alias checks). This gives a better
14308 // balance between reduced runtime and accurate dependencies.
14309 NumAliased++;
14310
14311 DepDest->MemoryDependencies.push_back(BundleMember);
14312 BundleMember->Dependencies++;
14313 ScheduleData *DestBundle = DepDest->FirstInBundle;
14314 if (!DestBundle->IsScheduled) {
14315 BundleMember->incrementUnscheduledDeps(1);
14316 }
14317 if (!DestBundle->hasValidDependencies()) {
14318 WorkList.push_back(DestBundle);
14319 }
14320 }
14321
14322 // Example, explaining the loop break condition: Let's assume our
14323 // starting instruction is i0 and MaxMemDepDistance = 3.
14324 //
14325 // +--------v--v--v
14326 // i0,i1,i2,i3,i4,i5,i6,i7,i8
14327 // +--------^--^--^
14328 //
14329 // MaxMemDepDistance let us stop alias-checking at i3 and we add
14330 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14331 // Previously we already added dependencies from i3 to i6,i7,i8
14332 // (because of MaxMemDepDistance). As we added a dependency from
14333 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14334 // and we can abort this loop at i6.
14335 if (DistToSrc >= 2 * MaxMemDepDistance)
14336 break;
14337 DistToSrc++;
14338 }
14339 }
14340 if (InsertInReadyList && SD->isReady()) {
14341 ReadyInsts.insert(SD);
14342 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
14343 << "\n");
14344 }
14345 }
14346}
14347
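// Clears the scheduling state of every ScheduleData in the region so the
// block can be rescheduled from scratch.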
14348void BoUpSLP::BlockScheduling::resetSchedule() {
14349 assert(ScheduleStart &&
14350 "tried to reset schedule on block which has not been scheduled");
14351 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14352 doForAllOpcodes(I, [&](ScheduleData *SD) {
14353 assert(isInSchedulingRegion(SD) &&
14354 "ScheduleData not in scheduling region");
14355 SD->IsScheduled = false;
14356 SD->resetUnscheduledDeps();
14357 });
14358 }
14359 ReadyInsts.clear();
14360}
14361
14362void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14363 if (!BS->ScheduleStart)
14364 return;
14365
14366 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14367
14368 // A key point - if we got here, pre-scheduling was able to find a valid
14369 // scheduling of the sub-graph of the scheduling window which consists
14370 // of all vector bundles and their transitive users. As such, we do not
14371 // need to reschedule anything *outside of* that subgraph.
14372
14373 BS->resetSchedule();
14374
14375 // For the real scheduling we use a more sophisticated ready-list: it is
14376 // sorted by the original instruction location. This lets the final schedule
14377 // be as close as possible to the original instruction order.
14378 // WARNING: If changing this order causes a correctness issue, that means
14379 // there is some missing dependence edge in the schedule data graph.
14380 struct ScheduleDataCompare {
14381 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14382 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14383 }
14384 };
14385 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14386
14387 // Ensure that all dependency data is updated (for nodes in the sub-graph)
14388 // and fill the ready-list with initial instructions.
14389 int Idx = 0;
14390 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14391 I = I->getNextNode()) {
14392 BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
14393 TreeEntry *SDTE = getTreeEntry(SD->Inst);
14394 (void)SDTE;
14395 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14396 SD->isPartOfBundle() ==
14397 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14398 "scheduler and vectorizer bundle mismatch");
14399 SD->FirstInBundle->SchedulingPriority = Idx++;
14400
14401 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14402 BS->calculateDependencies(SD, false, this);
14403 });
14404 }
14405 BS->initialFillReadyList(ReadyInsts);
14406
14407 Instruction *LastScheduledInst = BS->ScheduleEnd;
14408
14409 // Do the "real" scheduling.
14410 while (!ReadyInsts.empty()) {
14411 ScheduleData *Picked = *ReadyInsts.begin();
14412 ReadyInsts.erase(ReadyInsts.begin());
14413
14414 // Move the scheduled instruction(s) to their dedicated places, if not
14415 // there yet.
14416 for (ScheduleData *BundleMember = Picked; BundleMember;
14417 BundleMember = BundleMember->NextInBundle) {
14418 Instruction *PickedInst = BundleMember->Inst;
14419 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
14420 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
14421 LastScheduledInst = PickedInst;
14422 }
14423
14424 BS->schedule(Picked, ReadyInsts);
14425 }
14426
14427 // Check that we didn't break any of our invariants.
14428#ifdef EXPENSIVE_CHECKS
14429 BS->verify();
14430#endif
14431
14432#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
14433 // Check that all schedulable entities got scheduled
14434 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
14435 BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
14436 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
14437 assert(SD->IsScheduled && "must be scheduled at this point");
14438 }
14439 });
14440 }
14441#endif
14442
14443 // Avoid duplicate scheduling of the block.
14444 BS->ScheduleStart = nullptr;
14445}
14446
14447unsigned BoUpSLP::getVectorElementSize(Value *V) {
14448 // If V is a store, just return the width of the stored value (or value
14449 // truncated just before storing) without traversing the expression tree.
14450 // This is the common case.
14451 if (auto *Store = dyn_cast<StoreInst>(V))
14452 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
14453
14454 if (auto *IEI = dyn_cast<InsertElementInst>(V))
14455 return getVectorElementSize(IEI->getOperand(1));
14456
14457 auto E = InstrElementSize.find(V);
14458 if (E != InstrElementSize.end())
14459 return E->second;
14460
14461 // If V is not a store, we can traverse the expression tree to find loads
14462 // that feed it. The type of the loaded value may indicate a more suitable
14463 // width than V's type. We want to base the vector element size on the width
14464 // of memory operations where possible.
14465 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
14466 SmallPtrSet<Instruction *, 16> Visited;
14467 if (auto *I = dyn_cast<Instruction>(V)) {
14468 Worklist.emplace_back(I, I->getParent(), 0);
14469 Visited.insert(I);
14470 }
14471
14472 // Traverse the expression tree in bottom-up order looking for loads. If we
14473 // encounter an instruction we don't yet handle, we give up.
14474 auto Width = 0u;
14475 Value *FirstNonBool = nullptr;
14476 while (!Worklist.empty()) {
14477 auto [I, Parent, Level] = Worklist.pop_back_val();
14478
14479 // We should only be looking at scalar instructions here. If the current
14480 // instruction has a vector type, skip.
14481 auto *Ty = I->getType();
14482 if (isa<VectorType>(Ty))
14483 continue;
14484 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
14485 FirstNonBool = I;
14486 if (Level > RecursionMaxDepth)
14487 continue;
14488
14489 // If the current instruction is a load, update Width to reflect the
14490 // width of the loaded value.
14491 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
14492 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
14493
14494 // Otherwise, we need to visit the operands of the instruction. We only
14495 // handle the interesting cases from buildTree here. If an operand is an
14496 // instruction we haven't yet visited and from the same basic block as the
14497 // user or the use is a PHI node, we add it to the worklist.
14498 if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
14499 BinaryOperator, UnaryOperator>(I)) {
14500 for (Use &U : I->operands()) {
14501 if (auto *J = dyn_cast<Instruction>(U.get()))
14502 if (Visited.insert(J).second &&
14503 (isa<PHINode>(I) || J->getParent() == Parent)) {
14504 Worklist.emplace_back(J, J->getParent(), Level + 1);
14505 continue;
14506 }
14507 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
14508 FirstNonBool = U.get();
14509 }
14510 } else {
14511 break;
14512 }
14513 }
14514
14515 // If we didn't encounter a memory access in the expression tree, or if we
14516 // gave up for some reason, just return the width of V. Otherwise, return the
14517 // maximum width we found.
14518 if (!Width) {
14519 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
14520 V = FirstNonBool;
14521 Width = DL->getTypeSizeInBits(V->getType());
14522 }
14523
14524 for (Instruction *I : Visited)
14525 InstrElementSize[I] = Width;
14526
14527 return Width;
14528}
14529
14530bool BoUpSLP::collectValuesToDemote(
14531 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
14532 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
14533 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
14534 bool IsTruncRoot) const {
14535 // We can always demote constants.
14536 if (all_of(E.Scalars, IsaPred<Constant>))
14537 return true;
14538
14539 unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
14540 if (OrigBitWidth == BitWidth) {
14541 MaxDepthLevel = 1;
14542 return true;
14543 }
14544
14545 // If the value is not a vectorized instruction in the expression and not used
14546 // by the insertelement instruction and not used in multiple vector nodes, it
14547 // cannot be demoted.
14548 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
14549 if (MultiNodeScalars.contains(V))
14550 return false;
14551 if (OrigBitWidth > BitWidth) {
14552 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14553 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14554 return true;
14555 }
14556 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
14557 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14558 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*DL));
14559 if (IsSigned)
14560 ++BitWidth1;
14561 if (auto *I = dyn_cast<Instruction>(V)) {
14562 APInt Mask = DB->getDemandedBits(I);
14563 unsigned BitWidth2 =
14564 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
14565 while (!IsSigned && BitWidth2 < OrigBitWidth) {
14566 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
14567 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14568 break;
14569 BitWidth2 *= 2;
14570 }
14571 BitWidth1 = std::min(BitWidth1, BitWidth2);
14572 }
14573 BitWidth = std::max(BitWidth, BitWidth1);
14574 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
14575 };
14576 using namespace std::placeholders;
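  // Last-chance check: the entry can only be demoted if every scalar is
  // potentially truncatable to BitWidth; gather nodes of constants are
  // recorded for demotion as well.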
14577 auto FinalAnalysis = [&]() {
14578 if (!IsProfitableToDemote)
14579 return false;
14580 bool Res = all_of(
14581 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
14582 // Gather demoted constant operands.
14583 if (Res && E.State == TreeEntry::NeedToGather &&
14584 all_of(E.Scalars, IsaPred<Constant>))
14585 ToDemote.push_back(E.Idx);
14586 return Res;
14587 };
14588 // TODO: improve handling of gathered values and others.
14589 if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
14590 any_of(E.Scalars, [&](Value *V) {
14591 return all_of(V->users(), [&](User *U) {
14592 return isa<InsertElementInst>(U) && !getTreeEntry(U);
14593 });
14594 }))
14595 return FinalAnalysis();
14596
14597 if (any_of(E.Scalars, [&](Value *V) {
14598 return !all_of(V->users(), [=](User *U) {
14599 return getTreeEntry(U) ||
14600 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14601 (!isa<CmpInst>(U) && U->getType()->isSized() &&
14602 !U->getType()->isScalableTy() &&
14603 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
14604 }) && !IsPotentiallyTruncated(V, BitWidth);
14605 }))
14606 return false;
14607
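  // Recurses into the given operand entries and merges their depth into
  // MaxDepthLevel; sets NeedToExit if an operand cannot be demoted but the
  // current entry may still pass the final analysis.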
14608 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
14609 bool &NeedToExit) {
14610 NeedToExit = false;
14611 unsigned InitLevel = MaxDepthLevel;
14612 for (const TreeEntry *Op : Operands) {
14613 unsigned Level = InitLevel;
14614 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
14615 ToDemote, Visited, Level, IsProfitableToDemote,
14616 IsTruncRoot)) {
14617 if (!IsProfitableToDemote)
14618 return false;
14619 NeedToExit = true;
14620 if (!FinalAnalysis())
14621 return false;
14622 continue;
14623 }
14624 MaxDepthLevel = std::max(MaxDepthLevel, Level);
14625 }
14626 return true;
14627 };
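  // Doubles BitWidth until Checker accepts it. If no width below OrigBitWidth
  // is accepted, falls back to the first width at which the final analysis
  // still succeeded, if any, and signals the caller to stop via NeedToExit.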
14628 auto AttemptCheckBitwidth =
14629 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
14630 // Try all bitwidth < OrigBitWidth.
14631 NeedToExit = false;
14632 unsigned BestFailBitwidth = 0;
14633 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
14634 if (Checker(BitWidth, OrigBitWidth))
14635 return true;
14636 if (BestFailBitwidth == 0 && FinalAnalysis())
14637 BestFailBitwidth = BitWidth;
14638 }
14639 if (BitWidth >= OrigBitWidth) {
14640 if (BestFailBitwidth == 0) {
14641 BitWidth = OrigBitWidth;
14642 return false;
14643 }
14644 MaxDepthLevel = 1;
14645 BitWidth = BestFailBitwidth;
14646 NeedToExit = true;
14647 return true;
14648 }
14649 return false;
14650 };
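  // Common handler for a demotable entry: marks its scalars as potentially
  // truncated, optionally validates a narrower width via Checker, recurses
  // into Operands, and records the entry in ToDemote on success.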
14651 auto TryProcessInstruction =
14652 [&](unsigned &BitWidth,
14653 ArrayRef<const TreeEntry *> Operands = std::nullopt,
14654 function_ref<bool(unsigned, unsigned)> Checker = {}) {
14655 if (Operands.empty()) {
14656 if (!IsTruncRoot)
14657 MaxDepthLevel = 1;
14658 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
14659 std::ref(BitWidth)));
14660 } else {
14661 // Several vectorized uses? Check if we can truncate it, otherwise -
14662 // exit.
14663 if (E.UserTreeIndices.size() > 1 &&
14664 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
14665 std::ref(BitWidth))))
14666 return false;
14667 bool NeedToExit = false;
14668 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
14669 return false;
14670 if (NeedToExit)
14671 return true;
14672 if (!ProcessOperands(Operands, NeedToExit))
14673 return false;
14674 if (NeedToExit)
14675 return true;
14676 }
14677
14678 ++MaxDepthLevel;
14679 // Record the entry that we can demote.
14680 ToDemote.push_back(E.Idx);
14681 return IsProfitableToDemote;
14682 };
14683 switch (E.getOpcode()) {
14684
14685 // We can always demote truncations and extensions. Since truncations can
14686 // seed additional demotion, we save the truncated value.
14687 case Instruction::Trunc:
14688 if (IsProfitableToDemoteRoot)
14689 IsProfitableToDemote = true;
14690 return TryProcessInstruction(BitWidth);
14691 case Instruction::ZExt:
14692 case Instruction::SExt:
14693 IsProfitableToDemote = true;
14694 return TryProcessInstruction(BitWidth);
14695
14696 // We can demote certain binary operations if we can demote both of their
14697 // operands.
14698 case Instruction::Add:
14699 case Instruction::Sub:
14700 case Instruction::Mul:
14701 case Instruction::And:
14702 case Instruction::Or:
14703 case Instruction::Xor: {
14704 return TryProcessInstruction(
14705 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
14706 }
14707 case Instruction::Shl: {
14708 // If we are truncating the result of this SHL, and if it's a shift of an
14709 // in-range amount, we can always perform a SHL in a smaller type.
14710 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
14711 return all_of(E.Scalars, [&](Value *V) {
14712 auto *I = cast<Instruction>(V);
14713 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14714 return AmtKnownBits.getMaxValue().ult(BitWidth);
14715 });
14716 };
14717 return TryProcessInstruction(
14718 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
14719 }
14720 case Instruction::LShr: {
14721 // If this is a truncate of a logical shr, we can truncate it to a smaller
14722 // lshr iff we know that the bits we would otherwise be shifting in are
14723 // already zeros.
14724 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14725 return all_of(E.Scalars, [&](Value *V) {
14726 auto *I = cast<Instruction>(V);
14727 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14728 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14729 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
14730 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
14731 SimplifyQuery(*DL));
14732 });
14733 };
14734 return TryProcessInstruction(
14735 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
14736 LShrChecker);
14737 }
14738 case Instruction::AShr: {
14739 // If this is a truncate of an arithmetic shr, we can truncate it to a
14740 // smaller ashr iff we know that all the bits from the sign bit of the
14741 // original type and the sign bit of the truncate type are similar.
14742 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14743 return all_of(E.Scalars, [&](Value *V) {
14744 auto *I = cast<Instruction>(V);
14745 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14746 unsigned ShiftedBits = OrigBitWidth - BitWidth;
14747 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
14748 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
14749 nullptr, DT);
14750 });
14751 };
14752 return TryProcessInstruction(
14753 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
14754 AShrChecker);
14755 }
14756 case Instruction::UDiv:
14757 case Instruction::URem: {
14758 // UDiv and URem can be truncated if all the truncated bits are zero.
14759 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14760 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14761 return all_of(E.Scalars, [&](Value *V) {
14762 auto *I = cast<Instruction>(V);
14763 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14764 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
14765 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
14766 });
14767 };
14768 return TryProcessInstruction(
14769 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
14770 }
14771
14772 // We can demote selects if we can demote their true and false values.
14773 case Instruction::Select: {
14774 return TryProcessInstruction(
14775 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
14776 }
14777
14778 // We can demote phis if we can demote all their incoming operands. Note that
14779 // we don't need to worry about cycles since we ensure single use above.
14780 case Instruction::PHI: {
14781 const unsigned NumOps = E.getNumOperands();
14782 SmallVector<const TreeEntry *> Ops(NumOps, nullptr);
14783 transform(seq<unsigned>(0, NumOps), Ops.begin(),
14784 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
14785
14786 return TryProcessInstruction(BitWidth, Ops);
14787 }
14788
14789 case Instruction::Call: {
14790 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
14791 if (!IC)
14792 break;
14793 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
14794 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
14795 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
14796 break;
14797 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
14798 function_ref<bool(unsigned, unsigned)> CallChecker;
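    // For umin/umax, the bits that would be truncated away must be known zero
    // in both operands; for smin/smax, both operands must keep enough known
    // sign bits after truncation.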
14799 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14800 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14801 return all_of(E.Scalars, [&](Value *V) {
14802 auto *I = cast<Instruction>(V);
14803 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
14804 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14805 return MaskedValueIsZero(I->getOperand(0), Mask,
14806 SimplifyQuery(*DL)) &&
14807 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
14808 }
14809 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
14810 "Expected min/max intrinsics only.");
14811 unsigned SignBits = OrigBitWidth - BitWidth;
14812 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
14813 return SignBits <= ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
14814 nullptr, DT) &&
14815 (!isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL)) ||
14816 MaskedValueIsZero(I->getOperand(0), Mask,
14817 SimplifyQuery(*DL))) &&
14818 SignBits <= ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
14819 nullptr, DT) &&
14820 (!isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL)) ||
14821 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
14822 });
14823 };
14824 if (ID != Intrinsic::abs) {
14825 Operands.push_back(getOperandEntry(&E, 1));
14826 CallChecker = CompChecker;
14827 }
14828 InstructionCost BestCost =
14829 std::numeric_limits<InstructionCost::CostType>::max();
14830 unsigned BestBitWidth = BitWidth;
14831 unsigned VF = E.Scalars.size();
14832 // Choose the best bitwidth based on cost estimations.
14833 auto Checker = [&](unsigned BitWidth, unsigned) {
14834 unsigned MinBW = PowerOf2Ceil(BitWidth);
14835 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
14836 auto VecCallCosts = getVectorCallCosts(
14837 IC,
14838 FixedVectorType::get(IntegerType::get(IC->getContext(), MinBW), VF),
14839 TTI, TLI, ArgTys);
14840 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
14841 if (Cost < BestCost) {
14842 BestCost = Cost;
14843 BestBitWidth = BitWidth;
14844 }
14845 return false;
14846 };
14847 [[maybe_unused]] bool NeedToExit;
14848 (void)AttemptCheckBitwidth(Checker, NeedToExit);
14849 BitWidth = BestBitWidth;
14850 return TryProcessInstruction(BitWidth, Operands, CallChecker);
14851 }
14852
14853 // Otherwise, conservatively give up.
14854 default:
14855 break;
14856 }
14857 MaxDepthLevel = 1;
14858 return FinalAnalysis();
14859}
14860
14861static RecurKind getRdxKind(Value *V);
14862
14863void BoUpSLP::computeMinimumValueSizes() {
14864 // We only attempt to truncate integer expressions.
14865 bool IsStoreOrInsertElt =
14866 VectorizableTree.front()->getOpcode() == Instruction::Store ||
14867 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
14868 if ((IsStoreOrInsertElt || UserIgnoreList) &&
14869 ExtraBitWidthNodes.size() <= 1 &&
14870 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
14871 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
14872 return;
14873
14874 unsigned NodeIdx = 0;
14875 if (IsStoreOrInsertElt &&
14876 VectorizableTree.front()->State != TreeEntry::NeedToGather)
14877 NodeIdx = 1;
14878
14879 // Ensure the roots of the vectorizable tree don't form a cycle.
14880 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
14881 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
14882 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
14883 [NodeIdx](const EdgeInfo &EI) {
14884 return EI.UserTE->Idx >
14885 static_cast<int>(NodeIdx);
14886 })))
14887 return;
14888
14889 // The first value node for store/insertelement is sext/zext/trunc? Skip it,
14890 // resize to the final type.
14891 bool IsTruncRoot = false;
14892 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
14893 SmallVector<unsigned> RootDemotes;
14894 if (NodeIdx != 0 &&
14895 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
14896 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
14897 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
14898 IsTruncRoot = true;
14899 RootDemotes.push_back(NodeIdx);
14900 IsProfitableToDemoteRoot = true;
14901 ++NodeIdx;
14902 }
14903
14904 // The reduction was already analyzed and found not profitable - exit.
14905 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
14906 return;
14907
14908 SmallVector<unsigned> ToDemote;
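  // Computes the minimum bit width to which the given entry and the values
  // feeding it can be demoted; returns 0 if no profitable truncation exists.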
14909 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
14910 bool IsProfitableToDemoteRoot, unsigned Opcode,
14911 unsigned Limit, bool IsTruncRoot,
14912 bool IsSignedCmp) {
14913 ToDemote.clear();
14914 unsigned VF = E.getVectorFactor();
14915 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
14916 if (!TreeRootIT || !Opcode)
14917 return 0u;
14918
14919 if (any_of(E.Scalars,
14920 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
14921 return 0u;
14922
14923 unsigned NumParts =
14924 TTI->getNumberOfParts(FixedVectorType::get(TreeRootIT, VF));
14925
14926 // The maximum bit width required to represent all the values that can be
14927 // demoted without loss of precision. It would be safe to truncate the roots
14928 // of the expression to this width.
14929 unsigned MaxBitWidth = 1u;
14930
14931 // True if the roots can be zero-extended back to their original type,
14932 // rather than sign-extended. We know that if the leading bits are not
14933 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
14934 // True.
14935 // Determine if the sign bit of all the roots is known to be zero. If not,
14936 // IsKnownPositive is set to False.
14937 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
14938 KnownBits Known = computeKnownBits(R, *DL);
14939 return Known.isNonNegative();
14940 });
14941
14942 // We first check if all the bits of the roots are demanded. If they're not,
14943 // we can truncate the roots to this narrower type.
14944 for (Value *Root : E.Scalars) {
14945 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
14946 TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
14947 unsigned BitWidth1 = NumTypeBits - NumSignBits;
14948 // If we can't prove that the sign bit is zero, we must add one to the
14949 // maximum bit width to account for the unknown sign bit. This preserves
14950 // the existing sign bit so we can safely sign-extend the root back to the
14951 // original type. Otherwise, if we know the sign bit is zero, we will
14952 // zero-extend the root instead.
14953 //
14954 // FIXME: This is somewhat suboptimal, as there will be cases where adding
14955 // one to the maximum bit width will yield a larger-than-necessary
14956 // type. In general, we need to add an extra bit only if we can't
14957 // prove that the upper bit of the original type is equal to the
14958 // upper bit of the proposed smaller type. If these two bits are
14959 // the same (either zero or one) we know that sign-extending from
14960 // the smaller type will result in the same value. Here, since we
14961 // can't yet prove this, we are just making the proposed smaller
14962 // type larger to ensure correctness.
14963 if (!IsKnownPositive)
14964 ++BitWidth1;
14965
14966 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
14967 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
14968 MaxBitWidth =
14969 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
14970 }
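 // Worked example (illustrative, derived from the computation above): for an
 // i32 root where ComputeNumSignBits returns 24, BitWidth1 = 32 - 24 = 8, or 9
 // if the sign bit is not known to be zero; if DemandedBits reports only the
 // low 16 bits as demanded, BitWidth2 = 16, and this root contributes
 // min(9, 16) = 9 to MaxBitWidth.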
14971
14972 if (MaxBitWidth < 8 && MaxBitWidth > 1)
14973 MaxBitWidth = 8;
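 // Note: widths in the range (1, 8) are rounded up to 8 here, presumably
 // because sub-byte integer vector element types are rarely legal or
 // profitable on real targets.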
14974
14975 // If the original type is large but the reduced type does not improve register
14976 // usage, ignore it.
14977 if (NumParts > 1 &&
14978 NumParts ==
14979 TTI->getNumberOfParts(FixedVectorType::get(
14980 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
14981 return 0u;
14982
14983 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
14984 Opcode == Instruction::SExt ||
14985 Opcode == Instruction::ZExt || NumParts > 1;
14986 // Conservatively determine if we can actually truncate the roots of the
14987 // expression. Collect the values that can be demoted in ToDemote and
14988 // additional roots that require investigating in Roots.
14989 DenseSet<const TreeEntry *> Visited;
14990 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
14991 bool NeedToDemote = IsProfitableToDemote;
14992
14993 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
14994 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
14995 IsTruncRoot) ||
14996 (MaxDepthLevel <= Limit &&
14997 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
14998 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
14999 DL->getTypeSizeInBits(TreeRootIT) /
15000 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
15001 ->getOperand(0)
15002 ->getType()) >
15003 2)))))
15004 return 0u;
15005 // Round MaxBitWidth up to the next power-of-two.
15006 MaxBitWidth = bit_ceil(MaxBitWidth);
15007
15008 return MaxBitWidth;
15009 };
15010
15011 // If we can truncate the root, we must collect additional values that might
15012 // be demoted as a result. That is, those seeded by truncations we will
15013 // modify.
15014 // Add reduction ops sizes, if any.
15015 if (UserIgnoreList &&
15016 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
15017 for (Value *V : *UserIgnoreList) {
15018 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15019 auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
15020 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15021 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
15022 ++BitWidth1;
15023 unsigned BitWidth2 = BitWidth1;
15024 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
15025 auto Mask = DB->getDemandedBits(cast<Instruction>(V));
15026 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15027 }
15028 ReductionBitWidth =
15029 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15030 }
15031 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15032 ReductionBitWidth = 8;
15033
15034 ReductionBitWidth = bit_ceil(ReductionBitWidth);
15035 }
15036 bool IsTopRoot = NodeIdx == 0;
15037 while (NodeIdx < VectorizableTree.size() &&
15038 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15039 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15040 RootDemotes.push_back(NodeIdx);
15041 ++NodeIdx;
15042 IsTruncRoot = true;
15043 }
15044 bool IsSignedCmp = false;
15045 while (NodeIdx < VectorizableTree.size()) {
15046 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15047 unsigned Limit = 2;
15048 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15049 if (IsTopRoot &&
15050 ReductionBitWidth ==
15051 DL->getTypeSizeInBits(
15052 VectorizableTree.front()->Scalars.front()->getType()))
15053 Limit = 3;
15054 unsigned MaxBitWidth = ComputeMaxBitWidth(
15055 *VectorizableTree[NodeIdx].get(), IsTopRoot, IsProfitableToDemoteRoot,
15056 Opcode, Limit, IsTruncRoot, IsSignedCmp);
15057 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15058 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15059 ReductionBitWidth = bit_ceil(MaxBitWidth);
15060 else if (MaxBitWidth == 0)
15061 ReductionBitWidth = 0;
15062 }
15063
15064 for (unsigned Idx : RootDemotes) {
15065 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
15066 uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
15067 if (OrigBitWidth > MaxBitWidth) {
15068 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
15069 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
15070 }
15071 return false;
15072 }))
15073 ToDemote.push_back(Idx);
15074 }
15075 RootDemotes.clear();
15076 IsTopRoot = false;
15077 IsProfitableToDemoteRoot = true;
15078
15079 if (ExtraBitWidthNodes.empty()) {
15080 NodeIdx = VectorizableTree.size();
15081 } else {
15082 unsigned NewIdx = 0;
15083 do {
15084 NewIdx = *ExtraBitWidthNodes.begin();
15085 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
15086 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15087 NodeIdx = NewIdx;
15088 IsTruncRoot =
15089 NodeIdx < VectorizableTree.size() &&
15090 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15091 [](const EdgeInfo &EI) {
15092 return EI.EdgeIdx == 0 &&
15093 EI.UserTE->getOpcode() == Instruction::Trunc &&
15094 !EI.UserTE->isAltShuffle();
15095 });
15096 IsSignedCmp =
15097 NodeIdx < VectorizableTree.size() &&
15098 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15099 [&](const EdgeInfo &EI) {
15100 return EI.UserTE->getOpcode() == Instruction::ICmp &&
15101 any_of(EI.UserTE->Scalars, [&](Value *V) {
15102 auto *IC = dyn_cast<ICmpInst>(V);
15103 return IC &&
15104 (IC->isSigned() ||
15105 !isKnownNonNegative(IC->getOperand(0),
15106 SimplifyQuery(*DL)) ||
15107 !isKnownNonNegative(IC->getOperand(1),
15108 SimplifyQuery(*DL)));
15109 });
15110 });
15111 }
15112
15113 // If the maximum bit width we compute is less than the width of the roots'
15114 // type, we can proceed with the narrowing. Otherwise, do nothing.
15115 if (MaxBitWidth == 0 ||
15116 MaxBitWidth >=
15117 cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
15118 if (UserIgnoreList)
15119 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
15120 continue;
15121 }
15122
15123 // Finally, map the values we can demote to the maximum bit width we
15124 // computed.
15125 for (unsigned Idx : ToDemote) {
15126 TreeEntry *TE = VectorizableTree[Idx].get();
15127 if (MinBWs.contains(TE))
15128 continue;
15129 bool IsSigned = TE->getOpcode() == Instruction::SExt ||
15130 any_of(TE->Scalars, [&](Value *R) {
15131 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15132 });
15133 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
15134 }
15135 }
15136}
15137
15138PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15139 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
15140 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
15141 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
15142 auto *AA = &AM.getResult<AAManager>(F);
15143 auto *LI = &AM.getResult<LoopAnalysis>(F);
15144 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
15145 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
15146 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
15147 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
15148
15149 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
15150 if (!Changed)
15151 return PreservedAnalyses::all();
15152
15153 PreservedAnalyses PA;
15154 PA.preserveSet<CFGAnalyses>();
15155 return PA;
15156}
15157
15159 TargetTransformInfo *TTI_,
15160 TargetLibraryInfo *TLI_, AAResults *AA_,
15161 LoopInfo *LI_, DominatorTree *DT_,
15162 AssumptionCache *AC_, DemandedBits *DB_,
15163 OptimizationRemarkEmitter *ORE_) {
15164 if (!RunSLPVectorization)
15165 return false;
15166 SE = SE_;
15167 TTI = TTI_;
15168 TLI = TLI_;
15169 AA = AA_;
15170 LI = LI_;
15171 DT = DT_;
15172 AC = AC_;
15173 DB = DB_;
15174 DL = &F.getParent()->getDataLayout();
15175
15176 Stores.clear();
15177 GEPs.clear();
15178 bool Changed = false;
15179
15180 // If the target claims to have no vector registers don't attempt
15181 // vectorization.
15182 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
15183 LLVM_DEBUG(
15184 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15185 return false;
15186 }
15187
15188 // Don't vectorize when the attribute NoImplicitFloat is used.
15189 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
15190 return false;
15191
15192 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15193
15194 // Use the bottom up slp vectorizer to construct chains that start with
15195 // store instructions.
15196 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15197
15198 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15199 // delete instructions.
15200
15201 // Update DFS numbers now so that we can use them for ordering.
15202 DT->updateDFSNumbers();
15203
15204 // Scan the blocks in the function in post order.
15205 for (auto *BB : post_order(&F.getEntryBlock())) {
15206 // Start new block - clear the list of reduction roots.
15207 R.clearReductionData();
15208 collectSeedInstructions(BB);
15209
15210 // Vectorize trees that end at stores.
15211 if (!Stores.empty()) {
15212 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15213 << " underlying objects.\n");
15214 Changed |= vectorizeStoreChains(R);
15215 }
15216
15217 // Vectorize trees that end at reductions.
15218 Changed |= vectorizeChainsInBlock(BB, R);
15219
15220 // Vectorize the index computations of getelementptr instructions. This
15221 // is primarily intended to catch gather-like idioms ending at
15222 // non-consecutive loads.
15223 if (!GEPs.empty()) {
15224 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15225 << " underlying objects.\n");
15226 Changed |= vectorizeGEPIndices(BB, R);
15227 }
15228 }
15229
15230 if (Changed) {
15231 R.optimizeGatherSequence();
15232 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15233 }
15234 return Changed;
15235}
15236
15237std::optional<bool>
15238SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15239 unsigned Idx, unsigned MinVF,
15240 unsigned &Size) {
15241 Size = 0;
15242 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15243 << "\n");
15244 const unsigned Sz = R.getVectorElementSize(Chain[0]);
15245 unsigned VF = Chain.size();
15246
15247 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
15248 // Check if vectorizing with a non-power-of-2 VF should be considered. At
15249 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15250 // all vector lanes are used.
15251 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15252 return false;
15253 }
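 // E.g. a chain of 7 stores (with MinVF <= 7) can still be considered here when
 // VectorizeNonPowerOf2 is enabled: 7 + 1 = 8 is a power of two, so only one
 // vector lane stays unused.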
15254
15255 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15256 << "\n");
15257
15258 SetVector<Value *> ValOps;
15259 for (Value *V : Chain)
15260 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
15261 // Exit if the operands do not share the same/alternate opcodes, or the number of unique values is not a power of 2.
15262 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
15263 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
15264 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
15265 bool IsPowerOf2 =
15266 isPowerOf2_32(ValOps.size()) ||
15267 (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
15268 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15269 (!S.MainOp->isSafeToRemove() ||
15270 any_of(ValOps.getArrayRef(),
15271 [&](Value *V) {
15272 return !isa<ExtractElementInst>(V) &&
15273 (V->getNumUses() > Chain.size() ||
15274 any_of(V->users(), [&](User *U) {
15275 return !Stores.contains(U);
15276 }));
15277 }))) ||
15278 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
15279 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15280 return false;
15281 }
15282 }
15283 if (R.isLoadCombineCandidate(Chain))
15284 return true;
15285 R.buildTree(Chain);
15286 // Check if the tree is tiny and the store itself or its value is not vectorized.
15287 if (R.isTreeTinyAndNotFullyVectorizable()) {
15288 if (R.isGathered(Chain.front()) ||
15289 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
15290 return std::nullopt;
15291 Size = R.getTreeSize();
15292 return false;
15293 }
15294 R.reorderTopToBottom();
15295 R.reorderBottomToTop();
15296 R.buildExternalUses();
15297
15298 R.computeMinimumValueSizes();
15299 R.transformNodes();
15300
15301 Size = R.getTreeSize();
15302 if (S.getOpcode() == Instruction::Load)
15303 Size = 2; // cut off masked gather small trees
15304 InstructionCost Cost = R.getTreeCost();
15305
15306 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15307 if (Cost < -SLPCostThreshold) {
15308 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15309
15310 using namespace ore;
15311
15312 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
15313 cast<StoreInst>(Chain[0]))
15314 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15315 << " and with tree size "
15316 << NV("TreeSize", R.getTreeSize()));
15317
15318 R.vectorizeTree();
15319 return true;
15320 }
15321
15322 return false;
15323}
15324
15325 /// Checks that the tree sizes are nearly uniform: the quadratic mean deviation must be below roughly 1/9 of the mean size (variance * 81 < mean^2).
15326static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
15327 bool First) {
15328 unsigned Num = 0;
15329 uint64_t Sum = std::accumulate(
15330 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
15331 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15332 unsigned Size = First ? Val.first : Val.second;
15333 if (Size == 1)
15334 return V;
15335 ++Num;
15336 return V + Size;
15337 });
15338 if (Num == 0)
15339 return true;
15340 uint64_t Mean = Sum / Num;
15341 if (Mean == 0)
15342 return true;
15343 uint64_t Dev = std::accumulate(
15344 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
15345 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15346 unsigned P = First ? Val.first : Val.second;
15347 if (P == 1)
15348 return V;
15349 return V + (P - Mean) * (P - Mean);
15350 }) /
15351 Num;
15352 return Dev * 81 / (Mean * Mean) == 0;
15353}
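// Worked example (derived from the formula above): sizes {4, 4, 4, 4} give
// Mean = 4 and Dev = 0, so the check passes; sizes {2, 8} give Mean = 5 and
// Dev = 9, and 9 * 81 = 729 >= 25 = Mean * Mean, so the check fails.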
15354
15355bool SLPVectorizerPass::vectorizeStores(
15356 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
15357 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
15358 &Visited) {
15359 // We may run into multiple chains that merge into a single chain. We mark the
15360 // stores that we vectorized so that we don't visit the same store twice.
15361 BoUpSLP::ValueSet VectorizedStores;
15362 bool Changed = false;
15363
15364 struct StoreDistCompare {
15365 bool operator()(const std::pair<unsigned, int> &Op1,
15366 const std::pair<unsigned, int> &Op2) const {
15367 return Op1.second < Op2.second;
15368 }
15369 };
15370 // A set of pairs (index of store in Stores array ref, Distance of the store
15371 // address relative to base store address in units).
15372 using StoreIndexToDistSet =
15373 std::set<std::pair<unsigned, int>, StoreDistCompare>;
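 // E.g. for i32 stores to %p, %p+4 and %p+8 (offsets in bytes), with the store
 // to %p used as the base, the recorded distances are 0, 1 and 2 elements.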
15374 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
15375 int PrevDist = -1;
15376 SmallVector<Value *> Operands;
15377 // Collect the chain into a list.
15378 for (auto [Idx, Data] : enumerate(Set)) {
15379 if (Operands.empty() || Data.second - PrevDist == 1) {
15380 Operands.push_back(Stores[Data.first]);
15381 PrevDist = Data.second;
15382 if (Idx != Set.size() - 1)
15383 continue;
15384 }
15385 auto E = make_scope_exit([&, &DataVar = Data]() {
15386 Operands.clear();
15387 Operands.push_back(Stores[DataVar.first]);
15388 PrevDist = DataVar.second;
15389 });
15390
15391 if (Operands.size() <= 1 ||
15392 !Visited
15393 .insert({Operands.front(),
15394 cast<StoreInst>(Operands.front())->getValueOperand(),
15395 Operands.back(),
15396 cast<StoreInst>(Operands.back())->getValueOperand(),
15397 Operands.size()})
15398 .second)
15399 continue;
15400
15401 unsigned MaxVecRegSize = R.getMaxVecRegSize();
15402 unsigned EltSize = R.getVectorElementSize(Operands[0]);
15403 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
15404
15405 unsigned MaxVF =
15406 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
15407 unsigned MaxRegVF = MaxVF;
15408 auto *Store = cast<StoreInst>(Operands[0]);
15409 Type *StoreTy = Store->getValueOperand()->getType();
15410 Type *ValueTy = StoreTy;
15411 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
15412 ValueTy = Trunc->getSrcTy();
15413 if (ValueTy == StoreTy &&
15414 R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
15415 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
15416 unsigned MinVF = std::max<unsigned>(
15417 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
15418 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
15419 ValueTy)));
15420
15421 if (MaxVF < MinVF) {
15422 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
15423 << ") < "
15424 << "MinVF (" << MinVF << ")\n");
15425 continue;
15426 }
15427
15428 unsigned NonPowerOf2VF = 0;
15429 if (VectorizeNonPowerOf2) {
15430 // First try vectorizing with a non-power-of-2 VF. At the moment, only
15431 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
15432 // lanes are used.
15433 unsigned CandVF = Operands.size();
15434 if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
15435 NonPowerOf2VF = CandVF;
15436 }
15437
15438 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
15439 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
15440 unsigned Size = MinVF;
15441 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
15442 VF = Size > MaxVF ? NonPowerOf2VF : Size;
15443 Size *= 2;
15444 });
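 // E.g. with MinVF = 2, MaxVF = 8 and no non-power-of-2 candidate, this
 // produces CandidateVFs = {8, 4, 2}, so the largest VF is tried first.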
15445 unsigned End = Operands.size();
15446 unsigned Repeat = 0;
15447 constexpr unsigned MaxAttempts = 4;
15449 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
15450 P.first = P.second = 1;
15451 });
15453 auto IsNotVectorized = [](bool First,
15454 const std::pair<unsigned, unsigned> &P) {
15455 return First ? P.first > 0 : P.second > 0;
15456 };
15457 auto IsVectorized = [](bool First,
15458 const std::pair<unsigned, unsigned> &P) {
15459 return First ? P.first == 0 : P.second == 0;
15460 };
15461 auto VFIsProfitable = [](bool First, unsigned Size,
15462 const std::pair<unsigned, unsigned> &P) {
15463 return First ? Size >= P.first : Size >= P.second;
15464 };
15465 auto FirstSizeSame = [](unsigned Size,
15466 const std::pair<unsigned, unsigned> &P) {
15467 return Size == P.first;
15468 };
15469 while (true) {
15470 ++Repeat;
15471 bool RepeatChanged = false;
15472 bool AnyProfitableGraph;
15473 for (unsigned Size : CandidateVFs) {
15474 AnyProfitableGraph = false;
15475 unsigned StartIdx = std::distance(
15476 RangeSizes.begin(),
15477 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
15478 std::placeholders::_1)));
15479 while (StartIdx < End) {
15480 unsigned EndIdx =
15481 std::distance(RangeSizes.begin(),
15482 find_if(RangeSizes.drop_front(StartIdx),
15483 std::bind(IsVectorized, Size >= MaxRegVF,
15484 std::placeholders::_1)));
15485 unsigned Sz = EndIdx >= End ? End : EndIdx;
15486 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
15487 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
15488 Size >= MaxRegVF)) {
15489 ++Cnt;
15490 continue;
15491 }
15492 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
15493 assert(all_of(Slice,
15494 [&](Value *V) {
15495 return cast<StoreInst>(V)
15496 ->getValueOperand()
15497 ->getType() ==
15498 cast<StoreInst>(Slice.front())
15499 ->getValueOperand()
15500 ->getType();
15501 }) &&
15502 "Expected all operands of same type.");
15503 if (!NonSchedulable.empty()) {
15504 auto [NonSchedSizeMax, NonSchedSizeMin] =
15505 NonSchedulable.lookup(Slice.front());
15506 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
15507 Cnt += NonSchedSizeMax;
15508 continue;
15509 }
15510 }
15511 unsigned TreeSize;
15512 std::optional<bool> Res =
15513 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
15514 if (!Res) {
15515 NonSchedulable
15516 .try_emplace(Slice.front(), std::make_pair(Size, Size))
15517 .first->getSecond()
15518 .second = Size;
15519 } else if (*Res) {
15520 // Mark the vectorized stores so that we don't vectorize them
15521 // again.
15522 VectorizedStores.insert(Slice.begin(), Slice.end());
15525 AnyProfitableGraph = RepeatChanged = Changed = true;
15526 // If we vectorized initial block, no need to try to vectorize
15527 // it again.
15528 for_each(RangeSizes.slice(Cnt, Size),
15529 [](std::pair<unsigned, unsigned> &P) {
15530 P.first = P.second = 0;
15531 });
15532 if (Cnt < StartIdx + MinVF) {
15533 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
15534 [](std::pair<unsigned, unsigned> &P) {
15535 P.first = P.second = 0;
15536 });
15537 StartIdx = Cnt + Size;
15538 }
15539 if (Cnt > Sz - Size - MinVF) {
15540 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
15541 [](std::pair<unsigned, unsigned> &P) {
15542 P.first = P.second = 0;
15543 });
15544 if (Sz == End)
15545 End = Cnt;
15546 Sz = Cnt;
15547 }
15548 Cnt += Size;
15549 continue;
15550 }
15551 if (Size > 2 && Res &&
15552 !all_of(RangeSizes.slice(Cnt, Size),
15553 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
15554 std::placeholders::_1))) {
15555 Cnt += Size;
15556 continue;
15557 }
15558 // For very big VFs, check that we are not rebuilding the same trees, just
15559 // with a larger number of elements.
15560 if (Size > MaxRegVF && TreeSize > 1 &&
15561 all_of(RangeSizes.slice(Cnt, Size),
15562 std::bind(FirstSizeSame, TreeSize,
15563 std::placeholders::_1))) {
15564 Cnt += Size;
15565 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
15566 ++Cnt;
15567 continue;
15568 }
15569 if (TreeSize > 1)
15570 for_each(RangeSizes.slice(Cnt, Size),
15571 [&](std::pair<unsigned, unsigned> &P) {
15572 if (Size >= MaxRegVF)
15573 P.second = std::max(P.second, TreeSize);
15574 else
15575 P.first = std::max(P.first, TreeSize);
15576 });
15577 ++Cnt;
15578 AnyProfitableGraph = true;
15579 }
15580 if (StartIdx >= End)
15581 break;
15582 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
15583 AnyProfitableGraph = true;
15584 StartIdx = std::distance(
15585 RangeSizes.begin(),
15586 find_if(RangeSizes.drop_front(Sz),
15587 std::bind(IsNotVectorized, Size >= MaxRegVF,
15588 std::placeholders::_1)));
15589 }
15590 if (!AnyProfitableGraph && Size >= MaxRegVF)
15591 break;
15592 }
15593 // All values vectorized - exit.
15594 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
15595 return P.first == 0 && P.second == 0;
15596 }))
15597 break;
15598 // Check if we have tried all attempts or if there is no need for the remaining attempts at all.
15599 if (Repeat >= MaxAttempts ||
15600 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
15601 break;
15602 constexpr unsigned StoresLimit = 64;
15603 const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
15604 Operands.size(),
15605 static_cast<unsigned>(
15606 End -
15607 std::distance(
15608 RangeSizes.begin(),
15609 find_if(RangeSizes, std::bind(IsNotVectorized, true,
15610 std::placeholders::_1))) +
15611 1)));
15612 unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
15613 if (VF > MaxTotalNum || VF >= StoresLimit)
15614 break;
15615 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
15616 if (P.first != 0)
15617 P.first = std::max(P.second, P.first);
15618 });
15619 // Last attempt to vectorize max number of elements, if all previous
15620 // attempts were unsuccessful because of the cost issues.
15621 CandidateVFs.clear();
15622 CandidateVFs.push_back(VF);
15623 }
15624 }
15625 };
15626
15627 // Stores a pair (first: index of the store in the Stores array ref whose
15628 // address is taken as the base; second: sorted set of pairs {index, dist},
15629 // which are indices of stores in the set and their store location distances
15630 // relative to the base address).
15631
15632 // Need to store the index of the very first store separately, since the set
15633 // may be reordered after the insertion and the first store may be moved. This
15634 // container allows us to reduce the number of calls to the getPointersDiff() function.
15635 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
15636 // Inserts the specified store SI with the given index Idx to the set of the
15637 // stores. If the store with the same distance is found already - stop
15638 // insertion, try to vectorize already found stores. If some stores from this
15639 // sequence were not vectorized - try to vectorize them with the new store
15640 // later. But this logic is applied only to the stores that come before the
15641 // previous store with the same distance.
15642 // Example:
15643 // 1. store x, %p
15644 // 2. store y, %p+1
15645 // 3. store z, %p+2
15646 // 4. store a, %p
15647 // 5. store b, %p+3
15648 // - Scan this from the last to first store. The very first bunch of stores is
15649 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
15650 // vector).
15651 // - The next store in the list - #1 - has the same distance from store #5 as
15652 // the store #4.
15653 // - Try to vectorize sequence of stores 4,2,3,5.
15654 // - If all these stores are vectorized - just drop them.
15655 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
15656 // - Start new stores sequence.
15657 // The new bunch of stores is {1, {1, 0}}.
15658 // - Add the stores from previous sequence, that were not vectorized.
15659 // Here we consider the stores in reverse order, rather than the order in which
15660 // they appear in the IR (Stores are already reversed, see the vectorizeStoreChains() function).
15661 // Store #3 can be added -> comes after store #4 with the same distance as
15662 // store #1.
15663 // Store #5 cannot be added - comes before store #4.
15664 // This logic helps to improve compile time: we assume that the stores after a
15665 // previous store with the same distance most likely have memory dependencies,
15666 // so there is no need to waste compile time trying to vectorize them.
15667 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
15668 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
15669 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
15670 std::optional<int> Diff = getPointersDiff(
15671 Stores[Set.first]->getValueOperand()->getType(),
15672 Stores[Set.first]->getPointerOperand(),
15673 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
15674 /*StrictCheck=*/true);
15675 if (!Diff)
15676 continue;
15677 auto It = Set.second.find(std::make_pair(Idx, *Diff));
15678 if (It == Set.second.end()) {
15679 Set.second.emplace(Idx, *Diff);
15680 return;
15681 }
15682 // Try to vectorize the first found set to avoid duplicate analysis.
15683 TryToVectorize(Set.second);
15684 StoreIndexToDistSet PrevSet;
15685 PrevSet.swap(Set.second);
15686 Set.first = Idx;
15687 Set.second.emplace(Idx, 0);
15688 // Insert stores that followed previous match to try to vectorize them
15689 // with this store.
15690 unsigned StartIdx = It->first + 1;
15691 SmallBitVector UsedStores(Idx - StartIdx);
15692 // Distances to previously found dup store (or this store, since they
15693 // store to the same addresses).
15694 SmallVector<int> Dists(Idx - StartIdx, 0);
15695 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
15696 // Do not try to vectorize sequences we have already tried.
15697 if (Pair.first <= It->first ||
15698 VectorizedStores.contains(Stores[Pair.first]))
15699 break;
15700 unsigned BI = Pair.first - StartIdx;
15701 UsedStores.set(BI);
15702 Dists[BI] = Pair.second - It->second;
15703 }
15704 for (unsigned I = StartIdx; I < Idx; ++I) {
15705 unsigned BI = I - StartIdx;
15706 if (UsedStores.test(BI))
15707 Set.second.emplace(I, Dists[BI]);
15708 }
15709 return;
15710 }
15711 auto &Res = SortedStores.emplace_back();
15712 Res.first = Idx;
15713 Res.second.emplace(Idx, 0);
15714 };
15715 StoreInst *PrevStore = Stores.front();
15716 for (auto [I, SI] : enumerate(Stores)) {
15717 // Check that we do not try to vectorize stores of different types.
15718 if (PrevStore->getValueOperand()->getType() !=
15719 SI->getValueOperand()->getType()) {
15720 for (auto &Set : SortedStores)
15721 TryToVectorize(Set.second);
15722 SortedStores.clear();
15723 PrevStore = SI;
15724 }
15725 FillStoresSet(I, SI);
15726 }
15727
15728 // Final vectorization attempt.
15729 for (auto &Set : SortedStores)
15730 TryToVectorize(Set.second);
15731
15732 return Changed;
15733}
15734
15735void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
15736 // Initialize the collections. We will make a single pass over the block.
15737 Stores.clear();
15738 GEPs.clear();
15739
15740 // Visit the store and getelementptr instructions in BB and organize them in
15741 // Stores and GEPs according to the underlying objects of their pointer
15742 // operands.
15743 for (Instruction &I : *BB) {
15744 // Ignore store instructions that are volatile or have a pointer operand
15745 // that doesn't point to a scalar type.
15746 if (auto *SI = dyn_cast<StoreInst>(&I)) {
15747 if (!SI->isSimple())
15748 continue;
15749 if (!isValidElementType(SI->getValueOperand()->getType()))
15750 continue;
15751 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
15752 }
15753
15754 // Ignore getelementptr instructions that have more than one index, a
15755 // constant index, or a pointer operand that doesn't point to a scalar
15756 // type.
15757 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
15758 if (GEP->getNumIndices() != 1)
15759 continue;
15760 Value *Idx = GEP->idx_begin()->get();
15761 if (isa<Constant>(Idx))
15762 continue;
15763 if (!isValidElementType(Idx->getType()))
15764 continue;
15765 if (GEP->getType()->isVectorTy())
15766 continue;
15767 GEPs[GEP->getPointerOperand()].push_back(GEP);
15768 }
15769 }
15770}
15771
15772bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
15773 bool MaxVFOnly) {
15774 if (VL.size() < 2)
15775 return false;
15776
15777 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
15778 << VL.size() << ".\n");
15779
15780 // Check that all of the parts are instructions of the same type;
15781 // we permit an alternate opcode via InstructionsState.
15782 InstructionsState S = getSameOpcode(VL, *TLI);
15783 if (!S.getOpcode())
15784 return false;
15785
15786 Instruction *I0 = cast<Instruction>(S.OpValue);
15787 // Make sure invalid types (including vector type) are rejected before
15788 // determining vectorization factor for scalar instructions.
15789 for (Value *V : VL) {
15790 Type *Ty = V->getType();
15791 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
15792 // NOTE: the following will give the user an internal LLVM type name, which
15793 // may not be useful.
15794 R.getORE()->emit([&]() {
15795 std::string TypeStr;
15796 llvm::raw_string_ostream rso(TypeStr);
15797 Ty->print(rso);
15798 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
15799 << "Cannot SLP vectorize list: type "
15800 << rso.str() + " is unsupported by vectorizer";
15801 });
15802 return false;
15803 }
15804 }
15805
15806 unsigned Sz = R.getVectorElementSize(I0);
15807 unsigned MinVF = R.getMinVF(Sz);
15808 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
15809 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
15810 if (MaxVF < 2) {
15811 R.getORE()->emit([&]() {
15812 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
15813 << "Cannot SLP vectorize list: vectorization factor "
15814 << "less than 2 is not supported";
15815 });
15816 return false;
15817 }
15818
15819 bool Changed = false;
15820 bool CandidateFound = false;
15821 InstructionCost MinCost = SLPCostThreshold.getValue();
15822 Type *ScalarTy = VL[0]->getType();
15823 if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
15824 ScalarTy = IE->getOperand(1)->getType();
15825
15826 unsigned NextInst = 0, MaxInst = VL.size();
15827 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
15828 // No actual vectorization should happen if the number of parts is the same
15829 // as the provided vectorization factor (i.e. the scalar type is used for
15830 // vector code during codegen).
15831 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
15832 if (TTI->getNumberOfParts(VecTy) == VF)
15833 continue;
15834 for (unsigned I = NextInst; I < MaxInst; ++I) {
15835 unsigned ActualVF = std::min(MaxInst - I, VF);
15836
15837 if (!isPowerOf2_32(ActualVF))
15838 continue;
15839
15840 if (MaxVFOnly && ActualVF < MaxVF)
15841 break;
15842 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
15843 break;
15844
15845 ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
15846 // Check that a previous iteration of this loop did not delete the Value.
15847 if (llvm::any_of(Ops, [&R](Value *V) {
15848 auto *I = dyn_cast<Instruction>(V);
15849 return I && R.isDeleted(I);
15850 }))
15851 continue;
15852
15853 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
15854 << "\n");
15855
15856 R.buildTree(Ops);
15857 if (R.isTreeTinyAndNotFullyVectorizable())
15858 continue;
15859 R.reorderTopToBottom();
15860 R.reorderBottomToTop(
15861 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
15862 !R.doesRootHaveInTreeUses());
15863 R.buildExternalUses();
15864
15865 R.computeMinimumValueSizes();
15866 R.transformNodes();
15867 InstructionCost Cost = R.getTreeCost();
15868 CandidateFound = true;
15869 MinCost = std::min(MinCost, Cost);
15870
15871 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
15872 << " for VF=" << ActualVF << "\n");
15873 if (Cost < -SLPCostThreshold) {
15874 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
15875 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
15876 cast<Instruction>(Ops[0]))
15877 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
15878 << " and with tree size "
15879 << ore::NV("TreeSize", R.getTreeSize()));
15880
15881 R.vectorizeTree();
15882 // Move to the next bundle.
15883 I += VF - 1;
15884 NextInst = I + 1;
15885 Changed = true;
15886 }
15887 }
15888 }
15889
15890 if (!Changed && CandidateFound) {
15891 R.getORE()->emit([&]() {
15892 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
15893 << "List vectorization was possible but not beneficial with cost "
15894 << ore::NV("Cost", MinCost) << " >= "
15895 << ore::NV("Treshold", -SLPCostThreshold);
15896 });
15897 } else if (!Changed) {
15898 R.getORE()->emit([&]() {
15899 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
15900 << "Cannot SLP vectorize list: vectorization was impossible"
15901 << " with available vectorization factors";
15902 });
15903 }
15904 return Changed;
15905}
15906
15907bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
15908 if (!I)
15909 return false;
15910
15911 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
15912 return false;
15913
15914 Value *P = I->getParent();
15915
15916 // Vectorize in current basic block only.
15917 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
15918 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
15919 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
15920 return false;
15921
15922 // First collect all possible candidates
15923 SmallVector<std::pair<Value *, Value *>> Candidates;
15924 Candidates.emplace_back(Op0, Op1);
15925
15926 auto *A = dyn_cast<BinaryOperator>(Op0);
15927 auto *B = dyn_cast<BinaryOperator>(Op1);
15928 // Try to skip B.
15929 if (A && B && B->hasOneUse()) {
15930 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
15931 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
15932 if (B0 && B0->getParent() == P)
15933 Candidates.emplace_back(A, B0);
15934 if (B1 && B1->getParent() == P)
15935 Candidates.emplace_back(A, B1);
15936 }
15937 // Try to skip A.
15938 if (B && A && A->hasOneUse()) {
15939 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
15940 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
15941 if (A0 && A0->getParent() == P)
15942 Candidates.emplace_back(A0, B);
15943 if (A1 && A1->getParent() == P)
15944 Candidates.emplace_back(A1, B);
15945 }
15946
15947 if (Candidates.size() == 1)
15948 return tryToVectorizeList({Op0, Op1}, R);
15949
15950 // We have multiple options. Try to pick the single best.
15951 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
15952 if (!BestCandidate)
15953 return false;
15954 return tryToVectorizeList(
15955 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
15956}
15957
15958namespace {
15959
15960/// Model horizontal reductions.
15961///
15962/// A horizontal reduction is a tree of reduction instructions that has values
15963/// that can be put into a vector as its leaves. For example:
15964///
15965/// mul mul mul mul
15966/// \ / \ /
15967/// + +
15968/// \ /
15969/// +
15970/// This tree has "mul" as its leaf values and "+" as its reduction
15971/// instructions. A reduction can feed into a store or a binary operation
15972/// feeding a phi.
15973/// ...
15974/// \ /
15975/// +
15976/// |
15977/// phi +=
15978///
15979/// Or:
15980/// ...
15981/// \ /
15982/// +
15983/// |
15984/// *p =
15985///
15986class HorizontalReduction {
15987 using ReductionOpsType = SmallVector<Value *, 16>;
15988 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
15989 ReductionOpsListType ReductionOps;
15990 /// List of possibly reduced values.
15991 SmallVector<SmallVector<Value *>> ReducedVals;
15992 /// Maps reduced value to the corresponding reduction operation.
15993 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
15994 // Use map vector to make stable output.
15995 MapVector<Instruction *, Value *> ExtraArgs;
15996 WeakTrackingVH ReductionRoot;
15997 /// The type of reduction operation.
15998 RecurKind RdxKind;
15999 /// Checks if the optimization of original scalar identity operations on
16000 /// matched horizontal reductions is enabled and allowed.
16001 bool IsSupportedHorRdxIdentityOp = false;
16002
16003 static bool isCmpSelMinMax(Instruction *I) {
16004 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
16005 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
16006 }
16007
16008 // And/or are potentially poison-safe logical patterns like:
16009 // select x, y, false
16010 // select x, true, y
16011 static bool isBoolLogicOp(Instruction *I) {
16012 return isa<SelectInst>(I) &&
16013 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
16014 }
16015
16016 /// Checks if instruction is associative and can be vectorized.
16017 static bool isVectorizable(RecurKind Kind, Instruction *I) {
16018 if (Kind == RecurKind::None)
16019 return false;
16020
16021 // Integer ops that map to select instructions or intrinsics are fine.
16022 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
16023 isBoolLogicOp(I))
16024 return true;
16025
16026 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16027 // FP min/max are associative except for NaN and -0.0. We do not
16028 // have to rule out -0.0 here because the intrinsic semantics do not
16029 // specify a fixed result for it.
16030 return I->getFastMathFlags().noNaNs();
16031 }
16032
16033 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16034 return true;
16035
16036 return I->isAssociative();
16037 }
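 // Note: for FAdd/FMul the final isAssociative() check only succeeds when the
 // instruction carries the appropriate fast-math flags (reassociation in
 // particular); this is a property of Instruction::isAssociative(), not of SLP.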
16038
16039 static Value *getRdxOperand(Instruction *I, unsigned Index) {
16040 // Poison-safe 'or' takes the form: select X, true, Y
16041 // To make that work with the normal operand processing, we skip the
16042 // true value operand.
16043 // TODO: Change the code and data structures to handle this without a hack.
16044 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
16045 return I->getOperand(2);
16046 return I->getOperand(Index);
16047 }
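 // E.g. for the poison-safe 'or' form "select %x, true, %y", asking for
 // operand index 1 returns %y (operand 2) rather than the constant 'true'.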
16048
16049 /// Creates reduction operation with the current opcode.
16050 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
16051 Value *RHS, const Twine &Name, bool UseSelect) {
16052 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
16053 switch (Kind) {
16054 case RecurKind::Or:
16055 if (UseSelect &&
16056 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16057 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
16058 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16059 Name);
16060 case RecurKind::And:
16061 if (UseSelect &&
16062 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16063 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
16064 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16065 Name);
16066 case RecurKind::Add:
16067 case RecurKind::Mul:
16068 case RecurKind::Xor:
16069 case RecurKind::FAdd:
16070 case RecurKind::FMul:
16071 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16072 Name);
16073 case RecurKind::FMax:
16074 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
16075 case RecurKind::FMin:
16076 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
16077 case RecurKind::FMaximum:
16078 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
16079 case RecurKind::FMinimum:
16080 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
16081 case RecurKind::SMax:
16082 if (UseSelect) {
16083 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
16084 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16085 }
16086 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
16087 case RecurKind::SMin:
16088 if (UseSelect) {
16089 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
16090 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16091 }
16092 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
16093 case RecurKind::UMax:
16094 if (UseSelect) {
16095 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
16096 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16097 }
16098 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
16099 case RecurKind::UMin:
16100 if (UseSelect) {
16101 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
16102 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16103 }
16104 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
16105 default:
16106 llvm_unreachable("Unknown reduction operation.");
16107 }
16108 }
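 // E.g. for RecurKind::SMax this emits either "icmp sgt" + "select" when
 // UseSelect is true, or a single call to the llvm.smax intrinsic otherwise.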
16109
16110 /// Creates reduction operation with the current opcode with the IR flags
16111 /// from \p ReductionOps, dropping nuw/nsw flags.
16112 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
16113 Value *RHS, const Twine &Name,
16114 const ReductionOpsListType &ReductionOps) {
16115 bool UseSelect = ReductionOps.size() == 2 ||
16116 // Logical or/and.
16117 (ReductionOps.size() == 1 &&
16118 any_of(ReductionOps.front(), IsaPred<SelectInst>));
16119 assert((!UseSelect || ReductionOps.size() != 2 ||
16120 isa<SelectInst>(ReductionOps[1][0])) &&
16121 "Expected cmp + select pairs for reduction");
16122 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
16123 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
16124 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
16125 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
16126 /*IncludeWrapFlags=*/false);
16127 propagateIRFlags(Op, ReductionOps[1], nullptr,
16128 /*IncludeWrapFlags=*/false);
16129 return Op;
16130 }
16131 }
16132 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
16133 return Op;
16134 }
16135
16136public:
16137 static RecurKind getRdxKind(Value *V) {
16138 auto *I = dyn_cast<Instruction>(V);
16139 if (!I)
16140 return RecurKind::None;
16141 if (match(I, m_Add(m_Value(), m_Value())))
16142 return RecurKind::Add;
16143 if (match(I, m_Mul(m_Value(), m_Value())))
16144 return RecurKind::Mul;
16145 if (match(I, m_And(m_Value(), m_Value())) ||
16146 match(I, m_LogicalAnd(m_Value(), m_Value())))
16147 return RecurKind::And;
16148 if (match(I, m_Or(m_Value(), m_Value())) ||
16149 match(I, m_LogicalOr(m_Value(), m_Value())))
16150 return RecurKind::Or;
16151 if (match(I, m_Xor(m_Value(), m_Value())))
16152 return RecurKind::Xor;
16153 if (match(I, m_FAdd(m_Value(), m_Value())))
16154 return RecurKind::FAdd;
16155 if (match(I, m_FMul(m_Value(), m_Value())))
16156 return RecurKind::FMul;
16157
16158 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
16159 return RecurKind::FMax;
16160 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
16161 return RecurKind::FMin;
16162
16163 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
16164 return RecurKind::FMaximum;
16165 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
16166 return RecurKind::FMinimum;
16167 // This matches either cmp+select or intrinsics. SLP is expected to handle
16168 // either form.
16169 // TODO: If we are canonicalizing to intrinsics, we can remove several
16170 // special-case paths that deal with selects.
16171 if (match(I, m_SMax(m_Value(), m_Value())))
16172 return RecurKind::SMax;
16173 if (match(I, m_SMin(m_Value(), m_Value())))
16174 return RecurKind::SMin;
16175 if (match(I, m_UMax(m_Value(), m_Value())))
16176 return RecurKind::UMax;
16177 if (match(I, m_UMin(m_Value(), m_Value())))
16178 return RecurKind::UMin;
16179
16180 if (auto *Select = dyn_cast<SelectInst>(I)) {
16181 // Try harder: look for min/max pattern based on instructions producing
16182 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
16183 // During the intermediate stages of SLP, it's very common to have
16184 // pattern like this (since optimizeGatherSequence is run only once
16185 // at the end):
16186 // %1 = extractelement <2 x i32> %a, i32 0
16187 // %2 = extractelement <2 x i32> %a, i32 1
16188 // %cond = icmp sgt i32 %1, %2
16189 // %3 = extractelement <2 x i32> %a, i32 0
16190 // %4 = extractelement <2 x i32> %a, i32 1
16191 // %select = select i1 %cond, i32 %3, i32 %4
16192 CmpInst::Predicate Pred;
16193 Instruction *L1;
16194 Instruction *L2;
16195
16196 Value *LHS = Select->getTrueValue();
16197 Value *RHS = Select->getFalseValue();
16198 Value *Cond = Select->getCondition();
16199
16200 // TODO: Support inverse predicates.
16201 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
16202 if (!isa<ExtractElementInst>(RHS) ||
16203 !L2->isIdenticalTo(cast<Instruction>(RHS)))
16204 return RecurKind::None;
16205 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
16206 if (!isa<ExtractElementInst>(LHS) ||
16207 !L1->isIdenticalTo(cast<Instruction>(LHS)))
16208 return RecurKind::None;
16209 } else {
16210 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
16211 return RecurKind::None;
16212 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
16213 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
16214 !L2->isIdenticalTo(cast<Instruction>(RHS)))
16215 return RecurKind::None;
16216 }
16217
16218 switch (Pred) {
16219 default:
16220 return RecurKind::None;
16221 case CmpInst::ICMP_SGT:
16222 case CmpInst::ICMP_SGE:
16223 return RecurKind::SMax;
16224 case CmpInst::ICMP_SLT:
16225 case CmpInst::ICMP_SLE:
16226 return RecurKind::SMin;
16227 case CmpInst::ICMP_UGT:
16228 case CmpInst::ICMP_UGE:
16229 return RecurKind::UMax;
16230 case CmpInst::ICMP_ULT:
16231 case CmpInst::ICMP_ULE:
16232 return RecurKind::UMin;
16233 }
16234 }
16235 return RecurKind::None;
16236 }
16237
16238 /// Get the index of the first operand.
16239 static unsigned getFirstOperandIndex(Instruction *I) {
16240 return isCmpSelMinMax(I) ? 1 : 0;
16241 }
16242
16243private:
16244 /// Total number of operands in the reduction operation.
16245 static unsigned getNumberOfOperands(Instruction *I) {
16246 return isCmpSelMinMax(I) ? 3 : 2;
16247 }
16248
16249 /// Checks if the instruction is in basic block \p BB.
16250 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16251 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16252 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16253 auto *Sel = cast<SelectInst>(I);
16254 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
16255 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16256 }
16257 return I->getParent() == BB;
16258 }
16259
16260 /// Expected number of uses for reduction operations/reduced values.
16261 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16262 if (IsCmpSelMinMax) {
16263 // SelectInst must be used twice while the condition op must have single
16264 // use only.
16265 if (auto *Sel = dyn_cast<SelectInst>(I))
16266 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
16267 return I->hasNUses(2);
16268 }
16269
16270 // Arithmetic reduction operation must be used once only.
16271 return I->hasOneUse();
16272 }
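 // The two uses expected for a min/max select come from the next link of the
 // reduction chain: the select result feeds both the next compare and the next
 // select, e.g. %s1 = select(%c1, %a, %b); %c2 = icmp(%s1, %x);
 // %s2 = select(%c2, %s1, %x).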
16273
16274 /// Initializes the list of reduction operations.
16275 void initReductionOps(Instruction *I) {
16276 if (isCmpSelMinMax(I))
16277 ReductionOps.assign(2, ReductionOpsType());
16278 else
16279 ReductionOps.assign(1, ReductionOpsType());
16280 }
16281
16282 /// Add all reduction operations for the reduction instruction \p I.
16283 void addReductionOps(Instruction *I) {
16284 if (isCmpSelMinMax(I)) {
16285 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
16286 ReductionOps[1].emplace_back(I);
16287 } else {
16288 ReductionOps[0].emplace_back(I);
16289 }
16290 }
16291
16292 static bool isGoodForReduction(ArrayRef<Value *> Data) {
16293 int Sz = Data.size();
16294 auto *I = dyn_cast<Instruction>(Data.front());
16295 return Sz > 1 || isConstant(Data.front()) ||
16296 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
16297 }
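 // E.g. a group consisting of a single non-constant load is not considered
 // good for reduction on its own; such values may instead be merged into other
 // groups below when they share the same underlying pointer object.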
16298
16299public:
16300 HorizontalReduction() = default;
16301
16302 /// Try to find a reduction tree.
16303 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16304 ScalarEvolution &SE, const DataLayout &DL,
16305 const TargetLibraryInfo &TLI) {
16306 RdxKind = HorizontalReduction::getRdxKind(Root);
16307 if (!isVectorizable(RdxKind, Root))
16308 return false;
16309
16310 // Analyze "regular" integer/FP types for reductions - no target-specific
16311 // types or pointers.
16312 Type *Ty = Root->getType();
16313 if (!isValidElementType(Ty) || Ty->isPointerTy())
16314 return false;
16315
16316 // Though the ultimate reduction may have multiple uses, its condition must
16317 // have only single use.
16318 if (auto *Sel = dyn_cast<SelectInst>(Root))
16319 if (!Sel->getCondition()->hasOneUse())
16320 return false;
16321
16322 ReductionRoot = Root;
16323
16324 // Iterate through all the operands of the possible reduction tree and
16325 // gather all the reduced values, sorting them by their value id.
16326 BasicBlock *BB = Root->getParent();
16327 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
16328 SmallVector<Instruction *> Worklist(1, Root);
16329 // Checks if the operands of the \p TreeN instruction are also reduction
16330 // operations or should be treated as reduced values or an extra argument,
16331 // which is not part of the reduction.
16332 auto CheckOperands = [&](Instruction *TreeN,
16333 SmallVectorImpl<Value *> &ExtraArgs,
16334 SmallVectorImpl<Value *> &PossibleReducedVals,
16335 SmallVectorImpl<Instruction *> &ReductionOps) {
16336 for (int I = getFirstOperandIndex(TreeN),
16337 End = getNumberOfOperands(TreeN);
16338 I < End; ++I) {
16339 Value *EdgeVal = getRdxOperand(TreeN, I);
16340 ReducedValsToOps[EdgeVal].push_back(TreeN);
16341 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
16342 // Edge has wrong parent - mark as an extra argument.
16343 if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
16344 !hasSameParent(EdgeInst, BB)) {
16345 ExtraArgs.push_back(EdgeVal);
16346 continue;
16347 }
16348 // If the edge is not an instruction, or it differs from the main reduction
16349 // opcode, or it has too many uses, treat it as a possible reduced value.
16350 // Also, do not try to reduce constant values if the operation is not
16351 // foldable.
16352 if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
16353 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
16354 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
16355 !isVectorizable(RdxKind, EdgeInst) ||
16356 (R.isAnalyzedReductionRoot(EdgeInst) &&
16357 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
16358 PossibleReducedVals.push_back(EdgeVal);
16359 continue;
16360 }
16361 ReductionOps.push_back(EdgeInst);
16362 }
16363 };
16364 // Try to regroup the reduced values so that it becomes more profitable to
16365 // reduce them. Values are grouped by their value ids, instructions by
16366 // instruction opcode id and/or alternate opcode id, plus extra analysis is
16367 // done for loads (grouping them by the distance between pointers) and cmp
16368 // instructions (grouping them by the predicate).
16369 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
16370 PossibleReducedVals;
16371 initReductionOps(Root);
16372 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
16373 SmallSet<size_t, 2> LoadKeyUsed;
16374 SmallPtrSet<Value *, 4> DoNotReverseVals;
16375
16376 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
16377 Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
16378 if (LoadKeyUsed.contains(Key)) {
16379 auto LIt = LoadsMap.find(Ptr);
16380 if (LIt != LoadsMap.end()) {
16381 for (LoadInst *RLI : LIt->second) {
16382 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
16383 LI->getType(), LI->getPointerOperand(), DL, SE,
16384 /*StrictCheck=*/true))
16385 return hash_value(RLI->getPointerOperand());
16386 }
16387 for (LoadInst *RLI : LIt->second) {
16389 LI->getPointerOperand(), TLI)) {
16390 hash_code SubKey = hash_value(RLI->getPointerOperand());
16391 DoNotReverseVals.insert(RLI);
16392 return SubKey;
16393 }
16394 }
16395 if (LIt->second.size() > 2) {
16396 hash_code SubKey =
16397 hash_value(LIt->second.back()->getPointerOperand());
16398 DoNotReverseVals.insert(LIt->second.back());
16399 return SubKey;
16400 }
16401 }
16402 }
16403 LoadKeyUsed.insert(Key);
16404 LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
16405 return hash_value(LI->getPointerOperand());
16406 };
16407
16408 while (!Worklist.empty()) {
16409 Instruction *TreeN = Worklist.pop_back_val();
16410 SmallVector<Value *> Args;
16411 SmallVector<Value *> PossibleRedVals;
16412 SmallVector<Instruction *> PossibleReductionOps;
16413 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16414 // If too many extra args - mark the instruction itself as a reduction
16415 // value, not a reduction operation.
16416 if (Args.size() < 2) {
16417 addReductionOps(TreeN);
16418 // Add extra args.
16419 if (!Args.empty()) {
16420 assert(Args.size() == 1 && "Expected only single argument.");
16421 ExtraArgs[TreeN] = Args.front();
16422 }
16423 // Add reduction values. The values are sorted for better vectorization
16424 // results.
16425 for (Value *V : PossibleRedVals) {
16426 size_t Key, Idx;
16427 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
16428 /*AllowAlternate=*/false);
16429 ++PossibleReducedVals[Key][Idx]
16430 .insert(std::make_pair(V, 0))
16431 .first->second;
16432 }
16433 Worklist.append(PossibleReductionOps.rbegin(),
16434 PossibleReductionOps.rend());
16435 } else {
16436 size_t Key, Idx;
16437 std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
16438 /*AllowAlternate=*/false);
16439 ++PossibleReducedVals[Key][Idx]
16440 .insert(std::make_pair(TreeN, 0))
16441 .first->second;
16442 }
16443 }
16444 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
16445 // Sort values by the total number of value kinds so that the reduction
16446 // starts from the longest possible sequences of reduced values.
16447 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
16448 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
16449 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
16450 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
16451 It != E; ++It) {
16452 PossibleRedValsVect.emplace_back();
16453 auto RedValsVect = It->second.takeVector();
16454 stable_sort(RedValsVect, llvm::less_second());
16455 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
16456 PossibleRedValsVect.back().append(Data.second, Data.first);
16457 }
16458 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
16459 return P1.size() > P2.size();
16460 });
16461 int NewIdx = -1;
16462 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
16463 if (isGoodForReduction(Data) ||
16464 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
16465 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
16467 cast<LoadInst>(Data.front())->getPointerOperand()) ==
16468 getUnderlyingObject(cast<LoadInst>(ReducedVals[NewIdx].front())
16469 ->getPointerOperand()))) {
16470 if (NewIdx < 0) {
16471 NewIdx = ReducedVals.size();
16472 ReducedVals.emplace_back();
16473 }
16474 if (DoNotReverseVals.contains(Data.front()))
16475 ReducedVals[NewIdx].append(Data.begin(), Data.end());
16476 else
16477 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
16478 } else {
16479 ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
16480 }
16481 }
16482 }
16483 // Sort the reduced values by the number of values with the same/alternate
16484 // opcode and/or pointer operand.
16485 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
16486 return P1.size() > P2.size();
16487 });
16488 return true;
16489 }
16490
16491 /// Attempt to vectorize the tree found by matchAssociativeReduction.
16492 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
16493 const TargetLibraryInfo &TLI) {
16494 constexpr int ReductionLimit = 4;
16495 constexpr unsigned RegMaxNumber = 4;
16496 constexpr unsigned RedValsMaxNumber = 128;
16497 // If there are a sufficient number of reduction values, reduce
16498 // to a nearby power-of-2. We can safely generate oversized
16499 // vectors and rely on the backend to split them to legal sizes.
16500 unsigned NumReducedVals =
16501 std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
16502 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
16503 if (!isGoodForReduction(Vals))
16504 return Num;
16505 return Num + Vals.size();
16506 });
16507 if (NumReducedVals < ReductionLimit &&
16509 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
16510 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
16511 }))) {
16512 for (ReductionOpsType &RdxOps : ReductionOps)
16513 for (Value *RdxOp : RdxOps)
16514 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16515 return nullptr;
16516 }
16517
16518 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
16519 TargetFolder(DL));
16520 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
16521
16522 // Track the reduced values in case they are replaced by extractelement
16523 // instructions because of the vectorization.
16525 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
16526 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
16527 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
16528 ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
16529 // The same extra argument may be used several times, so log each attempt
16530 // to use it.
16531 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16532 assert(Pair.first && "DebugLoc must be set.");
16533 ExternallyUsedValues[Pair.second].push_back(Pair.first);
16534 TrackedVals.try_emplace(Pair.second, Pair.second);
16535 }
16536
16537 // The compare instruction of a min/max is the insertion point for new
16538 // instructions and may be replaced with a new compare instruction.
16539 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
16540 assert(isa<SelectInst>(RdxRootInst) &&
16541 "Expected min/max reduction to have select root instruction");
16542 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
16543 assert(isa<Instruction>(ScalarCond) &&
16544 "Expected min/max reduction to have compare condition");
16545 return cast<Instruction>(ScalarCond);
16546 };
16547
16548 // Return new VectorizedTree, based on previous value.
16549 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
16550 if (VectorizedTree) {
16551 // Update the final value in the reduction.
16553 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
16554 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
16556 !isGuaranteedNotToBePoison(VectorizedTree))) {
16557 auto It = ReducedValsToOps.find(Res);
16558 if (It != ReducedValsToOps.end() &&
16559 any_of(It->getSecond(),
16560 [](Instruction *I) { return isBoolLogicOp(I); }))
16561 std::swap(VectorizedTree, Res);
16562 }
16563
16564 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
16565 ReductionOps);
16566 }
16567 // Initialize the final value in the reduction.
16568 return Res;
16569 };
16570 bool AnyBoolLogicOp =
16571 any_of(ReductionOps.back(), [](Value *V) {
16572 return isBoolLogicOp(cast<Instruction>(V));
16573 });
16574 // The reduction root is used as the insertion point for new instructions,
16575 // so set it as externally used to prevent it from being deleted.
16576 ExternallyUsedValues[ReductionRoot];
16577 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
16578 ReductionOps.front().size());
16579 for (ReductionOpsType &RdxOps : ReductionOps)
16580 for (Value *RdxOp : RdxOps) {
16581 if (!RdxOp)
16582 continue;
16583 IgnoreList.insert(RdxOp);
16584 }
16585 // Intersect the fast-math-flags from all reduction operations.
16586 FastMathFlags RdxFMF;
16587 RdxFMF.set();
16588 for (Value *U : IgnoreList)
16589 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
16590 RdxFMF &= FPMO->getFastMathFlags();
16591 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
16592
16593 // Need to track reduced vals, they may be changed during vectorization of
16594 // subvectors.
16595 for (ArrayRef<Value *> Candidates : ReducedVals)
16596 for (Value *V : Candidates)
16597 TrackedVals.try_emplace(V, V);
16598
16599 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
16600 // List of the values that were reduced in other trees as part of gather
16601 // nodes and thus require an extract if fully vectorized in other trees.
16602 SmallPtrSet<Value *, 4> RequiredExtract;
16603 Value *VectorizedTree = nullptr;
16604 bool CheckForReusedReductionOps = false;
16605 // Try to vectorize elements based on their type.
16606 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
16607 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
16608 InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
16609 SmallVector<Value *> Candidates;
16610 Candidates.reserve(2 * OrigReducedVals.size());
16611 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
16612 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
16613 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
16614 // Check if the reduction value was not overridden by an extractelement
16615 // instruction because of the vectorization, and exclude it if it is not
16616 // compatible with the other values.
16617 // Also check if the instruction was folded to constant/other value.
16618 auto *Inst = dyn_cast<Instruction>(RdxVal);
16619 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
16620 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
16621 (S.getOpcode() && !Inst))
16622 continue;
16623 Candidates.push_back(RdxVal);
16624 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
16625 }
16626 bool ShuffledExtracts = false;
16627 // Try to handle shuffled extractelements.
16628 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
16629 I + 1 < E) {
16630 InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
16631 if (NextS.getOpcode() == Instruction::ExtractElement &&
16632 !NextS.isAltShuffle()) {
16633 SmallVector<Value *> CommonCandidates(Candidates);
16634 for (Value *RV : ReducedVals[I + 1]) {
16635 Value *RdxVal = TrackedVals.find(RV)->second;
16636 // Check if the reduction value was not overridden by the
16637 // extractelement instruction because of the vectorization, and
16638 // exclude it if it is not compatible with the other values.
16639 if (auto *Inst = dyn_cast<Instruction>(RdxVal))
16640 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
16641 continue;
16642 CommonCandidates.push_back(RdxVal);
16643 TrackedToOrig.try_emplace(RdxVal, RV);
16644 }
16646 if (isFixedVectorShuffle(CommonCandidates, Mask)) {
16647 ++I;
16648 Candidates.swap(CommonCandidates);
16649 ShuffledExtracts = true;
16650 }
16651 }
16652 }
16653
16654 // Emit code for constant values.
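 // (E.g. with the TargetFolder-based builder, an integer add reduction over
 // the constants 1, 2, 3 and 4 simply folds to the constant 10.)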
16655 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
16656 allConstant(Candidates)) {
16657 Value *Res = Candidates.front();
16658 ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
16659 for (Value *VC : ArrayRef(Candidates).drop_front()) {
16660 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
16661 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
16662 if (auto *ResI = dyn_cast<Instruction>(Res))
16663 V.analyzedReductionRoot(ResI);
16664 }
16665 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
16666 continue;
16667 }
16668
16669 unsigned NumReducedVals = Candidates.size();
16670 if (NumReducedVals < ReductionLimit &&
16671 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
16672 !isSplat(Candidates)))
16673 continue;
16674
16675 // Check if we support processing of repeated scalar values (optimization
16676 // of original scalar identity operations on matched horizontal reductions).
16677 IsSupportedHorRdxIdentityOp =
16678 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
16679 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
16680 // Gather same values.
16681 MapVector<Value *, unsigned> SameValuesCounter;
16682 if (IsSupportedHorRdxIdentityOp)
16683 for (Value *V : Candidates)
16684 ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
16685 // Used to check if the reduced values are used the same number of times. In
16686 // this case the compiler may produce better code. E.g. if the reduced values
16687 // are aabbccdd (8 x values), then the first node of the tree will have a node
16688 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
16689 // Plus, the final reduction will be performed on <8 x aabbccdd>.
16690 // Instead, the compiler may build the <4 x abcd> tree immediately and emit
16691 // reduction(4 x abcd) * 2.
16692 // Currently it only handles add/fadd/xor; and/or/min/max do not require
16693 // this analysis, and other operations may require an extra estimation of
16694 // the profitability.
16695 bool SameScaleFactor = false;
16696 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
16697 SameValuesCounter.size() != Candidates.size();
16698 if (OptReusedScalars) {
16699 SameScaleFactor =
16700 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
16701 RdxKind == RecurKind::Xor) &&
16702 all_of(drop_begin(SameValuesCounter),
16703 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
16704 return P.second == SameValuesCounter.front().second;
16705 });
16706 Candidates.resize(SameValuesCounter.size());
16707 transform(SameValuesCounter, Candidates.begin(),
16708 [](const auto &P) { return P.first; });
16709 NumReducedVals = Candidates.size();
16710 // Have a reduction of the same element.
16711 if (NumReducedVals == 1) {
16712 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
16713 unsigned Cnt = SameValuesCounter.lookup(OrigV);
16714 Value *RedVal =
16715 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
16716 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16717 VectorizedVals.try_emplace(OrigV, Cnt);
16718 continue;
16719 }
16720 }
16721
16722 unsigned MaxVecRegSize = V.getMaxVecRegSize();
16723 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
16724 unsigned MaxElts =
16725 RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
16726
16727 unsigned ReduxWidth = std::min<unsigned>(
16728 llvm::bit_floor(NumReducedVals),
16729 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
16730 RegMaxNumber * RedValsMaxNumber));
16731 unsigned Start = 0;
16732 unsigned Pos = Start;
16733 // Restarts the vectorization attempt with a lower vector factor.
16734 unsigned PrevReduxWidth = ReduxWidth;
16735 bool CheckForReusedReductionOpsLocal = false;
16736 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
16737 &CheckForReusedReductionOpsLocal,
16738 &PrevReduxWidth, &V,
16739 &IgnoreList](bool IgnoreVL = false) {
16740 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
16741 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
16742 // Check if any of the reduction ops are gathered. If so, it is worth
16743 // trying again with a smaller number of reduction ops.
16744 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
16745 }
16746 ++Pos;
16747 if (Pos < NumReducedVals - ReduxWidth + 1)
16748 return IsAnyRedOpGathered;
16749 Pos = Start;
16750 ReduxWidth /= 2;
16751 return IsAnyRedOpGathered;
16752 };
16753 bool AnyVectorized = false;
16754 while (Pos < NumReducedVals - ReduxWidth + 1 &&
16755 ReduxWidth >= ReductionLimit) {
16756 // Dependency in the tree of the reduction ops - drop this attempt and
16757 // try again later.
16758 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
16759 Start == 0) {
16760 CheckForReusedReductionOps = true;
16761 break;
16762 }
16763 PrevReduxWidth = ReduxWidth;
16764 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
16765 // Already analyzed - skip.
16766 if (V.areAnalyzedReductionVals(VL)) {
16767 (void)AdjustReducedVals(/*IgnoreVL=*/true);
16768 continue;
16769 }
16770 // Early exit if any of the reduction values were deleted during
16771 // previous vectorization attempts.
16772 if (any_of(VL, [&V](Value *RedVal) {
16773 auto *RedValI = dyn_cast<Instruction>(RedVal);
16774 if (!RedValI)
16775 return false;
16776 return V.isDeleted(RedValI);
16777 }))
16778 break;
16779 V.buildTree(VL, IgnoreList);
16780 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
16781 if (!AdjustReducedVals())
16782 V.analyzedReductionVals(VL);
16783 continue;
16784 }
16785 if (V.isLoadCombineReductionCandidate(RdxKind)) {
16786 if (!AdjustReducedVals())
16787 V.analyzedReductionVals(VL);
16788 continue;
16789 }
16790 V.reorderTopToBottom();
16791 // No need to reorder the root node at all.
16792 V.reorderBottomToTop(/*IgnoreReorder=*/true);
16793 // Keep other extracted reduction values, if they are used in the
16794 // vectorization trees.
16795 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
16796 ExternallyUsedValues);
16797 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
16798 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
16799 continue;
16800 for (Value *V : ReducedVals[Cnt])
16801 if (isa<Instruction>(V))
16802 LocalExternallyUsedValues[TrackedVals[V]];
16803 }
16804 if (!IsSupportedHorRdxIdentityOp) {
16805 // Number of uses of the candidates in the vector of values.
16806 assert(SameValuesCounter.empty() &&
16807 "Reused values counter map is not empty");
16808 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16809 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16810 continue;
16811 Value *V = Candidates[Cnt];
16812 Value *OrigV = TrackedToOrig.find(V)->second;
16813 ++SameValuesCounter[OrigV];
16814 }
16815 }
16816 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
16817 // Gather externally used values.
16819 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16820 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16821 continue;
16822 Value *RdxVal = Candidates[Cnt];
16823 if (!Visited.insert(RdxVal).second)
16824 continue;
16825 // Check if the scalar was vectorized as part of the vectorization
16826 // tree but not the top node.
16827 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
16828 LocalExternallyUsedValues[RdxVal];
16829 continue;
16830 }
16831 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
16832 unsigned NumOps =
16833 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
16834 if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
16835 LocalExternallyUsedValues[RdxVal];
16836 }
16837 // Do not need the list of reused scalars in regular mode anymore.
16838 if (!IsSupportedHorRdxIdentityOp)
16839 SameValuesCounter.clear();
16840 for (Value *RdxVal : VL)
16841 if (RequiredExtract.contains(RdxVal))
16842 LocalExternallyUsedValues[RdxVal];
16843 // Update LocalExternallyUsedValues for the scalars replaced by
16844 // extractelement instructions.
16845 DenseMap<Value *, Value *> ReplacementToExternal;
16846 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
16847 ReplacementToExternal.try_emplace(Pair.second, Pair.first);
16848 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
16849 Value *Ext = Pair.first;
16850 auto RIt = ReplacementToExternal.find(Ext);
16851 while (RIt != ReplacementToExternal.end()) {
16852 Ext = RIt->second;
16853 RIt = ReplacementToExternal.find(Ext);
16854 }
16855 auto *It = ExternallyUsedValues.find(Ext);
16856 if (It == ExternallyUsedValues.end())
16857 continue;
16858 LocalExternallyUsedValues[Pair.second].append(It->second);
16859 }
16860 V.buildExternalUses(LocalExternallyUsedValues);
16861
16862 V.computeMinimumValueSizes();
16863 V.transformNodes();
16864
16865 // Estimate cost.
16866 InstructionCost TreeCost = V.getTreeCost(VL);
16867 InstructionCost ReductionCost =
16868 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
16869 InstructionCost Cost = TreeCost + ReductionCost;
16870 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16871 << " for reduction\n");
16872 if (!Cost.isValid())
16873 break;
16874 if (Cost >= -SLPCostThreshold) {
16875 V.getORE()->emit([&]() {
16877 SV_NAME, "HorSLPNotBeneficial",
16878 ReducedValsToOps.find(VL[0])->second.front())
16879 << "Vectorizing horizontal reduction is possible "
16880 << "but not beneficial with cost " << ore::NV("Cost", Cost)
16881 << " and threshold "
16882 << ore::NV("Threshold", -SLPCostThreshold);
16883 });
16884 if (!AdjustReducedVals())
16885 V.analyzedReductionVals(VL);
16886 continue;
16887 }
16888
16889 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
16890 << Cost << ". (HorRdx)\n");
16891 V.getORE()->emit([&]() {
16892 return OptimizationRemark(
16893 SV_NAME, "VectorizedHorizontalReduction",
16894 ReducedValsToOps.find(VL[0])->second.front())
16895 << "Vectorized horizontal reduction with cost "
16896 << ore::NV("Cost", Cost) << " and with tree size "
16897 << ore::NV("TreeSize", V.getTreeSize());
16898 });
16899
16900 Builder.setFastMathFlags(RdxFMF);
16901
16902 // Emit a reduction. If the root is a select (min/max idiom), the insert
16903 // point is the compare condition of that select.
16904 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
16905 Instruction *InsertPt = RdxRootInst;
16906 if (IsCmpSelMinMax)
16907 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
16908
16909 // Vectorize a tree.
16910 Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
16911 ReplacedExternals, InsertPt);
16912
16913 Builder.SetInsertPoint(InsertPt);
16914
16915 // To prevent poison from leaking across what used to be sequential,
16916 // safe, scalar boolean logic operations, the reduction operand must be
16917 // frozen.
16918 if ((isBoolLogicOp(RdxRootInst) ||
16919 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
16920 !isGuaranteedNotToBePoison(VectorizedRoot))
16921 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
16922
16923 // Emit code to correctly handle reused reduced values, if required.
16924 if (OptReusedScalars && !SameScaleFactor) {
16925 VectorizedRoot =
16926 emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
16927 SameValuesCounter, TrackedToOrig);
16928 }
16929
16930 Value *ReducedSubTree =
16931 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
16932 if (ReducedSubTree->getType() != VL.front()->getType()) {
16933 ReducedSubTree = Builder.CreateIntCast(
16934 ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
16936 R, cast<Instruction>(ReductionOps.front().front())
16937 ->getModule()
16938 ->getDataLayout());
16939 return !Known.isNonNegative();
16940 }));
16941 }
16942
16943 // Improved analysis for add/fadd/xor reductions with the same scale
16944 // factor for all reduction operands. We can emit scalar ops for them
16945 // instead.
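 // (E.g. an add reduction over <a, a, b, b, c, c> can be vectorized as a
 // reduction over <a, b, c> followed by a single scalar multiply by 2.)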
16946 if (OptReusedScalars && SameScaleFactor)
16947 ReducedSubTree = emitScaleForReusedOps(
16948 ReducedSubTree, Builder, SameValuesCounter.front().second);
16949
16950 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
16951 // Count vectorized reduced values to exclude them from final reduction.
16952 for (Value *RdxVal : VL) {
16953 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
16954 if (IsSupportedHorRdxIdentityOp) {
16955 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
16956 continue;
16957 }
16958 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
16959 if (!V.isVectorized(RdxVal))
16960 RequiredExtract.insert(RdxVal);
16961 }
16962 Pos += ReduxWidth;
16963 Start = Pos;
16964 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
16965 AnyVectorized = true;
16966 }
16967 if (OptReusedScalars && !AnyVectorized) {
16968 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
16969 Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
16970 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16971 Value *OrigV = TrackedToOrig.find(P.first)->second;
16972 VectorizedVals.try_emplace(OrigV, P.second);
16973 }
16974 continue;
16975 }
16976 }
16977 if (VectorizedTree) {
16978 // Reorder the operands of a bool logical op into the natural order to
16979 // avoid possible problems with poison propagation. If reordering is not
16980 // possible (both operands are originally RHS), emit an extra freeze
16981 // instruction for the LHS operand.
16982 // I.e., if we have original code like this:
16983 // RedOp1 = select i1 ?, i1 LHS, i1 false
16984 // RedOp2 = select i1 RHS, i1 ?, i1 false
16985
16986 // Then, we swap LHS/RHS to create a new op that matches the poison
16987 // semantics of the original code.
16988
16989 // If we have original code like this and both values could be poison:
16990 // RedOp1 = select i1 ?, i1 LHS, i1 false
16991 // RedOp2 = select i1 ?, i1 RHS, i1 false
16992
16993 // Then, we must freeze LHS in the new op.
16994 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
16995 Instruction *RedOp1,
16996 Instruction *RedOp2,
16997 bool InitStep) {
16998 if (!AnyBoolLogicOp)
16999 return;
17000 if (isBoolLogicOp(RedOp1) &&
17001 ((!InitStep && LHS == VectorizedTree) ||
17002 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
17003 return;
17004 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
17005 getRdxOperand(RedOp2, 0) == RHS ||
17007 std::swap(LHS, RHS);
17008 return;
17009 }
17010 if (LHS != VectorizedTree)
17011 LHS = Builder.CreateFreeze(LHS);
17012 };
17013 // Finish the reduction.
17014 // Need to add the extra arguments and the possible reduction values that
17015 // were not vectorized.
17016 // Try to avoid dependencies between the scalar remainders after
17017 // reductions.
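 // FinalGen combines the remaining (instruction, value) pairs two at a time,
 // e.g. remainders r0..r4 become (r0 op r1), (r2 op r3), r4 on the first
 // pass, producing a balanced tree of scalar ops rather than a serial chain.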
17018 auto FinalGen =
17020 bool InitStep) {
17021 unsigned Sz = InstVals.size();
17023 Sz % 2);
17024 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
17025 Instruction *RedOp = InstVals[I + 1].first;
17026 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
17027 Value *RdxVal1 = InstVals[I].second;
17028 Value *StableRdxVal1 = RdxVal1;
17029 auto It1 = TrackedVals.find(RdxVal1);
17030 if (It1 != TrackedVals.end())
17031 StableRdxVal1 = It1->second;
17032 Value *RdxVal2 = InstVals[I + 1].second;
17033 Value *StableRdxVal2 = RdxVal2;
17034 auto It2 = TrackedVals.find(RdxVal2);
17035 if (It2 != TrackedVals.end())
17036 StableRdxVal2 = It2->second;
17037 // To prevent poison from leaking across what used to be
17038 // sequential, safe, scalar boolean logic operations, the
17039 // reduction operand must be frozen.
17040 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
17041 RedOp, InitStep);
17042 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17043 StableRdxVal2, "op.rdx", ReductionOps);
17044 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
17045 }
17046 if (Sz % 2 == 1)
17047 ExtraReds[Sz / 2] = InstVals.back();
17048 return ExtraReds;
17049 };
17051 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
17052 VectorizedTree);
17054 for (ArrayRef<Value *> Candidates : ReducedVals) {
17055 for (Value *RdxVal : Candidates) {
17056 if (!Visited.insert(RdxVal).second)
17057 continue;
17058 unsigned NumOps = VectorizedVals.lookup(RdxVal);
17059 for (Instruction *RedOp :
17060 ArrayRef(ReducedValsToOps.find(RdxVal)->second)
17061 .drop_back(NumOps))
17062 ExtraReductions.emplace_back(RedOp, RdxVal);
17063 }
17064 }
17065 for (auto &Pair : ExternallyUsedValues) {
17066 // Add each externally used value to the final reduction.
17067 for (auto *I : Pair.second)
17068 ExtraReductions.emplace_back(I, Pair.first);
17069 }
17070 // Iterate through all not-vectorized reduction values/extra arguments.
17071 bool InitStep = true;
17072 while (ExtraReductions.size() > 1) {
17073 VectorizedTree = ExtraReductions.front().second;
17075 FinalGen(ExtraReductions, InitStep);
17076 ExtraReductions.swap(NewReds);
17077 InitStep = false;
17078 }
17079 VectorizedTree = ExtraReductions.front().second;
17080
17081 ReductionRoot->replaceAllUsesWith(VectorizedTree);
17082
17083 // The original scalar reduction is expected to have no remaining
17084 // uses outside the reduction tree itself. Assert that we got this
17085 // correct, replace internal uses with undef, and mark for eventual
17086 // deletion.
17087#ifndef NDEBUG
17088 SmallSet<Value *, 4> IgnoreSet;
17089 for (ArrayRef<Value *> RdxOps : ReductionOps)
17090 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
17091#endif
17092 for (ArrayRef<Value *> RdxOps : ReductionOps) {
17093 for (Value *Ignore : RdxOps) {
17094 if (!Ignore)
17095 continue;
17096#ifndef NDEBUG
17097 for (auto *U : Ignore->users()) {
17098 assert(IgnoreSet.count(U) &&
17099 "All users must be in the reduction ops list.");
17100 }
17101#endif
17102 if (!Ignore->use_empty()) {
17103 Value *Undef = UndefValue::get(Ignore->getType());
17104 Ignore->replaceAllUsesWith(Undef);
17105 }
17106 V.eraseInstruction(cast<Instruction>(Ignore));
17107 }
17108 }
17109 } else if (!CheckForReusedReductionOps) {
17110 for (ReductionOpsType &RdxOps : ReductionOps)
17111 for (Value *RdxOp : RdxOps)
17112 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17113 }
17114 return VectorizedTree;
17115 }
17116
17117private:
17118 /// Calculate the cost of a reduction.
17119 InstructionCost getReductionCost(TargetTransformInfo *TTI,
17120 ArrayRef<Value *> ReducedVals,
17121 bool IsCmpSelMinMax, unsigned ReduxWidth,
17122 FastMathFlags FMF) {
17124 Type *ScalarTy = ReducedVals.front()->getType();
17125 FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
17126 InstructionCost VectorCost = 0, ScalarCost;
17127 // If all of the reduced values are constant, the vector cost is 0, since
17128 // the reduction value can be calculated at compile time.
17129 bool AllConsts = allConstant(ReducedVals);
17130 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
17132 // Scalar cost is repeated for N-1 elements.
17133 int Cnt = ReducedVals.size();
17134 for (Value *RdxVal : ReducedVals) {
17135 if (Cnt == 1)
17136 break;
17137 --Cnt;
17138 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
17139 Cost += GenCostFn();
17140 continue;
17141 }
17142 InstructionCost ScalarCost = 0;
17143 for (User *U : RdxVal->users()) {
17144 auto *RdxOp = cast<Instruction>(U);
17145 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17146 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
17147 continue;
17148 }
17149 ScalarCost = InstructionCost::getInvalid();
17150 break;
17151 }
17152 if (ScalarCost.isValid())
17153 Cost += ScalarCost;
17154 else
17155 Cost += GenCostFn();
17156 }
17157 return Cost;
17158 };
17159 switch (RdxKind) {
17160 case RecurKind::Add:
17161 case RecurKind::Mul:
17162 case RecurKind::Or:
17163 case RecurKind::And:
17164 case RecurKind::Xor:
17165 case RecurKind::FAdd:
17166 case RecurKind::FMul: {
17167 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
17168 if (!AllConsts)
17169 VectorCost =
17170 TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
17171 ScalarCost = EvaluateScalarCost([&]() {
17172 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
17173 });
17174 break;
17175 }
17176 case RecurKind::FMax:
17177 case RecurKind::FMin:
17178 case RecurKind::FMaximum:
17179 case RecurKind::FMinimum:
17180 case RecurKind::SMax:
17181 case RecurKind::SMin:
17182 case RecurKind::UMax:
17183 case RecurKind::UMin: {
17185 if (!AllConsts)
17186 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
17187 ScalarCost = EvaluateScalarCost([&]() {
17188 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
17189 return TTI->getIntrinsicInstrCost(ICA, CostKind);
17190 });
17191 break;
17192 }
17193 default:
17194 llvm_unreachable("Expected arithmetic or min/max reduction operation");
17195 }
17196
17197 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
17198 << " for reduction of " << shortBundleName(ReducedVals)
17199 << " (It is a splitting reduction)\n");
17200 return VectorCost - ScalarCost;
17201 }
17202
17203 /// Emit a horizontal reduction of the vectorized value.
17204 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
17205 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
17206 assert(VectorizedValue && "Need to have a vectorized tree node");
17207 assert(isPowerOf2_32(ReduxWidth) &&
17208 "We only handle power-of-two reductions for now");
17209 assert(RdxKind != RecurKind::FMulAdd &&
17210 "A call to the llvm.fmuladd intrinsic is not handled yet");
17211
17212 ++NumVectorInstructions;
17213 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
17214 }
17215
17216 /// Emits optimized code for a unique scalar value reused \p Cnt times.
17217 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17218 unsigned Cnt) {
17219 assert(IsSupportedHorRdxIdentityOp &&
17220 "The optimization of matched scalar identity horizontal reductions "
17221 "must be supported.");
17222 switch (RdxKind) {
17223 case RecurKind::Add: {
17224 // res = mul vv, n
17225 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
17226 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
17227 << VectorizedValue << ". (HorRdx)\n");
17228 return Builder.CreateMul(VectorizedValue, Scale);
17229 }
17230 case RecurKind::Xor: {
17231 // res = n % 2 ? vv : 0
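 // (x xor x == 0, so an even repeat count cancels out and only an odd
 // count leaves the original value.)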
17232 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
17233 << ". (HorRdx)\n");
17234 if (Cnt % 2 == 0)
17235 return Constant::getNullValue(VectorizedValue->getType());
17236 return VectorizedValue;
17237 }
17238 case RecurKind::FAdd: {
17239 // res = fmul v, n
17240 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
17241 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
17242 << VectorizedValue << ". (HorRdx)\n");
17243 return Builder.CreateFMul(VectorizedValue, Scale);
17244 }
17245 case RecurKind::And:
17246 case RecurKind::Or:
17247 case RecurKind::SMax:
17248 case RecurKind::SMin:
17249 case RecurKind::UMax:
17250 case RecurKind::UMin:
17251 case RecurKind::FMax:
17252 case RecurKind::FMin:
17253 case RecurKind::FMaximum:
17254 case RecurKind::FMinimum:
17255 // res = vv
17256 return VectorizedValue;
17257 case RecurKind::Mul:
17258 case RecurKind::FMul:
17259 case RecurKind::FMulAdd:
17260 case RecurKind::IAnyOf:
17261 case RecurKind::FAnyOf:
17262 case RecurKind::None:
17263 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17264 }
17265 return nullptr;
17266 }
17267
17268 /// Emits the actual operation for the scalar identity values found during
17269 /// horizontal reduction analysis.
17270 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17272 const MapVector<Value *, unsigned> &SameValuesCounter,
17273 const DenseMap<Value *, Value *> &TrackedToOrig) {
17274 assert(IsSupportedHorRdxIdentityOp &&
17275 "The optimization of matched scalar identity horizontal reductions "
17276 "must be supported.");
17277 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
17278 if (VTy->getElementType() != VL.front()->getType()) {
17279 VectorizedValue = Builder.CreateIntCast(
17280 VectorizedValue,
17281 FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
17282 any_of(VL, [&](Value *R) {
17284 R, cast<Instruction>(ReductionOps.front().front())
17285 ->getModule()
17286 ->getDataLayout());
17287 return !Known.isNonNegative();
17288 }));
17289 }
17290 switch (RdxKind) {
17291 case RecurKind::Add: {
17292 // root = mul prev_root, <1, 1, n, 1>
17294 for (Value *V : VL) {
17295 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17296 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
17297 }
17298 auto *Scale = ConstantVector::get(Vals);
17299 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
17300 << VectorizedValue << ". (HorRdx)\n");
17301 return Builder.CreateMul(VectorizedValue, Scale);
17302 }
17303 case RecurKind::And:
17304 case RecurKind::Or:
17305 // No need for multiple or/and(s).
17306 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17307 << ". (HorRdx)\n");
17308 return VectorizedValue;
17309 case RecurKind::SMax:
17310 case RecurKind::SMin:
17311 case RecurKind::UMax:
17312 case RecurKind::UMin:
17313 case RecurKind::FMax:
17314 case RecurKind::FMin:
17315 case RecurKind::FMaximum:
17316 case RecurKind::FMinimum:
17317 // No need for multiple min/max(s) of the same value.
17318 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17319 << ". (HorRdx)\n");
17320 return VectorizedValue;
17321 case RecurKind::Xor: {
17322 // Replace values that repeat an even number of times with 0, since
17323 // x xor x = 0.
17324 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
17325 // 7>, if the 4th and 6th elements have an even number of repeats.
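 // (E.g. for scalars <a, b, c> repeated 2, 3 and 2 times respectively, the
 // mask becomes <vf, 1, vf>, zeroing the lanes of a and c and keeping b.)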
17327 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
17329 std::iota(Mask.begin(), Mask.end(), 0);
17330 bool NeedShuffle = false;
17331 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17332 Value *V = VL[I];
17333 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17334 if (Cnt % 2 == 0) {
17335 Mask[I] = VF;
17336 NeedShuffle = true;
17337 }
17338 }
17339 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
17340 : Mask) dbgs()
17341 << I << " ";
17342 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17343 if (NeedShuffle)
17344 VectorizedValue = Builder.CreateShuffleVector(
17345 VectorizedValue,
17346 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
17347 return VectorizedValue;
17348 }
17349 case RecurKind::FAdd: {
17350 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
17352 for (Value *V : VL) {
17353 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17354 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
17355 }
17356 auto *Scale = ConstantVector::get(Vals);
17357 return Builder.CreateFMul(VectorizedValue, Scale);
17358 }
17359 case RecurKind::Mul:
17360 case RecurKind::FMul:
17361 case RecurKind::FMulAdd:
17362 case RecurKind::IAnyOf:
17363 case RecurKind::FAnyOf:
17364 case RecurKind::None:
17365 llvm_unreachable("Unexpected reduction kind for reused scalars.");
17366 }
17367 return nullptr;
17368 }
17369};
17370} // end anonymous namespace
17371
17372/// Gets recurrence kind from the specified value.
17374 return HorizontalReduction::getRdxKind(V);
17375}
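/// Returns the total number of scalar elements in the aggregate built by
/// \p InsertInst, or std::nullopt if the aggregate is not homogeneous (or
/// otherwise not supported). E.g. {<2 x float>, <2 x float>} -> 4 and
/// [2 x {float, float}] -> 4.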
17376static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
17377 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
17378 return cast<FixedVectorType>(IE->getType())->getNumElements();
17379
17380 unsigned AggregateSize = 1;
17381 auto *IV = cast<InsertValueInst>(InsertInst);
17382 Type *CurrentType = IV->getType();
17383 do {
17384 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
17385 for (auto *Elt : ST->elements())
17386 if (Elt != ST->getElementType(0)) // check homogeneity
17387 return std::nullopt;
17388 AggregateSize *= ST->getNumElements();
17389 CurrentType = ST->getElementType(0);
17390 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
17391 AggregateSize *= AT->getNumElements();
17392 CurrentType = AT->getElementType();
17393 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
17394 AggregateSize *= VT->getNumElements();
17395 return AggregateSize;
17396 } else if (CurrentType->isSingleValueType()) {
17397 return AggregateSize;
17398 } else {
17399 return std::nullopt;
17400 }
17401 } while (true);
17402}
17403
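/// Walks a chain of insertelement/insertvalue instructions upwards from
/// \p LastInsertInst, recording each inserted scalar operand and its insert
/// instruction at the aggregate index it targets (offset by
/// \p OperandOffset), recursing into nested insert chains.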
17404static void findBuildAggregate_rec(Instruction *LastInsertInst,
17406 SmallVectorImpl<Value *> &BuildVectorOpds,
17407 SmallVectorImpl<Value *> &InsertElts,
17408 unsigned OperandOffset) {
17409 do {
17410 Value *InsertedOperand = LastInsertInst->getOperand(1);
17411 std::optional<unsigned> OperandIndex =
17412 getInsertIndex(LastInsertInst, OperandOffset);
17413 if (!OperandIndex)
17414 return;
17415 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
17416 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
17417 BuildVectorOpds, InsertElts, *OperandIndex);
17418
17419 } else {
17420 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17421 InsertElts[*OperandIndex] = LastInsertInst;
17422 }
17423 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
17424 } while (LastInsertInst != nullptr &&
17425 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
17426 LastInsertInst->hasOneUse());
17427}
17428
17429/// Recognize construction of vectors like
17430/// %ra = insertelement <4 x float> poison, float %s0, i32 0
17431/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
17432/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
17433/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
17434/// starting from the last insertelement or insertvalue instruction.
17435///
17436/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
17437/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
17438/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
17439///
17440/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
17441///
17442/// \return true if it matches.
17443static bool findBuildAggregate(Instruction *LastInsertInst,
17445 SmallVectorImpl<Value *> &BuildVectorOpds,
17446 SmallVectorImpl<Value *> &InsertElts) {
17447
17448 assert((isa<InsertElementInst>(LastInsertInst) ||
17449 isa<InsertValueInst>(LastInsertInst)) &&
17450 "Expected insertelement or insertvalue instruction!");
17451
17452 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
17453 "Expected empty result vectors!");
17454
17455 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
17456 if (!AggregateSize)
17457 return false;
17458 BuildVectorOpds.resize(*AggregateSize);
17459 InsertElts.resize(*AggregateSize);
17460
17461 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
17462 llvm::erase(BuildVectorOpds, nullptr);
17463 llvm::erase(InsertElts, nullptr);
17464 if (BuildVectorOpds.size() >= 2)
17465 return true;
17466
17467 return false;
17468}
17469
17470/// Try and get a reduction instruction from a phi node.
17471///
17472/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
17473/// if they come from either \p ParentBB or a containing loop latch.
17474///
17475/// \returns A candidate reduction value if possible, or \code nullptr \endcode
17476/// if not possible.
17478 BasicBlock *ParentBB, LoopInfo *LI) {
17479 // There are situations where the reduction value is not dominated by the
17480 // reduction phi. Vectorizing such cases has been reported to cause
17481 // miscompiles. See PR25787.
17482 auto DominatedReduxValue = [&](Value *R) {
17483 return isa<Instruction>(R) &&
17484 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
17485 };
17486
17487 Instruction *Rdx = nullptr;
17488
17489 // Return the incoming value if it comes from the same BB as the phi node.
17490 if (P->getIncomingBlock(0) == ParentBB) {
17491 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17492 } else if (P->getIncomingBlock(1) == ParentBB) {
17493 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17494 }
17495
17496 if (Rdx && DominatedReduxValue(Rdx))
17497 return Rdx;
17498
17499 // Otherwise, check whether we have a loop latch to look at.
17500 Loop *BBL = LI->getLoopFor(ParentBB);
17501 if (!BBL)
17502 return nullptr;
17503 BasicBlock *BBLatch = BBL->getLoopLatch();
17504 if (!BBLatch)
17505 return nullptr;
17506
17507 // There is a loop latch, return the incoming value if it comes from
17508 // that. This reduction pattern occasionally turns up.
17509 if (P->getIncomingBlock(0) == BBLatch) {
17510 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17511 } else if (P->getIncomingBlock(1) == BBLatch) {
17512 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17513 }
17514
17515 if (Rdx && DominatedReduxValue(Rdx))
17516 return Rdx;
17517
17518 return nullptr;
17519}
17520
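/// Matches a reduction binary operation: either a regular BinaryOperator or
/// one of the maxnum/minnum/maximum/minimum/smax/smin/umax/umin intrinsics,
/// and extracts its two operands into \p V0 and \p V1.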
17521static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
17522 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
17523 return true;
17524 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
17525 return true;
17526 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
17527 return true;
17528 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
17529 return true;
17530 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
17531 return true;
17532 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
17533 return true;
17534 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
17535 return true;
17536 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
17537 return true;
17538 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
17539 return true;
17540 return false;
17541}
17542
17543/// We could have an initial reduction that is not an add.
17544/// r *= v1 + v2 + v3 + v4
17545/// In such a case, start looking for a tree rooted in the first '+'.
17546/// \returns the new root if found, which may be nullptr if not an instruction.
17548 Instruction *Root) {
17549 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17550 isa<IntrinsicInst>(Root)) &&
17551 "Expected binop, select, or intrinsic for reduction matching");
17552 Value *LHS =
17553 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
17554 Value *RHS =
17555 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
17556 if (LHS == Phi)
17557 return dyn_cast<Instruction>(RHS);
17558 if (RHS == Phi)
17559 return dyn_cast<Instruction>(LHS);
17560 return nullptr;
17561}
17562
17563/// \returns the first operand of \p I that does not match \p Phi. If the
17564/// operand is not an instruction, it returns nullptr.
17566 Value *Op0 = nullptr;
17567 Value *Op1 = nullptr;
17568 if (!matchRdxBop(I, Op0, Op1))
17569 return nullptr;
17570 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
17571}
17572
17573/// \returns true if \p I is a candidate instruction for reduction vectorization.
17575 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
17576 Value *B0 = nullptr, *B1 = nullptr;
17577 bool IsBinop = matchRdxBop(I, B0, B1);
17578 return IsBinop || IsSelect;
17579}
17580
17581bool SLPVectorizerPass::vectorizeHorReduction(
17583 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
17584 if (!ShouldVectorizeHor)
17585 return false;
17586 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
17587
17588 if (Root->getParent() != BB || isa<PHINode>(Root))
17589 return false;
17590
17591 // If we can find a secondary reduction root, use that instead.
17592 auto SelectRoot = [&]() {
17593 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
17594 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
17595 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
17596 return NewRoot;
17597 return Root;
17598 };
17599
17600 // Start the analysis from the Root instruction. If a horizontal reduction is
17601 // found, try to vectorize it. If it is not a horizontal reduction, or
17602 // vectorization is not possible or not effective, and the currently analyzed
17603 // instruction is a binary operation, try to vectorize the operands using
17604 // pre-order DFS traversal order. If the operands were not vectorized, repeat
17605 // the same procedure, considering each operand as a possible root of a
17606 // horizontal reduction.
17607 // Interrupt the process if the Root instruction itself was vectorized or all
17608 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
17609 // If a horizontal reduction was not matched or vectorized, we collect the
17610 // instructions for possible later vectorization attempts.
17611 std::queue<std::pair<Instruction *, unsigned>> Stack;
17612 Stack.emplace(SelectRoot(), 0);
17613 SmallPtrSet<Value *, 8> VisitedInstrs;
17614 bool Res = false;
17615 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
17616 if (R.isAnalyzedReductionRoot(Inst))
17617 return nullptr;
17618 if (!isReductionCandidate(Inst))
17619 return nullptr;
17620 HorizontalReduction HorRdx;
17621 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
17622 return nullptr;
17623 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
17624 };
17625 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
17626 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
17627 FutureSeed = getNonPhiOperand(Root, P);
17628 if (!FutureSeed)
17629 return false;
17630 }
17631 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
17632 // analysis is done separately.
17633 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
17634 PostponedInsts.push_back(FutureSeed);
17635 return true;
17636 };
17637
17638 while (!Stack.empty()) {
17639 Instruction *Inst;
17640 unsigned Level;
17641 std::tie(Inst, Level) = Stack.front();
17642 Stack.pop();
17643 // Do not try to analyze an instruction that has already been vectorized.
17644 // This may happen when we vectorize instruction operands on a previous
17645 // iteration while the stack was populated before that happened.
17646 if (R.isDeleted(Inst))
17647 continue;
17648 if (Value *VectorizedV = TryToReduce(Inst)) {
17649 Res = true;
17650 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
17651 // Try to find another reduction.
17652 Stack.emplace(I, Level);
17653 continue;
17654 }
17655 } else {
17656 // We could not vectorize `Inst` so try to use it as a future seed.
17657 if (!TryAppendToPostponedInsts(Inst)) {
17658 assert(Stack.empty() && "Expected empty stack");
17659 break;
17660 }
17661 }
17662
17663 // Try to vectorize operands.
17664 // Continue analysis for the instruction from the same basic block only to
17665 // save compile time.
17666 if (++Level < RecursionMaxDepth)
17667 for (auto *Op : Inst->operand_values())
17668 if (VisitedInstrs.insert(Op).second)
17669 if (auto *I = dyn_cast<Instruction>(Op))
17670 // Do not try to vectorize CmpInst operands, this is done
17671 // separately.
17672 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
17673 !R.isDeleted(I) && I->getParent() == BB)
17674 Stack.emplace(I, Level);
17675 }
17676 return Res;
17677}
17678
17679bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
17680 BasicBlock *BB, BoUpSLP &R,
17682 SmallVector<WeakTrackingVH> PostponedInsts;
17683 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
17684 Res |= tryToVectorize(PostponedInsts, R);
17685 return Res;
17686}
17687
17688bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
17689 BoUpSLP &R) {
17690 bool Res = false;
17691 for (Value *V : Insts)
17692 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
17693 Res |= tryToVectorize(Inst, R);
17694 return Res;
17695}
17696
17697bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
17698 BasicBlock *BB, BoUpSLP &R) {
17699 if (!R.canMapToVector(IVI->getType()))
17700 return false;
17701
17702 SmallVector<Value *, 16> BuildVectorOpds;
17703 SmallVector<Value *, 16> BuildVectorInsts;
17704 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
17705 return false;
17706
17707 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
17708 // An aggregate value is unlikely to be processed in a vector register.
17709 return tryToVectorizeList(BuildVectorOpds, R);
17710}
17711
17712bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
17713 BasicBlock *BB, BoUpSLP &R) {
17714 SmallVector<Value *, 16> BuildVectorInsts;
17715 SmallVector<Value *, 16> BuildVectorOpds;
17717 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
17718 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
17719 isFixedVectorShuffle(BuildVectorOpds, Mask)))
17720 return false;
17721
17722 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
17723 return tryToVectorizeList(BuildVectorInsts, R);
17724}
17725
17726template <typename T>
17728 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
17729 function_ref<bool(T *, T *)> AreCompatible,
17730 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
17731 bool MaxVFOnly, BoUpSLP &R) {
17732 bool Changed = false;
17733 // Sort by type, parent, operands.
17734 stable_sort(Incoming, Comparator);
17735
17736 // Try to vectorize elements based on their type.
17737 SmallVector<T *> Candidates;
17738 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
17739 // Look for the next elements with the same type, parent and operand
17740 // kinds.
17741 auto *SameTypeIt = IncIt;
17742 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
17743 ++SameTypeIt;
17744
17745 // Try to vectorize them.
17746 unsigned NumElts = (SameTypeIt - IncIt);
17747 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
17748 << NumElts << ")\n");
17749 // The vectorization is a 3-stage attempt:
17750 // 1. Try to vectorize instructions with the same/alternate opcodes at the
17751 // maximal register size first.
17752 // 2. Try to vectorize the remaining instructions with the same type, if
17753 // possible. This may give better results than trying to vectorize only
17754 // instructions with the same/alternate opcodes.
17755 // 3. Make a final attempt to vectorize all instructions with the
17756 // same/alternate ops only; this may result in some extra final
17757 // vectorization.
17758 if (NumElts > 1 &&
17759 TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
17760 // Success; start over because instructions might have been changed.
17761 Changed = true;
17762 } else {
17763 /// \Returns the minimum number of elements that we will attempt to
17764 /// vectorize.
17765 auto GetMinNumElements = [&R](Value *V) {
17766 unsigned EltSize = R.getVectorElementSize(V);
17767 return std::max(2U, R.getMaxVecRegSize() / EltSize);
17768 };
17769 if (NumElts < GetMinNumElements(*IncIt) &&
17770 (Candidates.empty() ||
17771 Candidates.front()->getType() == (*IncIt)->getType())) {
17772 Candidates.append(IncIt, std::next(IncIt, NumElts));
17773 }
17774 }
17775 // Final attempt to vectorize instructions with the same types.
17776 if (Candidates.size() > 1 &&
17777 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
17778 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
17779 // Success; start over because instructions might have been changed.
17780 Changed = true;
17781 } else if (MaxVFOnly) {
17782 // Try to vectorize using small vectors.
17783 for (auto *It = Candidates.begin(), *End = Candidates.end();
17784 It != End;) {
17785 auto *SameTypeIt = It;
17786 while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
17787 ++SameTypeIt;
17788 unsigned NumElts = (SameTypeIt - It);
17789 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
17790 /*MaxVFOnly=*/false))
17791 Changed = true;
17792 It = SameTypeIt;
17793 }
17794 }
17795 Candidates.clear();
17796 }
17797
17798 // Start over at the next instruction of a different type (or the end).
17799 IncIt = SameTypeIt;
17800 }
17801 return Changed;
17802}
17803
17804/// Compare two cmp instructions. If IsCompatibility is true, the function
17805/// returns true if the two cmps have the same/swapped predicates and
17806/// compatible corresponding operands. If IsCompatibility is false, the
17807/// function implements a strict weak ordering relation between two cmp
17808/// instructions, returning true if the first instruction is "less" than the
17809/// second, i.e. its predicate is less than the predicate of the second or its
17810/// operand IDs are less than the operand IDs of the second cmp instruction.
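/// For example, 'icmp slt i32 %a, %b' and 'icmp sgt i32 %b, %a' use swapped
/// predicates over matching operands and are therefore treated as compatible.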
17811template <bool IsCompatibility>
17812static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
17813 const DominatorTree &DT) {
17814 assert(isValidElementType(V->getType()) &&
17815 isValidElementType(V2->getType()) &&
17816 "Expected valid element types only.");
17817 if (V == V2)
17818 return IsCompatibility;
17819 auto *CI1 = cast<CmpInst>(V);
17820 auto *CI2 = cast<CmpInst>(V2);
17821 if (CI1->getOperand(0)->getType()->getTypeID() <
17822 CI2->getOperand(0)->getType()->getTypeID())
17823 return !IsCompatibility;
17824 if (CI1->getOperand(0)->getType()->getTypeID() >
17825 CI2->getOperand(0)->getType()->getTypeID())
17826 return false;
17827 CmpInst::Predicate Pred1 = CI1->getPredicate();
17828 CmpInst::Predicate Pred2 = CI2->getPredicate();
17831 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
17832 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
17833 if (BasePred1 < BasePred2)
17834 return !IsCompatibility;
17835 if (BasePred1 > BasePred2)
17836 return false;
17837 // Compare operands.
17838 bool CI1Preds = Pred1 == BasePred1;
17839 bool CI2Preds = Pred2 == BasePred1;
17840 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
17841 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
17842 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
17843 if (Op1 == Op2)
17844 continue;
17845 if (Op1->getValueID() < Op2->getValueID())
17846 return !IsCompatibility;
17847 if (Op1->getValueID() > Op2->getValueID())
17848 return false;
17849 if (auto *I1 = dyn_cast<Instruction>(Op1))
17850 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
17851 if (IsCompatibility) {
17852 if (I1->getParent() != I2->getParent())
17853 return false;
17854 } else {
17855 // Try to compare nodes with the same parent.
17856 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
17857 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
17858 if (!NodeI1)
17859 return NodeI2 != nullptr;
17860 if (!NodeI2)
17861 return false;
17862 assert((NodeI1 == NodeI2) ==
17863 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
17864 "Different nodes should have different DFS numbers");
17865 if (NodeI1 != NodeI2)
17866 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
17867 }
17868 InstructionsState S = getSameOpcode({I1, I2}, TLI);
17869 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
17870 continue;
17871 if (IsCompatibility)
17872 return false;
17873 if (I1->getOpcode() != I2->getOpcode())
17874 return I1->getOpcode() < I2->getOpcode();
17875 }
17876 }
17877 return IsCompatibility;
17878}
17879
17880template <typename ItT>
17881bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
17882 BasicBlock *BB, BoUpSLP &R) {
17883 bool Changed = false;
17884 // Try to find reductions first.
17885 for (CmpInst *I : CmpInsts) {
17886 if (R.isDeleted(I))
17887 continue;
17888 for (Value *Op : I->operands())
17889 if (auto *RootOp = dyn_cast<Instruction>(Op))
17890 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
17891 }
17892 // Try to vectorize operands as vector bundles.
17893 for (CmpInst *I : CmpInsts) {
17894 if (R.isDeleted(I))
17895 continue;
17896 Changed |= tryToVectorize(I, R);
17897 }
17898 // Try to vectorize a list of compares.
17899 // Sort by type, compare predicate, etc.
17900 auto CompareSorter = [&](Value *V, Value *V2) {
17901 if (V == V2)
17902 return false;
17903 return compareCmp<false>(V, V2, *TLI, *DT);
17904 };
17905
17906 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
17907 if (V1 == V2)
17908 return true;
17909 return compareCmp<true>(V1, V2, *TLI, *DT);
17910 };
17911
17912 SmallVector<Value *> Vals;
17913 for (Instruction *V : CmpInsts)
17914 if (!R.isDeleted(V) && isValidElementType(V->getType()))
17915 Vals.push_back(V);
17916 if (Vals.size() <= 1)
17917 return Changed;
17918 Changed |= tryToVectorizeSequence<Value>(
17919 Vals, CompareSorter, AreCompatibleCompares,
17920 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
17921 // Exclude possible reductions from other blocks.
17922 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
17923 return any_of(V->users(), [V](User *U) {
17924 auto *Select = dyn_cast<SelectInst>(U);
17925 return Select &&
17926 Select->getParent() != cast<Instruction>(V)->getParent();
17927 });
17928 });
17929 if (ArePossiblyReducedInOtherBlock)
17930 return false;
17931 return tryToVectorizeList(Candidates, R, MaxVFOnly);
17932 },
17933 /*MaxVFOnly=*/true, R);
17934 return Changed;
17935}
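// In short, vectorizeCmpInsts makes three attempts over the gathered compares:
// (1) vectorize horizontal reduction trees rooted at each compare's operands,
// (2) run tryToVectorize on each remaining compare, and (3) sort the surviving
// compares with compareCmp<false> and hand runs of compatible ones
// (compareCmp<true>) to tryToVectorizeList, skipping candidate bundles in
// which some compare has a select user in another block, since those are
// likely cmp/select (min/max) patterns better handled when that block is
// processed.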
17936
17937bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
17938 BasicBlock *BB, BoUpSLP &R) {
17939 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
17940 "This function only accepts Insert instructions");
17941 bool OpsChanged = false;
17942 SmallVector<WeakTrackingVH> PostponedInsts;
17943 // pass1 - try to vectorize reductions only
17944 for (auto *I : reverse(Instructions)) {
17945 if (R.isDeleted(I))
17946 continue;
17947 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
17948 }
17949 // pass2 - try to match and vectorize a buildvector sequence.
17950 for (auto *I : reverse(Instructions)) {
17951 if (R.isDeleted(I) || isa<CmpInst>(I))
17952 continue;
17953 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
17954 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
17955 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
17956 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
17957 }
17958 }
17959 // Now try to vectorize postponed instructions.
17960 OpsChanged |= tryToVectorize(PostponedInsts, R);
17961
17962 Instructions.clear();
17963 return OpsChanged;
17964}
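// A typical buildvector sequence matched by the second pass looks like
// (illustrative IR):
//   %v0 = insertelement <4 x float> poison, float %s0, i32 0
//   %v1 = insertelement <4 x float> %v0, float %s1, i32 1
//   %v2 = insertelement <4 x float> %v1, float %s2, i32 2
//   %v3 = insertelement <4 x float> %v2, float %s3, i32 3
// Visiting the inserts in reverse order lets the helpers start from the last
// insert of a chain and walk back through its operands to recover all of the
// inserted scalars.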
17965
17966bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
17967 bool Changed = false;
17968 SmallVector<Value *, 4> Incoming;
17969 SmallPtrSet<Value *, 16> VisitedInstrs;
17970 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
17971 // node. This makes it easier to identify the chains that can be vectorized
17972 // most profitably.
17973 DenseMap<Value *, SmallVector<Value *>> PHIToOpcodes;
17974 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
17975 assert(isValidElementType(V1->getType()) &&
17976 isValidElementType(V2->getType()) &&
17977 "Expected vectorizable types only.");
17978 // It is fine to compare type IDs here, since we expect only vectorizable
17979 // types, like ints, floats and pointers; we don't care about other types.
17980 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
17981 return true;
17982 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
17983 return false;
17984 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
17985 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
17986 if (Opcodes1.size() < Opcodes2.size())
17987 return true;
17988 if (Opcodes1.size() > Opcodes2.size())
17989 return false;
17990 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
17991 {
17992 // Instructions come first.
17993 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
17994 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
17995 if (I1 && I2) {
17996 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
17997 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
17998 if (!NodeI1)
17999 return NodeI2 != nullptr;
18000 if (!NodeI2)
18001 return false;
18002 assert((NodeI1 == NodeI2) ==
18003 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18004 "Different nodes should have different DFS numbers");
18005 if (NodeI1 != NodeI2)
18006 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18007 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18008 if (S.getOpcode() && !S.isAltShuffle())
18009 continue;
18010 return I1->getOpcode() < I2->getOpcode();
18011 }
18012 if (I1)
18013 return true;
18014 if (I2)
18015 return false;
18016 }
18017 {
18018 // Non-undef constants come next.
18019 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
18020 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
18021 if (C1 && C2)
18022 continue;
18023 if (C1)
18024 return true;
18025 if (C2)
18026 return false;
18027 }
18028 bool U1 = isa<UndefValue>(Opcodes1[I]);
18029 bool U2 = isa<UndefValue>(Opcodes2[I]);
18030 {
18031 // Non-constant non-instructions come next.
18032 if (!U1 && !U2) {
18033 auto ValID1 = Opcodes1[I]->getValueID();
18034 auto ValID2 = Opcodes2[I]->getValueID();
18035 if (ValID1 == ValID2)
18036 continue;
18037 if (ValID1 < ValID2)
18038 return true;
18039 if (ValID1 > ValID2)
18040 return false;
18041 }
18042 if (!U1)
18043 return true;
18044 if (!U2)
18045 return false;
18046 }
18047 // Undefs come last.
18048 assert(U1 && U2 && "The only thing left should be undef & undef.");
18049 continue;
18050 }
18051 return false;
18052 };
18053 auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
18054 if (V1 == V2)
18055 return true;
18056 if (V1->getType() != V2->getType())
18057 return false;
18058 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18059 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18060 if (Opcodes1.size() != Opcodes2.size())
18061 return false;
18062 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18063 // Undefs are compatible with any other value.
18064 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
18065 continue;
18066 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
18067 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
18068 if (I1->getParent() != I2->getParent())
18069 return false;
18070 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18071 if (S.getOpcode())
18072 continue;
18073 return false;
18074 }
18075 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
18076 continue;
18077 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
18078 return false;
18079 }
18080 return true;
18081 };
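// PHICompare and AreCompatiblePHIs operate on the flattened lists of non-phi
// incoming values collected into PHIToOpcodes below: phis are sorted so that
// those fed by the same kinds of values (matching opcodes, parent blocks and
// value kinds) become adjacent, and only runs of mutually compatible phis are
// handed to tryToVectorizeList.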
18082
18083 bool HaveVectorizedPhiNodes = false;
18084 do {
18085 // Collect the incoming values from the PHIs.
18086 Incoming.clear();
18087 for (Instruction &I : *BB) {
18088 PHINode *P = dyn_cast<PHINode>(&I);
18089 if (!P)
18090 break;
18091
18092 // No need to analyze deleted, vectorized and non-vectorizable
18093 // instructions.
18094 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
18095 isValidElementType(P->getType()))
18096 Incoming.push_back(P);
18097 }
18098
18099 if (Incoming.size() <= 1)
18100 break;
18101
18102 // Find the corresponding non-phi nodes for better matching when trying to
18103 // build the tree.
18104 for (Value *V : Incoming) {
18105 SmallVectorImpl<Value *> &Opcodes =
18106 PHIToOpcodes.try_emplace(V).first->getSecond();
18107 if (!Opcodes.empty())
18108 continue;
18109 SmallVector<Value *, 4> Nodes(1, V);
18110 SmallPtrSet<PHINode *, 4> Visited;
18111 while (!Nodes.empty()) {
18112 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
18113 if (!Visited.insert(PHI).second)
18114 continue;
18115 for (Value *V : PHI->incoming_values()) {
18116 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
18117 Nodes.push_back(PHI1);
18118 continue;
18119 }
18120 Opcodes.emplace_back(V);
18121 }
18122 }
18123 }
18124
18125 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18126 Incoming, PHICompare, AreCompatiblePHIs,
18127 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18128 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18129 },
18130 /*MaxVFOnly=*/true, R);
18131 Changed |= HaveVectorizedPhiNodes;
18132 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
18133 } while (HaveVectorizedPhiNodes);
18134
18135 VisitedInstrs.clear();
18136
18137 InstSetVector PostProcessInserts;
18138 SmallSetVector<CmpInst *, 8> PostProcessCmps;
18139 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
18140 // also vectorizes `PostProcessCmps`.
18141 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
18142 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
18143 if (VectorizeCmps) {
18144 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
18145 PostProcessCmps.clear();
18146 }
18147 PostProcessInserts.clear();
18148 return Changed;
18149 };
18150 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
18151 auto IsInPostProcessInstrs = [&](Instruction *I) {
18152 if (auto *Cmp = dyn_cast<CmpInst>(I))
18153 return PostProcessCmps.contains(Cmp);
18154 return isa<InsertElementInst, InsertValueInst>(I) &&
18155 PostProcessInserts.contains(I);
18156 };
18157 // Returns true if `I` is an instruction without users, such as a terminator,
18158 // a store, or a call whose return value is ignored. Only void-typed
18159 // instructions and CallInst/InvokeInst with an unused result qualify.
18160 auto HasNoUsers = [](Instruction *I) {
18161 return I->use_empty() &&
18162 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
18163 };
18164 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
18165 // Skip instructions with a scalable type. The number of elements is unknown
18166 // at compile time for scalable types.
18167 if (isa<ScalableVectorType>(It->getType()))
18168 continue;
18169
18170 // Skip instructions marked for deletion.
18171 if (R.isDeleted(&*It))
18172 continue;
18173 // We may go through BB multiple times, so skip already-checked instructions.
18174 if (!VisitedInstrs.insert(&*It).second) {
18175 if (HasNoUsers(&*It) &&
18176 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
18177 // We would like to start over since some instructions are deleted
18178 // and the iterator may become invalid.
18179 Changed = true;
18180 It = BB->begin();
18181 E = BB->end();
18182 }
18183 continue;
18184 }
18185
18186 if (isa<DbgInfoIntrinsic>(It))
18187 continue;
18188
18189 // Try to vectorize reductions that use PHINodes.
18190 if (PHINode *P = dyn_cast<PHINode>(It)) {
18191 // Check that the PHI is a reduction PHI.
18192 if (P->getNumIncomingValues() == 2) {
18193 // Try to match and vectorize a horizontal reduction.
18194 Instruction *Root = getReductionInstr(DT, P, BB, LI);
18195 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
18196 Changed = true;
18197 It = BB->begin();
18198 E = BB->end();
18199 continue;
18200 }
18201 }
18202 // Try to vectorize the incoming values of the PHI, to catch reductions
18203 // that feed into PHIs.
18204 for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
18205 // Skip if the incoming block is the current BB for now. Also, bypass
18206 // unreachable IR for efficiency and to avoid crashing.
18207 // TODO: Collect the skipped incoming values and try to vectorize them
18208 // after processing BB.
18209 if (BB == P->getIncomingBlock(I) ||
18210 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
18211 continue;
18212
18213 // Postponed instructions should not be vectorized here, delay their
18214 // vectorization.
18215 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
18216 PI && !IsInPostProcessInstrs(PI))
18217 Changed |= vectorizeRootInstruction(nullptr, PI,
18218 P->getIncomingBlock(I), R, TTI);
18219 }
18220 continue;
18221 }
18222
18223 if (HasNoUsers(&*It)) {
18224 bool OpsChanged = false;
18225 auto *SI = dyn_cast<StoreInst>(It);
18226 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
18227 if (SI) {
18228 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
18229 // Try to vectorize chain in store, if this is the only store to the
18230 // address in the block.
18231 // TODO: This is just a temporary solution to save compile time. Need
18232 // to investigate if we can safely turn on slp-vectorize-hor-store
18233 // instead to allow lookup for reduction chains in all non-vectorized
18234 // stores (need to check side effects and compile time).
18235 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
18236 SI->getValueOperand()->hasOneUse();
18237 }
18238 if (TryToVectorizeRoot) {
18239 for (auto *V : It->operand_values()) {
18240 // Postponed instructions should not be vectorized here, delay their
18241 // vectorization.
18242 if (auto *VI = dyn_cast<Instruction>(V);
18243 VI && !IsInPostProcessInstrs(VI))
18244 // Try to match and vectorize a horizontal reduction.
18245 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
18246 }
18247 }
18248 // Start vectorization of the post-process list of instructions from the
18249 // top-tree instructions to try to vectorize as many instructions as
18250 // possible.
18251 OpsChanged |=
18252 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
18253 if (OpsChanged) {
18254 // We would like to start over since some instructions are deleted
18255 // and the iterator may become invalid.
18256 Changed = true;
18257 It = BB->begin();
18258 E = BB->end();
18259 continue;
18260 }
18261 }
18262
18263 if (isa<InsertElementInst, InsertValueInst>(It))
18264 PostProcessInserts.insert(&*It);
18265 else if (isa<CmpInst>(It))
18266 PostProcessCmps.insert(cast<CmpInst>(&*It));
18267 }
18268
18269 return Changed;
18270}
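// Note on the scan above: whenever a vectorization attempt deletes
// instructions, the walk restarts from BB->begin(), and VisitedInstrs keeps
// the restart cheap by skipping already-analyzed instructions.
// PostProcessInserts and PostProcessCmps defer insert and compare
// vectorization until an instruction without users is reached (compares
// additionally wait for the block terminator), so whole buildvector and
// compare regions are collected before vectorization is attempted on them.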
18271
18272bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
18273 auto Changed = false;
18274 for (auto &Entry : GEPs) {
18275 // If the getelementptr list has fewer than two elements, there's nothing
18276 // to do.
18277 if (Entry.second.size() < 2)
18278 continue;
18279
18280 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
18281 << Entry.second.size() << ".\n");
18282
18283 // Process the GEP list in chunks suitable for the target's supported
18284 // vector size. If a vector register can't hold 1 element, we are done. We
18285 // are trying to vectorize the index computations, so the maximum number of
18286 // elements is based on the size of the index expression, rather than the
18287 // size of the GEP itself (the target's pointer size).
18288 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18289 unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
18290 if (MaxVecRegSize < EltSize)
18291 continue;
18292
18293 unsigned MaxElts = MaxVecRegSize / EltSize;
18294 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
18295 auto Len = std::min<unsigned>(BE - BI, MaxElts);
18296 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
18297
18298 // Initialize a set of candidate getelementptrs. Note that we use a
18299 // SetVector here to preserve program order. If the index computations
18300 // are vectorizable and begin with loads, we want to minimize the chance
18301 // of having to reorder them later.
18302 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
18303
18304 // Some of the candidates may have already been vectorized after we
18305 // initially collected them (they are then marked as deleted), or their
18306 // index may have been optimized to a constant value. Remove such
18307 // getelementptrs from the set of candidates.
18308 Candidates.remove_if([&R](Value *I) {
18309 return R.isDeleted(cast<Instruction>(I)) ||
18310 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
18311 });
18312
18313 // Remove from the set of candidates all pairs of getelementptrs with
18314 // constant differences. Such getelementptrs are likely not good
18315 // candidates for vectorization in a bottom-up phase since one can be
18316 // computed from the other. We also ensure all candidate getelementptr
18317 // indices are unique.
18318 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
18319 auto *GEPI = GEPList[I];
18320 if (!Candidates.count(GEPI))
18321 continue;
18322 auto *SCEVI = SE->getSCEV(GEPList[I]);
18323 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
18324 auto *GEPJ = GEPList[J];
18325 auto *SCEVJ = SE->getSCEV(GEPList[J]);
18326 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
18327 Candidates.remove(GEPI);
18328 Candidates.remove(GEPJ);
18329 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
18330 Candidates.remove(GEPJ);
18331 }
18332 }
18333 }
18334
18335 // We break out of the above computation as soon as we know there are
18336 // fewer than two candidates remaining.
18337 if (Candidates.size() < 2)
18338 continue;
18339
18340 // Add the single, non-constant index of each candidate to the bundle. We
18341 // ensured the indices met these constraints when we originally collected
18342 // the getelementptrs.
18343 SmallVector<Value *, 16> Bundle(Candidates.size());
18344 auto BundleIndex = 0u;
18345 for (auto *V : Candidates) {
18346 auto *GEP = cast<GetElementPtrInst>(V);
18347 auto *GEPIdx = GEP->idx_begin()->get();
18348 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
18349 Bundle[BundleIndex++] = GEPIdx;
18350 }
18351
18352 // Try and vectorize the indices. We are currently only interested in
18353 // gather-like cases of the form:
18354 //
18355 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
18356 //
18357 // where the loads of "a", the loads of "b", and the subtractions can be
18358 // performed in parallel. It's likely that detecting this pattern in a
18359 // bottom-up phase will be simpler and less costly than building a
18360 // full-blown top-down phase beginning at the consecutive loads.
18361 Changed |= tryToVectorizeList(Bundle, R);
18362 }
18363 }
18364 return Changed;
18365}
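// For example (illustrative): given %g0 = getelementptr i32, ptr %p, i64 %i
// and %g1 = getelementptr i32, ptr %p, i64 %j with %j = add i64 %i, 1, the
// SCEV difference of the two pointers is a constant, so both getelementptrs
// are dropped from the candidate set; the single non-constant indices of the
// surviving candidates form the bundle passed to tryToVectorizeList.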
18366
18367bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
18368 bool Changed = false;
18369 // Sort by type, base pointer, and value operand. Value operands must be
18370 // compatible (have the same opcode, same parent), otherwise it is
18371 // definitely not profitable to try to vectorize them.
18372 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
18373 if (V->getValueOperand()->getType()->getTypeID() <
18374 V2->getValueOperand()->getType()->getTypeID())
18375 return true;
18376 if (V->getValueOperand()->getType()->getTypeID() >
18377 V2->getValueOperand()->getType()->getTypeID())
18378 return false;
18379 if (V->getPointerOperandType()->getTypeID() <
18380 V2->getPointerOperandType()->getTypeID())
18381 return true;
18382 if (V->getPointerOperandType()->getTypeID() >
18383 V2->getPointerOperandType()->getTypeID())
18384 return false;
18385 // UndefValues are compatible with all other values.
18386 if (isa<UndefValue>(V->getValueOperand()) ||
18387 isa<UndefValue>(V2->getValueOperand()))
18388 return false;
18389 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
18390 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18391 DomTreeNodeBase<BasicBlock> *NodeI1 =
18392 DT->getNode(I1->getParent());
18393 DomTreeNodeBase<BasicBlock> *NodeI2 =
18394 DT->getNode(I2->getParent());
18395 assert(NodeI1 && "Should only process reachable instructions");
18396 assert(NodeI2 && "Should only process reachable instructions");
18397 assert((NodeI1 == NodeI2) ==
18398 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18399 "Different nodes should have different DFS numbers");
18400 if (NodeI1 != NodeI2)
18401 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18402 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18403 if (S.getOpcode())
18404 return false;
18405 return I1->getOpcode() < I2->getOpcode();
18406 }
18407 if (isa<Constant>(V->getValueOperand()) &&
18408 isa<Constant>(V2->getValueOperand()))
18409 return false;
18410 return V->getValueOperand()->getValueID() <
18411 V2->getValueOperand()->getValueID();
18412 };
18413
18414 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
18415 if (V1 == V2)
18416 return true;
18417 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
18418 return false;
18419 if (V1->getPointerOperandType() != V2->getPointerOperandType())
18420 return false;
18421 // Undefs are compatible with any other value.
18422 if (isa<UndefValue>(V1->getValueOperand()) ||
18423 isa<UndefValue>(V2->getValueOperand()))
18424 return true;
18425 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
18426 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18427 if (I1->getParent() != I2->getParent())
18428 return false;
18429 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18430 return S.getOpcode() > 0;
18431 }
18432 if (isa<Constant>(V1->getValueOperand()) &&
18433 isa<Constant>(V2->getValueOperand()))
18434 return true;
18435 return V1->getValueOperand()->getValueID() ==
18436 V2->getValueOperand()->getValueID();
18437 };
18438
18439 // Attempt to sort and vectorize each of the store-groups.
18440 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
18441 for (auto &Pair : Stores) {
18442 if (Pair.second.size() < 2)
18443 continue;
18444
18445 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
18446 << Pair.second.size() << ".\n");
18447
18448 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
18449 continue;
18450
18451 // Reverse stores to do bottom-to-top analysis. This is important if the
18452 // values are stored to the same addresses several times; in this case we need
18453 // to follow the store order (reversed to meet the memory dependencies).
18454 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
18455 Pair.second.rend());
18456 Changed |= tryToVectorizeSequence<StoreInst>(
18457 ReversedStores, StoreSorter, AreCompatibleStores,
18458 [&](ArrayRef<StoreInst *> Candidates, bool) {
18459 return vectorizeStores(Candidates, R, Attempted);
18460 },
18461 /*MaxVFOnly=*/false, R);
18462 }
18463 return Changed;
18464}
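// The net effect: stores collected per underlying base object are re-sorted so
// that stores of compatible values (same value type, same pointer operand
// type, compatible value operands) become adjacent, and each such run is
// handed to vectorizeStores in reverse program order.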
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:529
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1497
Loop::LoopBounds::Direction Direction
Definition: LoopInfo.cpp:230
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
#define T
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(VerifyEach)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleCostEstimator(TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:76
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:264
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:492
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:187
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:174
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:228
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:194
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:443
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:430
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:367
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:167
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
reverse_iterator rend()
Definition: BasicBlock.h:448
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:165
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1494
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2332
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:2227
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2469
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2326
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1600
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1678
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2323
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:601
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:983
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1362
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:1023
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:1017
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:1021
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:1019
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:1167
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:1129
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:1105
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2126
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:154
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1449
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:101
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
value_type & FindAndConstruct(const KeyT &Key)
Definition: DenseMap.h:348
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Type * getReturnType() const
Definition: DerivedTypes.h:124
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2257
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:511
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2460
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2265
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1110
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2535
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:311
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:220
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:848
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1753
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:486
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2366
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2249
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:471
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1666
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:169
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2161
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2196
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1826
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1587
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:630
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:260
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:742
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:257
const BasicBlock * getParent() const
Definition: Instruction.h:152
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:258
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
Value * getPointerOperand()
Definition: Instructions.h:280
bool isSimple() const
Definition: Instructions.h:272
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:236
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
Definition: MapVector.h:64
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
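A small sketch of how the MapVector operations above combine; the recordLanes helper and the key/value types are assumptions chosen only for illustration:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/IR/Value.h"
// Map each scalar to the first lane it occupies, preserving insertion order.
void recordLanes(llvm::ArrayRef<llvm::Value *> Scalars) {
  llvm::MapVector<llvm::Value *, unsigned> Lane;
  Lane.reserve(Scalars.size());
  for (unsigned I = 0, E = Scalars.size(); I != E; ++I)
    Lane.try_emplace(Scalars[I], I);   // repeated scalars keep their first lane
  for (const auto &KV : Lane)          // iteration follows insertion order
    (void)KV;
  auto Pairs = Lane.takeVector();      // clears the map, keeps the vector
  (void)Pairs;
}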
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
T & front() const
front - Get the first element.
Definition: ArrayRef.h:363
iterator end() const
Definition: ArrayRef.h:357
iterator begin() const
Definition: ArrayRef.h:356
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:376
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:449
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
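A hedged sketch of wiring up a PHI node with addIncoming; the buildMergePhi helper and its parameters are assumptions and the surrounding CFG is assumed to exist:
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
// Build a two-input phi at the top of Merge, assuming A and B both branch to
// Merge and FromA/FromB have the same type.
llvm::PHINode *buildMergePhi(llvm::BasicBlock *Merge, llvm::BasicBlock *A,
                             llvm::Value *FromA, llvm::BasicBlock *B,
                             llvm::Value *FromB) {
  llvm::IRBuilder<> Builder(Merge, Merge->begin());
  llvm::PHINode *Phi =
      Builder.CreatePHI(FromA->getType(), /*NumReservedValues=*/2, "merge");
  Phi->addIncoming(FromA, A);
  Phi->addIncoming(FromB, B);
  return Phi;
}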
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:155
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1827
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
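A minimal sketch of the ScalarEvolution queries above being used to measure the distance between two pointers, in the spirit of how consecutive accesses are recognized; the pointerDistance helper is an assumed name:
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Value.h"
// Return the SCEV for (PtrB - PtrA); a constant result means the pointers are
// a fixed number of bytes apart.
const llvm::SCEV *pointerDistance(llvm::ScalarEvolution &SE, llvm::Value *PtrA,
                                  llvm::Value *PtrB) {
  const llvm::SCEV *A = SE.getSCEV(PtrA);
  const llvm::SCEV *B = SE.getSCEV(PtrB);
  return SE.getMinusSCEV(B, A);   // computes B - A
}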
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
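A short sketch of SetVector's set-plus-order semantics; processOnce and the element type are illustrative assumptions:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instruction.h"
// Deduplicating worklist: insert() returns false for repeats, and iteration
// order matches insertion order.
void processOnce(llvm::ArrayRef<llvm::Instruction *> Seeds) {
  llvm::SetVector<llvm::Instruction *> Worklist;
  for (llvm::Instruction *I : Seeds)
    Worklist.insert(I);                 // duplicates are silently skipped
  while (!Worklist.empty()) {
    llvm::Instruction *I = Worklist.pop_back_val();
    (void)I;                            // visit I exactly once
  }
}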
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
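The static mask classifiers above operate on plain index arrays, so they can be queried without materializing a shufflevector instruction; a small sketch with hand-written masks (the function name and mask contents are arbitrary examples):
#include "llvm/IR/Instructions.h"
void classifyMasks() {
  int Identity[] = {0, 1, 2, 3};
  int Reverse[]  = {3, 2, 1, 0};
  int Splat[]    = {0, 0, 0, 0};
  bool IsId  = llvm::ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4);
  bool IsRev = llvm::ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4);
  bool IsSpl = llvm::ShuffleVectorInst::isZeroEltSplatMask(Splat, /*NumSrcElts=*/4);
  (void)IsId; (void)IsRev; (void)IsSpl;
}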
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
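A sketch of walking the set bits of a SmallBitVector with find_first()/find_next(); the lane terminology and the chosen bits are only illustrative:
#include "llvm/ADT/SmallBitVector.h"
void walkUsedLanes(unsigned NumLanes) {
  llvm::SmallBitVector UsedLanes(NumLanes, false);
  if (NumLanes > 2) {
    UsedLanes.set(0);
    UsedLanes.set(2);
  }
  // I takes the index of each set bit in increasing order; -1 ends the walk.
  for (int I = UsedLanes.find_first(); I != -1; I = UsedLanes.find_next(I))
    (void)I;
  bool AnyUsed = UsedLanes.any();
  (void)AnyUsed;
}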
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:366
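insert() reports whether the pointer was newly added, which makes SmallPtrSet a convenient "visited" filter; a one-function sketch (markVisited is an assumed name):
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Instruction.h"
// Returns true only the first time I is seen.
bool markVisited(llvm::SmallPtrSetImpl<llvm::Instruction *> &Visited,
                 llvm::Instruction *I) {
  return Visited.insert(I).second;
}
A caller would typically declare llvm::SmallPtrSet<llvm::Instruction *, 16> Visited; once and pass it by reference to such helpers.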
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:981
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
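A small sketch of building a mask in a SmallVector; the inline capacity of 16 only decides when the vector spills to the heap and is an arbitrary choice here:
#include "llvm/ADT/SmallVector.h"
llvm::SmallVector<int, 16> makeIdentityMask(unsigned VF) {
  llvm::SmallVector<int, 16> Mask;
  Mask.reserve(VF);                       // at most one allocation
  for (unsigned I = 0; I != VF; ++I)
    Mask.push_back(static_cast<int>(I));  // identity permutation 0..VF-1
  return Mask;
}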
An instruction for storing to memory.
Definition: Instructions.h:317
Type * getPointerOperandType() const
Definition: Instructions.h:420
Value * getValueOperand()
Definition: Instructions.h:414
Value * getPointerOperand()
Definition: Instructions.h:417
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
OperandValueKind
Additional information about an operand's possible values.
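A hedged sketch of the kind of cost comparison these TargetTransformInfo hooks support: one vector add versus VF scalar adds under the reciprocal-throughput cost kind. The helper name and the bare comparison are assumptions; the real pass folds many more terms into its cost model.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/InstructionCost.h"
bool vectorAddIsCheaper(const llvm::TargetTransformInfo &TTI,
                        llvm::Type *ScalarTy, unsigned VF) {
  auto CostKind = llvm::TargetTransformInfo::TCK_RecipThroughput;
  auto *VecTy = llvm::FixedVectorType::get(ScalarTy, VF);
  llvm::InstructionCost VecCost =
      TTI.getArithmeticInstrCost(llvm::Instruction::Add, VecTy, CostKind);
  llvm::InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(llvm::Instruction::Add, ScalarTy, CostKind);
  ScalarCost *= VF;                       // VF independent scalar adds
  return VecCost.isValid() && ScalarCost.isValid() && VecCost < ScalarCost;
}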
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:160
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:287
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:166
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Definition: User.h:73
op_iterator op_begin()
Definition: User.h:234
Value * getOperand(unsigned i) const
Definition: User.h:169
iterator_range< value_op_iterator > operand_values()
Definition: User.h:266
The Vector Function Database.
Definition: VectorUtils.h:29
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
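A one-liner sketch of the VectorType factory above for a fixed vectorization factor of 4 (the factor and helper name are arbitrary examples):
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/TypeSize.h"
// Build the type <4 x ElemTy>.
llvm::VectorType *makeVec4(llvm::Type *ElemTy) {
  return llvm::VectorType::get(ElemTy, llvm::ElementCount::getFixed(4));
}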
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:74
self_iterator getIterator()
Definition: ilist_node.h:109
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:690
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which the loads in the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was already checked for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
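A condensed, hedged sketch of how the pass typically drives the BoUpSLP interface above for one candidate bundle. BoUpSLP lives in this translation unit, so the sketch assumes the file's own usings; the wrapper name trySLP and the break-even threshold of zero are illustrative simplifications, not the pass's actual driver:
static bool trySLP(BoUpSLP &R, ArrayRef<Value *> VL,
                   const SmallDenseSet<Value *> &UserIgnoreList) {
  R.buildTree(VL, UserIgnoreList);          // build the vectorizable tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                           // too small to pay for the overhead
  R.reorderTopToBottom();                   // pick profitable operand orders
  R.reorderBottomToTop();
  R.transformNodes();                       // target-specific node rewrites
  R.buildExternalUses();                    // scalars still used outside the tree
  R.computeMinimumValueSizes();             // demote to narrower types if legal
  InstructionCost Cost = R.getTreeCost();
  if (!Cost.isValid() || Cost >= 0)         // assumed break-even threshold
    return false;
  R.vectorizeTree();
  return true;
}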
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:103
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1469
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:777
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:836
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
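A minimal sketch of the PatternMatch combinators above; matchSingleUseAdd is an assumed helper name:
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
// Recognize V as (A + B) where the add has exactly one use; A and B are bound
// on a successful match.
bool matchSingleUseAdd(llvm::Value *V, llvm::Value *&A, llvm::Value *&B) {
  using namespace llvm::PatternMatch;
  return match(V, m_OneUse(m_Add(m_Value(A), m_Value(B))));
}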
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
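A hedged sketch of using getPointersDiff to test whether two loads are consecutive in memory (distance of exactly one element); the helper name is an assumption:
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <optional>
bool areConsecutiveLoads(llvm::LoadInst *A, llvm::LoadInst *B,
                         const llvm::DataLayout &DL,
                         llvm::ScalarEvolution &SE) {
  std::optional<int> Diff = llvm::getPointersDiff(
      A->getType(), A->getPointerOperand(), B->getType(),
      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;   // B is exactly one element after A
}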
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:456
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1715
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:128
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:950
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:540
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
Definition: LoopUtils.cpp:1166
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
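A short sketch of the range helpers listed in this section (all_of, enumerate) replacing explicit begin()/end() loops and manual index bookkeeping; the mask parameter is an arbitrary example:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
void inspectMask(const llvm::SmallVector<int, 8> &Mask) {
  bool AllDefined = llvm::all_of(Mask, [](int M) { return M >= 0; });
  (void)AllDefined;
  for (auto [Idx, Elt] : llvm::enumerate(Mask))   // index and element together
    (void)Idx, (void)Elt;
}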
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:40
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7062
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2059
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1754
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:419
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1235
constexpr int PoisonMaskElem
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
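A sketch of the value-tracking queries above, using the overload of computeKnownBits that returns a KnownBits value; the queryValueFacts helper is an assumed name:
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"
void queryValueFacts(const llvm::Value *V, const llvm::DataLayout &DL) {
  llvm::KnownBits Known = llvm::computeKnownBits(V, DL);
  bool NonNeg = Known.isNonNegative();              // sign bit known to be zero
  unsigned SignBits = llvm::ComputeNumSignBits(V, DL);
  (void)NonNeg; (void)SignBits;
}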
DWARFExpression::Operation Op
auto max_element(R &&Range)
Definition: STLExtras.h:1986
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
InstructionCost Cost
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:439
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:613
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2490
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:220
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1459
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const