LLVM 19.0.0git
SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
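// Illustrative example (not from the original source): given four consecutive
// i32 stores whose stored values share the same use-def structure, the pass
// may rewrite (operand names are hypothetical):
//
//   store i32 %a0, ptr %p0
//   store i32 %a1, ptr %p1        ==>   store <4 x i32> %v, ptr %p0
//   store i32 %a2, ptr %p2              (with %v built from the vectorized
//   store i32 %a3, ptr %p3               use-def tree of %a0..%a3)
//
// subject to the cost model implemented below.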
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116 static cl::opt<int>
117 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
118 cl::desc("Only vectorize if you gain more than this "
119 "number "));
120
122 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
123 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
124 "heuristics and makes vectorization decision via cost modeling."));
125
126static cl::opt<bool>
127ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
128 cl::desc("Attempt to vectorize horizontal reductions"));
129
131 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
132 cl::desc(
133 "Attempt to vectorize horizontal reductions feeding into a store"));
134
135// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
136// even if we match a reduction but do not vectorize in the end.
138 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
139 cl::desc("Allow optimization of original scalar identity operations on "
140 "matched horizontal reductions."));
141
142 static cl::opt<int>
143 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
144 cl::desc("Attempt to vectorize for this register size in bits"));
145
146 static cl::opt<unsigned>
147 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
148 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
149
150/// Limits the size of scheduling regions in a block.
151 /// It avoids long compile times for _very_ large blocks where vector
152/// instructions are spread over a wide range.
153/// This limit is way higher than needed by real-world functions.
154static cl::opt<int>
155ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
156 cl::desc("Limit the size of the SLP scheduling region per block"));
157
159 "slp-min-reg-size", cl::init(128), cl::Hidden,
160 cl::desc("Attempt to vectorize for this register size in bits"));
161
163 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
164 cl::desc("Limit the recursion depth when building a vectorizable tree"));
165
167 "slp-min-tree-size", cl::init(3), cl::Hidden,
168 cl::desc("Only vectorize small trees if they are fully vectorizable"));
169
170// The maximum depth that the look-ahead score heuristic will explore.
171// The higher this value, the higher the compilation time overhead.
173 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
174 cl::desc("The maximum look-ahead depth for operand reordering scores"));
175
176 // The maximum depth that the look-ahead score heuristic will explore
177 // when it is probing among candidates for vectorization tree roots.
178 // The higher this value, the higher the compilation time overhead, but unlike
179 // the similar limit for operand reordering this is used less frequently, so
180 // the impact of a higher value is less noticeable.
181 static cl::opt<int> RootLookAheadMaxDepth(
182 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
183 cl::desc("The maximum look-ahead depth for searching best rooting option"));
184
186 "slp-min-strided-loads", cl::init(2), cl::Hidden,
187 cl::desc("The minimum number of loads, which should be considered strided, "
188 "if the stride is > 1 or is runtime value"));
189
191 "slp-max-stride", cl::init(8), cl::Hidden,
192 cl::desc("The maximum stride, considered to be profitable."));
193
194static cl::opt<bool>
195 ViewSLPTree("view-slp-tree", cl::Hidden,
196 cl::desc("Display the SLP trees with Graphviz"));
197
199 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
200 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
201
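// Illustrative usage (not from the original source): the options above are
// ordinary cl::opt flags, so they can be passed to 'opt'; the input file name
// below is hypothetical:
//
//   opt -passes=slp-vectorizer -slp-threshold=0 -slp-max-reg-size=256 \
//       -S input.ll -o output.ll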
202// Limit the number of alias checks. The limit is chosen so that
203// it has no negative effect on the llvm benchmarks.
204static const unsigned AliasedCheckLimit = 10;
205
206// Limit of the number of uses for potentially transformed instructions/values,
207 // used in checks to avoid compile-time explosion.
208static constexpr int UsesLimit = 8;
209
210// Another limit for the alias checks: The maximum distance between load/store
211// instructions where alias checks are done.
212// This limit is useful for very large basic blocks.
213static const unsigned MaxMemDepDistance = 160;
214
215/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
216/// regions to be handled.
217static const int MinScheduleRegionSize = 16;
218
219/// Predicate for the element types that the SLP vectorizer supports.
220///
221/// The most important thing to filter here are types which are invalid in LLVM
222/// vectors. We also filter target specific types which have absolutely no
223 /// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
224/// avoids spending time checking the cost model and realizing that they will
225/// be inevitably scalarized.
226static bool isValidElementType(Type *Ty) {
227 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
228 !Ty->isPPC_FP128Ty();
229}
230
231/// \returns True if the value is a constant (but not globals/constant
232/// expressions).
233static bool isConstant(Value *V) {
234 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
235}
236
237 /// Checks if \p V is one of the vector-like instructions, i.e. undef, an
238 /// insertelement/extractelement with constant indices for a fixed vector type,
239 /// or an extractvalue instruction.
240 static bool isVectorLikeInstWithConstOps(Value *V) {
241 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
242 !isa<ExtractValueInst, UndefValue>(V))
243 return false;
244 auto *I = dyn_cast<Instruction>(V);
245 if (!I || isa<ExtractValueInst>(I))
246 return true;
247 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
248 return false;
249 if (isa<ExtractElementInst>(I))
250 return isConstant(I->getOperand(1));
251 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
252 return isConstant(I->getOperand(2));
253}
254
255#if !defined(NDEBUG)
256/// Print a short descriptor of the instruction bundle suitable for debug output.
257static std::string shortBundleName(ArrayRef<Value *> VL) {
258 std::string Result;
259 raw_string_ostream OS(Result);
260 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
261 OS.flush();
262 return Result;
263}
264#endif
265
266/// \returns true if all of the instructions in \p VL are in the same block or
267/// false otherwise.
268 static bool allSameBlock(ArrayRef<Value *> VL) {
269 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
270 if (!I0)
271 return false;
272 if (all_of(VL, isVectorLikeInstWithConstOps))
273 return true;
274
275 BasicBlock *BB = I0->getParent();
276 for (int I = 1, E = VL.size(); I < E; I++) {
277 auto *II = dyn_cast<Instruction>(VL[I]);
278 if (!II)
279 return false;
280
281 if (BB != II->getParent())
282 return false;
283 }
284 return true;
285}
286
287/// \returns True if all of the values in \p VL are constants (but not
288/// globals/constant expressions).
289 static bool allConstant(ArrayRef<Value *> VL) {
290 // Constant expressions and globals can't be vectorized like normal integer/FP
291 // constants.
292 return all_of(VL, isConstant);
293}
294
295/// \returns True if all of the values in \p VL are identical or some of them
296/// are UndefValue.
297static bool isSplat(ArrayRef<Value *> VL) {
298 Value *FirstNonUndef = nullptr;
299 for (Value *V : VL) {
300 if (isa<UndefValue>(V))
301 continue;
302 if (!FirstNonUndef) {
303 FirstNonUndef = V;
304 continue;
305 }
306 if (V != FirstNonUndef)
307 return false;
308 }
309 return FirstNonUndef != nullptr;
310}
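// Illustrative example (not from the original source): {%x, undef, %x, %x} is
// reported as a splat of %x, while {undef, undef} is not, because no non-undef
// value is found (FirstNonUndef stays null and the function returns false).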
311
312/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
313 static bool isCommutative(Instruction *I) {
314 if (auto *Cmp = dyn_cast<CmpInst>(I))
315 return Cmp->isCommutative();
316 if (auto *BO = dyn_cast<BinaryOperator>(I))
317 return BO->isCommutative() ||
318 (BO->getOpcode() == Instruction::Sub &&
319 !BO->hasNUsesOrMore(UsesLimit) &&
320 all_of(
321 BO->uses(),
322 [](const Use &U) {
323 // Commutative, if icmp eq/ne sub, 0
324 ICmpInst::Predicate Pred;
325 if (match(U.getUser(),
326 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
327 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
328 return true;
329 // Commutative, if abs(sub nsw, true) or abs(sub, false).
330 ConstantInt *Flag;
331 return match(U.getUser(),
332 m_Intrinsic<Intrinsic::abs>(
333 m_Specific(U.get()), m_ConstantInt(Flag))) &&
334 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
335 Flag->isOne());
336 })) ||
337 (BO->getOpcode() == Instruction::FSub &&
338 !BO->hasNUsesOrMore(UsesLimit) &&
339 all_of(BO->uses(), [](const Use &U) {
340 return match(U.getUser(),
341 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
342 }));
343 return I->isCommutative();
344}
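// Illustrative example (not from the original source) of the 'sub' special
// case above: a subtraction whose only users compare the result for equality
// with zero is treated as commutative, e.g.
//
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0
//
// since swapping %a and %b cannot change %c (the abs-based case is analogous,
// under the no-signed-wrap conditions checked above).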
345
346/// \returns inserting index of InsertElement or InsertValue instruction,
347/// using Offset as base offset for index.
348static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
349 unsigned Offset = 0) {
350 int Index = Offset;
351 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
352 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
353 if (!VT)
354 return std::nullopt;
355 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
356 if (!CI)
357 return std::nullopt;
358 if (CI->getValue().uge(VT->getNumElements()))
359 return std::nullopt;
360 Index *= VT->getNumElements();
361 Index += CI->getZExtValue();
362 return Index;
363 }
364
365 const auto *IV = cast<InsertValueInst>(InsertInst);
366 Type *CurrentType = IV->getType();
367 for (unsigned I : IV->indices()) {
368 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
369 Index *= ST->getNumElements();
370 CurrentType = ST->getElementType(I);
371 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
372 Index *= AT->getNumElements();
373 CurrentType = AT->getElementType();
374 } else {
375 return std::nullopt;
376 }
377 Index += I;
378 }
379 return Index;
380}
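// Illustrative example (not from the original source): for an insertvalue into
// an aggregate of type {[2 x i32], [2 x i32]} with indices {1, 0}, the
// flattened index is (0 * 2 + 1) * 2 + 0 = 2, i.e. the first element of the
// second sub-array.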
381
382namespace {
383/// Specifies the way the mask should be analyzed for undefs/poisonous elements
384/// in the shuffle mask.
385enum class UseMask {
386 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
387 ///< check for the mask elements for the first argument (mask
388 ///< indices are in range [0:VF)).
389 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
390 ///< for the mask elements for the second argument (mask indices
391 ///< are in range [VF:2*VF))
392 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
393 ///< future shuffle elements and mark them as ones as being used
394 ///< in future. Non-undef elements are considered as unused since
395 ///< they're already marked as used in the mask.
396};
397} // namespace
398
399/// Prepares a use bitset for the given mask either for the first argument or
400/// for the second.
401 static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
402 UseMask MaskArg) {
403 SmallBitVector UseMask(VF, true);
404 for (auto [Idx, Value] : enumerate(Mask)) {
405 if (Value == PoisonMaskElem) {
406 if (MaskArg == UseMask::UndefsAsMask)
407 UseMask.reset(Idx);
408 continue;
409 }
410 if (MaskArg == UseMask::FirstArg && Value < VF)
411 UseMask.reset(Value);
412 else if (MaskArg == UseMask::SecondArg && Value >= VF)
413 UseMask.reset(Value - VF);
414 }
415 return UseMask;
416}
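// Illustrative example (not from the original source): with VF = 4 and
// Mask = {0, 5, -1, 2} under UseMask::FirstArg, bits 0 and 2 are cleared
// because those lanes of the first vector are read by the mask, while bits 1
// and 3 stay set, marking lanes the mask never references.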
417
418/// Checks if the given value is actually an undefined constant vector.
419/// Also, if the \p UseMask is not empty, tries to check if the non-masked
420/// elements actually mask the insertelement buildvector, if any.
421template <bool IsPoisonOnly = false>
422 static SmallBitVector isUndefVector(const Value *V,
423 const SmallBitVector &UseMask = {}) {
424 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
425 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
426 if (isa<T>(V))
427 return Res;
428 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
429 if (!VecTy)
430 return Res.reset();
431 auto *C = dyn_cast<Constant>(V);
432 if (!C) {
433 if (!UseMask.empty()) {
434 const Value *Base = V;
435 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
436 Base = II->getOperand(0);
437 if (isa<T>(II->getOperand(1)))
438 continue;
439 std::optional<unsigned> Idx = getInsertIndex(II);
440 if (!Idx) {
441 Res.reset();
442 return Res;
443 }
444 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
445 Res.reset(*Idx);
446 }
447 // TODO: Add analysis for shuffles here too.
448 if (V == Base) {
449 Res.reset();
450 } else {
451 SmallBitVector SubMask(UseMask.size(), false);
452 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
453 }
454 } else {
455 Res.reset();
456 }
457 return Res;
458 }
459 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
460 if (Constant *Elem = C->getAggregateElement(I))
461 if (!isa<T>(Elem) &&
462 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
463 Res.reset(I);
464 }
465 return Res;
466}
467
468/// Checks if the vector of instructions can be represented as a shuffle, like:
469/// %x0 = extractelement <4 x i8> %x, i32 0
470/// %x3 = extractelement <4 x i8> %x, i32 3
471/// %y1 = extractelement <4 x i8> %y, i32 1
472/// %y2 = extractelement <4 x i8> %y, i32 2
473/// %x0x0 = mul i8 %x0, %x0
474/// %x3x3 = mul i8 %x3, %x3
475/// %y1y1 = mul i8 %y1, %y1
476/// %y2y2 = mul i8 %y2, %y2
477/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
478/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
479/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
480/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
481/// ret <4 x i8> %ins4
482/// can be transformed into:
483/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
484/// i32 6>
485/// %2 = mul <4 x i8> %1, %1
486/// ret <4 x i8> %2
487/// Mask will return the Shuffle Mask equivalent to the extracted elements.
488/// TODO: Can we split off and reuse the shuffle mask detection from
489/// ShuffleVectorInst/getShuffleCost?
490static std::optional<TargetTransformInfo::ShuffleKind>
491 isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
492 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
493 if (It == VL.end())
494 return std::nullopt;
495 auto *EI0 = cast<ExtractElementInst>(*It);
496 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
497 return std::nullopt;
498 unsigned Size =
499 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
500 Value *Vec1 = nullptr;
501 Value *Vec2 = nullptr;
502 enum ShuffleMode { Unknown, Select, Permute };
503 ShuffleMode CommonShuffleMode = Unknown;
504 Mask.assign(VL.size(), PoisonMaskElem);
505 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
506 // Undef can be represented as an undef element in a vector.
507 if (isa<UndefValue>(VL[I]))
508 continue;
509 auto *EI = cast<ExtractElementInst>(VL[I]);
510 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
511 return std::nullopt;
512 auto *Vec = EI->getVectorOperand();
513 // We can extractelement from undef or poison vector.
514 if (isUndefVector(Vec).all())
515 continue;
516 // All vector operands must have the same number of vector elements.
517 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
518 return std::nullopt;
519 if (isa<UndefValue>(EI->getIndexOperand()))
520 continue;
521 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
522 if (!Idx)
523 return std::nullopt;
524 // Undefined behavior if Idx is negative or >= Size.
525 if (Idx->getValue().uge(Size))
526 continue;
527 unsigned IntIdx = Idx->getValue().getZExtValue();
528 Mask[I] = IntIdx;
529 // For correct shuffling we have to have at most 2 different vector operands
530 // in all extractelement instructions.
531 if (!Vec1 || Vec1 == Vec) {
532 Vec1 = Vec;
533 } else if (!Vec2 || Vec2 == Vec) {
534 Vec2 = Vec;
535 Mask[I] += Size;
536 } else {
537 return std::nullopt;
538 }
539 if (CommonShuffleMode == Permute)
540 continue;
541 // If the extract index is not the same as the operation number, it is a
542 // permutation.
543 if (IntIdx != I) {
544 CommonShuffleMode = Permute;
545 continue;
546 }
547 CommonShuffleMode = Select;
548 }
549 // If we're not crossing lanes in different vectors, consider it as blending.
550 if (CommonShuffleMode == Select && Vec2)
551 return TargetTransformInfo::SK_Select;
552 // If Vec2 was never used, we have a permutation of a single vector, otherwise
553 // we have a permutation of 2 vectors.
554 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
555 : TargetTransformInfo::SK_PermuteSingleSrc;
556 }
557
558/// \returns True if Extract{Value,Element} instruction extracts element Idx.
559static std::optional<unsigned> getExtractIndex(Instruction *E) {
560 unsigned Opcode = E->getOpcode();
561 assert((Opcode == Instruction::ExtractElement ||
562 Opcode == Instruction::ExtractValue) &&
563 "Expected extractelement or extractvalue instruction.");
564 if (Opcode == Instruction::ExtractElement) {
565 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
566 if (!CI)
567 return std::nullopt;
568 return CI->getZExtValue();
569 }
570 auto *EI = cast<ExtractValueInst>(E);
571 if (EI->getNumIndices() != 1)
572 return std::nullopt;
573 return *EI->idx_begin();
574}
575
576namespace {
577
578/// Main data required for vectorization of instructions.
579struct InstructionsState {
580 /// The very first instruction in the list with the main opcode.
581 Value *OpValue = nullptr;
582
583 /// The main/alternate instruction.
584 Instruction *MainOp = nullptr;
585 Instruction *AltOp = nullptr;
586
587 /// The main/alternate opcodes for the list of instructions.
588 unsigned getOpcode() const {
589 return MainOp ? MainOp->getOpcode() : 0;
590 }
591
592 unsigned getAltOpcode() const {
593 return AltOp ? AltOp->getOpcode() : 0;
594 }
595
596 /// Some of the instructions in the list have alternate opcodes.
597 bool isAltShuffle() const { return AltOp != MainOp; }
598
599 bool isOpcodeOrAlt(Instruction *I) const {
600 unsigned CheckedOpcode = I->getOpcode();
601 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
602 }
603
604 InstructionsState() = delete;
605 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
606 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
607};
608
609} // end anonymous namespace
610
611/// Chooses the correct key for scheduling data. If \p Op has the same (or
612/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
613/// OpValue.
614static Value *isOneOf(const InstructionsState &S, Value *Op) {
615 auto *I = dyn_cast<Instruction>(Op);
616 if (I && S.isOpcodeOrAlt(I))
617 return Op;
618 return S.OpValue;
619}
620
621/// \returns true if \p Opcode is allowed as part of the main/alternate
622/// instruction for SLP vectorization.
623///
624/// Example of unsupported opcode is SDIV that can potentially cause UB if the
625/// "shuffled out" lane would result in division by zero.
626static bool isValidForAlternation(unsigned Opcode) {
627 if (Instruction::isIntDivRem(Opcode))
628 return false;
629
630 return true;
631}
632
633static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
634 const TargetLibraryInfo &TLI,
635 unsigned BaseIndex = 0);
636
637/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
638/// compatible instructions or constants, or just some other regular values.
639static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
640 Value *Op1, const TargetLibraryInfo &TLI) {
641 return (isConstant(BaseOp0) && isConstant(Op0)) ||
642 (isConstant(BaseOp1) && isConstant(Op1)) ||
643 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
644 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
645 BaseOp0 == Op0 || BaseOp1 == Op1 ||
646 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
647 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
648}
649
650/// \returns true if a compare instruction \p CI has similar "look" and
651/// same predicate as \p BaseCI, "as is" or with its operands and predicate
652/// swapped, false otherwise.
653static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
654 const TargetLibraryInfo &TLI) {
655 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
656 "Assessing comparisons of different types?");
657 CmpInst::Predicate BasePred = BaseCI->getPredicate();
658 CmpInst::Predicate Pred = CI->getPredicate();
659 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
660
661 Value *BaseOp0 = BaseCI->getOperand(0);
662 Value *BaseOp1 = BaseCI->getOperand(1);
663 Value *Op0 = CI->getOperand(0);
664 Value *Op1 = CI->getOperand(1);
665
666 return (BasePred == Pred &&
667 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
668 (BasePred == SwappedPred &&
669 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
670}
671
672/// \returns analysis of the Instructions in \p VL described in
673/// InstructionsState, the Opcode that we suppose the whole list
674/// could be vectorized even if its structure is diverse.
675static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
676 const TargetLibraryInfo &TLI,
677 unsigned BaseIndex) {
678 // Make sure these are all Instructions.
679 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
680 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
681
682 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
683 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
684 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
685 CmpInst::Predicate BasePred =
686 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
687 : CmpInst::BAD_ICMP_PREDICATE;
688 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
689 unsigned AltOpcode = Opcode;
690 unsigned AltIndex = BaseIndex;
691
692 bool SwappedPredsCompatible = [&]() {
693 if (!IsCmpOp)
694 return false;
695 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
696 UniquePreds.insert(BasePred);
697 UniqueNonSwappedPreds.insert(BasePred);
698 for (Value *V : VL) {
699 auto *I = dyn_cast<CmpInst>(V);
700 if (!I)
701 return false;
702 CmpInst::Predicate CurrentPred = I->getPredicate();
703 CmpInst::Predicate SwappedCurrentPred =
704 CmpInst::getSwappedPredicate(CurrentPred);
705 UniqueNonSwappedPreds.insert(CurrentPred);
706 if (!UniquePreds.contains(CurrentPred) &&
707 !UniquePreds.contains(SwappedCurrentPred))
708 UniquePreds.insert(CurrentPred);
709 }
710 // Total number of predicates > 2, but if consider swapped predicates
711 // compatible only 2, consider swappable predicates as compatible opcodes,
712 // not alternate.
713 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
714 }();
715 // Check for one alternate opcode from another BinaryOperator.
716 // TODO - generalize to support all operators (types, calls etc.).
717 auto *IBase = cast<Instruction>(VL[BaseIndex]);
718 Intrinsic::ID BaseID = 0;
719 SmallVector<VFInfo> BaseMappings;
720 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
721 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
722 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
723 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
724 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
725 }
726 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
727 auto *I = cast<Instruction>(VL[Cnt]);
728 unsigned InstOpcode = I->getOpcode();
729 if (IsBinOp && isa<BinaryOperator>(I)) {
730 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
731 continue;
732 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
733 isValidForAlternation(Opcode)) {
734 AltOpcode = InstOpcode;
735 AltIndex = Cnt;
736 continue;
737 }
738 } else if (IsCastOp && isa<CastInst>(I)) {
739 Value *Op0 = IBase->getOperand(0);
740 Type *Ty0 = Op0->getType();
741 Value *Op1 = I->getOperand(0);
742 Type *Ty1 = Op1->getType();
743 if (Ty0 == Ty1) {
744 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
745 continue;
746 if (Opcode == AltOpcode) {
747 assert(isValidForAlternation(Opcode) &&
748 isValidForAlternation(InstOpcode) &&
749 "Cast isn't safe for alternation, logic needs to be updated!");
750 AltOpcode = InstOpcode;
751 AltIndex = Cnt;
752 continue;
753 }
754 }
755 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
756 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
757 Type *Ty0 = BaseInst->getOperand(0)->getType();
758 Type *Ty1 = Inst->getOperand(0)->getType();
759 if (Ty0 == Ty1) {
760 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
761 // Check for compatible operands. If the corresponding operands are not
762 // compatible - need to perform alternate vectorization.
763 CmpInst::Predicate CurrentPred = Inst->getPredicate();
764 CmpInst::Predicate SwappedCurrentPred =
765 CmpInst::getSwappedPredicate(CurrentPred);
766
767 if ((E == 2 || SwappedPredsCompatible) &&
768 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
769 continue;
770
771 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
772 continue;
773 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
774 if (AltIndex != BaseIndex) {
775 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
776 continue;
777 } else if (BasePred != CurrentPred) {
778 assert(
779 isValidForAlternation(InstOpcode) &&
780 "CmpInst isn't safe for alternation, logic needs to be updated!");
781 AltIndex = Cnt;
782 continue;
783 }
784 CmpInst::Predicate AltPred = AltInst->getPredicate();
785 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
786 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
787 continue;
788 }
789 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
790 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
791 if (Gep->getNumOperands() != 2 ||
792 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
793 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
794 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
795 if (!isVectorLikeInstWithConstOps(EI))
796 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
797 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
798 auto *BaseLI = cast<LoadInst>(IBase);
799 if (!LI->isSimple() || !BaseLI->isSimple())
800 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
801 } else if (auto *Call = dyn_cast<CallInst>(I)) {
802 auto *CallBase = cast<CallInst>(IBase);
803 if (Call->getCalledFunction() != CallBase->getCalledFunction())
804 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
805 if (Call->hasOperandBundles() &&
806 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
807 Call->op_begin() + Call->getBundleOperandsEndIndex(),
808 CallBase->op_begin() +
809 CallBase->getBundleOperandsStartIndex()))
810 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
811 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
812 if (ID != BaseID)
813 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
814 if (!ID) {
815 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
816 if (Mappings.size() != BaseMappings.size() ||
817 Mappings.front().ISA != BaseMappings.front().ISA ||
818 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
819 Mappings.front().VectorName != BaseMappings.front().VectorName ||
820 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
821 Mappings.front().Shape.Parameters !=
822 BaseMappings.front().Shape.Parameters)
823 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
824 }
825 }
826 continue;
827 }
828 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
829 }
830
831 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
832 cast<Instruction>(VL[AltIndex]));
833}
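// Illustrative example (not from the original source): for a bundle
// VL = {add, sub, add, sub} over compatible operands, the returned
// InstructionsState has MainOp = the first add and AltOp = the first sub, so
// isAltShuffle() is true and the bundle is a candidate for add/sub alternate
// vectorization.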
834
835/// \returns true if all of the values in \p VL have the same type or false
836/// otherwise.
837 static bool allSameType(ArrayRef<Value *> VL) {
838 Type *Ty = VL.front()->getType();
839 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
840}
841
842/// \returns True if in-tree use also needs extract. This refers to
843/// possible scalar operand in vectorized instruction.
844static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
845 TargetLibraryInfo *TLI) {
846 unsigned Opcode = UserInst->getOpcode();
847 switch (Opcode) {
848 case Instruction::Load: {
849 LoadInst *LI = cast<LoadInst>(UserInst);
850 return (LI->getPointerOperand() == Scalar);
851 }
852 case Instruction::Store: {
853 StoreInst *SI = cast<StoreInst>(UserInst);
854 return (SI->getPointerOperand() == Scalar);
855 }
856 case Instruction::Call: {
857 CallInst *CI = cast<CallInst>(UserInst);
858 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
859 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
860 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
861 Arg.value().get() == Scalar;
862 });
863 }
864 default:
865 return false;
866 }
867}
868
869 /// \returns the AA location that is being accessed by the instruction.
870 static MemoryLocation getLocation(Instruction *I) {
871 if (StoreInst *SI = dyn_cast<StoreInst>(I))
872 return MemoryLocation::get(SI);
873 if (LoadInst *LI = dyn_cast<LoadInst>(I))
874 return MemoryLocation::get(LI);
875 return MemoryLocation();
876}
877
878/// \returns True if the instruction is not a volatile or atomic load/store.
879static bool isSimple(Instruction *I) {
880 if (LoadInst *LI = dyn_cast<LoadInst>(I))
881 return LI->isSimple();
882 if (StoreInst *SI = dyn_cast<StoreInst>(I))
883 return SI->isSimple();
884 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
885 return !MI->isVolatile();
886 return true;
887}
888
889/// Shuffles \p Mask in accordance with the given \p SubMask.
890/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
891/// one but two input vectors.
892static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
893 bool ExtendingManyInputs = false) {
894 if (SubMask.empty())
895 return;
896 assert(
897 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
898 // Check if input scalars were extended to match the size of other node.
899 (SubMask.size() == Mask.size() &&
900 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
901 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
902 "SubMask with many inputs support must be larger than the mask.");
903 if (Mask.empty()) {
904 Mask.append(SubMask.begin(), SubMask.end());
905 return;
906 }
907 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
908 int TermValue = std::min(Mask.size(), SubMask.size());
909 for (int I = 0, E = SubMask.size(); I < E; ++I) {
910 if (SubMask[I] == PoisonMaskElem ||
911 (!ExtendingManyInputs &&
912 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
913 continue;
914 NewMask[I] = Mask[SubMask[I]];
915 }
916 Mask.swap(NewMask);
917}
918
919/// Order may have elements assigned special value (size) which is out of
920 /// bounds. Such indices only appear in places that correspond to undef values
921 /// (see canReuseExtract for details) and are used to prevent undef values from
922 /// affecting the operand ordering.
923 /// The first loop below simply finds all unused indices and then the next loop
924 /// nest assigns these indices to the undef value positions.
925 /// As an example below Order has two undef positions and they have assigned
926 /// values 3 and 7 respectively:
927 /// before: 6 9 5 4 9 2 1 0
928 /// after: 6 3 5 4 7 2 1 0
929 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
930 const unsigned Sz = Order.size();
931 SmallBitVector UnusedIndices(Sz, /*t=*/true);
932 SmallBitVector MaskedIndices(Sz);
933 for (unsigned I = 0; I < Sz; ++I) {
934 if (Order[I] < Sz)
935 UnusedIndices.reset(Order[I]);
936 else
937 MaskedIndices.set(I);
938 }
939 if (MaskedIndices.none())
940 return;
941 assert(UnusedIndices.count() == MaskedIndices.count() &&
942 "Non-synced masked/available indices.");
943 int Idx = UnusedIndices.find_first();
944 int MIdx = MaskedIndices.find_first();
945 while (MIdx >= 0) {
946 assert(Idx >= 0 && "Indices must be synced.");
947 Order[MIdx] = Idx;
948 Idx = UnusedIndices.find_next(Idx);
949 MIdx = MaskedIndices.find_next(MIdx);
950 }
951}
952
953namespace llvm {
954
955 static void inversePermutation(ArrayRef<unsigned> Indices,
956 SmallVectorImpl<int> &Mask) {
957 Mask.clear();
958 const unsigned E = Indices.size();
959 Mask.resize(E, PoisonMaskElem);
960 for (unsigned I = 0; I < E; ++I)
961 Mask[Indices[I]] = I;
962}
963
964/// Reorders the list of scalars in accordance with the given \p Mask.
965 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
966 ArrayRef<int> Mask) {
967 assert(!Mask.empty() && "Expected non-empty mask.");
968 SmallVector<Value *> Prev(Scalars.size(),
969 UndefValue::get(Scalars.front()->getType()));
970 Prev.swap(Scalars);
971 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
972 if (Mask[I] != PoisonMaskElem)
973 Scalars[Mask[I]] = Prev[I];
974}
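// Illustrative example (not from the original source): for
// Scalars = {a, b, c, d} and Mask = {2, 0, -1, 1} the result is
// {b, d, a, undef}: a moves to index 2, b to index 0, d to index 1, c is
// dropped because its mask element is PoisonMaskElem, and index 3 keeps the
// undef placeholder since no mask element targets it.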
975
976 /// Checks if the provided value does not require scheduling. It does not
977 /// require scheduling if this is not an instruction, or if it is an instruction
978 /// that does not read/write memory and all of its operands are either not
979 /// instructions, phi nodes, or instructions from different blocks.
980 static bool areAllOperandsNonInsts(Value *V) {
981 auto *I = dyn_cast<Instruction>(V);
982 if (!I)
983 return true;
984 return !mayHaveNonDefUseDependency(*I) &&
985 all_of(I->operands(), [I](Value *V) {
986 auto *IO = dyn_cast<Instruction>(V);
987 if (!IO)
988 return true;
989 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
990 });
991}
992
993 /// Checks if the provided value does not require scheduling. It does not
994 /// require scheduling if this is not an instruction, or if it is an instruction
995 /// that does not read/write memory and all of its users are phi nodes or
996 /// instructions from different blocks.
997static bool isUsedOutsideBlock(Value *V) {
998 auto *I = dyn_cast<Instruction>(V);
999 if (!I)
1000 return true;
1001 // Limits the number of uses to save compile time.
1002 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1003 all_of(I->users(), [I](User *U) {
1004 auto *IU = dyn_cast<Instruction>(U);
1005 if (!IU)
1006 return true;
1007 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1008 });
1009}
1010
1011/// Checks if the specified value does not require scheduling. It does not
1012/// require scheduling if all operands and all users do not need to be scheduled
1013 /// in the current basic block.
1014 static bool doesNotNeedToBeScheduled(Value *V) {
1015 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1016 }
1017
1018/// Checks if the specified array of instructions does not require scheduling.
1019 /// It is so if either all instructions have operands that do not require
1020 /// scheduling, or their users do not require scheduling since they are phis
1021 /// or are in other basic blocks.
1022 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1023 return !VL.empty() &&
1024 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1025 }
1026
1027namespace slpvectorizer {
1028
1029/// Bottom Up SLP Vectorizer.
1030class BoUpSLP {
1031 struct TreeEntry;
1032 struct ScheduleData;
1033 class ShuffleCostEstimator;
1034 class ShuffleInstructionBuilder;
1035
1036public:
1037 /// Tracks the state in which we can represent the loads of the given sequence.
1038 enum class LoadsState {
1039 Gather,
1040 Vectorize,
1041 ScatterVectorize,
1042 StridedVectorize
1043 };
1044
1052
1053 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1054 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1055 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1056 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1057 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1058 AC(AC), DB(DB), DL(DL), ORE(ORE),
1059 Builder(Se->getContext(), TargetFolder(*DL)) {
1060 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1061 // Use the vector register size specified by the target unless overridden
1062 // by a command-line option.
1063 // TODO: It would be better to limit the vectorization factor based on
1064 // data type rather than just register size. For example, x86 AVX has
1065 // 256-bit registers, but it does not support integer operations
1066 // at that width (that requires AVX2).
1067 if (MaxVectorRegSizeOption.getNumOccurrences())
1068 MaxVecRegSize = MaxVectorRegSizeOption;
1069 else
1070 MaxVecRegSize =
1071 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1072 .getFixedValue();
1073
1074 if (MinVectorRegSizeOption.getNumOccurrences())
1075 MinVecRegSize = MinVectorRegSizeOption;
1076 else
1077 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1078 }
1079
1080 /// Vectorize the tree that starts with the elements in \p VL.
1081 /// Returns the vectorized root.
1082 Value *vectorizeTree();
1083
1084 /// Vectorize the tree but with the list of externally used values \p
1085 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1086 /// generated extractvalue instructions.
1087 /// \param ReplacedExternals contains the list of replaced external values
1088 /// {scalar, replace} after emitting extractelement for external uses.
1089 Value *
1090 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1091 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1092 Instruction *ReductionRoot = nullptr);
1093
1094 /// \returns the cost incurred by unwanted spills and fills, caused by
1095 /// holding live values over call sites.
1096 InstructionCost getSpillCost() const;
1097
1098 /// \returns the vectorization cost of the subtree that starts at \p VL.
1099 /// A negative number means that this is profitable.
1100 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1101
1102 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1103 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1104 void buildTree(ArrayRef<Value *> Roots,
1105 const SmallDenseSet<Value *> &UserIgnoreLst);
1106
1107 /// Construct a vectorizable tree that starts at \p Roots.
1108 void buildTree(ArrayRef<Value *> Roots);
1109
1110 /// Returns whether the root node has in-tree uses.
1111 bool doesRootHaveInTreeUses() const {
1112 return !VectorizableTree.empty() &&
1113 !VectorizableTree.front()->UserTreeIndices.empty();
1114 }
1115
1116 /// Return the scalars of the root node.
1117 ArrayRef<Value *> getRootNodeScalars() const {
1118 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1119 return VectorizableTree.front()->Scalars;
1120 }
1121
1122 /// Builds external uses of the vectorized scalars, i.e. the list of
1123 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1124 /// ExternallyUsedValues contains additional list of external uses to handle
1125 /// vectorization of reductions.
1126 void
1127 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1128
1129 /// Transforms graph nodes to target specific representations, if profitable.
1130 void transformNodes();
1131
1132 /// Clear the internal data structures that are created by 'buildTree'.
1133 void deleteTree() {
1134 VectorizableTree.clear();
1135 ScalarToTreeEntry.clear();
1136 MultiNodeScalars.clear();
1137 MustGather.clear();
1138 EntryToLastInstruction.clear();
1139 ExternalUses.clear();
1140 ExternalUsesAsGEPs.clear();
1141 for (auto &Iter : BlocksSchedules) {
1142 BlockScheduling *BS = Iter.second.get();
1143 BS->clear();
1144 }
1145 MinBWs.clear();
1146 ReductionBitWidth = 0;
1147 CastMaxMinBWSizes.reset();
1148 ExtraBitWidthNodes.clear();
1149 InstrElementSize.clear();
1150 UserIgnoreList = nullptr;
1151 PostponedGathers.clear();
1152 ValueToGatherNodes.clear();
1153 }
1154
1155 unsigned getTreeSize() const { return VectorizableTree.size(); }
1156
1157 /// Perform LICM and CSE on the newly generated gather sequences.
1158 void optimizeGatherSequence();
1159
1160 /// Checks if the specified gather tree entry \p TE can be represented as a
1161 /// shuffled vector entry + (possibly) permutation with other gathers. It
1162 /// implements the checks only for possibly ordered scalars (Loads,
1163 /// ExtractElement, ExtractValue), which can be part of the graph.
1164 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1165
1166 /// Sort loads into increasing pointers offsets to allow greater clustering.
1167 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1168
1169 /// Gets reordering data for the given tree entry. If the entry is vectorized
1170 /// - just return ReorderIndices, otherwise check if the scalars can be
1171 /// reordered and return the most optimal order.
1172 /// \return std::nullopt if ordering is not important, empty order, if
1173 /// identity order is important, or the actual order.
1174 /// \param TopToBottom If true, include the order of vectorized stores and
1175 /// insertelement nodes, otherwise skip them.
1176 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1177 bool TopToBottom);
1178
1179 /// Reorders the current graph to the most profitable order starting from the
1180 /// root node to the leaf nodes. The best order is chosen only from the nodes
1181 /// of the same size (vectorization factor). Smaller nodes are considered
1182 /// parts of a subgraph with a smaller VF and they are reordered independently.
1183 /// We can do this because we still need to extend smaller nodes to the wider
1184 /// VF and we can merge reordering shuffles with the widening shuffles.
1185 void reorderTopToBottom();
1186
1187 /// Reorders the current graph to the most profitable order starting from
1188 /// leaves to the root. It allows rotating small subgraphs and reduces the
1189 /// number of reshuffles if the leaf nodes use the same order. In this case we
1190 /// can merge the orders and just shuffle the user node instead of shuffling its
1191 /// operands. Plus, even if the leaf nodes have different orders, it allows
1192 /// sinking reordering in the graph closer to the root node and merging it
1193 /// later during analysis.
1194 void reorderBottomToTop(bool IgnoreReorder = false);
1195
1196 /// \return The vector element size in bits to use when vectorizing the
1197 /// expression tree ending at \p V. If V is a store, the size is the width of
1198 /// the stored value. Otherwise, the size is the width of the largest loaded
1199 /// value reaching V. This method is used by the vectorizer to calculate
1200 /// vectorization factors.
1201 unsigned getVectorElementSize(Value *V);
1202
1203 /// Compute the minimum type sizes required to represent the entries in a
1204 /// vectorizable tree.
1205 void computeMinimumValueSizes();
1206
1207 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1208 unsigned getMaxVecRegSize() const {
1209 return MaxVecRegSize;
1210 }
1211
1212 // \returns minimum vector register size as set by cl::opt.
1213 unsigned getMinVecRegSize() const {
1214 return MinVecRegSize;
1215 }
1216
1217 unsigned getMinVF(unsigned Sz) const {
1218 return std::max(2U, getMinVecRegSize() / Sz);
1219 }
1220
1221 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1222 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1223 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1224 return MaxVF ? MaxVF : UINT_MAX;
1225 }
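// Illustrative arithmetic (not from the original source), assuming the default
// MinVecRegSize of 128 set above: getMinVF(32) returns max(2, 128 / 32) = 4,
// i.e. at least four 32-bit elements per vector; and with -slp-max-vf left
// unset, getMaximumVF() falls back to TTI->getMaximumVF(ElemWidth, Opcode), or
// UINT_MAX if that returns 0.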
1226
1227 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1228 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1229 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1230 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1231 ///
1232 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1233 unsigned canMapToVector(Type *T) const;
1234
1235 /// \returns True if the VectorizableTree is both tiny and not fully
1236 /// vectorizable. We do not vectorize such trees.
1237 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1238
1239 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1240 /// can be load combined in the backend. Load combining may not be allowed in
1241 /// the IR optimizer, so we do not want to alter the pattern. For example,
1242 /// partially transforming a scalar bswap() pattern into vector code is
1243 /// effectively impossible for the backend to undo.
1244 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1245 /// may not be necessary.
1246 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1247
1248 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1249 /// can be load combined in the backend. Load combining may not be allowed in
1250 /// the IR optimizer, so we do not want to alter the pattern. For example,
1251 /// partially transforming a scalar bswap() pattern into vector code is
1252 /// effectively impossible for the backend to undo.
1253 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1254 /// may not be necessary.
1255 bool isLoadCombineCandidate() const;
1256
1257 /// Checks if the given array of loads can be represented as a vectorized,
1258 /// scatter or just simple gather.
1259 /// \param VL list of loads.
1260 /// \param VL0 main load value.
1261 /// \param Order returned order of load instructions.
1262 /// \param PointerOps returned list of pointer operands.
1263 /// \param TryRecursiveCheck used to check if long masked gather can be
1264 /// represented as a series of loads/insert subvector, if profitable.
1265 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1266 SmallVectorImpl<unsigned> &Order,
1267 SmallVectorImpl<Value *> &PointerOps,
1268 bool TryRecursiveCheck = true) const;
1269
1270 OptimizationRemarkEmitter *getORE() { return ORE; }
1271
1272 /// This structure holds any data we need about the edges being traversed
1273 /// during buildTree_rec(). We keep track of:
1274 /// (i) the user TreeEntry index, and
1275 /// (ii) the index of the edge.
1276 struct EdgeInfo {
1277 EdgeInfo() = default;
1278 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1279 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1280 /// The user TreeEntry.
1281 TreeEntry *UserTE = nullptr;
1282 /// The operand index of the use.
1283 unsigned EdgeIdx = UINT_MAX;
1284#ifndef NDEBUG
1285 friend inline raw_ostream &operator<<(raw_ostream &OS,
1286 const BoUpSLP::EdgeInfo &EI) {
1287 EI.dump(OS);
1288 return OS;
1289 }
1290 /// Debug print.
1291 void dump(raw_ostream &OS) const {
1292 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1293 << " EdgeIdx:" << EdgeIdx << "}";
1294 }
1295 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1296#endif
1297 bool operator == (const EdgeInfo &Other) const {
1298 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1299 }
1300 };
1301
1302 /// A helper class used for scoring candidates for two consecutive lanes.
1303 class LookAheadHeuristics {
1304 const TargetLibraryInfo &TLI;
1305 const DataLayout &DL;
1306 ScalarEvolution &SE;
1307 const BoUpSLP &R;
1308 int NumLanes; // Total number of lanes (aka vectorization factor).
1309 int MaxLevel; // The maximum recursion depth for accumulating score.
1310
1311 public:
1312 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1313 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1314 int MaxLevel)
1315 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1316 MaxLevel(MaxLevel) {}
1317
1318 // The hard-coded scores listed here are not very important, though it shall
1319 // be higher for better matches to improve the resulting cost. When
1320 // computing the scores of matching one sub-tree with another, we are
1321 // basically counting the number of values that are matching. So even if all
1322 // scores are set to 1, we would still get a decent matching result.
1323 // However, sometimes we have to break ties. For example we may have to
1324 // choose between matching loads vs matching opcodes. This is what these
1325 // scores are helping us with: they provide the order of preference. Also,
1326 // this is important if the scalar is externally used or used in another
1327 // tree entry node in the different lane.
1328
1329 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1330 static const int ScoreConsecutiveLoads = 4;
1331 /// The same load multiple times. This should have a better score than
1332 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1333 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
1334 /// for a vector load plus 1.0 for a broadcast.
1335 static const int ScoreSplatLoads = 3;
1336 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1337 static const int ScoreReversedLoads = 3;
1338 /// A load candidate for masked gather.
1339 static const int ScoreMaskedGatherCandidate = 1;
1340 /// ExtractElementInst from same vector and consecutive indexes.
1341 static const int ScoreConsecutiveExtracts = 4;
1342 /// ExtractElementInst from same vector and reversed indices.
1343 static const int ScoreReversedExtracts = 3;
1344 /// Constants.
1345 static const int ScoreConstants = 2;
1346 /// Instructions with the same opcode.
1347 static const int ScoreSameOpcode = 2;
1348 /// Instructions with alt opcodes (e.g, add + sub).
1349 static const int ScoreAltOpcodes = 1;
1350 /// Identical instructions (a.k.a. splat or broadcast).
1351 static const int ScoreSplat = 1;
1352 /// Matching with an undef is preferable to failing.
1353 static const int ScoreUndef = 1;
1354 /// Score for failing to find a decent match.
1355 static const int ScoreFail = 0;
1356 /// Score if all users are vectorized.
1357 static const int ScoreAllUserVectorized = 1;
1358
1359 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1360 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1361 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1362 /// MainAltOps.
1363 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1364 ArrayRef<Value *> MainAltOps) const {
1365 if (!isValidElementType(V1->getType()) ||
1366 !isValidElementType(V2->getType()))
1367 return LookAheadHeuristics::ScoreFail;
1368
1369 if (V1 == V2) {
1370 if (isa<LoadInst>(V1)) {
1371 // Returns true if the users of V1 and V2 won't need to be extracted.
1372 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1373 // Bail out if we have too many uses to save compilation time.
1374 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1375 return false;
1376
1377 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1378 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1379 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1380 });
1381 };
1382 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1383 };
1384 // A broadcast of a load can be cheaper on some targets.
1385 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1386 ElementCount::getFixed(NumLanes)) &&
1387 ((int)V1->getNumUses() == NumLanes ||
1388 AllUsersAreInternal(V1, V2)))
1389 return LookAheadHeuristics::ScoreSplatLoads;
1390 }
1391 return LookAheadHeuristics::ScoreSplat;
1392 }
1393
1394 auto *LI1 = dyn_cast<LoadInst>(V1);
1395 auto *LI2 = dyn_cast<LoadInst>(V2);
1396 if (LI1 && LI2) {
1397 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1398 !LI2->isSimple())
1399 return LookAheadHeuristics::ScoreFail;
1400
1401 std::optional<int> Dist = getPointersDiff(
1402 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1403 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1404 if (!Dist || *Dist == 0) {
1405 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1406 getUnderlyingObject(LI2->getPointerOperand()) &&
1407 R.TTI->isLegalMaskedGather(
1408 FixedVectorType::get(LI1->getType(), NumLanes),
1409 LI1->getAlign()))
1410 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1411 return LookAheadHeuristics::ScoreFail;
1412 }
1413 // The distance is too large - still may be profitable to use masked
1414 // loads/gathers.
1415 if (std::abs(*Dist) > NumLanes / 2)
1416 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1417 // This still will detect consecutive loads, but we might have "holes"
1418 // in some cases. It is ok for non-power-2 vectorization and may produce
1419 // better results. It should not affect current vectorization.
1420 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1421 : LookAheadHeuristics::ScoreReversedLoads;
1422 }
1423
1424 auto *C1 = dyn_cast<Constant>(V1);
1425 auto *C2 = dyn_cast<Constant>(V2);
1426 if (C1 && C2)
1427 return LookAheadHeuristics::ScoreConstants;
1428
1429 // Extracts from consecutive indexes of the same vector better score as
1430 // the extracts could be optimized away.
1431 Value *EV1;
1432 ConstantInt *Ex1Idx;
1433 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1434 // Undefs are always profitable for extractelements.
1435 // Compiler can easily combine poison and extractelement <non-poison> or
1436 // undef and extractelement <poison>. But combining undef +
1437 // extractelement <non-poison-but-may-produce-poison> requires some
1438 // extra operations.
1439 if (isa<UndefValue>(V2))
1440 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1441 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1442 : LookAheadHeuristics::ScoreSameOpcode;
1443 Value *EV2 = nullptr;
1444 ConstantInt *Ex2Idx = nullptr;
1445 if (match(V2,
1446 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1447 m_Undef())))) {
1448 // Undefs are always profitable for extractelements.
1449 if (!Ex2Idx)
1450 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1451 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1452 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1453 if (EV2 == EV1) {
1454 int Idx1 = Ex1Idx->getZExtValue();
1455 int Idx2 = Ex2Idx->getZExtValue();
1456 int Dist = Idx2 - Idx1;
1457 // The distance is too large - still may be profitable to use
1458 // shuffles.
1459 if (std::abs(Dist) == 0)
1460 return LookAheadHeuristics::ScoreSplat;
1461 if (std::abs(Dist) > NumLanes / 2)
1462 return LookAheadHeuristics::ScoreSameOpcode;
1463 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1464 : LookAheadHeuristics::ScoreReversedExtracts;
1465 }
1466 return LookAheadHeuristics::ScoreAltOpcodes;
1467 }
1468 return LookAheadHeuristics::ScoreFail;
1469 }
1470
1471 auto *I1 = dyn_cast<Instruction>(V1);
1472 auto *I2 = dyn_cast<Instruction>(V2);
1473 if (I1 && I2) {
1474 if (I1->getParent() != I2->getParent())
1475 return LookAheadHeuristics::ScoreFail;
1476 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1477 Ops.push_back(I1);
1478 Ops.push_back(I2);
1479 InstructionsState S = getSameOpcode(Ops, TLI);
1480 // Note: Only consider instructions with <= 2 operands to avoid
1481 // complexity explosion.
1482 if (S.getOpcode() &&
1483 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1484 !S.isAltShuffle()) &&
1485 all_of(Ops, [&S](Value *V) {
1486 return cast<Instruction>(V)->getNumOperands() ==
1487 S.MainOp->getNumOperands();
1488 }))
1489 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1490 : LookAheadHeuristics::ScoreSameOpcode;
1491 }
1492
1493 if (isa<UndefValue>(V2))
1494 return LookAheadHeuristics::ScoreUndef;
1495
1496 return LookAheadHeuristics::ScoreFail;
1497 }
1498
1499 /// Go through the operands of \p LHS and \p RHS recursively until
1500 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1501 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1502 /// of \p U1 and \p U2), except at the beginning of the recursion where
1503 /// these are set to nullptr.
1504 ///
1505 /// For example:
1506 /// \verbatim
1507 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1508 /// \ / \ / \ / \ /
1509 /// + + + +
1510 /// G1 G2 G3 G4
1511 /// \endverbatim
1512 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1513 /// each level recursively, accumulating the score. It starts from matching
1514 /// the additions at level 0, then moves on to the loads (level 1). The
1515 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1516 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1517 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1518 /// Please note that the order of the operands does not matter, as we
1519 /// evaluate the score of all profitable combinations of operands. In
1520 /// other words the score of G1 and G4 is the same as G1 and G2. This
1521 /// heuristic is based on ideas described in:
1522 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1523 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1524 /// Luís F. W. Góes
1525 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1526 Instruction *U2, int CurrLevel,
1527 ArrayRef<Value *> MainAltOps) const {
1528
1529 // Get the shallow score of V1 and V2.
1530 int ShallowScoreAtThisLevel =
1531 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1532
1533 // If reached MaxLevel,
1534 // or if V1 and V2 are not instructions,
1535 // or if they are SPLAT,
1536 // or if they are not consecutive,
1537 // or if profitable to vectorize loads or extractelements, early return
1538 // the current cost.
1539 auto *I1 = dyn_cast<Instruction>(LHS);
1540 auto *I2 = dyn_cast<Instruction>(RHS);
1541 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1542 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1543 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1544 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1545 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1546 ShallowScoreAtThisLevel))
1547 return ShallowScoreAtThisLevel;
1548 assert(I1 && I2 && "Should have early exited.");
1549
1550 // Contains the I2 operand indexes that got matched with I1 operands.
1551 SmallSet<unsigned, 4> Op2Used;
1552
1553 // Recursion towards the operands of I1 and I2. We are trying all possible
1554 // operand pairs, and keeping track of the best score.
1555 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1556 OpIdx1 != NumOperands1; ++OpIdx1) {
1557 // Try to pair op1I with the best operand of I2.
1558 int MaxTmpScore = 0;
1559 unsigned MaxOpIdx2 = 0;
1560 bool FoundBest = false;
1561 // If I2 is commutative try all combinations.
1562 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1563 unsigned ToIdx = isCommutative(I2)
1564 ? I2->getNumOperands()
1565 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1566 assert(FromIdx <= ToIdx && "Bad index");
1567 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1568 // Skip operands already paired with OpIdx1.
1569 if (Op2Used.count(OpIdx2))
1570 continue;
1571 // Recursively calculate the cost at each level
1572 int TmpScore =
1573 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1574 I1, I2, CurrLevel + 1, std::nullopt);
1575 // Look for the best score.
1576 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1577 TmpScore > MaxTmpScore) {
1578 MaxTmpScore = TmpScore;
1579 MaxOpIdx2 = OpIdx2;
1580 FoundBest = true;
1581 }
1582 }
1583 if (FoundBest) {
1584 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1585 Op2Used.insert(MaxOpIdx2);
1586 ShallowScoreAtThisLevel += MaxTmpScore;
1587 }
1588 }
1589 return ShallowScoreAtThisLevel;
1590 }
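 // Worked example (illustrative sketch, not from the original source): for
 // the bundles G1 = A[0] + B[0] and G2 = A[1] + B[1] from the comment above,
 // getScoreAtLevelRec(G1, G2, /*U1=*/nullptr, /*U2=*/nullptr, /*CurrLevel=*/1,
 // std::nullopt) first adds the shallow score of the two additions
 // (ScoreSameOpcode), then recurses into the operand pairs {A[0], A[1]} and
 // {B[0], B[1]}, each of which contributes ScoreConsecutiveLoads. Pairing G1
 // with G3 = C[0] + D[0] instead still matches the additions, but the operand
 // recursion scores ScoreFail for the unrelated loads, giving a lower total.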
1591 };
1592 /// A helper data structure to hold the operands of a vector of instructions.
1593 /// This supports a fixed vector length for all operand vectors.
1594 class VLOperands {
1595 /// For each operand we need (i) the value, and (ii) the opcode that it
1596 /// would be attached to if the expression was in a left-linearized form.
1597 /// This is required to avoid illegal operand reordering.
1598 /// For example:
1599 /// \verbatim
1600 /// 0 Op1
1601 /// |/
1602 /// Op1 Op2 Linearized + Op2
1603 /// \ / ----------> |/
1604 /// - -
1605 ///
1606 /// Op1 - Op2 (0 + Op1) - Op2
1607 /// \endverbatim
1608 ///
1609 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1610 ///
1611 /// Another way to think of this is to track all the operations across the
1612 /// path from the operand all the way to the root of the tree and to
1613 /// calculate the operation that corresponds to this path. For example, the
1614 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1615 /// corresponding operation is a '-' (which matches the one in the
1616 /// linearized tree, as shown above).
1617 ///
1618 /// For lack of a better term, we refer to this operation as Accumulated
1619 /// Path Operation (APO).
1620 struct OperandData {
1621 OperandData() = default;
1622 OperandData(Value *V, bool APO, bool IsUsed)
1623 : V(V), APO(APO), IsUsed(IsUsed) {}
1624 /// The operand value.
1625 Value *V = nullptr;
1626 /// TreeEntries only allow a single opcode, or an alternate sequence of
1627 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
1628 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1629 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1630 /// (e.g., Add/Mul)
1631 bool APO = false;
1632 /// Helper data for the reordering function.
1633 bool IsUsed = false;
1634 };
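 // Worked example (illustrative sketch, not from the original source): for
 // the hypothetical two-lane bundle
 //   X0 = A0 + B0
 //   X1 = A1 - B1
 // the left-linearized forms are (0 + A0) + B0 and (0 + A1) - B1, so the
 // OperandData entries are roughly:
 //   operand 0: {A0, APO=false}, {A1, APO=false}
 //   operand 1: {B0, APO=false}, {B1, APO=true}
 // i.e. only B1 is attached to an inverse ('-') operation, and moving B1 to
 // a position with a different APO would change the semantics.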
1635
1636 /// During operand reordering, we are trying to select the operand in each
1637 /// lane that matches best with the operand in the neighboring lane. Our
1638 /// selection is based on the type of value we are looking for. For example,
1639 /// if the neighboring lane has a load, we need to look for a load that is
1640 /// accessing a consecutive address. These strategies are summarized in the
1641 /// 'ReorderingMode' enumerator.
1642 enum class ReorderingMode {
1643 Load, ///< Matching loads to consecutive memory addresses
1644 Opcode, ///< Matching instructions based on opcode (same or alternate)
1645 Constant, ///< Matching constants
1646 Splat, ///< Matching the same instruction multiple times (broadcast)
1647 Failed, ///< We failed to create a vectorizable group
1648 };
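 // Illustrative sketch (not from the original source): if the first visited
 // lane has operands {load %p, 7}, operand 0 is reordered in
 // ReorderingMode::Load (looking for loads from consecutive addresses in the
 // other lanes), while operand 1 is reordered in ReorderingMode::Constant.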
1649
1650 using OperandDataVec = SmallVector<OperandData, 2>;
1651
1652 /// A vector of operand vectors.
1653 SmallVector<OperandDataVec, 4> OpsVec;
1654
1655 const TargetLibraryInfo &TLI;
1656 const DataLayout &DL;
1657 ScalarEvolution &SE;
1658 const BoUpSLP &R;
1659
1660 /// \returns the operand data at \p OpIdx and \p Lane.
1661 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1662 return OpsVec[OpIdx][Lane];
1663 }
1664
1665 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1666 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1667 return OpsVec[OpIdx][Lane];
1668 }
1669
1670 /// Clears the used flag for all entries.
1671 void clearUsed() {
1672 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1673 OpIdx != NumOperands; ++OpIdx)
1674 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1675 ++Lane)
1676 OpsVec[OpIdx][Lane].IsUsed = false;
1677 }
1678
1679 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1680 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1681 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1682 }
1683
1684 /// \param Lane lane of the operands under analysis.
1685 /// \param OpIdx operand index in lane \p Lane for which we're looking
1686 /// for the best candidate.
1687 /// \param Idx operand index of the current candidate value.
1688 /// \returns The additional score due to possible broadcasting of the
1689 /// elements in the lane. It is more profitable to have a power-of-2 number
1690 /// of unique elements in the lane; such a lane will be vectorized with
1691 /// higher probability after removing duplicates. Currently the SLP
1692 /// vectorizer supports only a power-of-2 number of unique scalars.
1693 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1694 Value *IdxLaneV = getData(Idx, Lane).V;
1695 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1696 return 0;
1697 SmallPtrSet<Value *, 4> Uniques;
1698 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1699 if (Ln == Lane)
1700 continue;
1701 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1702 if (!isa<Instruction>(OpIdxLnV))
1703 return 0;
1704 Uniques.insert(OpIdxLnV);
1705 }
1706 int UniquesCount = Uniques.size();
1707 int UniquesCntWithIdxLaneV =
1708 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1709 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1710 int UniquesCntWithOpIdxLaneV =
1711 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1712 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1713 return 0;
1714 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1715 UniquesCntWithOpIdxLaneV) -
1716 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1717 }
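 // Worked example (illustrative sketch, not from the original source): if the
 // other lanes of operand OpIdx already hold three unique instructions, the
 // current value at (OpIdx, Lane) is one of them (3 uniques, PowerOf2Ceil gap
 // of 1) and the candidate at (Idx, Lane) is a new instruction (4 uniques,
 // gap of 0), the function returns 1 - 0 = 1, favoring the candidate that
 // fills the lane up to a power-of-2 number of unique scalars.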
1718
1719 /// \param Lane lane of the operands under analysis.
1720 /// \param OpIdx operand index in lane \p Lane for which we're looking
1721 /// for the best candidate.
1722 /// \param Idx operand index of the current candidate value.
1723 /// \returns The additional score for the scalar whose users are all
1724 /// vectorized.
1725 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1726 Value *IdxLaneV = getData(Idx, Lane).V;
1727 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1728 // Do not care about number of uses for vector-like instructions
1729 // (extractelement/extractvalue with constant indices), they are extracts
1730 // themselves and already externally used. Vectorization of such
1731 // instructions does not add extra extractelement instruction, just may
1732 // remove it.
1733 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1734 isVectorLikeInstWithConstOps(OpIdxLaneV))
1735 return 0;
1736 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1737 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1738 return 0;
1739 return R.areAllUsersVectorized(IdxLaneI)
1740 ? LookAheadHeuristics::ScoreAllUserVectorized
1741 : 0;
1742 }
1743
1744 /// Score scaling factor for fully compatible instructions but with
1745 /// a different number of external uses. Allows better selection of the
1746 /// instructions with fewer external uses.
1747 static const int ScoreScaleFactor = 10;
1748
1749 /// \returns The look-ahead score, which tells us how much the sub-trees
1750 /// rooted at \p LHS and \p RHS match; the more they match, the higher the
1751 /// score. This helps break ties in an informed way when we cannot decide on
1752 /// the order of the operands by just considering the immediate
1753 /// predecessors.
1754 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1755 int Lane, unsigned OpIdx, unsigned Idx,
1756 bool &IsUsed) {
1757 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1758 LookAheadMaxDepth);
1759 // Keep track of the instruction stack as we recurse into the operands
1760 // during the look-ahead score exploration.
1761 int Score =
1762 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1763 /*CurrLevel=*/1, MainAltOps);
1764 if (Score) {
1765 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1766 if (Score <= -SplatScore) {
1767 // Set the minimum score for splat-like sequence to avoid setting
1768 // failed state.
1769 Score = 1;
1770 } else {
1771 Score += SplatScore;
1772 // Scale score to see the difference between different operands
1773 // and similar operands but all vectorized/not all vectorized
1774 // uses. It does not affect actual selection of the best
1775 // compatible operand in general, just allows to select the
1776 // operand with all vectorized uses.
1777 Score *= ScoreScaleFactor;
1778 Score += getExternalUseScore(Lane, OpIdx, Idx);
1779 IsUsed = true;
1780 }
1781 }
1782 return Score;
1783 }
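 // Illustrative note (not from the original source): with ScoreScaleFactor
 // being 10, two candidates with the same base look-ahead score of, say, 3
 // end up as 30 and 31 when only one of them has all of its users vectorized.
 // As long as ScoreAllUserVectorized is smaller than ScoreScaleFactor, the
 // external-use bonus only breaks ties between otherwise equal candidates and
 // never outweighs a genuinely better look-ahead match.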
1784
1785 /// Best defined scores per lanes between the passes. Used to choose the
1786 /// best operand (with the highest score) between the passes.
1787 /// The key - {Operand Index, Lane}.
1788 /// The value - the best score between the passes for the lane and the
1789 /// operand.
1790 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1791 BestScoresPerLanes;
1792
1793 // Search all operands in Ops[*][Lane] for the one that matches best
1794 // Ops[OpIdx][LastLane] and return its operand index.
1795 // If no good match can be found, return std::nullopt.
1796 std::optional<unsigned>
1797 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1798 ArrayRef<ReorderingMode> ReorderingModes,
1799 ArrayRef<Value *> MainAltOps) {
1800 unsigned NumOperands = getNumOperands();
1801
1802 // The operand of the previous lane at OpIdx.
1803 Value *OpLastLane = getData(OpIdx, LastLane).V;
1804
1805 // Our strategy mode for OpIdx.
1806 ReorderingMode RMode = ReorderingModes[OpIdx];
1807 if (RMode == ReorderingMode::Failed)
1808 return std::nullopt;
1809
1810 // The linearized opcode of the operand at OpIdx, Lane.
1811 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1812
1813 // The best operand index and its score.
1814 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1815 // are using the score to differentiate between the two.
1816 struct BestOpData {
1817 std::optional<unsigned> Idx;
1818 unsigned Score = 0;
1819 } BestOp;
1820 BestOp.Score =
1821 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1822 .first->second;
1823
1824 // Track if the operand must be marked as used. If the operand is set to
1825 // Score 1 explicitly (because of a non-power-of-2 number of unique
1826 // scalars), we may want to re-estimate the operands on later iterations.
1827 bool IsUsed =
1828 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1829 // Iterate through all unused operands and look for the best.
1830 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1831 // Get the operand at Idx and Lane.
1832 OperandData &OpData = getData(Idx, Lane);
1833 Value *Op = OpData.V;
1834 bool OpAPO = OpData.APO;
1835
1836 // Skip already selected operands.
1837 if (OpData.IsUsed)
1838 continue;
1839
1840 // Skip if we are trying to move the operand to a position with a
1841 // different opcode in the linearized tree form. This would break the
1842 // semantics.
1843 if (OpAPO != OpIdxAPO)
1844 continue;
1845
1846 // Look for an operand that matches the current mode.
1847 switch (RMode) {
1848 case ReorderingMode::Load:
1849 case ReorderingMode::Constant:
1850 case ReorderingMode::Opcode: {
1851 bool LeftToRight = Lane > LastLane;
1852 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1853 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1854 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1855 OpIdx, Idx, IsUsed);
1856 if (Score > static_cast<int>(BestOp.Score)) {
1857 BestOp.Idx = Idx;
1858 BestOp.Score = Score;
1859 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1860 }
1861 break;
1862 }
1863 case ReorderingMode::Splat:
1864 if (Op == OpLastLane)
1865 BestOp.Idx = Idx;
1866 break;
1867 case ReorderingMode::Failed:
1868 llvm_unreachable("Not expected Failed reordering mode.");
1869 }
1870 }
1871
1872 if (BestOp.Idx) {
1873 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1874 return BestOp.Idx;
1875 }
1876 // If we could not find a good match return std::nullopt.
1877 return std::nullopt;
1878 }
1879
1880 /// Helper for reorderOperandVecs.
1881 /// \returns the lane that we should start reordering from. This is the one
1882 /// which has the fewest operands that can freely move about, or which is
1883 /// less profitable because it already has the most optimal set of operands.
1884 unsigned getBestLaneToStartReordering() const {
1885 unsigned Min = UINT_MAX;
1886 unsigned SameOpNumber = 0;
1887 // std::pair<unsigned, unsigned> is used to implement a simple voting
1888 // algorithm and choose the lane with the fewest operands that can
1889 // freely move about, or which is less profitable because it already has
1890 // the most optimal set of operands. The first unsigned is a counter for
1891 // voting, the second unsigned is the counter of lanes with instructions
1892 // with same/alternate opcodes and same parent basic block.
1893 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
1894 // Try to be closer to the original results, if we have multiple lanes
1895 // with same cost. If 2 lanes have the same cost, use the one with the
1896 // lowest index.
1897 for (int I = getNumLanes(); I > 0; --I) {
1898 unsigned Lane = I - 1;
1899 OperandsOrderData NumFreeOpsHash =
1900 getMaxNumOperandsThatCanBeReordered(Lane);
1901 // Compare the number of operands that can move and choose the one with
1902 // the least number.
1903 if (NumFreeOpsHash.NumOfAPOs < Min) {
1904 Min = NumFreeOpsHash.NumOfAPOs;
1905 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1906 HashMap.clear();
1907 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1908 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1909 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1910 // Select the most optimal lane in terms of number of operands that
1911 // should be moved around.
1912 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1913 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1914 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1915 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1916 auto *It = HashMap.find(NumFreeOpsHash.Hash);
1917 if (It == HashMap.end())
1918 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1919 else
1920 ++It->second.first;
1921 }
1922 }
1923 // Select the lane with the minimum counter.
1924 unsigned BestLane = 0;
1925 unsigned CntMin = UINT_MAX;
1926 for (const auto &Data : reverse(HashMap)) {
1927 if (Data.second.first < CntMin) {
1928 CntMin = Data.second.first;
1929 BestLane = Data.second.second;
1930 }
1931 }
1932 return BestLane;
1933 }
1934
1935 /// Data structure that helps to reorder operands.
1936 struct OperandsOrderData {
1937 /// The best number of operands with the same APOs, which can be
1938 /// reordered.
1939 unsigned NumOfAPOs = UINT_MAX;
1940 /// Number of operands with the same/alternate instruction opcode and
1941 /// parent.
1942 unsigned NumOpsWithSameOpcodeParent = 0;
1943 /// Hash for the actual operands ordering.
1944 /// Used to count operands, actually their position id and opcode
1945 /// value. It is used in the voting mechanism to find the lane with the
1946 /// fewest operands that can freely move about, or which is less profitable
1947 /// because it already has the most optimal set of operands. Can be
1948 /// replaced with SmallVector<unsigned> instead but hash code is faster
1949 /// and requires less memory.
1950 unsigned Hash = 0;
1951 };
1952 /// \returns the maximum number of operands that are allowed to be reordered
1953 /// for \p Lane and the number of compatible instructions (with the same
1954 /// parent/opcode). This is used as a heuristic for selecting the first lane
1955 /// to start operand reordering.
1956 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1957 unsigned CntTrue = 0;
1958 unsigned NumOperands = getNumOperands();
1959 // Operands with the same APO can be reordered. We therefore need to count
1960 // how many of them we have for each APO, like this: Cnt[APO] = x.
1961 // Since we only have two APOs, namely true and false, we can avoid using
1962 // a map. Instead we can simply count the number of operands that
1963 // correspond to one of them (in this case the 'true' APO), and calculate
1964 // the other by subtracting it from the total number of operands.
1965 // Operands with the same instruction opcode and parent are more
1966 // profitable since we don't need to move them in many cases, with a high
1967 // probability such lane already can be vectorized effectively.
1968 bool AllUndefs = true;
1969 unsigned NumOpsWithSameOpcodeParent = 0;
1970 Instruction *OpcodeI = nullptr;
1971 BasicBlock *Parent = nullptr;
1972 unsigned Hash = 0;
1973 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1974 const OperandData &OpData = getData(OpIdx, Lane);
1975 if (OpData.APO)
1976 ++CntTrue;
1977 // Use Boyer-Moore majority voting for finding the majority opcode and
1978 // the number of times it occurs.
1979 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1980 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
1981 I->getParent() != Parent) {
1982 if (NumOpsWithSameOpcodeParent == 0) {
1983 NumOpsWithSameOpcodeParent = 1;
1984 OpcodeI = I;
1985 Parent = I->getParent();
1986 } else {
1987 --NumOpsWithSameOpcodeParent;
1988 }
1989 } else {
1990 ++NumOpsWithSameOpcodeParent;
1991 }
1992 }
1993 Hash = hash_combine(
1994 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1995 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1996 }
1997 if (AllUndefs)
1998 return {};
1999 OperandsOrderData Data;
2000 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2001 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2002 Data.Hash = Hash;
2003 return Data;
2004 }
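 // Worked example (illustrative sketch, not from the original source): for a
 // lane holding A[1] = C[1] - B[1], operand C[1] has APO=false and B[1] has
 // APO=true, so CntTrue = 1 and NumOfAPOs = max(1, 2 - 1) = 1. A lane holding
 // an addition has both APOs false, giving NumOfAPOs = 2. The voting in
 // getBestLaneToStartReordering() therefore prefers to start from the
 // subtraction lane, whose operands are the least free to move.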
2005
2006 /// Go through the instructions in VL and append their operands.
2007 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2008 assert(!VL.empty() && "Bad VL");
2009 assert((empty() || VL.size() == getNumLanes()) &&
2010 "Expected same number of lanes");
2011 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2012 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2013 constexpr unsigned IntrinsicNumOperands = 2;
2014 if (isa<IntrinsicInst>(VL[0]))
2015 NumOperands = IntrinsicNumOperands;
2016 OpsVec.resize(NumOperands);
2017 unsigned NumLanes = VL.size();
2018 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2019 OpsVec[OpIdx].resize(NumLanes);
2020 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2021 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2022 // Our tree has just 3 nodes: the root and two operands.
2023 // It is therefore trivial to get the APO. We only need to check the
2024 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2025 // RHS operand. The LHS operand of both add and sub is never attached
2026 // to an inverse operation in the linearized form, therefore its APO
2027 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2028
2029 // Since operand reordering is performed on groups of commutative
2030 // operations or alternating sequences (e.g., +, -), we can safely
2031 // tell the inverse operations by checking commutativity.
2032 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2033 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2034 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2035 APO, false};
2036 }
2037 }
2038 }
2039
2040 /// \returns the number of operands.
2041 unsigned getNumOperands() const { return OpsVec.size(); }
2042
2043 /// \returns the number of lanes.
2044 unsigned getNumLanes() const { return OpsVec[0].size(); }
2045
2046 /// \returns the operand value at \p OpIdx and \p Lane.
2047 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2048 return getData(OpIdx, Lane).V;
2049 }
2050
2051 /// \returns true if the data structure is empty.
2052 bool empty() const { return OpsVec.empty(); }
2053
2054 /// Clears the data.
2055 void clear() { OpsVec.clear(); }
2056
2057 /// \returns true if there are enough operands identical to \p Op to fill
2058 /// the whole vector.
2059 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2060 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2061 bool OpAPO = getData(OpIdx, Lane).APO;
2062 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2063 if (Ln == Lane)
2064 continue;
2065 // This is set to true if we found a candidate for broadcast at Lane.
2066 bool FoundCandidate = false;
2067 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2068 OperandData &Data = getData(OpI, Ln);
2069 if (Data.APO != OpAPO || Data.IsUsed)
2070 continue;
2071 if (Data.V == Op) {
2072 FoundCandidate = true;
2073 Data.IsUsed = true;
2074 break;
2075 }
2076 }
2077 if (!FoundCandidate)
2078 return false;
2079 }
2080 return true;
2081 }
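 // Illustrative sketch (not from the original source): for the hypothetical
 // bundle {x + a, x + b, x + c, x + d}, the value x at (OpIdx=0, Lane=0) has
 // an identical, unused operand with the same APO in every other lane, so
 // shouldBroadcast() returns true and the operand is reordered in
 // ReorderingMode::Splat.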
2082
2083 public:
2084 /// Initialize with all the operands of the instruction vector \p RootVL.
2085 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2086 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
2087 // Append all the operands of RootVL.
2088 appendOperandsOfVL(RootVL);
2089 }
2090
2091 /// \returns A value vector with the operands across all lanes for the
2092 /// operand at \p OpIdx.
2093 ValueList getVL(unsigned OpIdx) const {
2094 ValueList OpVL(OpsVec[OpIdx].size());
2095 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2096 "Expected same num of lanes across all operands");
2097 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2098 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2099 return OpVL;
2100 }
2101
2102 // Performs operand reordering for 2 or more operands.
2103 // The original operands are in OrigOps[OpIdx][Lane].
2104 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2105 void reorder() {
2106 unsigned NumOperands = getNumOperands();
2107 unsigned NumLanes = getNumLanes();
2108 // Each operand has its own mode. We are using this mode to help us select
2109 // the instructions for each lane, so that they match best with the ones
2110 // we have selected so far.
2111 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2112
2113 // This is a greedy single-pass algorithm. We are going over each lane
2114 // once and deciding on the best order right away with no back-tracking.
2115 // However, in order to increase its effectiveness, we start with the lane
2116 // that has operands that can move the least. For example, given the
2117 // following lanes:
2118 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2119 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2120 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2121 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2122 // we will start at Lane 1, since the operands of the subtraction cannot
2123 // be reordered. Then we will visit the rest of the lanes in a circular
2124 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2125
2126 // Find the first lane that we will start our search from.
2127 unsigned FirstLane = getBestLaneToStartReordering();
2128
2129 // Initialize the modes.
2130 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2131 Value *OpLane0 = getValue(OpIdx, FirstLane);
2132 // Keep track if we have instructions with all the same opcode on one
2133 // side.
2134 if (isa<LoadInst>(OpLane0))
2135 ReorderingModes[OpIdx] = ReorderingMode::Load;
2136 else if (isa<Instruction>(OpLane0)) {
2137 // Check if OpLane0 should be broadcast.
2138 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2139 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2140 else
2141 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2142 }
2143 else if (isa<Constant>(OpLane0))
2144 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2145 else if (isa<Argument>(OpLane0))
2146 // Our best hope is a Splat. It may save some cost in some cases.
2147 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2148 else
2149 // NOTE: This should be unreachable.
2150 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2151 }
2152
2153 // Check that we don't have the same operands. There is no need to
2154 // reorder if the operands are just a perfect or a shuffled diamond
2155 // match. Do not skip reordering for possible broadcasts or for a
2156 // non-power-of-2 number of scalars (just for now).
2157 auto &&SkipReordering = [this]() {
2158 SmallPtrSet<Value *, 4> UniqueValues;
2159 ArrayRef<OperandData> Op0 = OpsVec.front();
2160 for (const OperandData &Data : Op0)
2161 UniqueValues.insert(Data.V);
2162 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2163 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2164 return !UniqueValues.contains(Data.V);
2165 }))
2166 return false;
2167 }
2168 // TODO: Check if we can remove a check for non-power-2 number of
2169 // scalars after full support of non-power-2 vectorization.
2170 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2171 };
2172
2173 // If the initial strategy fails for any of the operand indexes, then we
2174 // perform reordering again in a second pass. This helps avoid assigning
2175 // high priority to the failed strategy, and should improve reordering for
2176 // the non-failed operand indexes.
2177 for (int Pass = 0; Pass != 2; ++Pass) {
2178 // Check if there is no need to reorder operands since they are a
2179 // perfect or a shuffled diamond match.
2180 // Need to do it to avoid extra external use cost counting for
2181 // shuffled matches, which may cause regressions.
2182 if (SkipReordering())
2183 break;
2184 // Skip the second pass if the first pass did not fail.
2185 bool StrategyFailed = false;
2186 // Mark all operand data as free to use.
2187 clearUsed();
2188 // We keep the original operand order for the FirstLane, so reorder the
2189 // rest of the lanes. We are visiting the nodes in a circular fashion,
2190 // using FirstLane as the center point and increasing the radius
2191 // distance.
2192 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2193 for (unsigned I = 0; I < NumOperands; ++I)
2194 MainAltOps[I].push_back(getData(I, FirstLane).V);
2195
2196 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2197 // Visit the lane on the right and then the lane on the left.
2198 for (int Direction : {+1, -1}) {
2199 int Lane = FirstLane + Direction * Distance;
2200 if (Lane < 0 || Lane >= (int)NumLanes)
2201 continue;
2202 int LastLane = Lane - Direction;
2203 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2204 "Out of bounds");
2205 // Look for a good match for each operand.
2206 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2207 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2208 std::optional<unsigned> BestIdx = getBestOperand(
2209 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2210 // By not selecting a value, we allow the operands that follow to
2211 // select a better matching value. We will get a non-null value in
2212 // the next run of getBestOperand().
2213 if (BestIdx) {
2214 // Swap the current operand with the one returned by
2215 // getBestOperand().
2216 swap(OpIdx, *BestIdx, Lane);
2217 } else {
2218 // We failed to find a best operand, set mode to 'Failed'.
2219 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2220 // Enable the second pass.
2221 StrategyFailed = true;
2222 }
2223 // Try to get the alternate opcode and follow it during analysis.
2224 if (MainAltOps[OpIdx].size() != 2) {
2225 OperandData &AltOp = getData(OpIdx, Lane);
2226 InstructionsState OpS =
2227 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2228 if (OpS.getOpcode() && OpS.isAltShuffle())
2229 MainAltOps[OpIdx].push_back(AltOp.V);
2230 }
2231 }
2232 }
2233 }
2234 // Skip second pass if the strategy did not fail.
2235 if (!StrategyFailed)
2236 break;
2237 }
2238 }
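 // Worked example (illustrative sketch, not from the original source): for
 // the lanes
 //   Lane 0: A[0] = B[0] + C[0]
 //   Lane 1: A[1] = C[1] - B[1]
 //   Lane 2: A[2] = B[2] + C[2]
 //   Lane 3: A[3] = C[3] - B[3]
 // reordering starts at Lane 1 and keeps its order (C[1], B[1]). In the '+'
 // lanes both operands have APO=false, so they can be swapped to (C[*], B[*])
 // to pair matching loads across lanes; in the '-' lanes B[*] has APO=true
 // and must stay in the second position. The result is operand 0 =
 // {C[0], C[1], C[2], C[3]} and operand 1 = {B[0], B[1], B[2], B[3]}.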
2239
2240#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2241 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2242 switch (RMode) {
2243 case ReorderingMode::Load:
2244 return "Load";
2245 case ReorderingMode::Opcode:
2246 return "Opcode";
2247 case ReorderingMode::Constant:
2248 return "Constant";
2249 case ReorderingMode::Splat:
2250 return "Splat";
2251 case ReorderingMode::Failed:
2252 return "Failed";
2253 }
2254 llvm_unreachable("Unimplemented Reordering Type");
2255 }
2256
2257 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2258 raw_ostream &OS) {
2259 return OS << getModeStr(RMode);
2260 }
2261
2262 /// Debug print.
2263 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2264 printMode(RMode, dbgs());
2265 }
2266
2267 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2268 return printMode(RMode, OS);
2269 }
2270
2271 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2272 const unsigned Indent = 2;
2273 unsigned Cnt = 0;
2274 for (const OperandDataVec &OpDataVec : OpsVec) {
2275 OS << "Operand " << Cnt++ << "\n";
2276 for (const OperandData &OpData : OpDataVec) {
2277 OS.indent(Indent) << "{";
2278 if (Value *V = OpData.V)
2279 OS << *V;
2280 else
2281 OS << "null";
2282 OS << ", APO:" << OpData.APO << "}\n";
2283 }
2284 OS << "\n";
2285 }
2286 return OS;
2287 }
2288
2289 /// Debug print.
2290 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2291#endif
2292 };
2293
2294 /// Evaluate each pair in \p Candidates and return the index into
2295 /// \p Candidates of the pair with the highest score, deemed to have the best
2296 /// chance to form the root of a profitable tree to vectorize. Return
2297 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2298 /// \param Limit Lower limit of the score considered to be good enough.
2299 std::optional<int>
2300 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2301 int Limit = LookAheadHeuristics::ScoreFail) const {
2302 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2303 RootLookAheadMaxDepth);
2304 int BestScore = Limit;
2305 std::optional<int> Index;
2306 for (int I : seq<int>(0, Candidates.size())) {
2307 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2308 Candidates[I].second,
2309 /*U1=*/nullptr, /*U2=*/nullptr,
2310 /*Level=*/1, std::nullopt);
2311 if (Score > BestScore) {
2312 BestScore = Score;
2313 Index = I;
2314 }
2315 }
2316 return Index;
2317 }
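 // Illustrative sketch (not from the original source): given the candidate
 // pairs {(A[0], A[1]), (A[0], X)} where A[0]/A[1] are consecutive loads and
 // X is unrelated, the first pair gets the higher look-ahead score, so index
 // 0 is returned; if neither pair scores above \p Limit, std::nullopt is
 // returned.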
2318
2319 /// Checks if the instruction is marked for deletion.
2320 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2321
2322 /// Removes an instruction from its block and eventually deletes it.
2323 /// It's like Instruction::eraseFromParent() except that the actual deletion
2324 /// is delayed until BoUpSLP is destructed.
2325 void eraseInstruction(Instruction *I) {
2326 DeletedInstructions.insert(I);
2327 }
2328
2329 /// Checks if the instruction was already analyzed for being possible
2330 /// reduction root.
2331 bool isAnalyzedReductionRoot(Instruction *I) const {
2332 return AnalyzedReductionsRoots.count(I);
2333 }
2334 /// Register given instruction as already analyzed for being possible
2335 /// reduction root.
2337 AnalyzedReductionsRoots.insert(I);
2338 }
2339 /// Checks if the provided list of reduced values was checked already for
2340 /// vectorization.
2342 return AnalyzedReductionVals.contains(hash_value(VL));
2343 }
2344 /// Adds the list of reduced values to list of already checked values for the
2345 /// vectorization.
2347 AnalyzedReductionVals.insert(hash_value(VL));
2348 }
2349 /// Clear the list of the analyzed reduction root instructions.
2350 void clearReductionData() {
2351 AnalyzedReductionsRoots.clear();
2352 AnalyzedReductionVals.clear();
2353 AnalyzedMinBWVals.clear();
2354 }
2355 /// Checks if the given value is gathered in one of the nodes.
2356 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2357 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2358 }
2359
2360 /// Check if the value is vectorized in the tree.
2361 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2362
2363 ~BoUpSLP();
2364
2365private:
2366 /// Determine if a node \p E can be demoted to a smaller type with a
2367 /// truncation. We collect the entries that will be demoted in ToDemote.
2368 /// \param E Node for analysis
2369 /// \param ToDemote indices of the nodes to be demoted.
2370 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2371 unsigned &BitWidth,
2372 SmallVectorImpl<unsigned> &ToDemote,
2374 unsigned &MaxDepthLevel,
2375 bool &IsProfitableToDemote,
2376 bool IsTruncRoot) const;
2377
2378 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2379 /// reordering (i.e. the operands can be reordered because they have only one
2380 /// user and are reorderable).
2381 /// \param ReorderableGathers List of all gather nodes that require reordering
2382 /// (e.g., gather of extractelements or partially vectorizable loads).
2383 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2384 /// reordering, subset of \p NonVectorized.
2385 bool
2386 canReorderOperands(TreeEntry *UserTE,
2387 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2388 ArrayRef<TreeEntry *> ReorderableGathers,
2389 SmallVectorImpl<TreeEntry *> &GatherOps);
2390
2391 /// Checks if the given \p TE is a gather node with clustered reused scalars
2392 /// and reorders it per given \p Mask.
2393 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2394
2395 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2396 /// if any. If it is not vectorized (gather node), returns nullptr.
2397 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2398 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2399 TreeEntry *TE = nullptr;
2400 const auto *It = find_if(VL, [&](Value *V) {
2401 TE = getTreeEntry(V);
2402 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2403 return true;
2404 auto It = MultiNodeScalars.find(V);
2405 if (It != MultiNodeScalars.end()) {
2406 for (TreeEntry *E : It->second) {
2407 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2408 TE = E;
2409 return true;
2410 }
2411 }
2412 }
2413 return false;
2414 });
2415 if (It != VL.end()) {
2416 assert(TE->isSame(VL) && "Expected same scalars.");
2417 return TE;
2418 }
2419 return nullptr;
2420 }
2421
2422 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2423 /// if any. If it is not vectorized (gather node), returns nullptr.
2424 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2425 unsigned OpIdx) const {
2426 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2427 const_cast<TreeEntry *>(UserTE), OpIdx);
2428 }
2429
2430 /// Checks if all users of \p I are the part of the vectorization tree.
2431 bool areAllUsersVectorized(
2432 Instruction *I,
2433 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2434
2435 /// Return information about the vector formed for the specified index
2436 /// of a vector of (the same) instruction.
2437 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2438
2439 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2440 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2441
2442 /// \returns Cast context for the given graph node.
2443 TargetTransformInfo::CastContextHint
2444 getCastContextHint(const TreeEntry &TE) const;
2445
2446 /// \returns the cost of the vectorizable entry.
2447 InstructionCost getEntryCost(const TreeEntry *E,
2448 ArrayRef<Value *> VectorizedVals,
2449 SmallPtrSetImpl<Value *> &CheckedExtracts);
2450
2451 /// This is the recursive part of buildTree.
2452 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2453 const EdgeInfo &EI);
2454
2455 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2456 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2457 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2458 /// returns false, setting \p CurrentOrder to either an empty vector or a
2459 /// non-identity permutation that allows to reuse extract instructions.
2460 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2461 /// extract order.
2462 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2463 SmallVectorImpl<unsigned> &CurrentOrder,
2464 bool ResizeAllowed = false) const;
2465
2466 /// Vectorize a single entry in the tree.
2467 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2468 /// avoid issues with def-use order.
2469 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2470
2471 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2472 /// \p E.
2473 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2474 /// avoid issues with def-use order.
2475 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2476
2477 /// Create a new vector from a list of scalar values. Produces a sequence
2478 /// which exploits values reused across lanes, and arranges the inserts
2479 /// for ease of later optimization.
2480 template <typename BVTy, typename ResTy, typename... Args>
2481 ResTy processBuildVector(const TreeEntry *E, Args &...Params);
2482
2483 /// Create a new vector from a list of scalar values. Produces a sequence
2484 /// which exploits values reused across lanes, and arranges the inserts
2485 /// for ease of later optimization.
2486 Value *createBuildVector(const TreeEntry *E);
2487
2488 /// Returns the instruction in the bundle, which can be used as a base point
2489 /// for scheduling. Usually it is the last instruction in the bundle, except
2490 /// for the case when all operands are external (in this case, it is the first
2491 /// instruction in the list).
2492 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2493
2494 /// Tries to find extractelement instructions with constant indices from fixed
2495 /// vector type and gather such instructions into a bunch, which highly likely
2496 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2497 /// was successful, the matched scalars are replaced by poison values in \p VL
2498 /// for future analysis.
2499 std::optional<TargetTransformInfo::ShuffleKind>
2500 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2501 SmallVectorImpl<int> &Mask) const;
2502
2503 /// Tries to find extractelement instructions with constant indices from fixed
2504 /// vector type and gather such instructions into a bunch, which highly likely
2505 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2506 /// was successful, the matched scalars are replaced by poison values in \p VL
2507 /// for future analysis.
2508 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2509 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2510 SmallVectorImpl<int> &Mask,
2511 unsigned NumParts) const;
2512
2513 /// Checks if the gathered \p VL can be represented as a single register
2514 /// shuffle(s) of previous tree entries.
2515 /// \param TE Tree entry checked for permutation.
2516 /// \param VL List of scalars (a subset of the TE scalar), checked for
2517 /// permutations. Must form single-register vector.
2518 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2519 /// commands to build the mask using the original vector value, without
2520 /// relying on the potential reordering.
2521 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2522 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2523 std::optional<TargetTransformInfo::ShuffleKind>
2524 isGatherShuffledSingleRegisterEntry(
2525 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2526 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2527 bool ForOrder);
2528
2529 /// Checks if the gathered \p VL can be represented as multi-register
2530 /// shuffle(s) of previous tree entries.
2531 /// \param TE Tree entry checked for permutation.
2532 /// \param VL List of scalars (a subset of the TE scalar), checked for
2533 /// permutations.
2534 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2535 /// commands to build the mask using the original vector value, without
2536 /// relying on the potential reordering.
2537 /// \returns per-register series of ShuffleKind, if gathered values can be
2538 /// represented as shuffles of previous tree entries. \p Mask is filled with
2539 /// the shuffle mask (also on per-register base).
2540 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2541 isGatherShuffledEntry(
2542 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2543 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2544 unsigned NumParts, bool ForOrder = false);
2545
2546 /// \returns the scalarization cost for this list of values. Assuming that
2547 /// this subtree gets vectorized, we may need to extract the values from the
2548 /// roots. This method calculates the cost of extracting the values.
2549 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2550 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
2551
2552 /// Set the Builder insert point to one after the last instruction in
2553 /// the bundle
2554 void setInsertPointAfterBundle(const TreeEntry *E);
2555
2556 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2557 /// specified, the starting vector value is poison.
2558 Value *gather(ArrayRef<Value *> VL, Value *Root);
2559
2560 /// \returns whether the VectorizableTree is fully vectorizable and will
2561 /// be beneficial even if the tree height is tiny.
2562 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2563
2564 /// Reorder commutative or alt operands to get better probability of
2565 /// generating vectorized code.
2566 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2567 SmallVectorImpl<Value *> &Left,
2568 SmallVectorImpl<Value *> &Right,
2569 const BoUpSLP &R);
2570
2571 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2572 /// users of \p TE and collects the stores. It returns the map from the store
2573 /// pointers to the collected stores.
2574 DenseMap<Value *, SmallVector<StoreInst *>>
2575 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2576
2577 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2578 /// stores in \p StoresVec can form a vector instruction. If so it returns
2579 /// true and populates \p ReorderIndices with the shuffle indices of the
2580 /// stores when compared to the sorted vector.
2581 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2582 OrdersType &ReorderIndices) const;
2583
2584 /// Iterates through the users of \p TE, looking for scalar stores that can be
2585 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2586 /// their order and builds an order index vector for each store bundle. It
2587 /// returns all these order vectors found.
2588 /// We run this after the tree has formed, otherwise we may come across user
2589 /// instructions that are not yet in the tree.
2590 SmallVector<OrdersType, 1>
2591 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2592
2593 struct TreeEntry {
2594 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2595 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2596
2597 /// \returns Common mask for reorder indices and reused scalars.
2598 SmallVector<int> getCommonMask() const {
2599 SmallVector<int> Mask;
2600 inversePermutation(ReorderIndices, Mask);
2601 ::addMask(Mask, ReuseShuffleIndices);
2602 return Mask;
2603 }
2604
2605 /// \returns true if the scalars in VL are equal to this entry.
2606 bool isSame(ArrayRef<Value *> VL) const {
2607 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2608 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2609 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2610 return VL.size() == Mask.size() &&
2611 std::equal(VL.begin(), VL.end(), Mask.begin(),
2612 [Scalars](Value *V, int Idx) {
2613 return (isa<UndefValue>(V) &&
2614 Idx == PoisonMaskElem) ||
2615 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2616 });
2617 };
2618 if (!ReorderIndices.empty()) {
2619 // TODO: implement matching if the nodes are just reordered, still can
2620 // treat the vector as the same if the list of scalars matches VL
2621 // directly, without reordering.
2622 SmallVector<int> Mask;
2623 inversePermutation(ReorderIndices, Mask);
2624 if (VL.size() == Scalars.size())
2625 return IsSame(Scalars, Mask);
2626 if (VL.size() == ReuseShuffleIndices.size()) {
2627 ::addMask(Mask, ReuseShuffleIndices);
2628 return IsSame(Scalars, Mask);
2629 }
2630 return false;
2631 }
2632 return IsSame(Scalars, ReuseShuffleIndices);
2633 }
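 // Illustrative sketch (not from the original source): an entry with
 // Scalars = {a, b} and ReuseShuffleIndices = {0, 1, 0, 1} is considered the
 // same as VL = {a, b, a, b}, since each VL element matches
 // Scalars[ReuseShuffleIndices[i]].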
2634
2635 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2636 return State == TreeEntry::NeedToGather &&
2637 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2638 UserTreeIndices.front().UserTE == UserEI.UserTE;
2639 }
2640
2641 /// \returns true if current entry has same operands as \p TE.
2642 bool hasEqualOperands(const TreeEntry &TE) const {
2643 if (TE.getNumOperands() != getNumOperands())
2644 return false;
2645 SmallBitVector Used(getNumOperands());
2646 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2647 unsigned PrevCount = Used.count();
2648 for (unsigned K = 0; K < E; ++K) {
2649 if (Used.test(K))
2650 continue;
2651 if (getOperand(K) == TE.getOperand(I)) {
2652 Used.set(K);
2653 break;
2654 }
2655 }
2656 // Check if we actually found the matching operand.
2657 if (PrevCount == Used.count())
2658 return false;
2659 }
2660 return true;
2661 }
2662
2663 /// \return Final vectorization factor for the node. Defined by the total
2664 /// number of vectorized scalars, including those used several times in the
2665 /// entry and counted in the \a ReuseShuffleIndices, if any.
2666 unsigned getVectorFactor() const {
2667 if (!ReuseShuffleIndices.empty())
2668 return ReuseShuffleIndices.size();
2669 return Scalars.size();
2670 };
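 // Illustrative sketch (not from the original source): a node with
 // Scalars = {a, b} and ReuseShuffleIndices = {0, 1, 0, 1} has a vector
 // factor of 4, because the two scalars are reused to fill a 4-wide vector.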
2671
2672 /// A vector of scalars.
2673 ValueList Scalars;
2674
2675 /// The Scalars are vectorized into this value. It is initialized to Null.
2676 WeakTrackingVH VectorizedValue = nullptr;
2677
2678 /// New vector phi instructions emitted for the vectorized phi nodes.
2679 PHINode *PHI = nullptr;
2680
2681 /// Do we need to gather this sequence or vectorize it
2682 /// (either with vector instruction or with scatter/gather
2683 /// intrinsics for store/load)?
2684 enum EntryState {
2685 Vectorize,
2686 ScatterVectorize,
2687 StridedVectorize,
2688 NeedToGather
2689 };
2690 EntryState State;
2691
2692 /// Does this sequence require some shuffling?
2693 SmallVector<int, 4> ReuseShuffleIndices;
2694
2695 /// Does this entry require reordering?
2696 SmallVector<unsigned, 4> ReorderIndices;
2697
2698 /// Points back to the VectorizableTree.
2699 ///
2700 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2701 /// to be a pointer and needs to be able to initialize the child iterator.
2702 /// Thus we need a reference back to the container to translate the indices
2703 /// to entries.
2704 VecTreeTy &Container;
2705
2706 /// The TreeEntry index containing the user of this entry. We can actually
2707 /// have multiple users so the data structure is not truly a tree.
2708 SmallVector<EdgeInfo, 1> UserTreeIndices;
2709
2710 /// The index of this treeEntry in VectorizableTree.
2711 int Idx = -1;
2712
2713 private:
2714 /// The operands of each instruction in each lane Operands[op_index][lane].
2715 /// Note: This helps avoid the replication of the code that performs the
2716 /// reordering of operands during buildTree_rec() and vectorizeTree().
2717 SmallVector<ValueList, 2> Operands;
2718
2719 /// The main/alternate instruction.
2720 Instruction *MainOp = nullptr;
2721 Instruction *AltOp = nullptr;
2722
2723 public:
2724 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2725 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2726 if (Operands.size() < OpIdx + 1)
2727 Operands.resize(OpIdx + 1);
2728 assert(Operands[OpIdx].empty() && "Already resized?");
2729 assert(OpVL.size() <= Scalars.size() &&
2730 "Number of operands is greater than the number of scalars.");
2731 Operands[OpIdx].resize(OpVL.size());
2732 copy(OpVL, Operands[OpIdx].begin());
2733 }
2734
2735 /// Set the operands of this bundle in their original order.
2736 void setOperandsInOrder() {
2737 assert(Operands.empty() && "Already initialized?");
2738 auto *I0 = cast<Instruction>(Scalars[0]);
2739 Operands.resize(I0->getNumOperands());
2740 unsigned NumLanes = Scalars.size();
2741 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2742 OpIdx != NumOperands; ++OpIdx) {
2743 Operands[OpIdx].resize(NumLanes);
2744 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2745 auto *I = cast<Instruction>(Scalars[Lane]);
2746 assert(I->getNumOperands() == NumOperands &&
2747 "Expected same number of operands");
2748 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2749 }
2750 }
2751 }
2752
2753 /// Reorders operands of the node to the given mask \p Mask.
2754 void reorderOperands(ArrayRef<int> Mask) {
2755 for (ValueList &Operand : Operands)
2756 reorderScalars(Operand, Mask);
2757 }
2758
2759 /// \returns the \p OpIdx operand of this TreeEntry.
2760 ValueList &getOperand(unsigned OpIdx) {
2761 assert(OpIdx < Operands.size() && "Off bounds");
2762 return Operands[OpIdx];
2763 }
2764
2765 /// \returns the \p OpIdx operand of this TreeEntry.
2766 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2767 assert(OpIdx < Operands.size() && "Off bounds");
2768 return Operands[OpIdx];
2769 }
2770
2771 /// \returns the number of operands.
2772 unsigned getNumOperands() const { return Operands.size(); }
2773
2774 /// \return the single \p OpIdx operand.
2775 Value *getSingleOperand(unsigned OpIdx) const {
2776 assert(OpIdx < Operands.size() && "Off bounds");
2777 assert(!Operands[OpIdx].empty() && "No operand available");
2778 return Operands[OpIdx][0];
2779 }
2780
2781 /// Some of the instructions in the list have alternate opcodes.
2782 bool isAltShuffle() const { return MainOp != AltOp; }
2783
2784 bool isOpcodeOrAlt(Instruction *I) const {
2785 unsigned CheckedOpcode = I->getOpcode();
2786 return (getOpcode() == CheckedOpcode ||
2787 getAltOpcode() == CheckedOpcode);
2788 }
2789
2790 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2791 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2792 /// \p OpValue.
2793 Value *isOneOf(Value *Op) const {
2794 auto *I = dyn_cast<Instruction>(Op);
2795 if (I && isOpcodeOrAlt(I))
2796 return Op;
2797 return MainOp;
2798 }
2799
2800 void setOperations(const InstructionsState &S) {
2801 MainOp = S.MainOp;
2802 AltOp = S.AltOp;
2803 }
2804
2805 Instruction *getMainOp() const {
2806 return MainOp;
2807 }
2808
2809 Instruction *getAltOp() const {
2810 return AltOp;
2811 }
2812
2813 /// The main/alternate opcodes for the list of instructions.
2814 unsigned getOpcode() const {
2815 return MainOp ? MainOp->getOpcode() : 0;
2816 }
2817
2818 unsigned getAltOpcode() const {
2819 return AltOp ? AltOp->getOpcode() : 0;
2820 }
2821
2822 /// When ReuseReorderShuffleIndices is empty it just returns the position of
2823 /// \p V within the Scalars vector. Otherwise, tries to remap via its reuse index.
2824 int findLaneForValue(Value *V) const {
2825 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2826 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2827 if (!ReorderIndices.empty())
2828 FoundLane = ReorderIndices[FoundLane];
2829 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2830 if (!ReuseShuffleIndices.empty()) {
2831 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2832 find(ReuseShuffleIndices, FoundLane));
2833 }
2834 return FoundLane;
2835 }
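 // Illustrative sketch (not from the original source): with Scalars = {a, b},
 // empty ReorderIndices and ReuseShuffleIndices = {0, 1, 0, 1},
 // findLaneForValue(b) first finds b at position 1 in Scalars and then remaps
 // it to the first position holding 1 in ReuseShuffleIndices, returning 1.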
2836
2837 /// Build a shuffle mask for graph entry which represents a merge of main
2838 /// and alternate operations.
2839 void
2840 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2841 SmallVectorImpl<int> &Mask,
2842 SmallVectorImpl<Value *> *OpScalars = nullptr,
2843 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2844
2845 /// Return true if this is a non-power-of-2 node.
2846 bool isNonPowOf2Vec() const {
2847 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
2848 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
2849 "Reshuffling not supported with non-power-of-2 vectors yet.");
2850 return IsNonPowerOf2;
2851 }
2852
2853#ifndef NDEBUG
2854 /// Debug printer.
2855 LLVM_DUMP_METHOD void dump() const {
2856 dbgs() << Idx << ".\n";
2857 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2858 dbgs() << "Operand " << OpI << ":\n";
2859 for (const Value *V : Operands[OpI])
2860 dbgs().indent(2) << *V << "\n";
2861 }
2862 dbgs() << "Scalars: \n";
2863 for (Value *V : Scalars)
2864 dbgs().indent(2) << *V << "\n";
2865 dbgs() << "State: ";
2866 switch (State) {
2867 case Vectorize:
2868 dbgs() << "Vectorize\n";
2869 break;
2870 case ScatterVectorize:
2871 dbgs() << "ScatterVectorize\n";
2872 break;
2873 case StridedVectorize:
2874 dbgs() << "StridedVectorize\n";
2875 break;
2876 case NeedToGather:
2877 dbgs() << "NeedToGather\n";
2878 break;
2879 }
2880 dbgs() << "MainOp: ";
2881 if (MainOp)
2882 dbgs() << *MainOp << "\n";
2883 else
2884 dbgs() << "NULL\n";
2885 dbgs() << "AltOp: ";
2886 if (AltOp)
2887 dbgs() << *AltOp << "\n";
2888 else
2889 dbgs() << "NULL\n";
2890 dbgs() << "VectorizedValue: ";
2891 if (VectorizedValue)
2892 dbgs() << *VectorizedValue << "\n";
2893 else
2894 dbgs() << "NULL\n";
2895 dbgs() << "ReuseShuffleIndices: ";
2896 if (ReuseShuffleIndices.empty())
2897 dbgs() << "Empty";
2898 else
2899 for (int ReuseIdx : ReuseShuffleIndices)
2900 dbgs() << ReuseIdx << ", ";
2901 dbgs() << "\n";
2902 dbgs() << "ReorderIndices: ";
2903 for (unsigned ReorderIdx : ReorderIndices)
2904 dbgs() << ReorderIdx << ", ";
2905 dbgs() << "\n";
2906 dbgs() << "UserTreeIndices: ";
2907 for (const auto &EInfo : UserTreeIndices)
2908 dbgs() << EInfo << ", ";
2909 dbgs() << "\n";
2910 }
2911#endif
2912 };
2913
2914#ifndef NDEBUG
2915 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2916 InstructionCost VecCost, InstructionCost ScalarCost,
2917 StringRef Banner) const {
2918 dbgs() << "SLP: " << Banner << ":\n";
2919 E->dump();
2920 dbgs() << "SLP: Costs:\n";
2921 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2922 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2923 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2924 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2925 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2926 }
2927#endif
2928
2929 /// Create a new VectorizableTree entry.
2930 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2931 std::optional<ScheduleData *> Bundle,
2932 const InstructionsState &S,
2933 const EdgeInfo &UserTreeIdx,
2934 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2935 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2936 TreeEntry::EntryState EntryState =
2937 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2938 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2939 ReuseShuffleIndices, ReorderIndices);
2940 }
2941
2942 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2943 TreeEntry::EntryState EntryState,
2944 std::optional<ScheduleData *> Bundle,
2945 const InstructionsState &S,
2946 const EdgeInfo &UserTreeIdx,
2947 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2948 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2949 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2950 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2951 "Need to vectorize gather entry?");
2952 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2953 TreeEntry *Last = VectorizableTree.back().get();
2954 Last->Idx = VectorizableTree.size() - 1;
2955 Last->State = EntryState;
2956 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2957 ReuseShuffleIndices.end());
2958 if (ReorderIndices.empty()) {
2959 Last->Scalars.assign(VL.begin(), VL.end());
2960 Last->setOperations(S);
2961 } else {
2962 // Reorder scalars and build final mask.
2963 Last->Scalars.assign(VL.size(), nullptr);
2964 transform(ReorderIndices, Last->Scalars.begin(),
2965 [VL](unsigned Idx) -> Value * {
2966 if (Idx >= VL.size())
2967 return UndefValue::get(VL.front()->getType());
2968 return VL[Idx];
2969 });
2970 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
2971 Last->setOperations(S);
2972 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2973 }
2974 if (Last->State != TreeEntry::NeedToGather) {
2975 for (Value *V : VL) {
2976 const TreeEntry *TE = getTreeEntry(V);
2977 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
2978 "Scalar already in tree!");
2979 if (TE) {
2980 if (TE != Last)
2981 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
2982 continue;
2983 }
2984 ScalarToTreeEntry[V] = Last;
2985 }
2986 // Update the scheduler bundle to point to this TreeEntry.
2987 ScheduleData *BundleMember = *Bundle;
2988 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2989 isVectorLikeInstWithConstOps(S.MainOp) ||
2990 doesNotNeedToSchedule(VL)) &&
2991 "Bundle and VL out of sync");
2992 if (BundleMember) {
2993 for (Value *V : VL) {
2995 continue;
2996 if (!BundleMember)
2997 continue;
2998 BundleMember->TE = Last;
2999 BundleMember = BundleMember->NextInBundle;
3000 }
3001 }
3002 assert(!BundleMember && "Bundle and VL out of sync");
3003 } else {
3004 // Build a map for gathered scalars to the nodes where they are used.
3005 bool AllConstsOrCasts = true;
3006 for (Value *V : VL)
3007 if (!isConstant(V)) {
3008 auto *I = dyn_cast<CastInst>(V);
3009 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3010 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3011 }
3012 if (AllConstsOrCasts)
3013 CastMaxMinBWSizes =
3014 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3015 MustGather.insert(VL.begin(), VL.end());
3016 }
3017
3018 if (UserTreeIdx.UserTE) {
3019 Last->UserTreeIndices.push_back(UserTreeIdx);
3020 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3021 "Reordering isn't implemented for non-power-of-2 nodes yet");
3022 }
3023 return Last;
3024 }
3025
3026 /// -- Vectorization State --
3027 /// Holds all of the tree entries.
3028 TreeEntry::VecTreeTy VectorizableTree;
3029
3030#ifndef NDEBUG
3031 /// Debug printer.
3032 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3033 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3034 VectorizableTree[Id]->dump();
3035 dbgs() << "\n";
3036 }
3037 }
3038#endif
3039
3040 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3041
3042 const TreeEntry *getTreeEntry(Value *V) const {
3043 return ScalarToTreeEntry.lookup(V);
3044 }
3045
3046  /// Checks that the operand node of an alternate node does not generate a
3047  /// buildvector sequence. If it does, it is probably not worth building the
3048  /// alternate shuffle, when the number of buildvector operands plus the
3049  /// alternate instruction exceeds the number of buildvector instructions.
3050 /// \param S the instructions state of the analyzed values.
3051 /// \param VL list of the instructions with alternate opcodes.
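  /// Illustrative note (added, not taken from the source): for an alternate
  /// node such as {add, sub, add, sub}, if one of its operand lanes would
  /// itself have to be gathered into a buildvector of comparable size,
  /// emitting the alternate shuffle may not pay off compared to simply
  /// building the vector directly.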
3052 bool areAltOperandsProfitable(const InstructionsState &S,
3053 ArrayRef<Value *> VL) const;
3054
3055 /// Checks if the specified list of the instructions/values can be vectorized
3056 /// and fills required data before actual scheduling of the instructions.
3057 TreeEntry::EntryState getScalarsVectorizationState(
3058 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3059 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3060
3061 /// Maps a specific scalar to its tree entry.
3062 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3063
3064  /// Maps scalars that are used in several vectorized nodes to the list of
3065  /// those nodes.
3066  SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3067
3068 /// Maps a value to the proposed vectorizable size.
3069 SmallDenseMap<Value *, unsigned> InstrElementSize;
3070
3071 /// A list of scalars that we found that we need to keep as scalars.
3072 ValueSet MustGather;
3073
3074  /// A map between the vectorized entries and the last instructions in the
3075  /// bundles. The bundles are built in use order, not in the def order of the
3076  /// instructions. So, we cannot rely directly on the last instruction in the
3077  /// bundle being the last instruction in program order during the
3078  /// vectorization process, since the basic blocks are affected; the last
3079  /// instructions need to be pre-gathered beforehand.
3080 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3081
3082  /// List of gather nodes that depend on other gather/vector nodes and should
3083  /// be emitted after the vector instruction emission process to correctly
3084  /// handle the order of the vector instructions and shuffles.
3085 SetVector<const TreeEntry *> PostponedGathers;
3086
3087 using ValueToGatherNodesMap =
3088      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3089  ValueToGatherNodesMap ValueToGatherNodes;
3090
3091 /// This POD struct describes one external user in the vectorized tree.
3092 struct ExternalUser {
3093 ExternalUser(Value *S, llvm::User *U, int L)
3094 : Scalar(S), User(U), Lane(L) {}
3095
3096 // Which scalar in our function.
3097 Value *Scalar;
3098
3099    // The user that uses the scalar.
3100    llvm::User *User;
3101
3102 // Which lane does the scalar belong to.
3103 int Lane;
3104 };
3105 using UserList = SmallVector<ExternalUser, 16>;
3106
3107 /// Checks if two instructions may access the same memory.
3108 ///
3109 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3110 /// is invariant in the calling loop.
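  /// Note (added for clarity): the result is cached for both the (Inst1,
  /// Inst2) and the (Inst2, Inst1) orderings, so repeated queries in either
  /// direction hit the cache.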
3111 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3112 Instruction *Inst2) {
3113 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3114 return true;
3115 // First check if the result is already in the cache.
3116 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3117 auto It = AliasCache.find(Key);
3118 if (It != AliasCache.end())
3119 return It->second;
3120 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3121 // Store the result in the cache.
3122 AliasCache.try_emplace(Key, Aliased);
3123 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3124 return Aliased;
3125 }
3126
3127 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3128
3129 /// Cache for alias results.
3130 /// TODO: consider moving this to the AliasAnalysis itself.
3131  DenseMap<AliasCacheKey, bool> AliasCache;
3132
3133 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3134 // globally through SLP because we don't perform any action which
3135 // invalidates capture results.
3136 BatchAAResults BatchAA;
3137
3138 /// Temporary store for deleted instructions. Instructions will be deleted
3139 /// eventually when the BoUpSLP is destructed. The deferral is required to
3140 /// ensure that there are no incorrect collisions in the AliasCache, which
3141 /// can happen if a new instruction is allocated at the same address as a
3142 /// previously deleted instruction.
3143 DenseSet<Instruction *> DeletedInstructions;
3144
3145  /// Set of the instructions already analyzed for reductions.
3146 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3147
3148 /// Set of hashes for the list of reduction values already being analyzed.
3149 DenseSet<size_t> AnalyzedReductionVals;
3150
3151  /// Values that have already been analyzed for minimal bitwidth and found
3152  /// to be non-profitable.
3153 DenseSet<Value *> AnalyzedMinBWVals;
3154
3155  /// A list of values that need to be extracted out of the tree.
3156 /// This list holds pairs of (Internal Scalar : External User). External User
3157 /// can be nullptr, it means that this Internal Scalar will be used later,
3158 /// after vectorization.
3159 UserList ExternalUses;
3160
3161  /// A list of GEPs which can be replaced by scalar GEPs instead of
3162 /// extractelement instructions.
3163 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3164
3165 /// Values used only by @llvm.assume calls.
3167
3168 /// Holds all of the instructions that we gathered, shuffle instructions and
3169 /// extractelements.
3170 SetVector<Instruction *> GatherShuffleExtractSeq;
3171
3172 /// A list of blocks that we are going to CSE.
3173 DenseSet<BasicBlock *> CSEBlocks;
3174
3175 /// Contains all scheduling relevant data for an instruction.
3176 /// A ScheduleData either represents a single instruction or a member of an
3177 /// instruction bundle (= a group of instructions which is combined into a
3178 /// vector instruction).
3179 struct ScheduleData {
3180 // The initial value for the dependency counters. It means that the
3181 // dependencies are not calculated yet.
3182 enum { InvalidDeps = -1 };
3183
3184 ScheduleData() = default;
3185
3186 void init(int BlockSchedulingRegionID, Value *OpVal) {
3187 FirstInBundle = this;
3188 NextInBundle = nullptr;
3189 NextLoadStore = nullptr;
3190 IsScheduled = false;
3191 SchedulingRegionID = BlockSchedulingRegionID;
3192 clearDependencies();
3193 OpValue = OpVal;
3194 TE = nullptr;
3195 }
3196
3197 /// Verify basic self consistency properties
3198 void verify() {
3199 if (hasValidDependencies()) {
3200 assert(UnscheduledDeps <= Dependencies && "invariant");
3201 } else {
3202 assert(UnscheduledDeps == Dependencies && "invariant");
3203 }
3204
3205 if (IsScheduled) {
3206 assert(isSchedulingEntity() &&
3207 "unexpected scheduled state");
3208 for (const ScheduleData *BundleMember = this; BundleMember;
3209 BundleMember = BundleMember->NextInBundle) {
3210 assert(BundleMember->hasValidDependencies() &&
3211 BundleMember->UnscheduledDeps == 0 &&
3212 "unexpected scheduled state");
3213 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3214 "only bundle is marked scheduled");
3215 }
3216 }
3217
3218 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3219 "all bundle members must be in same basic block");
3220 }
3221
3222 /// Returns true if the dependency information has been calculated.
3223    /// Note that dependency validity can vary between instructions within
3224 /// a single bundle.
3225 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3226
3227 /// Returns true for single instructions and for bundle representatives
3228 /// (= the head of a bundle).
3229 bool isSchedulingEntity() const { return FirstInBundle == this; }
3230
3231 /// Returns true if it represents an instruction bundle and not only a
3232 /// single instruction.
3233 bool isPartOfBundle() const {
3234 return NextInBundle != nullptr || FirstInBundle != this || TE;
3235 }
3236
3237 /// Returns true if it is ready for scheduling, i.e. it has no more
3238 /// unscheduled depending instructions/bundles.
3239 bool isReady() const {
3240 assert(isSchedulingEntity() &&
3241 "can't consider non-scheduling entity for ready list");
3242 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3243 }
3244
3245 /// Modifies the number of unscheduled dependencies for this instruction,
3246 /// and returns the number of remaining dependencies for the containing
3247 /// bundle.
3248 int incrementUnscheduledDeps(int Incr) {
3249 assert(hasValidDependencies() &&
3250 "increment of unscheduled deps would be meaningless");
3251 UnscheduledDeps += Incr;
3252 return FirstInBundle->unscheduledDepsInBundle();
3253 }
3254
3255 /// Sets the number of unscheduled dependencies to the number of
3256 /// dependencies.
3257 void resetUnscheduledDeps() {
3258 UnscheduledDeps = Dependencies;
3259 }
3260
3261 /// Clears all dependency information.
3262 void clearDependencies() {
3263 Dependencies = InvalidDeps;
3264 resetUnscheduledDeps();
3265 MemoryDependencies.clear();
3266 ControlDependencies.clear();
3267 }
3268
3269 int unscheduledDepsInBundle() const {
3270 assert(isSchedulingEntity() && "only meaningful on the bundle");
3271 int Sum = 0;
3272 for (const ScheduleData *BundleMember = this; BundleMember;
3273 BundleMember = BundleMember->NextInBundle) {
3274 if (BundleMember->UnscheduledDeps == InvalidDeps)
3275 return InvalidDeps;
3276 Sum += BundleMember->UnscheduledDeps;
3277 }
3278 return Sum;
3279 }
3280
3281 void dump(raw_ostream &os) const {
3282 if (!isSchedulingEntity()) {
3283 os << "/ " << *Inst;
3284 } else if (NextInBundle) {
3285 os << '[' << *Inst;
3286 ScheduleData *SD = NextInBundle;
3287 while (SD) {
3288 os << ';' << *SD->Inst;
3289 SD = SD->NextInBundle;
3290 }
3291 os << ']';
3292 } else {
3293 os << *Inst;
3294 }
3295 }
3296
3297 Instruction *Inst = nullptr;
3298
3299 /// Opcode of the current instruction in the schedule data.
3300 Value *OpValue = nullptr;
3301
3302 /// The TreeEntry that this instruction corresponds to.
3303 TreeEntry *TE = nullptr;
3304
3305 /// Points to the head in an instruction bundle (and always to this for
3306 /// single instructions).
3307 ScheduleData *FirstInBundle = nullptr;
3308
3309 /// Single linked list of all instructions in a bundle. Null if it is a
3310 /// single instruction.
3311 ScheduleData *NextInBundle = nullptr;
3312
3313 /// Single linked list of all memory instructions (e.g. load, store, call)
3314 /// in the block - until the end of the scheduling region.
3315 ScheduleData *NextLoadStore = nullptr;
3316
3317 /// The dependent memory instructions.
3318 /// This list is derived on demand in calculateDependencies().
3319 SmallVector<ScheduleData *, 4> MemoryDependencies;
3320
3321 /// List of instructions which this instruction could be control dependent
3322 /// on. Allowing such nodes to be scheduled below this one could introduce
3323 /// a runtime fault which didn't exist in the original program.
3324 /// ex: this is a load or udiv following a readonly call which inf loops
3325 SmallVector<ScheduleData *, 4> ControlDependencies;
3326
3327 /// This ScheduleData is in the current scheduling region if this matches
3328 /// the current SchedulingRegionID of BlockScheduling.
3329 int SchedulingRegionID = 0;
3330
3331 /// Used for getting a "good" final ordering of instructions.
3332 int SchedulingPriority = 0;
3333
3334    /// The number of dependencies. Consists of the number of users of the
3335 /// instruction plus the number of dependent memory instructions (if any).
3336 /// This value is calculated on demand.
3337 /// If InvalidDeps, the number of dependencies is not calculated yet.
3338 int Dependencies = InvalidDeps;
3339
3340 /// The number of dependencies minus the number of dependencies of scheduled
3341 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3342 /// for scheduling.
3343 /// Note that this is negative as long as Dependencies is not calculated.
3344 int UnscheduledDeps = InvalidDeps;
3345
3346 /// True if this instruction is scheduled (or considered as scheduled in the
3347 /// dry-run).
3348 bool IsScheduled = false;
3349 };
3350
3351#ifndef NDEBUG
3353 const BoUpSLP::ScheduleData &SD) {
3354 SD.dump(os);
3355 return os;
3356 }
3357#endif
3358
3359 friend struct GraphTraits<BoUpSLP *>;
3360 friend struct DOTGraphTraits<BoUpSLP *>;
3361
3362 /// Contains all scheduling data for a basic block.
3363  /// It does not schedule instructions which are not memory read/write
3364  /// instructions and whose operands are either constants, arguments, phis,
3365  /// or instructions from other blocks, or whose users are phis or live in
3366  /// other blocks. The resulting vector instructions can be placed at the
3367  /// beginning of the basic block without scheduling (if the operands do not
3368  /// need to be scheduled) or at the end of the block (if the users are
3369  /// outside of the block). This saves some compile time and memory used by
3370  /// the compiler.
3371  /// ScheduleData is assigned for each instruction in between the boundaries
3372  /// of the tree entry, even for those which are not part of the graph. It is
3373  /// required to correctly follow the dependencies between the instructions
3374  /// and to schedule them correctly. ScheduleData is not allocated for
3375  /// instructions which do not require scheduling, like phis, nodes with only
3376  /// extractelements/insertelements, or nodes whose instructions have
3377  /// uses/operands outside of the block.
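  /// Illustrative example (added, not from the source): a PHI node, or an
  /// extractelement whose only user lives in another block, gets no
  /// ScheduleData, while a pair of adjacent stores that is being bundled into
  /// a vector store does, together with every instruction between them.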
3378 struct BlockScheduling {
3379 BlockScheduling(BasicBlock *BB)
3380 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3381
3382 void clear() {
3383 ReadyInsts.clear();
3384 ScheduleStart = nullptr;
3385 ScheduleEnd = nullptr;
3386 FirstLoadStoreInRegion = nullptr;
3387 LastLoadStoreInRegion = nullptr;
3388 RegionHasStackSave = false;
3389
3390 // Reduce the maximum schedule region size by the size of the
3391 // previous scheduling run.
3392 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3393 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3394 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3395 ScheduleRegionSize = 0;
3396
3397 // Make a new scheduling region, i.e. all existing ScheduleData is not
3398 // in the new region yet.
3399 ++SchedulingRegionID;
3400 }
3401
3402 ScheduleData *getScheduleData(Instruction *I) {
3403 if (BB != I->getParent())
3404 // Avoid lookup if can't possibly be in map.
3405 return nullptr;
3406 ScheduleData *SD = ScheduleDataMap.lookup(I);
3407 if (SD && isInSchedulingRegion(SD))
3408 return SD;
3409 return nullptr;
3410 }
3411
3412 ScheduleData *getScheduleData(Value *V) {
3413 if (auto *I = dyn_cast<Instruction>(V))
3414 return getScheduleData(I);
3415 return nullptr;
3416 }
3417
3418 ScheduleData *getScheduleData(Value *V, Value *Key) {
3419 if (V == Key)
3420 return getScheduleData(V);
3421 auto I = ExtraScheduleDataMap.find(V);
3422 if (I != ExtraScheduleDataMap.end()) {
3423 ScheduleData *SD = I->second.lookup(Key);
3424 if (SD && isInSchedulingRegion(SD))
3425 return SD;
3426 }
3427 return nullptr;
3428 }
3429
3430 bool isInSchedulingRegion(ScheduleData *SD) const {
3431 return SD->SchedulingRegionID == SchedulingRegionID;
3432 }
3433
3434 /// Marks an instruction as scheduled and puts all dependent ready
3435 /// instructions into the ready-list.
3436 template <typename ReadyListType>
3437 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3438 SD->IsScheduled = true;
3439 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3440
3441 for (ScheduleData *BundleMember = SD; BundleMember;
3442 BundleMember = BundleMember->NextInBundle) {
3443 if (BundleMember->Inst != BundleMember->OpValue)
3444 continue;
3445
3446 // Handle the def-use chain dependencies.
3447
3448 // Decrement the unscheduled counter and insert to ready list if ready.
3449 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3450 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3451 if (OpDef && OpDef->hasValidDependencies() &&
3452 OpDef->incrementUnscheduledDeps(-1) == 0) {
3453 // There are no more unscheduled dependencies after
3454 // decrementing, so we can put the dependent instruction
3455 // into the ready list.
3456 ScheduleData *DepBundle = OpDef->FirstInBundle;
3457 assert(!DepBundle->IsScheduled &&
3458 "already scheduled bundle gets ready");
3459 ReadyList.insert(DepBundle);
3460 LLVM_DEBUG(dbgs()
3461 << "SLP: gets ready (def): " << *DepBundle << "\n");
3462 }
3463 });
3464 };
3465
3466 // If BundleMember is a vector bundle, its operands may have been
3467 // reordered during buildTree(). We therefore need to get its operands
3468 // through the TreeEntry.
3469 if (TreeEntry *TE = BundleMember->TE) {
3470 // Need to search for the lane since the tree entry can be reordered.
3471 int Lane = std::distance(TE->Scalars.begin(),
3472 find(TE->Scalars, BundleMember->Inst));
3473 assert(Lane >= 0 && "Lane not set");
3474
3475 // Since vectorization tree is being built recursively this assertion
3476 // ensures that the tree entry has all operands set before reaching
3477 // this code. Couple of exceptions known at the moment are extracts
3478 // where their second (immediate) operand is not added. Since
3479 // immediates do not affect scheduler behavior this is considered
3480 // okay.
3481 auto *In = BundleMember->Inst;
3482 assert(
3483 In &&
3484 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3485 In->getNumOperands() == TE->getNumOperands()) &&
3486 "Missed TreeEntry operands?");
3487 (void)In; // fake use to avoid build failure when assertions disabled
3488
3489 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3490 OpIdx != NumOperands; ++OpIdx)
3491 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3492 DecrUnsched(I);
3493 } else {
3494 // If BundleMember is a stand-alone instruction, no operand reordering
3495 // has taken place, so we directly access its operands.
3496 for (Use &U : BundleMember->Inst->operands())
3497 if (auto *I = dyn_cast<Instruction>(U.get()))
3498 DecrUnsched(I);
3499 }
3500 // Handle the memory dependencies.
3501 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3502 if (MemoryDepSD->hasValidDependencies() &&
3503 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3504 // There are no more unscheduled dependencies after decrementing,
3505 // so we can put the dependent instruction into the ready list.
3506 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3507 assert(!DepBundle->IsScheduled &&
3508 "already scheduled bundle gets ready");
3509 ReadyList.insert(DepBundle);
3510            LLVM_DEBUG(dbgs()
3511                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
3512 }
3513 }
3514 // Handle the control dependencies.
3515 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3516 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3517 // There are no more unscheduled dependencies after decrementing,
3518 // so we can put the dependent instruction into the ready list.
3519 ScheduleData *DepBundle = DepSD->FirstInBundle;
3520 assert(!DepBundle->IsScheduled &&
3521 "already scheduled bundle gets ready");
3522 ReadyList.insert(DepBundle);
3523            LLVM_DEBUG(dbgs()
3524                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3525 }
3526 }
3527 }
3528 }
3529
3530 /// Verify basic self consistency properties of the data structure.
3531 void verify() {
3532 if (!ScheduleStart)
3533 return;
3534
3535 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3536 ScheduleStart->comesBefore(ScheduleEnd) &&
3537 "Not a valid scheduling region?");
3538
3539 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3540 auto *SD = getScheduleData(I);
3541 if (!SD)
3542 continue;
3543 assert(isInSchedulingRegion(SD) &&
3544 "primary schedule data not in window?");
3545 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3546 "entire bundle in window!");
3547 (void)SD;
3548 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3549 }
3550
3551 for (auto *SD : ReadyInsts) {
3552 assert(SD->isSchedulingEntity() && SD->isReady() &&
3553 "item in ready list not ready?");
3554 (void)SD;
3555 }
3556 }
3557
3558 void doForAllOpcodes(Value *V,
3559 function_ref<void(ScheduleData *SD)> Action) {
3560 if (ScheduleData *SD = getScheduleData(V))
3561 Action(SD);
3562 auto I = ExtraScheduleDataMap.find(V);
3563 if (I != ExtraScheduleDataMap.end())
3564 for (auto &P : I->second)
3565 if (isInSchedulingRegion(P.second))
3566 Action(P.second);
3567 }
3568
3569 /// Put all instructions into the ReadyList which are ready for scheduling.
3570 template <typename ReadyListType>
3571 void initialFillReadyList(ReadyListType &ReadyList) {
3572 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3573 doForAllOpcodes(I, [&](ScheduleData *SD) {
3574 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3575 SD->isReady()) {
3576 ReadyList.insert(SD);
3577 LLVM_DEBUG(dbgs()
3578 << "SLP: initially in ready list: " << *SD << "\n");
3579 }
3580 });
3581 }
3582 }
3583
3584 /// Build a bundle from the ScheduleData nodes corresponding to the
3585 /// scalar instruction for each lane.
3586 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3587
3588 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3589 /// cyclic dependencies. This is only a dry-run, no instructions are
3590 /// actually moved at this stage.
3591 /// \returns the scheduling bundle. The returned Optional value is not
3592 /// std::nullopt if \p VL is allowed to be scheduled.
3593 std::optional<ScheduleData *>
3594 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3595 const InstructionsState &S);
3596
3597 /// Un-bundles a group of instructions.
3598 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3599
3600 /// Allocates schedule data chunk.
3601 ScheduleData *allocateScheduleDataChunks();
3602
3603 /// Extends the scheduling region so that V is inside the region.
3604 /// \returns true if the region size is within the limit.
3605 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3606
3607 /// Initialize the ScheduleData structures for new instructions in the
3608 /// scheduling region.
3609 void initScheduleData(Instruction *FromI, Instruction *ToI,
3610 ScheduleData *PrevLoadStore,
3611 ScheduleData *NextLoadStore);
3612
3613 /// Updates the dependency information of a bundle and of all instructions/
3614 /// bundles which depend on the original bundle.
3615 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3616 BoUpSLP *SLP);
3617
3618    /// Sets all instructions in the scheduling region to un-scheduled.
3619 void resetSchedule();
3620
3621 BasicBlock *BB;
3622
3623 /// Simple memory allocation for ScheduleData.
3625
3626 /// The size of a ScheduleData array in ScheduleDataChunks.
3627 int ChunkSize;
3628
3629 /// The allocator position in the current chunk, which is the last entry
3630 /// of ScheduleDataChunks.
3631 int ChunkPos;
3632
3633 /// Attaches ScheduleData to Instruction.
3634 /// Note that the mapping survives during all vectorization iterations, i.e.
3635 /// ScheduleData structures are recycled.
3636    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3637
3638 /// Attaches ScheduleData to Instruction with the leading key.
3639    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3640        ExtraScheduleDataMap;
3641
3642 /// The ready-list for scheduling (only used for the dry-run).
3643 SetVector<ScheduleData *> ReadyInsts;
3644
3645 /// The first instruction of the scheduling region.
3646 Instruction *ScheduleStart = nullptr;
3647
3648 /// The first instruction _after_ the scheduling region.
3649 Instruction *ScheduleEnd = nullptr;
3650
3651 /// The first memory accessing instruction in the scheduling region
3652 /// (can be null).
3653 ScheduleData *FirstLoadStoreInRegion = nullptr;
3654
3655 /// The last memory accessing instruction in the scheduling region
3656 /// (can be null).
3657 ScheduleData *LastLoadStoreInRegion = nullptr;
3658
3659 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3660 /// region? Used to optimize the dependence calculation for the
3661 /// common case where there isn't.
3662 bool RegionHasStackSave = false;
3663
3664 /// The current size of the scheduling region.
3665 int ScheduleRegionSize = 0;
3666
3667 /// The maximum size allowed for the scheduling region.
3668 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3669
3670 /// The ID of the scheduling region. For a new vectorization iteration this
3671 /// is incremented which "removes" all ScheduleData from the region.
3672 /// Make sure that the initial SchedulingRegionID is greater than the
3673 /// initial SchedulingRegionID in ScheduleData (which is 0).
3674 int SchedulingRegionID = 1;
3675 };
3676
3677 /// Attaches the BlockScheduling structures to basic blocks.
3678  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3679
3680 /// Performs the "real" scheduling. Done before vectorization is actually
3681 /// performed in a basic block.
3682 void scheduleBlock(BlockScheduling *BS);
3683
3684 /// List of users to ignore during scheduling and that don't need extracting.
3685 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3686
3687 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3688 /// sorted SmallVectors of unsigned.
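  /// A minimal usage sketch (assumed, not taken verbatim from this file):
  ///   DenseSet<OrdersType, OrdersTypeDenseMapInfo> Orders;
  /// The empty/tombstone keys use the otherwise unused values ~1U and ~2U.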
3689 struct OrdersTypeDenseMapInfo {
3690 static OrdersType getEmptyKey() {
3691 OrdersType V;
3692 V.push_back(~1U);
3693 return V;
3694 }
3695
3696 static OrdersType getTombstoneKey() {
3697 OrdersType V;
3698 V.push_back(~2U);
3699 return V;
3700 }
3701
3702 static unsigned getHashValue(const OrdersType &V) {
3703 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3704 }
3705
3706 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3707 return LHS == RHS;
3708 }
3709 };
3710
3711 // Analysis and block reference.
3712 Function *F;
3713 ScalarEvolution *SE;
3714  TargetTransformInfo *TTI;
3715  TargetLibraryInfo *TLI;
3716 LoopInfo *LI;
3717 DominatorTree *DT;
3718 AssumptionCache *AC;
3719 DemandedBits *DB;
3720 const DataLayout *DL;
3722
3723 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3724 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3725
3726 /// Instruction builder to construct the vectorized tree.
3728
3729 /// A map of scalar integer values to the smallest bit width with which they
3730 /// can legally be represented. The values map to (width, signed) pairs,
3731 /// where "width" indicates the minimum bit width and "signed" is True if the
3732 /// value must be signed-extended, rather than zero-extended, back to its
3733 /// original width.
3735
3736 /// Final size of the reduced vector, if the current graph represents the
3737 /// input for the reduction and it was possible to narrow the size of the
3738 /// reduction.
3739 unsigned ReductionBitWidth = 0;
3740
3741 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
3742 /// type sizes, used in the tree.
3743 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3744
3745  /// Indices of the vectorized nodes, which are supposed to be the roots of the
3746 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
3747 DenseSet<unsigned> ExtraBitWidthNodes;
3748};
3749
3750} // end namespace slpvectorizer
3751
3752template <> struct GraphTraits<BoUpSLP *> {
3753 using TreeEntry = BoUpSLP::TreeEntry;
3754
3755 /// NodeRef has to be a pointer per the GraphWriter.
3756  using NodeRef = TreeEntry *;
3757
3758  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3759
3760 /// Add the VectorizableTree to the index iterator to be able to return
3761 /// TreeEntry pointers.
3762 struct ChildIteratorType
3763 : public iterator_adaptor_base<
3764 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3765    ContainerTy &VectorizableTree;
3766
3767    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3768                      ContainerTy &VT)
3769 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3770
3771 NodeRef operator*() { return I->UserTE; }
3772 };
3773
3774  static NodeRef getEntryNode(BoUpSLP &R) {
3775    return R.VectorizableTree[0].get();
3776 }
3777
3778 static ChildIteratorType child_begin(NodeRef N) {
3779 return {N->UserTreeIndices.begin(), N->Container};
3780 }
3781
3782 static ChildIteratorType child_end(NodeRef N) {
3783 return {N->UserTreeIndices.end(), N->Container};
3784 }
3785
3786 /// For the node iterator we just need to turn the TreeEntry iterator into a
3787 /// TreeEntry* iterator so that it dereferences to NodeRef.
3788 class nodes_iterator {
3789    using ItTy = ContainerTy::iterator;
3790    ItTy It;
3791
3792 public:
3793 nodes_iterator(const ItTy &It2) : It(It2) {}
3794 NodeRef operator*() { return It->get(); }
3795 nodes_iterator operator++() {
3796 ++It;
3797 return *this;
3798 }
3799 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3800 };
3801
3802 static nodes_iterator nodes_begin(BoUpSLP *R) {
3803 return nodes_iterator(R->VectorizableTree.begin());
3804 }
3805
3806 static nodes_iterator nodes_end(BoUpSLP *R) {
3807 return nodes_iterator(R->VectorizableTree.end());
3808 }
3809
3810 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3811};
3812
3813template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3814 using TreeEntry = BoUpSLP::TreeEntry;
3815
3816 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3817
3818 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3819 std::string Str;
3820    raw_string_ostream OS(Str);
3821    OS << Entry->Idx << ".\n";
3822 if (isSplat(Entry->Scalars))
3823 OS << "<splat> ";
3824 for (auto *V : Entry->Scalars) {
3825 OS << *V;
3826 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3827 return EU.Scalar == V;
3828 }))
3829 OS << " <extract>";
3830 OS << "\n";
3831 }
3832 return Str;
3833 }
3834
3835 static std::string getNodeAttributes(const TreeEntry *Entry,
3836 const BoUpSLP *) {
3837 if (Entry->State == TreeEntry::NeedToGather)
3838 return "color=red";
3839 if (Entry->State == TreeEntry::ScatterVectorize ||
3840 Entry->State == TreeEntry::StridedVectorize)
3841 return "color=blue";
3842 return "";
3843 }
3844};
3845
3846} // end namespace llvm
3847
3848BoUpSLP::~BoUpSLP() {
3849  SmallVector<WeakTrackingVH> DeadInsts;
3850  for (auto *I : DeletedInstructions) {
3851 for (Use &U : I->operands()) {
3852 auto *Op = dyn_cast<Instruction>(U.get());
3853      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3854          wouldInstructionBeTriviallyDead(Op, TLI))
3855        DeadInsts.emplace_back(Op);
3856 }
3857 I->dropAllReferences();
3858 }
3859 for (auto *I : DeletedInstructions) {
3860 assert(I->use_empty() &&
3861 "trying to erase instruction with users.");
3862 I->eraseFromParent();
3863 }
3864
3865 // Cleanup any dead scalar code feeding the vectorized instructions
3866  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3867
3868#ifdef EXPENSIVE_CHECKS
3869 // If we could guarantee that this call is not extremely slow, we could
3870 // remove the ifdef limitation (see PR47712).
3871 assert(!verifyFunction(*F, &dbgs()));
3872#endif
3873}
3874
3875/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3876/// contains the original mask for the scalars reused in the node. The
3877/// procedure transforms this mask in accordance with the given \p Mask.
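/// Worked example (added for illustration): with Reuses = {a, b, c, d} and
/// Mask = {1, 0, 3, 2}, each Prev[I] is written to Reuses[Mask[I]], giving
/// Reuses = {b, a, d, c}.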
3878static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3879  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3880 "Expected non-empty mask.");
3881 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3882 Prev.swap(Reuses);
3883 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3884 if (Mask[I] != PoisonMaskElem)
3885 Reuses[Mask[I]] = Prev[I];
3886}
3887
3888/// Reorders the given \p Order according to the given \p Mask. \p Order - is
3889/// the original order of the scalars. Procedure transforms the provided order
3890/// in accordance with the given \p Mask. If the resulting \p Order is just an
3891/// identity order, \p Order is cleared.
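/// For example (added for illustration), if applying \p Mask to \p Order
/// yields the identity permutation {0, 1, ..., Sz-1}, \p Order is simply
/// cleared.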
3892static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
3893                         bool BottomOrder = false) {
3894 assert(!Mask.empty() && "Expected non-empty mask.");
3895 unsigned Sz = Mask.size();
3896 if (BottomOrder) {
3897 SmallVector<unsigned> PrevOrder;
3898 if (Order.empty()) {
3899 PrevOrder.resize(Sz);
3900 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
3901 } else {
3902 PrevOrder.swap(Order);
3903 }
3904 Order.assign(Sz, Sz);
3905 for (unsigned I = 0; I < Sz; ++I)
3906 if (Mask[I] != PoisonMaskElem)
3907 Order[I] = PrevOrder[Mask[I]];
3908 if (all_of(enumerate(Order), [&](const auto &Data) {
3909 return Data.value() == Sz || Data.index() == Data.value();
3910 })) {
3911 Order.clear();
3912 return;
3913 }
3914 fixupOrderingIndices(Order);
3915 return;
3916 }
3917 SmallVector<int> MaskOrder;
3918 if (Order.empty()) {
3919 MaskOrder.resize(Sz);
3920 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3921 } else {
3922 inversePermutation(Order, MaskOrder);
3923 }
3924 reorderReuses(MaskOrder, Mask);
3925 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
3926 Order.clear();
3927 return;
3928 }
3929 Order.assign(Sz, Sz);
3930 for (unsigned I = 0; I < Sz; ++I)
3931 if (MaskOrder[I] != PoisonMaskElem)
3932 Order[MaskOrder[I]] = I;
3933 fixupOrderingIndices(Order);
3934}
3935
3936std::optional<BoUpSLP::OrdersType>
3937BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3938 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3939 // Try to find subvector extract/insert patterns and reorder only such
3940 // patterns.
3941 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
3942 Type *ScalarTy = GatheredScalars.front()->getType();
3943 int NumScalars = GatheredScalars.size();
3944 if (!isValidElementType(ScalarTy))
3945 return std::nullopt;
3946 auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
3947 int NumParts = TTI->getNumberOfParts(VecTy);
3948 if (NumParts == 0 || NumParts >= NumScalars)
3949 NumParts = 1;
3950 SmallVector<int> ExtractMask;
3951 SmallVector<int> Mask;
3952  SmallVector<SmallVector<const TreeEntry *>> Entries;
3953  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
3954      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
3955  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
3956      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
3957 /*ForOrder=*/true);
3958 // No shuffled operands - ignore.
3959 if (GatherShuffles.empty() && ExtractShuffles.empty())
3960 return std::nullopt;
3961 OrdersType CurrentOrder(NumScalars, NumScalars);
3962 if (GatherShuffles.size() == 1 &&
3963 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
3964 Entries.front().front()->isSame(TE.Scalars)) {
3965 // Perfect match in the graph, will reuse the previously vectorized
3966 // node. Cost is 0.
3967 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
3968 return CurrentOrder;
3969 }
3970 auto IsSplatMask = [](ArrayRef<int> Mask) {
3971 int SingleElt = PoisonMaskElem;
3972 return all_of(Mask, [&](int I) {
3973 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
3974 SingleElt = I;
3975 return I == PoisonMaskElem || I == SingleElt;
3976 });
3977 };
3978 // Exclusive broadcast mask - ignore.
3979 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
3980 (Entries.size() != 1 ||
3981 Entries.front().front()->ReorderIndices.empty())) ||
3982 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
3983 return std::nullopt;
3984 SmallBitVector ShuffledSubMasks(NumParts);
3985 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
3986 ArrayRef<int> Mask, int PartSz, int NumParts,
3987 function_ref<unsigned(unsigned)> GetVF) {
3988 for (int I : seq<int>(0, NumParts)) {
3989 if (ShuffledSubMasks.test(I))
3990 continue;
3991 const int VF = GetVF(I);
3992 if (VF == 0)
3993 continue;
3994 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
3995 // Shuffle of at least 2 vectors - ignore.
3996 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
3997 std::fill(Slice.begin(), Slice.end(), NumScalars);
3998 ShuffledSubMasks.set(I);
3999 continue;
4000 }
4001      // Try to include as many elements from the mask as possible.
4002      int FirstMin = INT_MAX;
4003      bool SecondVecFound = false;
4004 for (int K : seq<int>(0, PartSz)) {
4005 int Idx = Mask[I * PartSz + K];
4006 if (Idx == PoisonMaskElem) {
4007 Value *V = GatheredScalars[I * PartSz + K];
4008 if (isConstant(V) && !isa<PoisonValue>(V)) {
4009 SecondVecFound = true;
4010 break;
4011 }
4012 continue;
4013 }
4014 if (Idx < VF) {
4015 if (FirstMin > Idx)
4016 FirstMin = Idx;
4017 } else {
4018 SecondVecFound = true;
4019 break;
4020 }
4021 }
4022 FirstMin = (FirstMin / PartSz) * PartSz;
4023 // Shuffle of at least 2 vectors - ignore.
4024 if (SecondVecFound) {
4025 std::fill(Slice.begin(), Slice.end(), NumScalars);
4026 ShuffledSubMasks.set(I);
4027 continue;
4028 }
4029 for (int K : seq<int>(0, PartSz)) {
4030 int Idx = Mask[I * PartSz + K];
4031 if (Idx == PoisonMaskElem)
4032 continue;
4033 Idx -= FirstMin;
4034 if (Idx >= PartSz) {
4035 SecondVecFound = true;
4036 break;
4037 }
4038 if (CurrentOrder[I * PartSz + Idx] >
4039 static_cast<unsigned>(I * PartSz + K) &&
4040 CurrentOrder[I * PartSz + Idx] !=
4041 static_cast<unsigned>(I * PartSz + Idx))
4042 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4043 }
4044 // Shuffle of at least 2 vectors - ignore.
4045 if (SecondVecFound) {
4046 std::fill(Slice.begin(), Slice.end(), NumScalars);
4047 ShuffledSubMasks.set(I);
4048 continue;
4049 }
4050 }
4051 };
4052 int PartSz = NumScalars / NumParts;
4053 if (!ExtractShuffles.empty())
4054 TransformMaskToOrder(
4055 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4056 if (!ExtractShuffles[I])
4057 return 0U;
4058 unsigned VF = 0;
4059 for (unsigned Idx : seq<unsigned>(0, PartSz)) {
4060 int K = I * PartSz + Idx;
4061 if (ExtractMask[K] == PoisonMaskElem)
4062 continue;
4063 if (!TE.ReuseShuffleIndices.empty())
4064 K = TE.ReuseShuffleIndices[K];
4065 if (!TE.ReorderIndices.empty())
4066 K = std::distance(TE.ReorderIndices.begin(),
4067 find(TE.ReorderIndices, K));
4068 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4069 if (!EI)
4070 continue;
4071 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4072 ->getElementCount()
4073 .getKnownMinValue());
4074 }
4075 return VF;
4076 });
4077 // Check special corner case - single shuffle of the same entry.
4078 if (GatherShuffles.size() == 1 && NumParts != 1) {
4079 if (ShuffledSubMasks.any())
4080 return std::nullopt;
4081 PartSz = NumScalars;
4082 NumParts = 1;
4083 }
4084 if (!Entries.empty())
4085 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4086 if (!GatherShuffles[I])
4087 return 0U;
4088 return std::max(Entries[I].front()->getVectorFactor(),
4089 Entries[I].back()->getVectorFactor());
4090 });
4091 int NumUndefs =
4092 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4093 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4094 return std::nullopt;
4095 return std::move(CurrentOrder);
4096}
4097
4098static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4099 const TargetLibraryInfo &TLI,
4100 bool CompareOpcodes = true) {
4101 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4102 return false;
4103 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4104 if (!GEP1)
4105 return false;
4106 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4107 if (!GEP2)
4108 return false;
4109 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4110 ((isConstant(GEP1->getOperand(1)) &&
4111 isConstant(GEP2->getOperand(1))) ||
4112 !CompareOpcodes ||
4113 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4114 .getOpcode());
4115}
4116
4117/// Calculates minimal alignment as a common alignment.
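/// For example (illustration only): for loads aligned to 16, 8 and 4 bytes,
/// the common alignment is 4.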
4118template <typename T>
4119static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4120  Align CommonAlignment = cast<T>(VL.front())->getAlign();
4121 for (Value *V : VL.drop_front())
4122 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4123 return CommonAlignment;
4124}
4125
4126/// Check if \p Order represents reverse order.
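/// E.g. (illustration only) {3, 2, 1, 0} is a reverse order for 4 elements;
/// unused slots equal to the order size are ignored.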
4127static bool isReverseOrder(ArrayRef<unsigned> Order) {
4128  unsigned Sz = Order.size();
4129 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4130 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4131 });
4132}
4133
4134/// Checks if the provided list of pointers \p Pointers represents the strided
4135/// pointers for type ElemTy. If they are not, std::nullopt is returned.
4136/// Otherwise, if \p Inst is not specified, just initialized optional value is
4137/// returned to show that the pointers represent strided pointers. If \p Inst
4138/// specified, the runtime stride is materialized before the given \p Inst.
4139/// \returns std::nullopt if the pointers are not pointers with the runtime
4140/// stride, nullptr or actual stride value, otherwise.
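/// For illustration (not from the source): four i8 pointers p, p+s, p+2*s and
/// p+3*s, where s is a value only known at runtime, are recognized as a
/// strided access with stride s.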
4141static std::optional<Value *>
4142calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4143                  const DataLayout &DL, ScalarEvolution &SE,
4144 SmallVectorImpl<unsigned> &SortedIndices,
4145 Instruction *Inst = nullptr) {
4146  SmallVector<const SCEV *> SCEVs;
4147  const SCEV *PtrSCEVLowest = nullptr;
4148 const SCEV *PtrSCEVHighest = nullptr;
4149 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4150 // addresses).
4151 for (Value *Ptr : PointerOps) {
4152 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4153 if (!PtrSCEV)
4154 return std::nullopt;
4155 SCEVs.push_back(PtrSCEV);
4156 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4157 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4158 continue;
4159 }
4160 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4161 if (isa<SCEVCouldNotCompute>(Diff))
4162 return std::nullopt;
4163 if (Diff->isNonConstantNegative()) {
4164 PtrSCEVLowest = PtrSCEV;
4165 continue;
4166 }
4167 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4168 if (isa<SCEVCouldNotCompute>(Diff1))
4169 return std::nullopt;
4170 if (Diff1->isNonConstantNegative()) {
4171 PtrSCEVHighest = PtrSCEV;
4172 continue;
4173 }
4174 }
4175 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4176 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4177 if (isa<SCEVCouldNotCompute>(Dist))
4178 return std::nullopt;
4179 int Size = DL.getTypeStoreSize(ElemTy);
4180 auto TryGetStride = [&](const SCEV *Dist,
4181 const SCEV *Multiplier) -> const SCEV * {
4182 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4183 if (M->getOperand(0) == Multiplier)
4184 return M->getOperand(1);
4185 if (M->getOperand(1) == Multiplier)
4186 return M->getOperand(0);
4187 return nullptr;
4188 }
4189 if (Multiplier == Dist)
4190 return SE.getConstant(Dist->getType(), 1);
4191 return SE.getUDivExactExpr(Dist, Multiplier);
4192 };
4193 // Stride_in_elements = Dist / element_size * (num_elems - 1).
4194 const SCEV *Stride = nullptr;
4195 if (Size != 1 || SCEVs.size() > 2) {
4196 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4197 Stride = TryGetStride(Dist, Sz);
4198 if (!Stride)
4199 return std::nullopt;
4200 }
4201 if (!Stride || isa<SCEVConstant>(Stride))
4202 return std::nullopt;
4203 // Iterate through all pointers and check if all distances are
4204 // unique multiple of Stride.
4205 using DistOrdPair = std::pair<int64_t, int>;
4206 auto Compare = llvm::less_first();
4207 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4208 int Cnt = 0;
4209 bool IsConsecutive = true;
4210 for (const SCEV *PtrSCEV : SCEVs) {
4211 unsigned Dist = 0;
4212 if (PtrSCEV != PtrSCEVLowest) {
4213 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4214 const SCEV *Coeff = TryGetStride(Diff, Stride);
4215 if (!Coeff)
4216 return std::nullopt;
4217 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4218 if (!SC || isa<SCEVCouldNotCompute>(SC))
4219 return std::nullopt;
4220 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4221 SE.getMulExpr(Stride, SC)))
4222 ->isZero())
4223 return std::nullopt;
4224 Dist = SC->getAPInt().getZExtValue();
4225 }
4226 // If the strides are not the same or repeated, we can't vectorize.
4227 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4228 return std::nullopt;
4229 auto Res = Offsets.emplace(Dist, Cnt);
4230 if (!Res.second)
4231 return std::nullopt;
4232 // Consecutive order if the inserted element is the last one.
4233 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4234 ++Cnt;
4235 }
4236 if (Offsets.size() != SCEVs.size())
4237 return std::nullopt;
4238 SortedIndices.clear();
4239 if (!IsConsecutive) {
4240 // Fill SortedIndices array only if it is non-consecutive.
4241 SortedIndices.resize(PointerOps.size());
4242 Cnt = 0;
4243 for (const std::pair<int64_t, int> &Pair : Offsets) {
4244 SortedIndices[Cnt] = Pair.second;
4245 ++Cnt;
4246 }
4247 }
4248 if (!Inst)
4249 return nullptr;
4250 SCEVExpander Expander(SE, DL, "strided-load-vec");
4251 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4252}
4253
4254BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4255    ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4256 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4257 // Check that a vectorized load would load the same memory as a scalar
4258 // load. For example, we don't want to vectorize loads that are smaller
4259 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4260 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4261 // from such a struct, we read/write packed bits disagreeing with the
4262 // unvectorized version.
4263 Type *ScalarTy = VL0->getType();
4264
4265 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4266 return LoadsState::Gather;
4267
4268 // Make sure all loads in the bundle are simple - we can't vectorize
4269 // atomic or volatile loads.
4270 PointerOps.clear();
4271 const unsigned Sz = VL.size();
4272 PointerOps.resize(Sz);
4273 auto *POIter = PointerOps.begin();
4274 for (Value *V : VL) {
4275 auto *L = cast<LoadInst>(V);
4276 if (!L->isSimple())
4277 return LoadsState::Gather;
4278 *POIter = L->getPointerOperand();
4279 ++POIter;
4280 }
4281
4282 Order.clear();
4283 auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
4284 // Check the order of pointer operands or that all pointers are the same.
4285 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4286 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4287 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4288 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4289 "supported with VectorizeNonPowerOf2");
4290 return LoadsState::Gather;
4291 }
4292
4293 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4294 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4295 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4296      calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4297    return LoadsState::StridedVectorize;
4298 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4299 return arePointersCompatible(P, PointerOps.front(), *TLI);
4300 })) {
4301 if (IsSorted) {
4302 Value *Ptr0;
4303 Value *PtrN;
4304 if (Order.empty()) {
4305 Ptr0 = PointerOps.front();
4306 PtrN = PointerOps.back();
4307 } else {
4308 Ptr0 = PointerOps[Order.front()];
4309 PtrN = PointerOps[Order.back()];
4310 }
4311 std::optional<int> Diff =
4312 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4313 // Check that the sorted loads are consecutive.
4314 if (static_cast<unsigned>(*Diff) == Sz - 1)
4315 return LoadsState::Vectorize;
4316 // Simple check if not a strided access - clear order.
4317 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4318 // Try to generate strided load node if:
4319 // 1. Target with strided load support is detected.
4320 // 2. The number of loads is greater than MinProfitableStridedLoads,
4321 // or the potential stride <= MaxProfitableLoadStride and the
4322 // potential stride is power-of-2 (to avoid perf regressions for the very
4323 // small number of loads) and max distance > number of loads, or potential
4324 // stride is -1.
4325 // 3. The loads are ordered, or number of unordered loads <=
4326 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4327 // (this check is to avoid extra costs for very expensive shuffles).
4328 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4329 (static_cast<unsigned>(std::abs(*Diff)) <=
4330                                      MaxProfitableLoadStride) &&
4331                                  isPowerOf2_32(std::abs(*Diff)))) &&
4332 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4333 *Diff == -(static_cast<int>(Sz) - 1))) {
4334 int Stride = *Diff / static_cast<int>(Sz - 1);
4335 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4336 Align Alignment =
4337 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4338 ->getAlign();
4339 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4340 // Iterate through all pointers and check if all distances are
4341 // unique multiple of Dist.
4342 SmallSet<int, 4> Dists;
4343 for (Value *Ptr : PointerOps) {
4344 int Dist = 0;
4345 if (Ptr == PtrN)
4346 Dist = *Diff;
4347 else if (Ptr != Ptr0)
4348 Dist =
4349 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4350 // If the strides are not the same or repeated, we can't
4351 // vectorize.
4352 if (((Dist / Stride) * Stride) != Dist ||
4353 !Dists.insert(Dist).second)
4354 break;
4355 }
4356 if (Dists.size() == Sz)
4357              return LoadsState::StridedVectorize;
4358          }
4359 }
4360 }
4361 }
4362 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4363 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4364 unsigned MinVF = getMinVF(Sz);
4365 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4366 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4367 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4368      unsigned VectorizedCnt = 0;
4369      SmallVector<LoadsState> States;
4370 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4371 Cnt += VF, ++VectorizedCnt) {
4372 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4373        SmallVector<unsigned> Order;
4374        SmallVector<Value *> PointerOps;
4375 LoadsState LS =
4376 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4377 /*TryRecursiveCheck=*/false);
4378 // Check that the sorted loads are consecutive.
4379 if (LS == LoadsState::Gather)
4380 break;
4381 // If need the reorder - consider as high-cost masked gather for now.
4382        if ((LS == LoadsState::Vectorize ||
4383             LS == LoadsState::StridedVectorize) &&
4384            !Order.empty() && !isReverseOrder(Order))
4385          return false;
4386        States.push_back(LS);
4387 }
4388      // Can be vectorized later as a series of loads/insertelements.
4389      if (VectorizedCnt == VL.size() / VF) {
4390        // Compare masked gather cost and loads + insertsubvector costs.
4391        TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4392        InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
4393 Instruction::Load, VecTy,
4394 cast<LoadInst>(VL0)->getPointerOperand(),
4395 /*VariableMask=*/false, CommonAlignment, CostKind);
4396 InstructionCost VecLdCost = 0;
4397 auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
4398 for (auto [I, LS] : enumerate(States)) {
4399 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4400          switch (LS) {
4401          case LoadsState::Vectorize:
4402            VecLdCost += TTI.getMemoryOpCost(
4403                Instruction::Load, SubVecTy, LI0->getAlign(),
4404                LI0->getPointerAddressSpace(), CostKind,
4405                TTI::OperandValueInfo());
4406            break;
4407          case LoadsState::StridedVectorize:
4408            VecLdCost += TTI.getStridedMemoryOpCost(
4409                Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4410                /*VariableMask=*/false, CommonAlignment, CostKind);
4411            break;
4412          case LoadsState::ScatterVectorize:
4413            VecLdCost += TTI.getGatherScatterOpCost(
4414                Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4415                /*VariableMask=*/false, CommonAlignment, CostKind);
4416            break;
4417          case LoadsState::Gather:
4418            llvm_unreachable(
4419                "Expected only consecutive, strided or masked gather loads.");
4420          }
4421 SmallVector<int> ShuffleMask(VL.size());
4422 for (int Idx : seq<int>(0, VL.size()))
4423 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4424 VecLdCost +=
4425 TTI.getShuffleCost(TTI ::SK_InsertSubvector, VecTy,
4426 ShuffleMask, CostKind, I * VF, SubVecTy);
4427 }
4428 // If masked gather cost is higher - better to vectorize, so
4429 // consider it as a gather node. It will be better estimated
4430 // later.
4431 if (MaskedGatherCost > VecLdCost)
4432 return true;
4433 }
4434 }
4435 return false;
4436 };
4437 // TODO: need to improve analysis of the pointers, if not all of them are
4438 // GEPs or have > 2 operands, we end up with a gather node, which just
4439 // increases the cost.
4440 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4441 bool ProfitableGatherPointers =
4442 L && Sz > 2 &&
4443 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4444 return L->isLoopInvariant(V);
4445 })) <= Sz / 2;
4446 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4447 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4448 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4449 (GEP && GEP->getNumOperands() == 2 &&
4450 isa<Constant, Instruction>(GEP->getOperand(1)));
4451 })) {
4452 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4453 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4454 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4455 // Check if potential masked gather can be represented as series
4456 // of loads + insertsubvectors.
4457 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4458 // If masked gather cost is higher - better to vectorize, so
4459 // consider it as a gather node. It will be better estimated
4460 // later.
4461 return LoadsState::Gather;
4462 }
4463      return LoadsState::ScatterVectorize;
4464    }
4465 }
4466 }
4467
4468 return LoadsState::Gather;
4469}
4470
4471static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4472                                   const DataLayout &DL, ScalarEvolution &SE,
4473                                   SmallVectorImpl<unsigned> &SortedIndices) {
4474  assert(llvm::all_of(
4475             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4476 "Expected list of pointer operands.");
4477 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4478 // Ptr into, sort and return the sorted indices with values next to one
4479 // another.
4481 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4482
4483 unsigned Cnt = 1;
4484 for (Value *Ptr : VL.drop_front()) {
4485 bool Found = any_of(Bases, [&](auto &Base) {
4486 std::optional<int> Diff =
4487 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4488 /*StrictCheck=*/true);
4489 if (!Diff)
4490 return false;
4491
4492 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4493 return true;
4494 });
4495
4496 if (!Found) {
4497 // If we haven't found enough to usefully cluster, return early.
4498 if (Bases.size() > VL.size() / 2 - 1)
4499 return false;
4500
4501 // Not found already - add a new Base
4502 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4503 }
4504 }
4505
4506  // For each of the bases, sort the pointers by Offset and check if any of
4507  // the bases become consecutively allocated.
4508 bool AnyConsecutive = false;
4509 for (auto &Base : Bases) {
4510 auto &Vec = Base.second;
4511 if (Vec.size() > 1) {
4512 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4513 const std::tuple<Value *, int, unsigned> &Y) {
4514 return std::get<1>(X) < std::get<1>(Y);
4515 });
4516 int InitialOffset = std::get<1>(Vec[0]);
4517 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4518 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4519 });
4520 }
4521 }
4522
4523 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4524 SortedIndices.clear();
4525 if (!AnyConsecutive)
4526 return false;
4527
4528 for (auto &Base : Bases) {
4529 for (auto &T : Base.second)
4530 SortedIndices.push_back(std::get<2>(T));
4531 }
4532
4533 assert(SortedIndices.size() == VL.size() &&
4534 "Expected SortedIndices to be the size of VL");
4535 return true;
4536}
4537
4538std::optional<BoUpSLP::OrdersType>
4539BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4540 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4541 Type *ScalarTy = TE.Scalars[0]->getType();
4542
4543 SmallVector<Value *> Ptrs;
4544 Ptrs.reserve(TE.Scalars.size());
4545 for (Value *V : TE.Scalars) {
4546 auto *L = dyn_cast<LoadInst>(V);
4547 if (!L || !L->isSimple())
4548 return std::nullopt;
4549 Ptrs.push_back(L->getPointerOperand());
4550 }
4551
4552 BoUpSLP::OrdersType Order;
4553 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4554 return std::move(Order);
4555 return std::nullopt;
4556}
4557
4558/// Check if two insertelement instructions are from the same buildvector.
4559static bool areTwoInsertFromSameBuildVector(
4560 InsertElementInst *VU, InsertElementInst *V,
4561 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4562 // Instructions must be from the same basic blocks.
4563 if (VU->getParent() != V->getParent())
4564 return false;
4565 // Checks if 2 insertelements are from the same buildvector.
4566 if (VU->getType() != V->getType())
4567 return false;
4568 // Multiple used inserts are separate nodes.
4569 if (!VU->hasOneUse() && !V->hasOneUse())
4570 return false;
4571 auto *IE1 = VU;
4572 auto *IE2 = V;
4573 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4574 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4575 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4576 return false;
4577 // Go through the vector operand of insertelement instructions trying to find
4578 // either VU as the original vector for IE2 or V as the original vector for
4579 // IE1.
4580 SmallBitVector ReusedIdx(
4581 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4582 bool IsReusedIdx = false;
4583 do {
4584 if (IE2 == VU && !IE1)
4585 return VU->hasOneUse();
4586 if (IE1 == V && !IE2)
4587 return V->hasOneUse();
4588 if (IE1 && IE1 != V) {
4589 unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4590 IsReusedIdx |= ReusedIdx.test(Idx1);
4591 ReusedIdx.set(Idx1);
4592 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4593 IE1 = nullptr;
4594 else
4595 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4596 }
4597 if (IE2 && IE2 != VU) {
4598 unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4599 IsReusedIdx |= ReusedIdx.test(Idx2);
4600 ReusedIdx.set(Idx2);
4601 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4602 IE2 = nullptr;
4603 else
4604 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4605 }
4606 } while (!IsReusedIdx && (IE1 || IE2));
4607 return false;
4608}
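// Editor's note (illustrative example, not from the original source): for the
// IR chain
//   %v0 = insertelement <2 x float> poison, float %a, i32 0
//   %v1 = insertelement <2 x float> %v0, float %b, i32 1
// calling this check with VU = %v1 and V = %v0 walks the vector operands via
// GetBaseOperand, reaches %v0 as the base of %v1 without reusing any insert
// index, and reports that both instructions belong to the same buildvector
// (provided %v0 has no other users).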
4609
4610std::optional<BoUpSLP::OrdersType>
4611BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4612 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4613 if (TE.isNonPowOf2Vec())
4614 return std::nullopt;
4615
4616 // No need to reorder if we need to shuffle reuses; we still need to shuffle the
4617 // node.
4618 if (!TE.ReuseShuffleIndices.empty()) {
4619 if (isSplat(TE.Scalars))
4620 return std::nullopt;
4621 // Check if reuse shuffle indices can be improved by reordering.
4622 // For this, check that the reuse mask is "clustered", i.e. each scalar value
4623 // is used once in each submask of size <number_of_scalars>.
4624 // Example: 4 scalar values.
4625 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4626 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4627 // element 3 is used twice in the second submask.
4628 unsigned Sz = TE.Scalars.size();
4629 if (TE.State == TreeEntry::NeedToGather) {
4630 if (std::optional<OrdersType> CurrentOrder =
4631 findReusedOrderedScalars(TE)) {
4632 SmallVector<int> Mask;
4633 fixupOrderingIndices(*CurrentOrder);
4634 inversePermutation(*CurrentOrder, Mask);
4635 ::addMask(Mask, TE.ReuseShuffleIndices);
4636 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4637 unsigned Sz = TE.Scalars.size();
4638 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4639 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4640 if (Idx != PoisonMaskElem)
4641 Res[Idx + K * Sz] = I + K * Sz;
4642 }
4643 return std::move(Res);
4644 }
4645 }
4646 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4648 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4649 return std::nullopt;
4650 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4651 Sz)) {
4652 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4653 if (TE.ReorderIndices.empty())
4654 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4655 else
4656 inversePermutation(TE.ReorderIndices, ReorderMask);
4657 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4658 unsigned VF = ReorderMask.size();
4659 OrdersType ResOrder(VF, VF);
4660 unsigned NumParts = VF / Sz;
4661 SmallBitVector UsedVals(NumParts);
4662 for (unsigned I = 0; I < VF; I += Sz) {
4663 int Val = PoisonMaskElem;
4664 unsigned UndefCnt = 0;
4665 if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
4666 [&](int Idx) {
4667 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4668 Val = Idx;
4669 if (Idx == PoisonMaskElem)
4670 ++UndefCnt;
4671 return Idx != PoisonMaskElem && Idx != Val;
4672 }) ||
4673 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4674 UndefCnt > Sz / 2)
4675 return std::nullopt;
4676 UsedVals.set(Val);
4677 for (unsigned K = 0; K < NumParts; ++K)
4678 ResOrder[Val + Sz * K] = I + K;
4679 }
4680 return std::move(ResOrder);
4681 }
4682 unsigned VF = TE.getVectorFactor();
4683 // Try to build the correct order for extractelement instructions.
4684 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4685 TE.ReuseShuffleIndices.end());
4686 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4687 all_of(TE.Scalars, [Sz](Value *V) {
4688 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4689 return Idx && *Idx < Sz;
4690 })) {
4691 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4692 if (TE.ReorderIndices.empty())
4693 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4694 else
4695 inversePermutation(TE.ReorderIndices, ReorderMask);
4696 for (unsigned I = 0; I < VF; ++I) {
4697 int &Idx = ReusedMask[I];
4698 if (Idx == PoisonMaskElem)
4699 continue;
4700 Value *V = TE.Scalars[ReorderMask[Idx]];
4701 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4702 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4703 }
4704 }
4705 // Build the order of the VF size; we need to reorder the reuses shuffles, as
4706 // they are always of VF size.
4707 OrdersType ResOrder(VF);
4708 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4709 auto *It = ResOrder.begin();
4710 for (unsigned K = 0; K < VF; K += Sz) {
4711 OrdersType CurrentOrder(TE.ReorderIndices);
4712 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4713 if (SubMask.front() == PoisonMaskElem)
4714 std::iota(SubMask.begin(), SubMask.end(), 0);
4715 reorderOrder(CurrentOrder, SubMask);
4716 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4717 std::advance(It, Sz);
4718 }
4719 if (TE.State == TreeEntry::NeedToGather &&
4720 all_of(enumerate(ResOrder),
4721 [](const auto &Data) { return Data.index() == Data.value(); }))
4722 return std::nullopt; // No need to reorder.
4723 return std::move(ResOrder);
4724 }
4725 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4726 any_of(TE.UserTreeIndices,
4727 [](const EdgeInfo &EI) {
4728 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4729 }) &&
4730 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
4731 return std::nullopt;
4732 if ((TE.State == TreeEntry::Vectorize ||
4733 TE.State == TreeEntry::StridedVectorize) &&
4734 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4735 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4736 !TE.isAltShuffle())
4737 return TE.ReorderIndices;
4738 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4739 auto PHICompare = [&](unsigned I1, unsigned I2) {
4740 Value *V1 = TE.Scalars[I1];
4741 Value *V2 = TE.Scalars[I2];
4742 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4743 return false;
4744 if (V1->getNumUses() < V2->getNumUses())
4745 return true;
4746 if (V1->getNumUses() > V2->getNumUses())
4747 return false;
4748 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4749 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4750 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4751 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4753 IE1, IE2,
4754 [](InsertElementInst *II) { return II->getOperand(0); }))
4755 return I1 < I2;
4756 return getInsertIndex(IE1) < getInsertIndex(IE2);
4757 }
4758 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4759 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4760 if (EE1->getOperand(0) != EE2->getOperand(0))
4761 return I1 < I2;
4762 return getInsertIndex(EE1) < getInsertIndex(EE2);
4763 }
4764 return I1 < I2;
4765 };
4766 auto IsIdentityOrder = [](const OrdersType &Order) {
4767 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4768 if (Idx != Order[Idx])
4769 return false;
4770 return true;
4771 };
4772 if (!TE.ReorderIndices.empty())
4773 return TE.ReorderIndices;
4775 SmallVector<unsigned> Phis(TE.Scalars.size());
4776 std::iota(Phis.begin(), Phis.end(), 0);
4777 OrdersType ResOrder(TE.Scalars.size());
4778 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4779 PhiToId[Id] = Id;
4780 stable_sort(Phis, PHICompare);
4781 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4782 ResOrder[Id] = PhiToId[Phis[Id]];
4783 if (IsIdentityOrder(ResOrder))
4784 return std::nullopt; // No need to reorder.
4785 return std::move(ResOrder);
4786 }
4787 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4788 allSameType(TE.Scalars)) {
4789 // TODO: add analysis of other gather nodes with extractelement
4790 // instructions and other values/instructions, not only undefs.
4791 if ((TE.getOpcode() == Instruction::ExtractElement ||
4792 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4793 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4794 all_of(TE.Scalars, [](Value *V) {
4795 auto *EE = dyn_cast<ExtractElementInst>(V);
4796 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4797 })) {
4798 // Check that gather of extractelements can be represented as
4799 // just a shuffle of a single vector.
4800 OrdersType CurrentOrder;
4801 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4802 /*ResizeAllowed=*/true);
4803 if (Reuse || !CurrentOrder.empty())
4804 return std::move(CurrentOrder);
4805 }
4806 // If the gather node is <undef, v, .., poison> and
4807 // insertelement poison, v, 0 [+ permute]
4808 // is cheaper than
4809 // insertelement poison, v, n - try to reorder.
4810 // If rotating the whole graph, exclude the permute cost, the whole graph
4811 // might be transformed.
4812 int Sz = TE.Scalars.size();
4813 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4814 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4815 const auto *It =
4816 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4817 if (It == TE.Scalars.begin())
4818 return OrdersType();
4819 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4820 if (It != TE.Scalars.end()) {
4821 OrdersType Order(Sz, Sz);
4822 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4823 Order[Idx] = 0;
4824 fixupOrderingIndices(Order);
4825 SmallVector<int> Mask;
4826 inversePermutation(Order, Mask);
4827 InstructionCost PermuteCost =
4828 TopToBottom
4829 ? 0
4831 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4832 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4833 PoisonValue::get(Ty), *It);
4834 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4835 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4836 PoisonValue::get(Ty), *It);
4837 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4838 OrdersType Order(Sz, Sz);
4839 Order[Idx] = 0;
4840 return std::move(Order);
4841 }
4842 }
4843 }
4844 if (isSplat(TE.Scalars))
4845 return std::nullopt;
4846 if (TE.Scalars.size() >= 4)
4847 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4848 return Order;
4849 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4850 return CurrentOrder;
4851 }
4852 return std::nullopt;
4853}
4854
4855/// Checks if the given mask is a "clustered" mask with the same clusters of
4856/// size \p Sz, which are not identity submasks.
4857static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4858 unsigned Sz) {
4859 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4860 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
4861 return false;
4862 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4863 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4864 if (Cluster != FirstCluster)
4865 return false;
4866 }
4867 return true;
4868}
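// Editor's note: an out-of-build sketch of the check above on plain arrays,
// illustrative only; names are hypothetical. For Sz == 4 the mask
// {2, 3, 0, 1, 2, 3, 0, 1} is accepted (the same non-identity submask is
// repeated), while {0, 1, 2, 3, 0, 1, 2, 3} is rejected because its first
// cluster is the identity.
#if 0
static bool sketchRepeatedNonIdentityCluster(const int *Mask, unsigned Size,
                                             unsigned Sz) {
  // Reject an identity first cluster.
  bool FirstIsIdentity = true;
  for (unsigned I = 0; I < Sz; ++I)
    FirstIsIdentity &= Mask[I] == static_cast<int>(I);
  if (FirstIsIdentity)
    return false;
  // Every later cluster must repeat the first one exactly.
  for (unsigned I = Sz; I < Size; I += Sz)
    if (!std::equal(Mask + I, Mask + I + Sz, Mask))
      return false;
  return true;
}
#endif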
4869
4870void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4871 // Reorder reuses mask.
4872 reorderReuses(TE.ReuseShuffleIndices, Mask);
4873 const unsigned Sz = TE.Scalars.size();
4874 // For vectorized and non-clustered reuses no need to do anything else.
4875 if (TE.State != TreeEntry::NeedToGather ||
4877 Sz) ||
4878 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4879 return;
4880 SmallVector<int> NewMask;
4881 inversePermutation(TE.ReorderIndices, NewMask);
4882 addMask(NewMask, TE.ReuseShuffleIndices);
4883 // Clear reorder since it is going to be applied to the new mask.
4884 TE.ReorderIndices.clear();
4885 // Try to improve gathered nodes with clustered reuses, if possible.
4886 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4887 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4888 inversePermutation(NewOrder, NewMask);
4889 reorderScalars(TE.Scalars, NewMask);
4890 // Fill the reuses mask with the identity submasks.
4891 for (auto *It = TE.ReuseShuffleIndices.begin(),
4892 *End = TE.ReuseShuffleIndices.end();
4893 It != End; std::advance(It, Sz))
4894 std::iota(It, std::next(It, Sz), 0);
4895}
4896
4897static void combineOrders(MutableArrayRef<unsigned> Order,
4898 ArrayRef<unsigned> SecondaryOrder) {
4899 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
4900 "Expected same size of orders");
4901 unsigned Sz = Order.size();
4902 SmallBitVector UsedIndices(Sz);
4903 for (unsigned Idx : seq<unsigned>(0, Sz)) {
4904 if (Order[Idx] != Sz)
4905 UsedIndices.set(Order[Idx]);
4906 }
4907 if (SecondaryOrder.empty()) {
4908 for (unsigned Idx : seq<unsigned>(0, Sz))
4909 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
4910 Order[Idx] = Idx;
4911 } else {
4912 for (unsigned Idx : seq<unsigned>(0, Sz))
4913 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
4914 !UsedIndices.test(SecondaryOrder[Idx]))
4915 Order[Idx] = SecondaryOrder[Idx];
4916 }
4917}
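// Editor's note (worked example, illustrative only): for Sz == 4 and
// Order == {2, unset, 0, unset} (unset slots are encoded as Sz), the used
// indices are {0, 2}. With an empty SecondaryOrder the unset slots keep their
// own positions, since indices 1 and 3 are still free, giving {2, 1, 0, 3}.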
4918
4919void BoUpSLP::reorderTopToBottom() {
4920 // Maps VF to the graph nodes.
4922 // ExtractElement gather nodes which can be vectorized and need to handle
4923 // their ordering.
4924 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4925
4926 // Phi nodes can have preferred ordering based on their result users
4927 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
4928
4929 // AltShuffles can also have a preferred ordering that leads to fewer
4930 // instructions, e.g., the addsub instruction in x86.
4931 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4932
4933 // Maps a TreeEntry to the reorder indices of external users.
4935 ExternalUserReorderMap;
4936 // Find all reorderable nodes with the given VF.
4937 // Currently these are vectorized stores, loads, extracts + some gathering of
4938 // extracts.
4939 for_each(VectorizableTree, [&, &TTIRef = *TTI](
4940 const std::unique_ptr<TreeEntry> &TE) {
4941 // Look for external users that will probably be vectorized.
4942 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4943 findExternalStoreUsersReorderIndices(TE.get());
4944 if (!ExternalUserReorderIndices.empty()) {
4945 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4946 ExternalUserReorderMap.try_emplace(TE.get(),
4947 std::move(ExternalUserReorderIndices));
4948 }
4949
4950 // Patterns like [fadd,fsub] can be combined into a single instruction in
4951 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4952 // to take into account their order when looking for the most used order.
4953 if (TE->isAltShuffle()) {
4954 VectorType *VecTy =
4955 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
4956 unsigned Opcode0 = TE->getOpcode();
4957 unsigned Opcode1 = TE->getAltOpcode();
4958 // The opcode mask selects between the two opcodes.
4959 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4960 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4961 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4962 OpcodeMask.set(Lane);
4963 // If this pattern is supported by the target then we consider the order.
4964 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4965 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4966 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
4967 }
4968 // TODO: Check the reverse order too.
4969 }
4970
4971 if (std::optional<OrdersType> CurrentOrder =
4972 getReorderingData(*TE, /*TopToBottom=*/true)) {
4973 // Do not include ordering for nodes used in the alt opcode vectorization,
4974 // better to reorder them during the bottom-to-top stage. If we follow the order
4975 // here, it causes reordering of the whole graph, though actually it is
4976 // profitable just to reorder the subgraph that starts from the alternate
4977 // opcode vectorization node. Such nodes already end up with the shuffle
4978 // instruction, and it is enough to change just this shuffle rather than
4979 // rotate the scalars for the whole graph.
4980 unsigned Cnt = 0;
4981 const TreeEntry *UserTE = TE.get();
4982 while (UserTE && Cnt < RecursionMaxDepth) {
4983 if (UserTE->UserTreeIndices.size() != 1)
4984 break;
4985 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
4986 return EI.UserTE->State == TreeEntry::Vectorize &&
4987 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4988 }))
4989 return;
4990 UserTE = UserTE->UserTreeIndices.back().UserTE;
4991 ++Cnt;
4992 }
4993 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4994 if (!(TE->State == TreeEntry::Vectorize ||
4995 TE->State == TreeEntry::StridedVectorize) ||
4996 !TE->ReuseShuffleIndices.empty())
4997 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4998 if (TE->State == TreeEntry::Vectorize &&
4999 TE->getOpcode() == Instruction::PHI)
5000 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5001 }
5002 });
5003
5004 // Reorder the graph nodes according to their vectorization factor.
5005 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5006 VF /= 2) {
5007 auto It = VFToOrderedEntries.find(VF);
5008 if (It == VFToOrderedEntries.end())
5009 continue;
5010 // Try to find the most profitable order. We are just looking for the most
5011 // used order and reorder scalar elements in the nodes according to this
5012 // most used order.
5013 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5014 // All operands are reordered and used only in this node - propagate the
5015 // most used order to the user node.
5018 OrdersUses;
5020 for (const TreeEntry *OpTE : OrderedEntries) {
5021 // No need to reorder these nodes; we still need to extend and to use a shuffle,
5022 // just merge the reordering shuffle and the reuse shuffle.
5023 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5024 continue;
5025 // Count number of orders uses.
5026 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5027 &PhisToOrders]() -> const OrdersType & {
5028 if (OpTE->State == TreeEntry::NeedToGather ||
5029 !OpTE->ReuseShuffleIndices.empty()) {
5030 auto It = GathersToOrders.find(OpTE);
5031 if (It != GathersToOrders.end())
5032 return It->second;
5033 }
5034 if (OpTE->isAltShuffle()) {
5035 auto It = AltShufflesToOrders.find(OpTE);
5036 if (It != AltShufflesToOrders.end())
5037 return It->second;
5038 }
5039 if (OpTE->State == TreeEntry::Vectorize &&
5040 OpTE->getOpcode() == Instruction::PHI) {
5041 auto It = PhisToOrders.find(OpTE);
5042 if (It != PhisToOrders.end())
5043 return It->second;
5044 }
5045 return OpTE->ReorderIndices;
5046 }();
5047 // First consider the order of the external scalar users.
5048 auto It = ExternalUserReorderMap.find(OpTE);
5049 if (It != ExternalUserReorderMap.end()) {
5050 const auto &ExternalUserReorderIndices = It->second;
5051 // If the OpTE vector factor != number of scalars - use the natural order;
5052 // it is an attempt to reorder a node with reused scalars but with
5053 // external uses.
5054 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5055 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5056 ExternalUserReorderIndices.size();
5057 } else {
5058 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5059 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5060 }
5061 // No other useful reorder data in this entry.
5062 if (Order.empty())
5063 continue;
5064 }
5065 // Stores actually store the mask, not the order; we need to invert it.
5066 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5067 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5068 SmallVector<int> Mask;
5069 inversePermutation(Order, Mask);
5070 unsigned E = Order.size();
5071 OrdersType CurrentOrder(E, E);
5072 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5073 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5074 });
5075 fixupOrderingIndices(CurrentOrder);
5076 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5077 } else {
5078 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5079 }
5080 }
5081 if (OrdersUses.empty())
5082 continue;
5083 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5084 const unsigned Sz = Order.size();
5085 for (unsigned Idx : seq<unsigned>(0, Sz))
5086 if (Idx != Order[Idx] && Order[Idx] != Sz)
5087 return false;
5088 return true;
5089 };
5090 // Choose the most used order.
5091 unsigned IdentityCnt = 0;
5092 unsigned FilledIdentityCnt = 0;
5093 OrdersType IdentityOrder(VF, VF);
5094 for (auto &Pair : OrdersUses) {
5095 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5096 if (!Pair.first.empty())
5097 FilledIdentityCnt += Pair.second;
5098 IdentityCnt += Pair.second;
5099 combineOrders(IdentityOrder, Pair.first);
5100 }
5101 }
5102 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5103 unsigned Cnt = IdentityCnt;
5104 for (auto &Pair : OrdersUses) {
5105 // Prefer the identity order. But if a filled identity (non-empty order) is
5106 // found with the same number of uses as the new candidate order, we can
5107 // choose this candidate order.
5108 if (Cnt < Pair.second ||
5109 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5110 Cnt == Pair.second && !BestOrder.empty() &&
5111 IsIdentityOrder(BestOrder))) {
5112 combineOrders(Pair.first, BestOrder);
5113 BestOrder = Pair.first;
5114 Cnt = Pair.second;
5115 } else {
5116 combineOrders(BestOrder, Pair.first);
5117 }
5118 }
5119 // Set order of the user node.
5120 if (IsIdentityOrder(BestOrder))
5121 continue;
5122 fixupOrderingIndices(BestOrder);
5123 SmallVector<int> Mask;
5124 inversePermutation(BestOrder, Mask);
5125 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5126 unsigned E = BestOrder.size();
5127 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5128 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5129 });
5130 // Do an actual reordering, if profitable.
5131 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5132 // Just do the reordering for the nodes with the given VF.
5133 if (TE->Scalars.size() != VF) {
5134 if (TE->ReuseShuffleIndices.size() == VF) {
5135 // Need to reorder the reuses masks of the operands with smaller VF to
5136 // be able to find the match between the graph nodes and scalar
5137 // operands of the given node during vectorization/cost estimation.
5138 assert(all_of(TE->UserTreeIndices,
5139 [VF, &TE](const EdgeInfo &EI) {
5140 return EI.UserTE->Scalars.size() == VF ||
5141 EI.UserTE->Scalars.size() ==
5142 TE->Scalars.size();
5143 }) &&
5144 "All users must be of VF size.");
5145 // Update ordering of the operands with the smaller VF than the given
5146 // one.
5147 reorderNodeWithReuses(*TE, Mask);
5148 }
5149 continue;
5150 }
5151 if ((TE->State == TreeEntry::Vectorize ||
5152 TE->State == TreeEntry::StridedVectorize) &&
5153 isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
5154 InsertElementInst>(TE->getMainOp()) &&
5155 !TE->isAltShuffle()) {
5156 // Build correct orders for extract{element,value}, loads and
5157 // stores.
5158 reorderOrder(TE->ReorderIndices, Mask);
5159 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5160 TE->reorderOperands(Mask);
5161 } else {
5162 // Reorder the node and its operands.
5163 TE->reorderOperands(Mask);
5164 assert(TE->ReorderIndices.empty() &&
5165 "Expected empty reorder sequence.");
5166 reorderScalars(TE->Scalars, Mask);
5167 }
5168 if (!TE->ReuseShuffleIndices.empty()) {
5169 // Apply reversed order to keep the original ordering of the reused
5170 // elements to avoid extra reorder indices shuffling.
5171 OrdersType CurrentOrder;
5172 reorderOrder(CurrentOrder, MaskOrder);
5173 SmallVector<int> NewReuses;
5174 inversePermutation(CurrentOrder, NewReuses);
5175 addMask(NewReuses, TE->ReuseShuffleIndices);
5176 TE->ReuseShuffleIndices.swap(NewReuses);
5177 }
5178 }
5179 }
5180}
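// Editor's note (illustrative only): the "most used order" selection above is
// effectively a vote per vectorization factor. If two entries propose the
// order {1, 0, 3, 2} and one entry proposes the identity, the identity counter
// is 1 and the candidate counter is 2, so {1, 0, 3, 2} wins and the matching
// nodes are reordered (or their reuse masks adjusted); on a tie the identity
// is generally preferred and no reordering is done.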
5181
5182bool BoUpSLP::canReorderOperands(
5183 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5184 ArrayRef<TreeEntry *> ReorderableGathers,
5185 SmallVectorImpl<TreeEntry *> &GatherOps) {
5186 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5187 if (UserTE->isNonPowOf2Vec())
5188 return false;
5189
5190 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5191 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5192 return OpData.first == I &&
5193 (OpData.second->State == TreeEntry::Vectorize ||
5194 OpData.second->State == TreeEntry::StridedVectorize);
5195 }))
5196 continue;
5197 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5198 // Do not reorder if operand node is used by many user nodes.
5199 if (any_of(TE->UserTreeIndices,
5200 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5201 return false;
5202 // Add the node to the list of the ordered nodes with the identity
5203 // order.
5204 Edges.emplace_back(I, TE);
5205 // Add ScatterVectorize nodes to the list of operands, where just
5206 // reordering of the scalars is required. Similar to the gathers, so
5207 // simply add to the list of gathered ops.
5208 // If there are reused scalars, process this node as a regular vectorize
5209 // node, just reorder reuses mask.
5210 if (TE->State != TreeEntry::Vectorize &&
5211 TE->State != TreeEntry::StridedVectorize &&
5212 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5213 GatherOps.push_back(TE);
5214 continue;
5215 }
5216 TreeEntry *Gather = nullptr;
5217 if (count_if(ReorderableGathers,
5218 [&Gather, UserTE, I](TreeEntry *TE) {
5219 assert(TE->State != TreeEntry::Vectorize &&
5220 TE->State != TreeEntry::StridedVectorize &&
5221 "Only non-vectorized nodes are expected.");
5222 if (any_of(TE->UserTreeIndices,
5223 [UserTE, I](const EdgeInfo &EI) {
5224 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5225 })) {
5226 assert(TE->isSame(UserTE->getOperand(I)) &&
5227 "Operand entry does not match operands.");
5228 Gather = TE;
5229 return true;
5230 }
5231 return false;
5232 }) > 1 &&
5233 !allConstant(UserTE->getOperand(I)))
5234 return false;
5235 if (Gather)
5236 GatherOps.push_back(Gather);
5237 }
5238 return true;
5239}
5240
5241void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5242 SetVector<TreeEntry *> OrderedEntries;
5243 DenseSet<const TreeEntry *> GathersToOrders;
5244 // Find all reorderable leaf nodes with the given VF.
5245 // Currently these are vectorized loads, extracts without alternate operands +
5246 // some gathering of extracts.
5247 SmallVector<TreeEntry *> NonVectorized;
5248 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5249 if (TE->State != TreeEntry::Vectorize &&
5250 TE->State != TreeEntry::StridedVectorize)
5251 NonVectorized.push_back(TE.get());
5252 if (std::optional<OrdersType> CurrentOrder =
5253 getReorderingData(*TE, /*TopToBottom=*/false)) {
5254 OrderedEntries.insert(TE.get());
5255 if (!(TE->State == TreeEntry::Vectorize ||
5256 TE->State == TreeEntry::StridedVectorize) ||
5257 !TE->ReuseShuffleIndices.empty())
5258 GathersToOrders.insert(TE.get());
5259 }
5260 }
5261
5262 // 1. Propagate order to the graph nodes, which use only reordered nodes.
5263 // I.e., if the node has operands that are reordered, try to keep at least
5264 // one operand in the natural order and reorder the others, then reorder the
5265 // user node itself.
5267 while (!OrderedEntries.empty()) {
5268 // 1. Filter out only reordered nodes.
5269 // 2. If the entry has multiple uses - skip it and jump to the next node.
5271 SmallVector<TreeEntry *> Filtered;
5272 for (TreeEntry *TE : OrderedEntries) {
5273 if (!(TE->State == TreeEntry::Vectorize ||
5274 TE->State == TreeEntry::StridedVectorize ||
5275 (TE->State == TreeEntry::NeedToGather &&
5276 GathersToOrders.contains(TE))) ||
5277 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5278 !all_of(drop_begin(TE->UserTreeIndices),
5279 [TE](const EdgeInfo &EI) {
5280 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5281 }) ||
5282 !Visited.insert(TE).second) {
5283 Filtered.push_back(TE);
5284 continue;
5285 }
5286 // Build a map between user nodes and their operands order to speed up the
5287 // search. The graph currently does not provide this dependency directly.
5288 for (EdgeInfo &EI : TE->UserTreeIndices) {
5289 TreeEntry *UserTE = EI.UserTE;
5290 auto It = Users.find(UserTE);
5291 if (It == Users.end())
5292 It = Users.insert({UserTE, {}}).first;
5293 It->second.emplace_back(EI.EdgeIdx, TE);
5294 }
5295 }
5296 // Erase filtered entries.
5297 for (TreeEntry *TE : Filtered)
5298 OrderedEntries.remove(TE);
5299 SmallVector<
5300 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5301 UsersVec(Users.begin(), Users.end());
5302 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5303 return Data1.first->Idx > Data2.first->Idx;
5304 });
5305 for (auto &Data : UsersVec) {
5306 // Check that operands are used only in the User node.
5307 SmallVector<TreeEntry *> GatherOps;
5308 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5309 GatherOps)) {
5310 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5311 OrderedEntries.remove(Op.second);
5312 continue;
5313 }
5314 // All operands are reordered and used only in this node - propagate the
5315 // most used order to the user node.
5318 OrdersUses;
5319 // Do the analysis for each tree entry only once, otherwise the order of
5320 // the same node may be considered several times, though it might not be
5321 // profitable.
5324 for (const auto &Op : Data.second) {
5325 TreeEntry *OpTE = Op.second;
5326 if (!VisitedOps.insert(OpTE).second)
5327 continue;
5328 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5329 continue;
5330 const auto Order = [&]() -> const OrdersType {
5331 if (OpTE->State == TreeEntry::NeedToGather ||
5332 !OpTE->ReuseShuffleIndices.empty())
5333 return getReorderingData(*OpTE, /*TopToBottom=*/false)
5334 .value_or(OrdersType(1));
5335 return OpTE->ReorderIndices;
5336 }();
5337 // The order is partially ordered, skip it in favor of fully non-ordered
5338 // orders.
5339 if (Order.size() == 1)
5340 continue;
5341 unsigned NumOps = count_if(
5342 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5343 return P.second == OpTE;
5344 });
5345 // Stores actually store the mask, not the order, need to invert.
5346 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5347 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5348 SmallVector<int> Mask;
5349 inversePermutation(Order, Mask);
5350 unsigned E = Order.size();
5351 OrdersType CurrentOrder(E, E);
5352 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5353 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5354 });
5355 fixupOrderingIndices(CurrentOrder);
5356 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5357 NumOps;
5358 } else {
5359 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5360 }
5361 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5362 const auto AllowsReordering = [&](const TreeEntry *TE) {
5363 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5364 if (TE->isNonPowOf2Vec())
5365 return false;
5366 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5367 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5368 (IgnoreReorder && TE->Idx == 0))
5369 return true;
5370 if (TE->State == TreeEntry::NeedToGather) {
5371 if (GathersToOrders.contains(TE))
5372 return !getReorderingData(*TE, /*TopToBottom=*/false)
5373 .value_or(OrdersType(1))
5374 .empty();
5375 return true;
5376 }
5377 return false;
5378 };
5379 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5380 TreeEntry *UserTE = EI.UserTE;
5381 if (!VisitedUsers.insert(UserTE).second)
5382 continue;
5383 // May reorder user node if it requires reordering, has reused
5384 // scalars, is an alternate op vectorize node or its op nodes require
5385 // reordering.
5386 if (AllowsReordering(UserTE))
5387 continue;
5388 // Check if users allow reordering.
5389 // Currently look up just 1 level of operands to avoid an increase in
5390 // compile time.
5391 // It is profitable to reorder if definitely more operands allow
5392 // reordering than operands with the natural order.
5394 if (static_cast<unsigned>(count_if(
5395 Ops, [UserTE, &AllowsReordering](
5396 const std::pair<unsigned, TreeEntry *> &Op) {
5397 return AllowsReordering(Op.second) &&
5398 all_of(Op.second->UserTreeIndices,
5399 [UserTE](const EdgeInfo &EI) {
5400 return EI.UserTE == UserTE;
5401 });
5402 })) <= Ops.size() / 2)
5403 ++Res.first->second;
5404 }
5405 }
5406 if (OrdersUses.empty()) {
5407 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5408 OrderedEntries.remove(Op.second);
5409 continue;
5410 }
5411 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5412 const unsigned Sz = Order.size();
5413 for (unsigned Idx : seq<unsigned>(0, Sz))
5414 if (Idx != Order[Idx] && Order[Idx] != Sz)
5415 return false;
5416 return true;
5417 };
5418 // Choose the most used order.
5419 unsigned IdentityCnt = 0;
5420 unsigned VF = Data.second.front().second->getVectorFactor();
5421 OrdersType IdentityOrder(VF, VF);
5422 for (auto &Pair : OrdersUses) {
5423 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5424 IdentityCnt += Pair.second;
5425 combineOrders(IdentityOrder, Pair.first);
5426 }
5427 }
5428 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5429 unsigned Cnt = IdentityCnt;
5430 for (auto &Pair : OrdersUses) {
5431 // Prefer the identity order. But if a filled identity (non-empty
5432 // order) is found with the same number of uses as the new candidate
5433 // order, we can choose this candidate order.
5434 if (Cnt < Pair.second) {
5435 combineOrders(Pair.first, BestOrder);
5436 BestOrder = Pair.first;
5437 Cnt = Pair.second;
5438 } else {
5439 combineOrders(BestOrder, Pair.first);
5440 }
5441 }
5442 // Set order of the user node.
5443 if (IsIdentityOrder(BestOrder)) {
5444 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5445 OrderedEntries.remove(Op.second);
5446 continue;
5447 }
5448 fixupOrderingIndices(BestOrder);
5449 // Erase operands from OrderedEntries list and adjust their orders.
5450 VisitedOps.clear();
5451 SmallVector<int> Mask;
5452 inversePermutation(BestOrder, Mask);
5453 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5454 unsigned E = BestOrder.size();
5455 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5456 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5457 });
5458 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5459 TreeEntry *TE = Op.second;
5460 OrderedEntries.remove(TE);
5461 if (!VisitedOps.insert(TE).second)
5462 continue;
5463 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5464 reorderNodeWithReuses(*TE, Mask);
5465 continue;
5466 }
5467 // Gathers are processed separately.
5468 if (TE->State != TreeEntry::Vectorize &&
5469 TE->State != TreeEntry::StridedVectorize &&
5470 (TE->State != TreeEntry::ScatterVectorize ||
5471 TE->ReorderIndices.empty()))
5472 continue;
5473 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5474 TE->ReorderIndices.empty()) &&
5475 "Non-matching sizes of user/operand entries.");
5476 reorderOrder(TE->ReorderIndices, Mask);
5477 if (IgnoreReorder && TE == VectorizableTree.front().get())
5478 IgnoreReorder = false;
5479 }
5480 // For gathers just need to reorder its scalars.
5481 for (TreeEntry *Gather : GatherOps) {
5482 assert(Gather->ReorderIndices.empty() &&
5483 "Unexpected reordering of gathers.");
5484 if (!Gather->ReuseShuffleIndices.empty()) {
5485 // Just reorder reuses indices.
5486 reorderReuses(Gather->ReuseShuffleIndices, Mask);
5487 continue;
5488 }
5489 reorderScalars(Gather->Scalars, Mask);
5490 OrderedEntries.remove(Gather);
5491 }
5492 // Reorder operands of the user node and set the ordering for the user
5493 // node itself.
5494 if (Data.first->State != TreeEntry::Vectorize ||
5495 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5496 Data.first->getMainOp()) ||
5497 Data.first->isAltShuffle())
5498 Data.first->reorderOperands(Mask);
5499 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5500 Data.first->isAltShuffle() ||
5501 Data.first->State == TreeEntry::StridedVectorize) {
5502 reorderScalars(Data.first->Scalars, Mask);
5503 reorderOrder(Data.first->ReorderIndices, MaskOrder,
5504 /*BottomOrder=*/true);
5505 if (Data.first->ReuseShuffleIndices.empty() &&
5506 !Data.first->ReorderIndices.empty() &&
5507 !Data.first->isAltShuffle()) {
5508 // Insert user node to the list to try to sink reordering deeper in
5509 // the graph.
5510 OrderedEntries.insert(Data.first);
5511 }
5512 } else {
5513 reorderOrder(Data.first->ReorderIndices, Mask);
5514 }
5515 }
5516 }
5517 // If the reordering is unnecessary, just remove the reorder.
5518 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5519 VectorizableTree.front()->ReuseShuffleIndices.empty())
5520 VectorizableTree.front()->ReorderIndices.clear();
5521}
5522
5523void BoUpSLP::buildExternalUses(
5524 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5525 DenseMap<Value *, unsigned> ScalarToExtUses;
5526 // Collect the values that we need to extract from the tree.
5527 for (auto &TEPtr : VectorizableTree) {
5528 TreeEntry *Entry = TEPtr.get();
5529
5530 // No need to handle users of gathered values.
5531 if (Entry->State == TreeEntry::NeedToGather)
5532 continue;
5533
5534 // For each lane:
5535 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5536 Value *Scalar = Entry->Scalars[Lane];
5537 if (!isa<Instruction>(Scalar))
5538 continue;
5539 // All uses must be replaced already? No need to do it again.
5540 auto It = ScalarToExtUses.find(Scalar);
5541 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5542 continue;
5543
5544 // Check if the scalar is externally used as an extra arg.
5545 const auto *ExtI = ExternallyUsedValues.find(Scalar);
5546 if (ExtI != ExternallyUsedValues.end()) {
5547 int FoundLane = Entry->findLaneForValue(Scalar);
5548 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5549 << FoundLane << " from " << *Scalar << ".\n");
5550 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
5551 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5552 continue;
5553 }
5554 for (User *U : Scalar->users()) {
5555 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5556
5557 Instruction *UserInst = dyn_cast<Instruction>(U);
5558 if (!UserInst || isDeleted(UserInst))
5559 continue;
5560
5561 // Ignore users in the user ignore list.
5562 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5563 continue;
5564
5565 // Skip in-tree scalars that become vectors
5566 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5567 // Some in-tree scalars will remain as scalar in vectorized
5568 // instructions. If that is the case, the one in FoundLane will
5569 // be used.
5570 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5571 !doesInTreeUserNeedToExtract(
5572 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5573 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5574 << ".\n");
5575 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
5576 continue;
5577 }
5578 U = nullptr;
5579 if (It != ScalarToExtUses.end()) {
5580 ExternalUses[It->second].User = nullptr;
5581 break;
5582 }
5583 }
5584
5585 int FoundLane = Entry->findLaneForValue(Scalar);
5586 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5587 << " from lane " << FoundLane << " from " << *Scalar
5588 << ".\n");
5589 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
5590 ExternalUses.emplace_back(Scalar, U, FoundLane);
5591 if (!U)
5592 break;
5593 }
5594 }
5595 }
5596}
5597
5599BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5601 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5602 Value *V = TE->Scalars[Lane];
5603 // To save compilation time we don't visit if we have too many users.
5604 if (V->hasNUsesOrMore(UsesLimit))
5605 break;
5606
5607 // Collect stores per pointer object.
5608 for (User *U : V->users()) {
5609 auto *SI = dyn_cast<StoreInst>(U);
5610 if (SI == nullptr || !SI->isSimple() ||
5611 !isValidElementType(SI->getValueOperand()->getType()))
5612 continue;
5613 // Skip entry if already part of the vectorizable tree.
5614 if (getTreeEntry(U))
5615 continue;
5616
5617 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5618 auto &StoresVec = PtrToStoresMap[Ptr];
5619 // For now just keep one store per pointer object per lane.
5620 // TODO: Extend this to support multiple stores per pointer per lane
5621 if (StoresVec.size() > Lane)
5622 continue;
5623 // Skip if in different BBs.
5624 if (!StoresVec.empty() &&
5625 SI->getParent() != StoresVec.back()->getParent())
5626 continue;
5627 // Make sure that the stores are of the same type.
5628 if (!StoresVec.empty() &&
5629 SI->getValueOperand()->getType() !=
5630 StoresVec.back()->getValueOperand()->getType())
5631 continue;
5632 StoresVec.push_back(SI);
5633 }
5634 }
5635 return PtrToStoresMap;
5636}
5637
5638bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5639 OrdersType &ReorderIndices) const {
5640 // We check whether the stores in StoresVec can form a vector by sorting them
5641 // and checking whether they are consecutive.
5642
5643 // To avoid calling getPointersDiff() while sorting we create a vector of
5644 // pairs {store, offset from first} and sort this instead.
5645 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5646 StoreInst *S0 = StoresVec[0];
5647 StoreOffsetVec[0] = {S0, 0};
5648 Type *S0Ty = S0->getValueOperand()->getType();
5649 Value *S0Ptr = S0->getPointerOperand();
5650 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5651 StoreInst *SI = StoresVec[Idx];
5652 std::optional<int> Diff =
5653 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5654 SI->getPointerOperand(), *DL, *SE,
5655 /*StrictCheck=*/true);
5656 // We failed to compare the pointers so just abandon this StoresVec.
5657 if (!Diff)
5658 return false;
5659 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5660 }
5661
5662 // Sort the vector based on the pointers. We create a copy because we may
5663 // need the original later for calculating the reorder (shuffle) indices.
5664 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5665 const std::pair<StoreInst *, int> &Pair2) {
5666 int Offset1 = Pair1.second;
5667 int Offset2 = Pair2.second;
5668 return Offset1 < Offset2;
5669 });
5670
5671 // Check if the stores are consecutive by checking if their difference is 1.
5672 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5673 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5674 return false;
5675
5676 // Calculate the shuffle indices according to their offset against the sorted
5677 // StoreOffsetVec.
5678 ReorderIndices.reserve(StoresVec.size());
5679 for (StoreInst *SI : StoresVec) {
5680 unsigned Idx = find_if(StoreOffsetVec,
5681 [SI](const std::pair<StoreInst *, int> &Pair) {
5682 return Pair.first == SI;
5683 }) -
5684 StoreOffsetVec.begin();
5685 ReorderIndices.push_back(Idx);
5686 }
5687 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
5688 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
5689 // same convention here.
5690 auto IsIdentityOrder = [](const OrdersType &Order) {
5691 for (unsigned Idx : seq<unsigned>(0, Order.size()))
5692 if (Idx != Order[Idx])
5693 return false;
5694 return true;
5695 };
5696 if (IsIdentityOrder(ReorderIndices))
5697 ReorderIndices.clear();
5698
5699 return true;
5700}
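// Editor's note: a small out-of-build sketch of the reorder-index computation
// above, using plain offsets instead of StoreInst pointers; illustrative only,
// all names are hypothetical. For per-store offsets {0, 2, 1, 3} the sorted
// offsets are consecutive and the resulting reorder indices are {0, 2, 1, 3}.
#if 0
static bool sketchStoreReorderExample() {
  int Offsets[4] = {0, 2, 1, 3};
  int Sorted[4] = {0, 2, 1, 3};
  std::sort(std::begin(Sorted), std::end(Sorted));
  // Consecutive check: sorted offsets must increase by exactly 1.
  for (unsigned I = 1; I < 4; ++I)
    if (Sorted[I] != Sorted[I - 1] + 1)
      return false;
  // The reorder index of store I is the position of its offset after sorting.
  unsigned ReorderIndices[4];
  for (unsigned I = 0; I < 4; ++I)
    ReorderIndices[I] = static_cast<unsigned>(
        std::find(std::begin(Sorted), std::end(Sorted), Offsets[I]) -
        std::begin(Sorted));
  return ReorderIndices[0] == 0 && ReorderIndices[1] == 2 &&
         ReorderIndices[2] == 1 && ReorderIndices[3] == 3;
}
#endif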
5701
5702#ifndef NDEBUG
5704 for (unsigned Idx : Order)
5705 dbgs() << Idx << ", ";
5706 dbgs() << "\n";
5707}
5708#endif
5709
5710SmallVector<BoUpSLP::OrdersType, 1>
5711BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
5712 unsigned NumLanes = TE->Scalars.size();
5713
5715 collectUserStores(TE);
5716
5717 // Holds the reorder indices for each candidate store vector that is a user of
5718 // the current TreeEntry.
5719 SmallVector<OrdersType, 1> ExternalReorderIndices;
5720
5721 // Now inspect the stores collected per pointer and look for vectorization
5722 // candidates. For each candidate calculate the reorder index vector and push
5723 // it into `ExternalReorderIndices`
5724 for (const auto &Pair : PtrToStoresMap) {
5725 auto &StoresVec = Pair.second;
5726 // If we have fewer than NumLanes stores, then we can't form a vector.
5727 if (StoresVec.size() != NumLanes)
5728 continue;
5729
5730 // If the stores are not consecutive then abandon this StoresVec.
5731 OrdersType ReorderIndices;
5732 if (!canFormVector(StoresVec, ReorderIndices))
5733 continue;
5734
5735 // We now know that the scalars in StoresVec can form a vector instruction,
5736 // so set the reorder indices.
5737 ExternalReorderIndices.push_back(ReorderIndices);
5738 }
5739 return ExternalReorderIndices;
5740}
5741
5742void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
5743 const SmallDenseSet<Value *> &UserIgnoreLst) {
5744 deleteTree();
5745 UserIgnoreList = &UserIgnoreLst;
5746 if (!allSameType(Roots))
5747 return;
5748 buildTree_rec(Roots, 0, EdgeInfo());
5749}
5750
5751void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
5752 deleteTree();
5753 if (!allSameType(Roots))
5754 return;
5755 buildTree_rec(Roots, 0, EdgeInfo());
5756}
5757
5758/// \return true if the specified list of values has only one instruction that
5759/// requires scheduling, false otherwise.
5760#ifndef NDEBUG
5761static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
5762 Value *NeedsScheduling = nullptr;
5763 for (Value *V : VL) {
5764 if (doesNotNeedToBeScheduled(V))
5765 continue;
5766 if (!NeedsScheduling) {
5767 NeedsScheduling = V;
5768 continue;
5769 }
5770 return false;
5771 }
5772 return NeedsScheduling;
5773}
5774#endif
5775
5776/// Generates a key/subkey pair for the given value to provide effective sorting
5777/// of the values and better detection of vectorizable value sequences. The
5778/// keys/subkeys can be used for better sorting of the values themselves (keys)
5779/// and within value subgroups (subkeys).
5780static std::pair<size_t, size_t> generateKeySubkey(
5781 Value *V, const TargetLibraryInfo *TLI,
5782 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
5783 bool AllowAlternate) {
5784 hash_code Key = hash_value(V->getValueID() + 2);
5785 hash_code SubKey = hash_value(0);
5786 // Sort the loads by the distance between the pointers.
5787 if (auto *LI = dyn_cast<LoadInst>(V)) {
5788 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
5789 if (LI->isSimple())
5790 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
5791 else
5792 Key = SubKey = hash_value(LI);
5793 } else if (isVectorLikeInstWithConstOps(V)) {
5794 // Sort extracts by the vector operands.
5795 if (isa<ExtractElementInst, UndefValue>(V))
5796 Key = hash_value(Value::UndefValueVal + 1);
5797 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
5798 if (!isUndefVector(EI->getVectorOperand()).all() &&
5799 !isa<UndefValue>(EI->getIndexOperand()))
5800 SubKey = hash_value(EI->getVectorOperand());
5801 }
5802 } else if (auto *I = dyn_cast<Instruction>(V)) {
5803 // Sort other instructions just by the opcodes except for CMPInst.
5804 // For CMP also sort by the predicate kind.
5805 if ((isa<BinaryOperator, CastInst>(I)) &&
5806 isValidForAlternation(I->getOpcode())) {
5807 if (AllowAlternate)
5808 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
5809 else
5810 Key = hash_combine(hash_value(I->getOpcode()), Key);
5811 SubKey = hash_combine(
5812 hash_value(I->getOpcode()), hash_value(I->getType()),
5813 hash_value(isa<BinaryOperator>(I)
5814 ? I->getType()
5815 : cast<CastInst>(I)->getOperand(0)->getType()));
5816 // For casts, look through the only operand to improve compile time.
5817 if (isa<CastInst>(I)) {
5818 std::pair<size_t, size_t> OpVals =
5819 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
5820 /*AllowAlternate=*/true);
5821 Key = hash_combine(OpVals.first, Key);
5822 SubKey = hash_combine(OpVals.first, SubKey);
5823 }
5824 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
5825 CmpInst::Predicate Pred = CI->getPredicate();
5826 if (CI->isCommutative())
5827 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
5828 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
5829 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
5830 hash_value(SwapPred),
5831 hash_value(CI->getOperand(0)->getType()));
5832 } else if (auto *Call = dyn_cast<CallInst>(I)) {
5833 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
5834 if (isTriviallyVectorizable(ID)) {
5835 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
5836 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
5837 SubKey = hash_combine(hash_value(I->getOpcode()),
5838 hash_value(Call->getCalledFunction()));
5839 } else {
5840 Key = hash_combine(hash_value(Call), Key);
5841 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
5842 }
5843 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
5844 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
5845 hash_value(Op.Tag), SubKey);
5846 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
5847 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5848 SubKey = hash_value(Gep->getPointerOperand());
5849 else
5850 SubKey = hash_value(Gep);
5851 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
5852 !isa<ConstantInt>(I->getOperand(1))) {
5853 // Do not try to vectorize instructions with potentially high cost.
5854 SubKey = hash_value(I);
5855 } else {
5856 SubKey = hash_value(I->getOpcode());
5857 }
5858 Key = hash_combine(hash_value(I->getParent()), Key);
5859 }
5860 return std::make_pair(Key, SubKey);
5861}
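// Editor's note (illustrative summary of the scheme above, not from the
// original source): two simple loads of the same type share a Key built from
// the type and Instruction::Load, and their SubKey comes from
// LoadsSubkeyGenerator, so loads that are close in memory can land in the same
// subgroup; a non-simple (e.g. volatile) load hashes both Key and SubKey from
// the instruction itself and stays alone. Casts fold in the key of their
// operand, integer div/rem with a non-constant divisor is isolated as
// potentially expensive, and the parent basic block is mixed into the Key so
// values from different blocks sort apart.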
5862
5863/// Checks if the specified instruction \p I is an alternate operation for
5864/// the given \p MainOp and \p AltOp instructions.
5865static bool isAlternateInstruction(const Instruction *I,
5866 const Instruction *MainOp,
5867 const Instruction *AltOp,
5868 const TargetLibraryInfo &TLI);
5869
5870bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
5871 ArrayRef<Value *> VL) const {
5872 unsigned Opcode0 = S.getOpcode();
5873 unsigned Opcode1 = S.getAltOpcode();
5874 // The opcode mask selects between the two opcodes.
5875 SmallBitVector OpcodeMask(VL.size(), false);
5876 for (unsigned Lane : seq<unsigned>(0, VL.size()))
5877 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
5878 OpcodeMask.set(Lane);
5879 // If this pattern is supported by the target then consider it profitable.
5880 if (TTI->isLegalAltInstr(FixedVectorType::get(S.MainOp->getType(), VL.size()),
5881 Opcode0, Opcode1, OpcodeMask))
5882 return true;
5884 for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
5885 Operands.emplace_back();
5886 // Prepare the operand vector.
5887 for (Value *V : VL)
5888 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
5889 }
5890 if (Operands.size() == 2) {
5891 // Try to find the best operand candidates.
5892 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
5894 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
5895 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
5896 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
5897 std::optional<int> Res = findBestRootPair(Candidates);
5898 switch (Res.value_or(0)) {
5899 case 0:
5900 break;
5901 case 1:
5902 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
5903 break;
5904 case 2:
5905 std::swap(Operands[0][I], Operands[1][I]);
5906 break;
5907 default:
5908 llvm_unreachable("Unexpected index.");
5909 }
5910 }
5911 }
5912 DenseSet<unsigned> UniqueOpcodes;
5913 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
5914 unsigned NonInstCnt = 0;
5915 // Estimate number of instructions, required for the vectorized node and for
5916 // the buildvector node.
5917 unsigned UndefCnt = 0;
5918 // Count the number of extra shuffles, required for vector nodes.
5919 unsigned ExtraShuffleInsts = 0;
5920 // Check that the operands do not contain the same values and create either a
5921 // perfect diamond match or a shuffled match.
5922 if (Operands.size() == 2) {
5923 // Do not count same operands twice.
5924 if (Operands.front() == Operands.back()) {
5925 Operands.erase(Operands.begin());
5926 } else if (!allConstant(Operands.front()) &&
5927 all_of(Operands.front(), [&](Value *V) {
5928 return is_contained(Operands.back(), V);
5929 })) {
5930 Operands.erase(Operands.begin());
5931 ++ExtraShuffleInsts;
5932 }
5933 }
5934 const Loop *L = LI->getLoopFor(S.MainOp->getParent());
5935 // Vectorize the node if:
5936 // 1. At least a single operand is constant or splat.
5937 // 2. Operands have many loop invariants (the instructions are not loop
5938 // invariant).
5939 // 3. At least a single unique operand is supposed to be vectorized.
5940 return none_of(Operands,
5941 [&](ArrayRef<Value *> Op) {
5942 if (allConstant(Op) ||
5943 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
5944 getSameOpcode(Op, *TLI).MainOp))
5945 return false;
5947 for (Value *V : Op) {
5948 if (isa<Constant, ExtractElementInst>(V) ||
5949 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
5950 if (isa<UndefValue>(V))
5951 ++UndefCnt;
5952 continue;
5953 }
5954 auto Res = Uniques.try_emplace(V, 0);
5955 // Found first duplicate - need to add shuffle.
5956 if (!Res.second && Res.first->second == 1)
5957 ++ExtraShuffleInsts;
5958 ++Res.first->getSecond();
5959 if (auto *I = dyn_cast<Instruction>(V))
5960 UniqueOpcodes.insert(I->getOpcode());
5961 else if (Res.second)
5962 ++NonInstCnt;
5963 }
5964 return none_of(Uniques, [&](const auto &P) {
5965 return P.first->hasNUsesOrMore(P.second + 1) &&
5966 none_of(P.first->users(), [&](User *U) {
5967 return getTreeEntry(U) || Uniques.contains(U);
5968 });
5969 });
5970 }) ||
5971 // Do not vectorize the node if the estimated number of vector instructions is
5972 // more than the estimated number of buildvector instructions. The number of
5973 // vector operands is the number of vector instructions + the number of vector
5974 // instructions for operands (buildvectors). The number of buildvector
5975 // instructions is just number_of_operands * number_of_scalars.
5976 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
5977 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
5978 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
5979}
5980
5981BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
5982 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
5983 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
5984 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
5985
5986 unsigned ShuffleOrOp =
5987 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
5988 auto *VL0 = cast<Instruction>(S.OpValue);
5989 switch (ShuffleOrOp) {
5990 case Instruction::PHI: {
5991 // Check for terminator values (e.g. invoke).
5992 for (Value *V : VL)
5993 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
5994 Instruction *Term = dyn_cast<Instruction>(Incoming);
5995 if (Term && Term->isTerminator()) {
5996 LLVM_DEBUG(dbgs()
5997 << "SLP: Need to swizzle PHINodes (terminator use).\n");
5998 return TreeEntry::NeedToGather;
5999 }
6000 }
6001
6002 return TreeEntry::Vectorize;
6003 }
6004 case Instruction::ExtractValue:
6005 case Instruction::ExtractElement: {
6006 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6007 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6008 if (!isPowerOf2_32(VL.size()))
6009 return TreeEntry::NeedToGather;
6010 if (Reuse || !CurrentOrder.empty())
6011 return TreeEntry::Vectorize;
6012 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6013 return TreeEntry::NeedToGather;
6014 }
6015 case Instruction::InsertElement: {
6016 // Check that we have a buildvector and not a shuffle of 2 or more
6017 // different vectors.
6018 ValueSet SourceVectors;
6019 for (Value *V : VL) {
6020 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
6021 assert(getInsertIndex(V) != std::nullopt &&
6022 "Non-constant or undef index?");
6023 }
6024
6025 if (count_if(VL, [&SourceVectors](Value *V) {
6026 return !SourceVectors.contains(V);
6027 }) >= 2) {
6028 // Found 2nd source vector - cancel.
6029 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6030 "different source vectors.\n");
6031 return TreeEntry::NeedToGather;
6032 }
6033
6034 return TreeEntry::Vectorize;
6035 }
6036 case Instruction::Load: {
6037 // Check that a vectorized load would load the same memory as a scalar
6038 // load. For example, we don't want to vectorize loads that are smaller
6039 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6040 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6041 // from such a struct, we read/write packed bits disagreeing with the
6042 // unvectorized version.
6043 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
6044 case LoadsState::Vectorize:
6045 return TreeEntry::Vectorize;
6046 case LoadsState::ScatterVectorize:
6047 return TreeEntry::ScatterVectorize;
6048 case LoadsState::StridedVectorize:
6049 return TreeEntry::StridedVectorize;
6050 case LoadsState::Gather:
6051#ifndef NDEBUG
6052 Type *ScalarTy = VL0->getType();
6053 if (DL->getTypeSizeInBits(ScalarTy) !=
6054 DL->getTypeAllocSizeInBits(ScalarTy))
6055 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6056 else if (any_of(VL,
6057 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6058 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6059 else
6060 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6061#endif // NDEBUG
6062 return TreeEntry::NeedToGather;
6063 }
6064 llvm_unreachable("Unexpected state of loads");
6065 }
6066 case Instruction::ZExt:
6067 case Instruction::SExt:
6068 case Instruction::FPToUI:
6069 case Instruction::FPToSI:
6070 case Instruction::FPExt:
6071 case Instruction::PtrToInt:
6072 case Instruction::IntToPtr:
6073 case Instruction::SIToFP:
6074 case Instruction::UIToFP:
6075 case Instruction::Trunc:
6076 case Instruction::FPTrunc:
6077 case Instruction::BitCast: {
6078 Type *SrcTy = VL0->getOperand(0)->getType();
6079 for (Value *V : VL) {
6080 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6081 if (Ty != SrcTy || !isValidElementType(Ty)) {
6082 LLVM_DEBUG(
6083 dbgs() << "SLP: Gathering casts with different src types.\n");
6084 return TreeEntry::NeedToGather;
6085 }
6086 }
6087 return TreeEntry::Vectorize;
6088 }
6089 case Instruction::ICmp:
6090 case Instruction::FCmp: {
6091 // Check that all of the compares have the same predicate.
6092 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6093 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
6094 Type *ComparedTy = VL0->getOperand(0)->getType();
6095 for (Value *V : VL) {
6096 CmpInst *Cmp = cast<CmpInst>(V);
6097 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6098 Cmp->getOperand(0)->getType() != ComparedTy) {
6099 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6100 return TreeEntry::NeedToGather;
6101 }
6102 }
6103 return TreeEntry::Vectorize;
6104 }
6105 case Instruction::Select:
6106 case Instruction::FNeg:
6107 case Instruction::Add:
6108 case Instruction::FAdd:
6109 case Instruction::Sub:
6110 case Instruction::FSub:
6111 case Instruction::Mul:
6112 case Instruction::FMul:
6113 case Instruction::UDiv:
6114 case Instruction::SDiv:
6115 case Instruction::FDiv:
6116 case Instruction::URem:
6117 case Instruction::SRem:
6118 case Instruction::FRem:
6119 case Instruction::Shl:
6120 case Instruction::LShr:
6121 case Instruction::AShr:
6122 case Instruction::And:
6123 case Instruction::Or:
6124 case Instruction::Xor:
6125 return TreeEntry::Vectorize;
6126 case Instruction::GetElementPtr: {
6127 // We don't combine GEPs with complicated (nested) indexing.
6128 for (Value *V : VL) {
6129 auto *I = dyn_cast<GetElementPtrInst>(V);
6130 if (!I)
6131 continue;
6132 if (I->getNumOperands() != 2) {
6133 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6134 return TreeEntry::NeedToGather;
6135 }
6136 }
6137
6138 // We can't combine several GEPs into one vector if they operate on
6139 // different types.
6140 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6141 for (Value *V : VL) {
6142 auto *GEP = dyn_cast<GEPOperator>(V);
6143 if (!GEP)
6144 continue;
6145 Type *CurTy = GEP->getSourceElementType();
6146 if (Ty0 != CurTy) {
6147 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6148 return TreeEntry::NeedToGather;
6149 }
6150 }
6151
6152 // We don't combine GEPs with non-constant indexes.
6153 Type *Ty1 = VL0->getOperand(1)->getType();
6154 for (Value *V : VL) {
6155 auto *I = dyn_cast<GetElementPtrInst>(V);
6156 if (!I)
6157 continue;
6158 auto *Op = I->getOperand(1);
6159 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6160 (Op->getType() != Ty1 &&
6161 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6162 Op->getType()->getScalarSizeInBits() >
6163 DL->getIndexSizeInBits(
6164 V->getType()->getPointerAddressSpace())))) {
6165 LLVM_DEBUG(
6166 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6167 return TreeEntry::NeedToGather;
6168 }
6169 }
6170
6171 return TreeEntry::Vectorize;
6172 }
6173 case Instruction::Store: {
6174 // Check if the stores are consecutive or if we need to swizzle them.
6175 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6176 // Avoid types that are padded when being allocated as scalars, while
6177 // being packed together in a vector (such as i1).
6178 if (DL->getTypeSizeInBits(ScalarTy) !=
6179 DL->getTypeAllocSizeInBits(ScalarTy)) {
6180 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6181 return TreeEntry::NeedToGather;
6182 }
6183 // Make sure all stores in the bundle are simple - we can't vectorize
6184 // atomic or volatile stores.
6185 for (Value *V : VL) {
6186 auto *SI = cast<StoreInst>(V);
6187 if (!SI->isSimple()) {
6188 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6189 return TreeEntry::NeedToGather;
6190 }
6191 PointerOps.push_back(SI->getPointerOperand());
6192 }
6193
6194 // Check the order of pointer operands.
6195 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
6196 Value *Ptr0;
6197 Value *PtrN;
6198 if (CurrentOrder.empty()) {
6199 Ptr0 = PointerOps.front();
6200 PtrN = PointerOps.back();
6201 } else {
6202 Ptr0 = PointerOps[CurrentOrder.front()];
6203 PtrN = PointerOps[CurrentOrder.back()];
6204 }
6205 std::optional<int> Dist =
6206 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6207 // Check that the sorted pointer operands are consecutive.
6208 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6209 return TreeEntry::Vectorize;
6210 }
6211
6212 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6213 return TreeEntry::NeedToGather;
6214 }
6215 case Instruction::Call: {
6216 // Check if the calls are all to the same vectorizable intrinsic or
6217 // library function.
6218 CallInst *CI = cast<CallInst>(VL0);
6219 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6220
6221 VFShape Shape = VFShape::get(
6222 CI->getFunctionType(),
6223 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
6224 false /*HasGlobalPred*/);
6225 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6226
6227 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6228 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6229 return TreeEntry::NeedToGather;
6230 }
6231 Function *F = CI->getCalledFunction();
6232 unsigned NumArgs = CI->arg_size();
6233 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6234 for (unsigned J = 0; J != NumArgs; ++J)
6235 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
6236 ScalarArgs[J] = CI->getArgOperand(J);
6237 for (Value *V : VL) {
6238 CallInst *CI2 = dyn_cast<CallInst>(V);
6239 if (!CI2 || CI2->getCalledFunction() != F ||
6240 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
6241 (VecFunc &&
6242 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6243 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
6244 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6245 << "\n");
6246 return TreeEntry::NeedToGather;
6247 }
6248 // Some intrinsics have scalar arguments and should be same in order for
6249 // them to be vectorized.
6250 for (unsigned J = 0; J != NumArgs; ++J) {
6251 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
6252 Value *A1J = CI2->getArgOperand(J);
6253 if (ScalarArgs[J] != A1J) {
6254 LLVM_DEBUG(dbgs()
6255 << "SLP: mismatched arguments in call:" << *CI
6256 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6257 return TreeEntry::NeedToGather;
6258 }
6259 }
6260 }
6261 // Verify that the bundle operands are identical between the two calls.
6262 if (CI->hasOperandBundles() &&
6263 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6264 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6265 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6266 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6267 << "!=" << *V << '\n');
6268 return TreeEntry::NeedToGather;
6269 }
6270 }
6271
6272 return TreeEntry::Vectorize;
6273 }
6274 case Instruction::ShuffleVector: {
6275 // If this is not an alternate sequence of opcode like add-sub
6276 // then do not vectorize this instruction.
6277 if (!S.isAltShuffle()) {
6278 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6279 return TreeEntry::NeedToGather;
6280 }
6281 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6282 LLVM_DEBUG(
6283 dbgs()
6284 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6285 "the whole alt sequence is not profitable.\n");
6286 return TreeEntry::NeedToGather;
6287 }
6288
6289 return TreeEntry::Vectorize;
6290 }
6291 default:
6292 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6293 return TreeEntry::NeedToGather;
6294 }
6295}
6296
6297void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6298 const EdgeInfo &UserTreeIdx) {
6299 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6300
6301 SmallVector<int> ReuseShuffleIndicies;
6302 SmallVector<Value *> UniqueValues;
6303 SmallVector<Value *> NonUniqueValueVL;
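 // TryToFindDuplicates checks the bundle for repeated scalars: on success it
 // records the reuse shuffle mask in ReuseShuffleIndicies and shrinks VL to
 // the unique scalars; on failure it creates a gather node and returns false.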
6304 auto TryToFindDuplicates = [&](const InstructionsState &S,
6305 bool DoNotFail = false) {
6306 // Check that every instruction appears once in this bundle.
6307 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6308 for (Value *V : VL) {
6309 if (isConstant(V)) {
6310 ReuseShuffleIndicies.emplace_back(
6311 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6312 UniqueValues.emplace_back(V);
6313 continue;
6314 }
6315 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6316 ReuseShuffleIndicies.emplace_back(Res.first->second);
6317 if (Res.second)
6318 UniqueValues.emplace_back(V);
6319 }
6320 size_t NumUniqueScalarValues = UniqueValues.size();
6321 if (NumUniqueScalarValues == VL.size()) {
6322 ReuseShuffleIndicies.clear();
6323 } else {
6324 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6325 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6326 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6327 "for nodes with padding.\n");
6328 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6329 return false;
6330 }
6331 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6332 if (NumUniqueScalarValues <= 1 ||
6333 (UniquePositions.size() == 1 && all_of(UniqueValues,
6334 [](Value *V) {
6335 return isa<UndefValue>(V) ||
6336 !isConstant(V);
6337 })) ||
6338 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6339 if (DoNotFail && UniquePositions.size() > 1 &&
6340 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6341 all_of(UniqueValues, [=](Value *V) {
6342 return isa<ExtractElementInst>(V) ||
6343 areAllUsersVectorized(cast<Instruction>(V),
6344 UserIgnoreList);
6345 })) {
6346 unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6347 if (PWSz == VL.size()) {
6348 ReuseShuffleIndicies.clear();
6349 } else {
6350 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6351 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6352 UniqueValues.back());
6353 VL = NonUniqueValueVL;
6354 }
6355 return true;
6356 }
6357 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6358 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6359 return false;
6360 }
6361 VL = UniqueValues;
6362 }
6363 return true;
6364 };
6365
6366 InstructionsState S = getSameOpcode(VL, *TLI);
6367
6368 // Don't vectorize ephemeral values.
6369 if (!EphValues.empty()) {
6370 for (Value *V : VL) {
6371 if (EphValues.count(V)) {
6372 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6373 << ") is ephemeral.\n");
6374 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6375 return;
6376 }
6377 }
6378 }
6379
6380 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6381 // a load), in which case peek through to include it in the tree, without
6382 // ballooning over-budget.
6383 if (Depth >= RecursionMaxDepth &&
6384 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6385 VL.size() >= 4 &&
6386 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6387 return match(I,
6388 m_OneUse(m_ZExtOrSExtOrSelf(m_Load(m_Value())))) &&
6389 cast<Instruction>(I)->getOpcode() ==
6390 cast<Instruction>(S.MainOp)->getOpcode();
6391 })))) {
6392 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6393 if (TryToFindDuplicates(S))
6394 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6395 ReuseShuffleIndicies);
6396 return;
6397 }
6398
6399 // Don't handle scalable vectors
6400 if (S.getOpcode() == Instruction::ExtractElement &&
6401 isa<ScalableVectorType>(
6402 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6403 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6404 if (TryToFindDuplicates(S))
6405 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6406 ReuseShuffleIndicies);
6407 return;
6408 }
6409
6410 // Don't handle vectors.
6411 if (S.OpValue->getType()->isVectorTy() &&
6412 !isa<InsertElementInst>(S.OpValue)) {
6413 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6414 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6415 return;
6416 }
6417
6418 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6419 if (SI->getValueOperand()->getType()->isVectorTy()) {
6420 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6421 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6422 return;
6423 }
6424
6425 // If all of the operands are identical or constant we have a simple solution.
6426 // If we deal with insert/extract instructions, they all must have constant
6427 // indices, otherwise we should gather them, not try to vectorize.
6428 // If alternate op node with 2 elements with gathered operands - do not
6429 // vectorize.
6430 auto &&NotProfitableForVectorization = [&S, this,
6431 Depth](ArrayRef<Value *> VL) {
6432 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6433 return false;
6434 if (VectorizableTree.size() < MinTreeSize)
6435 return false;
6436 if (Depth >= RecursionMaxDepth - 1)
6437 return true;
6438 // Check if all operands are extracts, part of vector node or can build a
6439 // regular vectorize node.
6440 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6441 for (Value *V : VL) {
6442 auto *I = cast<Instruction>(V);
6443 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6444 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6445 }));
6446 }
6447 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6448 if ((IsCommutative &&
6449 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6450 (!IsCommutative &&
6451 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6452 return true;
6453 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6454 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6455 auto *I1 = cast<Instruction>(VL.front());
6456 auto *I2 = cast<Instruction>(VL.back());
6457 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6458 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6459 I2->getOperand(Op));
6460 if (static_cast<unsigned>(count_if(
6461 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6462 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6463 })) >= S.MainOp->getNumOperands() / 2)
6464 return false;
6465 if (S.MainOp->getNumOperands() > 2)
6466 return true;
6467 if (IsCommutative) {
6468 // Check permuted operands.
6469 Candidates.clear();
6470 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6471 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6472 I2->getOperand((Op + 1) % E));
6473 if (any_of(
6474 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6475 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6476 }))
6477 return false;
6478 }
6479 return true;
6480 };
6481 SmallVector<unsigned> SortedIndices;
6482 BasicBlock *BB = nullptr;
6483 bool IsScatterVectorizeUserTE =
6484 UserTreeIdx.UserTE &&
6485 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6486 bool AreAllSameInsts =
6487 (S.getOpcode() && allSameBlock(VL)) ||
6488 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6489 VL.size() > 2 &&
6490 all_of(VL,
6491 [&BB](Value *V) {
6492 auto *I = dyn_cast<GetElementPtrInst>(V);
6493 if (!I)
6494 return doesNotNeedToBeScheduled(V);
6495 if (!BB)
6496 BB = I->getParent();
6497 return BB == I->getParent() && I->getNumOperands() == 2;
6498 }) &&
6499 BB &&
6500 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6501 SortedIndices));
6502 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6503 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6504 S.OpValue) &&
6505 !all_of(VL, isVectorLikeInstWithConstOps)) ||
6506 NotProfitableForVectorization(VL)) {
6507 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6508 if (TryToFindDuplicates(S))
6509 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6510 ReuseShuffleIndicies);
6511 return;
6512 }
6513
6514 // We now know that this is a vector of instructions of the same type from
6515 // the same block.
6516
6517 // Check if this is a duplicate of another entry.
6518 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6519 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6520 if (!E->isSame(VL)) {
6521 auto It = MultiNodeScalars.find(S.OpValue);
6522 if (It != MultiNodeScalars.end()) {
6523 auto *TEIt = find_if(It->getSecond(),
6524 [&](TreeEntry *ME) { return ME->isSame(VL); });
6525 if (TEIt != It->getSecond().end())
6526 E = *TEIt;
6527 else
6528 E = nullptr;
6529 } else {
6530 E = nullptr;
6531 }
6532 }
6533 if (!E) {
6534 if (!doesNotNeedToBeScheduled(S.OpValue)) {
6535 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6536 if (TryToFindDuplicates(S))
6537 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6538 ReuseShuffleIndicies);
6539 return;
6540 }
6541 } else {
6542 // Record the reuse of the tree node. FIXME, currently this is only used
6543 // to properly draw the graph rather than for the actual vectorization.
6544 E->UserTreeIndices.push_back(UserTreeIdx);
6545 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6546 << ".\n");
6547 return;
6548 }
6549 }
6550
6551 // Check that none of the instructions in the bundle are already in the tree.
6552 for (Value *V : VL) {
6553 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6554 doesNotNeedToBeScheduled(V))
6555 continue;
6556 if (getTreeEntry(V)) {
6557 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6558 << ") is already in tree.\n");
6559 if (TryToFindDuplicates(S))
6560 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6561 ReuseShuffleIndicies);
6562 return;
6563 }
6564 }
6565
6566 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6567 if (UserIgnoreList && !UserIgnoreList->empty()) {
6568 for (Value *V : VL) {
6569 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6570 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6571 if (TryToFindDuplicates(S))
6572 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6573 ReuseShuffleIndicies);
6574 return;
6575 }
6576 }
6577 }
6578
6579 // Special processing for sorted pointers for ScatterVectorize node with
6580 // constant indices only.
6581 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6582 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6583 !(S.getOpcode() && allSameBlock(VL))) {
6584 assert(S.OpValue->getType()->isPointerTy() &&
6585 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6586 "Expected pointers only.");
6587 // Reset S to make it GetElementPtr kind of node.
6588 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
6589 assert(It != VL.end() && "Expected at least one GEP.");
6590 S = getSameOpcode(*It, *TLI);
6591 }
6592
6593 // Check that all of the users of the scalars that we want to vectorize are
6594 // schedulable.
6595 auto *VL0 = cast<Instruction>(S.OpValue);
6596 BB = VL0->getParent();
6597
6598 if (!DT->isReachableFromEntry(BB)) {
6599 // Don't go into unreachable blocks. They may contain instructions with
6600 // dependency cycles which confuse the final scheduling.
6601 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6602 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6603 return;
6604 }
6605
6606 // Don't go into catchswitch blocks, which can happen with PHIs.
6607 // Such blocks can only have PHIs and the catchswitch. There is no
6608 // place to insert a shuffle if we need to, so just avoid that issue.
6609 if (isa<CatchSwitchInst>(BB->getTerminator())) {
6610 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
6611 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6612 return;
6613 }
6614
6615 // Check that every instruction appears once in this bundle.
6616 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
6617 return;
6618
6619 // Perform specific checks for each particular instruction kind.
6620 OrdersType CurrentOrder;
6621 SmallVector<Value *> PointerOps;
6622 TreeEntry::EntryState State = getScalarsVectorizationState(
6623 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6624 if (State == TreeEntry::NeedToGather) {
6625 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6626 ReuseShuffleIndicies);
6627 return;
6628 }
6629
6630 auto &BSRef = BlocksSchedules[BB];
6631 if (!BSRef)
6632 BSRef = std::make_unique<BlockScheduling>(BB);
6633
6634 BlockScheduling &BS = *BSRef;
6635
6636 std::optional<ScheduleData *> Bundle =
6637 BS.tryScheduleBundle(UniqueValues, this, S);
6638#ifdef EXPENSIVE_CHECKS
6639 // Make sure we didn't break any internal invariants
6640 BS.verify();
6641#endif
6642 if (!Bundle) {
6643 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
6644 assert((!BS.getScheduleData(VL0) ||
6645 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6646 "tryScheduleBundle should cancelScheduling on failure");
6647 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6648 ReuseShuffleIndicies);
6649 return;
6650 }
6651 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
6652
6653 unsigned ShuffleOrOp = S.isAltShuffle() ?
6654 (unsigned) Instruction::ShuffleVector : S.getOpcode();
6655 switch (ShuffleOrOp) {
6656 case Instruction::PHI: {
6657 auto *PH = cast<PHINode>(VL0);
6658
6659 TreeEntry *TE =
6660 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
6661 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
6662
6663 // Keeps the reordered operands to avoid code duplication.
6664 SmallVector<ValueList, 2> OperandsVec;
6665 for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
6666 if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
6667 ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));
6668 TE->setOperand(I, Operands);
6669 OperandsVec.push_back(Operands);
6670 continue;
6671 }
6672 ValueList Operands;
6673 // Prepare the operand vector.
6674 for (Value *V : VL)
6675 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
6676 PH->getIncomingBlock(I)));
6677 TE->setOperand(I, Operands);
6678 OperandsVec.push_back(Operands);
6679 }
6680 for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
6681 buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
6682 return;
6683 }
6684 case Instruction::ExtractValue:
6685 case Instruction::ExtractElement: {
6686 if (CurrentOrder.empty()) {
6687 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
6688 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6689 ReuseShuffleIndicies);
6690 // This is a special case, as it does not gather, but at the same time
6691 // we are not extending buildTree_rec() towards the operands.
6692 ValueList Op0;
6693 Op0.assign(VL.size(), VL0->getOperand(0));
6694 VectorizableTree.back()->setOperand(0, Op0);
6695 return;
6696 }
6697 LLVM_DEBUG({
6698 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
6699 "with order";
6700 for (unsigned Idx : CurrentOrder)
6701 dbgs() << " " << Idx;
6702 dbgs() << "\n";
6703 });
6704 fixupOrderingIndices(CurrentOrder);
6705 // Insert new order with initial value 0, if it does not exist,
6706 // otherwise return the iterator to the existing one.
6707 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6708 ReuseShuffleIndicies, CurrentOrder);
6709 // This is a special case, as it does not gather, but at the same time
6710 // we are not extending buildTree_rec() towards the operands.
6711 ValueList Op0;
6712 Op0.assign(VL.size(), VL0->getOperand(0));
6713 VectorizableTree.back()->setOperand(0, Op0);
6714 return;
6715 }
6716 case Instruction::InsertElement: {
6717 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
6718
6719 auto OrdCompare = [](const std::pair<int, int> &P1,
6720 const std::pair<int, int> &P2) {
6721 return P1.first > P2.first;
6722 };
6723 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
6724 decltype(OrdCompare)>
6725 Indices(OrdCompare);
6726 for (int I = 0, E = VL.size(); I < E; ++I) {
6727 unsigned Idx = *getInsertIndex(VL[I]);
6728 Indices.emplace(Idx, I);
6729 }
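 // Indices is a min-heap keyed on the insert position, so popping it yields
 // the lanes in ascending insert-index order; CurrentOrder is cleared below
 // when that order is already the identity.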
6730 OrdersType CurrentOrder(VL.size(), VL.size());
6731 bool IsIdentity = true;
6732 for (int I = 0, E = VL.size(); I < E; ++I) {
6733 CurrentOrder[Indices.top().second] = I;
6734 IsIdentity &= Indices.top().second == I;
6735 Indices.pop();
6736 }
6737 if (IsIdentity)
6738 CurrentOrder.clear();
6739 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6740 std::nullopt, CurrentOrder);
6741 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
6742
6743 constexpr int NumOps = 2;
6744 ValueList VectorOperands[NumOps];
6745 for (int I = 0; I < NumOps; ++I) {
6746 for (Value *V : VL)
6747 VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
6748
6749 TE->setOperand(I, VectorOperands[I]);
6750 }
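 // Only the inserted scalars (operand 1) get their own tree node; operand 0
 // is the chain of partially built vectors and is not vectorized further.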
6751 buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
6752 return;
6753 }
6754 case Instruction::Load: {
6755 // Check that a vectorized load would load the same memory as a scalar
6756 // load. For example, we don't want to vectorize loads that are smaller
6757 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6758 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6759 // from such a struct, we read/write packed bits disagreeing with the
6760 // unvectorized version.
6761 TreeEntry *TE = nullptr;
6762 fixupOrderingIndices(CurrentOrder);
6763 switch (State) {
6764 case TreeEntry::Vectorize:
6765 if (CurrentOrder.empty()) {
6766 // Original loads are consecutive and do not require reordering.
6767 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6768 ReuseShuffleIndicies);
6769 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
6770 } else {
6771 // Need to reorder.
6772 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6773 ReuseShuffleIndicies, CurrentOrder);
6774 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
6775 }
6776 TE->setOperandsInOrder();
6777 break;
6778 case TreeEntry::StridedVectorize:
6779 // Vectorizing non-consecutive loads that form a constant stride.
6780 if (CurrentOrder.empty()) {
6781 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6782 UserTreeIdx, ReuseShuffleIndicies);
6783 } else {
6784 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6785 UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
6786 }
6787 TE->setOperandsInOrder();
6788 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
6789 break;
6790 case TreeEntry::ScatterVectorize:
6791 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
6792 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6793 UserTreeIdx, ReuseShuffleIndicies);
6794 TE->setOperandsInOrder();
6795 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
6796 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
6797 break;
6798 case TreeEntry::NeedToGather:
6799 llvm_unreachable("Unexpected loads state.");
6800 }
6801 return;
6802 }
6803 case Instruction::ZExt:
6804 case Instruction::SExt:
6805 case Instruction::FPToUI:
6806 case Instruction::FPToSI:
6807 case Instruction::FPExt:
6808 case Instruction::PtrToInt:
6809 case Instruction::IntToPtr:
6810 case Instruction::SIToFP:
6811 case Instruction::UIToFP:
6812 case Instruction::Trunc:
6813 case Instruction::FPTrunc:
6814 case Instruction::BitCast: {
6815 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
6816 std::make_pair(std::numeric_limits<unsigned>::min(),
6817 std::numeric_limits<unsigned>::max()));
6818 if (ShuffleOrOp == Instruction::ZExt ||
6819 ShuffleOrOp == Instruction::SExt) {
6820 CastMaxMinBWSizes = std::make_pair(
6821 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
6822 PrevMaxBW),
6823 std::min<unsigned>(
6824 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
6825 PrevMinBW));
6826 } else if (ShuffleOrOp == Instruction::Trunc) {
6827 CastMaxMinBWSizes = std::make_pair(
6828 std::max<unsigned>(
6829 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
6830 PrevMaxBW),
6831 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
6832 PrevMinBW));
6833 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
6834 } else if (ShuffleOrOp == Instruction::SIToFP ||
6835 ShuffleOrOp == Instruction::UIToFP) {
6836 unsigned NumSignBits =
6837 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
6838 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
6839 APInt Mask = DB->getDemandedBits(OpI);
6840 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
6841 }
6842 if (NumSignBits * 2 >=
6843 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6844 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
6845 }
6846 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6847 ReuseShuffleIndicies);
6848 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
6849
6850 TE->setOperandsInOrder();
6851 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6852 ValueList Operands;
6853 // Prepare the operand vector.
6854 for (Value *V : VL)
6855 Operands.push_back(cast<Instruction>(V)->getOperand(I));
6856
6857 buildTree_rec(Operands, Depth + 1, {TE, I});
6858 }
6859 return;
6860 }
6861 case Instruction::ICmp:
6862 case Instruction::FCmp: {
6863 // Check that all of the compares have the same predicate.
6864 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6865 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6866 ReuseShuffleIndicies);
6867 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
6868
6868
6869 ValueList Left, Right;
6870 if (cast<CmpInst>(VL0)->isCommutative()) {
6871 // Commutative predicate - collect + sort operands of the instructions
6872 // so that each side is more likely to have the same opcode.
6874 "Commutative Predicate mismatch");
6875 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
6876 } else {
6877 // Collect operands - commute if it uses the swapped predicate.
6878 for (Value *V : VL) {
6879 auto *Cmp = cast<CmpInst>(V);
6880 Value *LHS = Cmp->getOperand(0);
6881 Value *RHS = Cmp->getOperand(1);
6882 if (Cmp->getPredicate() != P0)
6883 std::swap(LHS, RHS);
6884 Left.push_back(LHS);
6885 Right.push_back(RHS);
6886 }
6887 }
6888 TE->setOperand(0, Left);
6889 TE->setOperand(1, Right);
6890 buildTree_rec(Left, Depth + 1, {TE, 0});
6891 buildTree_rec(Right, Depth + 1, {TE, 1});
6892 if (ShuffleOrOp == Instruction::ICmp) {
6893 unsigned NumSignBits0 =
6894 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
6895 if (NumSignBits0 * 2 >=
6896 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6897 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
6898 unsigned NumSignBits1 =
6899 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
6900 if (NumSignBits1 * 2 >=
6901 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
6902 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
6903 }
6904 return;
6905 }
6906 case Instruction::Select:
6907 case Instruction::FNeg:
6908 case Instruction::Add:
6909 case Instruction::FAdd:
6910 case Instruction::Sub:
6911 case Instruction::FSub:
6912 case Instruction::Mul:
6913 case Instruction::FMul:
6914 case Instruction::UDiv:
6915 case Instruction::SDiv:
6916 case Instruction::FDiv:
6917 case Instruction::URem:
6918 case Instruction::SRem:
6919 case Instruction::FRem:
6920 case Instruction::Shl:
6921 case Instruction::LShr:
6922 case Instruction::AShr:
6923 case Instruction::And:
6924 case Instruction::Or:
6925 case Instruction::Xor: {
6926 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6927 ReuseShuffleIndicies);
6928 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
6929
6930 // Sort operands of the instructions so that each side is more likely to
6931 // have the same opcode.
6932 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
6933 ValueList Left, Right;
6934 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
6935 TE->setOperand(0, Left);
6936 TE->setOperand(1, Right);
6937 buildTree_rec(Left, Depth + 1, {TE, 0});
6938 buildTree_rec(Right, Depth + 1, {TE, 1});
6939 return;
6940 }
6941
6942 TE->setOperandsInOrder();
6943 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6944 ValueList Operands;
6945 // Prepare the operand vector.
6946 for (Value *V : VL)
6947 Operands.push_back(cast<Instruction>(V)->getOperand(I));
6948
6949 buildTree_rec(Operands, Depth + 1, {TE, I});
6950 }
6951 return;
6952 }
6953 case Instruction::GetElementPtr: {
6954 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6955 ReuseShuffleIndicies);
6956 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
6957 SmallVector<ValueList, 2> Operands(2);
6958 // Prepare the operand vector for pointer operands.
6959 for (Value *V : VL) {
6960 auto *GEP = dyn_cast<GetElementPtrInst>(V);
6961 if (!GEP) {
6962 Operands.front().push_back(V);
6963 continue;
6964 }
6965 Operands.front().push_back(GEP->getPointerOperand());
6966 }
6967 TE->setOperand(0, Operands.front());
6968 // Need to cast all indices to the same type before vectorization to
6969 // avoid crash.
6970 // Required to be able to find correct matches between different gather
6971 // nodes and reuse the vectorized values rather than trying to gather them
6972 // again.
6973 int IndexIdx = 1;
6974 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
6975 Type *Ty = all_of(VL,
6976 [VL0Ty, IndexIdx](Value *V) {
6977 auto *GEP = dyn_cast<GetElementPtrInst>(V);
6978 if (!GEP)
6979 return true;
6980 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
6981 })
6982 ? VL0Ty
6983 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
6984 ->getPointerOperandType()
6985 ->getScalarType());
6986 // Prepare the operand vector.
6987 for (Value *V : VL) {
6988 auto *I = dyn_cast<GetElementPtrInst>(V);
6989 if (!I) {
6990 Operands.back().push_back(
6991 ConstantInt::get(Ty, 0, /*isSigned=*/false));
6992 continue;
6993 }
6994 auto *Op = I->getOperand(IndexIdx);
6995 auto *CI = dyn_cast<ConstantInt>(Op);
6996 if (!CI)
6997 Operands.back().push_back(Op);
6998 else
6999 Operands.back().push_back(ConstantFoldIntegerCast(
7000 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7001 }
7002 TE->setOperand(IndexIdx, Operands.back());
7003
7004 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7005 buildTree_rec(Operands[I], Depth + 1, {TE, I});
7006 return;
7007 }
7008 case Instruction::Store: {
7009 // Check if the stores are consecutive or if we need to swizzle them.
7010 ValueList Operands(VL.size());
7011 auto *OIter = Operands.begin();
7012 for (Value *V : VL) {
7013 auto *SI = cast<StoreInst>(V);
7014 *OIter = SI->getValueOperand();
7015 ++OIter;
7016 }
7017 // Check that the sorted pointer operands are consecutive.
7018 if (CurrentOrder.empty()) {
7019 // Original stores are consecutive and do not require reordering.
7020 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7021 ReuseShuffleIndicies);
7022 TE->setOperandsInOrder();
7023 buildTree_rec(Operands, Depth + 1, {TE, 0});
7024 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7025 } else {
7026 fixupOrderingIndices(CurrentOrder);
7027 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7028 ReuseShuffleIndicies, CurrentOrder);
7029 TE->setOperandsInOrder();
7030 buildTree_rec(Operands, Depth + 1, {TE, 0});
7031 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7032 }
7033 return;
7034 }
7035 case Instruction::Call: {
7036 // Check if the calls are all to the same vectorizable intrinsic or
7037 // library function.
7038 CallInst *CI = cast<CallInst>(VL0);
7039 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7040
7041 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7042 ReuseShuffleIndicies);
7043 // Sort operands of the instructions so that each side is more likely to
7044 // have the same opcode.
7045 if (isCommutative(VL0)) {
7046 ValueList Left, Right;
7047 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7048 TE->setOperand(0, Left);
7049 TE->setOperand(1, Right);
7050 SmallVector<ValueList> Operands;
7051 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7052 Operands.emplace_back();
7053 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7054 continue;
7055 for (Value *V : VL) {
7056 auto *CI2 = cast<CallInst>(V);
7057 Operands.back().push_back(CI2->getArgOperand(I));
7058 }
7059 TE->setOperand(I, Operands.back());
7060 }
7061 buildTree_rec(Left, Depth + 1, {TE, 0});
7062 buildTree_rec(Right, Depth + 1, {TE, 1});
7063 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7064 if (Operands[I - 2].empty())
7065 continue;
7066 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7067 }
7068 return;
7069 }
7070 TE->setOperandsInOrder();
7071 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7072 // For scalar operands there is no need to create an entry since there is
7073 // no need to vectorize them.
7074 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7075 continue;
7076 ValueList Operands;
7077 // Prepare the operand vector.
7078 for (Value *V : VL) {
7079 auto *CI2 = cast<CallInst>(V);
7080 Operands.push_back(CI2->getArgOperand(I));
7081 }
7082 buildTree_rec(Operands, Depth + 1, {TE, I});
7083 }
7084 return;
7085 }
7086 case Instruction::ShuffleVector: {
7087 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7088 ReuseShuffleIndicies);
7089 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7090
7091 // Reorder operands if reordering would enable vectorization.
7092 auto *CI = dyn_cast<CmpInst>(VL0);
7093 if (isa<BinaryOperator>(VL0) || CI) {
7094 ValueList Left, Right;
7095 if (!CI || all_of(VL, [](Value *V) {
7096 return cast<CmpInst>(V)->isCommutative();
7097 })) {
7098 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7099 } else {
7100 auto *MainCI = cast<CmpInst>(S.MainOp);
7101 auto *AltCI = cast<CmpInst>(S.AltOp);
7102 CmpInst::Predicate MainP = MainCI->getPredicate();
7103 CmpInst::Predicate AltP = AltCI->getPredicate();
7104 assert(MainP != AltP &&
7105 "Expected different main/alternate predicates.");
7106 // Collect operands - commute if it uses the swapped predicate or
7107 // alternate operation.
7108 for (Value *V : VL) {
7109 auto *Cmp = cast<CmpInst>(V);
7110 Value *LHS = Cmp->getOperand(0);
7111 Value *RHS = Cmp->getOperand(1);
7112
7113 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7114 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7115 std::swap(LHS, RHS);
7116 } else {
7117 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7118 std::swap(LHS, RHS);
7119 }
7120 Left.push_back(LHS);
7121 Right.push_back(RHS);
7122 }
7123 }
7124 TE->setOperand(0, Left);
7125 TE->setOperand(1, Right);
7126 buildTree_rec(Left, Depth + 1, {TE, 0});
7127 buildTree_rec(Right, Depth + 1, {TE, 1});
7128 return;
7129 }
7130
7131 TE->setOperandsInOrder();
7132 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7133 ValueList Operands;
7134 // Prepare the operand vector.
7135 for (Value *V : VL)
7136 Operands.push_back(cast<Instruction>(V)->getOperand(I));
7137
7138 buildTree_rec(Operands, Depth + 1, {TE, I});
7139 }
7140 return;
7141 }
7142 default:
7143 break;
7144 }
7145 llvm_unreachable("Unexpected vectorization of the instructions.");
7146}
7147
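/// Attempts to flatten the aggregate type T (a homogeneous struct, array or
/// fixed vector nest) into a single vector; returns the total number of
/// scalar elements, or 0 if the type is not homogeneous or the resulting
/// vector size does not fit the vector register size constraints.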
7148 unsigned BoUpSLP::canMapToVector(Type *T) const {
7149 unsigned N = 1;
7150 Type *EltTy = T;
7151
7152 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7153 if (auto *ST = dyn_cast<StructType>(EltTy)) {
7154 // Check that struct is homogeneous.
7155 for (const auto *Ty : ST->elements())
7156 if (Ty != *ST->element_begin())
7157 return 0;
7158 N *= ST->getNumElements();
7159 EltTy = *ST->element_begin();
7160 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7161 N *= AT->getNumElements();
7162 EltTy = AT->getElementType();
7163 } else {
7164 auto *VT = cast<FixedVectorType>(EltTy);
7165 N *= VT->getNumElements();
7166 EltTy = VT->getElementType();
7167 }
7168 }
7169
7170 if (!isValidElementType(EltTy))
7171 return 0;
7172 uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
7173 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7174 VTSize != DL->getTypeStoreSizeInBits(T))
7175 return 0;
7176 return N;
7177}
7178
7179bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7180 SmallVectorImpl<unsigned> &CurrentOrder,
7181 bool ResizeAllowed) const {
7182 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7183 assert(It != VL.end() && "Expected at least one extract instruction.");
7184 auto *E0 = cast<Instruction>(*It);
7185 assert(
7186 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7187 "Invalid opcode");
7188 // Check if all of the extracts come from the same vector and from the
7189 // correct offset.
7190 Value *Vec = E0->getOperand(0);
7191
7192 CurrentOrder.clear();
7193
7194 // We have to extract from a vector/aggregate with the same number of elements.
7195 unsigned NElts;
7196 if (E0->getOpcode() == Instruction::ExtractValue) {
7197 NElts = canMapToVector(Vec->getType());
7198 if (!NElts)
7199 return false;
7200 // Check if load can be rewritten as load of vector.
7201 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7202 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7203 return false;
7204 } else {
7205 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7206 }
7207
7208 unsigned E = VL.size();
7209 if (!ResizeAllowed && NElts != E)
7210 return false;
7211 SmallVector<int> Indices(E, PoisonMaskElem);
7212 unsigned MinIdx = NElts, MaxIdx = 0;
7213 for (auto [I, V] : enumerate(VL)) {
7214 auto *Inst = dyn_cast<Instruction>(V);
7215 if (!Inst)
7216 continue;
7217 if (Inst->getOperand(0) != Vec)
7218 return false;
7219 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7220 if (isa<UndefValue>(EE->getIndexOperand()))
7221 continue;
7222 std::optional<unsigned> Idx = getExtractIndex(Inst);
7223 if (!Idx)
7224 return false;
7225 const unsigned ExtIdx = *Idx;
7226 if (ExtIdx >= NElts)
7227 continue;
7228 Indices[I] = ExtIdx;
7229 if (MinIdx > ExtIdx)
7230 MinIdx = ExtIdx;
7231 if (MaxIdx < ExtIdx)
7232 MaxIdx = ExtIdx;
7233 }
7234 if (MaxIdx - MinIdx + 1 > E)
7235 return false;
7236 if (MaxIdx + 1 <= E)
7237 MinIdx = 0;
7238
7239 // Check that all of the indices extract from the correct offset.
7240 bool ShouldKeepOrder = true;
7241 // Assign to all items the initial value E so we can check if the extract
7242 // instruction index was used already.
7243 // Also, later we can check that all the indices are used and we have a
7244 // consecutive access in the extract instructions, by checking that no
7245 // element of CurrentOrder still has value E.
7246 CurrentOrder.assign(E, E);
7247 for (unsigned I = 0; I < E; ++I) {
7248 if (Indices[I] == PoisonMaskElem)
7249 continue;
7250 const unsigned ExtIdx = Indices[I] - MinIdx;
7251 if (CurrentOrder[ExtIdx] != E) {
7252 CurrentOrder.clear();
7253 return false;
7254 }
7255 ShouldKeepOrder &= ExtIdx == I;
7256 CurrentOrder[ExtIdx] = I;
7257 }
7258 if (ShouldKeepOrder)
7259 CurrentOrder.clear();
7260
7261 return ShouldKeepOrder;
7262}
7263
7264bool BoUpSLP::areAllUsersVectorized(
7265 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7266 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7267 all_of(I->users(), [this](User *U) {
7268 return ScalarToTreeEntry.contains(U) ||
7269 isVectorLikeInstWithConstOps(U) ||
7270 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7271 });
7272}
7273
7274 static std::pair<InstructionCost, InstructionCost>
7275 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7276 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7277 ArrayRef<Type *> ArgTys) {
7278 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7279
7280 // Calculate the cost of the scalar and vector calls.
7281 FastMathFlags FMF;
7282 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7283 FMF = FPCI->getFastMathFlags();
7284 SmallVector<const Value *> Arguments(CI->args());
7285 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7286 dyn_cast<IntrinsicInst>(CI));
7287 auto IntrinsicCost =
7288 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
7289
7290 auto Shape = VFShape::get(CI->getFunctionType(),
7291 ElementCount::getFixed(VecTy->getNumElements()),
7292 false /*HasGlobalPred*/);
7293 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7294 auto LibCost = IntrinsicCost;
7295 if (!CI->isNoBuiltin() && VecFunc) {
7296 // Calculate the cost of the vector library call.
7297 // If the corresponding vector call is cheaper, return its cost.
7298 LibCost =
7299 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7300 }
7301 return {IntrinsicCost, LibCost};
7302}
7303
7304void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7305 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7306 SmallVectorImpl<Value *> *OpScalars,
7307 SmallVectorImpl<Value *> *AltScalars) const {
7308 unsigned Sz = Scalars.size();
7309 Mask.assign(Sz, PoisonMaskElem);
7310 SmallVector<int> OrderMask;
7311 if (!ReorderIndices.empty())
7312 inversePermutation(ReorderIndices, OrderMask);
7313 for (unsigned I = 0; I < Sz; ++I) {
7314 unsigned Idx = I;
7315 if (!ReorderIndices.empty())
7316 Idx = OrderMask[I];
7317 auto *OpInst = cast<Instruction>(Scalars[Idx]);
7318 if (IsAltOp(OpInst)) {
7319 Mask[I] = Sz + Idx;
7320 if (AltScalars)
7321 AltScalars->push_back(OpInst);
7322 } else {
7323 Mask[I] = Idx;
7324 if (OpScalars)
7325 OpScalars->push_back(OpInst);
7326 }
7327 }
7328 if (!ReuseShuffleIndices.empty()) {
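 // Remap the alt-op mask through the reuse shuffle so it refers to the lanes
 // of the final (reused) vector rather than the pre-reuse scalar order.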
7329 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7330 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7331 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7332 });
7333 Mask.swap(NewMask);
7334 }
7335}
7336
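/// Returns true if instruction I should use the alternate opcode/predicate of
/// the (MainOp, AltOp) pair rather than the main one.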
7337 static bool isAlternateInstruction(const Instruction *I,
7338 const Instruction *MainOp,
7339 const Instruction *AltOp,
7340 const TargetLibraryInfo &TLI) {
7341 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7342 auto *AltCI = cast<CmpInst>(AltOp);
7343 CmpInst::Predicate MainP = MainCI->getPredicate();
7344 CmpInst::Predicate AltP = AltCI->getPredicate();
7345 assert(MainP != AltP && "Expected different main/alternate predicates.");
7346 auto *CI = cast<CmpInst>(I);
7347 if (isCmpSameOrSwapped(MainCI, CI, TLI))
7348 return false;
7349 if (isCmpSameOrSwapped(AltCI, CI, TLI))
7350 return true;
7351 CmpInst::Predicate P = CI->getPredicate();
7352 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7353
7354 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7355 "CmpInst expected to match either main or alternate predicate or "
7356 "their swap.");
7357 (void)AltP;
7358 return MainP != P && MainP != SwappedP;
7359 }
7360 return I->getOpcode() == AltOp->getOpcode();
7361}
7362
7363TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7364 assert(!Ops.empty());
7365 const auto *Op0 = Ops.front();
7366
7367 const bool IsConstant = all_of(Ops, [](Value *V) {
7368 // TODO: We should allow undef elements here
7369 return isConstant(V) && !isa<UndefValue>(V);
7370 });
7371 const bool IsUniform = all_of(Ops, [=](Value *V) {
7372 // TODO: We should allow undef elements here
7373 return V == Op0;
7374 });
7375 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7376 // TODO: We should allow undef elements here
7377 if (auto *CI = dyn_cast<ConstantInt>(V))
7378 return CI->getValue().isPowerOf2();
7379 return false;
7380 });
7381 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7382 // TODO: We should allow undef elements here
7383 if (auto *CI = dyn_cast<ConstantInt>(V))
7384 return CI->getValue().isNegatedPowerOf2();
7385 return false;
7386 });
7387
7388 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7389 if (IsConstant && IsUniform)
7390 VK = TTI::OK_UniformConstantValue;
7391 else if (IsConstant)
7392 VK = TTI::OK_NonUniformConstantValue;
7393 else if (IsUniform)
7394 VK = TTI::OK_UniformValue;
7395
7396 TTI::OperandValueProperties VP = TTI::OP_None;
7397 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7398 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7399
7400 return {VK, VP};
7401}
7402
7403namespace {
7404/// The base class for shuffle instruction emission and shuffle cost estimation.
7405class BaseShuffleAnalysis {
7406protected:
7407 /// Checks if the mask is an identity mask.
7408 /// \param IsStrict if is true the function returns false if mask size does
7409 /// not match vector size.
7410 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7411 bool IsStrict) {
7412 int Limit = Mask.size();
7413 int VF = VecTy->getNumElements();
7414 int Index = -1;
7415 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7416 return true;
7417 if (!IsStrict) {
7418 // Consider extract subvector starting from index 0.
7419 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7420 Index == 0)
7421 return true;
7422 // All VF-size submasks are identity (e.g.
7423 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7424 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7425 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7426 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7427 ShuffleVectorInst::isIdentityMask(Slice, VF);
7428 }))
7429 return true;
7430 }
7431 return false;
7432 }
7433
7434 /// Tries to combine 2 different masks into single one.
7435 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7436 /// change the size of the vector, \p LocalVF is the original size of the
7437 /// shuffled vector.
7438 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7439 ArrayRef<int> ExtMask) {
7440 unsigned VF = Mask.size();
7441 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7442 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7443 if (ExtMask[I] == PoisonMaskElem)
7444 continue;
7445 int MaskedIdx = Mask[ExtMask[I] % VF];
7446 NewMask[I] =
7447 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7448 }
7449 Mask.swap(NewMask);
7450 }
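 // Example: with LocalVF = 4, Mask = <1, 0, 3, 2> and ExtMask = <2, poison, 0, 1>,
 // combineMasks rewrites Mask to <3, poison, 1, 0>, i.e. the composition of the
 // two permutations applied to the original 4-element vector.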
7451
7452 /// Looks through shuffles trying to reduce final number of shuffles in the
7453 /// code. The function looks through the previously emitted shuffle
7454 /// instructions and properly marks indices in the mask as undef.
7455 /// For example, given the code
7456 /// \code
7457 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7458 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7459 /// \endcode
7460 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7461 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7462 /// <0, 1, 2, 3> for the shuffle.
7463 /// If 2 operands are of different size, the smallest one will be resized and
7464 /// the mask recalculated properly.
7465 /// For example, given the code
7466 /// \code
7467 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7468 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7469 /// \endcode
7470 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
7471 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7472 /// <0, 1, 2, 3> for the shuffle.
7473 /// So, it tries to transform permutations to simple vector merge, if
7474 /// possible.
7475 /// \param V The input vector which must be shuffled using the given \p Mask.
7476 /// If the better candidate is found, \p V is set to this best candidate
7477 /// vector.
7478 /// \param Mask The input mask for the shuffle. If the best candidate is found
7479 /// during looking-through-shuffles attempt, it is updated accordingly.
7480 /// \param SinglePermute true if the shuffle operation is originally a
7481 /// single-value-permutation. In this case the look-through-shuffles procedure
7482 /// may look for resizing shuffles as the best candidates.
7483 /// \return true if the shuffle results in the non-resizing identity shuffle
7484 /// (and thus can be ignored), false - otherwise.
7485 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7486 bool SinglePermute) {
7487 Value *Op = V;
7488 ShuffleVectorInst *IdentityOp = nullptr;
7489 SmallVector<int> IdentityMask;
7490 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7491 // Exit if not a fixed vector type or changing size shuffle.
7492 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7493 if (!SVTy)
7494 break;
7495 // Remember the identity or broadcast mask, if it is not a resizing
7496 // shuffle. If no better candidates are found, this Op and Mask will be
7497 // used in the final shuffle.
7498 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7499 if (!IdentityOp || !SinglePermute ||
7500 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7501 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7502 IdentityMask.size()))) {
7503 IdentityOp = SV;
7504 // Store the current mask in IdentityMask so that we do not lose this
7505 // info later if IdentityOp is selected as the best candidate for the
7506 // permutation.
7507 IdentityMask.assign(Mask);
7508 }
7509 }
7510 // Remember the broadcast mask. If no better candidates are found, this Op
7511 // and Mask will be used in the final shuffle.
7512 // Zero splat can be used as identity too, since it might be used with
7513 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7514 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
7515 // expensive, and the analysis finds out that the source vector is just a
7516 // broadcast, the original mask can be transformed to the identity mask <0,
7517 // 1, 2, 3>.
7518 // \code
7519 // %0 = shuffle %v, poison, zeroinitalizer
7520 // %res = shuffle %0, poison, <3, 1, 2, 0>
7521 // \endcode
7522 // may be transformed to
7523 // \code
7524 // %0 = shuffle %v, poison, zeroinitalizer
7525 // %res = shuffle %0, poison, <0, 1, 2, 3>
7526 // \endcode
7527 if (SV->isZeroEltSplat()) {
7528 IdentityOp = SV;
7529 IdentityMask.assign(Mask);
7530 }
7531 int LocalVF = Mask.size();
7532 if (auto *SVOpTy =
7533 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7534 LocalVF = SVOpTy->getNumElements();
7535 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7536 for (auto [Idx, I] : enumerate(Mask)) {
7537 if (I == PoisonMaskElem ||
7538 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7539 continue;
7540 ExtMask[Idx] = SV->getMaskValue(I);
7541 }
7542 bool IsOp1Undef =
7543 isUndefVector(SV->getOperand(0),
7544 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7545 .all();
7546 bool IsOp2Undef =
7547 isUndefVector(SV->getOperand(1),
7548 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7549 .all();
7550 if (!IsOp1Undef && !IsOp2Undef) {
7551 // Update mask and mark undef elems.
7552 for (int &I : Mask) {
7553 if (I == PoisonMaskElem)
7554 continue;
7555 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
7556 PoisonMaskElem)
7557 I = PoisonMaskElem;
7558 }
7559 break;
7560 }
7561 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7562 SV->getShuffleMask().end());
7563 combineMasks(LocalVF, ShuffleMask, Mask);
7564 Mask.swap(ShuffleMask);
7565 if (IsOp2Undef)
7566 Op = SV->getOperand(0);
7567 else
7568 Op = SV->getOperand(1);
7569 }
7570 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
7571 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7572 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
7573 if (IdentityOp) {
7574 V = IdentityOp;
7575 assert(Mask.size() == IdentityMask.size() &&
7576 "Expected masks of same sizes.");
7577 // Clear known poison elements.
7578 for (auto [I, Idx] : enumerate(Mask))
7579 if (Idx == PoisonMaskElem)
7580 IdentityMask[I] = PoisonMaskElem;
7581 Mask.swap(IdentityMask);
7582 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7583 return SinglePermute &&
7584 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
7585 /*IsStrict=*/true) ||
7586 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7587 Shuffle->isZeroEltSplat() &&
7588 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
7589 }
7590 V = Op;
7591 return false;
7592 }
7593 V = Op;
7594 return true;
7595 }
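  // Illustrative sketch (not part of the original source): peeking through a
  // shuffle chain boils down to folding the user's mask into the mask of the
  // shuffle it reads from. The helper below is a hypothetical stand-in using
  // only the standard library, with -1 playing the role of PoisonMaskElem.
  // \code
  // #include <vector>
  // std::vector<int> foldMask(const std::vector<int> &Inner,
  //                           const std::vector<int> &Outer) {
  //   std::vector<int> Res(Outer.size(), -1);
  //   for (int I = 0, E = (int)Outer.size(); I < E; ++I)
  //     if (Outer[I] >= 0 && Outer[I] < (int)Inner.size())
  //       Res[I] = Inner[Outer[I]]; // may itself be -1 (poison)
  //   return Res;
  // }
  // // Inner = {0, 0, 0, 0} (zero splat), Outer = {3, 1, 2, 0}
  // //   ==> Res = {0, 0, 0, 0}: the expensive permute collapses into the splat.
  // \endcode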
7596
7597 /// Smart shuffle instruction emission, walks through shuffles trees and
7598 /// tries to find the best matching vector for the actual shuffle
7599 /// instruction.
7600 template <typename T, typename ShuffleBuilderTy>
7601 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7602 ShuffleBuilderTy &Builder) {
7603 assert(V1 && "Expected at least one vector value.");
7604 if (V2)
7605 Builder.resizeToMatch(V1, V2);
7606 int VF = Mask.size();
7607 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
7608 VF = FTy->getNumElements();
7609 if (V2 &&
7610 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
7611 // Peek through shuffles.
7612 Value *Op1 = V1;
7613 Value *Op2 = V2;
7614 int VF =
7615 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7616 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7617 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7618 for (int I = 0, E = Mask.size(); I < E; ++I) {
7619 if (Mask[I] < VF)
7620 CombinedMask1[I] = Mask[I];
7621 else
7622 CombinedMask2[I] = Mask[I] - VF;
7623 }
7624 Value *PrevOp1;
7625 Value *PrevOp2;
7626 do {
7627 PrevOp1 = Op1;
7628 PrevOp2 = Op2;
7629 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
7630 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
7631 // Check if we have 2 resizing shuffles - need to peek through operands
7632 // again.
7633 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7634 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7635 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7636 for (auto [Idx, I] : enumerate(CombinedMask1)) {
7637 if (I == PoisonMaskElem)
7638 continue;
7639 ExtMask1[Idx] = SV1->getMaskValue(I);
7640 }
7641 SmallBitVector UseMask1 = buildUseMask(
7642 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7643 ->getNumElements(),
7644 ExtMask1, UseMask::SecondArg);
7645 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7646 for (auto [Idx, I] : enumerate(CombinedMask2)) {
7647 if (I == PoisonMaskElem)
7648 continue;
7649 ExtMask2[Idx] = SV2->getMaskValue(I);
7650 }
7651 SmallBitVector UseMask2 = buildUseMask(
7652 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7653 ->getNumElements(),
7654 ExtMask2, UseMask::SecondArg);
7655 if (SV1->getOperand(0)->getType() ==
7656 SV2->getOperand(0)->getType() &&
7657 SV1->getOperand(0)->getType() != SV1->getType() &&
7658 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
7659 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
7660 Op1 = SV1->getOperand(0);
7661 Op2 = SV2->getOperand(0);
7662 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7663 SV1->getShuffleMask().end());
7664 int LocalVF = ShuffleMask1.size();
7665 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
7666 LocalVF = FTy->getNumElements();
7667 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7668 CombinedMask1.swap(ShuffleMask1);
7669 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7670 SV2->getShuffleMask().end());
7671 LocalVF = ShuffleMask2.size();
7672 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
7673 LocalVF = FTy->getNumElements();
7674 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7675 CombinedMask2.swap(ShuffleMask2);
7676 }
7677 }
7678 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
7679 Builder.resizeToMatch(Op1, Op2);
7680 VF = std::max(cast<VectorType>(Op1->getType())
7681 ->getElementCount()
7682 .getKnownMinValue(),
7683 cast<VectorType>(Op2->getType())
7684 ->getElementCount()
7685 .getKnownMinValue());
7686 for (int I = 0, E = Mask.size(); I < E; ++I) {
7687 if (CombinedMask2[I] != PoisonMaskElem) {
7688 assert(CombinedMask1[I] == PoisonMaskElem &&
7689 "Expected undefined mask element");
7690 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
7691 }
7692 }
7693 if (Op1 == Op2 &&
7694 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
7695 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
7696 isa<ShuffleVectorInst>(Op1) &&
7697 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7698 ArrayRef(CombinedMask1))))
7699 return Builder.createIdentity(Op1);
7700 return Builder.createShuffleVector(
7701 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
7702 CombinedMask1);
7703 }
7704 if (isa<PoisonValue>(V1))
7705 return Builder.createPoison(
7706 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
7707 SmallVector<int> NewMask(Mask.begin(), Mask.end());
7708 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
7709 assert(V1 && "Expected non-null value after looking through shuffles.");
7710
7711 if (!IsIdentity)
7712 return Builder.createShuffleVector(V1, NewMask);
7713 return Builder.createIdentity(V1);
7714 }
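  // Illustrative sketch (not part of the original source): the two-source path
  // above first splits the requested mask into one mask per operand, using the
  // first operand's vector factor VF as the boundary. A hypothetical,
  // standard-library-only version of that split:
  // \code
  // #include <utility>
  // #include <vector>
  // std::pair<std::vector<int>, std::vector<int>>
  // splitMask(const std::vector<int> &Mask, int VF) {
  //   std::vector<int> M1(Mask.size(), -1), M2(Mask.size(), -1);
  //   for (int I = 0, E = (int)Mask.size(); I < E; ++I) {
  //     if (Mask[I] < 0)
  //       continue;             // poison lane
  //     if (Mask[I] < VF)
  //       M1[I] = Mask[I];      // lane comes from the first operand
  //     else
  //       M2[I] = Mask[I] - VF; // lane comes from the second operand
  //   }
  //   return {M1, M2};
  // }
  // // Mask = {0, 5, 2, 7}, VF = 4 ==> M1 = {0, -1, 2, -1}, M2 = {-1, 1, -1, 3}
  // \endcode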
7715};
7716} // namespace
7717
7718/// Returns the cost of the shuffle instructions with the given \p Kind, vector
7719/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
7720/// subvector pattern.
7721static InstructionCost
7722 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
7723                VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
7724                TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
7725                int Index = 0, VectorType *SubTp = nullptr,
7726 ArrayRef<const Value *> Args = std::nullopt) {
7727 if (Kind != TTI::SK_PermuteTwoSrc)
7728 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7729 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7730 int NumSubElts;
7731 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
7732 Mask, NumSrcElts, NumSubElts, Index)) {
7733 if (Index + NumSubElts > NumSrcElts &&
7734 Index + NumSrcElts <= static_cast<int>(Mask.size()))
7735 return TTI.getShuffleCost(
7736             TTI::SK_InsertSubvector,
7737             FixedVectorType::get(Tp->getElementType(), Mask.size()), Mask,
7738             TTI::TCK_RecipThroughput, Index, SubTp);
7739   }
7740 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7741}
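// Illustrative sketch (not part of the original source): the special case
// above fires when a two-source mask really describes "insert a small vector
// into a larger one", which many targets can do more cheaply than a generic
// two-source permute. A simplified, hypothetical stand-in for the mask check:
// \code
// #include <vector>
// bool looksLikeInsertSubvector(const std::vector<int> &Mask, int NumSrcElts,
//                               int Index, int NumSubElts) {
//   for (int I = 0, E = (int)Mask.size(); I < E; ++I) {
//     if (Mask[I] < 0)
//       continue; // poison lane matches anything
//     bool InSub = I >= Index && I < Index + NumSubElts;
//     if (InSub ? Mask[I] != NumSrcElts + (I - Index) : Mask[I] != I)
//       return false;
//   }
//   return true;
// }
// // Mask = {0, 1, 8, 9, 4, 5, 6, 7}, NumSrcElts = 8, Index = 2, NumSubElts = 2
// //   ==> true: lanes 2..3 are the inserted subvector.
// \endcode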
7742
7743/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
7744static std::pair<InstructionCost, InstructionCost>
7746 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
7747 Type *ScalarTy, VectorType *VecTy) {
7748 InstructionCost ScalarCost = 0;
7749 InstructionCost VecCost = 0;
7750 // Here we differentiate two cases: (1) when Ptrs represent a regular
7751 // vectorization tree node (as they are pointer arguments of scattered
7752 // loads) or (2) when Ptrs are the arguments of loads or stores being
7753  // vectorized as a plain wide unit-stride load/store, since all the
7754 // loads/stores are known to be from/to adjacent locations.
7755 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7756 // Case 2: estimate costs for pointer related costs when vectorizing to
7757 // a wide load/store.
7758 // Scalar cost is estimated as a set of pointers with known relationship
7759 // between them.
7760 // For vector code we will use BasePtr as argument for the wide load/store
7761 // but we also need to account all the instructions which are going to
7762 // stay in vectorized code due to uses outside of these scalar
7763 // loads/stores.
7764 ScalarCost = TTI.getPointersChainCost(
7765 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7766 CostKind);
7767
7768 SmallVector<const Value *> PtrsRetainedInVecCode;
7769 for (Value *V : Ptrs) {
7770 if (V == BasePtr) {
7771 PtrsRetainedInVecCode.push_back(V);
7772 continue;
7773 }
7774 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7775      // For simplicity, assume Ptr stays in vectorized code if it is not a
7776      // GEP instruction. We don't care, since its cost is considered free.
7777 // TODO: We should check for any uses outside of vectorizable tree
7778 // rather than just single use.
7779 if (!Ptr || !Ptr->hasOneUse())
7780 PtrsRetainedInVecCode.push_back(V);
7781 }
7782
7783 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
7784 // If all pointers stay in vectorized code then we don't have
7785 // any savings on that.
7786 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
7787 }
7788 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
7789 TTI::PointersChainInfo::getKnownStride(),
7790 VecTy, CostKind);
7791 } else {
7792 // Case 1: Ptrs are the arguments of loads that we are going to transform
7793 // into masked gather load intrinsic.
7794 // All the scalar GEPs will be removed as a result of vectorization.
7795    // For any external uses of some lanes, extractelement instructions will
7796    // be generated (their cost is estimated separately).
7797 TTI::PointersChainInfo PtrsInfo =
7798 all_of(Ptrs,
7799 [](const Value *V) {
7800 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7801 return Ptr && !Ptr->hasAllConstantIndices();
7802 })
7803 ? TTI::PointersChainInfo::getUnknownStride()
7804 : TTI::PointersChainInfo::getKnownStride();
7805
7806 ScalarCost =
7807 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
7808 if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
7809 SmallVector<const Value *> Indices(BaseGEP->indices());
7810 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
7811 BaseGEP->getPointerOperand(), Indices, VecTy,
7812 CostKind);
7813 }
7814 }
7815
7816 return std::make_pair(ScalarCost, VecCost);
7817}
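// Illustrative sketch (not part of the original source): for case (2) above,
// the saving comes from the scalar address chain mostly disappearing, while
// the vector code keeps only the base pointer plus any address that still has
// outside users. A toy model with purely hypothetical unit costs:
// \code
// int gepSavings(int NumPtrs, int NumPtrsWithOtherUses, int UnitGEPCost = 1) {
//   int ScalarChain = NumPtrs * UnitGEPCost;                 // all scalar GEPs
//   int Retained = (1 + NumPtrsWithOtherUses) * UnitGEPCost; // base + escapes
//   return ScalarChain - Retained; // > 0: vectorization saves address math
// }
// // gepSavings(/*NumPtrs=*/8, /*NumPtrsWithOtherUses=*/0) == 7
// \endcode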
7818
7819void BoUpSLP::transformNodes() {
7820  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7821  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7822 TreeEntry &E = *TE.get();
7823 switch (E.getOpcode()) {
7824 case Instruction::Load: {
7825 Type *ScalarTy = E.getMainOp()->getType();
7826 auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
7827 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
7828 // Check if profitable to represent consecutive load + reverse as strided
7829 // load with stride -1.
7830 if (isReverseOrder(E.ReorderIndices) &&
7831 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
7832 SmallVector<int> Mask;
7833 inversePermutation(E.ReorderIndices, Mask);
7834 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
7835 InstructionCost OriginalVecCost =
7836            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
7837                                 BaseLI->getPointerAddressSpace(), CostKind,
7838                                 TTI::OperandValueInfo()) +
7839            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
7840        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
7841            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
7842 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
7843 if (StridedCost < OriginalVecCost)
7844 // Strided load is more profitable than consecutive load + reverse -
7845 // transform the node to strided load.
7846 E.State = TreeEntry::StridedVectorize;
7847 }
7848 break;
7849 }
7850 default:
7851 break;
7852 }
7853 }
7854}
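// Illustrative sketch (not part of the original source): the transformation
// above compares "contiguous wide load + reverse shuffle" against a single
// strided load with stride -1 and keeps the cheaper form. The cost inputs
// below are hypothetical placeholders for the TTI queries used above.
// \code
// #include <numeric>
// #include <vector>
// bool preferStridedLoad(int WideLoadCost, int ReverseShuffleCost,
//                        int StridedLoadCost) {
//   return StridedLoadCost < WideLoadCost + ReverseShuffleCost;
// }
// // A reverse ReorderIndices for VF lanes inverts to the mask {VF-1, ..., 0}:
// std::vector<int> reverseMask(int VF) {
//   std::vector<int> M(VF);
//   std::iota(M.rbegin(), M.rend(), 0); // M = {VF-1, ..., 1, 0}
//   return M;
// }
// \endcode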
7855
7856/// Merges shuffle masks and emits the final shuffle instruction, if required.
7857/// It supports shuffling of 2 input vectors. It implements lazy shuffle
7858/// emission: the actual shuffle instruction is generated only if it is really
7859/// required. Otherwise, the shuffle instruction emission is delayed until the
7860/// end of the process, to reduce the number of emitted instructions and to
7861/// simplify further analysis/transformations.
7862class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7863 bool IsFinalized = false;
7864 SmallVector<int> CommonMask;
7866 const TargetTransformInfo &TTI;
7868 SmallDenseSet<Value *> VectorizedVals;
7869 BoUpSLP &R;
7870 SmallPtrSetImpl<Value *> &CheckedExtracts;
7871 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7872  /// While set, we are still trying to estimate the cost for the same nodes and
7873  /// can delay the actual cost estimation (virtual shuffle instruction emission).
7874  /// This may help to better estimate the cost if the same nodes must be
7875  /// permuted, and it allows moving most of the long shuffle cost estimation to TTI.
7876 bool SameNodesEstimated = true;
7877
7878 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
7879 if (Ty->getScalarType()->isPointerTy()) {
7883 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
7884 Ty->getScalarType());
7885 if (auto *VTy = dyn_cast<VectorType>(Ty))
7886 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
7887 return Res;
7888 }
7889 return Constant::getAllOnesValue(Ty);
7890 }
7891
7892 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
7893 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
7894 return TTI::TCC_Free;
7895 auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
7896 InstructionCost GatherCost = 0;
7897 SmallVector<Value *> Gathers(VL.begin(), VL.end());
7898 // Improve gather cost for gather of loads, if we can group some of the
7899 // loads into vector loads.
7900 InstructionsState S = getSameOpcode(VL, *R.TLI);
7901 const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
7902 unsigned MinVF = R.getMinVF(2 * Sz);
7903 if (VL.size() > 2 &&
7904 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
7905 (InVectors.empty() &&
7906 any_of(seq<unsigned>(0, VL.size() / MinVF),
7907 [&](unsigned Idx) {
7908 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
7909 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
7910 return S.getOpcode() == Instruction::Load &&
7911 !S.isAltShuffle();
7912 }))) &&
7913 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
7914 !isSplat(Gathers)) {
7915 InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
7916 SetVector<Value *> VectorizedLoads;
7917      SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
7918      SmallVector<unsigned> ScatterVectorized;
7919 unsigned StartIdx = 0;
7920 unsigned VF = VL.size() / 2;
7921 for (; VF >= MinVF; VF /= 2) {
7922 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
7923 Cnt += VF) {
7924 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7925 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
7926 InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
7927 if (SliceS.getOpcode() != Instruction::Load ||
7928 SliceS.isAltShuffle())
7929 continue;
7930 }
7931 if (!VectorizedLoads.count(Slice.front()) &&
7932 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
7933 SmallVector<Value *> PointerOps;
7934 OrdersType CurrentOrder;
7935 LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
7936 CurrentOrder, PointerOps);
7937            switch (LS) {
7938            case LoadsState::Vectorize:
7939            case LoadsState::ScatterVectorize:
7940            case LoadsState::StridedVectorize:
7941              // Mark the vectorized loads so that we don't vectorize them
7942              // again.
7943              // TODO: better handling of loads with reorders.
7944              if (((LS == LoadsState::Vectorize ||
7945                    LS == LoadsState::StridedVectorize) &&
7946                   CurrentOrder.empty()) ||
7947                  (LS == LoadsState::StridedVectorize &&
7948                   isReverseOrder(CurrentOrder)))
7949 VectorizedStarts.emplace_back(Cnt, LS);
7950 else
7951 ScatterVectorized.push_back(Cnt);
7952 VectorizedLoads.insert(Slice.begin(), Slice.end());
7953 // If we vectorized initial block, no need to try to vectorize
7954 // it again.
7955 if (Cnt == StartIdx)
7956 StartIdx += VF;
7957 break;
7958 case LoadsState::Gather:
7959 break;
7960 }
7961 }
7962 }
7963 // Check if the whole array was vectorized already - exit.
7964 if (StartIdx >= VL.size())
7965 break;
7966 // Found vectorizable parts - exit.
7967 if (!VectorizedLoads.empty())
7968 break;
7969 }
7970 if (!VectorizedLoads.empty()) {
7971 unsigned NumParts = TTI.getNumberOfParts(VecTy);
7972 bool NeedInsertSubvectorAnalysis =
7973 !NumParts || (VL.size() / VF) > NumParts;
7974 // Get the cost for gathered loads.
7975 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
7976 if (VectorizedLoads.contains(VL[I]))
7977 continue;
7978 GatherCost +=
7979 getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
7980 }
7981 // Exclude potentially vectorized loads from list of gathered
7982 // scalars.
7983 Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
7984 // The cost for vectorized loads.
7985 InstructionCost ScalarsCost = 0;
7986 for (Value *V : VectorizedLoads) {
7987 auto *LI = cast<LoadInst>(V);
7988 ScalarsCost +=
7989 TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
7990 LI->getAlign(), LI->getPointerAddressSpace(),
7991 CostKind, TTI::OperandValueInfo(), LI);
7992 }
7993 auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
7994 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
7995 auto *LI = cast<LoadInst>(VL[P.first]);
7996 Align Alignment = LI->getAlign();
7997 GatherCost +=
7998 P.second == LoadsState::Vectorize
7999 ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
8000 LI->getPointerAddressSpace(), CostKind,
8003 Instruction::Load, LoadTy, LI->getPointerOperand(),
8004 /*VariableMask=*/false, Alignment, CostKind, LI);
8005 // Estimate GEP cost.
8006 SmallVector<Value *> PointerOps(VF);
8007 for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
8008 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8009 auto [ScalarGEPCost, VectorGEPCost] =
8010 getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
8011 Instruction::Load, CostKind, LI->getType(), LoadTy);
8012 GatherCost += VectorGEPCost - ScalarGEPCost;
8013 }
8014 for (unsigned P : ScatterVectorized) {
8015 auto *LI0 = cast<LoadInst>(VL[P]);
8016 ArrayRef<Value *> Slice = VL.slice(P, VF);
8017 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8018 GatherCost += TTI.getGatherScatterOpCost(
8019 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8020 /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
8021 // Estimate GEP cost.
8022 SmallVector<Value *> PointerOps(VF);
8023 for (auto [I, V] : enumerate(Slice))
8024 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8025 OrdersType Order;
8026 if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
8027 Order)) {
8028 // TODO: improve checks if GEPs can be vectorized.
8029 Value *Ptr0 = PointerOps.front();
8030 Type *ScalarTy = Ptr0->getType();
8031 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
8032 auto [ScalarGEPCost, VectorGEPCost] =
8033 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
8034 CostKind, ScalarTy, VecTy);
8035 GatherCost += VectorGEPCost - ScalarGEPCost;
8036 if (!Order.empty()) {
8037 SmallVector<int> Mask;
8038 inversePermutation(Order, Mask);
8039              GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8040                                             VecTy, Mask, CostKind);
8041 }
8042 } else {
8043 GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true);
8044 }
8045 }
8046 if (NeedInsertSubvectorAnalysis) {
8047 // Add the cost for the subvectors insert.
8048 SmallVector<int> ShuffleMask(VL.size());
8049 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8050 for (unsigned Idx : seq<unsigned>(0, E))
8051 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8052 GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
8053 ShuffleMask, CostKind, I, LoadTy);
8054 }
8055 }
8056 GatherCost -= ScalarsCost;
8057 }
8058 GatherCost = std::min(BaseCost, GatherCost);
8059 } else if (!Root && isSplat(VL)) {
8060    // Found a broadcast of a single scalar - calculate the cost as
8061    // a broadcast.
8062 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8063 assert(It != VL.end() && "Expected at least one non-undef value.");
8064 // Add broadcast for non-identity shuffle only.
8065 bool NeedShuffle =
8066 count(VL, *It) > 1 &&
8067 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
8068 if (!NeedShuffle)
8069 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
8070 CostKind, std::distance(VL.begin(), It),
8071 PoisonValue::get(VecTy), *It);
8072
8073 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8074 transform(VL, ShuffleMask.begin(), [](Value *V) {
8075 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8076 });
8077        InstructionCost InsertCost = TTI.getVectorInstrCost(
8078            Instruction::InsertElement, VecTy, CostKind, 0,
8079 PoisonValue::get(VecTy), *It);
8080 return InsertCost +
8081           ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy,
8082                            ShuffleMask, CostKind, /*Index=*/0,
8083 /*SubTp=*/nullptr, /*Args=*/*It);
8084 }
8085 return GatherCost +
8086 (all_of(Gathers, IsaPred<UndefValue>)
8087                ? TTI::TCC_Free
8088                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
8089 };
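  // Illustrative sketch (not part of the original source): for the splat case
  // above, the gather is modeled as one insertelement plus a broadcast shuffle
  // whose mask keeps poison lanes poison and sends every other lane to element
  // 0. A hypothetical, standard-library-only version of that mask:
  // \code
  // #include <cstddef>
  // #include <vector>
  // std::vector<int> broadcastMask(const std::vector<bool> &LaneIsPoison) {
  //   std::vector<int> Mask(LaneIsPoison.size(), -1);
  //   for (std::size_t I = 0; I < LaneIsPoison.size(); ++I)
  //     if (!LaneIsPoison[I])
  //       Mask[I] = 0;
  //   return Mask;
  // }
  // // {false, false, true, false} ==> {0, 0, -1, 0}
  // \endcode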
8090
8091 /// Compute the cost of creating a vector containing the extracted values from
8092 /// \p VL.
8094 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8095 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8096 unsigned NumParts) {
8097 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8098 unsigned NumElts =
8099 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8100 auto *EE = dyn_cast<ExtractElementInst>(V);
8101 if (!EE)
8102 return Sz;
8103 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8104 if (!VecTy)
8105 return Sz;
8106 return std::max(Sz, VecTy->getNumElements());
8107 });
8108 unsigned NumSrcRegs = TTI.getNumberOfParts(
8109 FixedVectorType::get(VL.front()->getType(), NumElts));
8110 if (NumSrcRegs == 0)
8111 NumSrcRegs = 1;
8112 // FIXME: this must be moved to TTI for better estimation.
8113 unsigned EltsPerVector = PowerOf2Ceil(std::max(
8114 divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
8115 auto CheckPerRegistersShuffle =
8116 [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
8117 DenseSet<int> RegIndices;
8118      // Check if we are trying to permute the same single or two input vectors.
8119      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8120 int FirstRegId = -1;
8121 for (int &I : Mask) {
8122 if (I == PoisonMaskElem)
8123 continue;
8124 int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
8125 if (FirstRegId < 0)
8126 FirstRegId = RegId;
8127 RegIndices.insert(RegId);
8128 if (RegIndices.size() > 2)
8129 return std::nullopt;
8130 if (RegIndices.size() == 2)
8131 ShuffleKind = TTI::SK_PermuteTwoSrc;
8132 I = (I % NumElts) % EltsPerVector +
8133 (RegId == FirstRegId ? 0 : EltsPerVector);
8134 }
8135 return ShuffleKind;
8136 };
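    // Illustrative sketch (not part of the original source): the lambda above
    // maps every mask element to the hardware register it lives in and then
    // rebases it so the sub-mask refers to at most two per-register inputs. A
    // worked example with hypothetical values:
    // \code
    // int regIdOf(int I, int NumElts, int NumParts, int EltsPerVector) {
    //   return (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
    // }
    // // With NumElts = 8, NumParts = 2, EltsPerVector = 4, the mask element 13
    // // (lane 5 of the second source) gives regIdOf(13, 8, 2, 4) == 3; if that
    // // is not the first register seen, the element is remapped to
    // // (13 % 8) % 4 + EltsPerVector == 5.
    // \endcode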
8137    InstructionCost Cost = 0;
8138
8139 // Process extracts in blocks of EltsPerVector to check if the source vector
8140 // operand can be re-used directly. If not, add the cost of creating a
8141 // shuffle to extract the values into a vector register.
8142 for (unsigned Part = 0; Part < NumParts; ++Part) {
8143 if (!ShuffleKinds[Part])
8144 continue;
8145 ArrayRef<int> MaskSlice =
8146 Mask.slice(Part * EltsPerVector,
8147 (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8148 ? Mask.size() % EltsPerVector
8149 : EltsPerVector);
8150 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8151 copy(MaskSlice, SubMask.begin());
8152 std::optional<TTI::ShuffleKind> RegShuffleKind =
8153 CheckPerRegistersShuffle(SubMask);
8154 if (!RegShuffleKind) {
8155        Cost += ::getShuffleCost(
8156            TTI, *ShuffleKinds[Part],
8157 FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
8158 continue;
8159 }
8160 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8161 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
8162        Cost += ::getShuffleCost(
8163            TTI, *RegShuffleKind,
8164 FixedVectorType::get(VL.front()->getType(), EltsPerVector),
8165 SubMask);
8166 }
8167 }
8168 return Cost;
8169 }
8170 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
8171 /// shuffle emission.
8172 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8173 ArrayRef<int> Mask) {
8174 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8175 if (Mask[Idx] != PoisonMaskElem)
8176 CommonMask[Idx] = Idx;
8177 }
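  // Illustrative sketch (not part of the original source): once the shuffle for
  // \p Mask has been (virtually) emitted, every produced lane refers to itself,
  // so follow-up shuffles see an identity in those lanes. A hypothetical,
  // standard-library-only equivalent:
  // \code
  // #include <cstddef>
  // #include <vector>
  // void remapAfterShuffle(std::vector<int> &CommonMask,
  //                        const std::vector<int> &Mask) {
  //   for (std::size_t Idx = 0; Idx < CommonMask.size(); ++Idx)
  //     if (Mask[Idx] != -1)
  //       CommonMask[Idx] = (int)Idx;
  // }
  // // CommonMask = {3, -1, 0, 2}, Mask = {3, -1, 0, 2} ==> {0, -1, 2, 3}
  // \endcode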
8178  /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the given
8179  /// mask \p Mask and register number \p Part, which includes \p SliceSize
8180  /// elements.
8181 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8182 ArrayRef<int> Mask, unsigned Part,
8183 unsigned SliceSize) {
8184 if (SameNodesEstimated) {
8185      // Delay the cost estimation if the same nodes are being reshuffled.
8186 // If we already requested the cost of reshuffling of E1 and E2 before, no
8187 // need to estimate another cost with the sub-Mask, instead include this
8188 // sub-Mask into the CommonMask to estimate it later and avoid double cost
8189 // estimation.
8190 if ((InVectors.size() == 2 &&
8191 InVectors.front().get<const TreeEntry *>() == &E1 &&
8192 InVectors.back().get<const TreeEntry *>() == E2) ||
8193 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8194 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
8195 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8196 "Expected all poisoned elements.");
8197 ArrayRef<int> SubMask =
8198 ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
8199 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8200 return;
8201 }
8202 // Found non-matching nodes - need to estimate the cost for the matched
8203 // and transform mask.
8204 Cost += createShuffle(InVectors.front(),
8205 InVectors.size() == 1 ? nullptr : InVectors.back(),
8206 CommonMask);
8207 transformMaskAfterShuffle(CommonMask, CommonMask);
8208 }
8209 SameNodesEstimated = false;
8210 if (!E2 && InVectors.size() == 1) {
8211 unsigned VF = E1.getVectorFactor();
8212 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8213 VF = std::max(VF,
8214 cast<FixedVectorType>(V1->getType())->getNumElements());
8215 } else {
8216 const auto *E = InVectors.front().get<const TreeEntry *>();
8217 VF = std::max(VF, E->getVectorFactor());
8218 }
8219 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8220 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8221 CommonMask[Idx] = Mask[Idx] + VF;
8222 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8223 transformMaskAfterShuffle(CommonMask, CommonMask);
8224 } else {
8225 Cost += createShuffle(&E1, E2, Mask);
8226 transformMaskAfterShuffle(CommonMask, Mask);
8227 }
8228 }
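  // Illustrative sketch (not part of the original source): the "same nodes"
  // fast path above avoids costing another shuffle by splicing the sub-mask for
  // register \p Part into CommonMask and costing everything once later. A
  // hypothetical, standard-library-only version of the splice:
  // \code
  // #include <algorithm>
  // #include <vector>
  // void spliceSubMask(std::vector<int> &CommonMask,
  //                    const std::vector<int> &Mask,
  //                    unsigned Part, unsigned SliceSize) {
  //   std::copy(Mask.begin() + Part * SliceSize,
  //             Mask.begin() + (Part + 1) * SliceSize,
  //             CommonMask.begin() + Part * SliceSize);
  // }
  // // CommonMask = {0, 1, -1, -1}, Mask = {-1, -1, 6, 7}, Part = 1, SliceSize = 2
  // //   ==> CommonMask = {0, 1, 6, 7}
  // \endcode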
8229
8230 class ShuffleCostBuilder {
8231 const TargetTransformInfo &TTI;
8232
8233 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8234 int Index = -1;
8235 return Mask.empty() ||
8236 (VF == Mask.size() &&
8237             ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
8238            (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
8239             Index == 0);
8240 }
8241
8242 public:
8243 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8244 ~ShuffleCostBuilder() = default;
8245 InstructionCost createShuffleVector(Value *V1, Value *,
8246 ArrayRef<int> Mask) const {
8247 // Empty mask or identity mask are free.
8248 unsigned VF =
8249 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8250 if (isEmptyOrIdentity(Mask, VF))
8251 return TTI::TCC_Free;
8252 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8253 cast<VectorType>(V1->getType()), Mask);
8254 }
8255 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8256 // Empty mask or identity mask are free.
8257 unsigned VF =
8258 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8259 if (isEmptyOrIdentity(Mask, VF))
8260 return TTI::TCC_Free;
8261      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8262                              cast<VectorType>(V1->getType()), Mask);
8263 }
8264 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8265 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8266 return TTI::TCC_Free;
8267 }
8268 void resizeToMatch(Value *&, Value *&) const {}
8269 };
8270
8271 /// Smart shuffle instruction emission, walks through shuffles trees and
8272 /// tries to find the best matching vector for the actual shuffle
8273 /// instruction.
8274  InstructionCost
8275  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8276                const PointerUnion<Value *, const TreeEntry *> &P2,
8277                ArrayRef<int> Mask) {
8278 ShuffleCostBuilder Builder(TTI);
8279 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8280 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8281 unsigned CommonVF = Mask.size();
8282 if (!V1 && !V2 && !P2.isNull()) {
8283 // Shuffle 2 entry nodes.
8284 const TreeEntry *E = P1.get<const TreeEntry *>();
8285 unsigned VF = E->getVectorFactor();
8286 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8287 CommonVF = std::max(VF, E2->getVectorFactor());
8288 assert(all_of(Mask,
8289 [=](int Idx) {
8290 return Idx < 2 * static_cast<int>(CommonVF);
8291 }) &&
8292 "All elements in mask must be less than 2 * CommonVF.");
8293 if (E->Scalars.size() == E2->Scalars.size()) {
8294 SmallVector<int> EMask = E->getCommonMask();
8295 SmallVector<int> E2Mask = E2->getCommonMask();
8296 if (!EMask.empty() || !E2Mask.empty()) {
8297 for (int &Idx : CommonMask) {
8298 if (Idx == PoisonMaskElem)
8299 continue;
8300 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8301 Idx = EMask[Idx];
8302 else if (Idx >= static_cast<int>(CommonVF))
8303 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8304 E->Scalars.size();
8305 }
8306 }
8307 CommonVF = E->Scalars.size();
8308 }
8309      V1 = Constant::getNullValue(
8310          FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
8311 V2 = getAllOnesValue(
8312 *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
8313 } else if (!V1 && P2.isNull()) {
8314 // Shuffle single entry node.
8315 const TreeEntry *E = P1.get<const TreeEntry *>();
8316 unsigned VF = E->getVectorFactor();
8317 CommonVF = VF;
8318 assert(
8319 all_of(Mask,
8320 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8321 "All elements in mask must be less than CommonVF.");
8322 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8323 SmallVector<int> EMask = E->getCommonMask();
8324 assert(!EMask.empty() && "Expected non-empty common mask.");
8325 for (int &Idx : CommonMask) {
8326 if (Idx != PoisonMaskElem)
8327 Idx = EMask[Idx];
8328 }
8329 CommonVF = E->Scalars.size();
8330 }
8331      V1 = Constant::getNullValue(
8332          FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
8333 // Not identity/broadcast? Try to see if the original vector is better.
8334 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8335 CommonVF == CommonMask.size() &&
8336 any_of(enumerate(CommonMask),
8337 [](const auto &&P) {
8338 return P.value() != PoisonMaskElem &&
8339 static_cast<unsigned>(P.value()) != P.index();
8340 }) &&
8341 any_of(CommonMask,
8342 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8343 SmallVector<int> ReorderMask;
8344 inversePermutation(E->ReorderIndices, ReorderMask);
8345 ::addMask(CommonMask, ReorderMask);
8346 }
8347 } else if (V1 && P2.isNull()) {
8348 // Shuffle single vector.
8349 CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
8350 assert(
8351 all_of(Mask,
8352 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8353 "All elements in mask must be less than CommonVF.");
8354 } else if (V1 && !V2) {
8355 // Shuffle vector and tree node.
8356 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8357 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8358 CommonVF = std::max(VF, E2->getVectorFactor());
8359 assert(all_of(Mask,
8360 [=](int Idx) {
8361 return Idx < 2 * static_cast<int>(CommonVF);
8362 }) &&
8363 "All elements in mask must be less than 2 * CommonVF.");
8364 if (E2->Scalars.size() == VF && VF != CommonVF) {
8365 SmallVector<int> E2Mask = E2->getCommonMask();
8366 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8367 for (int &Idx : CommonMask) {
8368 if (Idx == PoisonMaskElem)
8369 continue;
8370 if (Idx >= static_cast<int>(CommonVF))
8371 Idx = E2Mask[Idx - CommonVF] + VF;
8372 }
8373 CommonVF = VF;
8374 }
8375      V1 = Constant::getNullValue(
8376          FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
8377 V2 = getAllOnesValue(
8378 *R.DL,
8379 FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
8380 } else if (!V1 && V2) {
8381 // Shuffle vector and tree node.
8382 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8383 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8384 CommonVF = std::max(VF, E1->getVectorFactor());
8385 assert(all_of(Mask,
8386 [=](int Idx) {
8387 return Idx < 2 * static_cast<int>(CommonVF);
8388 }) &&
8389 "All elements in mask must be less than 2 * CommonVF.");
8390 if (E1->Scalars.size() == VF && VF != CommonVF) {
8391 SmallVector<int> E1Mask = E1->getCommonMask();
8392 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8393 for (int &Idx : CommonMask) {
8394 if (Idx == PoisonMaskElem)
8395 continue;
8396 if (Idx >= static_cast<int>(CommonVF))
8397 Idx = E1Mask[Idx - CommonVF] + VF;
8398 else
8399 Idx = E1Mask[Idx];
8400 }
8401 CommonVF = VF;
8402 }
8403      V1 = Constant::getNullValue(
8404          FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
8405 V2 = getAllOnesValue(
8406 *R.DL,
8407 FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
8408 } else {
8409 assert(V1 && V2 && "Expected both vectors.");
8410 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8411 CommonVF =
8412 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8413 assert(all_of(Mask,
8414 [=](int Idx) {
8415 return Idx < 2 * static_cast<int>(CommonVF);
8416 }) &&
8417 "All elements in mask must be less than 2 * CommonVF.");
8418 if (V1->getType() != V2->getType()) {
8419        V1 = Constant::getNullValue(FixedVectorType::get(
8420            cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
8421 V2 = getAllOnesValue(
8422 *R.DL, FixedVectorType::get(
8423 cast<FixedVectorType>(V1->getType())->getElementType(),
8424 CommonVF));
8425 }
8426 }
8428 cast<FixedVectorType>(V1->getType())->getElementType(),
8429 CommonMask.size()));
8430 if (InVectors.size() == 2)
8431 InVectors.pop_back();
8432 return BaseShuffleAnalysis::createShuffle<InstructionCost>(
8433 V1, V2, CommonMask, Builder);
8434 }
8435
8436public:
8438 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8439 SmallPtrSetImpl<Value *> &CheckedExtracts)
8440 : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
8441 R(R), CheckedExtracts(CheckedExtracts) {}
8442 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8443 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8444 unsigned NumParts, bool &UseVecBaseAsInput) {
8445 UseVecBaseAsInput = false;
8446 if (Mask.empty())
8447 return nullptr;
8448 Value *VecBase = nullptr;
8449 ArrayRef<Value *> VL = E->Scalars;
8450 // If the resulting type is scalarized, do not adjust the cost.
8451 if (NumParts == VL.size())
8452 return nullptr;
8453    // Check if it can be considered reused, i.e. if the same extractelements
8454    // were vectorized already.
8455 bool PrevNodeFound = any_of(
8456 ArrayRef(R.VectorizableTree).take_front(E->Idx),
8457 [&](const std::unique_ptr<TreeEntry> &TE) {
8458 return ((!TE->isAltShuffle() &&
8459 TE->getOpcode() == Instruction::ExtractElement) ||
8460 TE->State == TreeEntry::NeedToGather) &&
8461 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8462 return VL.size() > Data.index() &&
8463 (Mask[Data.index()] == PoisonMaskElem ||
8464 isa<UndefValue>(VL[Data.index()]) ||
8465 Data.value() == VL[Data.index()]);
8466 });
8467 });
8468 SmallPtrSet<Value *, 4> UniqueBases;
8469 unsigned SliceSize = VL.size() / NumParts;
8470 for (unsigned Part = 0; Part < NumParts; ++Part) {
8471 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8472 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
8473 // Ignore non-extractelement scalars.
8474 if (isa<UndefValue>(V) ||
8475 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8476 continue;
8477        // If all users of the instruction are going to be vectorized and this
8478 // instruction itself is not going to be vectorized, consider this
8479 // instruction as dead and remove its cost from the final cost of the
8480 // vectorized tree.
8481 // Also, avoid adjusting the cost for extractelements with multiple uses
8482 // in different graph entries.
8483 auto *EE = cast<ExtractElementInst>(V);
8484 VecBase = EE->getVectorOperand();
8485 UniqueBases.insert(VecBase);
8486 const TreeEntry *VE = R.getTreeEntry(V);
8487 if (!CheckedExtracts.insert(V).second ||
8488 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8489 (VE && VE != E))
8490 continue;
8491 std::optional<unsigned> EEIdx = getExtractIndex(EE);
8492 if (!EEIdx)
8493 continue;
8494 unsigned Idx = *EEIdx;
8495 // Take credit for instruction that will become dead.
8496 if (EE->hasOneUse() || !PrevNodeFound) {
8497 Instruction *Ext = EE->user_back();
8498 if (isa<SExtInst, ZExtInst>(Ext) &&
8499 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8500 // Use getExtractWithExtendCost() to calculate the cost of
8501 // extractelement/ext pair.
8502 Cost -=
8503 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
8504 EE->getVectorOperandType(), Idx);
8505 // Add back the cost of s|zext which is subtracted separately.
8507 Ext->getOpcode(), Ext->getType(), EE->getType(),
8508 TTI::getCastContextHint(Ext), CostKind, Ext);
8509 continue;
8510 }
8511 }
8512 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
8513 CostKind, Idx);
8514 }
8515 }
8516    // Check that the gather of extractelements can be represented as just a
8517    // shuffle of a single vector, or of the two vectors the scalars are
8518    // extracted from. We have found a bunch of extractelement instructions
8519    // that must be gathered into a vector and can be represented as a
8520    // permutation of the elements of one or two input vectors.
8521    // The extract cost is skipped (reuse) if the same extractelements were vectorized already.
8522 if (!PrevNodeFound)
8523 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8524 InVectors.assign(1, E);
8525 CommonMask.assign(Mask.begin(), Mask.end());
8526 transformMaskAfterShuffle(CommonMask, CommonMask);
8527 SameNodesEstimated = false;
8528 if (NumParts != 1 && UniqueBases.size() != 1) {
8529 UseVecBaseAsInput = true;
8530 VecBase = Constant::getNullValue(
8531 FixedVectorType::get(VL.front()->getType(), CommonMask.size()));
8532 }
8533 return VecBase;
8534 }
8535 /// Checks if the specified entry \p E needs to be delayed because of its
8536 /// dependency nodes.
8537 std::optional<InstructionCost>
8538 needToDelay(const TreeEntry *,
8540 // No need to delay the cost estimation during analysis.
8541 return std::nullopt;
8542 }
8543 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
8544 if (&E1 == &E2) {
8545 assert(all_of(Mask,
8546 [&](int Idx) {
8547 return Idx < static_cast<int>(E1.getVectorFactor());
8548 }) &&
8549 "Expected single vector shuffle mask.");
8550 add(E1, Mask);
8551 return;
8552 }
8553 if (InVectors.empty()) {
8554 CommonMask.assign(Mask.begin(), Mask.end());
8555 InVectors.assign({&E1, &E2});
8556 return;
8557 }
8558 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8559 auto *MaskVecTy =
8560 FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
8561 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8562 if (NumParts == 0 || NumParts >= Mask.size())
8563 NumParts = 1;
8564 unsigned SliceSize = Mask.size() / NumParts;
8565 const auto *It =
8566 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8567 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8568 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8569 }
8570 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
8571 if (InVectors.empty()) {
8572 CommonMask.assign(Mask.begin(), Mask.end());
8573 InVectors.assign(1, &E1);
8574 return;
8575 }
8576 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8577 auto *MaskVecTy =
8578 FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
8579 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8580 if (NumParts == 0 || NumParts >= Mask.size())
8581 NumParts = 1;
8582 unsigned SliceSize = Mask.size() / NumParts;
8583 const auto *It =
8584 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8585 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8586 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
8587 if (!SameNodesEstimated && InVectors.size() == 1)
8588 InVectors.emplace_back(&E1);
8589 }
8590 /// Adds 2 input vectors and the mask for their shuffling.
8591 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
8592    // This may only happen for the shuffling of 2 vectors with extractelements,
8593    // which is already handled in adjustExtracts.
8594 assert(InVectors.size() == 1 &&
8595 all_of(enumerate(CommonMask),
8596 [&](auto P) {
8597 if (P.value() == PoisonMaskElem)
8598 return Mask[P.index()] == PoisonMaskElem;
8599 auto *EI =
8600 cast<ExtractElementInst>(InVectors.front()
8601 .get<const TreeEntry *>()
8602 ->Scalars[P.index()]);
8603 return EI->getVectorOperand() == V1 ||
8604 EI->getVectorOperand() == V2;
8605 }) &&
8606 "Expected extractelement vectors.");
8607 }
8608 /// Adds another one input vector and the mask for the shuffling.
8609 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
8610 if (InVectors.empty()) {
8611 assert(CommonMask.empty() && !ForExtracts &&
8612 "Expected empty input mask/vectors.");
8613 CommonMask.assign(Mask.begin(), Mask.end());
8614 InVectors.assign(1, V1);
8615 return;
8616 }
8617 if (ForExtracts) {
8618 // No need to add vectors here, already handled them in adjustExtracts.
8619 assert(InVectors.size() == 1 &&
8620 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
8621 all_of(enumerate(CommonMask),
8622 [&](auto P) {
8623 Value *Scalar = InVectors.front()
8624 .get<const TreeEntry *>()
8625 ->Scalars[P.index()];
8626 if (P.value() == PoisonMaskElem)
8627 return P.value() == Mask[P.index()] ||
8628 isa<UndefValue>(Scalar);
8629 if (isa<Constant>(V1))
8630 return true;
8631 auto *EI = cast<ExtractElementInst>(Scalar);
8632 return EI->getVectorOperand() == V1;
8633 }) &&
8634 "Expected only tree entry for extractelement vectors.");
8635 return;
8636 }
8637 assert(!InVectors.empty() && !CommonMask.empty() &&
8638 "Expected only tree entries from extracts/reused buildvectors.");
8639 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8640 if (InVectors.size() == 2) {
8641 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
8642 transformMaskAfterShuffle(CommonMask, CommonMask);
8643 VF = std::max<unsigned>(VF, CommonMask.size());
8644 } else if (const auto *InTE =
8645 InVectors.front().dyn_cast<const TreeEntry *>()) {
8646 VF = std::max(VF, InTE->getVectorFactor());
8647 } else {
8648 VF = std::max(
8649 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
8650 ->getNumElements());
8651 }
8652 InVectors.push_back(V1);
8653 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8654 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8655 CommonMask[Idx] = Mask[Idx] + VF;
8656 }
8657 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
8658 Value *Root = nullptr) {
8659 Cost += getBuildVectorCost(VL, Root);
8660 if (!Root) {
8661 // FIXME: Need to find a way to avoid use of getNullValue here.
8662      SmallVector<Constant *> Vals;
8663      unsigned VF = VL.size();
8664 if (MaskVF != 0)
8665 VF = std::min(VF, MaskVF);
8666 for (Value *V : VL.take_front(VF)) {
8667 if (isa<UndefValue>(V)) {
8668 Vals.push_back(cast<Constant>(V));
8669 continue;
8670 }
8671 Vals.push_back(Constant::getNullValue(V->getType()));
8672 }
8673 return ConstantVector::get(Vals);
8674 }
8675    return ConstantVector::getSplat(
8676        ElementCount::getFixed(
8677            cast<FixedVectorType>(Root->getType())->getNumElements()),
8678 getAllOnesValue(*R.DL, VL.front()->getType()));
8679 }
8681 /// Finalize emission of the shuffles.
8682  InstructionCost
8683  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
8684 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
8685 IsFinalized = true;
8686 if (Action) {
8687 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
8688 if (InVectors.size() == 2)
8689 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
8690 else
8691 Cost += createShuffle(Vec, nullptr, CommonMask);
8692 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8693 if (CommonMask[Idx] != PoisonMaskElem)
8694 CommonMask[Idx] = Idx;
8695 assert(VF > 0 &&
8696 "Expected vector length for the final value before action.");
8697 Value *V = Vec.get<Value *>();
8698 Action(V, CommonMask);
8699 InVectors.front() = V;
8700 }
8701 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
8702 if (CommonMask.empty()) {
8703 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
8704 return Cost;
8705 }
8706 return Cost +
8707 createShuffle(InVectors.front(),
8708 InVectors.size() == 2 ? InVectors.back() : nullptr,
8709 CommonMask);
8710 }
8711
8713 assert((IsFinalized || CommonMask.empty()) &&
8714 "Shuffle construction must be finalized.");
8715 }
8716};
8717
8718const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
8719 unsigned Idx) const {
8720 Value *Op = E->getOperand(Idx).front();
8721 if (const TreeEntry *TE = getTreeEntry(Op)) {
8722 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8723 return EI.EdgeIdx == Idx && EI.UserTE == E;
8724 }) != TE->UserTreeIndices.end())
8725 return TE;
8726 auto MIt = MultiNodeScalars.find(Op);
8727 if (MIt != MultiNodeScalars.end()) {
8728 for (const TreeEntry *TE : MIt->second) {
8729 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8730 return EI.EdgeIdx == Idx && EI.UserTE == E;
8731 }) != TE->UserTreeIndices.end())
8732 return TE;
8733 }
8734 }
8735 }
8736 const auto *It =
8737 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8738 return TE->State == TreeEntry::NeedToGather &&
8739 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8740 return EI.EdgeIdx == Idx && EI.UserTE == E;
8741 }) != TE->UserTreeIndices.end();
8742 });
8743 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
8744 return It->get();
8745}
8746
8747TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
8748 if (TE.State == TreeEntry::ScatterVectorize ||
8749      TE.State == TreeEntry::StridedVectorize)
8750    return TTI::CastContextHint::GatherScatter;
8751  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
8752      !TE.isAltShuffle()) {
8753    if (TE.ReorderIndices.empty())
8754      return TTI::CastContextHint::Normal;
8755    SmallVector<int> Mask;
8756    inversePermutation(TE.ReorderIndices, Mask);
8757    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
8758      return TTI::CastContextHint::Reversed;
8759  }
8760  return TTI::CastContextHint::None;
8761}
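// Illustrative sketch (not part of the original source): the reverse-order
// check above treats a load node whose reorder indices invert to the mask
// {VF-1, ..., 1, 0} as a reversed memory access. A simplified, hypothetical
// stand-in for that check (the real one also tolerates poison lanes):
// \code
// #include <cstddef>
// #include <vector>
// bool isStrictReverseMask(const std::vector<int> &Mask) {
//   for (std::size_t I = 0; I < Mask.size(); ++I)
//     if (Mask[I] != (int)(Mask.size() - 1 - I))
//       return false;
//   return true;
// }
// // isStrictReverseMask({3, 2, 1, 0}) == true
// // isStrictReverseMask({0, 1, 3, 2}) == false
// \endcode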
8762
8763/// Builds the arguments types vector for the given call instruction with the
8764/// given \p ID for the specified vector factor.
8766 const Intrinsic::ID ID,
8767 const unsigned VF,
8768 unsigned MinBW) {
8769 SmallVector<Type *> ArgTys;
8770 for (auto [Idx, Arg] : enumerate(CI->args())) {
8773 ArgTys.push_back(Arg->getType());
8774 continue;
8775 }
8776 if (MinBW > 0) {
8778 IntegerType::get(CI->getContext(), MinBW), VF));
8779 continue;
8780 }
8781 }
8782 ArgTys.push_back(FixedVectorType::get(Arg->getType(), VF));
8783 }
8784 return ArgTys;
8785}
8786
8788BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
8789 SmallPtrSetImpl<Value *> &CheckedExtracts) {
8790 ArrayRef<Value *> VL = E->Scalars;
8791
8792 Type *ScalarTy = VL[0]->getType();
8793 if (E->State != TreeEntry::NeedToGather) {
8794 if (auto *SI = dyn_cast<StoreInst>(VL[0]))
8795 ScalarTy = SI->getValueOperand()->getType();
8796 else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
8797 ScalarTy = CI->getOperand(0)->getType();
8798 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
8799 ScalarTy = IE->getOperand(1)->getType();
8800 }
8801 if (!isValidElementType(ScalarTy))
8803 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
8805
8806 // If we have computed a smaller type for the expression, update VecTy so
8807 // that the costs will be accurate.
8808 auto It = MinBWs.find(E);
8809 Type *OrigScalarTy = ScalarTy;
8810 if (It != MinBWs.end()) {
8811 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
8812 VecTy = FixedVectorType::get(ScalarTy, VL.size());
8813 }
8814 unsigned EntryVF = E->getVectorFactor();
8815 auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
8816
8817 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
8818 if (E->State == TreeEntry::NeedToGather) {
8819 if (allConstant(VL))
8820 return 0;
8821 if (isa<InsertElementInst>(VL[0]))
8823 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
8824 E, *TTI, VectorizedVals, *this, CheckedExtracts);
8825 }
8826 InstructionCost CommonCost = 0;
8828 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
8829 if (!E->ReorderIndices.empty() &&
8830 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
8831 SmallVector<int> NewMask;
8832 if (E->getOpcode() == Instruction::Store) {
8833 // For stores the order is actually a mask.
8834 NewMask.resize(E->ReorderIndices.size());
8835 copy(E->ReorderIndices, NewMask.begin());
8836 } else {
8837 inversePermutation(E->ReorderIndices, NewMask);
8838 }
8839 ::addMask(Mask, NewMask);
8840 }
8841 if (NeedToShuffleReuses)
8842 ::addMask(Mask, E->ReuseShuffleIndices);
8843 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
8844 CommonCost =
8845 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
8846 assert((E->State == TreeEntry::Vectorize ||
8847 E->State == TreeEntry::ScatterVectorize ||
8848 E->State == TreeEntry::StridedVectorize) &&
8849 "Unhandled state");
8850 assert(E->getOpcode() &&
8851 ((allSameType(VL) && allSameBlock(VL)) ||
8852 (E->getOpcode() == Instruction::GetElementPtr &&
8853 E->getMainOp()->getType()->isPointerTy())) &&
8854 "Invalid VL");
8855 Instruction *VL0 = E->getMainOp();
8856 unsigned ShuffleOrOp =
8857 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
8858 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
8859 const unsigned Sz = UniqueValues.size();
8860 SmallBitVector UsedScalars(Sz, false);
8861 for (unsigned I = 0; I < Sz; ++I) {
8862 if (getTreeEntry(UniqueValues[I]) == E)
8863 continue;
8864 UsedScalars.set(I);
8865 }
8866 auto GetCastContextHint = [&](Value *V) {
8867 if (const TreeEntry *OpTE = getTreeEntry(V))
8868 return getCastContextHint(*OpTE);
8869 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
8870 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
8873 };
8874 auto GetCostDiff =
8875 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
8877 // Calculate the cost of this instruction.
8878 InstructionCost ScalarCost = 0;
8879 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
8880        // For some of the instructions there is no need to calculate the cost
8881        // for each particular instruction; we can use the cost of a single
8882        // instruction times the total number of scalar instructions.
8883 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
8884 } else {
8885 for (unsigned I = 0; I < Sz; ++I) {
8886 if (UsedScalars.test(I))
8887 continue;
8888 ScalarCost += ScalarEltCost(I);
8889 }
8890 }
8891
8892 InstructionCost VecCost = VectorCost(CommonCost);
8893 // Check if the current node must be resized, if the parent node is not
8894 // resized.
8895 if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
8896 const EdgeInfo &EI = E->UserTreeIndices.front();
8897 if ((EI.UserTE->getOpcode() != Instruction::Select ||
8898 EI.EdgeIdx != 0) &&
8899 It != MinBWs.end()) {
8900 auto UserBWIt = MinBWs.find(EI.UserTE);
8901 Type *UserScalarTy =
8902 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
8903 if (UserBWIt != MinBWs.end())
8904 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
8905 UserBWIt->second.first);
8906 if (ScalarTy != UserScalarTy) {
8907 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
8908 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
8909 unsigned VecOpcode;
8910 auto *UserVecTy =
8911 FixedVectorType::get(UserScalarTy, E->getVectorFactor());
8912 if (BWSz > SrcBWSz)
8913 VecOpcode = Instruction::Trunc;
8914 else
8915 VecOpcode =
8916 It->second.second ? Instruction::SExt : Instruction::ZExt;
8917 TTI::CastContextHint CCH = GetCastContextHint(VL0);
8918 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
8919 CostKind);
8920 }
8921 }
8922 }
8923 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
8924 ScalarCost, "Calculated costs for Tree"));
8925 return VecCost - ScalarCost;
8926 };
8927 // Calculate cost difference from vectorizing set of GEPs.
8928 // Negative value means vectorizing is profitable.
8929 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
8930 assert((E->State == TreeEntry::Vectorize ||
8931 E->State == TreeEntry::StridedVectorize) &&
8932 "Entry state expected to be Vectorize or StridedVectorize here.");
8933 InstructionCost ScalarCost = 0;
8934 InstructionCost VecCost = 0;
8935 std::tie(ScalarCost, VecCost) = getGEPCosts(
8936 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
8937 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
8938 "Calculated GEPs cost for Tree"));
8939
8940 return VecCost - ScalarCost;
8941 };
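  // Illustrative note (not part of the original source): both cost lambdas
  // above return VecCost - ScalarCost, so a negative result means the vector
  // form is cheaper. With purely hypothetical numbers:
  // \code
  // int costDiff(int VecCost, int ScalarCost) { return VecCost - ScalarCost; }
  // // costDiff(/*VecCost=*/1, /*ScalarCost=*/4) == -3 -> profitable to vectorize
  // \endcode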
8942
8943 switch (ShuffleOrOp) {
8944 case Instruction::PHI: {
8945 // Count reused scalars.
8946 InstructionCost ScalarCost = 0;
8948 for (Value *V : UniqueValues) {
8949 auto *PHI = dyn_cast<PHINode>(V);
8950 if (!PHI)
8951 continue;
8952
8953 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
8954 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
8955 Value *Op = PHI->getIncomingValue(I);
8956 Operands[I] = Op;
8957 }
8958 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
8959 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
8960 if (!OpTE->ReuseShuffleIndices.empty())
8961 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
8962 OpTE->Scalars.size());
8963 }
8964
8965 return CommonCost - ScalarCost;
8966 }
8967 case Instruction::ExtractValue:
8968 case Instruction::ExtractElement: {
8969 auto GetScalarCost = [&](unsigned Idx) {
8970 auto *I = cast<Instruction>(UniqueValues[Idx]);
8971 VectorType *SrcVecTy;
8972 if (ShuffleOrOp == Instruction::ExtractElement) {
8973 auto *EE = cast<ExtractElementInst>(I);
8974 SrcVecTy = EE->getVectorOperandType();
8975 } else {
8976 auto *EV = cast<ExtractValueInst>(I);
8977 Type *AggregateTy = EV->getAggregateOperand()->getType();
8978 unsigned NumElts;
8979 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
8980 NumElts = ATy->getNumElements();
8981 else
8982 NumElts = AggregateTy->getStructNumElements();
8983 SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts);
8984 }
8985 if (I->hasOneUse()) {
8986 Instruction *Ext = I->user_back();
8987 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
8988 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8989 // Use getExtractWithExtendCost() to calculate the cost of
8990 // extractelement/ext pair.
8992 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
8993 // Subtract the cost of s|zext which is subtracted separately.
8995 Ext->getOpcode(), Ext->getType(), I->getType(),
8997 return Cost;
8998 }
8999 }
9000 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
9002 };
9003 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9004 return GetCostDiff(GetScalarCost, GetVectorCost);
9005 }
9006 case Instruction::InsertElement: {
9007 assert(E->ReuseShuffleIndices.empty() &&
9008 "Unique insertelements only are expected.");
9009 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
9010 unsigned const NumElts = SrcVecTy->getNumElements();
9011 unsigned const NumScalars = VL.size();
9012
9013 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
9014
9015 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9016 unsigned OffsetBeg = *getInsertIndex(VL.front());
9017 unsigned OffsetEnd = OffsetBeg;
9018 InsertMask[OffsetBeg] = 0;
9019 for (auto [I, V] : enumerate(VL.drop_front())) {
9020 unsigned Idx = *getInsertIndex(V);
9021 if (OffsetBeg > Idx)
9022 OffsetBeg = Idx;
9023 else if (OffsetEnd < Idx)
9024 OffsetEnd = Idx;
9025 InsertMask[Idx] = I + 1;
9026 }
9027 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
9028 if (NumOfParts > 0)
9029 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9030 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9031 VecScalarsSz;
9032 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9033 unsigned InsertVecSz = std::min<unsigned>(
9034 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
9035 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9036 bool IsWholeSubvector =
9037 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9038 // Check if we can safely insert a subvector. If it is not possible, just
9039 // generate a whole-sized vector and shuffle the source vector and the new
9040 // subvector.
9041 if (OffsetBeg + InsertVecSz > VecSz) {
9042 // Align OffsetBeg to generate correct mask.
9043 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
9044 InsertVecSz = VecSz;
9045 }
9046
9047 APInt DemandedElts = APInt::getZero(NumElts);
9048 // TODO: Add support for Instruction::InsertValue.
9050 if (!E->ReorderIndices.empty()) {
9051 inversePermutation(E->ReorderIndices, Mask);
9052 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
9053 } else {
9054 Mask.assign(VecSz, PoisonMaskElem);
9055 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9056 }
9057 bool IsIdentity = true;
9058 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9059 Mask.swap(PrevMask);
9060 for (unsigned I = 0; I < NumScalars; ++I) {
9061 unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
9062 DemandedElts.setBit(InsertIdx);
9063 IsIdentity &= InsertIdx - OffsetBeg == I;
9064 Mask[InsertIdx - OffsetBeg] = I;
9065 }
9066 assert(Offset < NumElts && "Failed to find vector index offset");
9067
9069 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
9070 /*Insert*/ true, /*Extract*/ false,
9071 CostKind);
9072
9073 // First cost - resize to actual vector size if not identity shuffle or
9074 // need to shift the vector.
9075 // Do not calculate the cost if the actual size is the register size and
9076 // we can merge this shuffle with the following SK_Select.
9077 auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz);
9078      if (!IsIdentity)
9079        Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
9080                                 InsertVecTy, Mask);
9081 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9082 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9083 }));
9084 // Second cost - permutation with subvector, if some elements are from the
9085 // initial vector or inserting a subvector.
9086 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9087 // subvector of ActualVecTy.
9088 SmallBitVector InMask =
9089 isUndefVector(FirstInsert->getOperand(0),
9090 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9091 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9092 if (InsertVecSz != VecSz) {
9093          auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz);
9094          Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy,
9095                                   std::nullopt, CostKind, OffsetBeg - Offset,
9096 InsertVecTy);
9097 } else {
9098 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9099 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
9100 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9101 I <= End; ++I)
9102 if (Mask[I] != PoisonMaskElem)
9103 Mask[I] = I + VecSz;
9104 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9105 Mask[I] =
9106 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
9107 Cost +=
9108 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
9109 }
9110 }
9111 return Cost;
9112 }
9113 case Instruction::ZExt:
9114 case Instruction::SExt:
9115 case Instruction::FPToUI:
9116 case Instruction::FPToSI:
9117 case Instruction::FPExt:
9118 case Instruction::PtrToInt:
9119 case Instruction::IntToPtr:
9120 case Instruction::SIToFP:
9121 case Instruction::UIToFP:
9122 case Instruction::Trunc:
9123 case Instruction::FPTrunc:
9124 case Instruction::BitCast: {
9125 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9126 Type *SrcScalarTy = VL0->getOperand(0)->getType();
9127 auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9128 unsigned Opcode = ShuffleOrOp;
9129 unsigned VecOpcode = Opcode;
9130 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9131 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9132 // Check if the values are candidates to demote.
9133 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
9134 if (SrcIt != MinBWs.end()) {
9135 SrcBWSz = SrcIt->second.first;
9136 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
9137 SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9138 }
9139 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9140 if (BWSz == SrcBWSz) {
9141 VecOpcode = Instruction::BitCast;
9142 } else if (BWSz < SrcBWSz) {
9143 VecOpcode = Instruction::Trunc;
9144 } else if (It != MinBWs.end()) {
9145 assert(BWSz > SrcBWSz && "Invalid cast!");
9146 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9147 } else if (SrcIt != MinBWs.end()) {
9148 assert(BWSz > SrcBWSz && "Invalid cast!");
9149 VecOpcode =
9150 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9151 }
9152 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9153 !SrcIt->second.second) {
9154 VecOpcode = Instruction::UIToFP;
9155 }
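      // In effect, when operands were demoted via MinBWs the vectorized cast
      // may differ from the scalar one: equal bit widths become a no-op
      // bitcast, a narrower destination becomes a trunc, and a wider one
      // becomes a sext/zext chosen by the recorded signedness.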
9156 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9157 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9158 return TTI->getCastInstrCost(Opcode, VL0->getType(),
9159                                   VL0->getOperand(0)->getType(),
9160                                   TTI::getCastContextHint(VI), CostKind, VI);
9161      };
9162 auto GetVectorCost = [=](InstructionCost CommonCost) {
9163 // Do not count cost here if minimum bitwidth is in effect and it is just
9164 // a bitcast (here it is just a noop).
9165 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9166 return CommonCost;
9167 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9168 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
9169 return CommonCost +
9170 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
9171 VecOpcode == Opcode ? VI : nullptr);
9172 };
9173 return GetCostDiff(GetScalarCost, GetVectorCost);
9174 }
9175 case Instruction::FCmp:
9176 case Instruction::ICmp:
9177 case Instruction::Select: {
9178 CmpInst::Predicate VecPred, SwappedVecPred;
9179 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
9180 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
9181 match(VL0, MatchCmp))
9182 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
9183 else
9184      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9185                                     ? CmpInst::BAD_FCMP_PREDICATE
9186                                     : CmpInst::BAD_ICMP_PREDICATE;
9187 auto GetScalarCost = [&](unsigned Idx) {
9188 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9189        CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9190                                             ? CmpInst::BAD_FCMP_PREDICATE
9191                                             : CmpInst::BAD_ICMP_PREDICATE;
9192 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
9193 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
9194 !match(VI, MatchCmp)) ||
9195 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9196          VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9197                                         ? CmpInst::BAD_FCMP_PREDICATE
9198                                         : CmpInst::BAD_ICMP_PREDICATE;
9199
9200 return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy,
9201 Builder.getInt1Ty(), CurrentPred, CostKind,
9202 VI);
9203 };
9204 auto GetVectorCost = [&](InstructionCost CommonCost) {
9205 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9206
9207      InstructionCost VecCost = TTI->getCmpSelInstrCost(
9208          E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9209 // Check if it is possible and profitable to use min/max for selects
9210 // in VL.
9211 //
9212 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
9213 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
9214 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
9215 {VecTy, VecTy});
9216 InstructionCost IntrinsicCost =
9217 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9218 // If the selects are the only uses of the compares, they will be
9219 // dead and we can adjust the cost by removing their cost.
9220 if (IntrinsicAndUse.second)
9221 IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
9222 MaskTy, VecPred, CostKind);
9223 VecCost = std::min(VecCost, IntrinsicCost);
9224 }
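      // E.g. a bundle of selects of the form "x < y ? x : y" may be costed as
      // a single vector min/max intrinsic above, and the compare cost is
      // dropped when the selects are the compares' only users.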
9225 return VecCost + CommonCost;
9226 };
9227 return GetCostDiff(GetScalarCost, GetVectorCost);
9228 }
9229 case Instruction::FNeg:
9230 case Instruction::Add:
9231 case Instruction::FAdd:
9232 case Instruction::Sub:
9233 case Instruction::FSub:
9234 case Instruction::Mul:
9235 case Instruction::FMul:
9236 case Instruction::UDiv:
9237 case Instruction::SDiv:
9238 case Instruction::FDiv:
9239 case Instruction::URem:
9240 case Instruction::SRem:
9241 case Instruction::FRem:
9242 case Instruction::Shl:
9243 case Instruction::LShr:
9244 case Instruction::AShr:
9245 case Instruction::And:
9246 case Instruction::Or:
9247 case Instruction::Xor: {
9248 auto GetScalarCost = [&](unsigned Idx) {
9249 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9250 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9251 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
9252 TTI::OperandValueInfo Op2Info =
9253 TTI::getOperandInfo(VI->getOperand(OpIdx));
9254 SmallVector<const Value *> Operands(VI->operand_values());
9255 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
9256 Op1Info, Op2Info, Operands, VI);
9257 };
9258 auto GetVectorCost = [=](InstructionCost CommonCost) {
9259 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9260 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
9261 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
9262 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
9263 Op2Info, std::nullopt, nullptr, TLI) +
9264 CommonCost;
9265 };
9266 return GetCostDiff(GetScalarCost, GetVectorCost);
9267 }
9268 case Instruction::GetElementPtr: {
9269 return CommonCost + GetGEPCostDiff(VL, VL0);
9270 }
9271 case Instruction::Load: {
9272 auto GetScalarCost = [&](unsigned Idx) {
9273 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
9274 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
9275                                  VI->getAlign(), VI->getPointerAddressSpace(),
9276                                  CostKind, TTI::OperandValueInfo(), VI);
9277      };
9278 auto *LI0 = cast<LoadInst>(VL0);
9279 auto GetVectorCost = [&](InstructionCost CommonCost) {
9280 InstructionCost VecLdCost;
9281 if (E->State == TreeEntry::Vectorize) {
9282 VecLdCost = TTI->getMemoryOpCost(
9283 Instruction::Load, VecTy, LI0->getAlign(),
9284 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
9285 } else if (E->State == TreeEntry::StridedVectorize) {
9286 Align CommonAlignment =
9287 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9288 VecLdCost = TTI->getStridedMemoryOpCost(
9289 Instruction::Load, VecTy, LI0->getPointerOperand(),
9290 /*VariableMask=*/false, CommonAlignment, CostKind);
9291 } else {
9292 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9293 Align CommonAlignment =
9294 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9295 VecLdCost = TTI->getGatherScatterOpCost(
9296 Instruction::Load, VecTy, LI0->getPointerOperand(),
9297 /*VariableMask=*/false, CommonAlignment, CostKind);
9298 }
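      // The three branches above mirror the entry state: a plain wide load for
      // Vectorize, a strided load for StridedVectorize and a masked gather for
      // ScatterVectorize, each costed with the common alignment of the bundle.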
9299 return VecLdCost + CommonCost;
9300 };
9301
9302 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9303    // If this node generates a masked gather load, it is not a terminal node.
9304    // Hence the address operand cost is estimated separately.
9305 if (E->State == TreeEntry::ScatterVectorize)
9306 return Cost;
9307
9308 // Estimate cost of GEPs since this tree node is a terminator.
9309 SmallVector<Value *> PointerOps(VL.size());
9310 for (auto [I, V] : enumerate(VL))
9311 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
9312 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9313 }
9314 case Instruction::Store: {
9315 bool IsReorder = !E->ReorderIndices.empty();
9316 auto GetScalarCost = [=](unsigned Idx) {
9317 auto *VI = cast<StoreInst>(VL[Idx]);
9318 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
9319 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
9320 VI->getAlign(), VI->getPointerAddressSpace(),
9321 CostKind, OpInfo, VI);
9322 };
9323 auto *BaseSI =
9324 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9325 auto GetVectorCost = [=](InstructionCost CommonCost) {
9326 // We know that we can merge the stores. Calculate the cost.
9327 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
9328 return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
9329 BaseSI->getPointerAddressSpace(), CostKind,
9330 OpInfo) +
9331 CommonCost;
9332 };
9333 SmallVector<Value *> PointerOps(VL.size());
9334 for (auto [I, V] : enumerate(VL)) {
9335 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9336 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
9337 }
9338
9339 return GetCostDiff(GetScalarCost, GetVectorCost) +
9340 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9341 }
9342 case Instruction::Call: {
9343 auto GetScalarCost = [&](unsigned Idx) {
9344      auto *CI = cast<CallInst>(UniqueValues[Idx]);
9345      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9346      if (ID != Intrinsic::not_intrinsic) {
9347        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9348        return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9349      }
9350      return TTI->getCallInstrCost(CI->getCalledFunction(),
9351                                   CI->getFunctionType()->getReturnType(),
9352                                   CI->getFunctionType()->params(), CostKind);
9353 };
9354 auto GetVectorCost = [=](InstructionCost CommonCost) {
9355      auto *CI = cast<CallInst>(VL0);
9356      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9357      SmallVector<Type *> ArgTys =
9358          buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
9359                                 It != MinBWs.end() ? It->second.first : 0);
9360 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9361 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9362 };
9363 return GetCostDiff(GetScalarCost, GetVectorCost);
9364 }
9365 case Instruction::ShuffleVector: {
9366 assert(E->isAltShuffle() &&
9367 ((Instruction::isBinaryOp(E->getOpcode()) &&
9368 Instruction::isBinaryOp(E->getAltOpcode())) ||
9369 (Instruction::isCast(E->getOpcode()) &&
9370 Instruction::isCast(E->getAltOpcode())) ||
9371 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9372 "Invalid Shuffle Vector Operand");
9373 // Try to find the previous shuffle node with the same operands and same
9374 // main/alternate ops.
9375 auto TryFindNodeWithEqualOperands = [=]() {
9376 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9377 if (TE.get() == E)
9378 break;
9379 if (TE->isAltShuffle() &&
9380 ((TE->getOpcode() == E->getOpcode() &&
9381 TE->getAltOpcode() == E->getAltOpcode()) ||
9382 (TE->getOpcode() == E->getAltOpcode() &&
9383 TE->getAltOpcode() == E->getOpcode())) &&
9384 TE->hasEqualOperands(*E))
9385 return true;
9386 }
9387 return false;
9388 };
9389 auto GetScalarCost = [&](unsigned Idx) {
9390 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9391 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9392 (void)E;
9393 return TTI->getInstructionCost(VI, CostKind);
9394 };
9395 // Need to clear CommonCost since the final shuffle cost is included into
9396 // vector cost.
9397 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9398 // VecCost is equal to sum of the cost of creating 2 vectors
9399 // and the cost of creating shuffle.
9400 InstructionCost VecCost = 0;
9401 if (TryFindNodeWithEqualOperands()) {
9402 LLVM_DEBUG({
9403 dbgs() << "SLP: diamond match for alternate node found.\n";
9404 E->dump();
9405 });
9406 // No need to add new vector costs here since we're going to reuse
9407 // same main/alternate vector ops, just do different shuffling.
9408 } else if (Instruction::isBinaryOp(E->getOpcode())) {
9409 VecCost =
9410 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
9411 VecCost +=
9412 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
9413 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9414 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9415 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9416 CI0->getPredicate(), CostKind, VL0);
9417 VecCost += TTIRef.getCmpSelInstrCost(
9418 E->getOpcode(), VecTy, MaskTy,
9419 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
9420 E->getAltOp());
9421 } else {
9422 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9423 auto *SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9424 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9425 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9426 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9427 unsigned SrcBWSz =
9428 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9429 if (SrcIt != MinBWs.end()) {
9430 SrcBWSz = SrcIt->second.first;
9431 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
9432 SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9433 }
9434 if (BWSz <= SrcBWSz) {
9435 if (BWSz < SrcBWSz)
9436 VecCost =
9437                TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9438                                        TTI::CastContextHint::None, CostKind);
9439          LLVM_DEBUG({
9440 dbgs()
9441 << "SLP: alternate extension, which should be truncated.\n";
9442 E->dump();
9443 });
9444 return VecCost;
9445 }
9446 }
9447        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9448                                          TTI::CastContextHint::None, CostKind);
9449        VecCost +=
9450            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9451                                    TTI::CastContextHint::None, CostKind);
9452      }
9453      SmallVector<int> Mask;
9454      E->buildAltOpShuffleMask(
9455 [E](Instruction *I) {
9456 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9457 return I->getOpcode() == E->getAltOpcode();
9458 },
9459          Mask);
9460      VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
9461                                  FinalVecTy, Mask);
9462 // Patterns like [fadd,fsub] can be combined into a single instruction
9463 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9464 // need to take into account their order when looking for the most used
9465 // order.
9466 unsigned Opcode0 = E->getOpcode();
9467 unsigned Opcode1 = E->getAltOpcode();
9468 // The opcode mask selects between the two opcodes.
9469 SmallBitVector OpcodeMask(E->Scalars.size(), false);
9470 for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
9471 if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
9472 OpcodeMask.set(Lane);
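      // E.g. for scalars <fadd, fsub, fadd, fsub> with fadd as the main
      // opcode, lanes 1 and 3 are set in OpcodeMask.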
9473 // If this pattern is supported by the target then we consider the
9474 // order.
9475 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9476 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9477 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9478 return AltVecCost < VecCost ? AltVecCost : VecCost;
9479 }
9480 // TODO: Check the reverse order too.
9481 return VecCost;
9482 };
9483 return GetCostDiff(GetScalarCost, GetVectorCost);
9484 }
9485 default:
9486 llvm_unreachable("Unknown instruction");
9487 }
9488}
9489
9490bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9491 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
9492                    << VectorizableTree.size() << " is fully vectorizable.\n");
9493
9494  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
9495    SmallVector<int> Mask;
9496    return TE->State == TreeEntry::NeedToGather &&
9497 !any_of(TE->Scalars,
9498 [this](Value *V) { return EphValues.contains(V); }) &&
9499 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
9500 TE->Scalars.size() < Limit ||
9501 ((TE->getOpcode() == Instruction::ExtractElement ||
9502 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
9503 isFixedVectorShuffle(TE->Scalars, Mask)) ||
9504 (TE->State == TreeEntry::NeedToGather &&
9505 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
9506 };
9507
9508 // We only handle trees of heights 1 and 2.
9509 if (VectorizableTree.size() == 1 &&
9510 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9511 (ForReduction &&
9512 AreVectorizableGathers(VectorizableTree[0].get(),
9513 VectorizableTree[0]->Scalars.size()) &&
9514 VectorizableTree[0]->getVectorFactor() > 2)))
9515 return true;
9516
9517 if (VectorizableTree.size() != 2)
9518 return false;
9519
9520 // Handle splat and all-constants stores. Also try to vectorize tiny trees
9521  // with the second gather node if it has fewer scalar operands than the
9522  // initial tree element (it may be profitable to shuffle the second gather)
9523  // or if its scalars are extractelements, which form a shuffle.
9524  SmallVector<int> Mask;
9525  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9526 AreVectorizableGathers(VectorizableTree[1].get(),
9527 VectorizableTree[0]->Scalars.size()))
9528 return true;
9529
9530 // Gathering cost would be too much for tiny trees.
9531 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9532 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9533 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9534 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9535 return false;
9536
9537 return true;
9538}
9539
9540static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
9541                                       TargetTransformInfo *TTI,
9542                                       bool MustMatchOrInst) {
9543 // Look past the root to find a source value. Arbitrarily follow the
9544 // path through operand 0 of any 'or'. Also, peek through optional
9545 // shift-left-by-multiple-of-8-bits.
9546 Value *ZextLoad = Root;
9547 const APInt *ShAmtC;
9548 bool FoundOr = false;
9549 while (!isa<ConstantExpr>(ZextLoad) &&
9550 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
9551 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
9552 ShAmtC->urem(8) == 0))) {
9553 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9554 ZextLoad = BinOp->getOperand(0);
9555 if (BinOp->getOpcode() == Instruction::Or)
9556 FoundOr = true;
9557 }
9558 // Check if the input is an extended load of the required or/shift expression.
9559 Value *Load;
9560 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9561 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
9562 return false;
9563
9564 // Require that the total load bit width is a legal integer type.
9565 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
9566 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
9567 Type *SrcTy = Load->getType();
9568 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
9569 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
9570 return false;
9571
9572 // Everything matched - assume that we can fold the whole sequence using
9573 // load combining.
9574 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
9575 << *(cast<Instruction>(Root)) << "\n");
9576
9577 return true;
9578}
9579
9580bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
9581  if (RdxKind != RecurKind::Or)
9582 return false;
9583
9584 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9585 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9586 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
9587 /* MatchOr */ false);
9588}
9589
9590bool BoUpSLP::isLoadCombineCandidate() const {
9591  // Peek through a final sequence of stores and check if all operations are
9592 // likely to be load-combined.
9593 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9594 for (Value *Scalar : VectorizableTree[0]->Scalars) {
9595 Value *X;
9596 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
9597 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
9598 return false;
9599 }
9600 return true;
9601}
9602
9603bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
9604 // No need to vectorize inserts of gathered values.
9605 if (VectorizableTree.size() == 2 &&
9606 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9607 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9608 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9609 !(isSplat(VectorizableTree[1]->Scalars) ||
9610 allConstant(VectorizableTree[1]->Scalars))))
9611 return true;
9612
9613  // If the graph includes only PHI nodes and gathers, it is definitely not
9614  // profitable for vectorization and we can skip it, if the cost threshold is
9615  // the default. The cost of vectorized PHI nodes is almost always 0 plus the
9616  // cost of gathers/buildvectors.
9617 constexpr int Limit = 4;
9618 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
9619 !VectorizableTree.empty() &&
9620 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9621 return (TE->State == TreeEntry::NeedToGather &&
9622 TE->getOpcode() != Instruction::ExtractElement &&
9623 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9624 TE->getOpcode() == Instruction::PHI;
9625 }))
9626 return true;
9627
9628 // We can vectorize the tree if its size is greater than or equal to the
9629 // minimum size specified by the MinTreeSize command line option.
9630 if (VectorizableTree.size() >= MinTreeSize)
9631 return false;
9632
9633 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
9634 // can vectorize it if we can prove it fully vectorizable.
9635 if (isFullyVectorizableTinyTree(ForReduction))
9636 return false;
9637
9638 // Check if any of the gather node forms an insertelement buildvector
9639 // somewhere.
9640 bool IsAllowedSingleBVNode =
9641 VectorizableTree.size() > 1 ||
9642 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9643 !VectorizableTree.front()->isAltShuffle() &&
9644 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9645 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9646 allSameBlock(VectorizableTree.front()->Scalars));
9647 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9648 return TE->State == TreeEntry::NeedToGather &&
9649 all_of(TE->Scalars, [&](Value *V) {
9650 return isa<ExtractElementInst, UndefValue>(V) ||
9651 (IsAllowedSingleBVNode &&
9652 !V->hasNUsesOrMore(UsesLimit) &&
9653 any_of(V->users(), IsaPred<InsertElementInst>));
9654 });
9655 }))
9656 return false;
9657
9658 assert(VectorizableTree.empty()
9659 ? ExternalUses.empty()
9660 : true && "We shouldn't have any external users");
9661
9662 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
9663 // vectorizable.
9664 return true;
9665}
9666
9667InstructionCost BoUpSLP::getSpillCost() const {
9668  // Walk from the bottom of the tree to the top, tracking which values are
9669 // live. When we see a call instruction that is not part of our tree,
9670 // query TTI to see if there is a cost to keeping values live over it
9671 // (for example, if spills and fills are required).
9672  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9673  InstructionCost Cost = 0;
9674
9675  SmallPtrSet<Instruction *, 4> LiveValues;
9676  Instruction *PrevInst = nullptr;
9677
9678 // The entries in VectorizableTree are not necessarily ordered by their
9679 // position in basic blocks. Collect them and order them by dominance so later
9680 // instructions are guaranteed to be visited first. For instructions in
9681 // different basic blocks, we only scan to the beginning of the block, so
9682 // their order does not matter, as long as all instructions in a basic block
9683 // are grouped together. Using dominance ensures a deterministic order.
9684 SmallVector<Instruction *, 16> OrderedScalars;
9685 for (const auto &TEPtr : VectorizableTree) {
9686 if (TEPtr->State != TreeEntry::Vectorize)
9687 continue;
9688 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9689 if (!Inst)
9690 continue;
9691 OrderedScalars.push_back(Inst);
9692 }
9693 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
9694 auto *NodeA = DT->getNode(A->getParent());
9695 auto *NodeB = DT->getNode(B->getParent());
9696 assert(NodeA && "Should only process reachable instructions");
9697 assert(NodeB && "Should only process reachable instructions");
9698 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9699 "Different nodes should have different DFS numbers");
9700 if (NodeA != NodeB)
9701 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9702 return B->comesBefore(A);
9703 });
9704
9705 for (Instruction *Inst : OrderedScalars) {
9706 if (!PrevInst) {
9707 PrevInst = Inst;
9708 continue;
9709 }
9710
9711 // Update LiveValues.
9712 LiveValues.erase(PrevInst);
9713 for (auto &J : PrevInst->operands()) {
9714 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
9715 LiveValues.insert(cast<Instruction>(&*J));
9716 }
9717
9718 LLVM_DEBUG({
9719 dbgs() << "SLP: #LV: " << LiveValues.size();
9720 for (auto *X : LiveValues)
9721 dbgs() << " " << X->getName();
9722 dbgs() << ", Looking at ";
9723 Inst->dump();
9724 });
9725
9726 // Now find the sequence of instructions between PrevInst and Inst.
9727 unsigned NumCalls = 0;
9728 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
9729 PrevInstIt =
9730 PrevInst->getIterator().getReverse();
9731 while (InstIt != PrevInstIt) {
9732 if (PrevInstIt == PrevInst->getParent()->rend()) {
9733 PrevInstIt = Inst->getParent()->rbegin();
9734 continue;
9735 }
9736
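      // The lambda below treats an intrinsic as "not a real call" for spill
      // purposes if it is assume-like or if the target lowers it more cheaply
      // than an equivalent library call.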
9737 auto NoCallIntrinsic = [this](Instruction *I) {
9738 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
9739 if (II->isAssumeLikeIntrinsic())
9740 return true;
9741          FastMathFlags FMF;
9742          SmallVector<Type *, 4> Tys;
9743          for (auto &ArgOp : II->args())
9744 Tys.push_back(ArgOp->getType());
9745 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
9746 FMF = FPMO->getFastMathFlags();
9747 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
9748 FMF);
9749          InstructionCost IntrCost =
9750              TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
9751          InstructionCost CallCost = TTI->getCallInstrCost(
9752              nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
9753 if (IntrCost < CallCost)
9754 return true;
9755 }
9756 return false;
9757 };
9758
9759 // Debug information does not impact spill cost.
9760 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
9761 &*PrevInstIt != PrevInst)
9762 NumCalls++;
9763
9764 ++PrevInstIt;
9765 }
9766
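    // Each real call found between the two scalars may force the currently
    // live values to be spilled; charge the cost of keeping the corresponding
    // bundle-wide vector types live once per such call.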
9767    if (NumCalls) {
9768      SmallVector<Type *, 4> V;
9769      for (auto *II : LiveValues) {
9770 auto *ScalarTy = II->getType();
9771 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
9772 ScalarTy = VectorTy->getElementType();
9773 V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
9774 }
9775 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
9776 }
9777
9778 PrevInst = Inst;
9779 }
9780
9781 return Cost;
9782}
9783
9784/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
9785/// the buildvector sequence.
9786static bool isFirstInsertElement(const InsertElementInst *IE1,
9787                                 const InsertElementInst *IE2) {
9788 if (IE1 == IE2)
9789 return false;
9790 const auto *I1 = IE1;
9791 const auto *I2 = IE2;
9792 const InsertElementInst *PrevI1;
9793 const InsertElementInst *PrevI2;
9794 unsigned Idx1 = *getInsertIndex(IE1);
9795 unsigned Idx2 = *getInsertIndex(IE2);
9796 do {
9797 if (I2 == IE1)
9798 return true;
9799 if (I1 == IE2)
9800 return false;
9801 PrevI1 = I1;
9802 PrevI2 = I2;
9803 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
9804 getInsertIndex(I1).value_or(Idx2) != Idx2)
9805 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
9806 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
9807 getInsertIndex(I2).value_or(Idx1) != Idx1)
9808 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
9809 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
9810 llvm_unreachable("Two different buildvectors not expected.");
9811}
9812
9813namespace {
9814/// Returns incoming Value *, if the requested type is Value * too, or a default
9815/// value, otherwise.
9816struct ValueSelect {
9817 template <typename U>
9818 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
9819 return V;
9820 }
9821 template <typename U>
9822 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
9823 return U();
9824 }
9825};
9826} // namespace
9827
9828/// Does the analysis of the provided shuffle masks and performs the requested
9829/// actions on the vectors with the given shuffle masks. It tries to do it in
9830/// several steps.
9831/// 1. If the Base vector is not an undef vector, resize the very first mask to
9832/// have a common VF, and perform the action for 2 input vectors (including the
9833/// non-undef Base). Other shuffle masks are combined with the result of the
9834/// first stage and processed as a shuffle of 2 elements.
9835/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
9836/// the action only for 1 vector with the given mask, if it is not the identity
9837/// mask.
9838/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
9839/// vectors, combining the masks properly between the steps.
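/// For example, with an undef Base and two inputs of VF 4 carrying the masks
/// {0,poison,1,poison} and {poison,2,poison,3}, step 3 merges them into the
/// single two-source mask {0,6,1,7} before invoking the action.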
9840template <typename T>
9841static T *performExtractsShuffleAction(
9842    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
9843 function_ref<unsigned(T *)> GetVF,
9844    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
9845    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
9846  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
9847 SmallVector<int> Mask(ShuffleMask.begin()->second);
9848 auto VMIt = std::next(ShuffleMask.begin());
9849 T *Prev = nullptr;
9850 SmallBitVector UseMask =
9851 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
9852 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
9853 if (!IsBaseUndef.all()) {
9854 // Base is not undef, need to combine it with the next subvectors.
9855 std::pair<T *, bool> Res =
9856 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
9857 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
9858 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
9859 if (Mask[Idx] == PoisonMaskElem)
9860 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
9861 else
9862 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
9863 }
9864 auto *V = ValueSelect::get<T *>(Base);
9865 (void)V;
9866 assert((!V || GetVF(V) == Mask.size()) &&
9867 "Expected base vector of VF number of elements.");
9868 Prev = Action(Mask, {nullptr, Res.first});
9869 } else if (ShuffleMask.size() == 1) {
9870 // Base is undef and only 1 vector is shuffled - perform the action only for
9871 // single vector, if the mask is not the identity mask.
9872 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
9873 /*ForSingleMask=*/true);
9874 if (Res.second)
9875 // Identity mask is found.
9876 Prev = Res.first;
9877 else
9878 Prev = Action(Mask, {ShuffleMask.begin()->first});
9879 } else {
9880 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
9881 // shuffles step by step, combining shuffle between the steps.
9882 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
9883 unsigned Vec2VF = GetVF(VMIt->first);
9884 if (Vec1VF == Vec2VF) {
9885 // No need to resize the input vectors since they are of the same size, we
9886 // can shuffle them directly.
9887 ArrayRef<int> SecMask = VMIt->second;
9888 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9889 if (SecMask[I] != PoisonMaskElem) {
9890 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9891 Mask[I] = SecMask[I] + Vec1VF;
9892 }
9893 }
9894 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
9895 } else {
9896 // Vectors of different sizes - resize and reshuffle.
9897 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
9898 /*ForSingleMask=*/false);
9899 std::pair<T *, bool> Res2 =
9900 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9901 ArrayRef<int> SecMask = VMIt->second;
9902 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9903 if (Mask[I] != PoisonMaskElem) {
9904 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9905 if (Res1.second)
9906 Mask[I] = I;
9907 } else if (SecMask[I] != PoisonMaskElem) {
9908 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9909 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
9910 }
9911 }
9912 Prev = Action(Mask, {Res1.first, Res2.first});
9913 }
9914 VMIt = std::next(VMIt);
9915 }
9916 bool IsBaseNotUndef = !IsBaseUndef.all();
9917 (void)IsBaseNotUndef;
9918 // Perform requested actions for the remaining masks/vectors.
9919 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
9920 // Shuffle other input vectors, if any.
9921 std::pair<T *, bool> Res =
9922 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9923 ArrayRef<int> SecMask = VMIt->second;
9924 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9925 if (SecMask[I] != PoisonMaskElem) {
9926 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
9927 "Multiple uses of scalars.");
9928 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
9929 } else if (Mask[I] != PoisonMaskElem) {
9930 Mask[I] = I;
9931 }
9932 }
9933 Prev = Action(Mask, {Prev, Res.first});
9934 }
9935 return Prev;
9936}
9937
9938InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
9939  InstructionCost Cost = 0;
9940  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
9941 << VectorizableTree.size() << ".\n");
9942
9943 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
9944
9945 SmallPtrSet<Value *, 4> CheckedExtracts;
9946 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
9947 TreeEntry &TE = *VectorizableTree[I];
9948 if (TE.State == TreeEntry::NeedToGather) {
9949 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
9950 E && E->getVectorFactor() == TE.getVectorFactor() &&
9951 E->isSame(TE.Scalars)) {
9952 // Some gather nodes might be absolutely the same as some vectorizable
9953 // nodes after reordering, need to handle it.
9954 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
9955 << shortBundleName(TE.Scalars) << ".\n"
9956 << "SLP: Current total cost = " << Cost << "\n");
9957 continue;
9958 }
9959 }
9960
9961 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
9962 Cost += C;
9963 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
9964 << shortBundleName(TE.Scalars) << ".\n"
9965 << "SLP: Current total cost = " << Cost << "\n");
9966 }
9967
9968 SmallPtrSet<Value *, 16> ExtractCostCalculated;
9969  InstructionCost ExtractCost = 0;
9970  SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
9971  SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
9972  SmallVector<APInt> DemandedElts;
9973  SmallDenseSet<Value *, 4> UsedInserts;
9974  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
9975  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
9976 for (ExternalUser &EU : ExternalUses) {
9977 // We only add extract cost once for the same scalar.
9978 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
9979 !ExtractCostCalculated.insert(EU.Scalar).second)
9980 continue;
9981
9982 // Uses by ephemeral values are free (because the ephemeral value will be
9983 // removed prior to code generation, and so the extraction will be
9984 // removed as well).
9985 if (EphValues.count(EU.User))
9986 continue;
9987
9988 // No extract cost for vector "scalar"
9989 if (isa<FixedVectorType>(EU.Scalar->getType()))
9990 continue;
9991
9992 // If found user is an insertelement, do not calculate extract cost but try
9993 // to detect it as a final shuffled/identity match.
9994 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
9995 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
9996 if (!UsedInserts.insert(VU).second)
9997 continue;
9998 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
9999 if (InsertIdx) {
10000 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10001 auto *It = find_if(
10002 FirstUsers,
10003            [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10004              return areTwoInsertFromSameBuildVector(
10005                  VU, cast<InsertElementInst>(Pair.first),
10006 [this](InsertElementInst *II) -> Value * {
10007 Value *Op0 = II->getOperand(0);
10008 if (getTreeEntry(II) && !getTreeEntry(Op0))
10009 return nullptr;
10010 return Op0;
10011 });
10012 });
10013 int VecId = -1;
10014 if (It == FirstUsers.end()) {
10015 (void)ShuffleMasks.emplace_back();
10016 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10017 if (Mask.empty())
10018 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10019 // Find the insertvector, vectorized in tree, if any.
10020 Value *Base = VU;
10021 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10022 if (IEBase != EU.User &&
10023 (!IEBase->hasOneUse() ||
10024 getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10025 break;
10026 // Build the mask for the vectorized insertelement instructions.
10027 if (const TreeEntry *E = getTreeEntry(IEBase)) {
10028 VU = IEBase;
10029 do {
10030 IEBase = cast<InsertElementInst>(Base);
10031 int Idx = *getInsertIndex(IEBase);
10032 assert(Mask[Idx] == PoisonMaskElem &&
10033 "InsertElementInstruction used already.");
10034 Mask[Idx] = Idx;
10035 Base = IEBase->getOperand(0);
10036 } while (E == getTreeEntry(Base));
10037 break;
10038 }
10039 Base = cast<InsertElementInst>(Base)->getOperand(0);
10040 }
10041 FirstUsers.emplace_back(VU, ScalarTE);
10042 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10043 VecId = FirstUsers.size() - 1;
10044 auto It = MinBWs.find(ScalarTE);
10045 if (It != MinBWs.end() &&
10046 VectorCasts
10047 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10048 .second) {
10049 unsigned BWSz = It->second.first;
10050 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10051 unsigned VecOpcode;
10052 if (DstBWSz < BWSz)
10053 VecOpcode = Instruction::Trunc;
10054 else
10055 VecOpcode =
10056                  It->second.second ? Instruction::SExt : Instruction::ZExt;
10057            TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10058            InstructionCost C = TTI->getCastInstrCost(
10059                VecOpcode, FTy,
10060                FixedVectorType::get(
10061                    IntegerType::get(FTy->getContext(), BWSz),
10062                    FTy->getNumElements()),
10063                TTI::CastContextHint::None, CostKind);
10064            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10065 << " for extending externally used vector with "
10066 "non-equal minimum bitwidth.\n");
10067 Cost += C;
10068 }
10069 } else {
10070 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10071 It->first = VU;
10072 VecId = std::distance(FirstUsers.begin(), It);
10073 }
10074 int InIdx = *InsertIdx;
10075 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10076 if (Mask.empty())
10077 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10078 Mask[InIdx] = EU.Lane;
10079 DemandedElts[VecId].setBit(InIdx);
10080 continue;
10081 }
10082 }
10083 }
10084 // Leave the GEPs as is, they are free in most cases and better to keep them
10085    // as GEPs.
10086    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10087    if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10088 if (!ValueToExtUses) {
10089 ValueToExtUses.emplace();
10090 for_each(enumerate(ExternalUses), [&](const auto &P) {
10091 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10092 });
10093 }
10094 // Can use original GEP, if no operands vectorized or they are marked as
10095 // externally used already.
10096 bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10097 if (!getTreeEntry(V))
10098 return true;
10099 auto It = ValueToExtUses->find(V);
10100 if (It != ValueToExtUses->end()) {
10101 // Replace all uses to avoid compiler crash.
10102 ExternalUses[It->second].User = nullptr;
10103 return true;
10104 }
10105 return false;
10106 });
10107 if (CanBeUsedAsGEP) {
10108 ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10109 ExternalUsesAsGEPs.insert(EU.Scalar);
10110 continue;
10111 }
10112 }
10113
10114 // If we plan to rewrite the tree in a smaller type, we will need to sign
10115 // extend the extracted value back to the original type. Here, we account
10116 // for the extract and the added cost of the sign extend if needed.
10117 auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
10118 auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10119 if (It != MinBWs.end()) {
10120 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10121 unsigned Extend =
10122 It->second.second ? Instruction::SExt : Instruction::ZExt;
10123 VecTy = FixedVectorType::get(MinTy, BundleWidth);
10124 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10125 VecTy, EU.Lane);
10126 } else {
10127 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10128 CostKind, EU.Lane);
10129 }
10130 }
10131 // Add reduced value cost, if resized.
10132 if (!VectorizedVals.empty()) {
10133 const TreeEntry &Root = *VectorizableTree.front().get();
10134 auto BWIt = MinBWs.find(&Root);
10135 if (BWIt != MinBWs.end()) {
10136 Type *DstTy = Root.Scalars.front()->getType();
10137 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10138 unsigned SrcSz =
10139 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10140 if (OriginalSz != SrcSz) {
10141 unsigned Opcode = Instruction::Trunc;
10142 if (OriginalSz > SrcSz)
10143 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10144 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10145        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10146                                      TTI::CastContextHint::None,
10147                                      TTI::TCK_RecipThroughput);
10148      }
10149 }
10150 }
10151
10152 InstructionCost SpillCost = getSpillCost();
10153 Cost += SpillCost + ExtractCost;
10154 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10155 bool) {
10156 InstructionCost C = 0;
10157 unsigned VF = Mask.size();
10158 unsigned VecVF = TE->getVectorFactor();
10159 if (VF != VecVF &&
10160        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10161         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
10162      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10163      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10164                OrigMask.begin());
10165      C = TTI->getShuffleCost(
10166          TTI::SK_PermuteSingleSrc,
10167          FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
10168 LLVM_DEBUG(
10169 dbgs() << "SLP: Adding cost " << C
10170 << " for final shuffle of insertelement external users.\n";
10171 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10172 Cost += C;
10173 return std::make_pair(TE, true);
10174 }
10175 return std::make_pair(TE, false);
10176 };
10177 // Calculate the cost of the reshuffled vectors, if any.
10178 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10179 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10180 auto Vector = ShuffleMasks[I].takeVector();
10181 unsigned VF = 0;
10182    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10183                                    ArrayRef<const TreeEntry *> TEs) {
10184      assert((TEs.size() == 1 || TEs.size() == 2) &&
10185 "Expected exactly 1 or 2 tree entries.");
10186 if (TEs.size() == 1) {
10187 if (VF == 0)
10188 VF = TEs.front()->getVectorFactor();
10189 auto *FTy =
10190 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10191 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
10192 !all_of(enumerate(Mask), [=](const auto &Data) {
10193 return Data.value() == PoisonMaskElem ||
10194 (Data.index() < VF &&
10195 static_cast<int>(Data.index()) == Data.value());
10196            })) {
10197          InstructionCost C =
10198              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
10199          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10200 << " for final shuffle of insertelement "
10201 "external users.\n";
10202 TEs.front()->dump();
10203 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10204 Cost += C;
10205 }
10206 } else {
10207 if (VF == 0) {
10208 if (TEs.front() &&
10209 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10210 VF = TEs.front()->getVectorFactor();
10211 else
10212 VF = Mask.size();
10213 }
10214 auto *FTy =
10215            FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10216        InstructionCost C =
10217            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
10218        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10219 << " for final shuffle of vector node and external "
10220 "insertelement users.\n";
10221 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10222 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10223 Cost += C;
10224 }
10225 VF = Mask.size();
10226 return TEs.back();
10227 };
10228 (void)performExtractsShuffleAction<const TreeEntry>(
10229 MutableArrayRef(Vector.data(), Vector.size()), Base,
10230 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10231        EstimateShufflesCost);
10232    InstructionCost InsertCost = TTI->getScalarizationOverhead(
10233        cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10234 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
10235 Cost -= InsertCost;
10236 }
10237
10238 // Add the cost for reduced value resize (if required).
10239 if (ReductionBitWidth != 0) {
10240 assert(UserIgnoreList && "Expected reduction tree.");
10241 const TreeEntry &E = *VectorizableTree.front().get();
10242 auto It = MinBWs.find(&E);
10243 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10244 unsigned SrcSize = It->second.first;
10245 unsigned DstSize = ReductionBitWidth;
10246 unsigned Opcode = Instruction::Trunc;
10247 if (SrcSize < DstSize)
10248 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10249 auto *SrcVecTy =
10250 FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor());
10251 auto *DstVecTy =
10252 FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor());
10253 TTI::CastContextHint CCH = getCastContextHint(E);
10254 InstructionCost CastCost;
10255 switch (E.getOpcode()) {
10256 case Instruction::SExt:
10257 case Instruction::ZExt:
10258 case Instruction::Trunc: {
10259 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10260 CCH = getCastContextHint(*OpTE);
10261 break;
10262 }
10263 default:
10264 break;
10265 }
10266      CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
10267                                        TTI::TCK_RecipThroughput);
10268      Cost += CastCost;
10269 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10270 << " for final resize for reduction from " << SrcVecTy
10271 << " to " << DstVecTy << "\n";
10272 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10273 }
10274 }
10275
10276#ifndef NDEBUG
10277 SmallString<256> Str;
10278  {
10279    raw_svector_ostream OS(Str);
10280    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10281 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10282 << "SLP: Total Cost = " << Cost << ".\n";
10283 }
10284 LLVM_DEBUG(dbgs() << Str);
10285 if (ViewSLPTree)
10286 ViewGraph(this, "SLP" + F->getName(), false, Str);
10287#endif
10288
10289 return Cost;
10290}
10291
10292/// Tries to find extractelement instructions with constant indices from fixed
10293/// vector type and gather such instructions into a bunch, which is highly
10294/// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
10295/// was successful, the matched scalars are replaced by poison values in \p VL
10296/// for future analysis.
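/// For example, a gather of {extractelement %v, 0; extractelement %v, 2;
/// undef; undef} taken from a single fixed vector %v can be reported as a
/// single-source shuffle, with the matched scalars replaced by poison in \p VL
/// (%v here is only an illustrative value name).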
10297std::optional<TTI::ShuffleKind>
10298BoUpSLP::tryToGatherSingleRegisterExtractElements(
10299    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10300  // Scan list of gathered scalars for extractelements that can be represented
10301  // as shuffles.
10302  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10303  SmallVector<int> UndefVectorExtracts;
10304 for (int I = 0, E = VL.size(); I < E; ++I) {
10305 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10306 if (!EI) {
10307 if (isa<UndefValue>(VL[I]))
10308 UndefVectorExtracts.push_back(I);
10309 continue;
10310 }
10311 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10312 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10313 continue;
10314 std::optional<unsigned> Idx = getExtractIndex(EI);
10315 // Undefined index.
10316 if (!Idx) {
10317 UndefVectorExtracts.push_back(I);
10318 continue;
10319 }
10320 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10321 ExtractMask.reset(*Idx);
10322 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
10323 UndefVectorExtracts.push_back(I);
10324 continue;
10325 }
10326 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10327 }
10328 // Sort the vector operands by the maximum number of uses in extractelements.
10330 for (const auto &Data : VectorOpToIdx)
10331 VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
10332 .push_back(Data.first);
10333 for (auto &Data : VFToVector) {
10334 stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
10335 return VectorOpToIdx.find(V1)->second.size() >
10336 VectorOpToIdx.find(V2)->second.size();
10337 });
10338 }
10339 // Find the best pair of the vectors with the same number of elements or a
10340 // single vector.
10341 const int UndefSz = UndefVectorExtracts.size();
10342 unsigned SingleMax = 0;
10343 Value *SingleVec = nullptr;
10344 unsigned PairMax = 0;
10345 std::pair<Value *, Value *> PairVec(nullptr, nullptr);
10346 for (auto &Data : VFToVector) {
10347 Value *V1 = Data.second.front();
10348 if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
10349 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10350 SingleVec = V1;
10351 }
10352 Value *V2 = nullptr;
10353 if (Data.second.size() > 1)
10354 V2 = *std::next(Data.second.begin());
10355 if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
10356 UndefSz) {
10357 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
10358 PairVec = std::make_pair(V1, V2);
10359 }
10360 }
10361 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10362 return std::nullopt;
10363  // Check if it is better to perform a shuffle of 2 vectors or just of a
10364  // single vector.
10365 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10366 SmallVector<Value *> GatheredExtracts(
10367 VL.size(), PoisonValue::get(VL.front()->getType()));
10368 if (SingleMax >= PairMax && SingleMax) {
10369 for (int Idx : VectorOpToIdx[SingleVec])
10370 std::swap(GatheredExtracts[Idx], VL[Idx]);
10371 } else {
10372 for (Value *V : {PairVec.first, PairVec.second})
10373 for (int Idx : VectorOpToIdx[V])
10374 std::swap(GatheredExtracts[Idx], VL[Idx]);
10375 }
10376 // Add extracts from undefs too.
10377 for (int Idx : UndefVectorExtracts)
10378 std::swap(GatheredExtracts[Idx], VL[Idx]);
10379  // Check that the gather of extractelements can be represented as just a
10380  // shuffle of one or two vectors from which the scalars are extracted.
10381 std::optional<TTI::ShuffleKind> Res =
10382 isFixedVectorShuffle(GatheredExtracts, Mask);
10383 if (!Res) {
10384 // TODO: try to check other subsets if possible.
10385 // Restore the original VL if attempt was not successful.
10386 copy(SavedVL, VL.begin());
10387 return std::nullopt;
10388 }
10389 // Restore unused scalars from mask, if some of the extractelements were not
10390 // selected for shuffle.
10391 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10392 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
10393 isa<UndefValue>(GatheredExtracts[I])) {
10394 std::swap(VL[I], GatheredExtracts[I]);
10395 continue;
10396 }
10397 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10398 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10399 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10400 is_contained(UndefVectorExtracts, I))
10401 continue;
10402 }
10403 return Res;
10404}
10405
10406/// Tries to find extractelement instructions with constant indices from fixed
10407/// vector type and gather such instructions into a bunch, which is highly
10408/// likely to be detected as a shuffle of 1 or 2 input vectors. If this attempt
10409/// was successful, the matched scalars are replaced by poison values in \p VL
10410/// for future analysis.
10411SmallVector<std::optional<TTI::ShuffleKind>>
10412BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10413                                    SmallVectorImpl<int> &Mask,
10414                                    unsigned NumParts) const {
10415 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
10416 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10417 Mask.assign(VL.size(), PoisonMaskElem);
10418 unsigned SliceSize = VL.size() / NumParts;
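  // VL is processed in NumParts register-sized slices; each slice is matched
  // independently and its sub-mask is copied into the corresponding part of
  // Mask.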
10419 for (unsigned Part = 0; Part < NumParts; ++Part) {
10420 // Scan list of gathered scalars for extractelements that can be represented
10421 // as shuffles.
10422    MutableArrayRef<Value *> SubVL =
10423        MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
10424 SmallVector<int> SubMask;
10425 std::optional<TTI::ShuffleKind> Res =
10426 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10427 ShufflesRes[Part] = Res;
10428 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
10429 }
10430 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
10431 return Res.has_value();
10432 }))
10433 ShufflesRes.clear();
10434 return ShufflesRes;
10435}
10436
10437std::optional<TargetTransformInfo::ShuffleKind>
10438BoUpSLP::isGatherShuffledSingleRegisterEntry(
10439 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10440 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10441 Entries.clear();
10442 // TODO: currently checking only for Scalars in the tree entry, need to count
10443 // reused elements too for better cost estimation.
10444 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10445 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10446 const BasicBlock *TEInsertBlock = nullptr;
10447 // Main node of PHI entries keeps the correct order of operands/incoming
10448 // blocks.
10449 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10450 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10451 TEInsertPt = TEInsertBlock->getTerminator();
10452 } else {
10453 TEInsertBlock = TEInsertPt->getParent();
10454 }
10455 if (!DT->isReachableFromEntry(TEInsertBlock))
10456 return std::nullopt;
10457 auto *NodeUI = DT->getNode(TEInsertBlock);
10458 assert(NodeUI && "Should only process reachable instructions");
10459 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10460 auto CheckOrdering = [&](const Instruction *InsertPt) {
10461 // Argument InsertPt is an instruction where vector code for some other
10462 // tree entry (one that shares one or more scalars with TE) is going to be
10463 // generated. This lambda returns true if insertion point of vector code
10464 // for the TE dominates that point (otherwise dependency is the other way
10465 // around). The other node is not limited to be of a gather kind. Gather
10466 // nodes are not scheduled and their vector code is inserted before their
10467 // first user. If user is PHI, that is supposed to be at the end of a
10468 // predecessor block. Otherwise it is the last instruction among scalars of
10469 // the user node. So, instead of checking dependency between instructions
10470 // themselves, we check dependency between their insertion points for vector
10471 // code (since each scalar instruction ends up as a lane of a vector
10472 // instruction).
10473 const BasicBlock *InsertBlock = InsertPt->getParent();
10474 auto *NodeEUI = DT->getNode(InsertBlock);
10475 if (!NodeEUI)
10476 return false;
10477 assert((NodeUI == NodeEUI) ==
10478 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10479 "Different nodes should have different DFS numbers");
10480 // Check the order of the gather nodes users.
10481 if (TEInsertPt->getParent() != InsertBlock &&
10482 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
10483 return false;
10484 if (TEInsertPt->getParent() == InsertBlock &&
10485 TEInsertPt->comesBefore(InsertPt))
10486 return false;
10487 return true;
10488 };
10489  // Find all tree entries used by the gathered values. If no common entries
10490  // are found, this is not a shuffle.
10491  // Here we build a set of tree nodes for each gathered value and try to
10492  // find the intersection between these sets. If we have at least one common
10493  // tree node for each gathered value, we have just a permutation of a
10494  // single vector. If we have 2 different sets, we're in a situation where we
10495  // have a permutation of 2 input vectors.
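  // E.g. if every gathered scalar is also produced by a single vectorized
  // entry, the gather is just a permutation of that entry's vector; if the
  // scalars split across two entries it becomes a two-source shuffle, and a
  // scalar that would require a third source is left unassigned (see the TODO
  // below about supporting multiple reshuffled nodes).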
10496  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10497  DenseMap<Value *, int> UsedValuesEntry;
10498 for (Value *V : VL) {
10499 if (isConstant(V))
10500 continue;
10501    // Build a list of tree entries where V is used.
10502    SmallPtrSet<const TreeEntry *, 4> VToTEs;
10503    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10504 if (TEPtr == TE)
10505 continue;
10506 assert(any_of(TEPtr->Scalars,
10507 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10508 "Must contain at least single gathered value.");
10509 assert(TEPtr->UserTreeIndices.size() == 1 &&
10510 "Expected only single user of a gather node.");
10511 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10512
10513 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10514 const Instruction *InsertPt =
10515 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
10516 : &getLastInstructionInBundle(UseEI.UserTE);
10517 if (TEInsertPt == InsertPt) {
10518 // If 2 gathers are operands of the same entry (regardless of whether
10519 // user is PHI or else), compare operands indices, use the earlier one
10520 // as the base.
10521 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10522 continue;
10523 // If the user instruction is used for some reason in different
10524 // vectorized nodes - make it depend on index.
10525 if (TEUseEI.UserTE != UseEI.UserTE &&
10526 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10527 continue;
10528 }
10529
10530 // Check if the user node of the TE comes after user node of TEPtr,
10531 // otherwise TEPtr depends on TE.
10532 if ((TEInsertBlock != InsertPt->getParent() ||
10533 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10534 !CheckOrdering(InsertPt))
10535 continue;
10536 VToTEs.insert(TEPtr);
10537 }
10538 if (const TreeEntry *VTE = getTreeEntry(V)) {
10539 if (ForOrder) {
10540 if (VTE->State != TreeEntry::Vectorize) {
10541 auto It = MultiNodeScalars.find(V);
10542 if (It == MultiNodeScalars.end())
10543 continue;
10544 VTE = *It->getSecond().begin();
10545 // Iterate through all vectorized nodes.
10546 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
10547 return MTE->State == TreeEntry::Vectorize;
10548 });
10549 if (MIt == It->getSecond().end())
10550 continue;
10551 VTE = *MIt;
10552 }
10553 }
10554 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10555 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10556 continue;
10557 VToTEs.insert(VTE);
10558 }
10559 if (VToTEs.empty())
10560 continue;
10561 if (UsedTEs.empty()) {
10562 // The first iteration, just insert the list of nodes to vector.
10563 UsedTEs.push_back(VToTEs);
10564 UsedValuesEntry.try_emplace(V, 0);
10565 } else {
10566 // Need to check if there are any previously used tree nodes which use V.
10567 // If there are no such nodes, consider that we have another one input
10568 // vector.
10569 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
10570 unsigned Idx = 0;
10571 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
10572 // Do we have a non-empty intersection of previously listed tree entries
10573 // and tree entries using current V?
10574 set_intersect(VToTEs, Set);
10575 if (!VToTEs.empty()) {
10576 // Yes, write the new subset and continue analysis for the next
10577 // scalar.
10578 Set.swap(VToTEs);
10579 break;
10580 }
10581 VToTEs = SavedVToTEs;
10582 ++Idx;
10583 }
10584 // No non-empty intersection found - need to add a second set of possible
10585 // source vectors.
10586 if (Idx == UsedTEs.size()) {
10587 // If the number of input vectors is greater than 2 - not a permutation,
10588 // fall back to the regular gather.
10589 // TODO: support multiple reshuffled nodes.
10590 if (UsedTEs.size() == 2)
10591 continue;
10592 UsedTEs.push_back(SavedVToTEs);
10593 Idx = UsedTEs.size() - 1;
10594 }
10595 UsedValuesEntry.try_emplace(V, Idx);
10596 }
10597 }
10598
10599 if (UsedTEs.empty()) {
10600 Entries.clear();
10601 return std::nullopt;
10602 }
10603
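// At this point every non-constant scalar maps (via UsedValuesEntry) to one of
// at most two sets of candidate tree entries. The code below picks concrete
// source entries: a single set means a one-source permutation, two sets mean a
// two-source shuffle whose operands should preferably have the same width.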
10604 unsigned VF = 0;
10605 if (UsedTEs.size() == 1) {
10606 // Keep the order to avoid non-determinism.
10607 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
10608 UsedTEs.front().end());
10609 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10610 return TE1->Idx < TE2->Idx;
10611 });
10612 // Try to find the perfect match in another gather node at first.
10613 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
10614 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
10615 });
10616 if (It != FirstEntries.end() &&
10617 ((*It)->getVectorFactor() == VL.size() ||
10618 ((*It)->getVectorFactor() == TE->Scalars.size() &&
10619 TE->ReuseShuffleIndices.size() == VL.size() &&
10620 (*It)->isSame(TE->Scalars)))) {
10621 Entries.push_back(*It);
10622 if ((*It)->getVectorFactor() == VL.size()) {
10623 std::iota(std::next(Mask.begin(), Part * VL.size()),
10624 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
10625 } else {
10626 SmallVector<int> CommonMask = TE->getCommonMask();
10627 copy(CommonMask, Mask.begin());
10628 }
10629 // Clear undef scalars.
10630 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10631 if (isa<PoisonValue>(VL[I]))
10632 Mask[Part * VL.size() + I] = PoisonMaskElem;
10633 return TargetTransformInfo::SK_PermuteSingleSrc;
10634 }
10635 // No perfect match, just shuffle, so choose the first tree node from the
10636 // tree.
10637 Entries.push_back(FirstEntries.front());
10638 } else {
10639 // Try to find nodes with the same vector factor.
10640 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
10641 // Keep the order of tree nodes to avoid non-determinism.
10642 DenseMap<unsigned, const TreeEntry *> VFToTE;
10643 for (const TreeEntry *TE : UsedTEs.front()) {
10644 unsigned VF = TE->getVectorFactor();
10645 auto It = VFToTE.find(VF);
10646 if (It != VFToTE.end()) {
10647 if (It->second->Idx > TE->Idx)
10648 It->getSecond() = TE;
10649 continue;
10650 }
10651 VFToTE.try_emplace(VF, TE);
10652 }
10653 // Same, keep the order to avoid non-determinism.
10654 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
10655 UsedTEs.back().end());
10656 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10657 return TE1->Idx < TE2->Idx;
10658 });
10659 for (const TreeEntry *TE : SecondEntries) {
10660 auto It = VFToTE.find(TE->getVectorFactor());
10661 if (It != VFToTE.end()) {
10662 VF = It->first;
10663 Entries.push_back(It->second);
10664 Entries.push_back(TE);
10665 break;
10666 }
10667 }
10668 // No 2 source vectors with the same vector factor - just choose 2 with max
10669 // index.
10670 if (Entries.empty()) {
10671 Entries.push_back(*llvm::max_element(
10672 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
10673 return TE1->Idx < TE2->Idx;
10674 }));
10675 Entries.push_back(SecondEntries.front());
10676 VF = std::max(Entries.front()->getVectorFactor(),
10677 Entries.back()->getVectorFactor());
10678 }
10679 }
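// At this point Entries holds one or two candidate source nodes; in the
// two-source case VF records the width that is used as the second source's
// lane offset when the final shuffle mask is built below.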
10680
10681 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
10682 // Checks if the 2 PHIs are compatible in terms of their likelihood of being
10683 // vectorized together.
10684 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
10685 auto *PHI = cast<PHINode>(V);
10686 auto *PHI1 = cast<PHINode>(V1);
10687 // Check that all incoming values are compatible/from same parent (if they
10688 // are instructions).
10689 // The incoming values are compatible if they all are constants, or
10690 // instructions with the same/alternate opcodes from the same basic block.
10691 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
10692 Value *In = PHI->getIncomingValue(I);
10693 Value *In1 = PHI1->getIncomingValue(I);
10694 if (isConstant(In) && isConstant(In1))
10695 continue;
10696 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
10697 return false;
10698 if (cast<Instruction>(In)->getParent() !=
10699 cast<Instruction>(In1)->getParent())
10700 return false;
10701 }
10702 return true;
10703 };
10704 // Check if the value can be ignored during analysis for shuffled gathers.
10705 // We suppose it is better to ignore instructions which do not form splats,
10706 // are not vectorized/not extractelements (these instructions will be handled
10707 // by extractelements processing) or may form a vector node in the future.
10708 auto MightBeIgnored = [=](Value *V) {
10709 auto *I = dyn_cast<Instruction>(V);
10710 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
10711 !isVectorLikeInstWithConstOps(I) &&
10712 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
10713 };
10714 // Check that the neighbor instruction may form a full vector node with the
10715 // current instruction V. It is possible, if they have same/alternate opcode
10716 // and same parent basic block.
10717 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
10718 Value *V1 = VL[Idx];
10719 bool UsedInSameVTE = false;
10720 auto It = UsedValuesEntry.find(V1);
10721 if (It != UsedValuesEntry.end())
10722 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
10723 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
10724 getSameOpcode({V, V1}, *TLI).getOpcode() &&
10725 cast<Instruction>(V)->getParent() ==
10726 cast<Instruction>(V1)->getParent() &&
10727 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
10728 };
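// For illustration (placeholder names): in VL = {add1, add2, x, y}, where
// add1/add2 are scalar adds from the same block that belong to no tree entry,
// the two adds are treated as ignorable neighbors and left out of EntryLanes,
// since they may later be vectorized as a node of their own.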
10729 // Build a shuffle mask for better cost estimation and vector emission.
10730 SmallBitVector UsedIdxs(Entries.size());
10731 SmallVector<std::pair<unsigned, int>> EntryLanes;
10732 for (int I = 0, E = VL.size(); I < E; ++I) {
10733 Value *V = VL[I];
10734 auto It = UsedValuesEntry.find(V);
10735 if (It == UsedValuesEntry.end())
10736 continue;
10737 // Do not try to shuffle scalars, if they are constants, or instructions
10738 // that can be vectorized as a result of the following vector build
10739 // vectorization.
10740 if (isConstant(V) || (MightBeIgnored(V) &&
10741 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
10742 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
10743 continue;
10744 unsigned Idx = It->second;
10745 EntryLanes.emplace_back(Idx, I);
10746 UsedIdxs.set(Idx);
10747 }
10748 // Iterate through all shuffled scalars and select entries, which can be used
10749 // for final shuffle.
10750 SmallVector<const TreeEntry *> TempEntries;
10751 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
10752 if (!UsedIdxs.test(I))
10753 continue;
10754 // Fix the entry number for the given scalar. If it is the first entry, set
10755 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
10756 // These indices are used when calculating final shuffle mask as the vector
10757 // offset.
10758 for (std::pair<unsigned, int> &Pair : EntryLanes)
10759 if (Pair.first == I)
10760 Pair.first = TempEntries.size();
10761 TempEntries.push_back(Entries[I]);
10762 }
10763 Entries.swap(TempEntries);
10764 if (EntryLanes.size() == Entries.size() &&
10765 !VL.equals(ArrayRef(TE->Scalars)
10766 .slice(Part * VL.size(),
10767 std::min<int>(VL.size(), TE->Scalars.size())))) {
10768 // We may have here 1 or 2 entries only. If the number of scalars is equal
10769 // to the number of entries, no need to do the analysis, it is not very
10770 // profitable. Since VL is not the same as TE->Scalars, it means we already
10771 // have some shuffles before. Cut off not profitable case.
10772 Entries.clear();
10773 return std::nullopt;
10774 }
10775 // Build the final mask, check for the identity shuffle, if possible.
10776 bool IsIdentity = Entries.size() == 1;
10777 // Pair.first is the offset to the vector, while Pair.second is the index of
10778 // scalar in the list.
10779 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
10780 unsigned Idx = Part * VL.size() + Pair.second;
10781 Mask[Idx] =
10782 Pair.first * VF +
10783 (ForOrder ? std::distance(
10784 Entries[Pair.first]->Scalars.begin(),
10785 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
10786 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
10787 IsIdentity &= Mask[Idx] == Pair.second;
10788 }
10789 switch (Entries.size()) {
10790 case 1:
10791 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
10792 return TargetTransformInfo::SK_PermuteSingleSrc;
10793 break;
10794 case 2:
10795 if (EntryLanes.size() > 2 || VL.size() <= 2)
10796 return TargetTransformInfo::SK_PermuteTwoSrc;
10797 break;
10798 default:
10799 break;
10800 }
10801 Entries.clear();
10802 // Clear the corresponding mask elements.
10803 std::fill(std::next(Mask.begin(), Part * VL.size()),
10804 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
10805 return std::nullopt;
10806}
10807
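// isGatherShuffledEntry splits the gathered scalars into NumParts slices (one
// per vector register) and runs the single-register analysis above on each
// slice, collecting a per-part shuffle kind and the matching source entries.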
10808SmallVector<std::optional<TTI::ShuffleKind>>
10809BoUpSLP::isGatherShuffledEntry(
10810 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
10811 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
10812 bool ForOrder) {
10813 assert(NumParts > 0 && NumParts < VL.size() &&
10814 "Expected positive number of registers.");
10815 Entries.clear();
10816 // No need to check for the topmost gather node.
10817 if (TE == VectorizableTree.front().get())
10818 return {};
10819 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
10820 if (TE->isNonPowOf2Vec())
10821 return {};
10822 Mask.assign(VL.size(), PoisonMaskElem);
10823 assert(TE->UserTreeIndices.size() == 1 &&
10824 "Expected only single user of the gather node.");
10825 assert(VL.size() % NumParts == 0 &&
10826 "Number of scalars must be divisible by NumParts.");
10827 unsigned SliceSize = VL.size() / NumParts;
10828 SmallVector<std::optional<TTI::ShuffleKind>> Res;
10829 for (unsigned Part = 0; Part < NumParts; ++Part) {
10830 ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
10831 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
10832 std::optional<TTI::ShuffleKind> SubRes =
10833 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
10834 ForOrder);
10835 if (!SubRes)
10836 SubEntries.clear();
10837 Res.push_back(SubRes);
10838 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
10839 SubEntries.front()->getVectorFactor() == VL.size() &&
10840 (SubEntries.front()->isSame(TE->Scalars) ||
10841 SubEntries.front()->isSame(VL))) {
10842 SmallVector<const TreeEntry *> LocalSubEntries;
10843 LocalSubEntries.swap(SubEntries);
10844 Entries.clear();
10845 Res.clear();
10846 std::iota(Mask.begin(), Mask.end(), 0);
10847 // Clear undef scalars.
10848 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10849 if (isa<PoisonValue>(VL[I]))
10850 Mask[I] = PoisonMaskElem;
10851 Entries.emplace_back(1, LocalSubEntries.front());
10852 Res.push_back(TTI::SK_PermuteSingleSrc);
10853 return Res;
10854 }
10855 }
10856 if (all_of(Res,
10857 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
10858 Entries.clear();
10859 return {};
10860 }
10861 return Res;
10862}
10863
10864InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
10865 bool ForPoisonSrc) const {
10866 // Find the type of the operands in VL.
10867 Type *ScalarTy = VL[0]->getType();
10868 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
10869 ScalarTy = SI->getValueOperand()->getType();
10870 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
10871 bool DuplicateNonConst = false;
10872 // Find the cost of inserting/extracting values from the vector.
10873 // Check if the same elements are inserted several times and count them as
10874 // shuffle candidates.
10875 APInt ShuffledElements = APInt::getZero(VL.size());
10876 DenseMap<Value *, unsigned> UniqueElements;
10877 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10878 InstructionCost Cost;
10879 auto EstimateInsertCost = [&](unsigned I, Value *V) {
10880 if (!ForPoisonSrc)
10881 Cost +=
10882 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
10883 I, Constant::getNullValue(VecTy), V);
10884 };
10885 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10886 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
10887 Value *V = VL[I];
10888 // No need to shuffle duplicates for constants.
10889 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
10890 ShuffledElements.setBit(I);
10891 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
10892 continue;
10893 }
10894
10895 auto Res = UniqueElements.try_emplace(V, I);
10896 if (Res.second) {
10897 EstimateInsertCost(I, V);
10898 ShuffleMask[I] = I;
10899 continue;
10900 }
10901
10902 DuplicateNonConst = true;
10903 ShuffledElements.setBit(I);
10904 ShuffleMask[I] = Res.first->second;
10905 }
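// For illustration (placeholder values): for VL = {a, b, a, c} the second 'a'
// is recorded as a shuffled element (ShuffleMask = {0, 1, 0, 3}), so at most
// one insert is costed for 'a' and one permute covers the duplicate lane.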
10906 if (ForPoisonSrc)
10907 Cost =
10908 TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
10909 /*Extract*/ false, CostKind);
10910 if (DuplicateNonConst)
10911 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
10912 VecTy, ShuffleMask);
10913 return Cost;
10914}
10915
10916// Perform operand reordering on the instructions in VL and return the reordered
10917// operands in Left and Right.
10918void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
10919 SmallVectorImpl<Value *> &Left,
10920 SmallVectorImpl<Value *> &Right,
10921 const BoUpSLP &R) {
10922 if (VL.empty())
10923 return;
10924 VLOperands Ops(VL, R);
10925 // Reorder the operands in place.
10926 Ops.reorder();
10927 Left = Ops.getVL(0);
10928 Right = Ops.getVL(1);
10929}
10930
10931Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
10932 auto &Res = EntryToLastInstruction.FindAndConstruct(E);
10933 if (Res.second)
10934 return *Res.second;
10935 // Get the basic block this bundle is in. All instructions in the bundle
10936 // should be in this block (except for extractelement-like instructions with
10937 // constant indices).
10938 auto *Front = E->getMainOp();
10939 auto *BB = Front->getParent();
10940 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
10941 if (E->getOpcode() == Instruction::GetElementPtr &&
10942 !isa<GetElementPtrInst>(V))
10943 return true;
10944 auto *I = cast<Instruction>(V);
10945 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
10946 isVectorLikeInstWithConstOps(I);
10947 }));
10948
10949 auto FindLastInst = [&]() {
10950 Instruction *LastInst = Front;
10951 for (Value *V : E->Scalars) {
10952 auto *I = dyn_cast<Instruction>(V);
10953 if (!I)
10954 continue;
10955 if (LastInst->getParent() == I->getParent()) {
10956 if (LastInst->comesBefore(I))
10957 LastInst = I;
10958 continue;
10959 }
10960 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10961 !isa<GetElementPtrInst>(I)) ||
10962 (isVectorLikeInstWithConstOps(LastInst) &&
10963 isVectorLikeInstWithConstOps(I))) &&
10964 "Expected vector-like or non-GEP in GEP node insts only.");
10965 if (!DT->isReachableFromEntry(LastInst->getParent())) {
10966 LastInst = I;
10967 continue;
10968 }
10969 if (!DT->isReachableFromEntry(I->getParent()))
10970 continue;
10971 auto *NodeA = DT->getNode(LastInst->getParent());
10972 auto *NodeB = DT->getNode(I->getParent());
10973 assert(NodeA && "Should only process reachable instructions");
10974 assert(NodeB && "Should only process reachable instructions");
10975 assert((NodeA == NodeB) ==
10976 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10977 "Different nodes should have different DFS numbers");
10978 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
10979 LastInst = I;
10980 }
10981 BB = LastInst->getParent();
10982 return LastInst;
10983 };
10984
10985 auto FindFirstInst = [&]() {
10986 Instruction *FirstInst = Front;
10987 for (Value *V : E->Scalars) {
10988 auto *I = dyn_cast<Instruction>(V);
10989 if (!I)
10990 continue;
10991 if (FirstInst->getParent() == I->getParent()) {
10992 if (I->comesBefore(FirstInst))
10993 FirstInst = I;
10994 continue;
10995 }
10996 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10997 !isa<GetElementPtrInst>(I)) ||
10998 (isVectorLikeInstWithConstOps(FirstInst) &&
10999 isVectorLikeInstWithConstOps(I))) &&
11000 "Expected vector-like or non-GEP in GEP node insts only.");
11001 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
11002 FirstInst = I;
11003 continue;
11004 }
11005 if (!DT->isReachableFromEntry(I->getParent()))
11006 continue;
11007 auto *NodeA = DT->getNode(FirstInst->getParent());
11008 auto *NodeB = DT->getNode(I->getParent());
11009 assert(NodeA && "Should only process reachable instructions");
11010 assert(NodeB && "Should only process reachable instructions");
11011 assert((NodeA == NodeB) ==
11012 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11013 "Different nodes should have different DFS numbers");
11014 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11015 FirstInst = I;
11016 }
11017 return FirstInst;
11018 };
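// Both lambdas above compare instructions from different blocks through the
// dominator tree DFS-in numbers, so the "last"/"first" instruction is the one
// whose (reachable) block comes later/earlier in the dominator tree preorder.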
11019
11020 // Set the insert point to the beginning of the basic block if the entry
11021 // should not be scheduled.
11022 if (doesNotNeedToSchedule(E->Scalars) ||
11023 (E->State != TreeEntry::NeedToGather &&
11024 all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
11025 if ((E->getOpcode() == Instruction::GetElementPtr &&
11026 any_of(E->Scalars,
11027 [](Value *V) {
11028 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11029 })) ||
11030 all_of(E->Scalars,
11031 [](Value *V) {
11032 return !isVectorLikeInstWithConstOps(V) &&
11033 isUsedOutsideBlock(V);
11034 }) ||
11035 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
11036 all_of(E->Scalars, [](Value *V) {
11037 return isa<ExtractElementInst, UndefValue>(V) ||
11038 areAllOperandsNonInsts(V);
11039 })))
11040 Res.second = FindLastInst();
11041 else
11042 Res.second = FindFirstInst();
11043 return *Res.second;
11044 }
11045
11046 // Find the last instruction. The common case should be that BB has been
11047 // scheduled, and the last instruction is VL.back(). So we start with
11048 // VL.back() and iterate over schedule data until we reach the end of the
11049 // bundle. The end of the bundle is marked by null ScheduleData.
11050 if (BlocksSchedules.count(BB)) {
11051 Value *V = E->isOneOf(E->Scalars.back());
11052 if (doesNotNeedToBeScheduled(V))
11053 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11054 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11055 if (Bundle && Bundle->isPartOfBundle())
11056 for (; Bundle; Bundle = Bundle->NextInBundle)
11057 if (Bundle->OpValue == Bundle->Inst)
11058 Res.second = Bundle->Inst;
11059 }
11060
11061 // LastInst can still be null at this point if there's either not an entry
11062 // for BB in BlocksSchedules or there's no ScheduleData available for
11063 // VL.back(). This can be the case if buildTree_rec aborts for various
11064 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11065 // size is reached, etc.). ScheduleData is initialized in the scheduling
11066 // "dry-run".
11067 //
11068 // If this happens, we can still find the last instruction by brute force. We
11069 // iterate forwards from Front (inclusive) until we either see all
11070 // instructions in the bundle or reach the end of the block. If Front is the
11071 // last instruction in program order, LastInst will be set to Front, and we
11072 // will visit all the remaining instructions in the block.
11073 //
11074 // One of the reasons we exit early from buildTree_rec is to place an upper
11075 // bound on compile-time. Thus, taking an additional compile-time hit here is
11076 // not ideal. However, this should be exceedingly rare since it requires that
11077 // we both exit early from buildTree_rec and that the bundle be out-of-order
11078 // (causing us to iterate all the way to the end of the block).
11079 if (!Res.second)
11080 Res.second = FindLastInst();
11081 assert(Res.second && "Failed to find last instruction in bundle");
11082 return *Res.second;
11083}
11084
11085void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11086 auto *Front = E->getMainOp();
11087 Instruction *LastInst = &getLastInstructionInBundle(E);
11088 assert(LastInst && "Failed to find last instruction in bundle");
11089 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11090 // If the instruction is PHI, set the insert point after all the PHIs.
11091 bool IsPHI = isa<PHINode>(LastInst);
11092 if (IsPHI)
11093 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11094 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
11095 doesNotNeedToSchedule(E->Scalars))) {
11096 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11097 } else {
11098 // Set the insertion point after the last instruction in the bundle. Set the
11099 // debug location to Front.
11100 Builder.SetInsertPoint(
11101 LastInst->getParent(),
11102 LastInst->getNextNonDebugInstruction()->getIterator());
11103 }
11104 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11105}
11106
11107Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
11108 // List of instructions/lanes from current block and/or the blocks which are
11109 // part of the current loop. These instructions will be inserted at the end to
11110 // make it possible to optimize loops and hoist invariant instructions out of
11111 // the loop's body with better chances for success.
11112 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
11113 SmallSet<int, 4> PostponedIndices;
11114 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11115 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11116 SmallPtrSet<BasicBlock *, 4> Visited;
11117 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11118 InsertBB = InsertBB->getSinglePredecessor();
11119 return InsertBB && InsertBB == InstBB;
11120 };
11121 for (int I = 0, E = VL.size(); I < E; ++I) {
11122 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11123 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11124 getTreeEntry(Inst) ||
11125 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11126 PostponedIndices.insert(I).second)
11127 PostponedInsts.emplace_back(Inst, I);
11128 }
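// For illustration: when this gather is emitted inside a loop, lanes defined
// by instructions of that loop (or of the insertion block) are postponed, so
// the leading chain of insertelements only uses loop-invariant values and can
// still be hoisted out of the loop by later passes.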
11129
11130 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11131 Type *Ty) {
11132 Value *Scalar = V;
11133 if (cast<VectorType>(Vec->getType())->getElementType() != Ty) {
11134 assert(V->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11135 "Expected integer types only.");
11136 Vec = Builder.CreateIntCast(
11137 Vec,
11138 VectorType::get(Ty,
11139 cast<VectorType>(Vec->getType())->getElementCount()),
11140 !isKnownNonNegative(Vec, SimplifyQuery(*DL)));
11141 }
11142
11143 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11144 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11145 if (!InsElt)
11146 return Vec;
11147 GatherShuffleExtractSeq.insert(InsElt);
11148 CSEBlocks.insert(InsElt->getParent());
11149 // Add to our 'need-to-extract' list.
11150 if (isa<Instruction>(V)) {
11151 if (TreeEntry *Entry = getTreeEntry(V)) {
11152 // Find which lane we need to extract.
11153 User *UserOp = nullptr;
11154 if (Scalar != V) {
11155 if (auto *SI = dyn_cast<Instruction>(Scalar))
11156 UserOp = SI;
11157 } else {
11158 UserOp = InsElt;
11159 }
11160 if (UserOp) {
11161 unsigned FoundLane = Entry->findLaneForValue(V);
11162 ExternalUses.emplace_back(V, UserOp, FoundLane);
11163 }
11164 }
11165 }
11166 return Vec;
11167 };
11168 Value *Val0 =
11169 isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
11170 Type *ScalarTy = Val0->getType();
11171 FixedVectorType *VecTy = FixedVectorType::get(ScalarTy, VL.size());
11172 Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11173 SmallVector<int> NonConsts;
11174 // Insert constant values at first.
11175 for (int I = 0, E = VL.size(); I < E; ++I) {
11176 if (PostponedIndices.contains(I))
11177 continue;
11178 if (!isConstant(VL[I])) {
11179 NonConsts.push_back(I);
11180 continue;
11181 }
11182 if (Root) {
11183 if (!isa<UndefValue>(VL[I])) {
11184 NonConsts.push_back(I);
11185 continue;
11186 }
11187 if (isa<PoisonValue>(VL[I]))
11188 continue;
11189 if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11190 if (SV->getMaskValue(I) == PoisonMaskElem)
11191 continue;
11192 }
11193 }
11194 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11195 }
11196 // Insert non-constant values.
11197 for (int I : NonConsts)
11198 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11199 // Append instructions, which are/may be part of the loop, in the end to make
11200 // it possible to hoist non-loop-based instructions.
11201 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11202 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11203
11204 return Vec;
11205}
11206
11207/// Merges shuffle masks and emits final shuffle instruction, if required. It
11208 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
11209 /// the actual shuffle instruction is generated only if it is actually
11210 /// required. Otherwise, the shuffle instruction emission is delayed till the
11211/// end of the process, to reduce the number of emitted instructions and further
11212/// analysis/transformations.
11213/// The class also will look through the previously emitted shuffle instructions
11214/// and properly mark indices in mask as undef.
11215/// For example, given the code
11216/// \code
11217/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11218/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11219/// \endcode
11220 /// and we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
11221/// look through %s1 and %s2 and emit
11222/// \code
11223/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11224/// \endcode
11225/// instead.
11226/// If 2 operands are of different size, the smallest one will be resized and
11227/// the mask recalculated properly.
11228/// For example, given the code
11229/// \code
11230/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11231/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11232/// \endcode
11233 /// and we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
11234/// look through %s1 and %s2 and emit
11235/// \code
11236/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11237/// \endcode
11238/// instead.
11239class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11240 bool IsFinalized = false;
11241 /// Combined mask for all applied operands and masks. It is built during
11242 /// analysis and actual emission of shuffle vector instructions.
11243 SmallVector<int> CommonMask;
11244 /// List of operands for the shuffle vector instruction. It holds at most 2
11245 /// operands; if the 3rd one is going to be added, the first 2 are combined into
11246 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
11247 /// resulting shuffle and the second operand is set to be the newly added
11248 /// operand. The \p CommonMask is transformed accordingly after that.
11249 SmallVector<Value *, 2> InVectors;
11250 IRBuilderBase &Builder;
11251 BoUpSLP &R;
11252
11253 class ShuffleIRBuilder {
11254 IRBuilderBase &Builder;
11255 /// Holds all of the instructions that we gathered.
11256 SetVector<Instruction *> &GatherShuffleExtractSeq;
11257 /// A list of blocks that we are going to CSE.
11258 DenseSet<BasicBlock *> &CSEBlocks;
11259 /// Data layout.
11260 const DataLayout &DL;
11261
11262 public:
11263 ShuffleIRBuilder(IRBuilderBase &Builder,
11264 SetVector<Instruction *> &GatherShuffleExtractSeq,
11265 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11266 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11267 CSEBlocks(CSEBlocks), DL(DL) {}
11268 ~ShuffleIRBuilder() = default;
11269 /// Creates shufflevector for the 2 operands with the given mask.
11270 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11271 if (V1->getType() != V2->getType()) {
11272 assert(V1->getType()->isIntOrIntVectorTy() &&
11273 V2->getType()->isIntOrIntVectorTy() &&
11274 "Expected integer vector types only.");
11275 if (V1->getType() != V2->getType()) {
11276 if (cast<VectorType>(V2->getType())
11277 ->getElementType()
11278 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
11279 ->getElementType()
11280 ->getIntegerBitWidth())
11281 V2 = Builder.CreateIntCast(
11282 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
11283 else
11284 V1 = Builder.CreateIntCast(
11285 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
11286 }
11287 }
11288 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11289 if (auto *I = dyn_cast<Instruction>(Vec)) {
11290 GatherShuffleExtractSeq.insert(I);
11291 CSEBlocks.insert(I->getParent());
11292 }
11293 return Vec;
11294 }
11295 /// Creates permutation of the single vector operand with the given mask, if
11296 /// it is not identity mask.
11297 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11298 if (Mask.empty())
11299 return V1;
11300 unsigned VF = Mask.size();
11301 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
11302 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
11303 return V1;
11304 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
11305 if (auto *I = dyn_cast<Instruction>(Vec)) {
11306 GatherShuffleExtractSeq.insert(I);
11307 CSEBlocks.insert(I->getParent());
11308 }
11309 return Vec;
11310 }
11311 Value *createIdentity(Value *V) { return V; }
11312 Value *createPoison(Type *Ty, unsigned VF) {
11313 return PoisonValue::get(FixedVectorType::get(Ty, VF));
11314 }
11315 /// Resizes 2 input vectors to match their sizes, if they are not equal
11316 /// yet. The smallest vector is resized to the size of the larger vector.
11317 void resizeToMatch(Value *&V1, Value *&V2) {
11318 if (V1->getType() == V2->getType())
11319 return;
11320 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
11321 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11322 int VF = std::max(V1VF, V2VF);
11323 int MinVF = std::min(V1VF, V2VF);
11324 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11325 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
11326 0);
11327 Value *&Op = MinVF == V1VF ? V1 : V2;
11328 Op = Builder.CreateShuffleVector(Op, IdentityMask);
11329 if (auto *I = dyn_cast<Instruction>(Op)) {
11330 GatherShuffleExtractSeq.insert(I);
11331 CSEBlocks.insert(I->getParent());
11332 }
11333 if (MinVF == V1VF)
11334 V1 = Op;
11335 else
11336 V2 = Op;
11337 }
11338 };
11339
11340 /// Smart shuffle instruction emission, walks through shuffles trees and
11341 /// tries to find the best matching vector for the actual shuffle
11342 /// instruction.
11343 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11344 assert(V1 && "Expected at least one vector value.");
11345 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11346 R.CSEBlocks, *R.DL);
11347 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11348 ShuffleBuilder);
11349 }
11350
11351 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
11352 /// shuffle emission.
11353 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11354 ArrayRef<int> Mask) {
11355 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11356 if (Mask[Idx] != PoisonMaskElem)
11357 CommonMask[Idx] = Idx;
11358 }
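// For illustration: after a shuffle for CommonMask = {2, 0, -1, 1} has been
// emitted, transformMaskAfterShuffle rewrites the mask to {0, 1, -1, 3}: lanes
// already produced by the emitted shuffle become an identity over the new
// vector, while poison lanes (-1) stay poison.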
11359
11360public:
11361 ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
11362 : Builder(Builder), R(R) {}
11363
11364 /// Adjusts extractelements after reusing them.
11365 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11366 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11367 unsigned NumParts, bool &UseVecBaseAsInput) {
11368 UseVecBaseAsInput = false;
11369 SmallPtrSet<Value *, 4> UniqueBases;
11370 Value *VecBase = nullptr;
11371 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11372 int Idx = Mask[I];
11373 if (Idx == PoisonMaskElem)
11374 continue;
11375 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
11376 VecBase = EI->getVectorOperand();
11377 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
11378 VecBase = TE->VectorizedValue;
11379 assert(VecBase && "Expected vectorized value.");
11380 UniqueBases.insert(VecBase);
11381 // If the only one use is vectorized - can delete the extractelement
11382 // itself.
11383 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
11384 any_of(EI->users(), [&](User *U) {
11385 const TreeEntry *UTE = R.getTreeEntry(U);
11386 return !UTE || R.MultiNodeScalars.contains(U) ||
11387 count_if(R.VectorizableTree,
11388 [&](const std::unique_ptr<TreeEntry> &TE) {
11389 return any_of(TE->UserTreeIndices,
11390 [&](const EdgeInfo &Edge) {
11391 return Edge.UserTE == UTE;
11392 }) &&
11393 is_contained(TE->Scalars, EI);
11394 }) != 1;
11395 }))
11396 continue;
11397 R.eraseInstruction(EI);
11398 }
11399 if (NumParts == 1 || UniqueBases.size() == 1)
11400 return VecBase;
11401 UseVecBaseAsInput = true;
11402 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11403 for (auto [I, Idx] : enumerate(Mask))
11404 if (Idx != PoisonMaskElem)
11405 Idx = I;
11406 };
11407 // Perform multi-register vector shuffle, joining them into a single virtual
11408 // long vector.
11409 // Need to shuffle each part independently and then insert all these parts
11410 // into a long virtual vector register, forming the original vector.
11411 Value *Vec = nullptr;
11412 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11413 unsigned SliceSize = E->Scalars.size() / NumParts;
11414 for (unsigned Part = 0; Part < NumParts; ++Part) {
11415 ArrayRef<Value *> VL =
11416 ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
11417 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
11418 constexpr int MaxBases = 2;
11419 SmallVector<Value *, MaxBases> Bases(MaxBases);
11420#ifndef NDEBUG
11421 int PrevSize = 0;
11422#endif // NDEBUG
11423 for (const auto [I, V]: enumerate(VL)) {
11424 if (SubMask[I] == PoisonMaskElem)
11425 continue;
11426 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11427 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11428 VecOp = TE->VectorizedValue;
11429 assert(VecOp && "Expected vectorized value.");
11430 const int Size =
11431 cast<FixedVectorType>(VecOp->getType())->getNumElements();
11432#ifndef NDEBUG
11433 assert((PrevSize == Size || PrevSize == 0) &&
11434 "Expected vectors of the same size.");
11435 PrevSize = Size;
11436#endif // NDEBUG
11437 Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
11438 }
11439 if (!Bases.front())
11440 continue;
11441 Value *SubVec;
11442 if (Bases.back()) {
11443 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11444 TransformToIdentity(SubMask);
11445 } else {
11446 SubVec = Bases.front();
11447 }
11448 if (!Vec) {
11449 Vec = SubVec;
11450 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11451 [&](unsigned P) {
11452 ArrayRef<int> SubMask =
11453 Mask.slice(P * SliceSize, SliceSize);
11454 return all_of(SubMask, [](int Idx) {
11455 return Idx == PoisonMaskElem;
11456 });
11457 })) &&
11458 "Expected first part or all previous parts masked.");
11459 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11460 } else {
11461 unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11462 if (Vec->getType() != SubVec->getType()) {
11463 unsigned SubVecVF =
11464 cast<FixedVectorType>(SubVec->getType())->getNumElements();
11465 VF = std::max(VF, SubVecVF);
11466 }
11467 // Adjust SubMask.
11468 for (int &Idx : SubMask)
11469 if (Idx != PoisonMaskElem)
11470 Idx += VF;
11471 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11472 Vec = createShuffle(Vec, SubVec, VecMask);
11473 TransformToIdentity(VecMask);
11474 }
11475 }
11476 copy(VecMask, Mask.begin());
11477 return Vec;
11478 }
11479 /// Checks if the specified entry \p E needs to be delayed because of its
11480 /// dependency nodes.
11481 std::optional<Value *>
11482 needToDelay(const TreeEntry *E,
11483 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
11484 // No need to delay emission if all deps are ready.
11485 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
11486 return all_of(
11487 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
11488 }))
11489 return std::nullopt;
11490 // Postpone gather emission, will be emitted after the end of the
11491 // process to keep correct order.
11492 auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
11493 E->getVectorFactor());
11494 return Builder.CreateAlignedLoad(
11495 VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
11496 MaybeAlign());
11497 }
11498 /// Adds 2 input vectors (in form of tree entries) and the mask for their
11499 /// shuffling.
11500 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
11501 add(E1.VectorizedValue, E2.VectorizedValue, Mask);
11502 }
11503 /// Adds single input vector (in form of tree entry) and the mask for its
11504 /// shuffling.
11505 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
11506 add(E1.VectorizedValue, Mask);
11507 }
11508 /// Adds 2 input vectors and the mask for their shuffling.
11509 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
11510 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
11511 if (InVectors.empty()) {
11512 InVectors.push_back(V1);
11513 InVectors.push_back(V2);
11514 CommonMask.assign(Mask.begin(), Mask.end());
11515 return;
11516 }
11517 Value *Vec = InVectors.front();
11518 if (InVectors.size() == 2) {
11519 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11520 transformMaskAfterShuffle(CommonMask, CommonMask);
11521 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
11522 Mask.size()) {
11523 Vec = createShuffle(Vec, nullptr, CommonMask);
11524 transformMaskAfterShuffle(CommonMask, CommonMask);
11525 }
11526 V1 = createShuffle(V1, V2, Mask);
11527 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11528 if (Mask[Idx] != PoisonMaskElem)
11529 CommonMask[Idx] = Idx + Sz;
11530 InVectors.front() = Vec;
11531 if (InVectors.size() == 2)
11532 InVectors.back() = V1;
11533 else
11534 InVectors.push_back(V1);
11535 }
11536 /// Adds another one input vector and the mask for the shuffling.
11537 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
11538 if (InVectors.empty()) {
11539 if (!isa<FixedVectorType>(V1->getType())) {
11540 V1 = createShuffle(V1, nullptr, CommonMask);
11541 CommonMask.assign(Mask.size(), PoisonMaskElem);
11542 transformMaskAfterShuffle(CommonMask, Mask);
11543 }
11544 InVectors.push_back(V1);
11545 CommonMask.assign(Mask.begin(), Mask.end());
11546 return;
11547 }
11548 const auto *It = find(InVectors, V1);
11549 if (It == InVectors.end()) {
11550 if (InVectors.size() == 2 ||
11551 InVectors.front()->getType() != V1->getType() ||
11552 !isa<FixedVectorType>(V1->getType())) {
11553 Value *V = InVectors.front();
11554 if (InVectors.size() == 2) {
11555 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11556 transformMaskAfterShuffle(CommonMask, CommonMask);
11557 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11558 CommonMask.size()) {
11559 V = createShuffle(InVectors.front(), nullptr, CommonMask);
11560 transformMaskAfterShuffle(CommonMask, CommonMask);
11561 }
11562 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11563 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
11564 CommonMask[Idx] =
11565 V->getType() != V1->getType()
11566 ? Idx + Sz
11567 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
11568 ->getNumElements();
11569 if (V->getType() != V1->getType())
11570 V1 = createShuffle(V1, nullptr, Mask);
11571 InVectors.front() = V;
11572 if (InVectors.size() == 2)
11573 InVectors.back() = V1;
11574 else
11575 InVectors.push_back(V1);
11576 return;
11577 }
11578 // Check if second vector is required if the used elements are already
11579 // used from the first one.
11580 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11581 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
11582 InVectors.push_back(V1);
11583 break;
11584 }
11585 }
11586 int VF = CommonMask.size();
11587 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
11588 VF = FTy->getNumElements();
11589 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11590 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
11591 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
11592 }
11593 /// Adds another one input vector and the mask for the shuffling.
11594 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
11595 SmallVector<int> NewMask;
11596 inversePermutation(Order, NewMask);
11597 add(V1, NewMask);
11598 }
11599 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
11600 Value *Root = nullptr) {
11601 return R.gather(VL, Root);
11602 }
11603 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
11604 /// Finalize emission of the shuffles.
11605 /// \param Action the action (if any) to be performed before final applying of
11606 /// the \p ExtMask mask.
11607 Value *
11608 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
11609 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
11610 IsFinalized = true;
11611 if (Action) {
11612 Value *Vec = InVectors.front();
11613 if (InVectors.size() == 2) {
11614 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11615 InVectors.pop_back();
11616 } else {
11617 Vec = createShuffle(Vec, nullptr, CommonMask);
11618 }
11619 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11620 if (CommonMask[Idx] != PoisonMaskElem)
11621 CommonMask[Idx] = Idx;
11622 assert(VF > 0 &&
11623 "Expected vector length for the final value before action.");
11624 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11625 if (VecVF < VF) {
11626 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
11627 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11628 Vec = createShuffle(Vec, nullptr, ResizeMask);
11629 }
11630 Action(Vec, CommonMask);
11631 InVectors.front() = Vec;
11632 }
11633 if (!ExtMask.empty()) {
11634 if (CommonMask.empty()) {
11635 CommonMask.assign(ExtMask.begin(), ExtMask.end());
11636 } else {
11637 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11638 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11639 if (ExtMask[I] == PoisonMaskElem)
11640 continue;
11641 NewMask[I] = CommonMask[ExtMask[I]];
11642 }
11643 CommonMask.swap(NewMask);
11644 }
11645 }
11646 if (CommonMask.empty()) {
11647 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11648 return InVectors.front();
11649 }
11650 if (InVectors.size() == 2)
11651 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11652 return createShuffle(InVectors.front(), nullptr, CommonMask);
11653 }
11654
11655 ~ShuffleInstructionBuilder() {
11656 assert((IsFinalized || CommonMask.empty()) &&
11657 "Shuffle construction must be finalized.");
11658 }
11659};
11660
11661Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
11662 bool PostponedPHIs) {
11663 ValueList &VL = E->getOperand(NodeIdx);
11664 const unsigned VF = VL.size();
11665 InstructionsState S = getSameOpcode(VL, *TLI);
11666 // Special processing for GEPs bundle, which may include non-gep values.
11667 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11668 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11669 if (It != VL.end())
11670 S = getSameOpcode(*It, *TLI);
11671 }
11672 if (S.getOpcode()) {
11673 auto CheckSameVE = [&](const TreeEntry *VE) {
11674 return VE->isSame(VL) &&
11675 (any_of(VE->UserTreeIndices,
11676 [E, NodeIdx](const EdgeInfo &EI) {
11677 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11678 }) ||
11679 any_of(VectorizableTree,
11680 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
11681 return TE->isOperandGatherNode({E, NodeIdx}) &&
11682 VE->isSame(TE->Scalars);
11683 }));
11684 };
11685 TreeEntry *VE = getTreeEntry(S.OpValue);
11686 bool IsSameVE = VE && CheckSameVE(VE);
11687 if (!IsSameVE) {
11688 auto It = MultiNodeScalars.find(S.OpValue);
11689 if (It != MultiNodeScalars.end()) {
11690 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
11691 return TE != VE && CheckSameVE(TE);
11692 });
11693 if (I != It->getSecond().end()) {
11694 VE = *I;
11695 IsSameVE = true;
11696 }
11697 }
11698 }
11699 if (IsSameVE) {
11700 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
11701 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
11702 ShuffleBuilder.add(V, Mask);
11703 return ShuffleBuilder.finalize(std::nullopt);
11704 };
11705 Value *V = vectorizeTree(VE, PostponedPHIs);
11706 if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
11707 if (!VE->ReuseShuffleIndices.empty()) {
11708 // Reshuffle to get only unique values.
11709 // If some of the scalars are duplicated in the vectorization
11710 // tree entry, we do not vectorize them but instead generate a
11711 // mask for the reuses. But if there are several users of the
11712 // same entry, they may have different vectorization factors.
11713 // This is especially important for PHI nodes. In this case, we
11714 // need to adapt the resulting instruction for the user
11715 // vectorization factor and have to reshuffle it again to take
11716 // only unique elements of the vector. Without this code the
11717 // function incorrectly returns reduced vector instruction with
11718 // the same elements, not with the unique ones.
11719
11720 // block:
11721 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
11722 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
11723 // ... (use %2)
11724 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
11725 // br %block
11726 SmallVector<int> Mask(VF, PoisonMaskElem);
11727 for (auto [I, V] : enumerate(VL)) {
11728 if (isa<PoisonValue>(V))
11729 continue;
11730 Mask[I] = VE->findLaneForValue(V);
11731 }
11732 V = FinalShuffle(V, Mask);
11733 } else {
11734 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
11735 "Expected vectorization factor less "
11736 "than original vector size.");
11737 SmallVector<int> UniformMask(VF, 0);
11738 std::iota(UniformMask.begin(), UniformMask.end(), 0);
11739 V = FinalShuffle(V, UniformMask);
11740 }
11741 }
11742 // Need to update the operand gather node, if actually the operand is not a
11743 // vectorized node, but the buildvector/gather node, which matches one of
11744 // the vectorized nodes.
11745 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
11746 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11747 }) == VE->UserTreeIndices.end()) {
11748 auto *It = find_if(
11749 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11750 return TE->State == TreeEntry::NeedToGather &&
11751 TE->UserTreeIndices.front().UserTE == E &&
11752 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
11753 });
11754 assert(It != VectorizableTree.end() && "Expected gather node operand.");
11755 (*It)->VectorizedValue = V;
11756 }
11757 return V;
11758 }
11759 }
11760
11761 // Find the corresponding gather entry and vectorize it.
11762 // Allows to be more accurate with tree/graph transformations, checks for the
11763 // correctness of the transformations in many cases.
11764 auto *I = find_if(VectorizableTree,
11765 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
11766 return TE->isOperandGatherNode({E, NodeIdx});
11767 });
11768 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
11769 assert(I->get()->UserTreeIndices.size() == 1 &&
11770 "Expected only single user for the gather node.");
11771 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
11772 return vectorizeTree(I->get(), PostponedPHIs);
11773}
11774
11775template <typename BVTy, typename ResTy, typename... Args>
11776ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
11777 assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
11778 unsigned VF = E->getVectorFactor();
11779
11780 bool NeedFreeze = false;
11781 SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
11782 E->ReuseShuffleIndices.end());
11783 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
11784 // Build a mask out of the reorder indices and reorder scalars per this
11785 // mask.
11786 SmallVector<int> ReorderMask;
11787 inversePermutation(E->ReorderIndices, ReorderMask);
11788 if (!ReorderMask.empty())
11789 reorderScalars(GatheredScalars, ReorderMask);
11790 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
11791 unsigned I, unsigned SliceSize) {
11792 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
11793 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
11794 }))
11795 return false;
11796 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
11797 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
11798 if (UserTE->getNumOperands() != 2)
11799 return false;
11800 auto *It =
11801 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
11802 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
11803 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
11804 }) != TE->UserTreeIndices.end();
11805 });
11806 if (It == VectorizableTree.end())
11807 return false;
11808 int Idx;
11809 if ((Mask.size() < InputVF &&
11810 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
11811 Idx == 0) ||
11812 (Mask.size() == InputVF &&
11813 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
11814 std::iota(std::next(Mask.begin(), I * SliceSize),
11815 std::next(Mask.begin(), (I + 1) * SliceSize), 0);
11816 } else {
11817 unsigned IVal =
11818 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
11819 std::fill(std::next(Mask.begin(), I * SliceSize),
11820 std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
11821 }
11822 return true;
11823 };
11824 BVTy ShuffleBuilder(Params...);
11825 ResTy Res = ResTy();
11826 SmallVector<int> Mask;
11827 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
11828 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
11829 Value *ExtractVecBase = nullptr;
11830 bool UseVecBaseAsInput = false;
11831 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
11832 SmallVector<SmallVector<const TreeEntry *>> Entries;
11833 Type *ScalarTy = GatheredScalars.front()->getType();
11834 auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
11835 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11836 if (NumParts == 0 || NumParts >= GatheredScalars.size())
11837 NumParts = 1;
11838 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
11839 // Check for gathered extracts.
11840 bool Resized = false;
11841 ExtractShuffles =
11842 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
11843 if (!ExtractShuffles.empty()) {
11844 SmallVector<const TreeEntry *> ExtractEntries;
11845 for (auto [Idx, I] : enumerate(ExtractMask)) {
11846 if (I == PoisonMaskElem)
11847 continue;
11848 if (const auto *TE = getTreeEntry(
11849 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
11850 ExtractEntries.push_back(TE);
11851 }
11852 if (std::optional<ResTy> Delayed =
11853 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
11854 // Delay emission of gathers which are not ready yet.
11855 PostponedGathers.insert(E);
11856 // Postpone gather emission, will be emitted after the end of the
11857 // process to keep correct order.
11858 return *Delayed;
11859 }
11860 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
11861 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
11862 ExtractVecBase = VecBase;
11863 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
11864 if (VF == VecBaseTy->getNumElements() &&
11865 GatheredScalars.size() != VF) {
11866 Resized = true;
11867 GatheredScalars.append(VF - GatheredScalars.size(),
11868 PoisonValue::get(ScalarTy));
11869 }
11870 }
11871 }
11872 // Gather extracts after we check for full matched gathers only.
11873 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
11874 E->isAltShuffle() ||
11875 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
11876 isSplat(E->Scalars) ||
11877 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
11878 GatherShuffles =
11879 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
11880 }
11881 if (!GatherShuffles.empty()) {
11882 if (std::optional<ResTy> Delayed =
11883 ShuffleBuilder.needToDelay(E, Entries)) {
11884 // Delay emission of gathers which are not ready yet.
11885 PostponedGathers.insert(E);
11886 // Postpone gather emission, will be emitted after the end of the
11887 // process to keep correct order.
11888 return *Delayed;
11889 }
11890 if (GatherShuffles.size() == 1 &&
11891 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
11892 Entries.front().front()->isSame(E->Scalars)) {
11893 // Perfect match in the graph, will reuse the previously vectorized
11894 // node. Cost is 0.
11895 LLVM_DEBUG(
11896 dbgs()
11897 << "SLP: perfect diamond match for gather bundle "
11898 << shortBundleName(E->Scalars) << ".\n");
11899 // Restore the mask for previous partially matched values.
11900 Mask.resize(E->Scalars.size());
11901 const TreeEntry *FrontTE = Entries.front().front();
11902 if (FrontTE->ReorderIndices.empty() &&
11903 ((FrontTE->ReuseShuffleIndices.empty() &&
11904 E->Scalars.size() == FrontTE->Scalars.size()) ||
11905 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
11906 std::iota(Mask.begin(), Mask.end(), 0);
11907 } else {
11908 for (auto [I, V] : enumerate(E->Scalars)) {
11909 if (isa<PoisonValue>(V)) {
11910 Mask[I] = PoisonMaskElem;
11911 continue;
11912 }
11913 Mask[I] = FrontTE->findLaneForValue(V);
11914 }
11915 }
11916 ShuffleBuilder.add(*FrontTE, Mask);
11917 Res = ShuffleBuilder.finalize(E->getCommonMask());
11918 return Res;
11919 }
11920 if (!Resized) {
11921 if (GatheredScalars.size() != VF &&
11922 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
11923 return any_of(TEs, [&](const TreeEntry *TE) {
11924 return TE->getVectorFactor() == VF;
11925 });
11926 }))
11927 GatheredScalars.append(VF - GatheredScalars.size(),
11928 PoisonValue::get(ScalarTy));
11929 }
11930 // Remove shuffled elements from list of gathers.
11931 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11932 if (Mask[I] != PoisonMaskElem)
11933 GatheredScalars[I] = PoisonValue::get(ScalarTy);
11934 }
11935 }
11936 }
11937 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
11938 SmallVectorImpl<int> &ReuseMask,
11939 bool IsRootPoison) {
11940 // For splats we can emit broadcasts instead of gathers, so try to find
11941 // such sequences.
11942 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
11943 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
11944 Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
11945 SmallVector<int> UndefPos;
11946 DenseMap<Value *, unsigned> UniquePositions;
11947 // Gather unique non-const values and all constant values.
11948 // For repeated values, just shuffle them.
11949 int NumNonConsts = 0;
11950 int SinglePos = 0;
11951 for (auto [I, V] : enumerate(Scalars)) {
11952 if (isa<UndefValue>(V)) {
11953 if (!isa<PoisonValue>(V)) {
11954 ReuseMask[I] = I;
11955 UndefPos.push_back(I);
11956 }
11957 continue;
11958 }
11959 if (isConstant(V)) {
11960 ReuseMask[I] = I;
11961 continue;
11962 }
11963 ++NumNonConsts;
11964 SinglePos = I;
11965 Value *OrigV = V;
11966 Scalars[I] = PoisonValue::get(ScalarTy);
11967 if (IsSplat) {
11968 Scalars.front() = OrigV;
11969 ReuseMask[I] = 0;
11970 } else {
11971 const auto Res = UniquePositions.try_emplace(OrigV, I);
11972 Scalars[Res.first->second] = OrigV;
11973 ReuseMask[I] = Res.first->second;
11974 }
11975 }
11976 if (NumNonConsts == 1) {
11977 // Restore single insert element.
11978 if (IsSplat) {
11979 ReuseMask.assign(VF, PoisonMaskElem);
11980 std::swap(Scalars.front(), Scalars[SinglePos]);
11981 if (!UndefPos.empty() && UndefPos.front() == 0)
11982 Scalars.front() = UndefValue::get(ScalarTy);
11983 }
11984 ReuseMask[SinglePos] = SinglePos;
11985 } else if (!UndefPos.empty() && IsSplat) {
11986 // For undef values, try to replace them with the simple broadcast.
11987 // We can do it if the broadcasted value is guaranteed to be
11988 // non-poisonous, or by freezing the incoming scalar value first.
11989 auto *It = find_if(Scalars, [this, E](Value *V) {
11990 return !isa<UndefValue>(V) &&
11991 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
11992 (E->UserTreeIndices.size() == 1 &&
11993 any_of(V->uses(), [E](const Use &U) {
11994 // Check if the value is already used in the same operation in
11995 // one of the nodes.
11996 return E->UserTreeIndices.front().EdgeIdx !=
11997 U.getOperandNo() &&
11998 is_contained(
11999 E->UserTreeIndices.front().UserTE->Scalars,
12000 U.getUser());
12001 })));
12002 });
12003 if (It != Scalars.end()) {
12004 // Replace undefs by the non-poisoned scalars and emit broadcast.
12005 int Pos = std::distance(Scalars.begin(), It);
12006 for (int I : UndefPos) {
12007 // Set the undef position to the non-poisoned scalar.
12008 ReuseMask[I] = Pos;
12009 // Replace the undef with poison; in the mask it has already been
12010 // replaced by the non-poisoned scalar.
12011 if (I != Pos)
12012 Scalars[I] = PoisonValue::get(ScalarTy);
12013 }
12014 } else {
12015 // Replace undefs with poison, emit the broadcast and then emit the
12016 // freeze.
12017 for (int I : UndefPos) {
12018 ReuseMask[I] = PoisonMaskElem;
12019 if (isa<UndefValue>(Scalars[I]))
12020 Scalars[I] = PoisonValue::get(ScalarTy);
12021 }
12022 NeedFreeze = true;
12023 }
12024 }
12025 };
12026 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12027 bool IsNonPoisoned = true;
12028 bool IsUsedInExpr = true;
12029 Value *Vec1 = nullptr;
12030 if (!ExtractShuffles.empty()) {
12031 // A gather of extractelements can be represented as just a shuffle of
12032 // one or two vectors from which the scalars are extracted.
12033 // Find input vectors.
12034 Value *Vec2 = nullptr;
12035 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12036 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12037 ExtractMask[I] = PoisonMaskElem;
12038 }
12039 if (UseVecBaseAsInput) {
12040 Vec1 = ExtractVecBase;
12041 } else {
12042 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12043 if (ExtractMask[I] == PoisonMaskElem)
12044 continue;
12045 if (isa<UndefValue>(E->Scalars[I]))
12046 continue;
12047 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12048 Value *VecOp = EI->getVectorOperand();
12049 if (const auto *TE = getTreeEntry(VecOp))
12050 if (TE->VectorizedValue)
12051 VecOp = TE->VectorizedValue;
12052 if (!Vec1) {
12053 Vec1 = VecOp;
12054 } else if (Vec1 != VecOp) {
12055 assert((!Vec2 || Vec2 == VecOp) &&
12056 "Expected only 1 or 2 vectors shuffle.");
12057 Vec2 = VecOp;
12058 }
12059 }
12060 }
12061 if (Vec2) {
12062 IsUsedInExpr = false;
12063 IsNonPoisoned &=
12064 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12065 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12066 } else if (Vec1) {
12067 IsUsedInExpr &= FindReusedSplat(
12068 ExtractMask,
12069 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12070 ExtractMask.size());
12071 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12072 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12073 } else {
12074 IsUsedInExpr = false;
12075 ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
12076 ScalarTy, GatheredScalars.size())),
12077 ExtractMask, /*ForExtracts=*/true);
12078 }
12079 }
12080 if (!GatherShuffles.empty()) {
12081 unsigned SliceSize = E->Scalars.size() / NumParts;
12082 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12083 for (const auto [I, TEs] : enumerate(Entries)) {
12084 if (TEs.empty()) {
12085 assert(!GatherShuffles[I] &&
12086 "No shuffles with empty entries list expected.");
12087 continue;
12088 }
12089 assert((TEs.size() == 1 || TEs.size() == 2) &&
12090 "Expected shuffle of 1 or 2 entries.");
12091 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
12092 VecMask.assign(VecMask.size(), PoisonMaskElem);
12093 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12094 if (TEs.size() == 1) {
12095 IsUsedInExpr &= FindReusedSplat(
12096 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12097 ShuffleBuilder.add(*TEs.front(), VecMask);
12098 if (TEs.front()->VectorizedValue)
12099 IsNonPoisoned &=
12100 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12101 } else {
12102 IsUsedInExpr = false;
12103 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12104 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12105 IsNonPoisoned &=
12106 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12107 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12108 }
12109 }
12110 }
12111 // Try to figure out best way to combine values: build a shuffle and insert
12112 // elements or just build several shuffles.
12113 // Insert non-constant scalars.
12114 SmallVector<Value *> NonConstants(GatheredScalars);
12115 int EMSz = ExtractMask.size();
12116 int MSz = Mask.size();
12117 // Try to build constant vector and shuffle with it only if currently we
12118 // have a single permutation and more than 1 scalar constants.
12119 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12120 bool IsIdentityShuffle =
12121 ((UseVecBaseAsInput ||
12122 all_of(ExtractShuffles,
12123 [](const std::optional<TTI::ShuffleKind> &SK) {
12124 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12125 TTI::SK_PermuteSingleSrc;
12126 })) &&
12127 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12128 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
12129 (!GatherShuffles.empty() &&
12130 all_of(GatherShuffles,
12131 [](const std::optional<TTI::ShuffleKind> &SK) {
12132 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12133 TTI::SK_PermuteSingleSrc;
12134 }) &&
12135 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12136 ShuffleVectorInst::isIdentityMask(Mask, MSz));
12137 bool EnoughConstsForShuffle =
12138 IsSingleShuffle &&
12139 (none_of(GatheredScalars,
12140 [](Value *V) {
12141 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12142 }) ||
12143 any_of(GatheredScalars,
12144 [](Value *V) {
12145 return isa<Constant>(V) && !isa<UndefValue>(V);
12146 })) &&
12147 (!IsIdentityShuffle ||
12148 (GatheredScalars.size() == 2 &&
12149 any_of(GatheredScalars,
12150 [](Value *V) { return !isa<UndefValue>(V); })) ||
12151 count_if(GatheredScalars, [](Value *V) {
12152 return isa<Constant>(V) && !isa<PoisonValue>(V);
12153 }) > 1);
12154 // The NonConstants array contains just the non-constant values, GatheredScalars
12155 // contains only constants to build the final vector and then shuffle.
12156 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12157 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
12158 NonConstants[I] = PoisonValue::get(ScalarTy);
12159 else
12160 GatheredScalars[I] = PoisonValue::get(ScalarTy);
12161 }
12162 // Generate constants for final shuffle and build a mask for them.
12163 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12164 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12165 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12166 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12167 ShuffleBuilder.add(BV, BVMask);
12168 }
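// If every remaining non-constant slot is poison (or an undef already
// covered by the shuffles added above), the constant buildvector is enough;
// otherwise the non-constant scalars are gathered via the action passed to
// finalize().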
12169 if (all_of(NonConstants, [=](Value *V) {
12170 return isa<PoisonValue>(V) ||
12171 (IsSingleShuffle && ((IsIdentityShuffle &&
12172 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12173 }))
12174 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12175 else
12176 Res = ShuffleBuilder.finalize(
12177 E->ReuseShuffleIndices, E->Scalars.size(),
12178 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12179 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12180 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12181 });
12182 } else if (!allConstant(GatheredScalars)) {
12183 // Gather unique scalars and all constants.
12184 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12185 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12186 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12187 ShuffleBuilder.add(BV, ReuseMask);
12188 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12189 } else {
12190 // Gather all constants.
12191 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12192 for (auto [I, V] : enumerate(E->Scalars)) {
12193 if (!isa<PoisonValue>(V))
12194 Mask[I] = I;
12195 }
12196 Value *BV = ShuffleBuilder.gather(E->Scalars);
12197 ShuffleBuilder.add(BV, Mask);
12198 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12199 }
12200
12201 if (NeedFreeze)
12202 Res = ShuffleBuilder.createFreeze(Res);
12203 return Res;
12204}
12205
12206Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
12207 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
12208 *this);
12209}
12210
12211Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12212 IRBuilderBase::InsertPointGuard Guard(Builder);
12213
12214 if (E->VectorizedValue &&
12215 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12216 E->isAltShuffle())) {
12217 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12218 return E->VectorizedValue;
12219 }
12220
12221 if (E->State == TreeEntry::NeedToGather) {
12222 // Set insert point for non-reduction initial nodes.
12223 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12224 setInsertPointAfterBundle(E);
12225 Value *Vec = createBuildVector(E);
12226 E->VectorizedValue = Vec;
12227 return Vec;
12228 }
12229
12230 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
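// Applies the entry's reordering (or, for stores, the raw reorder mask) and
// the reuse shuffle to a freshly built vector value.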
12231 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12232 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
12233 if (E->getOpcode() == Instruction::Store) {
12234 ArrayRef<int> Mask =
12235 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12236 E->ReorderIndices.size());
12237 ShuffleBuilder.add(V, Mask);
12238 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12239 ShuffleBuilder.addOrdered(V, std::nullopt);
12240 } else {
12241 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12242 }
12243 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12244 };
12245
12246 assert((E->State == TreeEntry::Vectorize ||
12247 E->State == TreeEntry::ScatterVectorize ||
12248 E->State == TreeEntry::StridedVectorize) &&
12249 "Unhandled state");
12250 unsigned ShuffleOrOp =
12251 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12252 Instruction *VL0 = E->getMainOp();
12253 Type *ScalarTy = VL0->getType();
12254 if (auto *Store = dyn_cast<StoreInst>(VL0))
12255 ScalarTy = Store->getValueOperand()->getType();
12256 else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
12257 ScalarTy = IE->getOperand(1)->getType();
12258 auto It = MinBWs.find(E);
12259 if (It != MinBWs.end())
12260 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
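// Returns true if the operand at Idx has to be treated as signed when
// emitting the casts required by bitwidth minimization (MinBWs).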
12261 auto GetOperandSignedness = [&](unsigned Idx) {
12262 const TreeEntry *OpE = getOperandEntry(E, Idx);
12263 bool IsSigned = false;
12264 auto It = MinBWs.find(OpE);
12265 if (It != MinBWs.end())
12266 IsSigned = It->second.second;
12267 else
12268 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
12269 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12270 });
12271 return IsSigned;
12272 };
12273 auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
12274 switch (ShuffleOrOp) {
12275 case Instruction::PHI: {
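// PHIs are emitted in two phases: first an empty vector PHI is created
// (PostponedPHIs), then, once the operands have been vectorized, the
// incoming values are filled in per predecessor block.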
12276 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12277 E != VectorizableTree.front().get() ||
12278 !E->UserTreeIndices.empty()) &&
12279 "PHI reordering is free.");
12280 if (PostponedPHIs && E->VectorizedValue)
12281 return E->VectorizedValue;
12282 auto *PH = cast<PHINode>(VL0);
12283 Builder.SetInsertPoint(PH->getParent(),
12284 PH->getParent()->getFirstNonPHIIt());
12285 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12286 if (PostponedPHIs || !E->VectorizedValue) {
12287 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
12288 E->PHI = NewPhi;
12289 Value *V = NewPhi;
12290
12291 // Adjust insertion point once all PHI's have been generated.
12292 Builder.SetInsertPoint(PH->getParent(),
12293 PH->getParent()->getFirstInsertionPt());
12294 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12295
12296 V = FinalShuffle(V, E, VecTy);
12297
12298 E->VectorizedValue = V;
12299 if (PostponedPHIs)
12300 return V;
12301 }
12302 PHINode *NewPhi = cast<PHINode>(E->PHI);
12303 // If phi node is fully emitted - exit.
12304 if (NewPhi->getNumIncomingValues() != 0)
12305 return NewPhi;
12306
12307 // PHINodes may have multiple entries from the same block. We want to
12308 // visit every block once.
12309 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12310
12311 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12313 BasicBlock *IBB = PH->getIncomingBlock(I);
12314
12315 // Stop emission if all incoming values are generated.
12316 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12317 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12318 return NewPhi;
12319 }
12320
12321 if (!VisitedBBs.insert(IBB).second) {
12322 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
12323 continue;
12324 }
12325
12326 Builder.SetInsertPoint(IBB->getTerminator());
12327 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12328 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
12329 if (VecTy != Vec->getType()) {
12330 assert((It != MinBWs.end() ||
12331 getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
12332 MinBWs.contains(getOperandEntry(E, I))) &&
12333 "Expected item in MinBWs.");
12334 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
12335 }
12336 NewPhi->addIncoming(Vec, IBB);
12337 }
12338
12339 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12340 "Invalid number of incoming values");
12341 return NewPhi;
12342 }
12343
12344 case Instruction::ExtractElement: {
12345 Value *V = E->getSingleOperand(0);
12346 if (const TreeEntry *TE = getTreeEntry(V))
12347 V = TE->VectorizedValue;
12348 setInsertPointAfterBundle(E);
12349 V = FinalShuffle(V, E, VecTy);
12350 E->VectorizedValue = V;
12351 return V;
12352 }
12353 case Instruction::ExtractValue: {
12354 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12355 Builder.SetInsertPoint(LI);
12356 Value *Ptr = LI->getPointerOperand();
12357 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
12358 Value *NewV = propagateMetadata(V, E->Scalars);
12359 NewV = FinalShuffle(NewV, E, VecTy);
12360 E->VectorizedValue = NewV;
12361 return NewV;
12362 }
12363 case Instruction::InsertElement: {
12364 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12365 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
12366 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
12367 ArrayRef<Value *> Op = E->getOperand(1);
12368 Type *ScalarTy = Op.front()->getType();
12369 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
12370 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12371 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
12372 assert(Res.first > 0 && "Expected item in MinBWs.");
12373 V = Builder.CreateIntCast(
12374 V,
12375 FixedVectorType::get(
12376 ScalarTy,
12377 cast<FixedVectorType>(V->getType())->getNumElements()),
12378 Res.second);
12379 }
12380
12381 // Create InsertVector shuffle if necessary
12382 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
12383 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12384 }));
12385 const unsigned NumElts =
12386 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12387 const unsigned NumScalars = E->Scalars.size();
12388
12389 unsigned Offset = *getInsertIndex(VL0);
12390 assert(Offset < NumElts && "Failed to find vector index offset");
12391
12392 // Create shuffle to resize vector
12393 SmallVector<int> Mask;
12394 if (!E->ReorderIndices.empty()) {
12395 inversePermutation(E->ReorderIndices, Mask);
12396 Mask.append(NumElts - NumScalars, PoisonMaskElem);
12397 } else {
12398 Mask.assign(NumElts, PoisonMaskElem);
12399 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
12400 }
12401 // Create InsertVector shuffle if necessary
12402 bool IsIdentity = true;
12403 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12404 Mask.swap(PrevMask);
12405 for (unsigned I = 0; I < NumScalars; ++I) {
12406 Value *Scalar = E->Scalars[PrevMask[I]];
12407 unsigned InsertIdx = *getInsertIndex(Scalar);
12408 IsIdentity &= InsertIdx - Offset == I;
12409 Mask[InsertIdx - Offset] = I;
12410 }
12411 if (!IsIdentity || NumElts != NumScalars) {
12412 Value *V2 = nullptr;
12413 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12414 SmallVector<int> InsertMask(Mask);
12415 if (NumElts != NumScalars && Offset == 0) {
12416 // Follow all insert element instructions from the current buildvector
12417 // sequence.
12418 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
12419 do {
12420 std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
12421 if (!InsertIdx)
12422 break;
12423 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12424 InsertMask[*InsertIdx] = *InsertIdx;
12425 if (!Ins->hasOneUse())
12426 break;
12427 Ins = dyn_cast_or_null<InsertElementInst>(
12428 Ins->getUniqueUndroppableUser());
12429 } while (Ins);
12430 SmallBitVector UseMask =
12431 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12432 SmallBitVector IsFirstPoison =
12433 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12434 SmallBitVector IsFirstUndef =
12435 isUndefVector(FirstInsert->getOperand(0), UseMask);
12436 if (!IsFirstPoison.all()) {
12437 unsigned Idx = 0;
12438 for (unsigned I = 0; I < NumElts; I++) {
12439 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
12440 IsFirstUndef.test(I)) {
12441 if (IsVNonPoisonous) {
12442 InsertMask[I] = I < NumScalars ? I : 0;
12443 continue;
12444 }
12445 if (!V2)
12446 V2 = UndefValue::get(V->getType());
12447 if (Idx >= NumScalars)
12448 Idx = NumScalars - 1;
12449 InsertMask[I] = NumScalars + Idx;
12450 ++Idx;
12451 } else if (InsertMask[I] != PoisonMaskElem &&
12452 Mask[I] == PoisonMaskElem) {
12453 InsertMask[I] = PoisonMaskElem;
12454 }
12455 }
12456 } else {
12457 InsertMask = Mask;
12458 }
12459 }
12460 if (!V2)
12461 V2 = PoisonValue::get(V->getType());
12462 V = Builder.CreateShuffleVector(V, V2, InsertMask);
12463 if (auto *I = dyn_cast<Instruction>(V)) {
12464 GatherShuffleExtractSeq.insert(I);
12465 CSEBlocks.insert(I->getParent());
12466 }
12467 }
12468
12469 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
12470 for (unsigned I = 0; I < NumElts; I++) {
12471 if (Mask[I] != PoisonMaskElem)
12472 InsertMask[Offset + I] = I;
12473 }
12474 SmallBitVector UseMask =
12475 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12476 SmallBitVector IsFirstUndef =
12477 isUndefVector(FirstInsert->getOperand(0), UseMask);
12478 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
12479 NumElts != NumScalars) {
12480 if (IsFirstUndef.all()) {
12481 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
12482 SmallBitVector IsFirstPoison =
12483 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12484 if (!IsFirstPoison.all()) {
12485 for (unsigned I = 0; I < NumElts; I++) {
12486 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
12487 InsertMask[I] = I + NumElts;
12488 }
12489 }
12490 V = Builder.CreateShuffleVector(
12491 V,
12492 IsFirstPoison.all() ? PoisonValue::get(V->getType())
12493 : FirstInsert->getOperand(0),
12494 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
12495 if (auto *I = dyn_cast<Instruction>(V)) {
12496 GatherShuffleExtractSeq.insert(I);
12497 CSEBlocks.insert(I->getParent());
12498 }
12499 }
12500 } else {
12501 SmallBitVector IsFirstPoison =
12502 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12503 for (unsigned I = 0; I < NumElts; I++) {
12504 if (InsertMask[I] == PoisonMaskElem)
12505 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
12506 else
12507 InsertMask[I] += NumElts;
12508 }
12509 V = Builder.CreateShuffleVector(
12510 FirstInsert->getOperand(0), V, InsertMask,
12511 cast<Instruction>(E->Scalars.back())->getName());
12512 if (auto *I = dyn_cast<Instruction>(V)) {
12513 GatherShuffleExtractSeq.insert(I);
12514 CSEBlocks.insert(I->getParent());
12515 }
12516 }
12517 }
12518
12519 ++NumVectorInstructions;
12520 E->VectorizedValue = V;
12521 return V;
12522 }
12523 case Instruction::ZExt:
12524 case Instruction::SExt:
12525 case Instruction::FPToUI:
12526 case Instruction::FPToSI:
12527 case Instruction::FPExt:
12528 case Instruction::PtrToInt:
12529 case Instruction::IntToPtr:
12530 case Instruction::SIToFP:
12531 case Instruction::UIToFP:
12532 case Instruction::Trunc:
12533 case Instruction::FPTrunc:
12534 case Instruction::BitCast: {
12535 setInsertPointAfterBundle(E);
12536
12537 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
12538 if (E->VectorizedValue) {
12539 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12540 return E->VectorizedValue;
12541 }
12542
12543 auto *CI = cast<CastInst>(VL0);
12544 Instruction::CastOps VecOpcode = CI->getOpcode();
12545 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
12546 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
12547 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
12548 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
12549 SrcScalarTy != CI->getOperand(0)->getType())) {
12550 // Check if the values are candidates to demote.
12551 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
12552 if (SrcIt != MinBWs.end())
12553 SrcBWSz = SrcIt->second.first;
12554 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
12555 if (BWSz == SrcBWSz) {
12556 VecOpcode = Instruction::BitCast;
12557 } else if (BWSz < SrcBWSz) {
12558 VecOpcode = Instruction::Trunc;
12559 } else if (It != MinBWs.end()) {
12560 assert(BWSz > SrcBWSz && "Invalid cast!");
12561 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12562 } else if (SrcIt != MinBWs.end()) {
12563 assert(BWSz > SrcBWSz && "Invalid cast!");
12564 VecOpcode =
12565 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12566 }
12567 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
12568 !SrcIt->second.second) {
12569 VecOpcode = Instruction::UIToFP;
12570 }
12571 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12572 ? InVec
12573 : Builder.CreateCast(VecOpcode, InVec, VecTy);
12574 V = FinalShuffle(V, E, VecTy);
12575
12576 E->VectorizedValue = V;
12577 ++NumVectorInstructions;
12578 return V;
12579 }
12580 case Instruction::FCmp:
12581 case Instruction::ICmp: {
12582 setInsertPointAfterBundle(E);
12583
12584 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
12585 if (E->VectorizedValue) {
12586 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12587 return E->VectorizedValue;
12588 }
12589 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
12590 if (E->VectorizedValue) {
12591 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12592 return E->VectorizedValue;
12593 }
12594 if (L->getType() != R->getType()) {
12595 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12596 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12597 MinBWs.contains(getOperandEntry(E, 0)) ||
12598 MinBWs.contains(getOperandEntry(E, 1))) &&
12599 "Expected item in MinBWs.");
12600 if (cast<VectorType>(L->getType())
12601 ->getElementType()
12602 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
12603 ->getElementType()
12604 ->getIntegerBitWidth()) {
12605 Type *CastTy = R->getType();
12606 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
12607 } else {
12608 Type *CastTy = L->getType();
12609 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
12610 }
12611 }
12612
12613 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
12614 Value *V = Builder.CreateCmp(P0, L, R);
12615 propagateIRFlags(V, E->Scalars, VL0);
12616 // Do not cast for cmps.
12617 VecTy = cast<FixedVectorType>(V->getType());
12618 V = FinalShuffle(V, E, VecTy);
12619
12620 E->VectorizedValue = V;
12621 ++NumVectorInstructions;
12622 return V;
12623 }
12624 case Instruction::Select: {
12625 setInsertPointAfterBundle(E);
12626
12627 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
12628 if (E->VectorizedValue) {
12629 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12630 return E->VectorizedValue;
12631 }
12632 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
12633 if (E->VectorizedValue) {
12634 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12635 return E->VectorizedValue;
12636 }
12637 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
12638 if (E->VectorizedValue) {
12639 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12640 return E->VectorizedValue;
12641 }
12642 if (True->getType() != VecTy || False->getType() != VecTy) {
12643 assert((It != MinBWs.end() ||
12644 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12645 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
12646 MinBWs.contains(getOperandEntry(E, 1)) ||
12647 MinBWs.contains(getOperandEntry(E, 2))) &&
12648 "Expected item in MinBWs.");
12649 if (True->getType() != VecTy)
12650 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
12651 if (False->getType() != VecTy)
12652 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
12653 }
12654
12655 Value *V = Builder.CreateSelect(Cond, True, False);
12656 V = FinalShuffle(V, E, VecTy);
12657
12658 E->VectorizedValue = V;
12659 ++NumVectorInstructions;
12660 return V;
12661 }
12662 case Instruction::FNeg: {
12663 setInsertPointAfterBundle(E);
12664
12665 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
12666
12667 if (E->VectorizedValue) {
12668 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12669 return E->VectorizedValue;
12670 }
12671
12672 Value *V = Builder.CreateUnOp(
12673 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
12674 propagateIRFlags(V, E->Scalars, VL0);
12675 if (auto *I = dyn_cast<Instruction>(V))
12676 V = propagateMetadata(I, E->Scalars);
12677
12678 V = FinalShuffle(V, E, VecTy);
12679
12680 E->VectorizedValue = V;
12681 ++NumVectorInstructions;
12682
12683 return V;
12684 }
12685 case Instruction::Add:
12686 case Instruction::FAdd:
12687 case Instruction::Sub:
12688 case Instruction::FSub:
12689 case Instruction::Mul:
12690 case Instruction::FMul:
12691 case Instruction::UDiv:
12692 case Instruction::SDiv:
12693 case Instruction::FDiv:
12694 case Instruction::URem:
12695 case Instruction::SRem:
12696 case Instruction::FRem:
12697 case Instruction::Shl:
12698 case Instruction::LShr:
12699 case Instruction::AShr:
12700 case Instruction::And:
12701 case Instruction::Or:
12702 case Instruction::Xor: {
12703 setInsertPointAfterBundle(E);
12704
12705 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
12706 if (E->VectorizedValue) {
12707 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12708 return E->VectorizedValue;
12709 }
12710 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
12711 if (E->VectorizedValue) {
12712 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12713 return E->VectorizedValue;
12714 }
12715 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
12716 assert((It != MinBWs.end() ||
12717 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12718 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12719 MinBWs.contains(getOperandEntry(E, 0)) ||
12720 MinBWs.contains(getOperandEntry(E, 1))) &&
12721 "Expected item in MinBWs.");
12722 if (LHS->getType() != VecTy)
12723 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
12724 if (RHS->getType() != VecTy)
12725 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
12726 }
12727
12728 Value *V = Builder.CreateBinOp(
12729 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
12730 RHS);
12731 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
12732 if (auto *I = dyn_cast<Instruction>(V)) {
12733 V = propagateMetadata(I, E->Scalars);
12734 // Drop nuw flags for abs(sub(commutative), true).
12735 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
12736 any_of(E->Scalars, [](Value *V) {
12737 return isCommutative(cast<Instruction>(V));
12738 }))
12739 I->setHasNoUnsignedWrap(/*b=*/false);
12740 }
12741
12742 V = FinalShuffle(V, E, VecTy);
12743
12744 E->VectorizedValue = V;
12745 ++NumVectorInstructions;
12746
12747 return V;
12748 }
12749 case Instruction::Load: {
12750 // Loads are inserted at the head of the tree because we don't want to
12751 // sink them all the way down past store instructions.
12752 setInsertPointAfterBundle(E);
12753
12754 LoadInst *LI = cast<LoadInst>(VL0);
12755 Instruction *NewLI;
12756 Value *PO = LI->getPointerOperand();
12757 if (E->State == TreeEntry::Vectorize) {
12758 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
12759 } else if (E->State == TreeEntry::StridedVectorize) {
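// Strided loads are emitted as llvm.experimental.vp.strided.load. The byte
// stride is either derived from the constant distance between the first and
// last pointers or computed at runtime, and is negated for reversed orders.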
12760 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
12761 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
12762 PO = IsReverseOrder ? PtrN : Ptr0;
12763 std::optional<int> Diff = getPointersDiff(
12764 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
12765 Type *StrideTy = DL->getIndexType(PO->getType());
12766 Value *StrideVal;
12767 if (Diff) {
12768 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
12769 StrideVal =
12770 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
12771 DL->getTypeAllocSize(ScalarTy));
12772 } else {
12773 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
12774 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
12775 return cast<LoadInst>(V)->getPointerOperand();
12776 });
12777 OrdersType Order;
12778 std::optional<Value *> Stride =
12779 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
12780 &*Builder.GetInsertPoint());
12781 Value *NewStride =
12782 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
12783 StrideVal = Builder.CreateMul(
12784 NewStride,
12785 ConstantInt::get(
12786 StrideTy,
12787 (IsReverseOrder ? -1 : 1) *
12788 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
12789 }
12790 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
12791 auto *Inst = Builder.CreateIntrinsic(
12792 Intrinsic::experimental_vp_strided_load,
12793 {VecTy, PO->getType(), StrideTy},
12794 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
12795 Builder.getInt32(E->Scalars.size())});
12796 Inst->addParamAttr(
12797 /*ArgNo=*/0,
12798 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
12799 NewLI = Inst;
12800 } else {
12801 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
12802 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
12803 if (E->VectorizedValue) {
12804 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12805 return E->VectorizedValue;
12806 }
12807 // Use the minimum alignment of the gathered loads.
12808 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
12809 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
12810 }
12811 Value *V = propagateMetadata(NewLI, E->Scalars);
12812
12813 V = FinalShuffle(V, E, VecTy);
12814 E->VectorizedValue = V;
12815 ++NumVectorInstructions;
12816 return V;
12817 }
12818 case Instruction::Store: {
12819 auto *SI = cast<StoreInst>(VL0);
12820
12821 setInsertPointAfterBundle(E);
12822
12823 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
12824 if (VecValue->getType() != VecTy)
12825 VecValue =
12826 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
12827 VecValue = FinalShuffle(VecValue, E, VecTy);
12828
12829 Value *Ptr = SI->getPointerOperand();
12830 StoreInst *ST =
12831 Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
12832
12833 Value *V = propagateMetadata(ST, E->Scalars);
12834
12835 E->VectorizedValue = V;
12836 ++NumVectorInstructions;
12837 return V;
12838 }
12839 case Instruction::GetElementPtr: {
12840 auto *GEP0 = cast<GetElementPtrInst>(VL0);
12841 setInsertPointAfterBundle(E);
12842
12843 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
12844 if (E->VectorizedValue) {
12845 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12846 return E->VectorizedValue;
12847 }
12848
12849 SmallVector<Value *> OpVecs;
12850 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
12851 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
12852 if (E->VectorizedValue) {
12853 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12854 return E->VectorizedValue;
12855 }
12856 OpVecs.push_back(OpVec);
12857 }
12858
12859 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
12860 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
12861 SmallVector<Value *> GEPs;
12862 for (Value *V : E->Scalars) {
12863 if (isa<GetElementPtrInst>(V))
12864 GEPs.push_back(V);
12865 }
12866 V = propagateMetadata(I, GEPs);
12867 }
12868
12869 V = FinalShuffle(V, E, VecTy);
12870
12871 E->VectorizedValue = V;
12872 ++NumVectorInstructions;
12873
12874 return V;
12875 }
12876 case Instruction::Call: {
12877 CallInst *CI = cast<CallInst>(VL0);
12878 setInsertPointAfterBundle(E);
12879
12880 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12881
12882 SmallVector<Type *> ArgTys =
12883 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
12884 It != MinBWs.end() ? It->second.first : 0);
12885 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
12886 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
12887 VecCallCosts.first <= VecCallCosts.second;
12888
12889 Value *ScalarArg = nullptr;
12890 SmallVector<Value *> OpVecs;
12891 SmallVector<Type *, 2> TysForDecl;
12892 // Add return type if intrinsic is overloaded on it.
12893 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
12894 TysForDecl.push_back(VecTy);
12895 auto *CEI = cast<CallInst>(VL0);
12896 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
12897 ValueList OpVL;
12898 // Some intrinsics have scalar arguments. This argument should not be
12899 // vectorized.
12900 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
12901 ScalarArg = CEI->getArgOperand(I);
12902 // If we decided to reduce the bitwidth of the abs intrinsic, its second
12903 // argument must be set to false (do not return poison if the value is signed min).
12904 if (ID == Intrinsic::abs && It != MinBWs.end() &&
12905 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
12906 ScalarArg = Builder.getFalse();
12907 OpVecs.push_back(ScalarArg);
12908 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
12909 TysForDecl.push_back(ScalarArg->getType());
12910 continue;
12911 }
12912
12913 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
12914 if (E->VectorizedValue) {
12915 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12916 return E->VectorizedValue;
12917 }
12918 ScalarArg = CEI->getArgOperand(I);
12919 if (cast<VectorType>(OpVec->getType())->getElementType() !=
12920 ScalarArg->getType() &&
12921 It == MinBWs.end()) {
12922 auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
12923 VecTy->getNumElements());
12924 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
12925 } else if (It != MinBWs.end()) {
12926 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
12927 }
12928 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
12929 OpVecs.push_back(OpVec);
12930 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
12931 TysForDecl.push_back(OpVec->getType());
12932 }
12933
12934 Function *CF;
12935 if (!UseIntrinsic) {
12936 VFShape Shape =
12937 VFShape::get(CI->getFunctionType(),
12938 ElementCount::getFixed(
12939 static_cast<unsigned>(VecTy->getNumElements())),
12940 false /*HasGlobalPred*/);
12941 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
12942 } else {
12943 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
12944 }
12945
12946 SmallVector<OperandBundleDef, 1> OpBundles;
12947 CI->getOperandBundlesAsDefs(OpBundles);
12948 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
12949
12950 propagateIRFlags(V, E->Scalars, VL0);
12951 V = FinalShuffle(V, E, VecTy);
12952
12953 E->VectorizedValue = V;
12954 ++NumVectorInstructions;
12955 return V;
12956 }
12957 case Instruction::ShuffleVector: {
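// Alternate-opcode node: emit both the main and the alternate operation
// over the whole vector and blend their lanes with a shufflevector whose
// mask is produced by buildAltOpShuffleMask.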
12958 assert(E->isAltShuffle() &&
12959 ((Instruction::isBinaryOp(E->getOpcode()) &&
12960 Instruction::isBinaryOp(E->getAltOpcode())) ||
12961 (Instruction::isCast(E->getOpcode()) &&
12962 Instruction::isCast(E->getAltOpcode())) ||
12963 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
12964 "Invalid Shuffle Vector Operand");
12965
12966 Value *LHS = nullptr, *RHS = nullptr;
12967 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
12968 setInsertPointAfterBundle(E);
12969 LHS = vectorizeOperand(E, 0, PostponedPHIs);
12970 if (E->VectorizedValue) {
12971 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12972 return E->VectorizedValue;
12973 }
12974 RHS = vectorizeOperand(E, 1, PostponedPHIs);
12975 } else {
12976 setInsertPointAfterBundle(E);
12977 LHS = vectorizeOperand(E, 0, PostponedPHIs);
12978 }
12979 if (E->VectorizedValue) {
12980 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12981 return E->VectorizedValue;
12982 }
12983 if (LHS && RHS &&
12984 ((Instruction::isBinaryOp(E->getOpcode()) &&
12985 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
12986 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
12987 assert((It != MinBWs.end() ||
12988 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12989 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12990 MinBWs.contains(getOperandEntry(E, 0)) ||
12991 MinBWs.contains(getOperandEntry(E, 1))) &&
12992 "Expected item in MinBWs.");
12993 Type *CastTy = VecTy;
12994 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
12995 if (cast<VectorType>(LHS->getType())
12996 ->getElementType()
12997 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
12998 ->getElementType()
12999 ->getIntegerBitWidth())
13000 CastTy = RHS->getType();
13001 else
13002 CastTy = LHS->getType();
13003 }
13004 if (LHS->getType() != CastTy)
13005 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
13006 if (RHS->getType() != CastTy)
13007 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
13008 }
13009
13010 Value *V0, *V1;
13011 if (Instruction::isBinaryOp(E->getOpcode())) {
13012 V0 = Builder.CreateBinOp(
13013 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13014 V1 = Builder.CreateBinOp(
13015 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13016 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13017 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
13018 auto *AltCI = cast<CmpInst>(E->getAltOp());
13019 CmpInst::Predicate AltPred = AltCI->getPredicate();
13020 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
13021 } else {
13022 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13023 unsigned SrcBWSz = DL->getTypeSizeInBits(
13024 cast<VectorType>(LHS->getType())->getElementType());
13025 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13026 if (BWSz <= SrcBWSz) {
13027 if (BWSz < SrcBWSz)
13028 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
13029 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13030 if (auto *I = dyn_cast<Instruction>(LHS))
13031 LHS = propagateMetadata(I, E->Scalars);
13032 E->VectorizedValue = LHS;
13033 ++NumVectorInstructions;
13034 return LHS;
13035 }
13036 }
13037 V0 = Builder.CreateCast(
13038 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
13039 V1 = Builder.CreateCast(
13040 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
13041 }
13042 // Add V0 and V1 to later analysis to try to find and remove matching
13043 // instructions, if any.
13044 for (Value *V : {V0, V1}) {
13045 if (auto *I = dyn_cast<Instruction>(V)) {
13046 GatherShuffleExtractSeq.insert(I);
13047 CSEBlocks.insert(I->getParent());
13048 }
13049 }
13050
13051 // Create shuffle to take alternate operations from the vector.
13052 // Also, gather up main and alt scalar ops to propagate IR flags to
13053 // each vector operation.
13054 ValueList OpScalars, AltScalars;
13055 SmallVector<int> Mask;
13056 E->buildAltOpShuffleMask(
13057 [E, this](Instruction *I) {
13058 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13059 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
13060 *TLI);
13061 },
13062 Mask, &OpScalars, &AltScalars);
13063
13064 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
13065 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
13066 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13067 // Drop nuw flags for abs(sub(commutative), true).
13068 if (auto *I = dyn_cast<Instruction>(Vec);
13069 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
13070 any_of(E->Scalars, [](Value *V) {
13071 auto *IV = cast<Instruction>(V);
13072 return IV->getOpcode() == Instruction::Sub &&
13073 isCommutative(cast<Instruction>(IV));
13074 }))
13075 I->setHasNoUnsignedWrap(/*b=*/false);
13076 };
13077 DropNuwFlag(V0, E->getOpcode());
13078 DropNuwFlag(V1, E->getAltOpcode());
13079
13080 Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
13081 if (auto *I = dyn_cast<Instruction>(V)) {
13082 V = propagateMetadata(I, E->Scalars);
13083 GatherShuffleExtractSeq.insert(I);
13084 CSEBlocks.insert(I->getParent());
13085 }
13086
13087 E->VectorizedValue = V;
13088 ++NumVectorInstructions;
13089
13090 return V;
13091 }
13092 default:
13093 llvm_unreachable("unknown inst");
13094 }
13095 return nullptr;
13096}
13097
13098 Value *BoUpSLP::vectorizeTree() {
13099 ExtraValueToDebugLocsMap ExternallyUsedValues;
13100 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13101 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13102}
13103
13104namespace {
13105/// Data type for handling buildvector sequences with the reused scalars from
13106/// other tree entries.
13107struct ShuffledInsertData {
13108 /// List of insertelements to be replaced by shuffles.
13109 SmallVector<InsertElementInst *> InsertElements;
13110 /// The parent vectors and shuffle mask for the given list of inserts.
13111 MapVector<Value *, SmallVector<int>> ValueMasks;
13112};
13113} // namespace
13114
13115 Value *BoUpSLP::vectorizeTree(
13116 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13117 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13118 Instruction *ReductionRoot) {
13119 // All blocks must be scheduled before any instructions are inserted.
13120 for (auto &BSIter : BlocksSchedules) {
13121 scheduleBlock(BSIter.second.get());
13122 }
13123 // Clean the Entry-to-LastInstruction table; it can be affected after scheduling,
13124 // so we need to rebuild it.
13125 EntryToLastInstruction.clear();
13126
13127 if (ReductionRoot)
13128 Builder.SetInsertPoint(ReductionRoot->getParent(),
13129 ReductionRoot->getIterator());
13130 else
13131 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13132
13133 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
13134 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13135 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13136 if (TE->State == TreeEntry::Vectorize &&
13137 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13138 TE->VectorizedValue)
13139 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
13140 // Run through the list of postponed gathers and emit them, replacing the temp
13141 // emitted allocas with actual vector instructions.
13142 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13143 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13144 for (const TreeEntry *E : PostponedNodes) {
13145 auto *TE = const_cast<TreeEntry *>(E);
13146 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
13147 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13148 TE->UserTreeIndices.front().EdgeIdx)))
13149 // Found a gather node which is exactly the same as one of the
13150 // vectorized nodes. It may happen after reordering.
13151 continue;
13152 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13153 TE->VectorizedValue = nullptr;
13154 auto *UserI =
13155 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13156 // If the user is a PHI node, its vector code has to be inserted right before
13157 // the block terminator. Since the node was delayed, there were some unresolved
13158 // dependencies at the moment when the stub instruction was emitted. If any of
13159 // these dependencies turns out to be an operand of another PHI coming from
13160 // this same block, the position of the stub instruction becomes invalid.
13161 // This is because the source vector that is supposed to feed this gather node
13162 // was inserted at the end of the block [after the stub instruction]. So we
13163 // need to adjust the insertion point again to the end of the block.
13164 if (isa<PHINode>(UserI)) {
13165 // Insert before all users.
13166 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13167 for (User *U : PrevVec->users()) {
13168 if (U == UserI)
13169 continue;
13170 auto *UI = dyn_cast<Instruction>(U);
13171 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
13172 continue;
13173 if (UI->comesBefore(InsertPt))
13174 InsertPt = UI;
13175 }
13176 Builder.SetInsertPoint(InsertPt);
13177 } else {
13178 Builder.SetInsertPoint(PrevVec);
13179 }
13180 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13181 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
13182 if (Vec->getType() != PrevVec->getType()) {
13183 assert(Vec->getType()->isIntOrIntVectorTy() &&
13184 PrevVec->getType()->isIntOrIntVectorTy() &&
13185 "Expected integer vector types only.");
13186 std::optional<bool> IsSigned;
13187 for (Value *V : TE->Scalars) {
13188 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13189 auto It = MinBWs.find(BaseTE);
13190 if (It != MinBWs.end()) {
13191 IsSigned = IsSigned.value_or(false) || It->second.second;
13192 if (*IsSigned)
13193 break;
13194 }
13195 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
13196 auto It = MinBWs.find(MNTE);
13197 if (It != MinBWs.end()) {
13198 IsSigned = IsSigned.value_or(false) || It->second.second;
13199 if (*IsSigned)
13200 break;
13201 }
13202 }
13203 if (IsSigned.value_or(false))
13204 break;
13205 // Scan through gather nodes.
13206 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13207 auto It = MinBWs.find(BVE);
13208 if (It != MinBWs.end()) {
13209 IsSigned = IsSigned.value_or(false) || It->second.second;
13210 if (*IsSigned)
13211 break;
13212 }
13213 }
13214 if (IsSigned.value_or(false))
13215 break;
13216 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
13217 IsSigned =
13218 IsSigned.value_or(false) ||
13219 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
13220 continue;
13221 }
13222 if (IsSigned.value_or(false))
13223 break;
13224 }
13225 }
13226 if (IsSigned.value_or(false)) {
13227 // Final attempt - check user node.
13228 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
13229 if (It != MinBWs.end())
13230 IsSigned = It->second.second;
13231 }
13232 assert(IsSigned &&
13233 "Expected user node or perfect diamond match in MinBWs.");
13234 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
13235 }
13236 PrevVec->replaceAllUsesWith(Vec);
13237 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
13238 // Replace the stub vector node, if it was used before for one of the
13239 // buildvector nodes already.
13240 auto It = PostponedValues.find(PrevVec);
13241 if (It != PostponedValues.end()) {
13242 for (TreeEntry *VTE : It->getSecond())
13243 VTE->VectorizedValue = Vec;
13244 }
13245 eraseInstruction(PrevVec);
13246 }
13247
13248 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13249 << " values .\n");
13250
13251 SmallVector<ShuffledInsertData> ShuffledInserts;
13252 // Maps vector instruction to original insertelement instruction
13253 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13254 // Maps extract Scalar to the corresponding extractelement instruction in the
13255 // basic block. Only one extractelement per block should be emitted.
13256 DenseMap<Value *,
13257 SmallDenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13258 ScalarToEEs;
13259 SmallDenseSet<Value *, 4> UsedInserts;
13260 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13261 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13262 // Extract all of the elements with the external uses.
13263 for (const auto &ExternalUse : ExternalUses) {
13264 Value *Scalar = ExternalUse.Scalar;
13265 llvm::User *User = ExternalUse.User;
13266
13267 // Skip users that we have already RAUWed. This happens when one instruction
13268 // has multiple uses of the same value.
13269 if (User && !is_contained(Scalar->users(), User))
13270 continue;
13271 TreeEntry *E = getTreeEntry(Scalar);
13272 assert(E && "Invalid scalar");
13273 assert(E->State != TreeEntry::NeedToGather &&
13274 "Extracting from a gather list");
13275 // Non-instruction pointers are not deleted, just skip them.
13276 if (E->getOpcode() == Instruction::GetElementPtr &&
13277 !isa<GetElementPtrInst>(Scalar))
13278 continue;
13279
13280 Value *Vec = E->VectorizedValue;
13281 assert(Vec && "Can't find vectorizable value");
13282
13283 Value *Lane = Builder.getInt32(ExternalUse.Lane);
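// Creates (or reuses, one per basic block) the extractelement for an
// externally used scalar and, if the tree was bitwidth-minimized, casts it
// back to the original scalar type.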
13284 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13285 if (Scalar->getType() != Vec->getType()) {
13286 Value *Ex = nullptr;
13287 Value *ExV = nullptr;
13288 auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
13289 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
13290 auto It = ScalarToEEs.find(Scalar);
13291 if (It != ScalarToEEs.end()) {
13292 // No need to emit many extracts, just move the only one in the
13293 // current block.
13294 auto EEIt = It->second.find(Builder.GetInsertBlock());
13295 if (EEIt != It->second.end()) {
13296 Instruction *I = EEIt->second.first;
13297 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13298 Builder.GetInsertPoint()->comesBefore(I)) {
13299 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
13300 Builder.GetInsertPoint());
13301 if (auto *CI = EEIt->second.second)
13302 CI->moveAfter(I);
13303 }
13304 Ex = I;
13305 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13306 }
13307 }
13308 if (!Ex) {
13309 // "Reuse" the existing extract to improve final codegen.
13310 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13311 Value *V = ES->getVectorOperand();
13312 if (const TreeEntry *ETE = getTreeEntry(V))
13313 V = ETE->VectorizedValue;
13314 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
13315 } else if (ReplaceGEP) {
13316 // Leave the GEPs as is, they are free in most cases and better to
13317 // keep them as GEPs.
13318 auto *CloneGEP = GEP->clone();
13319 CloneGEP->insertBefore(*Builder.GetInsertBlock(),
13320 Builder.GetInsertPoint());
13321 if (GEP->hasName())
13322 CloneGEP->takeName(GEP);
13323 Ex = CloneGEP;
13324 } else {
13325 Ex = Builder.CreateExtractElement(Vec, Lane);
13326 }
13327 // If necessary, sign-extend or zero-extend ScalarRoot
13328 // to the larger type.
13329 ExV = Ex;
13330 if (Scalar->getType() != Ex->getType())
13331 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
13332 MinBWs.find(E)->second.second);
13333 if (auto *I = dyn_cast<Instruction>(Ex))
13334 ScalarToEEs[Scalar].try_emplace(
13335 Builder.GetInsertBlock(),
13336 std::make_pair(I, cast<Instruction>(ExV)));
13337 }
13338 // The then-branch of the previous if may produce constants, since operand
13339 // 0 might be a constant.
13340 if (auto *ExI = dyn_cast<Instruction>(Ex)) {
13341 GatherShuffleExtractSeq.insert(ExI);
13342 CSEBlocks.insert(ExI->getParent());
13343 }
13344 return ExV;
13345 }
13346 assert(isa<FixedVectorType>(Scalar->getType()) &&
13347 isa<InsertElementInst>(Scalar) &&
13348 "In-tree scalar of vector type is not insertelement?");
13349 auto *IE = cast<InsertElementInst>(Scalar);
13350 VectorToInsertElement.try_emplace(Vec, IE);
13351 return Vec;
13352 };
13353 // If User == nullptr, the Scalar remains as scalar in vectorized
13354 // instructions or is used as extra arg. Generate ExtractElement instruction
13355 // and update the record for this scalar in ExternallyUsedValues.
13356 if (!User) {
13357 if (!ScalarsWithNullptrUser.insert(Scalar).second)
13358 continue;
13359 assert((ExternallyUsedValues.count(Scalar) ||
13360 any_of(Scalar->users(),
13361 [&](llvm::User *U) {
13362 if (ExternalUsesAsGEPs.contains(U))
13363 return true;
13364 TreeEntry *UseEntry = getTreeEntry(U);
13365 return UseEntry &&
13366 (UseEntry->State == TreeEntry::Vectorize ||
13367 UseEntry->State ==
13368 TreeEntry::StridedVectorize) &&
13369 (E->State == TreeEntry::Vectorize ||
13370 E->State == TreeEntry::StridedVectorize) &&
13371 doesInTreeUserNeedToExtract(
13372 Scalar,
13373 cast<Instruction>(UseEntry->Scalars.front()),
13374 TLI);
13375 })) &&
13376 "Scalar with nullptr User must be registered in "
13377 "ExternallyUsedValues map or remain as scalar in vectorized "
13378 "instructions");
13379 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13380 if (auto *PHI = dyn_cast<PHINode>(VecI))
13381 Builder.SetInsertPoint(PHI->getParent(),
13382 PHI->getParent()->getFirstNonPHIIt());
13383 else
13384 Builder.SetInsertPoint(VecI->getParent(),
13385 std::next(VecI->getIterator()));
13386 } else {
13387 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13388 }
13389 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13390 // Required to update internally referenced instructions.
13391 Scalar->replaceAllUsesWith(NewInst);
13392 ReplacedExternals.emplace_back(Scalar, NewInst);
13393 continue;
13394 }
13395
13396 if (auto *VU = dyn_cast<InsertElementInst>(User)) {
13397 // Skip if the scalar is another vector op or Vec is not an instruction.
13398 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13399 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
13400 if (!UsedInserts.insert(VU).second)
13401 continue;
13402 // Need to use original vector, if the root is truncated.
13403 auto BWIt = MinBWs.find(E);
13404 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13405 auto *ScalarTy = FTy->getElementType();
13406 auto Key = std::make_pair(Vec, ScalarTy);
13407 auto VecIt = VectorCasts.find(Key);
13408 if (VecIt == VectorCasts.end()) {
13409 IRBuilderBase::InsertPointGuard Guard(Builder);
13410 if (auto *IVec = dyn_cast<Instruction>(Vec))
13411 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
13412 Vec = Builder.CreateIntCast(
13413 Vec,
13414 FixedVectorType::get(
13415 ScalarTy,
13416 cast<FixedVectorType>(Vec->getType())->getNumElements()),
13417 BWIt->second.second);
13418 VectorCasts.try_emplace(Key, Vec);
13419 } else {
13420 Vec = VecIt->second;
13421 }
13422 }
13423
13424 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
13425 if (InsertIdx) {
13426 auto *It =
13427 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
13428 // Checks if 2 insertelements are from the same buildvector.
13429 InsertElementInst *VecInsert = Data.InsertElements.front();
13430 return areTwoInsertFromSameBuildVector(
13431 VU, VecInsert,
13432 [](InsertElementInst *II) { return II->getOperand(0); });
13433 });
13434 unsigned Idx = *InsertIdx;
13435 if (It == ShuffledInserts.end()) {
13436 (void)ShuffledInserts.emplace_back();
13437 It = std::next(ShuffledInserts.begin(),
13438 ShuffledInserts.size() - 1);
13439 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13440 if (Mask.empty())
13441 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13442 // Find the insertvector, vectorized in tree, if any.
13443 Value *Base = VU;
13444 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
13445 if (IEBase != User &&
13446 (!IEBase->hasOneUse() ||
13447 getInsertIndex(IEBase).value_or(Idx) == Idx))
13448 break;
13449 // Build the mask for the vectorized insertelement instructions.
13450 if (const TreeEntry *E = getTreeEntry(IEBase)) {
13451 do {
13452 IEBase = cast<InsertElementInst>(Base);
13453 int IEIdx = *getInsertIndex(IEBase);
13454 assert(Mask[IEIdx] == PoisonMaskElem &&
13455 "InsertElementInstruction used already.");
13456 Mask[IEIdx] = IEIdx;
13457 Base = IEBase->getOperand(0);
13458 } while (E == getTreeEntry(Base));
13459 break;
13460 }
13461 Base = cast<InsertElementInst>(Base)->getOperand(0);
13462                 // After vectorization the def-use chain has changed, so we
13463                 // need to look through the original insertelement
13464                 // instructions if they were replaced by vector instructions.
13465 auto It = VectorToInsertElement.find(Base);
13466 if (It != VectorToInsertElement.end())
13467 Base = It->second;
13468 }
13469 }
13470 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13471 if (Mask.empty())
13472 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13473 Mask[Idx] = ExternalUse.Lane;
13474 It->InsertElements.push_back(cast<InsertElementInst>(User));
13475 continue;
13476 }
13477 }
13478 }
13479 }
13480
13481 // Generate extracts for out-of-tree users.
13482 // Find the insertion point for the extractelement lane.
13483 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13484 if (PHINode *PH = dyn_cast<PHINode>(User)) {
13485 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13486 if (PH->getIncomingValue(I) == Scalar) {
13487 Instruction *IncomingTerminator =
13488 PH->getIncomingBlock(I)->getTerminator();
13489 if (isa<CatchSwitchInst>(IncomingTerminator)) {
13490 Builder.SetInsertPoint(VecI->getParent(),
13491 std::next(VecI->getIterator()));
13492 } else {
13493 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
13494 }
13495 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13496 PH->setOperand(I, NewInst);
13497 }
13498 }
13499 } else {
13500 Builder.SetInsertPoint(cast<Instruction>(User));
13501 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13502 User->replaceUsesOfWith(Scalar, NewInst);
13503 }
13504 } else {
13505 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13506 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13507 User->replaceUsesOfWith(Scalar, NewInst);
13508 }
13509
13510 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
13511 }
13512
13513 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
13514 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
13515 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
13516 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
13517 for (int I = 0, E = Mask.size(); I < E; ++I) {
13518 if (Mask[I] < VF)
13519 CombinedMask1[I] = Mask[I];
13520 else
13521 CombinedMask2[I] = Mask[I] - VF;
13522 }
13523 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
13524 ShuffleBuilder.add(V1, CombinedMask1);
13525 if (V2)
13526 ShuffleBuilder.add(V2, CombinedMask2);
13527 return ShuffleBuilder.finalize(std::nullopt);
13528 };
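// Illustrative note (added commentary, not part of the upstream source): for
// two 4-element inputs V1 and V2, a combined mask such as <0, 5, 2, 7> is
// split by the lambda above into CombinedMask1 = <0, poison, 2, poison>
// (indices < VF select from V1) and CombinedMask2 = <poison, 1, poison, 3>
// (indices >= VF are rebased onto V2) before both halves are handed to the
// ShuffleInstructionBuilder.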
13529
13530 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
13531 bool ForSingleMask) {
13532 unsigned VF = Mask.size();
13533 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
13534 if (VF != VecVF) {
13535 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
13536 Vec = CreateShuffle(Vec, nullptr, Mask);
13537 return std::make_pair(Vec, true);
13538 }
13539 if (!ForSingleMask) {
13540 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
13541 for (unsigned I = 0; I < VF; ++I) {
13542 if (Mask[I] != PoisonMaskElem)
13543 ResizeMask[Mask[I]] = Mask[I];
13544 }
13545 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
13546 }
13547 }
13548
13549 return std::make_pair(Vec, false);
13550 };
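// Illustrative note (added commentary, not part of the upstream source):
// ResizeToVF reconciles a vector whose element count differs from the mask
// width. If the mask references lanes at or beyond its own size, it falls back
// to CreateShuffle with the original mask and reports that via the bool;
// otherwise (unless a single common mask is requested) it emits an
// identity-style resize shuffle that keeps each referenced element in its
// original lane.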
13551   // Perform shuffling of the vectorized tree entries for better handling of
13552 // external extracts.
13553 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
13554 // Find the first and the last instruction in the list of insertelements.
13555 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
13556 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
13557 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
13558 Builder.SetInsertPoint(LastInsert);
13559 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
13560 Value *NewInst = performExtractsShuffleAction<Value>(
13561 MutableArrayRef(Vector.data(), Vector.size()),
13562 FirstInsert->getOperand(0),
13563 [](Value *Vec) {
13564 return cast<VectorType>(Vec->getType())
13565 ->getElementCount()
13566 .getKnownMinValue();
13567 },
13568 ResizeToVF,
13569 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
13570 ArrayRef<Value *> Vals) {
13571 assert((Vals.size() == 1 || Vals.size() == 2) &&
13572 "Expected exactly 1 or 2 input values.");
13573 if (Vals.size() == 1) {
13574 // Do not create shuffle if the mask is a simple identity
13575 // non-resizing mask.
13576 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
13577 ->getNumElements() ||
13578 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
13579 return CreateShuffle(Vals.front(), nullptr, Mask);
13580 return Vals.front();
13581 }
13582 return CreateShuffle(Vals.front() ? Vals.front()
13583 : FirstInsert->getOperand(0),
13584 Vals.back(), Mask);
13585 });
13586 auto It = ShuffledInserts[I].InsertElements.rbegin();
13587 // Rebuild buildvector chain.
13588 InsertElementInst *II = nullptr;
13589 if (It != ShuffledInserts[I].InsertElements.rend())
13590 II = *It;
13591     SmallVector<Instruction *> Inserts;
13592     while (It != ShuffledInserts[I].InsertElements.rend()) {
13593 assert(II && "Must be an insertelement instruction.");
13594 if (*It == II)
13595 ++It;
13596 else
13597 Inserts.push_back(cast<Instruction>(II));
13598 II = dyn_cast<InsertElementInst>(II->getOperand(0));
13599 }
13600 for (Instruction *II : reverse(Inserts)) {
13601 II->replaceUsesOfWith(II->getOperand(0), NewInst);
13602 if (auto *NewI = dyn_cast<Instruction>(NewInst))
13603 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
13604 II->moveAfter(NewI);
13605 NewInst = II;
13606 }
13607 LastInsert->replaceAllUsesWith(NewInst);
13608 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
13609 IE->replaceUsesOfWith(IE->getOperand(0),
13610 PoisonValue::get(IE->getOperand(0)->getType()));
13611 IE->replaceUsesOfWith(IE->getOperand(1),
13612 PoisonValue::get(IE->getOperand(1)->getType()));
13613 eraseInstruction(IE);
13614 }
13615 CSEBlocks.insert(LastInsert->getParent());
13616 }
13617
13618 SmallVector<Instruction *> RemovedInsts;
13619 // For each vectorized value:
13620 for (auto &TEPtr : VectorizableTree) {
13621 TreeEntry *Entry = TEPtr.get();
13622
13623 // No need to handle users of gathered values.
13624 if (Entry->State == TreeEntry::NeedToGather)
13625 continue;
13626
13627 assert(Entry->VectorizedValue && "Can't find vectorizable value");
13628
13629 // For each lane:
13630 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
13631 Value *Scalar = Entry->Scalars[Lane];
13632
13633 if (Entry->getOpcode() == Instruction::GetElementPtr &&
13634 !isa<GetElementPtrInst>(Scalar))
13635 continue;
13636#ifndef NDEBUG
13637 Type *Ty = Scalar->getType();
13638 if (!Ty->isVoidTy()) {
13639 for (User *U : Scalar->users()) {
13640 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
13641
13642 // It is legal to delete users in the ignorelist.
13643 assert((getTreeEntry(U) ||
13644 (UserIgnoreList && UserIgnoreList->contains(U)) ||
13645 (isa_and_nonnull<Instruction>(U) &&
13646 isDeleted(cast<Instruction>(U)))) &&
13647 "Deleting out-of-tree value");
13648 }
13649 }
13650#endif
13651 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
13652 eraseInstruction(cast<Instruction>(Scalar));
13653 // Retain to-be-deleted instructions for some debug-info
13654 // bookkeeping. NOTE: eraseInstruction only marks the instruction for
13655 // deletion - instructions are not deleted until later.
13656 RemovedInsts.push_back(cast<Instruction>(Scalar));
13657 }
13658 }
13659
13660 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
13661 // new vector instruction.
13662 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
13663 V->mergeDIAssignID(RemovedInsts);
13664
13665 Builder.ClearInsertionPoint();
13666 InstrElementSize.clear();
13667
13668 const TreeEntry &RootTE = *VectorizableTree.front().get();
13669 Value *Vec = RootTE.VectorizedValue;
13670 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
13671 It != MinBWs.end() &&
13672 ReductionBitWidth != It->second.first) {
13673 IRBuilder<>::InsertPointGuard Guard(Builder);
13674 Builder.SetInsertPoint(ReductionRoot->getParent(),
13675 ReductionRoot->getIterator());
13676 Vec = Builder.CreateIntCast(
13677 Vec,
13678 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
13679 cast<VectorType>(Vec->getType())->getElementCount()),
13680 It->second.second);
13681 }
13682 return Vec;
13683}
13684
13685 void BoUpSLP::optimizeGatherSequence() {
13686   LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
13687 << " gather sequences instructions.\n");
13688 // LICM InsertElementInst sequences.
13689 for (Instruction *I : GatherShuffleExtractSeq) {
13690 if (isDeleted(I))
13691 continue;
13692
13693 // Check if this block is inside a loop.
13694 Loop *L = LI->getLoopFor(I->getParent());
13695 if (!L)
13696 continue;
13697
13698 // Check if it has a preheader.
13699 BasicBlock *PreHeader = L->getLoopPreheader();
13700 if (!PreHeader)
13701 continue;
13702
13703     // If the vector or the element that we insert into it is an instruction
13704     // that is defined in this basic block, then we can't hoist this
13705     // instruction.
13706 if (any_of(I->operands(), [L](Value *V) {
13707 auto *OpI = dyn_cast<Instruction>(V);
13708 return OpI && L->contains(OpI);
13709 }))
13710 continue;
13711
13712 // We can hoist this instruction. Move it to the pre-header.
13713 I->moveBefore(PreHeader->getTerminator());
13714 CSEBlocks.insert(PreHeader);
13715 }
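// Illustrative sketch (assumed example, not from the upstream source): a
// gather sequence built only from loop-invariant values, e.g.
//   loop:
//     %v0 = insertelement <2 x i32> poison, i32 %a, i32 0
//     %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
// can be moved just before the preheader terminator when %a and %b are defined
// outside the loop (%v0 first, after which %v1's operands are loop-invariant
// too), so the vector is materialized once instead of on every iteration.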
13716
13717 // Make a list of all reachable blocks in our CSE queue.
13718   SmallVector<const DomTreeNode *, 8> CSEWorkList;
13719   CSEWorkList.reserve(CSEBlocks.size());
13720 for (BasicBlock *BB : CSEBlocks)
13721 if (DomTreeNode *N = DT->getNode(BB)) {
13722       assert(DT->isReachableFromEntry(N));
13723       CSEWorkList.push_back(N);
13724 }
13725
13726 // Sort blocks by domination. This ensures we visit a block after all blocks
13727 // dominating it are visited.
13728 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
13729 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
13730 "Different nodes should have different DFS numbers");
13731 return A->getDFSNumIn() < B->getDFSNumIn();
13732 });
13733
13734   // Less defined shuffles can be replaced by the more defined copies.
13735   // Between two shuffles, one is less defined if it has the same vector
13736   // operands and each of its mask indices is either the same as in the other
13737   // one or undef. E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined
13738   // than shuffle %0, poison, <0, 0, 0, 0>.
13739 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
13740 SmallVectorImpl<int> &NewMask) {
13741 if (I1->getType() != I2->getType())
13742 return false;
13743 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
13744 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
13745 if (!SI1 || !SI2)
13746 return I1->isIdenticalTo(I2);
13747 if (SI1->isIdenticalTo(SI2))
13748 return true;
13749 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
13750 if (SI1->getOperand(I) != SI2->getOperand(I))
13751 return false;
13752 // Check if the second instruction is more defined than the first one.
13753 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
13754 ArrayRef<int> SM1 = SI1->getShuffleMask();
13755 // Count trailing undefs in the mask to check the final number of used
13756 // registers.
13757 unsigned LastUndefsCnt = 0;
13758 for (int I = 0, E = NewMask.size(); I < E; ++I) {
13759 if (SM1[I] == PoisonMaskElem)
13760 ++LastUndefsCnt;
13761 else
13762 LastUndefsCnt = 0;
13763 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
13764 NewMask[I] != SM1[I])
13765 return false;
13766 if (NewMask[I] == PoisonMaskElem)
13767 NewMask[I] = SM1[I];
13768 }
13769 // Check if the last undefs actually change the final number of used vector
13770 // registers.
13771 return SM1.size() - LastUndefsCnt > 1 &&
13772 TTI->getNumberOfParts(SI1->getType()) ==
13773            TTI->getNumberOfParts(
13774                FixedVectorType::get(SI1->getType()->getElementType(),
13775 SM1.size() - LastUndefsCnt));
13776 };
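// Illustrative note (assumed example, not from the upstream source): given
//   %s1 = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 3>
//   %s2 = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// IsIdenticalOrLessDefined(%s1, %s2, NewMask) succeeds: the operands match and
// the masks agree on every lane where both are defined, so %s1 can be replaced
// by the more defined %s2. The trailing check only rejects replacements where
// dropping trailing undef lanes would let the shuffle fit in fewer vector
// registers (or would leave at most one defined lane).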
13777 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
13778 // instructions. TODO: We can further optimize this scan if we split the
13779 // instructions into different buckets based on the insert lane.
13780   SmallVector<Instruction *, 16> Visited;
13781   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
13782 assert(*I &&
13783 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
13784 "Worklist not sorted properly!");
13785 BasicBlock *BB = (*I)->getBlock();
13786 // For all instructions in blocks containing gather sequences:
13787 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
13788 if (isDeleted(&In))
13789 continue;
13790 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
13791 !GatherShuffleExtractSeq.contains(&In))
13792 continue;
13793
13794 // Check if we can replace this instruction with any of the
13795 // visited instructions.
13796 bool Replaced = false;
13797 for (Instruction *&V : Visited) {
13798 SmallVector<int> NewMask;
13799 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
13800 DT->dominates(V->getParent(), In.getParent())) {
13801 In.replaceAllUsesWith(V);
13802 eraseInstruction(&In);
13803 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
13804 if (!NewMask.empty())
13805 SI->setShuffleMask(NewMask);
13806 Replaced = true;
13807 break;
13808 }
13809 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
13810 GatherShuffleExtractSeq.contains(V) &&
13811 IsIdenticalOrLessDefined(V, &In, NewMask) &&
13812 DT->dominates(In.getParent(), V->getParent())) {
13813 In.moveAfter(V);
13814 V->replaceAllUsesWith(&In);
13815           eraseInstruction(V);
13816           if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
13817 if (!NewMask.empty())
13818 SI->setShuffleMask(NewMask);
13819 V = &In;
13820 Replaced = true;
13821 break;
13822 }
13823 }
13824 if (!Replaced) {
13825 assert(!is_contained(Visited, &In));
13826 Visited.push_back(&In);
13827 }
13828 }
13829 }
13830 CSEBlocks.clear();
13831 GatherShuffleExtractSeq.clear();
13832}
13833
13834BoUpSLP::ScheduleData *
13835BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
13836 ScheduleData *Bundle = nullptr;
13837 ScheduleData *PrevInBundle = nullptr;
13838 for (Value *V : VL) {
13839     if (doesNotNeedToBeScheduled(V))
13840       continue;
13841 ScheduleData *BundleMember = getScheduleData(V);
13842 assert(BundleMember &&
13843 "no ScheduleData for bundle member "
13844 "(maybe not in same basic block)");
13845 assert(BundleMember->isSchedulingEntity() &&
13846 "bundle member already part of other bundle");
13847 if (PrevInBundle) {
13848 PrevInBundle->NextInBundle = BundleMember;
13849 } else {
13850 Bundle = BundleMember;
13851 }
13852
13853 // Group the instructions to a bundle.
13854 BundleMember->FirstInBundle = Bundle;
13855 PrevInBundle = BundleMember;
13856 }
13857 assert(Bundle && "Failed to find schedule bundle");
13858 return Bundle;
13859}
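// Illustrative note (added commentary, not part of the upstream source): for
// VL = {A, B, C} (none of which is skipped as not needing scheduling) the
// resulting bundle is a singly linked list where
//   A->FirstInBundle = B->FirstInBundle = C->FirstInBundle = A
//   A->NextInBundle = B, B->NextInBundle = C
// so A serves as the single scheduling entity representing the whole bundle.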
13860
13861// Groups the instructions to a bundle (which is then a single scheduling entity)
13862// and schedules instructions until the bundle gets ready.
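// Returns std::nullopt if a valid bundle could not be formed (the caller is
// then expected to gather the values instead), and a null ScheduleData pointer
// if the values do not need to be scheduled at all (e.g. PHIs and vector-like
// instructions with constant operands).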
13863std::optional<BoUpSLP::ScheduleData *>
13864BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
13865 const InstructionsState &S) {
13866 // No need to schedule PHIs, insertelement, extractelement and extractvalue
13867 // instructions.
13868 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
13869       doesNotNeedToBeScheduled(S.OpValue))
13870     return nullptr;
13871
13872 // Initialize the instruction bundle.
13873 Instruction *OldScheduleEnd = ScheduleEnd;
13874 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
13875
13876 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
13877 ScheduleData *Bundle) {
13878 // The scheduling region got new instructions at the lower end (or it is a
13879 // new region for the first bundle). This makes it necessary to
13880 // recalculate all dependencies.
13881 // It is seldom that this needs to be done a second time after adding the
13882 // initial bundle to the region.
13883 if (ScheduleEnd != OldScheduleEnd) {
13884 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
13885 doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
13886 ReSchedule = true;
13887 }
13888 if (Bundle) {
13889 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
13890 << " in block " << BB->getName() << "\n");
13891 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
13892 }
13893
13894 if (ReSchedule) {
13895 resetSchedule();
13896 initialFillReadyList(ReadyInsts);
13897 }
13898
13899 // Now try to schedule the new bundle or (if no bundle) just calculate
13900 // dependencies. As soon as the bundle is "ready" it means that there are no
13901   // cyclic dependencies and we can schedule it. Note that it's important that we
13902 // don't "schedule" the bundle yet (see cancelScheduling).
13903 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
13904 !ReadyInsts.empty()) {
13905 ScheduleData *Picked = ReadyInsts.pop_back_val();
13906 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
13907 "must be ready to schedule");
13908 schedule(Picked, ReadyInsts);
13909 }
13910 };
13911
13912 // Make sure that the scheduling region contains all
13913 // instructions of the bundle.
13914 for (Value *V : VL) {
13915     if (doesNotNeedToBeScheduled(V))
13916       continue;
13917 if (!extendSchedulingRegion(V, S)) {
13918       // If the scheduling region got new instructions at the lower end (or it
13919       // is a new region for the first bundle), it is necessary to recalculate
13920       // all dependencies.
13921       // Otherwise the compiler may crash trying to incorrectly calculate
13922       // dependencies and emit instructions in the wrong order at the actual
13923       // scheduling.
13924 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
13925 return std::nullopt;
13926 }
13927 }
13928
13929 bool ReSchedule = false;
13930 for (Value *V : VL) {
13931     if (doesNotNeedToBeScheduled(V))
13932       continue;
13933 ScheduleData *BundleMember = getScheduleData(V);
13934 assert(BundleMember &&
13935 "no ScheduleData for bundle member (maybe not in same basic block)");
13936
13937 // Make sure we don't leave the pieces of the bundle in the ready list when
13938 // whole bundle might not be ready.
13939 ReadyInsts.remove(BundleMember);
13940
13941 if (!BundleMember->IsScheduled)
13942 continue;
13943 // A bundle member was scheduled as single instruction before and now
13944 // needs to be scheduled as part of the bundle. We just get rid of the
13945 // existing schedule.
13946 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
13947 << " was already scheduled\n");
13948 ReSchedule = true;
13949 }
13950
13951 auto *Bundle = buildBundle(VL);
13952 TryScheduleBundleImpl(ReSchedule, Bundle);
13953 if (!Bundle->isReady()) {
13954 cancelScheduling(VL, S.OpValue);
13955 return std::nullopt;
13956 }
13957 return Bundle;
13958}
13959
13960void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
13961 Value *OpValue) {
13962 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
13963       doesNotNeedToBeScheduled(OpValue))
13964     return;
13965
13966 if (doesNotNeedToBeScheduled(OpValue))
13967 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
13968 ScheduleData *Bundle = getScheduleData(OpValue);
13969 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
13970 assert(!Bundle->IsScheduled &&
13971 "Can't cancel bundle which is already scheduled");
13972 assert(Bundle->isSchedulingEntity() &&
13973 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
13974 "tried to unbundle something which is not a bundle");
13975
13976 // Remove the bundle from the ready list.
13977 if (Bundle->isReady())
13978 ReadyInsts.remove(Bundle);
13979
13980 // Un-bundle: make single instructions out of the bundle.
13981 ScheduleData *BundleMember = Bundle;
13982 while (BundleMember) {
13983 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
13984 BundleMember->FirstInBundle = BundleMember;
13985 ScheduleData *Next = BundleMember->NextInBundle;
13986 BundleMember->NextInBundle = nullptr;
13987 BundleMember->TE = nullptr;
13988 if (BundleMember->unscheduledDepsInBundle() == 0) {
13989 ReadyInsts.insert(BundleMember);
13990 }
13991 BundleMember = Next;
13992 }
13993}
13994
13995BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
13996 // Allocate a new ScheduleData for the instruction.
13997 if (ChunkPos >= ChunkSize) {
13998 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
13999 ChunkPos = 0;
14000 }
14001 return &(ScheduleDataChunks.back()[ChunkPos++]);
14002}
14003
14004bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14005 const InstructionsState &S) {
14006 if (getScheduleData(V, isOneOf(S, V)))
14007 return true;
14008 Instruction *I = dyn_cast<Instruction>(V);
14009 assert(I && "bundle member must be an instruction");
14010 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14011          !doesNotNeedToBeScheduled(I) &&
14012          "phi nodes/insertelements/extractelements/extractvalues don't need to "
14013 "be scheduled");
14014 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14015 ScheduleData *ISD = getScheduleData(I);
14016 if (!ISD)
14017 return false;
14018 assert(isInSchedulingRegion(ISD) &&
14019 "ScheduleData not in scheduling region");
14020 ScheduleData *SD = allocateScheduleDataChunks();
14021 SD->Inst = I;
14022 SD->init(SchedulingRegionID, S.OpValue);
14023 ExtraScheduleDataMap[I][S.OpValue] = SD;
14024 return true;
14025 };
14026 if (CheckScheduleForI(I))
14027 return true;
14028 if (!ScheduleStart) {
14029 // It's the first instruction in the new region.
14030 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
14031 ScheduleStart = I;
14032 ScheduleEnd = I->getNextNode();
14033 if (isOneOf(S, I) != I)
14034 CheckScheduleForI(I);
14035 assert(ScheduleEnd && "tried to vectorize a terminator?");
14036 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
14037 return true;
14038 }
14039 // Search up and down at the same time, because we don't know if the new
14040 // instruction is above or below the existing scheduling region.
14041   // Ignore debug info (and other "AssumeLike" intrinsics) so that it is not
14042   // counted against the budget. Otherwise debug info could affect codegen.
14043   BasicBlock::reverse_iterator UpIter =
14044       ++ScheduleStart->getIterator().getReverse();
14045 BasicBlock::reverse_iterator UpperEnd = BB->rend();
14046 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14047 BasicBlock::iterator LowerEnd = BB->end();
14048 auto IsAssumeLikeIntr = [](const Instruction &I) {
14049 if (auto *II = dyn_cast<IntrinsicInst>(&I))
14050 return II->isAssumeLikeIntrinsic();
14051 return false;
14052 };
14053 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14054 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14055 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14056 &*DownIter != I) {
14057 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14058 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
14059 return false;
14060 }
14061
14062 ++UpIter;
14063 ++DownIter;
14064
14065 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14066 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14067 }
14068 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14069 assert(I->getParent() == ScheduleStart->getParent() &&
14070 "Instruction is in wrong basic block.");
14071 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
14072 ScheduleStart = I;
14073 if (isOneOf(S, I) != I)
14074 CheckScheduleForI(I);
14075 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
14076 << "\n");
14077 return true;
14078 }
14079 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14080 "Expected to reach top of the basic block or instruction down the "
14081 "lower end.");
14082 assert(I->getParent() == ScheduleEnd->getParent() &&
14083 "Instruction is in wrong basic block.");
14084 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
14085 nullptr);
14086 ScheduleEnd = I->getNextNode();
14087 if (isOneOf(S, I) != I)
14088 CheckScheduleForI(I);
14089 assert(ScheduleEnd && "tried to vectorize a terminator?");
14090 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14091 return true;
14092}
14093
14094void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14095 Instruction *ToI,
14096 ScheduleData *PrevLoadStore,
14097 ScheduleData *NextLoadStore) {
14098 ScheduleData *CurrentLoadStore = PrevLoadStore;
14099 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14100 // No need to allocate data for non-schedulable instructions.
14101     if (doesNotNeedToBeScheduled(I))
14102       continue;
14103 ScheduleData *SD = ScheduleDataMap.lookup(I);
14104 if (!SD) {
14105 SD = allocateScheduleDataChunks();
14106 ScheduleDataMap[I] = SD;
14107 SD->Inst = I;
14108 }
14109 assert(!isInSchedulingRegion(SD) &&
14110 "new ScheduleData already in scheduling region");
14111 SD->init(SchedulingRegionID, I);
14112
14113 if (I->mayReadOrWriteMemory() &&
14114 (!isa<IntrinsicInst>(I) ||
14115 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14116 cast<IntrinsicInst>(I)->getIntrinsicID() !=
14117 Intrinsic::pseudoprobe))) {
14118 // Update the linked list of memory accessing instructions.
14119 if (CurrentLoadStore) {
14120 CurrentLoadStore->NextLoadStore = SD;
14121 } else {
14122 FirstLoadStoreInRegion = SD;
14123 }
14124 CurrentLoadStore = SD;
14125 }
14126
14127 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14128 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14129 RegionHasStackSave = true;
14130 }
14131 if (NextLoadStore) {
14132 if (CurrentLoadStore)
14133 CurrentLoadStore->NextLoadStore = NextLoadStore;
14134 } else {
14135 LastLoadStoreInRegion = CurrentLoadStore;
14136 }
14137}
14138
14139void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14140 bool InsertInReadyList,
14141 BoUpSLP *SLP) {
14142 assert(SD->isSchedulingEntity());
14143
14144   SmallVector<ScheduleData *, 10> WorkList;
14145   WorkList.push_back(SD);
14146
14147 while (!WorkList.empty()) {
14148 ScheduleData *SD = WorkList.pop_back_val();
14149 for (ScheduleData *BundleMember = SD; BundleMember;
14150 BundleMember = BundleMember->NextInBundle) {
14151 assert(isInSchedulingRegion(BundleMember));
14152 if (BundleMember->hasValidDependencies())
14153 continue;
14154
14155 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14156 << "\n");
14157 BundleMember->Dependencies = 0;
14158 BundleMember->resetUnscheduledDeps();
14159
14160 // Handle def-use chain dependencies.
14161 if (BundleMember->OpValue != BundleMember->Inst) {
14162 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14163 BundleMember->Dependencies++;
14164 ScheduleData *DestBundle = UseSD->FirstInBundle;
14165 if (!DestBundle->IsScheduled)
14166 BundleMember->incrementUnscheduledDeps(1);
14167 if (!DestBundle->hasValidDependencies())
14168 WorkList.push_back(DestBundle);
14169 }
14170 } else {
14171 for (User *U : BundleMember->Inst->users()) {
14172 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14173 BundleMember->Dependencies++;
14174 ScheduleData *DestBundle = UseSD->FirstInBundle;
14175 if (!DestBundle->IsScheduled)
14176 BundleMember->incrementUnscheduledDeps(1);
14177 if (!DestBundle->hasValidDependencies())
14178 WorkList.push_back(DestBundle);
14179 }
14180 }
14181 }
14182
14183 auto MakeControlDependent = [&](Instruction *I) {
14184 auto *DepDest = getScheduleData(I);
14185 assert(DepDest && "must be in schedule window");
14186 DepDest->ControlDependencies.push_back(BundleMember);
14187 BundleMember->Dependencies++;
14188 ScheduleData *DestBundle = DepDest->FirstInBundle;
14189 if (!DestBundle->IsScheduled)
14190 BundleMember->incrementUnscheduledDeps(1);
14191 if (!DestBundle->hasValidDependencies())
14192 WorkList.push_back(DestBundle);
14193 };
14194
14195 // Any instruction which isn't safe to speculate at the beginning of the
14196             // block is control dependent on any early exit or non-willreturn call
14197             // which precedes it.
14198 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
14199 for (Instruction *I = BundleMember->Inst->getNextNode();
14200 I != ScheduleEnd; I = I->getNextNode()) {
14201 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
14202 continue;
14203
14204 // Add the dependency
14205 MakeControlDependent(I);
14206
14207           if (!isGuaranteedToTransferExecutionToSuccessor(I))
14208             // Everything past here must be control dependent on I.
14209 break;
14210 }
14211 }
14212
14213 if (RegionHasStackSave) {
14214               // If we have an inalloca alloca instruction, it needs to be scheduled
14215               // after any preceding stacksave. We also need to prevent any alloca
14216               // from reordering above a preceding stackrestore.
14217 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14218 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14219 for (Instruction *I = BundleMember->Inst->getNextNode();
14220 I != ScheduleEnd; I = I->getNextNode()) {
14221 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14222 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14223 // Any allocas past here must be control dependent on I, and I
14224                   // must be memory dependent on BundleMember->Inst.
14225 break;
14226
14227 if (!isa<AllocaInst>(I))
14228 continue;
14229
14230 // Add the dependency
14231 MakeControlDependent(I);
14232 }
14233 }
14234
14235             // In addition to the cases handled just above, we need to prevent
14236             // allocas and loads/stores from moving below a stacksave or a
14237             // stackrestore. Avoiding moving allocas below a stackrestore is
14238             // currently believed to be merely conservative. Moving loads/stores
14239             // below a stackrestore can lead to incorrect code.
14240 if (isa<AllocaInst>(BundleMember->Inst) ||
14241 BundleMember->Inst->mayReadOrWriteMemory()) {
14242 for (Instruction *I = BundleMember->Inst->getNextNode();
14243 I != ScheduleEnd; I = I->getNextNode()) {
14244 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14245 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14246 continue;
14247
14248 // Add the dependency
14249 MakeControlDependent(I);
14250 break;
14251 }
14252 }
14253 }
14254
14255 // Handle the memory dependencies (if any).
14256 ScheduleData *DepDest = BundleMember->NextLoadStore;
14257 if (!DepDest)
14258 continue;
14259 Instruction *SrcInst = BundleMember->Inst;
14260 assert(SrcInst->mayReadOrWriteMemory() &&
14261 "NextLoadStore list for non memory effecting bundle?");
14262 MemoryLocation SrcLoc = getLocation(SrcInst);
14263 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14264 unsigned NumAliased = 0;
14265 unsigned DistToSrc = 1;
14266
14267 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14268 assert(isInSchedulingRegion(DepDest));
14269
14270 // We have two limits to reduce the complexity:
14271 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14272 // SLP->isAliased (which is the expensive part in this loop).
14273 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14274 // the whole loop (even if the loop is fast, it's quadratic).
14275 // It's important for the loop break condition (see below) to
14276 // check this limit even between two read-only instructions.
14277 if (DistToSrc >= MaxMemDepDistance ||
14278 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14279 (NumAliased >= AliasedCheckLimit ||
14280 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14281
14282 // We increment the counter only if the locations are aliased
14283 // (instead of counting all alias checks). This gives a better
14284 // balance between reduced runtime and accurate dependencies.
14285 NumAliased++;
14286
14287 DepDest->MemoryDependencies.push_back(BundleMember);
14288 BundleMember->Dependencies++;
14289 ScheduleData *DestBundle = DepDest->FirstInBundle;
14290 if (!DestBundle->IsScheduled) {
14291 BundleMember->incrementUnscheduledDeps(1);
14292 }
14293 if (!DestBundle->hasValidDependencies()) {
14294 WorkList.push_back(DestBundle);
14295 }
14296 }
14297
14298 // Example, explaining the loop break condition: Let's assume our
14299 // starting instruction is i0 and MaxMemDepDistance = 3.
14300 //
14301 // +--------v--v--v
14302 // i0,i1,i2,i3,i4,i5,i6,i7,i8
14303 // +--------^--^--^
14304 //
14305 // MaxMemDepDistance let us stop alias-checking at i3 and we add
14306 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14307 // Previously we already added dependencies from i3 to i6,i7,i8
14308 // (because of MaxMemDepDistance). As we added a dependency from
14309 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14310 // and we can abort this loop at i6.
14311 if (DistToSrc >= 2 * MaxMemDepDistance)
14312 break;
14313 DistToSrc++;
14314 }
14315 }
14316 if (InsertInReadyList && SD->isReady()) {
14317 ReadyInsts.insert(SD);
14318 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
14319 << "\n");
14320 }
14321 }
14322}
14323
14324void BoUpSLP::BlockScheduling::resetSchedule() {
14325 assert(ScheduleStart &&
14326 "tried to reset schedule on block which has not been scheduled");
14327 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14328 doForAllOpcodes(I, [&](ScheduleData *SD) {
14329 assert(isInSchedulingRegion(SD) &&
14330 "ScheduleData not in scheduling region");
14331 SD->IsScheduled = false;
14332 SD->resetUnscheduledDeps();
14333 });
14334 }
14335 ReadyInsts.clear();
14336}
14337
14338void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14339 if (!BS->ScheduleStart)
14340 return;
14341
14342 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14343
14344 // A key point - if we got here, pre-scheduling was able to find a valid
14345 // scheduling of the sub-graph of the scheduling window which consists
14346 // of all vector bundles and their transitive users. As such, we do not
14347 // need to reschedule anything *outside of* that subgraph.
14348
14349 BS->resetSchedule();
14350
14351 // For the real scheduling we use a more sophisticated ready-list: it is
14352 // sorted by the original instruction location. This lets the final schedule
14353 // be as close as possible to the original instruction order.
14354 // WARNING: If changing this order causes a correctness issue, that means
14355 // there is some missing dependence edge in the schedule data graph.
14356 struct ScheduleDataCompare {
14357 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14358 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14359 }
14360 };
14361 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14362
14363 // Ensure that all dependency data is updated (for nodes in the sub-graph)
14364 // and fill the ready-list with initial instructions.
14365 int Idx = 0;
14366 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14367 I = I->getNextNode()) {
14368 BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
14369 TreeEntry *SDTE = getTreeEntry(SD->Inst);
14370 (void)SDTE;
14371       assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14372               SD->isPartOfBundle() ==
14373 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14374 "scheduler and vectorizer bundle mismatch");
14375 SD->FirstInBundle->SchedulingPriority = Idx++;
14376
14377 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14378 BS->calculateDependencies(SD, false, this);
14379 });
14380 }
14381 BS->initialFillReadyList(ReadyInsts);
14382
14383 Instruction *LastScheduledInst = BS->ScheduleEnd;
14384
14385 // Do the "real" scheduling.
14386 while (!ReadyInsts.empty()) {
14387 ScheduleData *Picked = *ReadyInsts.begin();
14388 ReadyInsts.erase(ReadyInsts.begin());
14389
14390 // Move the scheduled instruction(s) to their dedicated places, if not
14391 // there yet.
14392 for (ScheduleData *BundleMember = Picked; BundleMember;
14393 BundleMember = BundleMember->NextInBundle) {
14394 Instruction *PickedInst = BundleMember->Inst;
14395 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
14396 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
14397 LastScheduledInst = PickedInst;
14398 }
14399
14400 BS->schedule(Picked, ReadyInsts);
14401 }
14402
14403 // Check that we didn't break any of our invariants.
14404#ifdef EXPENSIVE_CHECKS
14405 BS->verify();
14406#endif
14407
14408#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
14409 // Check that all schedulable entities got scheduled
14410 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
14411 BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
14412 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
14413 assert(SD->IsScheduled && "must be scheduled at this point");
14414 }
14415 });
14416 }
14417#endif
14418
14419 // Avoid duplicate scheduling of the block.
14420 BS->ScheduleStart = nullptr;
14421}
14422
14423 unsigned BoUpSLP::getVectorElementSize(Value *V) {
14424   // If V is a store, just return the width of the stored value (or value
14425 // truncated just before storing) without traversing the expression tree.
14426 // This is the common case.
14427 if (auto *Store = dyn_cast<StoreInst>(V))
14428 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
14429
14430 if (auto *IEI = dyn_cast<InsertElementInst>(V))
14431 return getVectorElementSize(IEI->getOperand(1));
14432
14433 auto E = InstrElementSize.find(V);
14434 if (E != InstrElementSize.end())
14435 return E->second;
14436
14437 // If V is not a store, we can traverse the expression tree to find loads
14438 // that feed it. The type of the loaded value may indicate a more suitable
14439 // width than V's type. We want to base the vector element size on the width
14440 // of memory operations where possible.
14441   SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
14442   SmallPtrSet<Instruction *, 16> Visited;
14443   if (auto *I = dyn_cast<Instruction>(V)) {
14444 Worklist.emplace_back(I, I->getParent(), 0);
14445 Visited.insert(I);
14446 }
14447
14448 // Traverse the expression tree in bottom-up order looking for loads. If we
14449 // encounter an instruction we don't yet handle, we give up.
14450 auto Width = 0u;
14451 Value *FirstNonBool = nullptr;
14452 while (!Worklist.empty()) {
14453 auto [I, Parent, Level] = Worklist.pop_back_val();
14454
14455 // We should only be looking at scalar instructions here. If the current
14456 // instruction has a vector type, skip.
14457 auto *Ty = I->getType();
14458 if (isa<VectorType>(Ty))
14459 continue;
14460 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
14461 FirstNonBool = I;
14462 if (Level > RecursionMaxDepth)
14463 continue;
14464
14465 // If the current instruction is a load, update MaxWidth to reflect the
14466 // width of the loaded value.
14467 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
14468 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
14469
14470 // Otherwise, we need to visit the operands of the instruction. We only
14471 // handle the interesting cases from buildTree here. If an operand is an
14472 // instruction we haven't yet visited and from the same basic block as the
14473 // user or the use is a PHI node, we add it to the worklist.
14474     if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
14475             BinaryOperator, UnaryOperator>(I)) {
14476       for (Use &U : I->operands()) {
14477 if (auto *J = dyn_cast<Instruction>(U.get()))
14478 if (Visited.insert(J).second &&
14479 (isa<PHINode>(I) || J->getParent() == Parent)) {
14480 Worklist.emplace_back(J, J->getParent(), Level + 1);
14481 continue;
14482 }
14483 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
14484 FirstNonBool = U.get();
14485 }
14486 } else {
14487 break;
14488 }
14489 }
14490
14491 // If we didn't encounter a memory access in the expression tree, or if we
14492 // gave up for some reason, just return the width of V. Otherwise, return the
14493 // maximum width we found.
14494 if (!Width) {
14495 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
14496 V = FirstNonBool;
14497 Width = DL->getTypeSizeInBits(V->getType());
14498 }
14499
14500 for (Instruction *I : Visited)
14501 InstrElementSize[I] = Width;
14502
14503 return Width;
14504}
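// Illustrative note (assumed example, not from the upstream source): for a
// scalar chain such as
//   %l = load i16, ptr %p
//   %e = sext i16 %l to i64
//   %a = add i64 %e, 1
// querying the element size of %a walks back through the add and the sext to
// the load and yields 16 (provided the chain sits in a single basic block);
// if no load/extract is reachable, the width of %a's own type (64) is used
// instead.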
14505
14506bool BoUpSLP::collectValuesToDemote(
14507 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
14508     SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
14509     unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
14510 bool IsTruncRoot) const {
14511 // We can always demote constants.
14512 if (all_of(E.Scalars, IsaPred<Constant>))
14513 return true;
14514
14515 unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
14516 if (OrigBitWidth == BitWidth) {
14517 MaxDepthLevel = 1;
14518 return true;
14519 }
14520
14521 // If the value is not a vectorized instruction in the expression and not used
14522 // by the insertelement instruction and not used in multiple vector nodes, it
14523 // cannot be demoted.
14524 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
14525 if (MultiNodeScalars.contains(V))
14526 return false;
14527 if (OrigBitWidth > BitWidth) {
14528 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14529 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14530 return true;
14531 }
14532 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
14533 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14534 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*DL));
14535 if (IsSigned)
14536 ++BitWidth1;
14537 if (auto *I = dyn_cast<Instruction>(V)) {
14538 APInt Mask = DB->getDemandedBits(I);
14539 unsigned BitWidth2 =
14540 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
14541 while (!IsSigned && BitWidth2 < OrigBitWidth) {
14542 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
14543 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14544 break;
14545 BitWidth2 *= 2;
14546 }
14547 BitWidth1 = std::min(BitWidth1, BitWidth2);
14548 }
14549 BitWidth = std::max(BitWidth, BitWidth1);
14550 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
14551 };
14552 using namespace std::placeholders;
14553 auto FinalAnalysis = [&]() {
14554 if (!IsProfitableToDemote)
14555 return false;
14556 bool Res = all_of(
14557 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
14558 // Gather demoted constant operands.
14559 if (Res && E.State == TreeEntry::NeedToGather &&
14560 all_of(E.Scalars, IsaPred<Constant>))
14561 ToDemote.push_back(E.Idx);
14562 return Res;
14563 };
14564 // TODO: improve handling of gathered values and others.
14565 if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
14566 any_of(E.Scalars, [&](Value *V) {
14567 return all_of(V->users(), [&](User *U) {
14568 return isa<InsertElementInst>(U) && !getTreeEntry(U);
14569 });
14570 }))
14571 return FinalAnalysis();
14572
14573 if (any_of(E.Scalars, [&](Value *V) {
14574 return !all_of(V->users(), [=](User *U) {
14575 return getTreeEntry(U) ||
14576 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14577 (!isa<CmpInst>(U) && U->getType()->isSized() &&
14578 !U->getType()->isScalableTy() &&
14579 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
14580 }) && !IsPotentiallyTruncated(V, BitWidth);
14581 }))
14582 return false;
14583
14584 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
14585 bool &NeedToExit) {
14586 NeedToExit = false;
14587 unsigned InitLevel = MaxDepthLevel;
14588 for (const TreeEntry *Op : Operands) {
14589 unsigned Level = InitLevel;
14590 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
14591 ToDemote, Visited, Level, IsProfitableToDemote,
14592 IsTruncRoot)) {
14593 if (!IsProfitableToDemote)
14594 return false;
14595 NeedToExit = true;
14596 if (!FinalAnalysis())
14597 return false;
14598 continue;
14599 }
14600 MaxDepthLevel = std::max(MaxDepthLevel, Level);
14601 }
14602 return true;
14603 };
14604 auto AttemptCheckBitwidth =
14605 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
14606 // Try all bitwidth < OrigBitWidth.
14607 NeedToExit = false;
14608 unsigned BestFailBitwidth = 0;
14609 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
14610 if (Checker(BitWidth, OrigBitWidth))
14611 return true;
14612 if (BestFailBitwidth == 0 && FinalAnalysis())
14613 BestFailBitwidth = BitWidth;
14614 }
14615 if (BitWidth >= OrigBitWidth) {
14616 if (BestFailBitwidth == 0) {
14617 BitWidth = OrigBitWidth;
14618 return false;
14619 }
14620 MaxDepthLevel = 1;
14621 BitWidth = BestFailBitwidth;
14622 NeedToExit = true;
14623 return true;
14624 }
14625 return false;
14626 };
14627 auto TryProcessInstruction =
14628 [&](unsigned &BitWidth,
14629           ArrayRef<const TreeEntry *> Operands = std::nullopt,
14630           function_ref<bool(unsigned, unsigned)> Checker = {}) {
14631 if (Operands.empty()) {
14632 if (!IsTruncRoot)
14633 MaxDepthLevel = 1;
14634 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
14635 std::ref(BitWidth)));
14636 } else {
14637 // Several vectorized uses? Check if we can truncate it, otherwise -
14638 // exit.
14639 if (E.UserTreeIndices.size() > 1 &&
14640 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
14641 std::ref(BitWidth))))
14642 return false;
14643 bool NeedToExit = false;
14644 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
14645 return false;
14646 if (NeedToExit)
14647 return true;
14648 if (!ProcessOperands(Operands, NeedToExit))
14649 return false;
14650 if (NeedToExit)
14651 return true;
14652 }
14653
14654 ++MaxDepthLevel;
14655 // Record the entry that we can demote.
14656 ToDemote.push_back(E.Idx);
14657 return IsProfitableToDemote;
14658 };
14659 switch (E.getOpcode()) {
14660
14661 // We can always demote truncations and extensions. Since truncations can
14662 // seed additional demotion, we save the truncated value.
14663 case Instruction::Trunc:
14664 if (IsProfitableToDemoteRoot)
14665 IsProfitableToDemote = true;
14666 return TryProcessInstruction(BitWidth);
14667 case Instruction::ZExt:
14668 case Instruction::SExt:
14669 IsProfitableToDemote = true;
14670 return TryProcessInstruction(BitWidth);
14671
14672 // We can demote certain binary operations if we can demote both of their
14673 // operands.
14674 case Instruction::Add:
14675 case Instruction::Sub:
14676 case Instruction::Mul:
14677 case Instruction::And:
14678 case Instruction::Or:
14679 case Instruction::Xor: {
14680 return TryProcessInstruction(
14681 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
14682 }
14683 case Instruction::Shl: {
14684 // If we are truncating the result of this SHL, and if it's a shift of an
14685     // in-range amount, we can always perform a SHL in a smaller type.
14686 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
14687 return all_of(E.Scalars, [&](Value *V) {
14688 auto *I = cast<Instruction>(V);
14689 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14690 return AmtKnownBits.getMaxValue().ult(BitWidth);
14691 });
14692 };
14693 return TryProcessInstruction(
14694 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
14695 }
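// Illustrative note (assumed example, not from the upstream source): a scalar
// pattern such as
//   %s = shl i32 %x, 3
//   %t = trunc i32 %s to i8
// satisfies ShlChecker for an 8-bit target width because the shift amount is
// known to be less than 8, so the shl can be performed directly on the
// truncated value.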
14696 case Instruction::LShr: {
14697 // If this is a truncate of a logical shr, we can truncate it to a smaller
14698 // lshr iff we know that the bits we would otherwise be shifting in are
14699 // already zeros.
14700 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14701 return all_of(E.Scalars, [&](Value *V) {
14702 auto *I = cast<Instruction>(V);
14703 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14704 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14705 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
14706 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
14707 SimplifyQuery(*DL));
14708 });
14709 };
14710 return TryProcessInstruction(
14711 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
14712 LShrChecker);
14713 }
14714 case Instruction::AShr: {
14715 // If this is a truncate of an arithmetic shr, we can truncate it to a
14716 // smaller ashr iff we know that all the bits from the sign bit of the
14717     // original type and the sign bit of the truncate type are the same.
14718 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14719 return all_of(E.Scalars, [&](Value *V) {
14720 auto *I = cast<Instruction>(V);
14721 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14722 unsigned ShiftedBits = OrigBitWidth - BitWidth;
14723 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
14724 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
14725 nullptr, DT);
14726 });
14727 };
14728 return TryProcessInstruction(
14729 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
14730 AShrChecker);
14731 }
14732 case Instruction::UDiv:
14733 case Instruction::URem: {
14734 // UDiv and URem can be truncated if all the truncated bits are zero.
14735 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14736 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14737 return all_of(E.Scalars, [&](Value *V) {
14738 auto *I = cast<Instruction>(V);
14739 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14740 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
14741 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
14742 });
14743 };
14744 return TryProcessInstruction(
14745 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
14746 }
14747
14748 // We can demote selects if we can demote their true and false values.
14749 case Instruction::Select: {
14750 return TryProcessInstruction(
14751 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
14752 }
14753
14754 // We can demote phis if we can demote all their incoming operands. Note that
14755 // we don't need to worry about cycles since we ensure single use above.
14756 case Instruction::PHI: {
14757 const unsigned NumOps = E.getNumOperands();
14758     SmallVector<const TreeEntry *> Ops(NumOps);
14759     transform(seq<unsigned>(0, NumOps), Ops.begin(),
14760 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
14761
14762 return TryProcessInstruction(BitWidth, Ops);
14763 }
14764
14765 case Instruction::Call: {
14766 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
14767 if (!IC)
14768 break;
14769     Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
14770     if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
14771 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
14772 break;
14773 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
14774 function_ref<bool(unsigned, unsigned)> CallChecker;
14775 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14776 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14777 return all_of(E.Scalars, [&](Value *V) {
14778 auto *I = cast<Instruction>(V);
14779 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
14780 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14781 return MaskedValueIsZero(I->getOperand(0), Mask,
14782 SimplifyQuery(*DL)) &&
14783 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
14784 }
14785 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
14786 "Expected min/max intrinsics only.");
14787 unsigned SignBits = OrigBitWidth - BitWidth;
14788 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
14789 return SignBits <= ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
14790 nullptr, DT) &&
14791 (!isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL)) ||
14792 MaskedValueIsZero(I->getOperand(0), Mask,
14793 SimplifyQuery(*DL))) &&
14794 SignBits <= ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
14795 nullptr, DT) &&
14796 (!isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL)) ||
14797 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
14798 });
14799 };
14800 if (ID != Intrinsic::abs) {
14801 Operands.push_back(getOperandEntry(&E, 1));
14802 CallChecker = CompChecker;
14803 }
14804 InstructionCost BestCost =
14805 std::numeric_limits<InstructionCost::CostType>::max();
14806 unsigned BestBitWidth = BitWidth;
14807 unsigned VF = E.Scalars.size();
14808 // Choose the best bitwidth based on cost estimations.
14809 auto Checker = [&](unsigned BitWidth, unsigned) {
14810 unsigned MinBW = PowerOf2Ceil(BitWidth);
14811 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
14812 auto VecCallCosts = getVectorCallCosts(
14813 IC,
14814 FixedVectorType::get(IntegerType::get(IC->getContext(), MinBW), VF),
14815 TTI, TLI, ArgTys);
14816 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
14817 if (Cost < BestCost) {
14818 BestCost = Cost;
14819 BestBitWidth = BitWidth;
14820 }
14821 return false;
14822 };
14823 [[maybe_unused]] bool NeedToExit;
14824 (void)AttemptCheckBitwidth(Checker, NeedToExit);
14825 BitWidth = BestBitWidth;
14826 return TryProcessInstruction(BitWidth, Operands, CallChecker);
14827 }
14828
14829 // Otherwise, conservatively give up.
14830 default:
14831 break;
14832 }
14833 MaxDepthLevel = 1;
14834 return FinalAnalysis();
14835}
14836
14837static RecurKind getRdxKind(Value *V);
14838
14839 void BoUpSLP::computeMinimumValueSizes() {
14840   // We only attempt to truncate integer expressions.
14841 bool IsStoreOrInsertElt =
14842 VectorizableTree.front()->getOpcode() == Instruction::Store ||
14843 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
14844 if ((IsStoreOrInsertElt || UserIgnoreList) &&
14845 ExtraBitWidthNodes.size() <= 1 &&
14846 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
14847 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
14848 return;
14849
14850 unsigned NodeIdx = 0;
14851 if (IsStoreOrInsertElt &&
14852 VectorizableTree.front()->State != TreeEntry::NeedToGather)
14853 NodeIdx = 1;
14854
14855 // Ensure the roots of the vectorizable tree don't form a cycle.
14856 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
14857 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
14858 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
14859 [NodeIdx](const EdgeInfo &EI) {
14860 return EI.UserTE->Idx >
14861 static_cast<int>(NodeIdx);
14862 })))
14863 return;
14864
14865   // If the first value node for a store/insertelement is a sext/zext/trunc,
14866   // skip it and resize to the final type.
14867 bool IsTruncRoot = false;
14868 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
14869 SmallVector<unsigned> RootDemotes;
14870 if (NodeIdx != 0 &&
14871 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
14872 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
14873 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
14874 IsTruncRoot = true;
14875 RootDemotes.push_back(NodeIdx);
14876 IsProfitableToDemoteRoot = true;
14877 ++NodeIdx;
14878 }
14879
14880   // The reduction was already analyzed and found not profitable - exit.
14881 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
14882 return;
14883
14884 SmallVector<unsigned> ToDemote;
14885 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
14886 bool IsProfitableToDemoteRoot, unsigned Opcode,
14887 unsigned Limit, bool IsTruncRoot,
14888 bool IsSignedCmp) {
14889 ToDemote.clear();
14890 unsigned VF = E.getVectorFactor();
14891 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
14892 if (!TreeRootIT || !Opcode)
14893 return 0u;
14894
14895 if (any_of(E.Scalars,
14896 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
14897 return 0u;
14898
14899 unsigned NumParts =
14900 TTI->getNumberOfParts(FixedVectorType::get(TreeRootIT, VF));
14901
14902 // The maximum bit width required to represent all the values that can be
14903 // demoted without loss of precision. It would be safe to truncate the roots
14904 // of the expression to this width.
14905 unsigned MaxBitWidth = 1u;
14906
14907 // True if the roots can be zero-extended back to their original type,
14908 // rather than sign-extended. We know that if the leading bits are not
14909 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
14910 // True.
14911 // Determine if the sign bit of all the roots is known to be zero. If not,
14912 // IsKnownPositive is set to False.
14913 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
14914 KnownBits Known = computeKnownBits(R, *DL);
14915 return Known.isNonNegative();
14916 });
14917
14918 // We first check if all the bits of the roots are demanded. If they're not,
14919 // we can truncate the roots to this narrower type.
14920 for (Value *Root : E.Scalars) {
14921 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
14922 TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
14923 unsigned BitWidth1 = NumTypeBits - NumSignBits;
14924 // If we can't prove that the sign bit is zero, we must add one to the
14925 // maximum bit width to account for the unknown sign bit. This preserves
14926 // the existing sign bit so we can safely sign-extend the root back to the
14927 // original type. Otherwise, if we know the sign bit is zero, we will
14928 // zero-extend the root instead.
14929 //
14930 // FIXME: This is somewhat suboptimal, as there will be cases where adding
14931 // one to the maximum bit width will yield a larger-than-necessary
14932 // type. In general, we need to add an extra bit only if we can't
14933 // prove that the upper bit of the original type is equal to the
14934 // upper bit of the proposed smaller type. If these two bits are
14935 // the same (either zero or one) we know that sign-extending from
14936 // the smaller type will result in the same value. Here, since we
14937 // can't yet prove this, we are just making the proposed smaller
14938 // type larger to ensure correctness.
14939 if (!IsKnownPositive)
14940 ++BitWidth1;
14941
14942 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
14943 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
14944 MaxBitWidth =
14945 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
14946 }
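// For illustration (hypothetical numbers, not from a particular test): for an
// i32 root where ComputeNumSignBits reports 25 sign bits, BitWidth1 is
// 32 - 25 = 7, plus one when the value is not known to be positive, giving 8.
// If DemandedBits reports only the low 8 bits as demanded, BitWidth2 is
// 32 - 24 = 8, so MaxBitWidth becomes max(min(8, 8), 1) = 8.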
14947
14948 if (MaxBitWidth < 8 && MaxBitWidth > 1)
14949 MaxBitWidth = 8;
14950
14951 // If the original type is large but the reduced type does not improve
14952 // register usage - ignore it.
14953 if (NumParts > 1 &&
14954 NumParts ==
14955 TTI->getNumberOfParts(FixedVectorType::get(
14956 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
14957 return 0u;
14958
14959 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
14960 Opcode == Instruction::SExt ||
14961 Opcode == Instruction::ZExt || NumParts > 1;
14962 // Conservatively determine if we can actually truncate the roots of the
14963 // expression. Collect the values that can be demoted in ToDemote and
14964 // additional roots that require investigating in Roots.
14965 DenseSet<const TreeEntry *> Visited;
14966 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
14967 bool NeedToDemote = IsProfitableToDemote;
14968
14969 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
14970 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
14971 IsTruncRoot) ||
14972 (MaxDepthLevel <= Limit &&
14973 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
14974 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
14975 DL->getTypeSizeInBits(TreeRootIT) /
14976 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
14977 ->getOperand(0)
14978 ->getType()) >
14979 2)))))
14980 return 0u;
14981 // Round MaxBitWidth up to the next power-of-two.
14982 MaxBitWidth = bit_ceil(MaxBitWidth);
14983
14984 return MaxBitWidth;
14985 };
14986
14987 // If we can truncate the root, we must collect additional values that might
14988 // be demoted as a result. That is, those seeded by truncations we will
14989 // modify.
14990 // Add reduction ops sizes, if any.
14991 if (UserIgnoreList &&
14992 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
14993 for (Value *V : *UserIgnoreList) {
14994 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
14995 auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
14996 unsigned BitWidth1 = NumTypeBits - NumSignBits;
14997 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
14998 ++BitWidth1;
14999 unsigned BitWidth2 = BitWidth1;
15000 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
15001 auto Mask = DB->getDemandedBits(cast<Instruction>(V));
15002 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15003 }
15004 ReductionBitWidth =
15005 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15006 }
15007 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15008 ReductionBitWidth = 8;
15009
15010 ReductionBitWidth = bit_ceil(ReductionBitWidth);
15011 }
15012 bool IsTopRoot = NodeIdx == 0;
15013 while (NodeIdx < VectorizableTree.size() &&
15014 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15015 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15016 RootDemotes.push_back(NodeIdx);
15017 ++NodeIdx;
15018 IsTruncRoot = true;
15019 }
15020 bool IsSignedCmp = false;
15021 while (NodeIdx < VectorizableTree.size()) {
15022 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15023 unsigned Limit = 2;
15024 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15025 if (IsTopRoot &&
15026 ReductionBitWidth ==
15027 DL->getTypeSizeInBits(
15028 VectorizableTree.front()->Scalars.front()->getType()))
15029 Limit = 3;
15030 unsigned MaxBitWidth = ComputeMaxBitWidth(
15031 *VectorizableTree[NodeIdx].get(), IsTopRoot, IsProfitableToDemoteRoot,
15032 Opcode, Limit, IsTruncRoot, IsSignedCmp);
15033 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15034 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15035 ReductionBitWidth = bit_ceil(MaxBitWidth);
15036 else if (MaxBitWidth == 0)
15037 ReductionBitWidth = 0;
15038 }
15039
15040 for (unsigned Idx : RootDemotes) {
15041 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
15042 uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
15043 if (OrigBitWidth > MaxBitWidth) {
15044 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
15045 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
15046 }
15047 return false;
15048 }))
15049 ToDemote.push_back(Idx);
15050 }
15051 RootDemotes.clear();
15052 IsTopRoot = false;
15053 IsProfitableToDemoteRoot = true;
15054
15055 if (ExtraBitWidthNodes.empty()) {
15056 NodeIdx = VectorizableTree.size();
15057 } else {
15058 unsigned NewIdx = 0;
15059 do {
15060 NewIdx = *ExtraBitWidthNodes.begin();
15061 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
15062 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15063 NodeIdx = NewIdx;
15064 IsTruncRoot =
15065 NodeIdx < VectorizableTree.size() &&
15066 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15067 [](const EdgeInfo &EI) {
15068 return EI.EdgeIdx == 0 &&
15069 EI.UserTE->getOpcode() == Instruction::Trunc &&
15070 !EI.UserTE->isAltShuffle();
15071 });
15072 IsSignedCmp =
15073 NodeIdx < VectorizableTree.size() &&
15074 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15075 [](const EdgeInfo &EI) {
15076 return EI.UserTE->getOpcode() == Instruction::ICmp &&
15077 any_of(EI.UserTE->Scalars, [](Value *V) {
15078 auto *IC = dyn_cast<ICmpInst>(V);
15079 return IC && IC->isSigned();
15080 });
15081 });
15082 }
15083
15084 // If the maximum bit width we compute is less than the width of the roots'
15085 // type, we can proceed with the narrowing. Otherwise, do nothing.
15086 if (MaxBitWidth == 0 ||
15087 MaxBitWidth >=
15088 cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
15089 if (UserIgnoreList)
15090 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
15091 continue;
15092 }
15093
15094 // Finally, map the values we can demote to the maximum bit width we
15095 // computed.
15096 for (unsigned Idx : ToDemote) {
15097 TreeEntry *TE = VectorizableTree[Idx].get();
15098 if (MinBWs.contains(TE))
15099 continue;
15100 bool IsSigned = TE->getOpcode() == Instruction::SExt ||
15101 any_of(TE->Scalars, [&](Value *R) {
15102 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15103 });
15104 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
15105 }
15106 }
15107}
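// As a rough illustration of what the analysis above enables (hypothetical IR,
// not a specific regression test): if two i32 additions are only ever consumed
// through i16 truncating stores, the corresponding tree entries can be recorded
// in MinBWs with a width of 16, so the vectorized tree may perform the
// additions on <2 x i16> instead of <2 x i32>:
//   %a = add i32 %x, %y
//   %b = add i32 %z, %w
//   %ta = trunc i32 %a to i16
//   %tb = trunc i32 %b to i16
//   store i16 %ta, ptr %p
//   store i16 %tb, ptr %q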
15108
15109 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15110 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
15111 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
15112 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
15113 auto *AA = &AM.getResult<AAManager>(F);
15114 auto *LI = &AM.getResult<LoopAnalysis>(F);
15115 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
15116 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
15117 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
15118 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
15119
15120 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
15121 if (!Changed)
15122 return PreservedAnalyses::all();
15123
15124 PreservedAnalyses PA;
15125 PA.preserveSet<CFGAnalyses>();
15126 return PA;
15127}
15128
15129 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15130 TargetTransformInfo *TTI_,
15131 TargetLibraryInfo *TLI_, AAResults *AA_,
15132 LoopInfo *LI_, DominatorTree *DT_,
15133 AssumptionCache *AC_, DemandedBits *DB_,
15134 OptimizationRemarkEmitter *ORE_) {
15135 if (!RunSLPVectorization)
15136 return false;
15137 SE = SE_;
15138 TTI = TTI_;
15139 TLI = TLI_;
15140 AA = AA_;
15141 LI = LI_;
15142 DT = DT_;
15143 AC = AC_;
15144 DB = DB_;
15145 DL = &F.getParent()->getDataLayout();
15146
15147 Stores.clear();
15148 GEPs.clear();
15149 bool Changed = false;
15150
15151 // If the target claims to have no vector registers don't attempt
15152 // vectorization.
15153 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
15154 LLVM_DEBUG(
15155 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15156 return false;
15157 }
15158
15159 // Don't vectorize when the attribute NoImplicitFloat is used.
15160 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
15161 return false;
15162
15163 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15164
15165 // Use the bottom up slp vectorizer to construct chains that start with
15166 // store instructions.
15167 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15168
15169 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15170 // delete instructions.
15171
15172 // Update DFS numbers now so that we can use them for ordering.
15173 DT->updateDFSNumbers();
15174
15175 // Scan the blocks in the function in post order.
15176 for (auto *BB : post_order(&F.getEntryBlock())) {
15177 // Start new block - clear the list of reduction roots.
15178 R.clearReductionData();
15179 collectSeedInstructions(BB);
15180
15181 // Vectorize trees that end at stores.
15182 if (!Stores.empty()) {
15183 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15184 << " underlying objects.\n");
15185 Changed |= vectorizeStoreChains(R);
15186 }
15187
15188 // Vectorize trees that end at reductions.
15189 Changed |= vectorizeChainsInBlock(BB, R);
15190
15191 // Vectorize the index computations of getelementptr instructions. This
15192 // is primarily intended to catch gather-like idioms ending at
15193 // non-consecutive loads.
15194 if (!GEPs.empty()) {
15195 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15196 << " underlying objects.\n");
15197 Changed |= vectorizeGEPIndices(BB, R);
15198 }
15199 }
15200
15201 if (Changed) {
15202 R.optimizeGatherSequence();
15203 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15204 }
15205 return Changed;
15206}
15207
15208bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15209 unsigned Idx, unsigned MinVF) {
15210 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15211 << "\n");
15212 const unsigned Sz = R.getVectorElementSize(Chain[0]);
15213 unsigned VF = Chain.size();
15214
15215 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
15216 // Check if vectorizing with a non-power-of-2 VF should be considered. At
15217 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15218 // all vector lanes are used.
15219 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15220 return false;
15221 }
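// For example (illustrative numbers): with MinVF == 8, a chain of 7 stores is
// still analyzed when VectorizeNonPowerOf2 is set, because VF + 1 == MinVF (a
// power of two, i.e. almost all lanes used), whereas a chain of 6 stores is
// rejected by the check above.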
15222
15223 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15224 << "\n");
15225
15226 R.buildTree(Chain);
15227 if (R.isTreeTinyAndNotFullyVectorizable())
15228 return false;
15229 if (R.isLoadCombineCandidate())
15230 return false;
15231 R.reorderTopToBottom();
15232 R.reorderBottomToTop();
15233 R.buildExternalUses();
15234
15235 R.computeMinimumValueSizes();
15236 R.transformNodes();
15237
15238 InstructionCost Cost = R.getTreeCost();
15239
15240 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15241 if (Cost < -SLPCostThreshold) {
15242 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15243
15244 using namespace ore;
15245
15246 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
15247 cast<StoreInst>(Chain[0]))
15248 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15249 << " and with tree size "
15250 << NV("TreeSize", R.getTreeSize()));
15251
15252 R.vectorizeTree();
15253 return true;
15254 }
15255
15256 return false;
15257}
15258
15259bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
15260 BoUpSLP &R) {
15261 // We may run into multiple chains that merge into a single chain. We mark the
15262 // stores that we vectorized so that we don't visit the same store twice.
15263 BoUpSLP::ValueSet VectorizedStores;
15264 bool Changed = false;
15265
15266 // Stores the pairs of stores (first_store, last_store) in a range that were
15267 // already tried to be vectorized. Allows skipping the store ranges whose
15268 // vectorization attempts were unsuccessful.
15269 DenseSet<std::pair<Value *, Value *>> TriedSequences;
15270 struct StoreDistCompare {
15271 bool operator()(const std::pair<unsigned, int> &Op1,
15272 const std::pair<unsigned, int> &Op2) const {
15273 return Op1.second < Op2.second;
15274 }
15275 };
15276 // A set of pairs (index of store in Stores array ref, Distance of the store
15277 // address relative to base store address in units).
15278 using StoreIndexToDistSet =
15279 std::set<std::pair<unsigned, int>, StoreDistCompare>;
15280 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
15281 int PrevDist = -1;
15282 BoUpSLP::ValueList Operands;
15283 // Collect the chain into a list.
15284 for (auto [Idx, Data] : enumerate(Set)) {
15285 if (Operands.empty() || Data.second - PrevDist == 1) {
15286 Operands.push_back(Stores[Data.first]);
15287 PrevDist = Data.second;
15288 if (Idx != Set.size() - 1)
15289 continue;
15290 }
15291 auto E = make_scope_exit([&, &DataVar = Data]() {
15292 Operands.clear();
15293 Operands.push_back(Stores[DataVar.first]);
15294 PrevDist = DataVar.second;
15295 });
15296
15297 if (Operands.size() <= 1)
15298 continue;
15299
15300 unsigned MaxVecRegSize = R.getMaxVecRegSize();
15301 unsigned EltSize = R.getVectorElementSize(Operands[0]);
15302 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
15303
15304 unsigned MaxVF =
15305 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
15306 auto *Store = cast<StoreInst>(Operands[0]);
15307 Type *StoreTy = Store->getValueOperand()->getType();
15308 Type *ValueTy = StoreTy;
15309 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
15310 ValueTy = Trunc->getSrcTy();
15311 unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
15312 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));
15313
15314 if (MaxVF < MinVF) {
15315 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
15316 << ") < "
15317 << "MinVF (" << MinVF << ")\n");
15318 continue;
15319 }
15320
15321 unsigned NonPowerOf2VF = 0;
15322 if (VectorizeNonPowerOf2) {
15323 // First try vectorizing with a non-power-of-2 VF. At the moment, only
15324 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
15325 // lanes are used.
15326 unsigned CandVF = Operands.size();
15327 if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF)
15328 NonPowerOf2VF = CandVF;
15329 }
15330
15331 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
15332 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
15333 unsigned Size = MinVF;
15334 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
15335 VF = Size > MaxVF ? NonPowerOf2VF : Size;
15336 Size *= 2;
15337 });
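// For illustration (hypothetical values): with MinVF = 2, MaxVF = 16 and a
// 15-store group (so NonPowerOf2VF = 15), the loop above fills
// CandidateVFs = {15, 16, 8, 4, 2}; the non-power-of-2 factor takes the first
// slot and the power-of-two factors follow in decreasing order.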
15338 unsigned StartIdx = 0;
15339 for (unsigned Size : CandidateVFs) {
15340 for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
15341 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
15342 assert(
15343 all_of(
15344 Slice,
15345 [&](Value *V) {
15346 return cast<StoreInst>(V)->getValueOperand()->getType() ==
15347 cast<StoreInst>(Slice.front())
15348 ->getValueOperand()
15349 ->getType();
15350 }) &&
15351 "Expected all operands of same type.");
15352 if (!VectorizedStores.count(Slice.front()) &&
15353 !VectorizedStores.count(Slice.back()) &&
15354 TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
15355 .second &&
15356 vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
15357 // Mark the vectorized stores so that we don't vectorize them again.
15358 VectorizedStores.insert(Slice.begin(), Slice.end());
15359 Changed = true;
15360 // If we vectorized initial block, no need to try to vectorize it
15361 // again.
15362 if (Cnt == StartIdx)
15363 StartIdx += Size;
15364 Cnt += Size;
15365 continue;
15366 }
15367 ++Cnt;
15368 }
15369 // Check if the whole array was vectorized already - exit.
15370 if (StartIdx >= Operands.size())
15371 break;
15372 }
15373 }
15374 };
15375
15376 // Stores pairs (first: index of the store in the Stores array ref, whose
15377 // address is taken as the base; second: sorted set of pairs {index, dist},
15378 // which are indices of stores in the set and their store location distances
15379 // relative to the base address).
15380
15381 // Need to store the index of the very first store separately, since the set
15382 // may be reordered after the insertion and the first store may be moved. This
15383 // container allows reducing the number of calls to the getPointersDiff() function.
15384 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
15385 // Inserts the specified store SI with the given index Idx to the set of the
15386 // stores. If the store with the same distance is found already - stop
15387 // insertion, try to vectorize already found stores. If some stores from this
15388 // sequence were not vectorized - try to vectorize them with the new store
15389 // later. But this logic is applied only to the stores that come before the
15390 // previous store with the same distance.
15391 // Example:
15392 // 1. store x, %p
15393 // 2. store y, %p+1
15394 // 3. store z, %p+2
15395 // 4. store a, %p
15396 // 5. store b, %p+3
15397 // - Scan this from the last to first store. The very first bunch of stores is
15398 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
15399 // vector).
15400 // - The next store in the list - #1 - has the same distance from store #5 as
15401 // the store #4.
15402 // - Try to vectorize sequence of stores 4,2,3,5.
15403 // - If all these stores are vectorized - just drop them.
15404 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
15405 // - Start new stores sequence.
15406 // The new bunch of stores is {1, {1, 0}}.
15407 // - Add the stores from previous sequence, that were not vectorized.
15408 // Here we consider the stores in reversed order, rather than the order they
15409 // are used in the IR (Stores are reversed already, see vectorizeStoreChains()).
15410 // Store #3 can be added -> comes after store #4 with the same distance as
15411 // store #1.
15412 // Store #5 cannot be added - comes before store #4.
15413 // This logic improves compile time: we assume that the stores after the
15414 // previous store with the same distance most likely have memory dependencies,
15415 // so there is no need to waste compile time trying to vectorize them.
15416 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
15417 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
15418 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
15419 std::optional<int> Diff = getPointersDiff(
15420 Stores[Set.first]->getValueOperand()->getType(),
15421 Stores[Set.first]->getPointerOperand(),
15422 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
15423 /*StrictCheck=*/true);
15424 if (!Diff)
15425 continue;
15426 auto It = Set.second.find(std::make_pair(Idx, *Diff));
15427 if (It == Set.second.end()) {
15428 Set.second.emplace(Idx, *Diff);
15429 return;
15430 }
15431 // Try to vectorize the first found set to avoid duplicate analysis.
15432 TryToVectorize(Set.second);
15433 StoreIndexToDistSet PrevSet;
15434 PrevSet.swap(Set.second);
15435 Set.first = Idx;
15436 Set.second.emplace(Idx, 0);
15437 // Insert stores that followed previous match to try to vectorize them
15438 // with this store.
15439 unsigned StartIdx = It->first + 1;
15440 SmallBitVector UsedStores(Idx - StartIdx);
15441 // Distances to previously found dup store (or this store, since they
15442 // store to the same addresses).
15443 SmallVector<int> Dists(Idx - StartIdx, 0);
15444 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
15445 // Do not try to vectorize sequences, we already tried.
15446 if (Pair.first <= It->first ||
15447 VectorizedStores.contains(Stores[Pair.first]))
15448 break;
15449 unsigned BI = Pair.first - StartIdx;
15450 UsedStores.set(BI);
15451 Dists[BI] = Pair.second - It->second;
15452 }
15453 for (unsigned I = StartIdx; I < Idx; ++I) {
15454 unsigned BI = I - StartIdx;
15455 if (UsedStores.test(BI))
15456 Set.second.emplace(I, Dists[BI]);
15457 }
15458 return;
15459 }
15460 auto &Res = SortedStores.emplace_back();
15461 Res.first = Idx;
15462 Res.second.emplace(Idx, 0);
15463 };
15464 StoreInst *PrevStore = Stores.front();
15465 for (auto [I, SI] : enumerate(Stores)) {
15466 // Check that we do not try to vectorize stores of different types.
15467 if (PrevStore->getValueOperand()->getType() !=
15468 SI->getValueOperand()->getType()) {
15469 for (auto &Set : SortedStores)
15470 TryToVectorize(Set.second);
15471 SortedStores.clear();
15472 PrevStore = SI;
15473 }
15474 FillStoresSet(I, SI);
15475 }
15476
15477 // Final vectorization attempt.
15478 for (auto &Set : SortedStores)
15479 TryToVectorize(Set.second);
15480
15481 return Changed;
15482}
15483
15484void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
15485 // Initialize the collections. We will make a single pass over the block.
15486 Stores.clear();
15487 GEPs.clear();
15488
15489 // Visit the store and getelementptr instructions in BB and organize them in
15490 // Stores and GEPs according to the underlying objects of their pointer
15491 // operands.
15492 for (Instruction &I : *BB) {
15493 // Ignore store instructions that are volatile or have a pointer operand
15494 // that doesn't point to a scalar type.
15495 if (auto *SI = dyn_cast<StoreInst>(&I)) {
15496 if (!SI->isSimple())
15497 continue;
15498 if (!isValidElementType(SI->getValueOperand()->getType()))
15499 continue;
15500 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
15501 }
15502
15503 // Ignore getelementptr instructions that have more than one index, a
15504 // constant index, or a pointer operand that doesn't point to a scalar
15505 // type.
15506 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
15507 if (GEP->getNumIndices() != 1)
15508 continue;
15509 Value *Idx = GEP->idx_begin()->get();
15510 if (isa<Constant>(Idx))
15511 continue;
15512 if (!isValidElementType(Idx->getType()))
15513 continue;
15514 if (GEP->getType()->isVectorTy())
15515 continue;
15516 GEPs[GEP->getPointerOperand()].push_back(GEP);
15517 }
15518 }
15519}
15520
15521bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
15522 bool MaxVFOnly) {
15523 if (VL.size() < 2)
15524 return false;
15525
15526 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
15527 << VL.size() << ".\n");
15528
15529 // Check that all of the parts are instructions of the same type,
15530 // we permit an alternate opcode via InstructionsState.
15531 InstructionsState S = getSameOpcode(VL, *TLI);
15532 if (!S.getOpcode())
15533 return false;
15534
15535 Instruction *I0 = cast<Instruction>(S.OpValue);
15536 // Make sure invalid types (including vector type) are rejected before
15537 // determining vectorization factor for scalar instructions.
15538 for (Value *V : VL) {
15539 Type *Ty = V->getType();
15540 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
15541 // NOTE: the following will give user internal llvm type name, which may
15542 // not be useful.
15543 R.getORE()->emit([&]() {
15544 std::string TypeStr;
15545 llvm::raw_string_ostream rso(TypeStr);
15546 Ty->print(rso);
15547 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
15548 << "Cannot SLP vectorize list: type "
15549 << rso.str() + " is unsupported by vectorizer";
15550 });
15551 return false;
15552 }
15553 }
15554
15555 unsigned Sz = R.getVectorElementSize(I0);
15556 unsigned MinVF = R.getMinVF(Sz);
15557 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
15558 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
15559 if (MaxVF < 2) {
15560 R.getORE()->emit([&]() {
15561 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
15562 << "Cannot SLP vectorize list: vectorization factor "
15563 << "less than 2 is not supported";
15564 });
15565 return false;
15566 }
15567
15568 bool Changed = false;
15569 bool CandidateFound = false;
15570 InstructionCost MinCost = SLPCostThreshold.getValue();
15571 Type *ScalarTy = VL[0]->getType();
15572 if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
15573 ScalarTy = IE->getOperand(1)->getType();
15574
15575 unsigned NextInst = 0, MaxInst = VL.size();
15576 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
15577 // No actual vectorization should happen if the number of parts is the same
15578 // as the provided vectorization factor (i.e. the scalar type is used for
15579 // vector code during codegen).
15580 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
15581 if (TTI->getNumberOfParts(VecTy) == VF)
15582 continue;
15583 for (unsigned I = NextInst; I < MaxInst; ++I) {
15584 unsigned ActualVF = std::min(MaxInst - I, VF);
15585
15586 if (!isPowerOf2_32(ActualVF))
15587 continue;
15588
15589 if (MaxVFOnly && ActualVF < MaxVF)
15590 break;
15591 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
15592 break;
15593
15594 ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
15595 // Check that a previous iteration of this loop did not delete the Value.
15596 if (llvm::any_of(Ops, [&R](Value *V) {
15597 auto *I = dyn_cast<Instruction>(V);
15598 return I && R.isDeleted(I);
15599 }))
15600 continue;
15601
15602 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
15603 << "\n");
15604
15605 R.buildTree(Ops);
15606 if (R.isTreeTinyAndNotFullyVectorizable())
15607 continue;
15608 R.reorderTopToBottom();
15609 R.reorderBottomToTop(
15610 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
15611 !R.doesRootHaveInTreeUses());
15612 R.buildExternalUses();
15613
15614 R.computeMinimumValueSizes();
15615 R.transformNodes();
15616 InstructionCost Cost = R.getTreeCost();
15617 CandidateFound = true;
15618 MinCost = std::min(MinCost, Cost);
15619
15620 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
15621 << " for VF=" << ActualVF << "\n");
15622 if (Cost < -SLPCostThreshold) {
15623 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
15624 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
15625 cast<Instruction>(Ops[0]))
15626 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
15627 << " and with tree size "
15628 << ore::NV("TreeSize", R.getTreeSize()));
15629
15630 R.vectorizeTree();
15631 // Move to the next bundle.
15632 I += VF - 1;
15633 NextInst = I + 1;
15634 Changed = true;
15635 }
15636 }
15637 }
15638
15639 if (!Changed && CandidateFound) {
15640 R.getORE()->emit([&]() {
15641 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
15642 << "List vectorization was possible but not beneficial with cost "
15643 << ore::NV("Cost", MinCost) << " >= "
15644 << ore::NV("Treshold", -SLPCostThreshold);
15645 });
15646 } else if (!Changed) {
15647 R.getORE()->emit([&]() {
15648 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
15649 << "Cannot SLP vectorize list: vectorization was impossible"
15650 << " with available vectorization factors";
15651 });
15652 }
15653 return Changed;
15654}
15655
15656bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
15657 if (!I)
15658 return false;
15659
15660 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
15661 return false;
15662
15663 Value *P = I->getParent();
15664
15665 // Vectorize in current basic block only.
15666 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
15667 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
15668 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
15669 return false;
15670
15671 // First collect all possible candidates
15672 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
15673 Candidates.emplace_back(Op0, Op1);
15674
15675 auto *A = dyn_cast<BinaryOperator>(Op0);
15676 auto *B = dyn_cast<BinaryOperator>(Op1);
15677 // Try to skip B.
15678 if (A && B && B->hasOneUse()) {
15679 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
15680 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
15681 if (B0 && B0->getParent() == P)
15682 Candidates.emplace_back(A, B0);
15683 if (B1 && B1->getParent() == P)
15684 Candidates.emplace_back(A, B1);
15685 }
15686 // Try to skip A.
15687 if (B && A && A->hasOneUse()) {
15688 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
15689 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
15690 if (A0 && A0->getParent() == P)
15691 Candidates.emplace_back(A0, B);
15692 if (A1 && A1->getParent() == P)
15693 Candidates.emplace_back(A1, B);
15694 }
15695
15696 if (Candidates.size() == 1)
15697 return tryToVectorizeList({Op0, Op1}, R);
15698
15699 // We have multiple options. Try to pick the single best.
15700 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
15701 if (!BestCandidate)
15702 return false;
15703 return tryToVectorizeList(
15704 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
15705}
15706
15707namespace {
15708
15709/// Model horizontal reductions.
15710///
15711/// A horizontal reduction is a tree of reduction instructions that has values
15712/// that can be put into a vector as its leaves. For example:
15713///
15714/// mul mul mul mul
15715/// \ / \ /
15716/// + +
15717/// \ /
15718/// +
15719/// This tree has "mul" as its leaf values and "+" as its reduction
15720/// instructions. A reduction can feed into a store or a binary operation
15721/// feeding a phi.
15722/// ...
15723/// \ /
15724/// +
15725/// |
15726/// phi +=
15727///
15728/// Or:
15729/// ...
15730/// \ /
15731/// +
15732/// |
15733/// *p =
15734///
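/// As an illustrative IR sketch of the first form, a 4-wide integer add
/// reduction fed by multiplies might look like:
///   %m0 = mul i32 %a0, %b0
///   %m1 = mul i32 %a1, %b1
///   %m2 = mul i32 %a2, %b2
///   %m3 = mul i32 %a3, %b3
///   %r0 = add i32 %m0, %m1
///   %r1 = add i32 %r0, %m2
///   %r2 = add i32 %r1, %m3
/// where the multiplies are the leaves that can be gathered into a vector and
/// the adds are the reduction operations.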
15735class HorizontalReduction {
15736 using ReductionOpsType = SmallVector<Value *, 16>;
15737 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
15738 ReductionOpsListType ReductionOps;
15739 /// List of possibly reduced values.
15740 SmallVector<SmallVector<Value *>> ReducedVals;
15741 /// Maps reduced value to the corresponding reduction operation.
15742 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
15743 // Use map vector to make stable output.
15744 MapVector<Instruction *, Value *> ExtraArgs;
15745 WeakTrackingVH ReductionRoot;
15746 /// The type of reduction operation.
15747 RecurKind RdxKind;
15748 /// Checks if the optimization of original scalar identity operations on
15749 /// matched horizontal reductions is enabled and allowed.
15750 bool IsSupportedHorRdxIdentityOp = false;
15751
15752 static bool isCmpSelMinMax(Instruction *I) {
15753 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
15754 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
15755 }
15756
15757 // And/or are potentially poison-safe logical patterns like:
15758 // select x, y, false
15759 // select x, true, y
15760 static bool isBoolLogicOp(Instruction *I) {
15761 return isa<SelectInst>(I) &&
15762 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
15763 }
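// For illustration, the poison-safe forms in IR are
//   %and = select i1 %x, i1 %y, i1 false   ; logical and
//   %or  = select i1 %x, i1 true, i1 %y    ; logical or
// which, unlike plain 'and'/'or' instructions, do not propagate poison from %y
// when %x already determines the result.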
15764
15765 /// Checks if instruction is associative and can be vectorized.
15766 static bool isVectorizable(RecurKind Kind, Instruction *I) {
15767 if (Kind == RecurKind::None)
15768 return false;
15769
15770 // Integer ops that map to select instructions or intrinsics are fine.
15771 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
15772 isBoolLogicOp(I))
15773 return true;
15774
15775 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
15776 // FP min/max are associative except for NaN and -0.0. We do not
15777 // have to rule out -0.0 here because the intrinsic semantics do not
15778 // specify a fixed result for it.
15779 return I->getFastMathFlags().noNaNs();
15780 }
15781
15782 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
15783 return true;
15784
15785 return I->isAssociative();
15786 }
15787
15788 static Value *getRdxOperand(Instruction *I, unsigned Index) {
15789 // Poison-safe 'or' takes the form: select X, true, Y
15790 // To make that work with the normal operand processing, we skip the
15791 // true value operand.
15792 // TODO: Change the code and data structures to handle this without a hack.
15793 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
15794 return I->getOperand(2);
15795 return I->getOperand(Index);
15796 }
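// For example, given the poison-safe 'or' form %or = select i1 %x, i1 true, i1 %y,
// getRdxOperand(%or, 0) returns %x and getRdxOperand(%or, 1) returns %y
// (operand 2 of the select), skipping the constant 'true' operand.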
15797
15798 /// Creates reduction operation with the current opcode.
15799 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
15800 Value *RHS, const Twine &Name, bool UseSelect) {
15801 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
15802 switch (Kind) {
15803 case RecurKind::Or:
15804 if (UseSelect &&
15805 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
15806 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
15807 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
15808 Name);
15809 case RecurKind::And:
15810 if (UseSelect &&
15811 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
15812 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
15813 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
15814 Name);
15815 case RecurKind::Add:
15816 case RecurKind::Mul:
15817 case RecurKind::Xor:
15818 case RecurKind::FAdd:
15819 case RecurKind::FMul:
15820 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
15821 Name);
15822 case RecurKind::FMax:
15823 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
15824 case RecurKind::FMin:
15825 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
15826 case RecurKind::FMaximum:
15827 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
15828 case RecurKind::FMinimum:
15829 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
15830 case RecurKind::SMax:
15831 if (UseSelect) {
15832 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
15833 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
15834 }
15835 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
15836 case RecurKind::SMin:
15837 if (UseSelect) {
15838 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
15839 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
15840 }
15841 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
15842 case RecurKind::UMax:
15843 if (UseSelect) {
15844 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
15845 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
15846 }
15847 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
15848 case RecurKind::UMin:
15849 if (UseSelect) {
15850 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
15851 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
15852 }
15853 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
15854 default:
15855 llvm_unreachable("Unknown reduction operation.");
15856 }
15857 }
15858
15859 /// Creates reduction operation with the current opcode with the IR flags
15860 /// from \p ReductionOps, dropping nuw/nsw flags.
15861 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
15862 Value *RHS, const Twine &Name,
15863 const ReductionOpsListType &ReductionOps) {
15864 bool UseSelect = ReductionOps.size() == 2 ||
15865 // Logical or/and.
15866 (ReductionOps.size() == 1 &&
15867 any_of(ReductionOps.front(), IsaPred<SelectInst>));
15868 assert((!UseSelect || ReductionOps.size() != 2 ||
15869 isa<SelectInst>(ReductionOps[1][0])) &&
15870 "Expected cmp + select pairs for reduction");
15871 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
15872 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
15873 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
15874 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
15875 /*IncludeWrapFlags=*/false);
15876 propagateIRFlags(Op, ReductionOps[1], nullptr,
15877 /*IncludeWrapFlags=*/false);
15878 return Op;
15879 }
15880 }
15881 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
15882 return Op;
15883 }
15884
15885public:
15886 static RecurKind getRdxKind(Value *V) {
15887 auto *I = dyn_cast<Instruction>(V);
15888 if (!I)
15889 return RecurKind::None;
15890 if (match(I, m_Add(m_Value(), m_Value())))
15891 return RecurKind::Add;
15892 if (match(I, m_Mul(m_Value(), m_Value())))
15893 return RecurKind::Mul;
15894 if (match(I, m_And(m_Value(), m_Value())) ||
15895 match(I, m_LogicalAnd(m_Value(), m_Value())))
15896 return RecurKind::And;
15897 if (match(I, m_Or(m_Value(), m_Value())) ||
15898 match(I, m_LogicalOr(m_Value(), m_Value())))
15899 return RecurKind::Or;
15900 if (match(I, m_Xor(m_Value(), m_Value())))
15901 return RecurKind::Xor;
15902 if (match(I, m_FAdd(m_Value(), m_Value())))
15903 return RecurKind::FAdd;
15904 if (match(I, m_FMul(m_Value(), m_Value())))
15905 return RecurKind::FMul;
15906
15907 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
15908 return RecurKind::FMax;
15909 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
15910 return RecurKind::FMin;
15911
15912 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
15913 return RecurKind::FMaximum;
15914 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
15915 return RecurKind::FMinimum;
15916 // This matches either cmp+select or intrinsics. SLP is expected to handle
15917 // either form.
15918 // TODO: If we are canonicalizing to intrinsics, we can remove several
15919 // special-case paths that deal with selects.
15920 if (match(I, m_SMax(m_Value(), m_Value())))
15921 return RecurKind::SMax;
15922 if (match(I, m_SMin(m_Value(), m_Value())))
15923 return RecurKind::SMin;
15924 if (match(I, m_UMax(m_Value(), m_Value())))
15925 return RecurKind::UMax;
15926 if (match(I, m_UMin(m_Value(), m_Value())))
15927 return RecurKind::UMin;
15928
15929 if (auto *Select = dyn_cast<SelectInst>(I)) {
15930 // Try harder: look for min/max pattern based on instructions producing
15931 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
15932 // During the intermediate stages of SLP, it's very common to have
15933 // pattern like this (since optimizeGatherSequence is run only once
15934 // at the end):
15935 // %1 = extractelement <2 x i32> %a, i32 0
15936 // %2 = extractelement <2 x i32> %a, i32 1
15937 // %cond = icmp sgt i32 %1, %2
15938 // %3 = extractelement <2 x i32> %a, i32 0
15939 // %4 = extractelement <2 x i32> %a, i32 1
15940 // %select = select i1 %cond, i32 %3, i32 %4
15941 CmpInst::Predicate Pred;
15942 Instruction *L1;
15943 Instruction *L2;
15944
15945 Value *LHS = Select->getTrueValue();
15946 Value *RHS = Select->getFalseValue();
15947 Value *Cond = Select->getCondition();
15948
15949 // TODO: Support inverse predicates.
15950 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
15951 if (!isa<ExtractElementInst>(RHS) ||
15952 !L2->isIdenticalTo(cast<Instruction>(RHS)))
15953 return RecurKind::None;
15954 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
15955 if (!isa<ExtractElementInst>(LHS) ||
15956 !L1->isIdenticalTo(cast<Instruction>(LHS)))
15957 return RecurKind::None;
15958 } else {
15959 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
15960 return RecurKind::None;
15961 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
15962 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
15963 !L2->isIdenticalTo(cast<Instruction>(RHS)))
15964 return RecurKind::None;
15965 }
15966
15967 switch (Pred) {
15968 default:
15969 return RecurKind::None;
15970 case CmpInst::ICMP_SGT:
15971 case CmpInst::ICMP_SGE:
15972 return RecurKind::SMax;
15973 case CmpInst::ICMP_SLT:
15974 case CmpInst::ICMP_SLE:
15975 return RecurKind::SMin;
15976 case CmpInst::ICMP_UGT:
15977 case CmpInst::ICMP_UGE:
15978 return RecurKind::UMax;
15979 case CmpInst::ICMP_ULT:
15980 case CmpInst::ICMP_ULE:
15981 return RecurKind::UMin;
15982 }
15983 }
15984 return RecurKind::None;
15985 }
15986
15987 /// Get the index of the first operand.
15988 static unsigned getFirstOperandIndex(Instruction *I) {
15989 return isCmpSelMinMax(I) ? 1 : 0;
15990 }
15991
15992private:
15993 /// Total number of operands in the reduction operation.
15994 static unsigned getNumberOfOperands(Instruction *I) {
15995 return isCmpSelMinMax(I) ? 3 : 2;
15996 }
15997
15998 /// Checks if the instruction is in basic block \p BB.
15999 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16000 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16001 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16002 auto *Sel = cast<SelectInst>(I);
16003 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
16004 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16005 }
16006 return I->getParent() == BB;
16007 }
16008
16009 /// Expected number of uses for reduction operations/reduced values.
16010 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16011 if (IsCmpSelMinMax) {
16012 // SelectInst must be used twice while the condition op must have single
16013 // use only.
16014 if (auto *Sel = dyn_cast<SelectInst>(I))
16015 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
16016 return I->hasNUses(2);
16017 }
16018
16019 // Arithmetic reduction operation must be used once only.
16020 return I->hasOneUse();
16021 }
16022
16023 /// Initializes the list of reduction operations.
16024 void initReductionOps(Instruction *I) {
16025 if (isCmpSelMinMax(I))
16026 ReductionOps.assign(2, ReductionOpsType());
16027 else
16028 ReductionOps.assign(1, ReductionOpsType());
16029 }
16030
16031 /// Add all reduction operations for the reduction instruction \p I.
16032 void addReductionOps(Instruction *I) {
16033 if (isCmpSelMinMax(I)) {
16034 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
16035 ReductionOps[1].emplace_back(I);
16036 } else {
16037 ReductionOps[0].emplace_back(I);
16038 }
16039 }
16040
16041 static bool isGoodForReduction(ArrayRef<Value *> Data) {
16042 int Sz = Data.size();
16043 auto *I = dyn_cast<Instruction>(Data.front());
16044 return Sz > 1 || isConstant(Data.front()) ||
16045 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
16046 }
16047
16048public:
16049 HorizontalReduction() = default;
16050
16051 /// Try to find a reduction tree.
16052 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16053 ScalarEvolution &SE, const DataLayout &DL,
16054 const TargetLibraryInfo &TLI) {
16055 RdxKind = HorizontalReduction::getRdxKind(Root);
16056 if (!isVectorizable(RdxKind, Root))
16057 return false;
16058
16059 // Analyze "regular" integer/FP types for reductions - no target-specific
16060 // types or pointers.
16061 Type *Ty = Root->getType();
16062 if (!isValidElementType(Ty) || Ty->isPointerTy())
16063 return false;
16064
16065 // Though the ultimate reduction may have multiple uses, its condition must
16066 // have only a single use.
16067 if (auto *Sel = dyn_cast<SelectInst>(Root))
16068 if (!Sel->getCondition()->hasOneUse())
16069 return false;
16070
16071 ReductionRoot = Root;
16072
16073 // Iterate through all the operands of the possible reduction tree and
16074 // gather all the reduced values, sorting them by their value id.
16075 BasicBlock *BB = Root->getParent();
16076 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
16077 SmallVector<Instruction *> Worklist(1, Root);
16078 // Checks if the operands of the \p TreeN instruction are also reduction
16079 // operations or should be treated as reduced values or an extra argument,
16080 // which is not part of the reduction.
16081 auto CheckOperands = [&](Instruction *TreeN,
16082 SmallVectorImpl<Value *> &ExtraArgs,
16083 SmallVectorImpl<Value *> &PossibleReducedVals,
16084 SmallVectorImpl<Instruction *> &ReductionOps) {
16085 for (int I = getFirstOperandIndex(TreeN),
16086 End = getNumberOfOperands(TreeN);
16087 I < End; ++I) {
16088 Value *EdgeVal = getRdxOperand(TreeN, I);
16089 ReducedValsToOps[EdgeVal].push_back(TreeN);
16090 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
16091 // Edge has wrong parent - mark as an extra argument.
16092 if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
16093 !hasSameParent(EdgeInst, BB)) {
16094 ExtraArgs.push_back(EdgeVal);
16095 continue;
16096 }
16097 // If the edge is not an instruction, or it is different from the main
16098 // reduction opcode or has too many uses - possible reduced value.
16099 // Also, do not try to reduce const values, if the operation is not
16100 // foldable.
16101 if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
16102 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
16103 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
16104 !isVectorizable(RdxKind, EdgeInst) ||
16105 (R.isAnalyzedReductionRoot(EdgeInst) &&
16106 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
16107 PossibleReducedVals.push_back(EdgeVal);
16108 continue;
16109 }
16110 ReductionOps.push_back(EdgeInst);
16111 }
16112 };
16113 // Try to regroup reduced values so that it gets more profitable to try to
16114 // reduce them. Values are grouped by their value ids, instructions - by
16115 // instruction op id and/or alternate op id, plus do extra analysis for
16116 // loads (grouping them by the distance between pointers) and cmp
16117 // instructions (grouping them by the predicate).
16118 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
16119 PossibleReducedVals;
16120 initReductionOps(Root);
16121 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
16122 SmallSet<size_t, 2> LoadKeyUsed;
16123 SmallPtrSet<Value *, 4> DoNotReverseVals;
16124
16125 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
16126 Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
16127 if (LoadKeyUsed.contains(Key)) {
16128 auto LIt = LoadsMap.find(Ptr);
16129 if (LIt != LoadsMap.end()) {
16130 for (LoadInst *RLI : LIt->second) {
16131 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
16132 LI->getType(), LI->getPointerOperand(), DL, SE,
16133 /*StrictCheck=*/true))
16134 return hash_value(RLI->getPointerOperand());
16135 }
16136 for (LoadInst *RLI : LIt->second) {
16137 if (arePointersCompatible(RLI->getPointerOperand(),
16138 LI->getPointerOperand(), TLI)) {
16139 hash_code SubKey = hash_value(RLI->getPointerOperand());
16140 DoNotReverseVals.insert(RLI);
16141 return SubKey;
16142 }
16143 }
16144 if (LIt->second.size() > 2) {
16145 hash_code SubKey =
16146 hash_value(LIt->second.back()->getPointerOperand());
16147 DoNotReverseVals.insert(LIt->second.back());
16148 return SubKey;
16149 }
16150 }
16151 }
16152 LoadKeyUsed.insert(Key);
16153 LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
16154 return hash_value(LI->getPointerOperand());
16155 };
16156
16157 while (!Worklist.empty()) {
16158 Instruction *TreeN = Worklist.pop_back_val();
16159 SmallVector<Value *> Args;
16160 SmallVector<Value *> PossibleRedVals;
16161 SmallVector<Instruction *> PossibleReductionOps;
16162 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16163 // If too many extra args - mark the instruction itself as a reduction
16164 // value, not a reduction operation.
16165 if (Args.size() < 2) {
16166 addReductionOps(TreeN);
16167 // Add extra args.
16168 if (!Args.empty()) {
16169 assert(Args.size() == 1 && "Expected only single argument.");
16170 ExtraArgs[TreeN] = Args.front();
16171 }
16172 // Add reduction values. The values are sorted for better vectorization
16173 // results.
16174 for (Value *V : PossibleRedVals) {
16175 size_t Key, Idx;
16176 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
16177 /*AllowAlternate=*/false);
16178 ++PossibleReducedVals[Key][Idx]
16179 .insert(std::make_pair(V, 0))
16180 .first->second;
16181 }
16182 Worklist.append(PossibleReductionOps.rbegin(),
16183 PossibleReductionOps.rend());
16184 } else {
16185 size_t Key, Idx;
16186 std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
16187 /*AllowAlternate=*/false);
16188 ++PossibleReducedVals[Key][Idx]
16189 .insert(std::make_pair(TreeN, 0))
16190 .first->second;
16191 }
16192 }
16193 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
16194 // Sort values by the total number of value kinds to start the reduction
16195 // from the longest possible sequences of reduced values.
16196 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
16197 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
16198 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
16199 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
16200 It != E; ++It) {
16201 PossibleRedValsVect.emplace_back();
16202 auto RedValsVect = It->second.takeVector();
16203 stable_sort(RedValsVect, llvm::less_second());
16204 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
16205 PossibleRedValsVect.back().append(Data.second, Data.first);
16206 }
16207 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
16208 return P1.size() > P2.size();
16209 });
16210 int NewIdx = -1;
16211 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
16212 if (isGoodForReduction(Data) ||
16213 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
16214 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
16215 getUnderlyingObject(
16216 cast<LoadInst>(Data.front())->getPointerOperand()) ==
16217 getUnderlyingObject(cast<LoadInst>(ReducedVals[NewIdx].front())
16218 ->getPointerOperand()))) {
16219 if (NewIdx < 0) {
16220 NewIdx = ReducedVals.size();
16221 ReducedVals.emplace_back();
16222 }
16223 if (DoNotReverseVals.contains(Data.front()))
16224 ReducedVals[NewIdx].append(Data.begin(), Data.end());
16225 else
16226 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
16227 } else {
16228 ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
16229 }
16230 }
16231 }
16232 // Sort the reduced values by number of same/alternate opcode and/or pointer
16233 // operand.
16234 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
16235 return P1.size() > P2.size();
16236 });
16237 return true;
16238 }
16239
16240 /// Attempt to vectorize the tree found by matchAssociativeReduction.
16241 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
16242 const TargetLibraryInfo &TLI) {
16243 constexpr int ReductionLimit = 4;
16244 constexpr unsigned RegMaxNumber = 4;
16245 constexpr unsigned RedValsMaxNumber = 128;
16246 // If there are a sufficient number of reduction values, reduce
16247 // to a nearby power-of-2. We can safely generate oversized
16248 // vectors and rely on the backend to split them to legal sizes.
16249 unsigned NumReducedVals =
16250 std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
16251 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
16252 if (!isGoodForReduction(Vals))
16253 return Num;
16254 return Num + Vals.size();
16255 });
16256 if (NumReducedVals < ReductionLimit &&
16257 (!AllowHorRdxIdenityOptimization ||
16258 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
16259 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
16260 }))) {
16261 for (ReductionOpsType &RdxOps : ReductionOps)
16262 for (Value *RdxOp : RdxOps)
16263 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16264 return nullptr;
16265 }
16266
16267 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
16268 TargetFolder(DL));
16269 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
16270
16271 // Track the reduced values in case they are replaced by extractelement
16272 // because of the vectorization.
16273 DenseMap<Value *, Value *> TrackedVals(
16274 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
16275 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
16276 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
16277 ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
16278 // The same extra argument may be used several times, so log each attempt
16279 // to use it.
16280 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16281 assert(Pair.first && "DebugLoc must be set.");
16282 ExternallyUsedValues[Pair.second].push_back(Pair.first);
16283 TrackedVals.try_emplace(Pair.second, Pair.second);
16284 }
16285
16286 // The compare instruction of a min/max is the insertion point for new
16287 // instructions and may be replaced with a new compare instruction.
16288 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
16289 assert(isa<SelectInst>(RdxRootInst) &&
16290 "Expected min/max reduction to have select root instruction");
16291 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
16292 assert(isa<Instruction>(ScalarCond) &&
16293 "Expected min/max reduction to have compare condition");
16294 return cast<Instruction>(ScalarCond);
16295 };
16296
16297 // Return new VectorizedTree, based on previous value.
16298 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
16299 if (VectorizedTree) {
16300 // Update the final value in the reduction.
16301 Builder.SetCurrentDebugLocation(
16302 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
16303 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
16304 (isGuaranteedNotToBePoison(Res) &&
16305 !isGuaranteedNotToBePoison(VectorizedTree))) {
16306 auto It = ReducedValsToOps.find(Res);
16307 if (It != ReducedValsToOps.end() &&
16308 any_of(It->getSecond(),
16309 [](Instruction *I) { return isBoolLogicOp(I); }))
16310 std::swap(VectorizedTree, Res);
16311 }
16312
16313 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
16314 ReductionOps);
16315 }
16316 // Initialize the final value in the reduction.
16317 return Res;
16318 };
16319 bool AnyBoolLogicOp =
16320 any_of(ReductionOps.back(), [](Value *V) {
16321 return isBoolLogicOp(cast<Instruction>(V));
16322 });
16323 // The reduction root is used as the insertion point for new instructions,
16324 // so set it as externally used to prevent it from being deleted.
16325 ExternallyUsedValues[ReductionRoot];
16326 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
16327 ReductionOps.front().size());
16328 for (ReductionOpsType &RdxOps : ReductionOps)
16329 for (Value *RdxOp : RdxOps) {
16330 if (!RdxOp)
16331 continue;
16332 IgnoreList.insert(RdxOp);
16333 }
16334 // Intersect the fast-math-flags from all reduction operations.
16335 FastMathFlags RdxFMF;
16336 RdxFMF.set();
16337 for (Value *U : IgnoreList)
16338 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
16339 RdxFMF &= FPMO->getFastMathFlags();
16340 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
16341
16342 // Need to track reduced vals, they may be changed during vectorization of
16343 // subvectors.
16344 for (ArrayRef<Value *> Candidates : ReducedVals)
16345 for (Value *V : Candidates)
16346 TrackedVals.try_emplace(V, V);
16347
16348 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
16349 // List of the values that were reduced in other trees as part of gather
16350 // nodes and thus requiring extract if fully vectorized in other trees.
16351 SmallPtrSet<Value *, 4> RequiredExtract;
16352 Value *VectorizedTree = nullptr;
16353 bool CheckForReusedReductionOps = false;
16354 // Try to vectorize elements based on their type.
16355 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
16356 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
16357 InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
16358 SmallVector<Value *> Candidates;
16359 Candidates.reserve(2 * OrigReducedVals.size());
16360 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
16361 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
16362 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
16363 // Check if the reduction value was not overridden by the extractelement
16364 // instruction because of the vectorization and exclude it, if it is not
16365 // compatible with other values.
16366 // Also check if the instruction was folded to constant/other value.
16367 auto *Inst = dyn_cast<Instruction>(RdxVal);
16368 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
16369 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
16370 (S.getOpcode() && !Inst))
16371 continue;
16372 Candidates.push_back(RdxVal);
16373 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
16374 }
16375 bool ShuffledExtracts = false;
16376 // Try to handle shuffled extractelements.
16377 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
16378 I + 1 < E) {
16379 InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
16380 if (NextS.getOpcode() == Instruction::ExtractElement &&
16381 !NextS.isAltShuffle()) {
16382 SmallVector<Value *> CommonCandidates(Candidates);
16383 for (Value *RV : ReducedVals[I + 1]) {
16384 Value *RdxVal = TrackedVals.find(RV)->second;
16385 // Check if the reduction value was not overridden by the
16386 // extractelement instruction because of the vectorization and
16387 // exclude it, if it is not compatible with other values.
16388 if (auto *Inst = dyn_cast<Instruction>(RdxVal))
16389 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
16390 continue;
16391 CommonCandidates.push_back(RdxVal);
16392 TrackedToOrig.try_emplace(RdxVal, RV);
16393 }
16394 SmallVector<int> Mask;
16395 if (isFixedVectorShuffle(CommonCandidates, Mask)) {
16396 ++I;
16397 Candidates.swap(CommonCandidates);
16398 ShuffledExtracts = true;
16399 }
16400 }
16401 }
16402
16403 // Emit code for constant values.
16404 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
16405 allConstant(Candidates)) {
16406 Value *Res = Candidates.front();
16407 ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
16408 for (Value *VC : ArrayRef(Candidates).drop_front()) {
16409 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
16410 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
16411 if (auto *ResI = dyn_cast<Instruction>(Res))
16412 V.analyzedReductionRoot(ResI);
16413 }
16414 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
16415 continue;
16416 }
16417
16418 unsigned NumReducedVals = Candidates.size();
16419 if (NumReducedVals < ReductionLimit &&
16420 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
16421 !isSplat(Candidates)))
16422 continue;
16423
16424 // Check if we support repeated scalar values processing (optimization of
16425 // original scalar identity operations on matched horizontal reductions).
16426 IsSupportedHorRdxIdentityOp =
16427 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
16428 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
16429 // Gather same values.
16430 MapVector<Value *, unsigned> SameValuesCounter;
16431 if (IsSupportedHorRdxIdentityOp)
16432 for (Value *V : Candidates)
16433 ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
16434 // Used to check if the reduced values are used the same number of times. In
16435 // this case the compiler may produce better code. E.g. if the reduced values
16436 // are aabbccdd (8 x values), then the first node of the tree will have a node
16437 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
16438 // Plus, the final reduction will be performed on <8 x aabbccdd>.
16439 // Instead, the compiler may build a <4 x abcd> tree immediately and compute
16440 // reduction(4 x abcd) * 2.
16441 // Currently it only handles add/fadd/xor; and/or/min/max do not require
16442 // this analysis, and other operations may require an extra estimation of
16443 // the profitability.
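// E.g., for aabbccdd (see above) every value occurs exactly twice, so the
// whole reduction can be computed as (a + b + c + d) * 2 once the <4 x abcd>
// tree has been reduced.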
16444 bool SameScaleFactor = false;
16445 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
16446 SameValuesCounter.size() != Candidates.size();
16447 if (OptReusedScalars) {
16448 SameScaleFactor =
16449 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
16450 RdxKind == RecurKind::Xor) &&
16451 all_of(drop_begin(SameValuesCounter),
16452 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
16453 return P.second == SameValuesCounter.front().second;
16454 });
16455 Candidates.resize(SameValuesCounter.size());
16456 transform(SameValuesCounter, Candidates.begin(),
16457 [](const auto &P) { return P.first; });
16458 NumReducedVals = Candidates.size();
16459 // Have a reduction of the same element.
16460 if (NumReducedVals == 1) {
16461 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
16462 unsigned Cnt = SameValuesCounter.lookup(OrigV);
16463 Value *RedVal =
16464 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
16465 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16466 VectorizedVals.try_emplace(OrigV, Cnt);
16467 continue;
16468 }
16469 }
16470
16471 unsigned MaxVecRegSize = V.getMaxVecRegSize();
16472 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
16473 unsigned MaxElts =
16474 RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
16475
16476 unsigned ReduxWidth = std::min<unsigned>(
16477 llvm::bit_floor(NumReducedVals),
16478 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
16479 RegMaxNumber * RedValsMaxNumber));
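// The first attempt uses the widest power-of-two window that both the number
// of candidates and the register budget allow; e.g. 13 reduced values start
// with an 8-wide attempt (bit_floor(13) == 8), assuming MaxElts permits it.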
16480 unsigned Start = 0;
16481 unsigned Pos = Start;
16482 // Restarts vectorization attempt with lower vector factor.
16483 unsigned PrevReduxWidth = ReduxWidth;
16484 bool CheckForReusedReductionOpsLocal = false;
16485 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
16486 &CheckForReusedReductionOpsLocal,
16487 &PrevReduxWidth, &V,
16488 &IgnoreList](bool IgnoreVL = false) {
16489 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
16490 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
16491 // Check if any of the reduction ops are gathered. If so, it is worth
16492 // trying again with a smaller number of reduction ops.
16493 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
16494 }
16495 ++Pos;
16496 if (Pos < NumReducedVals - ReduxWidth + 1)
16497 return IsAnyRedOpGathered;
16498 Pos = Start;
16499 ReduxWidth /= 2;
16500 return IsAnyRedOpGathered;
16501 };
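// E.g., with NumReducedVals == 8 and ReduxWidth == 4, failed attempts slide
// the window over start positions 0..4; once those are exhausted, ReduxWidth
// is halved and the scan restarts at Start.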
16502 bool AnyVectorized = false;
16503 while (Pos < NumReducedVals - ReduxWidth + 1 &&
16504 ReduxWidth >= ReductionLimit) {
16505 // Dependency in tree of the reduction ops - drop this attempt, try
16506 // later.
16507 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
16508 Start == 0) {
16509 CheckForReusedReductionOps = true;
16510 break;
16511 }
16512 PrevReduxWidth = ReduxWidth;
16513 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
16514 // Being analyzed already - skip.
16515 if (V.areAnalyzedReductionVals(VL)) {
16516 (void)AdjustReducedVals(/*IgnoreVL=*/true);
16517 continue;
16518 }
16519 // Early exit if any of the reduction values were deleted during
16520 // previous vectorization attempts.
16521 if (any_of(VL, [&V](Value *RedVal) {
16522 auto *RedValI = dyn_cast<Instruction>(RedVal);
16523 if (!RedValI)
16524 return false;
16525 return V.isDeleted(RedValI);
16526 }))
16527 break;
16528 V.buildTree(VL, IgnoreList);
16529 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
16530 if (!AdjustReducedVals())
16531 V.analyzedReductionVals(VL);
16532 continue;
16533 }
16534 if (V.isLoadCombineReductionCandidate(RdxKind)) {
16535 if (!AdjustReducedVals())
16536 V.analyzedReductionVals(VL);
16537 continue;
16538 }
16539 V.reorderTopToBottom();
16540 // No need to reorder the root node at all.
16541 V.reorderBottomToTop(/*IgnoreReorder=*/true);
16542 // Keep the other extracted reduction values, if they are used in the
16543 // vectorization trees.
16544 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
16545 ExternallyUsedValues);
16546 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
16547 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
16548 continue;
16549 for (Value *V : ReducedVals[Cnt])
16550 if (isa<Instruction>(V))
16551 LocalExternallyUsedValues[TrackedVals[V]];
16552 }
16553 if (!IsSupportedHorRdxIdentityOp) {
16554 // Number of uses of the candidates in the vector of values.
16555 assert(SameValuesCounter.empty() &&
16556 "Reused values counter map is not empty");
16557 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16558 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16559 continue;
16560 Value *V = Candidates[Cnt];
16561 Value *OrigV = TrackedToOrig.find(V)->second;
16562 ++SameValuesCounter[OrigV];
16563 }
16564 }
16565 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
16566 // Gather externally used values.
16567 SmallPtrSet<Value *, 4> Visited;
16568 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16569 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16570 continue;
16571 Value *RdxVal = Candidates[Cnt];
16572 if (!Visited.insert(RdxVal).second)
16573 continue;
16574 // Check if the scalar was vectorized as part of the vectorization
16575 // tree but not the top node.
16576 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
16577 LocalExternallyUsedValues[RdxVal];
16578 continue;
16579 }
16580 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
16581 unsigned NumOps =
16582 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
16583 if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
16584 LocalExternallyUsedValues[RdxVal];
16585 }
16586 // Do not need the list of reused scalars in regular mode anymore.
16587 if (!IsSupportedHorRdxIdentityOp)
16588 SameValuesCounter.clear();
16589 for (Value *RdxVal : VL)
16590 if (RequiredExtract.contains(RdxVal))
16591 LocalExternallyUsedValues[RdxVal];
16592 // Update LocalExternallyUsedValues for the scalar, replaced by
16593 // extractelement instructions.
16594 DenseMap<Value *, Value *> ReplacementToExternal;
16595 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
16596 ReplacementToExternal.try_emplace(Pair.second, Pair.first);
16597 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
16598 Value *Ext = Pair.first;
16599 auto RIt = ReplacementToExternal.find(Ext);
16600 while (RIt != ReplacementToExternal.end()) {
16601 Ext = RIt->second;
16602 RIt = ReplacementToExternal.find(Ext);
16603 }
16604 auto *It = ExternallyUsedValues.find(Ext);
16605 if (It == ExternallyUsedValues.end())
16606 continue;
16607 LocalExternallyUsedValues[Pair.second].append(It->second);
16608 }
16609 V.buildExternalUses(LocalExternallyUsedValues);
16610
16611 V.computeMinimumValueSizes();
16612 V.transformNodes();
16613
16614 // Estimate cost.
16615 InstructionCost TreeCost = V.getTreeCost(VL);
16616 InstructionCost ReductionCost =
16617 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
16618 InstructionCost Cost = TreeCost + ReductionCost;
16619 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16620 << " for reduction\n");
16621 if (!Cost.isValid())
16622 break;
16623 if (Cost >= -SLPCostThreshold) {
16624 V.getORE()->emit([&]() {
16625 return OptimizationRemarkMissed(
16626 SV_NAME, "HorSLPNotBeneficial",
16627 ReducedValsToOps.find(VL[0])->second.front())
16628 << "Vectorizing horizontal reduction is possible "
16629 << "but not beneficial with cost " << ore::NV("Cost", Cost)
16630 << " and threshold "
16631 << ore::NV("Threshold", -SLPCostThreshold);
16632 });
16633 if (!AdjustReducedVals())
16634 V.analyzedReductionVals(VL);
16635 continue;
16636 }
16637
16638 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
16639 << Cost << ". (HorRdx)\n");
16640 V.getORE()->emit([&]() {
16641 return OptimizationRemark(
16642 SV_NAME, "VectorizedHorizontalReduction",
16643 ReducedValsToOps.find(VL[0])->second.front())
16644 << "Vectorized horizontal reduction with cost "
16645 << ore::NV("Cost", Cost) << " and with tree size "
16646 << ore::NV("TreeSize", V.getTreeSize());
16647 });
16648
16649 Builder.setFastMathFlags(RdxFMF);
16650
16651 // Emit a reduction. If the root is a select (min/max idiom), the insert
16652 // point is the compare condition of that select.
16653 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
16654 Instruction *InsertPt = RdxRootInst;
16655 if (IsCmpSelMinMax)
16656 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
16657
16658 // Vectorize a tree.
16659 Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
16660 ReplacedExternals, InsertPt);
16661
16662 Builder.SetInsertPoint(InsertPt);
16663
16664 // To prevent poison from leaking across what used to be sequential,
16665 // safe, scalar boolean logic operations, the reduction operand must be
16666 // frozen.
16667 if ((isBoolLogicOp(RdxRootInst) ||
16668 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
16669 !isGuaranteedNotToBePoison(VectorizedRoot))
16670 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
16671
16672 // Emit code to correctly handle reused reduced values, if required.
16673 if (OptReusedScalars && !SameScaleFactor) {
16674 VectorizedRoot =
16675 emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
16676 SameValuesCounter, TrackedToOrig);
16677 }
16678
16679 Value *ReducedSubTree =
16680 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
16681 if (ReducedSubTree->getType() != VL.front()->getType()) {
16682 ReducedSubTree = Builder.CreateIntCast(
16683 ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
16684 KnownBits Known = computeKnownBits(
16685 R, cast<Instruction>(ReductionOps.front().front())
16686 ->getModule()
16687 ->getDataLayout());
16688 return !Known.isNonNegative();
16689 }));
16690 }
16691
16692 // Improved analysis for add/fadd/xor reductions with the same scale factor
16693 // for all operands of the reduction. We can emit scalar ops for them
16694 // instead.
16695 if (OptReusedScalars && SameScaleFactor)
16696 ReducedSubTree = emitScaleForReusedOps(
16697 ReducedSubTree, Builder, SameValuesCounter.front().second);
16698
16699 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
16700 // Count vectorized reduced values to exclude them from final reduction.
16701 for (Value *RdxVal : VL) {
16702 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
16703 if (IsSupportedHorRdxIdentityOp) {
16704 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
16705 continue;
16706 }
16707 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
16708 if (!V.isVectorized(RdxVal))
16709 RequiredExtract.insert(RdxVal);
16710 }
16711 Pos += ReduxWidth;
16712 Start = Pos;
16713 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
16714 AnyVectorized = true;
16715 }
16716 if (OptReusedScalars && !AnyVectorized) {
16717 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
16718 Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
16719 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16720 Value *OrigV = TrackedToOrig.find(P.first)->second;
16721 VectorizedVals.try_emplace(OrigV, P.second);
16722 }
16723 continue;
16724 }
16725 }
16726 if (VectorizedTree) {
16727 // Reorder operands of bool logical op in the natural order to avoid
16728 // possible problems with poison propagation. If it is not possible to reorder
16729 // (both operands are originally RHS), emit an extra freeze instruction
16730 // for the LHS operand.
16731 // I.e., if we have original code like this:
16732 // RedOp1 = select i1 ?, i1 LHS, i1 false
16733 // RedOp2 = select i1 RHS, i1 ?, i1 false
16734
16735 // Then, we swap LHS/RHS to create a new op that matches the poison
16736 // semantics of the original code.
16737
16738 // If we have original code like this and both values could be poison:
16739 // RedOp1 = select i1 ?, i1 LHS, i1 false
16740 // RedOp2 = select i1 ?, i1 RHS, i1 false
16741
16742 // Then, we must freeze LHS in the new op.
16743 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
16744 Instruction *RedOp1,
16745 Instruction *RedOp2,
16746 bool InitStep) {
16747 if (!AnyBoolLogicOp)
16748 return;
16749 if (isBoolLogicOp(RedOp1) &&
16750 ((!InitStep && LHS == VectorizedTree) ||
16751 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
16752 return;
16753 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
16754 getRdxOperand(RedOp2, 0) == RHS ||
16755 isGuaranteedNotToBePoison(RHS))) {
16756 std::swap(LHS, RHS);
16757 return;
16758 }
16759 if (LHS != VectorizedTree)
16760 LHS = Builder.CreateFreeze(LHS);
16761 };
16762 // Finish the reduction.
16763 // Need to add the extra arguments and the possibly not vectorized
16764 // reduction values.
16765 // Try to avoid dependencies between the scalar remainders after
16766 // reductions.
16767 auto FinalGen =
16768 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
16769 bool InitStep) {
16770 unsigned Sz = InstVals.size();
16771 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
16772 Sz % 2);
16773 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
16774 Instruction *RedOp = InstVals[I + 1].first;
16775 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
16776 Value *RdxVal1 = InstVals[I].second;
16777 Value *StableRdxVal1 = RdxVal1;
16778 auto It1 = TrackedVals.find(RdxVal1);
16779 if (It1 != TrackedVals.end())
16780 StableRdxVal1 = It1->second;
16781 Value *RdxVal2 = InstVals[I + 1].second;
16782 Value *StableRdxVal2 = RdxVal2;
16783 auto It2 = TrackedVals.find(RdxVal2);
16784 if (It2 != TrackedVals.end())
16785 StableRdxVal2 = It2->second;
16786 // To prevent poison from leaking across what used to be
16787 // sequential, safe, scalar boolean logic operations, the
16788 // reduction operand must be frozen.
16789 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
16790 RedOp, InitStep);
16791 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
16792 StableRdxVal2, "op.rdx", ReductionOps);
16793 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
16794 }
16795 if (Sz % 2 == 1)
16796 ExtraReds[Sz / 2] = InstVals.back();
16797 return ExtraReds;
16798 };
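// E.g., scalar remainders r0..r4 are combined pairwise into (r0 op r1),
// (r2 op r3), r4 on the first pass, and the loop further below repeats this
// until a single value remains.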
16799 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
16800 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
16801 VectorizedTree);
16802 SmallPtrSet<Value *, 8> Visited;
16803 for (ArrayRef<Value *> Candidates : ReducedVals) {
16804 for (Value *RdxVal : Candidates) {
16805 if (!Visited.insert(RdxVal).second)
16806 continue;
16807 unsigned NumOps = VectorizedVals.lookup(RdxVal);
16808 for (Instruction *RedOp :
16809 ArrayRef(ReducedValsToOps.find(RdxVal)->second)
16810 .drop_back(NumOps))
16811 ExtraReductions.emplace_back(RedOp, RdxVal);
16812 }
16813 }
16814 for (auto &Pair : ExternallyUsedValues) {
16815 // Add each externally used value to the final reduction.
16816 for (auto *I : Pair.second)
16817 ExtraReductions.emplace_back(I, Pair.first);
16818 }
16819 // Iterate through all not-vectorized reduction values/extra arguments.
16820 bool InitStep = true;
16821 while (ExtraReductions.size() > 1) {
16822 VectorizedTree = ExtraReductions.front().second;
16823 SmallVector<std::pair<Instruction *, Value *>> NewReds =
16824 FinalGen(ExtraReductions, InitStep);
16825 ExtraReductions.swap(NewReds);
16826 InitStep = false;
16827 }
16828 VectorizedTree = ExtraReductions.front().second;
16829
16830 ReductionRoot->replaceAllUsesWith(VectorizedTree);
16831
16832 // The original scalar reduction is expected to have no remaining
16833 // uses outside the reduction tree itself. Assert that we got this
16834 // correct, replace internal uses with undef, and mark for eventual
16835 // deletion.
16836#ifndef NDEBUG
16837 SmallSet<Value *, 4> IgnoreSet;
16838 for (ArrayRef<Value *> RdxOps : ReductionOps)
16839 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
16840#endif
16841 for (ArrayRef<Value *> RdxOps : ReductionOps) {
16842 for (Value *Ignore : RdxOps) {
16843 if (!Ignore)
16844 continue;
16845#ifndef NDEBUG
16846 for (auto *U : Ignore->users()) {
16847 assert(IgnoreSet.count(U) &&
16848 "All users must be either in the reduction ops list.");
16849 }
16850#endif
16851 if (!Ignore->use_empty()) {
16852 Value *Undef = UndefValue::get(Ignore->getType());
16853 Ignore->replaceAllUsesWith(Undef);
16854 }
16855 V.eraseInstruction(cast<Instruction>(Ignore));
16856 }
16857 }
16858 } else if (!CheckForReusedReductionOps) {
16859 for (ReductionOpsType &RdxOps : ReductionOps)
16860 for (Value *RdxOp : RdxOps)
16861 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16862 }
16863 return VectorizedTree;
16864 }
16865
16866private:
16867 /// Calculate the cost of a reduction.
16868 InstructionCost getReductionCost(TargetTransformInfo *TTI,
16869 ArrayRef<Value *> ReducedVals,
16870 bool IsCmpSelMinMax, unsigned ReduxWidth,
16871 FastMathFlags FMF) {
16872 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
16873 Type *ScalarTy = ReducedVals.front()->getType();
16874 FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
16875 InstructionCost VectorCost = 0, ScalarCost;
16876 // If all of the reduced values are constant, the vector cost is 0, since
16877 // the reduction value can be calculated at compile time.
16878 bool AllConsts = allConstant(ReducedVals);
16879 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
16880 InstructionCost Cost = 0;
16881 // Scalar cost is repeated for N-1 elements.
16882 int Cnt = ReducedVals.size();
16883 for (Value *RdxVal : ReducedVals) {
16884 if (Cnt == 1)
16885 break;
16886 --Cnt;
16887 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
16888 Cost += GenCostFn();
16889 continue;
16890 }
16891 InstructionCost ScalarCost = 0;
16892 for (User *U : RdxVal->users()) {
16893 auto *RdxOp = cast<Instruction>(U);
16894 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
16895 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
16896 continue;
16897 }
16898 ScalarCost = InstructionCost::getInvalid();
16899 break;
16900 }
16901 if (ScalarCost.isValid())
16902 Cost += ScalarCost;
16903 else
16904 Cost += GenCostFn();
16905 }
16906 return Cost;
16907 };
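// Each case below computes VectorCost for the whole vector reduction and
// ScalarCost for the scalar ops it would replace; the difference returned at
// the end is negative when vectorization is expected to pay off.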
16908 switch (RdxKind) {
16909 case RecurKind::Add:
16910 case RecurKind::Mul:
16911 case RecurKind::Or:
16912 case RecurKind::And:
16913 case RecurKind::Xor:
16914 case RecurKind::FAdd:
16915 case RecurKind::FMul: {
16916 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
16917 if (!AllConsts)
16918 VectorCost =
16919 TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
16920 ScalarCost = EvaluateScalarCost([&]() {
16921 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
16922 });
16923 break;
16924 }
16925 case RecurKind::FMax:
16926 case RecurKind::FMin:
16927 case RecurKind::FMaximum:
16928 case RecurKind::FMinimum:
16929 case RecurKind::SMax:
16930 case RecurKind::SMin:
16931 case RecurKind::UMax:
16932 case RecurKind::UMin: {
16933 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
16934 if (!AllConsts)
16935 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
16936 ScalarCost = EvaluateScalarCost([&]() {
16937 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
16938 return TTI->getIntrinsicInstrCost(ICA, CostKind);
16939 });
16940 break;
16941 }
16942 default:
16943 llvm_unreachable("Expected arithmetic or min/max reduction operation");
16944 }
16945
16946 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
16947 << " for reduction of " << shortBundleName(ReducedVals)
16948 << " (It is a splitting reduction)\n");
16949 return VectorCost - ScalarCost;
16950 }
16951
16952 /// Emit a horizontal reduction of the vectorized value.
16953 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
16954 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
16955 assert(VectorizedValue && "Need to have a vectorized tree node");
16956 assert(isPowerOf2_32(ReduxWidth) &&
16957 "We only handle power-of-two reductions for now");
16958 assert(RdxKind != RecurKind::FMulAdd &&
16959 "A call to the llvm.fmuladd intrinsic is not handled yet");
16960
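// E.g., an integer add reduction of an <8 x i32> root typically lowers to a
// single @llvm.vector.reduce.add.v8i32 call here.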
16961 ++NumVectorInstructions;
16962 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
16963 }
16964
16965 /// Emits optimized code for unique scalar value reused \p Cnt times.
16966 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
16967 unsigned Cnt) {
16968 assert(IsSupportedHorRdxIdentityOp &&
16969 "The optimization of matched scalar identity horizontal reductions "
16970 "must be supported.");
16971 switch (RdxKind) {
16972 case RecurKind::Add: {
16973 // res = mul vv, n
16974 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
16975 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
16976 << VectorizedValue << ". (HorRdx)\n");
16977 return Builder.CreateMul(VectorizedValue, Scale);
16978 }
16979 case RecurKind::Xor: {
16980 // res = n % 2 ? vv : 0
16981 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
16982 << ". (HorRdx)\n");
16983 if (Cnt % 2 == 0)
16984 return Constant::getNullValue(VectorizedValue->getType());
16985 return VectorizedValue;
16986 }
16987 case RecurKind::FAdd: {
16988 // res = fmul v, n
16989 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
16990 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
16991 << VectorizedValue << ". (HorRdx)\n");
16992 return Builder.CreateFMul(VectorizedValue, Scale);
16993 }
16994 case RecurKind::And:
16995 case RecurKind::Or:
16996 case RecurKind::SMax:
16997 case RecurKind::SMin:
16998 case RecurKind::UMax:
16999 case RecurKind::UMin:
17000 case RecurKind::FMax:
17001 case RecurKind::FMin:
17002 case RecurKind::FMaximum:
17003 case RecurKind::FMinimum:
17004 // res = vv
17005 return VectorizedValue;
17006 case RecurKind::Mul:
17007 case RecurKind::FMul:
17008 case RecurKind::FMulAdd:
17009 case RecurKind::IAnyOf:
17010 case RecurKind::FAnyOf:
17011 case RecurKind::None:
17012 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17013 }
17014 return nullptr;
17015 }
17016
17017 /// Emits actual operation for the scalar identity values, found during
17018 /// horizontal reduction analysis.
17019 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17020 ArrayRef<Value *> VL,
17021 const MapVector<Value *, unsigned> &SameValuesCounter,
17022 const DenseMap<Value *, Value *> &TrackedToOrig) {
17023 assert(IsSupportedHorRdxIdentityOp &&
17024 "The optimization of matched scalar identity horizontal reductions "
17025 "must be supported.");
17026 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
17027 if (VTy->getElementType() != VL.front()->getType()) {
17028 VectorizedValue = Builder.CreateIntCast(
17029 VectorizedValue,
17030 FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
17031 any_of(VL, [&](Value *R) {
17032 KnownBits Known = computeKnownBits(
17033 R, cast<Instruction>(ReductionOps.front().front())
17034 ->getModule()
17035 ->getDataLayout());
17036 return !Known.isNonNegative();
17037 }));
17038 }
17039 switch (RdxKind) {
17040 case RecurKind::Add: {
17041 // root = mul prev_root, <1, 1, n, 1>
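// E.g., if lane value a occurs twice among the reduced scalars and lane
// value b occurs once, the constant vector built below contains 2 in a's
// lane and 1 in b's lane.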
17042 SmallVector<Constant *> Vals;
17043 for (Value *V : VL) {
17044 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17045 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
17046 }
17047 auto *Scale = ConstantVector::get(Vals);
17048 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
17049 << VectorizedValue << ". (HorRdx)\n");
17050 return Builder.CreateMul(VectorizedValue, Scale);
17051 }
17052 case RecurKind::And:
17053 case RecurKind::Or:
17054 // No need for multiple or/and(s).
17055 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17056 << ". (HorRdx)\n");
17057 return VectorizedValue;
17058 case RecurKind::SMax:
17059 case RecurKind::SMin:
17060 case RecurKind::UMax:
17061 case RecurKind::UMin:
17062 case RecurKind::FMax:
17063 case RecurKind::FMin:
17064 case RecurKind::FMaximum:
17065 case RecurKind::FMinimum:
17066 // No need for multiple min/max(s) of the same value.
17067 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17068 << ". (HorRdx)\n");
17069 return VectorizedValue;
17070 case RecurKind::Xor: {
17071 // Replace values with an even number of repeats with 0, since
17072 // x xor x = 0.
17073 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
17074 // 7>, if the 4th and 6th elements have an even number of repeats.
17075 SmallVector<int> Mask(
17076 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
17077 PoisonMaskElem);
17078 std::iota(Mask.begin(), Mask.end(), 0);
17079 bool NeedShuffle = false;
17080 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17081 Value *V = VL[I];
17082 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17083 if (Cnt % 2 == 0) {
17084 Mask[I] = VF;
17085 NeedShuffle = true;
17086 }
17087 }
17088 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
17089 : Mask) dbgs()
17090 << I << " ";
17091 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17092 if (NeedShuffle)
17093 VectorizedValue = Builder.CreateShuffleVector(
17094 VectorizedValue,
17095 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
17096 return VectorizedValue;
17097 }
17098 case RecurKind::FAdd: {
17099 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
17100 SmallVector<Constant *> Vals;
17101 for (Value *V : VL) {
17102 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17103 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
17104 }
17105 auto *Scale = ConstantVector::get(Vals);
17106 return Builder.CreateFMul(VectorizedValue, Scale);
17107 }
17108 case RecurKind::Mul:
17109 case RecurKind::FMul:
17110 case RecurKind::FMulAdd:
17111 case RecurKind::IAnyOf:
17112 case RecurKind::FAnyOf:
17113 case RecurKind::None:
17114 llvm_unreachable("Unexpected reduction kind for reused scalars.");
17115 }
17116 return nullptr;
17117 }
17118};
17119} // end anonymous namespace
17120
17121/// Gets recurrence kind from the specified value.
17122 RecurKind llvm::getRdxKind(Value *V) {
17123 return HorizontalReduction::getRdxKind(V);
17124}
17125static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
17126 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
17127 return cast<FixedVectorType>(IE->getType())->getNumElements();
17128
17129 unsigned AggregateSize = 1;
17130 auto *IV = cast<InsertValueInst>(InsertInst);
17131 Type *CurrentType = IV->getType();
17132 do {
17133 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
17134 for (auto *Elt : ST->elements())
17135 if (Elt != ST->getElementType(0)) // check homogeneity
17136 return std::nullopt;
17137 AggregateSize *= ST->getNumElements();
17138 CurrentType = ST->getElementType(0);
17139 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
17140 AggregateSize *= AT->getNumElements();
17141 CurrentType = AT->getElementType();
17142 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
17143 AggregateSize *= VT->getNumElements();
17144 return AggregateSize;
17145 } else if (CurrentType->isSingleValueType()) {
17146 return AggregateSize;
17147 } else {
17148 return std::nullopt;
17149 }
17150 } while (true);
17151}
17152
17153static void findBuildAggregate_rec(Instruction *LastInsertInst,
17154 TargetTransformInfo *TTI,
17155 SmallVectorImpl<Value *> &BuildVectorOpds,
17156 SmallVectorImpl<Value *> &InsertElts,
17157 unsigned OperandOffset) {
17158 do {
17159 Value *InsertedOperand = LastInsertInst->getOperand(1);
17160 std::optional<unsigned> OperandIndex =
17161 getInsertIndex(LastInsertInst, OperandOffset);
17162 if (!OperandIndex)
17163 return;
17164 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
17165 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
17166 BuildVectorOpds, InsertElts, *OperandIndex);
17167
17168 } else {
17169 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17170 InsertElts[*OperandIndex] = LastInsertInst;
17171 }
17172 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
17173 } while (LastInsertInst != nullptr &&
17174 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
17175 LastInsertInst->hasOneUse());
17176}
17177
17178/// Recognize construction of vectors like
17179/// %ra = insertelement <4 x float> poison, float %s0, i32 0
17180/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
17181/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
17182/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
17183/// starting from the last insertelement or insertvalue instruction.
17184///
17185/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
17186/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
17187/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
17188///
17189/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
17190///
17191/// \return true if it matches.
17192static bool findBuildAggregate(Instruction *LastInsertInst,
17193 TargetTransformInfo *TTI,
17194 SmallVectorImpl<Value *> &BuildVectorOpds,
17195 SmallVectorImpl<Value *> &InsertElts) {
17196
17197 assert((isa<InsertElementInst>(LastInsertInst) ||
17198 isa<InsertValueInst>(LastInsertInst)) &&
17199 "Expected insertelement or insertvalue instruction!");
17200
17201 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
17202 "Expected empty result vectors!");
17203
17204 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
17205 if (!AggregateSize)
17206 return false;
17207 BuildVectorOpds.resize(*AggregateSize);
17208 InsertElts.resize(*AggregateSize);
17209
17210 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
17211 llvm::erase(BuildVectorOpds, nullptr);
17212 llvm::erase(InsertElts, nullptr);
17213 if (BuildVectorOpds.size() >= 2)
17214 return true;
17215
17216 return false;
17217}
17218
17219/// Try and get a reduction instruction from a phi node.
17220///
17221/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
17222/// if they come from either \p ParentBB or a containing loop latch.
17223///
17224/// \returns A candidate reduction value if possible, or \code nullptr \endcode
17225/// if not possible.
17226 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
17227 BasicBlock *ParentBB, LoopInfo *LI) {
17228 // There are situations where the reduction value is not dominated by the
17229 // reduction phi. Vectorizing such cases has been reported to cause
17230 // miscompiles. See PR25787.
17231 auto DominatedReduxValue = [&](Value *R) {
17232 return isa<Instruction>(R) &&
17233 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
17234 };
17235
17236 Instruction *Rdx = nullptr;
17237
17238 // Return the incoming value if it comes from the same BB as the phi node.
17239 if (P->getIncomingBlock(0) == ParentBB) {
17240 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17241 } else if (P->getIncomingBlock(1) == ParentBB) {
17242 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17243 }
17244
17245 if (Rdx && DominatedReduxValue(Rdx))
17246 return Rdx;
17247
17248 // Otherwise, check whether we have a loop latch to look at.
17249 Loop *BBL = LI->getLoopFor(ParentBB);
17250 if (!BBL)
17251 return nullptr;
17252 BasicBlock *BBLatch = BBL->getLoopLatch();
17253 if (!BBLatch)
17254 return nullptr;
17255
17256 // There is a loop latch, return the incoming value if it comes from
17257 // that. This reduction pattern occasionally turns up.
17258 if (P->getIncomingBlock(0) == BBLatch) {
17259 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17260 } else if (P->getIncomingBlock(1) == BBLatch) {
17261 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17262 }
17263
17264 if (Rdx && DominatedReduxValue(Rdx))
17265 return Rdx;
17266
17267 return nullptr;
17268}
17269
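/// Matches a reduction binary operation: either a plain binary operator or
/// one of the min/max intrinsics, binding V0/V1 to the two operands. E.g.,
/// both 'add i32 %a, %b' and 'call i32 @llvm.smax.i32(i32 %a, i32 %b)' match.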
17270static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
17271 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
17272 return true;
17273 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
17274 return true;
17275 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
17276 return true;
17277 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
17278 return true;
17279 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
17280 return true;
17281 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
17282 return true;
17283 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
17284 return true;
17285 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
17286 return true;
17287 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
17288 return true;
17289 return false;
17290}
17291
17292/// We could have an initial reduction that is not an add.
17293/// r *= v1 + v2 + v3 + v4
17294/// In such a case start looking for a tree rooted in the first '+'.
17295/// \Returns the new root if found, which may be nullptr if not an instruction.
17296 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
17297 Instruction *Root) {
17298 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17299 isa<IntrinsicInst>(Root)) &&
17300 "Expected binop, select, or intrinsic for reduction matching");
17301 Value *LHS =
17302 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
17303 Value *RHS =
17304 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
17305 if (LHS == Phi)
17306 return dyn_cast<Instruction>(RHS);
17307 if (RHS == Phi)
17308 return dyn_cast<Instruction>(LHS);
17309 return nullptr;
17310}
17311
17312 /// \Returns the first operand of \p I that does not match \p Phi. If the
17313 /// operand is not an instruction, it returns nullptr.
17314 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
17315 Value *Op0 = nullptr;
17316 Value *Op1 = nullptr;
17317 if (!matchRdxBop(I, Op0, Op1))
17318 return nullptr;
17319 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
17320}
17321
17322/// \Returns true if \p I is a candidate instruction for reduction vectorization.
17323 static bool isReductionCandidate(Instruction *I) {
17324 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
17325 Value *B0 = nullptr, *B1 = nullptr;
17326 bool IsBinop = matchRdxBop(I, B0, B1);
17327 return IsBinop || IsSelect;
17328}
17329
17330bool SLPVectorizerPass::vectorizeHorReduction(
17331 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
17332 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
17333 if (!ShouldVectorizeHor)
17334 return false;
17335 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
17336
17337 if (Root->getParent() != BB || isa<PHINode>(Root))
17338 return false;
17339
17340 // If we can find a secondary reduction root, use that instead.
17341 auto SelectRoot = [&]() {
17342 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
17343 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
17344 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
17345 return NewRoot;
17346 return Root;
17347 };
17348
17349 // Start the analysis from the Root instruction. If a horizontal reduction is
17350 // found, try to vectorize it. If it is not a horizontal reduction, or
17351 // vectorization is not possible or not effective, and the currently analyzed
17352 // instruction is a binary operation, try to vectorize the operands, using
17353 // pre-order DFS traversal order. If the operands were not vectorized, repeat
17354 // the same procedure, considering each operand as a possible root of a
17355 // horizontal reduction.
17356 // Interrupt the process if the Root instruction itself was vectorized or all
17357 // sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
17358 // If a horizontal reduction was not matched or vectorized, we collect
17359 // instructions for possible later attempts at vectorization.
17360 std::queue<std::pair<Instruction *, unsigned>> Stack;
17361 Stack.emplace(SelectRoot(), 0);
17362 SmallPtrSet<Value *, 8> VisitedInstrs;
17363 bool Res = false;
17364 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
17365 if (R.isAnalyzedReductionRoot(Inst))
17366 return nullptr;
17367 if (!isReductionCandidate(Inst))
17368 return nullptr;
17369 HorizontalReduction HorRdx;
17370 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
17371 return nullptr;
17372 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
17373 };
17374 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
17375 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
17376 FutureSeed = getNonPhiOperand(Root, P);
17377 if (!FutureSeed)
17378 return false;
17379 }
17380 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
17381 // analysis is done separately.
17382 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
17383 PostponedInsts.push_back(FutureSeed);
17384 return true;
17385 };
17386
17387 while (!Stack.empty()) {
17388 Instruction *Inst;
17389 unsigned Level;
17390 std::tie(Inst, Level) = Stack.front();
17391 Stack.pop();
17392 // Do not try to analyze an instruction that has already been vectorized.
17393 // This may happen when we vectorize instruction operands on a previous
17394 // iteration while the stack was populated before that happened.
17395 if (R.isDeleted(Inst))
17396 continue;
17397 if (Value *VectorizedV = TryToReduce(Inst)) {
17398 Res = true;
17399 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
17400 // Try to find another reduction.
17401 Stack.emplace(I, Level);
17402 continue;
17403 }
17404 } else {
17405 // We could not vectorize `Inst` so try to use it as a future seed.
17406 if (!TryAppendToPostponedInsts(Inst)) {
17407 assert(Stack.empty() && "Expected empty stack");
17408 break;
17409 }
17410 }
17411
17412 // Try to vectorize operands.
17413 // Continue analysis for the instruction from the same basic block only to
17414 // save compile time.
17415 if (++Level < RecursionMaxDepth)
17416 for (auto *Op : Inst->operand_values())
17417 if (VisitedInstrs.insert(Op).second)
17418 if (auto *I = dyn_cast<Instruction>(Op))
17419 // Do not try to vectorize CmpInst operands, this is done
17420 // separately.
17421 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
17422 !R.isDeleted(I) && I->getParent() == BB)
17423 Stack.emplace(I, Level);
17424 }
17425 return Res;
17426}
17427
17428bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
17429 BasicBlock *BB, BoUpSLP &R,
17430 TargetTransformInfo *TTI) {
17431 SmallVector<WeakTrackingVH> PostponedInsts;
17432 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
17433 Res |= tryToVectorize(PostponedInsts, R);
17434 return Res;
17435}
17436
17437bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
17438 BoUpSLP &R) {
17439 bool Res = false;
17440 for (Value *V : Insts)
17441 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
17442 Res |= tryToVectorize(Inst, R);
17443 return Res;
17444}
17445
17446bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
17447 BasicBlock *BB, BoUpSLP &R) {
17448 if (!R.canMapToVector(IVI->getType()))
17449 return false;
17450
17451 SmallVector<Value *, 16> BuildVectorOpds;
17452 SmallVector<Value *, 16> BuildVectorInsts;
17453 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
17454 return false;
17455
17456 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
17457 // An aggregate value is unlikely to be processed in a vector register.
17458 return tryToVectorizeList(BuildVectorOpds, R);
17459}
17460
17461bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
17462 BasicBlock *BB, BoUpSLP &R) {
17463 SmallVector<Value *, 16> BuildVectorInsts;
17464 SmallVector<Value *, 16> BuildVectorOpds;
17465 SmallVector<int> Mask;
17466 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
17467 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
17468 isFixedVectorShuffle(BuildVectorOpds, Mask)))
17469 return false;
17470
17471 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
17472 return tryToVectorizeList(BuildVectorInsts, R);
17473}
17474
17475template <typename T>
17476 static bool tryToVectorizeSequence(
17477 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
17478 function_ref<bool(T *, T *)> AreCompatible,
17479 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
17480 bool MaxVFOnly, BoUpSLP &R) {
17481 bool Changed = false;
17482 // Sort by type, parent, operands.
17483 stable_sort(Incoming, Comparator);
17484
17485 // Try to vectorize elements based on their type.
17486 SmallVector<T *> Candidates;
17487 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
17488 // Look for the next elements with the same type, parent and operand
17489 // kinds.
17490 auto *SameTypeIt = IncIt;
17491 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
17492 ++SameTypeIt;
17493
17494 // Try to vectorize them.
17495 unsigned NumElts = (SameTypeIt - IncIt);
17496 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
17497 << NumElts << ")\n");
17498 // The vectorization is a 3-stage attempt:
17499 // 1. Try to vectorize instructions with the same/alternate opcodes with the
17500 // size of the maximal register at first.
17501 // 2. Try to vectorize the remaining instructions with the same type, if
17502 // possible. This may give better vectorization results than trying to
17503 // vectorize only instructions with the same/alternate opcodes.
17504 // 3. Finally, try to vectorize all instructions with the same/alternate ops
17505 // only; this may result in some extra final vectorization.
17507 if (NumElts > 1 &&
17508 TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
17509 // Success, start over because instructions might have been changed.
17510 Changed = true;
17511 } else {
17512 /// \Returns the minimum number of elements that we will attempt to
17513 /// vectorize.
17514 auto GetMinNumElements = [&R](Value *V) {
17515 unsigned EltSize = R.getVectorElementSize(V);
17516 return std::max(2U, R.getMaxVecRegSize() / EltSize);
17517 };
17518 if (NumElts < GetMinNumElements(*IncIt) &&
17519 (Candidates.empty() ||
17520 Candidates.front()->getType() == (*IncIt)->getType())) {
17521 Candidates.append(IncIt, std::next(IncIt, NumElts));
17522 }
17523 }
17524 // Final attempt to vectorize instructions with the same types.
17525 if (Candidates.size() > 1 &&
17526 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
17527 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
17528 // Success, start over because instructions might have been changed.
17529 Changed = true;
17530 } else if (MaxVFOnly) {
17531 // Try to vectorize using small vectors.
17532 for (auto *It = Candidates.begin(), *End = Candidates.end();
17533 It != End;) {
17534 auto *SameTypeIt = It;
17535 while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
17536 ++SameTypeIt;
17537 unsigned NumElts = (SameTypeIt - It);
17538 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
17539 /*MaxVFOnly=*/false))
17540 Changed = true;
17541 It = SameTypeIt;
17542 }
17543 }
17544 Candidates.clear();
17545 }
17546
17547 // Start over at the next instruction of a different type (or the end).
17548 IncIt = SameTypeIt;
17549 }
17550 return Changed;
17551}
17552
17553 /// Compare two cmp instructions. If IsCompatibility is true, the function
17554 /// returns true if the 2 cmps have the same or swapped predicates and
17555 /// compatible corresponding operands. Otherwise, it implements a strict weak
17556/// ordering relation between two cmp instructions, returning true if the first
17557/// instruction is "less" than the second, i.e. its predicate is less than the
17558/// predicate of the second or the operands IDs are less than the operands IDs
17559/// of the second cmp instruction.
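/// For example, 'icmp sgt %a, %b' and 'icmp slt %c, %d' have equal base
/// predicates (slt is the swapped form of sgt), so only their operands decide
/// the ordering/compatibility.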
17560template <bool IsCompatibility>
17561static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
17562 const DominatorTree &DT) {
17563 assert(isValidElementType(V->getType()) &&
17564 isValidElementType(V2->getType()) &&
17565 "Expected valid element types only.");
17566 if (V == V2)
17567 return IsCompatibility;
17568 auto *CI1 = cast<CmpInst>(V);
17569 auto *CI2 = cast<CmpInst>(V2);
17570 if (CI1->getOperand(0)->getType()->getTypeID() <
17571 CI2->getOperand(0)->getType()->getTypeID())
17572 return !IsCompatibility;
17573 if (CI1->getOperand(0)->getType()->getTypeID() >
17574 CI2->getOperand(0)->getType()->getTypeID())
17575 return false;
17576 CmpInst::Predicate Pred1 = CI1->getPredicate();
17577 CmpInst::Predicate Pred2 = CI2->getPredicate();
17578 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
17579 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
17580 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
17581 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
17582 if (BasePred1 < BasePred2)
17583 return !IsCompatibility;
17584 if (BasePred1 > BasePred2)
17585 return false;
17586 // Compare operands.
17587 bool CI1Preds = Pred1 == BasePred1;
17588 bool CI2Preds = Pred2 == BasePred1;
17589 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
17590 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
17591 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
17592 if (Op1 == Op2)
17593 continue;
17594 if (Op1->getValueID() < Op2->getValueID())
17595 return !IsCompatibility;
17596 if (Op1->getValueID() > Op2->getValueID())
17597 return false;
17598 if (auto *I1 = dyn_cast<Instruction>(Op1))
17599 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
17600 if (IsCompatibility) {
17601 if (I1->getParent() != I2->getParent())
17602 return false;
17603 } else {
17604 // Try to compare nodes with same parent.
17605 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
17606 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
17607 if (!NodeI1)
17608 return NodeI2 != nullptr;
17609 if (!NodeI2)
17610 return false;
17611 assert((NodeI1 == NodeI2) ==
17612 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
17613 "Different nodes should have different DFS numbers");
17614 if (NodeI1 != NodeI2)
17615 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
17616 }
17617 InstructionsState S = getSameOpcode({I1, I2}, TLI);
17618 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
17619 continue;
17620 if (IsCompatibility)
17621 return false;
17622 if (I1->getOpcode() != I2->getOpcode())
17623 return I1->getOpcode() < I2->getOpcode();
17624 }
17625 }
17626 return IsCompatibility;
17627}
17628
17629template <typename ItT>
17630bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
17631 BasicBlock *BB, BoUpSLP &R) {
17632 bool Changed = false;
17633 // Try to find reductions first.
17634 for (CmpInst *I : CmpInsts) {
17635 if (R.isDeleted(I))
17636 continue;
17637 for (Value *Op : I->operands())
17638 if (auto *RootOp = dyn_cast<Instruction>(Op))
17639 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
17640 }
17641 // Try to vectorize operands as vector bundles.
17642 for (CmpInst *I : CmpInsts) {
17643 if (R.isDeleted(I))
17644 continue;
17645 Changed |= tryToVectorize(I, R);
17646 }
17647 // Try to vectorize list of compares.
17648 // Sort by type, compare predicate, etc.
17649 auto CompareSorter = [&](Value *V, Value *V2) {
17650 if (V == V2)
17651 return false;
17652 return compareCmp<false>(V, V2, *TLI, *DT);
17653 };
17654
17655 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
17656 if (V1 == V2)
17657 return true;
17658 return compareCmp<true>(V1, V2, *TLI, *DT);
17659 };
17660
17661 SmallVector<Value *> Vals;
17662 for (Instruction *V : CmpInsts)
17663 if (!R.isDeleted(V) && isValidElementType(V->getType()))
17664 Vals.push_back(V);
17665 if (Vals.size() <= 1)
17666 return Changed;
17667 Changed |= tryToVectorizeSequence<Value>(
17668 Vals, CompareSorter, AreCompatibleCompares,
17669 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
17670 // Exclude possible reductions from other blocks.
17671 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
17672 return any_of(V->users(), [V](User *U) {
17673 auto *Select = dyn_cast<SelectInst>(U);
17674 return Select &&
17675 Select->getParent() != cast<Instruction>(V)->getParent();
17676 });
17677 });
17678 if (ArePossiblyReducedInOtherBlock)
17679 return false;
17680 return tryToVectorizeList(Candidates, R, MaxVFOnly);
17681 },
17682 /*MaxVFOnly=*/true, R);
17683 return Changed;
17684}
17685
17686bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
17687 BasicBlock *BB, BoUpSLP &R) {
17688 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
17689 "This function only accepts Insert instructions");
17690 bool OpsChanged = false;
17691 SmallVector<WeakTrackingVH> PostponedInsts;
17692 // pass1 - try to vectorize reductions only
17693 for (auto *I : reverse(Instructions)) {
17694 if (R.isDeleted(I))
17695 continue;
17696 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
17697 }
17698 // pass2 - try to match and vectorize a buildvector sequence.
17699 for (auto *I : reverse(Instructions)) {
17700 if (R.isDeleted(I) || isa<CmpInst>(I))
17701 continue;
17702 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
17703 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
17704 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
17705 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
17706 }
17707 }
17708 // Now try to vectorize postponed instructions.
17709 OpsChanged |= tryToVectorize(PostponedInsts, R);
17710
17711 Instructions.clear();
17712 return OpsChanged;
17713}
17714
17715bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
17716 bool Changed = false;
17717 SmallVector<Value *, 4> Incoming;
17718 SmallPtrSet<Value *, 16> VisitedInstrs;
17719 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
17720 // node. This allows us to better identify the chains that can be
17721 // vectorized in the best way.
17722 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
17723 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
17724 assert(isValidElementType(V1->getType()) &&
17725 isValidElementType(V2->getType()) &&
17726 "Expected vectorizable types only.");
17727 // It is fine to compare type IDs here, since we expect only vectorizable
17728 // types, like ints, floats and pointers; we don't care about other types.
17729 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
17730 return true;
17731 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
17732 return false;
17733 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
17734 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
17735 if (Opcodes1.size() < Opcodes2.size())
17736 return true;
17737 if (Opcodes1.size() > Opcodes2.size())
17738 return false;
17739 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
17740 {
17741 // Instructions come first.
17742 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
17743 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
17744 if (I1 && I2) {
17745 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
17746 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
17747 if (!NodeI1)
17748 return NodeI2 != nullptr;
17749 if (!NodeI2)
17750 return false;
17751 assert((NodeI1 == NodeI2) ==
17752 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
17753 "Different nodes should have different DFS numbers");
17754 if (NodeI1 != NodeI2)
17755 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
17756 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
17757 if (S.getOpcode() && !S.isAltShuffle())
17758 continue;
17759 return I1->getOpcode() < I2->getOpcode();
17760 }
17761 if (I1)
17762 return true;
17763 if (I2)
17764 return false;
17765 }
17766 {
17767 // Non-undef constants come next.
17768 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
17769 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
17770 if (C1 && C2)
17771 continue;
17772 if (C1)
17773 return true;
17774 if (C2)
17775 return false;
17776 }
17777 bool U1 = isa<UndefValue>(Opcodes1[I]);
17778 bool U2 = isa<UndefValue>(Opcodes2[I]);
17779 {
17780 // Non-constant non-instructions come next.
17781 if (!U1 && !U2) {
17782 auto ValID1 = Opcodes1[I]->getValueID();
17783 auto ValID2 = Opcodes2[I]->getValueID();
17784 if (ValID1 == ValID2)
17785 continue;
17786 if (ValID1 < ValID2)
17787 return true;
17788 if (ValID1 > ValID2)
17789 return false;
17790 }
17791 if (!U1)
17792 return true;
17793 if (!U2)
17794 return false;
17795 }
17796 // Undefs come last.
17797 assert(U1 && U2 && "The only thing left should be undef & undef.");
17798 continue;
17799 }
17800 return false;
17801 };
17802 auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
17803 if (V1 == V2)
17804 return true;
17805 if (V1->getType() != V2->getType())
17806 return false;
17807 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
17808 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
17809 if (Opcodes1.size() != Opcodes2.size())
17810 return false;
17811 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
17812 // Undefs are compatible with any other value.
17813 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
17814 continue;
17815 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
17816 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
17817 if (I1->getParent() != I2->getParent())
17818 return false;
17819 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
17820 if (S.getOpcode())
17821 continue;
17822 return false;
17823 }
17824 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
17825 continue;
17826 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
17827 return false;
17828 }
17829 return true;
17830 };
17831
17832 bool HaveVectorizedPhiNodes = false;
17833 do {
17834 // Collect the incoming values from the PHIs.
17835 Incoming.clear();
17836 for (Instruction &I : *BB) {
17837 PHINode *P = dyn_cast<PHINode>(&I);
17838 if (!P)
17839 break;
17840
17841 // No need to analyze deleted, vectorized and non-vectorizable
17842 // instructions.
17843 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
17844 isValidElementType(P->getType()))
17845 Incoming.push_back(P);
17846 }
17847
17848 if (Incoming.size() <= 1)
17849 break;
17850
17851 // Find the corresponding non-phi nodes for better matching when trying to
17852 // build the tree.
17853 for (Value *V : Incoming) {
17854 SmallVectorImpl<Value *> &Opcodes =
17855 PHIToOpcodes.try_emplace(V).first->getSecond();
17856 if (!Opcodes.empty())
17857 continue;
17858 SmallVector<Value *, 4> Nodes(1, V);
17859 SmallPtrSet<PHINode *, 4> Visited;
17860 while (!Nodes.empty()) {
17861 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
17862 if (!Visited.insert(PHI).second)
17863 continue;
17864 for (Value *V : PHI->incoming_values()) {
17865 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
17866 Nodes.push_back(PHI1);
17867 continue;
17868 }
17869 Opcodes.emplace_back(V);
17870 }
17871 }
17872 }
17873
17874 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
17875 Incoming, PHICompare, AreCompatiblePHIs,
17876 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
17877 return tryToVectorizeList(Candidates, R, MaxVFOnly);
17878 },
17879 /*MaxVFOnly=*/true, R);
17880 Changed |= HaveVectorizedPhiNodes;
17881 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
17882 } while (HaveVectorizedPhiNodes);
17883
17884 VisitedInstrs.clear();
17885
17886 InstSetVector PostProcessInserts;
17887 SmallSetVector<CmpInst *, 8> PostProcessCmps;
17888 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
17889 // also vectorizes `PostProcessCmps`.
17890 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
17891 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
17892 if (VectorizeCmps) {
17893 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
17894 PostProcessCmps.clear();
17895 }
17896 PostProcessInserts.clear();
17897 return Changed;
17898 };
17899 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
17900 auto IsInPostProcessInstrs = [&](Instruction *I) {
17901 if (auto *Cmp = dyn_cast<CmpInst>(I))
17902 return PostProcessCmps.contains(Cmp);
17903 return isa<InsertElementInst, InsertValueInst>(I) &&
17904 PostProcessInserts.contains(I);
17905 };
17906 // Returns true if `I` is an instruction with no users, such as a terminator,
17907 // a store, or a call whose return value is ignored. Unused instructions are
17908 // detected by instruction type (void-typed result), except for CallInst and
17909 // InvokeInst, which are accepted regardless of their return type.
17909 auto HasNoUsers = [](Instruction *I) {
17910 return I->use_empty() &&
17911 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
17912 };
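  // E.g. a store, a branch/return terminator, or a call to a void function
  // all satisfy HasNoUsers and may act as roots for the horizontal-reduction
  // matching performed below.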
17913 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
17914 // Skip instructions with scalable vector types. The number of elements is
17915 // unknown at compile time for scalable types.
17916 if (isa<ScalableVectorType>(It->getType()))
17917 continue;
17918
17919 // Skip instructions marked for deletion.
17920 if (R.isDeleted(&*It))
17921 continue;
17922 // We may go through BB multiple times, so skip instructions we have already checked.
17923 if (!VisitedInstrs.insert(&*It).second) {
17924 if (HasNoUsers(&*It) &&
17925 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
17926 // We would like to start over since some instructions are deleted
17927 // and the iterator may become invalid.
17928 Changed = true;
17929 It = BB->begin();
17930 E = BB->end();
17931 }
17932 continue;
17933 }
17934
17935 if (isa<DbgInfoIntrinsic>(It))
17936 continue;
17937
17938 // Try to vectorize reductions that use PHINodes.
17939 if (PHINode *P = dyn_cast<PHINode>(It)) {
17940 // Check that the PHI is a reduction PHI.
17941 if (P->getNumIncomingValues() == 2) {
17942 // Try to match and vectorize a horizontal reduction.
17943 Instruction *Root = getReductionInstr(DT, P, BB, LI);
17944 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
17945 Changed = true;
17946 It = BB->begin();
17947 E = BB->end();
17948 continue;
17949 }
17950 }
17951 // Try to vectorize the incoming values of the PHI, to catch reductions
17952 // that feed into PHIs.
17953 for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
17954 // Skip if the incoming block is the current BB for now. Also, bypass
17955 // unreachable IR for efficiency and to avoid crashing.
17956 // TODO: Collect the skipped incoming values and try to vectorize them
17957 // after processing BB.
17958 if (BB == P->getIncomingBlock(I) ||
17959 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
17960 continue;
17961
17962 // Postponed instructions should not be vectorized here, delay their
17963 // vectorization.
17964 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
17965 PI && !IsInPostProcessInstrs(PI))
17966 Changed |= vectorizeRootInstruction(nullptr, PI,
17967 P->getIncomingBlock(I), R, TTI);
17968 }
17969 continue;
17970 }
17971
17972 if (HasNoUsers(&*It)) {
17973 bool OpsChanged = false;
17974 auto *SI = dyn_cast<StoreInst>(It);
17975 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
17976 if (SI) {
17977 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
17978 // Try to vectorize chain in store, if this is the only store to the
17979 // address in the block.
17980 // TODO: This is just a temporary solution to save compile time. Need
17981 // to investigate if we can safely turn on slp-vectorize-hor-store
17982 // instead to allow lookup for reduction chains in all non-vectorized
17983 // stores (need to check side effects and compile time).
17984 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
17985 SI->getValueOperand()->hasOneUse();
17986 }
17987 if (TryToVectorizeRoot) {
17988 for (auto *V : It->operand_values()) {
17989 // Postponed instructions should not be vectorized here, delay their
17990 // vectorization.
17991 if (auto *VI = dyn_cast<Instruction>(V);
17992 VI && !IsInPostProcessInstrs(VI))
17993 // Try to match and vectorize a horizontal reduction.
17994 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
17995 }
17996 }
17997 // Start vectorization of post-process list of instructions from the
17998 // top-tree instructions to try to vectorize as many instructions as
17999 // possible.
18000 OpsChanged |=
18001 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
18002 if (OpsChanged) {
18003 // We would like to start over since some instructions are deleted
18004 // and the iterator may become invalid.
18005 Changed = true;
18006 It = BB->begin();
18007 E = BB->end();
18008 continue;
18009 }
18010 }
18011
18012 if (isa<InsertElementInst, InsertValueInst>(It))
18013 PostProcessInserts.insert(&*It);
18014 else if (isa<CmpInst>(It))
18015 PostProcessCmps.insert(cast<CmpInst>(&*It));
18016 }
18017
18018 return Changed;
18019}
18020
18021bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
18022 auto Changed = false;
18023 for (auto &Entry : GEPs) {
18024 // If the getelementptr list has fewer than two elements, there's nothing
18025 // to do.
18026 if (Entry.second.size() < 2)
18027 continue;
18028
18029 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
18030 << Entry.second.size() << ".\n");
18031
18032 // Process the GEP list in chunks suitable for the target's supported
18033 // vector size. If a vector register can't hold 1 element, we are done. We
18034 // are trying to vectorize the index computations, so the maximum number of
18035 // elements is based on the size of the index expression, rather than the
18036 // size of the GEP itself (the target's pointer size).
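    // For instance, with a 128-bit vector register and i64 index expressions,
    // MaxElts = 128 / 64 = 2, so each chunk below considers at most two
    // getelementptrs at a time.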
18037 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18038 unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
18039 if (MaxVecRegSize < EltSize)
18040 continue;
18041
18042 unsigned MaxElts = MaxVecRegSize / EltSize;
18043 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
18044 auto Len = std::min<unsigned>(BE - BI, MaxElts);
18045 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
18046
18047 // Initialize a set of candidate getelementptrs. Note that we use a
18048 // SetVector here to preserve program order. If the index computations
18049 // are vectorizable and begin with loads, we want to minimize the chance
18050 // of having to reorder them later.
18051 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
18052
18053 // Some of the candidates may have already been vectorized after we
18054 // initially collected them, or their index has been optimized to a
18055 // constant value. If so, they are marked as deleted, so remove them from
18056 // the set of candidates.
18057 Candidates.remove_if([&R](Value *I) {
18058 return R.isDeleted(cast<Instruction>(I)) ||
18059 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
18060 });
18061
18062 // Remove from the set of candidates all pairs of getelementptrs with
18063 // constant differences. Such getelementptrs are likely not good
18064 // candidates for vectorization in a bottom-up phase since one can be
18065 // computed from the other. We also ensure all candidate getelementptr
18066 // indices are unique.
18067 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
18068 auto *GEPI = GEPList[I];
18069 if (!Candidates.count(GEPI))
18070 continue;
18071 auto *SCEVI = SE->getSCEV(GEPList[I]);
18072 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
18073 auto *GEPJ = GEPList[J];
18074 auto *SCEVJ = SE->getSCEV(GEPList[J]);
18075 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
18076 Candidates.remove(GEPI);
18077 Candidates.remove(GEPJ);
18078 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
18079 Candidates.remove(GEPJ);
18080 }
18081 }
18082 }
18083
18084 // We break out of the above computation as soon as we know there are
18085 // fewer than two candidates remaining.
18086 if (Candidates.size() < 2)
18087 continue;
18088
18089 // Add the single, non-constant index of each candidate to the bundle. We
18090 // ensured the indices met these constraints when we originally collected
18091 // the getelementptrs.
18092 SmallVector<Value *, 16> Bundle(Candidates.size());
18093 auto BundleIndex = 0u;
18094 for (auto *V : Candidates) {
18095 auto *GEP = cast<GetElementPtrInst>(V);
18096 auto *GEPIdx = GEP->idx_begin()->get();
18097 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
18098 Bundle[BundleIndex++] = GEPIdx;
18099 }
18100
18101 // Try and vectorize the indices. We are currently only interested in
18102 // gather-like cases of the form:
18103 //
18104 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
18105 //
18106 // where the loads of "a", the loads of "b", and the subtractions can be
18107 // performed in parallel. It's likely that detecting this pattern in a
18108 // bottom-up phase will be simpler and less costly than building a
18109 // full-blown top-down phase beginning at the consecutive loads.
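      // For the example above, Bundle holds the scalar index computations
      // a[0] - b[0], a[1] - b[1], ..., and tryToVectorizeList may replace
      // them with a single vector subtract fed by vectorized loads of a and b.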
18110 Changed |= tryToVectorizeList(Bundle, R);
18111 }
18112 }
18113 return Changed;
18114}
18115
18116bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
18117 bool Changed = false;
18118 // Sort by type, base pointer, and value operand. Value operands must be
18119 // compatible (have the same opcode, same parent), otherwise it is
18120 // definitely not profitable to try to vectorize them.
18121 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
18122 if (V->getValueOperand()->getType()->getTypeID() <
18123 V2->getValueOperand()->getType()->getTypeID())
18124 return true;
18125 if (V->getValueOperand()->getType()->getTypeID() >
18126 V2->getValueOperand()->getType()->getTypeID())
18127 return false;
18128 if (V->getPointerOperandType()->getTypeID() <
18129 V2->getPointerOperandType()->getTypeID())
18130 return true;
18131 if (V->getPointerOperandType()->getTypeID() >
18132 V2->getPointerOperandType()->getTypeID())
18133 return false;
18134 // UndefValues are compatible with all other values.
18135 if (isa<UndefValue>(V->getValueOperand()) ||
18136 isa<UndefValue>(V2->getValueOperand()))
18137 return false;
18138 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
18139 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18140       DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
18141           DT->getNode(I1->getParent());
18142       DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
18143           DT->getNode(I2->getParent());
18144 assert(NodeI1 && "Should only process reachable instructions");
18145 assert(NodeI2 && "Should only process reachable instructions");
18146 assert((NodeI1 == NodeI2) ==
18147 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18148 "Different nodes should have different DFS numbers");
18149 if (NodeI1 != NodeI2)
18150 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18151 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18152 if (S.getOpcode())
18153 return false;
18154 return I1->getOpcode() < I2->getOpcode();
18155 }
18156 if (isa<Constant>(V->getValueOperand()) &&
18157 isa<Constant>(V2->getValueOperand()))
18158 return false;
18159 return V->getValueOperand()->getValueID() <
18160 V2->getValueOperand()->getValueID();
18161 };
18162
18163 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
18164 if (V1 == V2)
18165 return true;
18166 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
18167 return false;
18168 if (V1->getPointerOperandType() != V2->getPointerOperandType())
18169 return false;
18170 // Undefs are compatible with any other value.
18171 if (isa<UndefValue>(V1->getValueOperand()) ||
18172 isa<UndefValue>(V2->getValueOperand()))
18173 return true;
18174 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
18175 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18176 if (I1->getParent() != I2->getParent())
18177 return false;
18178 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18179 return S.getOpcode() > 0;
18180 }
18181 if (isa<Constant>(V1->getValueOperand()) &&
18182 isa<Constant>(V2->getValueOperand()))
18183 return true;
18184 return V1->getValueOperand()->getValueID() ==
18185 V2->getValueOperand()->getValueID();
18186 };
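  // E.g. two stores whose stored values are i32 add instructions in the same
  // basic block are compatible and end up adjacent after sorting, so
  // vectorizeStores() can consider them as one candidate group.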
18187
18188 // Attempt to sort and vectorize each of the store-groups.
18189 for (auto &Pair : Stores) {
18190 if (Pair.second.size() < 2)
18191 continue;
18192
18193 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
18194 << Pair.second.size() << ".\n");
18195
18196 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
18197 continue;
18198
18199 // Reverse the stores to do bottom-to-top analysis. This is important when
18200 // the same address is stored to several times; in that case we need to
18201 // follow the store order (reversed to respect the memory dependencies).
18202 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
18203 Pair.second.rend());
18204 Changed |= tryToVectorizeSequence<StoreInst>(
18205 ReversedStores, StoreSorter, AreCompatibleStores,
18206 [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
18207 return vectorizeStores(Candidates, R);
18208 },
18209 /*MaxVFOnly=*/false, R);
18210 }
18211 return Changed;
18212}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:529
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1497
Loop::LoopBounds::Direction Direction
Definition: LoopInfo.cpp:230
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
#define T
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(VerifyEach)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleCostEstimator(TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:76
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:264
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:492
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:187
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:174
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:228
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:194
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:443
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:430
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:367
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:167
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
reverse_iterator rend()
Definition: BasicBlock.h:448
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:165
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1494
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2332
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:2227
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2469
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2326
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1600
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1678
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2323
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:601
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:983
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1362
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:1023
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:1017
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:1021
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:1019
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:1167
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:1129
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:1105
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2126
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:154
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1449
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:101
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
value_type & FindAndConstruct(const KeyT &Key)
Definition: DenseMap.h:348
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Type * getReturnType() const
Definition: DerivedTypes.h:124
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2257
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:511
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2460
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2265
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1110
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2535
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:311
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:220
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:848
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1753
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:486
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2366
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2249
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:471
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1666
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:169
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2161
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2196
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1826
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1587
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:630
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:260
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:742
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:257
const BasicBlock * getParent() const
Definition: Instruction.h:152
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:258
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
Value * getPointerOperand()
Definition: Instructions.h:280
bool isSimple() const
Definition: Instructions.h:272
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:236
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
Definition: MapVector.h:64
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
T & front() const
front - Get the first element.
Definition: ArrayRef.h:363
iterator end() const
Definition: ArrayRef.h:357
iterator begin() const
Definition: ArrayRef.h:356
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:376
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:155
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:366
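A minimal sketch of the SmallPtrSet insert/contains protocol listed above (the helper is hypothetical):
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/IR/Instruction.h"

  // insert() returns {iterator, bool}; the bool says whether the pointer was new.
  static unsigned countNewInstructions(llvm::ArrayRef<llvm::Instruction *> Insts,
                                       llvm::SmallPtrSet<llvm::Instruction *, 16> &Seen) {
    unsigned NumNew = 0;
    for (llvm::Instruction *I : Insts)
      if (Seen.insert(I).second)
        ++NumNew;                         // Seen.contains(I) is now true
    return NumNew;
  }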
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:981
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
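For completeness, a tiny sketch of the SmallVector growth API above, with an identity shuffle mask as the example payload:
  #include "llvm/ADT/SmallVector.h"

  static llvm::SmallVector<int, 8> makeIdentityMask(unsigned NumElts) {
    llvm::SmallVector<int, 8> Mask;
    Mask.reserve(NumElts);                // at most one allocation
    for (unsigned I = 0; I < NumElts; ++I)
      Mask.push_back(static_cast<int>(I));
    return Mask;
  }
assign, resize, and append behave as in std::vector; the difference is that up to 8 elements here live in inline storage with no heap allocation.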
An instruction for storing to memory.
Definition: Instructions.h:317
Type * getPointerOperandType() const
Definition: Instructions.h:420
Value * getValueOperand()
Definition: Instructions.h:414
Value * getPointerOperand()
Definition: Instructions.h:417
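A quick illustration of the StoreInst accessors above, which separate the stored value from its address (hypothetical helper):
  #include "llvm/IR/Instructions.h"

  // Two stores are only bundle candidates if their stored value types match.
  static bool storeSameValueType(const llvm::StoreInst *A, const llvm::StoreInst *B) {
    return A->getValueOperand()->getType() == B->getValueOperand()->getType();
  }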
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
OperandValueKind
Additional information about an operand's possible values.
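The TargetTransformInfo hooks above are the cost-model surface the vectorizer queries when weighing scalar against vector code. A deliberately simplified sketch (real queries also pass operand info, cast-context hints, and the context instruction):
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Instruction.h"

  // Compare VF scalar adds against one <VF x Ty> vector add.
  static bool vectorAddLooksProfitable(const llvm::TargetTransformInfo &TTI,
                                       llvm::Type *ScalarTy, unsigned VF) {
    auto CostKind = llvm::TargetTransformInfo::TCK_RecipThroughput;
    auto *VecTy = llvm::FixedVectorType::get(ScalarTy, VF);
    llvm::InstructionCost ScalarCost = 0;
    for (unsigned I = 0; I < VF; ++I)
      ScalarCost +=
          TTI.getArithmeticInstrCost(llvm::Instruction::Add, ScalarTy, CostKind);
    llvm::InstructionCost VectorCost =
        TTI.getArithmeticInstrCost(llvm::Instruction::Add, VecTy, CostKind);
    return VectorCost < ScalarCost;
  }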
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:160
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:287
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:166
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
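A small example of the Type predicates above, in the spirit of the element-type filtering this pass does (illustrative only):
  #include "llvm/IR/Type.h"

  // Reject exotic element types; accept plain integers, FP, and pointers.
  static bool isPlausibleElementType(llvm::Type *Ty) {
    llvm::Type *ScalarTy = Ty->getScalarType();   // element type if Ty is a vector
    if (ScalarTy->isVoidTy() || ScalarTy->isX86_FP80Ty() || ScalarTy->isPPC_FP128Ty())
      return false;
    return ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
           ScalarTy->isPointerTy();
  }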
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Definition: User.h:73
op_iterator op_begin()
Definition: User.h:234
Value * getOperand(unsigned i) const
Definition: User.h:169
iterator_range< value_op_iterator > operand_values()
Definition: User.h:266
The Vector Function Database.
Definition: VectorUtils.h:29
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
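A generic sketch of the Value use-list queries above (not taken from this pass):
  #include "llvm/IR/Value.h"
  #include <cassert>

  // If V has exactly one use, redirect it to Repl and report success.
  static bool replaceIfSingleUse(llvm::Value *V, llvm::Value *Repl) {
    if (!V->hasOneUse())
      return false;
    assert(V->hasNUses(1) && V->getNumUses() == 1);
    V->replaceAllUsesWith(Repl);          // rewrites that single use
    return true;
  }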
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:74
self_iterator getIterator()
Definition: ilist_node.h:109
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:690
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isLoadCombineCandidate() const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair which has the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
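Taken together, the BoUpSLP entry points above form the per-bundle driver sequence of the pass; a heavily simplified sketch, assumed to live inside this file where BoUpSLP is visible (the real driver also reorders the graph, transforms nodes, and compares the cost against -SLPCostThreshold):
  // Hypothetical helper; "using namespace llvm" is already in effect in this file.
  static bool tryVectorizeBundle(BoUpSLP &R, ArrayRef<Value *> Roots,
                                 const SmallDenseSet<Value *> &Ignored) {
    R.deleteTree();                        // reset state from a previous attempt
    R.buildTree(Roots, Ignored);           // grow the use-def tree from the roots
    if (R.isTreeTinyAndNotFullyVectorizable())
      return false;
    R.buildExternalUses();                 // record scalars still used outside the tree
    R.computeMinimumValueSizes();          // shrink element types where legal
    InstructionCost Cost = R.getTreeCost();
    if (!(Cost < 0))                       // only a profitable (negative-cost) tree wins
      return false;
    R.vectorizeTree();                     // emit the vector code and extracts
    return true;
  }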
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:103
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1465
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:777
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:836
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
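The PatternMatch helpers above compose into declarative IR matchers; a small, self-contained example (not a pattern taken from this pass):
  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"

  // Match "(zext X) + C" where the add itself has one use; binds X and C.
  static bool matchZExtPlusConst(llvm::Value *V, llvm::Value *&X,
                                 const llvm::APInt *&C) {
    using namespace llvm::PatternMatch;
    return match(V, m_OneUse(m_Add(m_ZExt(m_Value(X)), m_APInt(C))));
  }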
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:456
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1715
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
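The range helpers interleaved through the rest of this list (all_of, any_of, find_if, count_if, and friends) wrap the corresponding std algorithms so callers pass a whole range instead of begin/end; a minimal sketch:
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  // Count the "poison" (-1) lanes of a shuffle mask, range-style.
  static unsigned countUndefMaskLanes(const llvm::SmallVectorImpl<int> &Mask) {
    if (llvm::all_of(Mask, [](int L) { return L >= 0; }))
      return 0;
    return static_cast<unsigned>(
        llvm::count_if(Mask, [](int L) { return L < 0; }));
  }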
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:128
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:950
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:540
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
Definition: LoopUtils.cpp:1166
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:40
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7063
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2059
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1754
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:419
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
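getPointersDiff and sortPtrAccesses above are how the pass decides whether a bundle of pointers is consecutive; a hedged sketch of a consecutiveness check, assuming a DataLayout and ScalarEvolution are at hand as they are in the pass:
  #include "llvm/Analysis/LoopAccessAnalysis.h"
  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/IR/DataLayout.h"
  #include <optional>

  // True if PtrB addresses exactly the next ElemTy element after PtrA.
  static bool areConsecutivePointers(llvm::Type *ElemTy, llvm::Value *PtrA,
                                     llvm::Value *PtrB, const llvm::DataLayout &DL,
                                     llvm::ScalarEvolution &SE) {
    std::optional<int> Diff = llvm::getPointersDiff(ElemTy, PtrA, ElemTy, PtrB, DL,
                                                    SE, /*StrictCheck=*/true);
    return Diff && *Diff == 1;
  }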
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1235
constexpr int PoisonMaskElem
@ Other
Any other memory.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Definition: STLExtras.h:1986
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
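The ValueTracking queries above (computeKnownBits, MaskedValueIsZero, ComputeNumSignBits) are the raw material for minimum-bit-width reasoning; a rough, illustrative bound, not the pass's exact formula:
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Type.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/KnownBits.h"
  #include <algorithm>
  #include <cassert>

  // Upper bound on the bits needed to represent an integer-typed V.
  static unsigned roughRequiredBits(const llvm::Value *V, const llvm::DataLayout &DL) {
    assert(V->getType()->isIntOrIntVectorTy());
    unsigned BitWidth = V->getType()->getScalarSizeInBits();
    llvm::KnownBits Known(BitWidth);
    llvm::computeKnownBits(V, Known, DL);
    unsigned SignBits = llvm::ComputeNumSignBits(V, DL);
    // Leading known-zero bits and redundant sign-bit copies can be dropped.
    return BitWidth - std::max(Known.countMinLeadingZeros(), SignBits - 1);
  }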
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
InstructionCost Cost
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:439
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:613
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2490
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:220
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1459
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const