SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// is found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
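//
// For illustration only (a minimal sketch; the names below are hypothetical,
// not taken from a real test case), the pass targets patterns such as:
//
//   store double %a0, ptr %p
//   store double %a1, ptr %q   ; where %q = getelementptr double, ptr %p, i64 1
//
// which, if the cost model agrees, can become a single vector store:
//
//   %v = insertelement <2 x double> poison, double %a0, i32 0
//   %w = insertelement <2 x double> %v, double %a1, i32 1
//   store <2 x double> %w, ptr %p
//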
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<int>
118 cl::desc("Only vectorize if you gain more than this "
119 "number "));
120
122 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
123 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
124 "heuristics and makes vectorization decision via cost modeling."));
125
126static cl::opt<bool>
127ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
128 cl::desc("Attempt to vectorize horizontal reductions"));
129
131 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
132 cl::desc(
133 "Attempt to vectorize horizontal reductions feeding into a store"));
134
135// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
136// even if we match a reduction but do not vectorize in the end.
138 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
139 cl::desc("Allow optimization of original scalar identity operations on "
140 "matched horizontal reductions."));
141
142static cl::opt<int>
143 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
144 cl::desc("Attempt to vectorize for this register size in bits"));
145
148 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
149
150/// Limits the size of scheduling regions in a block.
151/// It avoids long compile times for _very_ large blocks where vector
152/// instructions are spread over a wide range.
153/// This limit is way higher than needed by real-world functions.
154static cl::opt<int>
155ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
156 cl::desc("Limit the size of the SLP scheduling region per block"));
157
159 "slp-min-reg-size", cl::init(128), cl::Hidden,
160 cl::desc("Attempt to vectorize for this register size in bits"));
161
163 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
164 cl::desc("Limit the recursion depth when building a vectorizable tree"));
165
167 "slp-min-tree-size", cl::init(3), cl::Hidden,
168 cl::desc("Only vectorize small trees if they are fully vectorizable"));
169
170// The maximum depth that the look-ahead score heuristic will explore.
171// The higher this value, the higher the compilation time overhead.
173 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
174 cl::desc("The maximum look-ahead depth for operand reordering scores"));
175
176// The maximum depth that the look-ahead score heuristic will explore
177// when it is probing among candidates for vectorization tree roots.
178// The higher this value, the higher the compilation time overhead, but unlike
179// the similar limit for operand reordering this is used less frequently, so the
180// impact of a higher value is less noticeable.
182 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
183 cl::desc("The maximum look-ahead depth for searching best rooting option"));
184
186 "slp-min-strided-loads", cl::init(2), cl::Hidden,
187 cl::desc("The minimum number of loads, which should be considered strided, "
188 "if the stride is > 1 or is runtime value"));
189
191 "slp-max-stride", cl::init(8), cl::Hidden,
192 cl::desc("The maximum stride, considered to be profitable."));
193
194static cl::opt<bool>
195 ViewSLPTree("view-slp-tree", cl::Hidden,
196 cl::desc("Display the SLP trees with Graphviz"));
197
199 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
200 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
201
202// Limit the number of alias checks. The limit is chosen so that
203// it has no negative effect on the llvm benchmarks.
204static const unsigned AliasedCheckLimit = 10;
205
206// Limit on the number of uses for potentially transformed instructions/values,
207// used in checks to avoid compile-time explosion.
208static constexpr int UsesLimit = 8;
209
210// Another limit for the alias checks: The maximum distance between load/store
211// instructions where alias checks are done.
212// This limit is useful for very large basic blocks.
213static const unsigned MaxMemDepDistance = 160;
214
215/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
216/// regions to be handled.
217static const int MinScheduleRegionSize = 16;
218
219/// Predicate for the element types that the SLP vectorizer supports.
220///
221/// The most important things to filter here are types which are invalid in LLVM
222/// vectors. We also filter target-specific types which have absolutely no
223/// meaningful vectorization path, such as x86_fp80 and ppc_f128. This just
224/// avoids spending time checking the cost model and realizing that they will
225/// be inevitably scalarized.
226static bool isValidElementType(Type *Ty) {
227 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
228 !Ty->isPPC_FP128Ty();
229}
230
231/// \returns True if the value is a constant (but not globals/constant
232/// expressions).
233static bool isConstant(Value *V) {
234 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
235}
236
237/// Checks if \p V is one of the vector-like instructions, i.e. undef,
238/// insertelement/extractelement with constant indices for a fixed vector type,
239/// or an extractvalue instruction.
241 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
242 !isa<ExtractValueInst, UndefValue>(V))
243 return false;
244 auto *I = dyn_cast<Instruction>(V);
245 if (!I || isa<ExtractValueInst>(I))
246 return true;
247 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
248 return false;
249 if (isa<ExtractElementInst>(I))
250 return isConstant(I->getOperand(1));
251 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
252 return isConstant(I->getOperand(2));
253}
254
255#if !defined(NDEBUG)
256/// Print a short descriptor of the instruction bundle suitable for debug output.
257static std::string shortBundleName(ArrayRef<Value *> VL) {
258 std::string Result;
259 raw_string_ostream OS(Result);
260 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
261 OS.flush();
262 return Result;
263}
264#endif
265
266/// \returns true if all of the instructions in \p VL are in the same block or
267/// false otherwise.
269 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
270 if (!I0)
271 return false;
273 return true;
274
275 BasicBlock *BB = I0->getParent();
276 for (int I = 1, E = VL.size(); I < E; I++) {
277 auto *II = dyn_cast<Instruction>(VL[I]);
278 if (!II)
279 return false;
280
281 if (BB != II->getParent())
282 return false;
283 }
284 return true;
285}
286
287/// \returns True if all of the values in \p VL are constants (but not
288/// globals/constant expressions).
290 // Constant expressions and globals can't be vectorized like normal integer/FP
291 // constants.
292 return all_of(VL, isConstant);
293}
294
295/// \returns True if all of the values in \p VL are identical or some of them
296/// are UndefValue.
297static bool isSplat(ArrayRef<Value *> VL) {
298 Value *FirstNonUndef = nullptr;
299 for (Value *V : VL) {
300 if (isa<UndefValue>(V))
301 continue;
302 if (!FirstNonUndef) {
303 FirstNonUndef = V;
304 continue;
305 }
306 if (V != FirstNonUndef)
307 return false;
308 }
309 return FirstNonUndef != nullptr;
310}
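// Illustrative sketch (A, Ty and the vector VL are hypothetical, not from this
// file): all non-undef elements must be identical for a splat.
//   Value *U = UndefValue::get(Ty);
//   bool S1 = isSplat({A, U, A}); // true: every non-undef element equals A
//   bool S2 = isSplat({U, U});    // false: there is no non-undef element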
311
312/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
314 if (auto *Cmp = dyn_cast<CmpInst>(I))
315 return Cmp->isCommutative();
316 if (auto *BO = dyn_cast<BinaryOperator>(I))
317 return BO->isCommutative() ||
318 (BO->getOpcode() == Instruction::Sub &&
319 !BO->hasNUsesOrMore(UsesLimit) &&
320 all_of(
321 BO->uses(),
322 [](const Use &U) {
323 // Commutative, if icmp eq/ne sub, 0
324 ICmpInst::Predicate Pred;
325 if (match(U.getUser(),
326 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
327 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
328 return true;
329 // Commutative, if abs(sub nsw, true) or abs(sub, false).
330 ConstantInt *Flag;
331 return match(U.getUser(),
332 m_Intrinsic<Intrinsic::abs>(
333 m_Specific(U.get()), m_ConstantInt(Flag))) &&
334 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
335 Flag->isOne());
336 })) ||
337 (BO->getOpcode() == Instruction::FSub &&
338 !BO->hasNUsesOrMore(UsesLimit) &&
339 all_of(BO->uses(), [](const Use &U) {
340 return match(U.getUser(),
341 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
342 }));
343 return I->isCommutative();
344}
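// Illustrative sketch of the Sub special case above (hypothetical IR): a
// subtraction whose only use is an equality compare against zero,
//   %d = sub i32 %x, %y
//   %c = icmp eq i32 %d, 0
// is reported as commutative, since swapping %x and %y cannot change %c.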
345
346/// \returns the inserting index of an InsertElement or InsertValue instruction,
347/// using \p Offset as the base offset for the index.
348static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
349 unsigned Offset = 0) {
350 int Index = Offset;
351 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
352 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
353 if (!VT)
354 return std::nullopt;
355 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
356 if (!CI)
357 return std::nullopt;
358 if (CI->getValue().uge(VT->getNumElements()))
359 return std::nullopt;
360 Index *= VT->getNumElements();
361 Index += CI->getZExtValue();
362 return Index;
363 }
364
365 const auto *IV = cast<InsertValueInst>(InsertInst);
366 Type *CurrentType = IV->getType();
367 for (unsigned I : IV->indices()) {
368 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
369 Index *= ST->getNumElements();
370 CurrentType = ST->getElementType(I);
371 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
372 Index *= AT->getNumElements();
373 CurrentType = AT->getElementType();
374 } else {
375 return std::nullopt;
376 }
377 Index += I;
378 }
379 return Index;
380}
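// Illustrative sketch (hypothetical aggregate type): for
//   %r = insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2
// the flattened index is ((0 * 2 + 1) * 3 + 2) == 5, i.e. the element position
// in row-major order; std::nullopt is returned if the shape cannot be
// flattened (e.g. a non-constant insertelement index).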
381
382namespace {
383/// Specifies the way the mask should be analyzed for undefs/poisonous elements
384/// in the shuffle mask.
385enum class UseMask {
386 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
387 ///< check for the mask elements for the first argument (mask
388 ///< indices are in range [0:VF)).
389 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
390 ///< for the mask elements for the second argument (mask indices
391 ///< are in range [VF:2*VF))
392 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
393 ///< future shuffle elements and mark them as used for future
394 ///< shuffles. Non-undef elements are considered unused since
395 ///< they're already marked as used in the mask.
396};
397} // namespace
398
399/// Prepares a use bitset for the given mask either for the first argument or
400/// for the second.
402 UseMask MaskArg) {
403 SmallBitVector UseMask(VF, true);
404 for (auto [Idx, Value] : enumerate(Mask)) {
405 if (Value == PoisonMaskElem) {
406 if (MaskArg == UseMask::UndefsAsMask)
407 UseMask.reset(Idx);
408 continue;
409 }
410 if (MaskArg == UseMask::FirstArg && Value < VF)
411 UseMask.reset(Value);
412 else if (MaskArg == UseMask::SecondArg && Value >= VF)
413 UseMask.reset(Value - VF);
414 }
415 return UseMask;
416}
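// Illustrative sketch (hypothetical mask): with VF == 4 and
// Mask == {0, 5, PoisonMaskElem, 2},
//   buildUseMask(4, Mask, UseMask::FirstArg)  clears bits 0 and 2, and
//   buildUseMask(4, Mask, UseMask::SecondArg) clears bit 1 (element 5 - VF),
// i.e. a cleared bit marks a lane of the corresponding operand as used.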
417
418/// Checks if the given value is actually an undefined constant vector.
419/// Also, if the \p UseMask is not empty, tries to check if the non-masked
420/// elements actually mask the insertelement buildvector, if any.
421template <bool IsPoisonOnly = false>
423 const SmallBitVector &UseMask = {}) {
424 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
425 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
426 if (isa<T>(V))
427 return Res;
428 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
429 if (!VecTy)
430 return Res.reset();
431 auto *C = dyn_cast<Constant>(V);
432 if (!C) {
433 if (!UseMask.empty()) {
434 const Value *Base = V;
435 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
436 Base = II->getOperand(0);
437 if (isa<T>(II->getOperand(1)))
438 continue;
439 std::optional<unsigned> Idx = getInsertIndex(II);
440 if (!Idx) {
441 Res.reset();
442 return Res;
443 }
444 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
445 Res.reset(*Idx);
446 }
447 // TODO: Add analysis for shuffles here too.
448 if (V == Base) {
449 Res.reset();
450 } else {
451 SmallBitVector SubMask(UseMask.size(), false);
452 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
453 }
454 } else {
455 Res.reset();
456 }
457 return Res;
458 }
459 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
460 if (Constant *Elem = C->getAggregateElement(I))
461 if (!isa<T>(Elem) &&
462 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
463 Res.reset(I);
464 }
465 return Res;
466}
467
468/// Checks if the vector of instructions can be represented as a shuffle, like:
469/// %x0 = extractelement <4 x i8> %x, i32 0
470/// %x3 = extractelement <4 x i8> %x, i32 3
471/// %y1 = extractelement <4 x i8> %y, i32 1
472/// %y2 = extractelement <4 x i8> %y, i32 2
473/// %x0x0 = mul i8 %x0, %x0
474/// %x3x3 = mul i8 %x3, %x3
475/// %y1y1 = mul i8 %y1, %y1
476/// %y2y2 = mul i8 %y2, %y2
477/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
478/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
479/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
480/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
481/// ret <4 x i8> %ins4
482/// can be transformed into:
483/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
484/// i32 6>
485/// %2 = mul <4 x i8> %1, %1
486/// ret <4 x i8> %2
487/// \p Mask will contain the shuffle mask equivalent to the extracted elements.
488/// TODO: Can we split off and reuse the shuffle mask detection from
489/// ShuffleVectorInst/getShuffleCost?
490static std::optional<TargetTransformInfo::ShuffleKind>
492 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
493 if (It == VL.end())
494 return std::nullopt;
495 auto *EI0 = cast<ExtractElementInst>(*It);
496 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
497 return std::nullopt;
498 unsigned Size =
499 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
500 Value *Vec1 = nullptr;
501 Value *Vec2 = nullptr;
502 enum ShuffleMode { Unknown, Select, Permute };
503 ShuffleMode CommonShuffleMode = Unknown;
504 Mask.assign(VL.size(), PoisonMaskElem);
505 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
506 // Undef can be represented as an undef element in a vector.
507 if (isa<UndefValue>(VL[I]))
508 continue;
509 auto *EI = cast<ExtractElementInst>(VL[I]);
510 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
511 return std::nullopt;
512 auto *Vec = EI->getVectorOperand();
513 // We can extractelement from undef or poison vector.
514 if (isUndefVector(Vec).all())
515 continue;
516 // All vector operands must have the same number of vector elements.
517 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
518 return std::nullopt;
519 if (isa<UndefValue>(EI->getIndexOperand()))
520 continue;
521 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
522 if (!Idx)
523 return std::nullopt;
524 // Undefined behavior if Idx is negative or >= Size.
525 if (Idx->getValue().uge(Size))
526 continue;
527 unsigned IntIdx = Idx->getValue().getZExtValue();
528 Mask[I] = IntIdx;
529 // For correct shuffling we have to have at most 2 different vector operands
530 // in all extractelement instructions.
531 if (!Vec1 || Vec1 == Vec) {
532 Vec1 = Vec;
533 } else if (!Vec2 || Vec2 == Vec) {
534 Vec2 = Vec;
535 Mask[I] += Size;
536 } else {
537 return std::nullopt;
538 }
539 if (CommonShuffleMode == Permute)
540 continue;
541 // If the extract index is not the same as the operation number, it is a
542 // permutation.
543 if (IntIdx != I) {
544 CommonShuffleMode = Permute;
545 continue;
546 }
547 CommonShuffleMode = Select;
548 }
549 // If we're not crossing lanes in different vectors, consider it as blending.
550 if (CommonShuffleMode == Select && Vec2)
552 // If Vec2 was never used, we have a permutation of a single vector; otherwise
553 // we have a permutation of 2 vectors.
556}
557
558/// \returns True if Extract{Value,Element} instruction extracts element Idx.
559static std::optional<unsigned> getExtractIndex(Instruction *E) {
560 unsigned Opcode = E->getOpcode();
561 assert((Opcode == Instruction::ExtractElement ||
562 Opcode == Instruction::ExtractValue) &&
563 "Expected extractelement or extractvalue instruction.");
564 if (Opcode == Instruction::ExtractElement) {
565 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
566 if (!CI)
567 return std::nullopt;
568 return CI->getZExtValue();
569 }
570 auto *EI = cast<ExtractValueInst>(E);
571 if (EI->getNumIndices() != 1)
572 return std::nullopt;
573 return *EI->idx_begin();
574}
575
576namespace {
577
578/// Main data required for vectorization of instructions.
579struct InstructionsState {
580 /// The very first instruction in the list with the main opcode.
581 Value *OpValue = nullptr;
582
583 /// The main/alternate instruction.
584 Instruction *MainOp = nullptr;
585 Instruction *AltOp = nullptr;
586
587 /// The main/alternate opcodes for the list of instructions.
588 unsigned getOpcode() const {
589 return MainOp ? MainOp->getOpcode() : 0;
590 }
591
592 unsigned getAltOpcode() const {
593 return AltOp ? AltOp->getOpcode() : 0;
594 }
595
596 /// Some of the instructions in the list have alternate opcodes.
597 bool isAltShuffle() const { return AltOp != MainOp; }
598
599 bool isOpcodeOrAlt(Instruction *I) const {
600 unsigned CheckedOpcode = I->getOpcode();
601 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
602 }
603
604 InstructionsState() = delete;
605 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
606 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
607};
608
609} // end anonymous namespace
610
611/// Chooses the correct key for scheduling data. If \p Op has the same (or
612/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
613/// OpValue.
614static Value *isOneOf(const InstructionsState &S, Value *Op) {
615 auto *I = dyn_cast<Instruction>(Op);
616 if (I && S.isOpcodeOrAlt(I))
617 return Op;
618 return S.OpValue;
619}
620
621/// \returns true if \p Opcode is allowed as part of the main/alternate
622/// instruction for SLP vectorization.
623///
624/// Example of unsupported opcode is SDIV that can potentially cause UB if the
625/// "shuffled out" lane would result in division by zero.
626static bool isValidForAlternation(unsigned Opcode) {
627 if (Instruction::isIntDivRem(Opcode))
628 return false;
629
630 return true;
631}
632
633static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
634 const TargetLibraryInfo &TLI,
635 unsigned BaseIndex = 0);
636
637/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
638/// compatible instructions or constants, or just some other regular values.
639static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
640 Value *Op1, const TargetLibraryInfo &TLI) {
641 return (isConstant(BaseOp0) && isConstant(Op0)) ||
642 (isConstant(BaseOp1) && isConstant(Op1)) ||
643 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
644 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
645 BaseOp0 == Op0 || BaseOp1 == Op1 ||
646 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
647 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
648}
649
650/// \returns true if a compare instruction \p CI has a similar "look" and the
651/// same predicate as \p BaseCI, either "as is" or with its operands and
652/// predicate swapped; false otherwise.
653static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
654 const TargetLibraryInfo &TLI) {
655 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
656 "Assessing comparisons of different types?");
657 CmpInst::Predicate BasePred = BaseCI->getPredicate();
658 CmpInst::Predicate Pred = CI->getPredicate();
660
661 Value *BaseOp0 = BaseCI->getOperand(0);
662 Value *BaseOp1 = BaseCI->getOperand(1);
663 Value *Op0 = CI->getOperand(0);
664 Value *Op1 = CI->getOperand(1);
665
666 return (BasePred == Pred &&
667 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
668 (BasePred == SwappedPred &&
669 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
670}
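// Illustrative sketch (hypothetical IR): the pair
//   %a = icmp slt i32 %x, %y
//   %b = icmp sgt i32 %y, %x
// is accepted, since %b equals %a once both its operands and its predicate are
// swapped (sgt is the swapped form of slt).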
671
672/// \returns an analysis of the instructions in \p VL described in
673/// InstructionsState, i.e. the opcode with which we suppose the whole list
674/// could be vectorized even if its structure is diverse.
675static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
676 const TargetLibraryInfo &TLI,
677 unsigned BaseIndex) {
678 // Make sure these are all Instructions.
679 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
680 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
681
682 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
683 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
684 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
685 CmpInst::Predicate BasePred =
686 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
688 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
689 unsigned AltOpcode = Opcode;
690 unsigned AltIndex = BaseIndex;
691
692 bool SwappedPredsCompatible = [&]() {
693 if (!IsCmpOp)
694 return false;
695 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
696 UniquePreds.insert(BasePred);
697 UniqueNonSwappedPreds.insert(BasePred);
698 for (Value *V : VL) {
699 auto *I = dyn_cast<CmpInst>(V);
700 if (!I)
701 return false;
702 CmpInst::Predicate CurrentPred = I->getPredicate();
703 CmpInst::Predicate SwappedCurrentPred =
704 CmpInst::getSwappedPredicate(CurrentPred);
705 UniqueNonSwappedPreds.insert(CurrentPred);
706 if (!UniquePreds.contains(CurrentPred) &&
707 !UniquePreds.contains(SwappedCurrentPred))
708 UniquePreds.insert(CurrentPred);
709 }
710 // If the total number of predicates is > 2, but only 2 remain once swapped
711 // predicates are treated as compatible, consider swappable predicates as
712 // compatible opcodes, not alternates.
713 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
714 }();
715 // Check for one alternate opcode from another BinaryOperator.
716 // TODO - generalize to support all operators (types, calls etc.).
717 auto *IBase = cast<Instruction>(VL[BaseIndex]);
718 Intrinsic::ID BaseID = 0;
719 SmallVector<VFInfo> BaseMappings;
720 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
722 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
723 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
724 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
725 }
726 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
727 auto *I = cast<Instruction>(VL[Cnt]);
728 unsigned InstOpcode = I->getOpcode();
729 if (IsBinOp && isa<BinaryOperator>(I)) {
730 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
731 continue;
732 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
733 isValidForAlternation(Opcode)) {
734 AltOpcode = InstOpcode;
735 AltIndex = Cnt;
736 continue;
737 }
738 } else if (IsCastOp && isa<CastInst>(I)) {
739 Value *Op0 = IBase->getOperand(0);
740 Type *Ty0 = Op0->getType();
741 Value *Op1 = I->getOperand(0);
742 Type *Ty1 = Op1->getType();
743 if (Ty0 == Ty1) {
744 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
745 continue;
746 if (Opcode == AltOpcode) {
748 isValidForAlternation(InstOpcode) &&
749 "Cast isn't safe for alternation, logic needs to be updated!");
750 AltOpcode = InstOpcode;
751 AltIndex = Cnt;
752 continue;
753 }
754 }
755 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
756 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
757 Type *Ty0 = BaseInst->getOperand(0)->getType();
758 Type *Ty1 = Inst->getOperand(0)->getType();
759 if (Ty0 == Ty1) {
760 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
761 // Check for compatible operands. If the corresponding operands are not
762 // compatible, we need to perform alternate vectorization.
763 CmpInst::Predicate CurrentPred = Inst->getPredicate();
764 CmpInst::Predicate SwappedCurrentPred =
765 CmpInst::getSwappedPredicate(CurrentPred);
766
767 if ((E == 2 || SwappedPredsCompatible) &&
768 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
769 continue;
770
771 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
772 continue;
773 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
774 if (AltIndex != BaseIndex) {
775 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
776 continue;
777 } else if (BasePred != CurrentPred) {
778 assert(
779 isValidForAlternation(InstOpcode) &&
780 "CmpInst isn't safe for alternation, logic needs to be updated!");
781 AltIndex = Cnt;
782 continue;
783 }
784 CmpInst::Predicate AltPred = AltInst->getPredicate();
785 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
786 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
787 continue;
788 }
789 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
790 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
791 if (Gep->getNumOperands() != 2 ||
792 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
793 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
794 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
796 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
797 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
798 auto *BaseLI = cast<LoadInst>(IBase);
799 if (!LI->isSimple() || !BaseLI->isSimple())
800 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
801 } else if (auto *Call = dyn_cast<CallInst>(I)) {
802 auto *CallBase = cast<CallInst>(IBase);
803 if (Call->getCalledFunction() != CallBase->getCalledFunction())
804 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
805 if (Call->hasOperandBundles() &&
806 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
807 Call->op_begin() + Call->getBundleOperandsEndIndex(),
808 CallBase->op_begin() +
810 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
812 if (ID != BaseID)
813 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
814 if (!ID) {
815 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
816 if (Mappings.size() != BaseMappings.size() ||
817 Mappings.front().ISA != BaseMappings.front().ISA ||
818 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
819 Mappings.front().VectorName != BaseMappings.front().VectorName ||
820 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
821 Mappings.front().Shape.Parameters !=
822 BaseMappings.front().Shape.Parameters)
823 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
824 }
825 }
826 continue;
827 }
828 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
829 }
830
831 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
832 cast<Instruction>(VL[AltIndex]));
833}
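// Illustrative sketch (hypothetical bundles): for VL = {add, sub, add, sub}
// over one type, the returned state has MainOp == the first add and
// AltOp == the first sub, so isAltShuffle() is true and the bundle may later
// be emitted as an add and a sub blended by a shuffle. A bundle such as
// {add, udiv} instead yields a state with a null MainOp (getOpcode() == 0),
// because integer division is not valid for alternation.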
834
835/// \returns true if all of the values in \p VL have the same type or false
836/// otherwise.
838 Type *Ty = VL.front()->getType();
839 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
840}
841
842/// \returns True if an in-tree use also needs an extract. This refers to a
843/// possible scalar operand in a vectorized instruction.
844static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
845 TargetLibraryInfo *TLI) {
846 unsigned Opcode = UserInst->getOpcode();
847 switch (Opcode) {
848 case Instruction::Load: {
849 LoadInst *LI = cast<LoadInst>(UserInst);
850 return (LI->getPointerOperand() == Scalar);
851 }
852 case Instruction::Store: {
853 StoreInst *SI = cast<StoreInst>(UserInst);
854 return (SI->getPointerOperand() == Scalar);
855 }
856 case Instruction::Call: {
857 CallInst *CI = cast<CallInst>(UserInst);
859 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
860 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
861 Arg.value().get() == Scalar;
862 });
863 }
864 default:
865 return false;
866 }
867}
868
869/// \returns the AA location that is being accessed by the instruction.
871 if (StoreInst *SI = dyn_cast<StoreInst>(I))
872 return MemoryLocation::get(SI);
873 if (LoadInst *LI = dyn_cast<LoadInst>(I))
874 return MemoryLocation::get(LI);
875 return MemoryLocation();
876}
877
878/// \returns True if the instruction is not a volatile or atomic load/store.
879static bool isSimple(Instruction *I) {
880 if (LoadInst *LI = dyn_cast<LoadInst>(I))
881 return LI->isSimple();
882 if (StoreInst *SI = dyn_cast<StoreInst>(I))
883 return SI->isSimple();
884 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
885 return !MI->isVolatile();
886 return true;
887}
888
889/// Shuffles \p Mask in accordance with the given \p SubMask.
890/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
891/// one but two input vectors.
892static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
893 bool ExtendingManyInputs = false) {
894 if (SubMask.empty())
895 return;
896 assert(
897 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
898 // Check if input scalars were extended to match the size of other node.
899 (SubMask.size() == Mask.size() &&
900 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
901 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
902 "SubMask with many inputs support must be larger than the mask.");
903 if (Mask.empty()) {
904 Mask.append(SubMask.begin(), SubMask.end());
905 return;
906 }
907 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
908 int TermValue = std::min(Mask.size(), SubMask.size());
909 for (int I = 0, E = SubMask.size(); I < E; ++I) {
910 if (SubMask[I] == PoisonMaskElem ||
911 (!ExtendingManyInputs &&
912 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
913 continue;
914 NewMask[I] = Mask[SubMask[I]];
915 }
916 Mask.swap(NewMask);
917}
918
919/// Order may have elements assigned the special value (size), which is out of
920/// bounds. Such indices only appear in places which correspond to undef values
921/// (see canReuseExtract for details) and are used to prevent undef values from
922/// affecting operand ordering.
923/// The first loop below simply finds all unused indices, and then the next loop
924/// nest assigns these indices to the undef value positions.
925/// In the example below, Order has two undef positions, which are assigned the
926/// values 3 and 7 respectively:
927/// before: 6 9 5 4 9 2 1 0
928/// after: 6 3 5 4 7 2 1 0
930 const unsigned Sz = Order.size();
931 SmallBitVector UnusedIndices(Sz, /*t=*/true);
932 SmallBitVector MaskedIndices(Sz);
933 for (unsigned I = 0; I < Sz; ++I) {
934 if (Order[I] < Sz)
935 UnusedIndices.reset(Order[I]);
936 else
937 MaskedIndices.set(I);
938 }
939 if (MaskedIndices.none())
940 return;
941 assert(UnusedIndices.count() == MaskedIndices.count() &&
942 "Non-synced masked/available indices.");
943 int Idx = UnusedIndices.find_first();
944 int MIdx = MaskedIndices.find_first();
945 while (MIdx >= 0) {
946 assert(Idx >= 0 && "Indices must be synced.");
947 Order[MIdx] = Idx;
948 Idx = UnusedIndices.find_next(Idx);
949 MIdx = MaskedIndices.find_next(MIdx);
950 }
951}
952
953namespace llvm {
954
956 SmallVectorImpl<int> &Mask) {
957 Mask.clear();
958 const unsigned E = Indices.size();
959 Mask.resize(E, PoisonMaskElem);
960 for (unsigned I = 0; I < E; ++I)
961 Mask[Indices[I]] = I;
962}
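// Illustrative sketch: for Indices == {1, 2, 0} the loop above produces
// Mask == {2, 0, 1}, i.e. Mask[Indices[I]] == I, so applying Mask undoes the
// reordering described by Indices.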
963
964/// Reorders the list of scalars in accordance with the given \p Mask.
966 ArrayRef<int> Mask) {
967 assert(!Mask.empty() && "Expected non-empty mask.");
968 SmallVector<Value *> Prev(Scalars.size(),
969 UndefValue::get(Scalars.front()->getType()));
970 Prev.swap(Scalars);
971 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
972 if (Mask[I] != PoisonMaskElem)
973 Scalars[Mask[I]] = Prev[I];
974}
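// Illustrative sketch (hypothetical scalars): with Scalars == {a, b, c} and
// Mask == {2, 0, 1}, each Prev[I] is stored to Scalars[Mask[I]], giving
// {b, c, a}; lanes whose mask element is PoisonMaskElem keep the undef
// placeholder created above.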
975
976/// Checks if the provided value does not require scheduling. It does not
977/// require scheduling if this is not an instruction or it is an instruction
978/// that does not read/write memory and all operands are either not instructions,
979/// or are phi nodes or instructions from different blocks.
981 auto *I = dyn_cast<Instruction>(V);
982 if (!I)
983 return true;
984 return !mayHaveNonDefUseDependency(*I) &&
985 all_of(I->operands(), [I](Value *V) {
986 auto *IO = dyn_cast<Instruction>(V);
987 if (!IO)
988 return true;
989 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
990 });
991}
992
993/// Checks if the provided value does not require scheduling. It does not
994/// require scheduling if this is not an instruction or it is an instruction
995/// that does not read/write memory and all users are phi nodes or instructions
996/// from different blocks.
997static bool isUsedOutsideBlock(Value *V) {
998 auto *I = dyn_cast<Instruction>(V);
999 if (!I)
1000 return true;
1001 // Limits the number of uses to save compile time.
1002 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1003 all_of(I->users(), [I](User *U) {
1004 auto *IU = dyn_cast<Instruction>(U);
1005 if (!IU)
1006 return true;
1007 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1008 });
1009}
1010
1011/// Checks if the specified value does not require scheduling. It does not
1012/// require scheduling if all operands and all users do not need to be scheduled
1013/// in the current basic block.
1016}
1017
1018/// Checks if the specified array of instructions does not require scheduling.
1019/// This is so if all instructions either have operands that do not require
1020/// scheduling, or have users that do not require scheduling since they are phis
1021/// or reside in other basic blocks.
1023 return !VL.empty() &&
1025}
1026
1027namespace slpvectorizer {
1028
1029/// Bottom Up SLP Vectorizer.
1030class BoUpSLP {
1031 struct TreeEntry;
1032 struct ScheduleData;
1035
1036public:
1037 /// Tracks the state we can represent the loads in the given sequence.
1038 enum class LoadsState {
1039 Gather,
1040 Vectorize,
1043 };
1044
1052
1054 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1057 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1058 AC(AC), DB(DB), DL(DL), ORE(ORE),
1059 Builder(Se->getContext(), TargetFolder(*DL)) {
1060 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1061 // Use the vector register size specified by the target unless overridden
1062 // by a command-line option.
1063 // TODO: It would be better to limit the vectorization factor based on
1064 // data type rather than just register size. For example, x86 AVX has
1065 // 256-bit registers, but it does not support integer operations
1066 // at that width (that requires AVX2).
1067 if (MaxVectorRegSizeOption.getNumOccurrences())
1068 MaxVecRegSize = MaxVectorRegSizeOption;
1069 else
1070 MaxVecRegSize =
1072 .getFixedValue();
1073
1074 if (MinVectorRegSizeOption.getNumOccurrences())
1075 MinVecRegSize = MinVectorRegSizeOption;
1076 else
1077 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1078 }
1079
1080 /// Vectorize the tree that starts with the elements in \p VL.
1081 /// Returns the vectorized root.
1083
1084 /// Vectorize the tree but with the list of externally used values \p
1085 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1086 /// generated extractvalue instructions.
1087 /// \param ReplacedExternals contains the list of replaced external values
1088 /// {scalar, replace} after emitting extractelement for external uses.
1089 Value *
1090 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1091 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1092 Instruction *ReductionRoot = nullptr);
1093
1094 /// \returns the cost incurred by unwanted spills and fills, caused by
1095 /// holding live values over call sites.
1097
1098 /// \returns the vectorization cost of the subtree that starts at \p VL.
1099 /// A negative number means that this is profitable.
1100 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1101
1102 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1103 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1104 void buildTree(ArrayRef<Value *> Roots,
1105 const SmallDenseSet<Value *> &UserIgnoreLst);
1106
1107 /// Construct a vectorizable tree that starts at \p Roots.
1108 void buildTree(ArrayRef<Value *> Roots);
1109
1110 /// Returns whether the root node has in-tree uses.
1112 return !VectorizableTree.empty() &&
1113 !VectorizableTree.front()->UserTreeIndices.empty();
1114 }
1115
1116 /// Return the scalars of the root node.
1118 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1119 return VectorizableTree.front()->Scalars;
1120 }
1121
1122 /// Builds external uses of the vectorized scalars, i.e. the list of
1123 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1124 /// ExternallyUsedValues contains an additional list of external uses to handle
1125 /// vectorization of reductions.
1126 void
1127 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1128
1129 /// Transforms graph nodes to target specific representations, if profitable.
1130 void transformNodes();
1131
1132 /// Clear the internal data structures that are created by 'buildTree'.
1133 void deleteTree() {
1134 VectorizableTree.clear();
1135 ScalarToTreeEntry.clear();
1136 MultiNodeScalars.clear();
1137 MustGather.clear();
1138 NonScheduledFirst.clear();
1139 EntryToLastInstruction.clear();
1140 ExternalUses.clear();
1141 ExternalUsesAsGEPs.clear();
1142 for (auto &Iter : BlocksSchedules) {
1143 BlockScheduling *BS = Iter.second.get();
1144 BS->clear();
1145 }
1146 MinBWs.clear();
1147 ReductionBitWidth = 0;
1148 CastMaxMinBWSizes.reset();
1149 ExtraBitWidthNodes.clear();
1150 InstrElementSize.clear();
1151 UserIgnoreList = nullptr;
1152 PostponedGathers.clear();
1153 ValueToGatherNodes.clear();
1154 }
1155
1156 unsigned getTreeSize() const { return VectorizableTree.size(); }
1157
1158 /// Perform LICM and CSE on the newly generated gather sequences.
1160
1161 /// Checks if the specified gather tree entry \p TE can be represented as a
1162 /// shuffled vector entry + (possibly) permutation with other gathers. It
1163 /// implements the checks only for possibly ordered scalars (Loads,
1164 /// ExtractElement, ExtractValue), which can be part of the graph.
1165 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1166
1167 /// Sort loads into increasing pointer offsets to allow greater clustering.
1168 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1169
1170 /// Gets reordering data for the given tree entry. If the entry is vectorized,
1171 /// just return ReorderIndices; otherwise check if the scalars can be
1172 /// reordered and return the most optimal order.
1173 /// \return std::nullopt if ordering is not important, empty order, if
1174 /// identity order is important, or the actual order.
1175 /// \param TopToBottom If true, include the order of vectorized stores and
1176 /// insertelement nodes, otherwise skip them.
1177 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1178 bool TopToBottom);
1179
1180 /// Reorders the current graph to the most profitable order starting from the
1181 /// root node to the leaf nodes. The best order is chosen only from the nodes
1182 /// of the same size (vectorization factor). Smaller nodes are considered
1183 /// parts of a subgraph with a smaller VF and are reordered independently. We
1184 /// can do this because we still need to extend smaller nodes to the wider VF,
1185 /// and we can merge the reordering shuffles with the widening shuffles.
1186 void reorderTopToBottom();
1187
1188 /// Reorders the current graph to the most profitable order starting from
1189 /// leaves to the root. It allows rotating small subgraphs and reduces the
1190 /// number of reshuffles if the leaf nodes use the same order. In this case we
1191 /// can merge the orders and just shuffle the user node instead of shuffling its
1192 /// operands. Moreover, even when the leaf nodes have different orders, it
1193 /// allows sinking the reordering in the graph closer to the root node and
1194 /// merging it later during analysis.
1195 void reorderBottomToTop(bool IgnoreReorder = false);
1196
1197 /// \return The vector element size in bits to use when vectorizing the
1198 /// expression tree ending at \p V. If V is a store, the size is the width of
1199 /// the stored value. Otherwise, the size is the width of the largest loaded
1200 /// value reaching V. This method is used by the vectorizer to calculate
1201 /// vectorization factors.
1202 unsigned getVectorElementSize(Value *V);
1203
1204 /// Compute the minimum type sizes required to represent the entries in a
1205 /// vectorizable tree.
1207
1208 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1209 unsigned getMaxVecRegSize() const {
1210 return MaxVecRegSize;
1211 }
1212
1213 // \returns minimum vector register size as set by cl::opt.
1214 unsigned getMinVecRegSize() const {
1215 return MinVecRegSize;
1216 }
1217
1218 unsigned getMinVF(unsigned Sz) const {
1219 return std::max(2U, getMinVecRegSize() / Sz);
1220 }
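 // Illustrative sketch: with the default MinVecRegSize of 128 bits and 32-bit
 // elements (Sz == 32), getMinVF returns std::max(2U, 128 / 32) == 4.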
1221
1222 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1223 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1224 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1225 return MaxVF ? MaxVF : UINT_MAX;
1226 }
1227
1228 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1229 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1230 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1231 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1232 ///
1233 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1234 unsigned canMapToVector(Type *T) const;
1235
1236 /// \returns True if the VectorizableTree is both tiny and not fully
1237 /// vectorizable. We do not vectorize such trees.
1238 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1239
1240 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1241 /// can be load combined in the backend. Load combining may not be allowed in
1242 /// the IR optimizer, so we do not want to alter the pattern. For example,
1243 /// partially transforming a scalar bswap() pattern into vector code is
1244 /// effectively impossible for the backend to undo.
1245 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1246 /// may not be necessary.
1247 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1248
1249 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1250 /// can be load combined in the backend. Load combining may not be allowed in
1251 /// the IR optimizer, so we do not want to alter the pattern. For example,
1252 /// partially transforming a scalar bswap() pattern into vector code is
1253 /// effectively impossible for the backend to undo.
1254 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1255 /// may not be necessary.
1256 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1257
1258 /// Checks if the given array of loads can be represented as vectorized,
1259 /// scatter, or just a simple gather.
1260 /// \param VL list of loads.
1261 /// \param VL0 main load value.
1262 /// \param Order returned order of load instructions.
1263 /// \param PointerOps returned list of pointer operands.
1264 /// \param TryRecursiveCheck used to check if a long masked gather can be
1265 /// represented as a series of loads/insert-subvector operations, if profitable.
1268 SmallVectorImpl<Value *> &PointerOps,
1269 bool TryRecursiveCheck = true) const;
1270
1272
1273 /// This structure holds any data we need about the edges being traversed
1274 /// during buildTree_rec(). We keep track of:
1275 /// (i) the user TreeEntry index, and
1276 /// (ii) the index of the edge.
1277 struct EdgeInfo {
1278 EdgeInfo() = default;
1279 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1281 /// The user TreeEntry.
1282 TreeEntry *UserTE = nullptr;
1283 /// The operand index of the use.
1284 unsigned EdgeIdx = UINT_MAX;
1285#ifndef NDEBUG
1287 const BoUpSLP::EdgeInfo &EI) {
1288 EI.dump(OS);
1289 return OS;
1290 }
1291 /// Debug print.
1292 void dump(raw_ostream &OS) const {
1293 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1294 << " EdgeIdx:" << EdgeIdx << "}";
1295 }
1296 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1297#endif
1298 bool operator == (const EdgeInfo &Other) const {
1299 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1300 }
1301 };
1302
1303 /// A helper class used for scoring candidates for two consecutive lanes.
1305 const TargetLibraryInfo &TLI;
1306 const DataLayout &DL;
1307 ScalarEvolution &SE;
1308 const BoUpSLP &R;
1309 int NumLanes; // Total number of lanes (aka vectorization factor).
1310 int MaxLevel; // The maximum recursion depth for accumulating score.
1311
1312 public:
1314 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1315 int MaxLevel)
1316 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1317 MaxLevel(MaxLevel) {}
1318
1319 // The hard-coded scores listed here are not very important, though it shall
1320 // be higher for better matches to improve the resulting cost. When
1321 // computing the scores of matching one sub-tree with another, we are
1322 // basically counting the number of values that are matching. So even if all
1323 // scores are set to 1, we would still get a decent matching result.
1324 // However, sometimes we have to break ties. For example we may have to
1325 // choose between matching loads vs matching opcodes. This is what these
1326 // scores are helping us with: they provide the order of preference. Also,
1327 // this is important if the scalar is externally used or used in another
1328 // tree entry node in the different lane.
1329
1330 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1331 static const int ScoreConsecutiveLoads = 4;
1332 /// The same load multiple times. This should have a better score than
1333 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1334 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
1335 /// a vector load and 1.0 for a broadcast.
1336 static const int ScoreSplatLoads = 3;
1337 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1338 static const int ScoreReversedLoads = 3;
1339 /// A load candidate for masked gather.
1340 static const int ScoreMaskedGatherCandidate = 1;
1341 /// ExtractElementInst from same vector and consecutive indexes.
1342 static const int ScoreConsecutiveExtracts = 4;
1343 /// ExtractElementInst from same vector and reversed indices.
1344 static const int ScoreReversedExtracts = 3;
1345 /// Constants.
1346 static const int ScoreConstants = 2;
1347 /// Instructions with the same opcode.
1348 static const int ScoreSameOpcode = 2;
1349 /// Instructions with alt opcodes (e.g, add + sub).
1350 static const int ScoreAltOpcodes = 1;
1351 /// Identical instructions (a.k.a. splat or broadcast).
1352 static const int ScoreSplat = 1;
1353 /// Matching with an undef is preferable to failing.
1354 static const int ScoreUndef = 1;
1355 /// Score for failing to find a decent match.
1356 static const int ScoreFail = 0;
1357 /// Score if all users are vectorized.
1358 static const int ScoreAllUserVectorized = 1;
1359
1360 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1361 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1362 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1363 /// MainAltOps.
1365 ArrayRef<Value *> MainAltOps) const {
1366 if (!isValidElementType(V1->getType()) ||
1367 !isValidElementType(V2->getType()))
1369
1370 if (V1 == V2) {
1371 if (isa<LoadInst>(V1)) {
1372 // Returns true if the users of V1 and V2 won't need to be extracted.
1373 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1374 // Bail out if we have too many uses to save compilation time.
1375 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1376 return false;
1377
1378 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1379 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1380 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1381 });
1382 };
1383 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1384 };
1385 // A broadcast of a load can be cheaper on some targets.
1386 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1387 ElementCount::getFixed(NumLanes)) &&
1388 ((int)V1->getNumUses() == NumLanes ||
1389 AllUsersAreInternal(V1, V2)))
1391 }
1393 }
1394
1395 auto *LI1 = dyn_cast<LoadInst>(V1);
1396 auto *LI2 = dyn_cast<LoadInst>(V2);
1397 if (LI1 && LI2) {
1398 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1399 !LI2->isSimple())
1401
1402 std::optional<int> Dist = getPointersDiff(
1403 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1404 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1405 if (!Dist || *Dist == 0) {
1406 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1407 getUnderlyingObject(LI2->getPointerOperand()) &&
1408 R.TTI->isLegalMaskedGather(
1409 FixedVectorType::get(LI1->getType(), NumLanes),
1410 LI1->getAlign()))
1413 }
1414 // The distance is too large - still may be profitable to use masked
1415 // loads/gathers.
1416 if (std::abs(*Dist) > NumLanes / 2)
1418 // This will still detect consecutive loads, but we might have "holes"
1419 // in some cases. It is ok for non-power-of-2 vectorization and may produce
1420 // better results. It should not affect current vectorization.
1423 }
1424
1425 auto *C1 = dyn_cast<Constant>(V1);
1426 auto *C2 = dyn_cast<Constant>(V2);
1427 if (C1 && C2)
1429
1430 // Extracts from consecutive indexes of the same vector score better, as
1431 // the extracts could be optimized away.
1432 Value *EV1;
1433 ConstantInt *Ex1Idx;
1434 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1435 // Undefs are always profitable for extractelements.
1436 // Compiler can easily combine poison and extractelement <non-poison> or
1437 // undef and extractelement <poison>. But combining undef +
1438 // extractelement <non-poison-but-may-produce-poison> requires some
1439 // extra operations.
1440 if (isa<UndefValue>(V2))
1441 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1444 Value *EV2 = nullptr;
1445 ConstantInt *Ex2Idx = nullptr;
1446 if (match(V2,
1448 m_Undef())))) {
1449 // Undefs are always profitable for extractelements.
1450 if (!Ex2Idx)
1452 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1454 if (EV2 == EV1) {
1455 int Idx1 = Ex1Idx->getZExtValue();
1456 int Idx2 = Ex2Idx->getZExtValue();
1457 int Dist = Idx2 - Idx1;
1458 // The distance is too large - still may be profitable to use
1459 // shuffles.
1460 if (std::abs(Dist) == 0)
1462 if (std::abs(Dist) > NumLanes / 2)
1466 }
1468 }
1470 }
1471
1472 auto *I1 = dyn_cast<Instruction>(V1);
1473 auto *I2 = dyn_cast<Instruction>(V2);
1474 if (I1 && I2) {
1475 if (I1->getParent() != I2->getParent())
1477 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1478 Ops.push_back(I1);
1479 Ops.push_back(I2);
1480 InstructionsState S = getSameOpcode(Ops, TLI);
1481 // Note: Only consider instructions with <= 2 operands to avoid
1482 // complexity explosion.
1483 if (S.getOpcode() &&
1484 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1485 !S.isAltShuffle()) &&
1486 all_of(Ops, [&S](Value *V) {
1487 return cast<Instruction>(V)->getNumOperands() ==
1488 S.MainOp->getNumOperands();
1489 }))
1490 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1492 }
1493
1494 if (isa<UndefValue>(V2))
1496
1498 }
1499
1500 /// Go through the operands of \p LHS and \p RHS recursively until
1501 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1502 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1503 /// of \p U1 and \p U2), except at the beginning of the recursion where
1504 /// these are set to nullptr.
1505 ///
1506 /// For example:
1507 /// \verbatim
1508 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1509 /// \ / \ / \ / \ /
1510 /// + + + +
1511 /// G1 G2 G3 G4
1512 /// \endverbatim
1513 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1514 /// each level recursively, accumulating the score. It starts from matching
1515 /// the additions at level 0, then moves on to the loads (level 1). The
1516 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1517 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1518 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1519 /// Please note that the order of the operands does not matter, as we
1520 /// evaluate the score of all profitable combinations of operands. In
1521 /// other words the score of G1 and G4 is the same as G1 and G2. This
1522 /// heuristic is based on ideas described in:
1523 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1524 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1525 /// Luís F. W. Góes
1527 Instruction *U2, int CurrLevel,
1528 ArrayRef<Value *> MainAltOps) const {
1529
1530 // Get the shallow score of V1 and V2.
1531 int ShallowScoreAtThisLevel =
1532 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1533
1534 // If reached MaxLevel,
1535 // or if V1 and V2 are not instructions,
1536 // or if they are SPLAT,
1537 // or if they are not consecutive,
1538 // or if profitable to vectorize loads or extractelements, early return
1539 // the current cost.
1540 auto *I1 = dyn_cast<Instruction>(LHS);
1541 auto *I2 = dyn_cast<Instruction>(RHS);
1542 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1543 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1544 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1545 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1546 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1547 ShallowScoreAtThisLevel))
1548 return ShallowScoreAtThisLevel;
1549 assert(I1 && I2 && "Should have early exited.");
1550
1551 // Contains the I2 operand indexes that got matched with I1 operands.
1552 SmallSet<unsigned, 4> Op2Used;
1553
1554 // Recursion towards the operands of I1 and I2. We are trying all possible
1555 // operand pairs, and keeping track of the best score.
1556 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1557 OpIdx1 != NumOperands1; ++OpIdx1) {
1558 // Try to pair op1I with the best operand of I2.
1559 int MaxTmpScore = 0;
1560 unsigned MaxOpIdx2 = 0;
1561 bool FoundBest = false;
1562 // If I2 is commutative try all combinations.
1563 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1564 unsigned ToIdx = isCommutative(I2)
1565 ? I2->getNumOperands()
1566 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1567 assert(FromIdx <= ToIdx && "Bad index");
1568 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1569 // Skip operands already paired with OpIdx1.
1570 if (Op2Used.count(OpIdx2))
1571 continue;
1572 // Recursively calculate the cost at each level
1573 int TmpScore =
1574 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1575 I1, I2, CurrLevel + 1, std::nullopt);
1576 // Look for the best score.
1577 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1578 TmpScore > MaxTmpScore) {
1579 MaxTmpScore = TmpScore;
1580 MaxOpIdx2 = OpIdx2;
1581 FoundBest = true;
1582 }
1583 }
1584 if (FoundBest) {
1585 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1586 Op2Used.insert(MaxOpIdx2);
1587 ShallowScoreAtThisLevel += MaxTmpScore;
1588 }
1589 }
1590 return ShallowScoreAtThisLevel;
1591 }
1592 };
1593 /// A helper data structure to hold the operands of a vector of instructions.
1594 /// This supports a fixed vector length for all operand vectors.
1595 class VLOperands {
1596 /// For each operand we need (i) the value, and (ii) the opcode that it
1597 /// would be attached to if the expression was in a left-linearized form.
1598 /// This is required to avoid illegal operand reordering.
1599 /// For example:
1600 /// \verbatim
1601 /// 0 Op1
1602 /// |/
1603 /// Op1 Op2 Linearized + Op2
1604 /// \ / ----------> |/
1605 /// - -
1606 ///
1607 /// Op1 - Op2 (0 + Op1) - Op2
1608 /// \endverbatim
1609 ///
1610 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1611 ///
1612 /// Another way to think of this is to track all the operations across the
1613 /// path from the operand all the way to the root of the tree and to
1614 /// calculate the operation that corresponds to this path. For example, the
1615 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1616 /// corresponding operation is a '-' (which matches the one in the
1617 /// linearized tree, as shown above).
1618 ///
1619 /// For lack of a better term, we refer to this operation as Accumulated
1620 /// Path Operation (APO).
1621 struct OperandData {
1622 OperandData() = default;
1623 OperandData(Value *V, bool APO, bool IsUsed)
1624 : V(V), APO(APO), IsUsed(IsUsed) {}
1625 /// The operand value.
1626 Value *V = nullptr;
1627 /// TreeEntries only allow a single opcode, or an alternate sequence of
1628 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
1629 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1630 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1631 /// (e.g., Add/Mul)
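/// For example, for the two-lane bundle {A - B, C - D} the first operand
/// (A and C) gets APO 'false', while the second operand (B and D) gets APO
/// 'true', matching the '-' it is attached to in the left-linearized form.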
1632 bool APO = false;
1633 /// Helper data for the reordering function.
1634 bool IsUsed = false;
1635 };
1636
1637 /// During operand reordering, we are trying to select the operand at each
1638 /// lane that best matches the operand at the neighboring lane. Our
1639 /// selection is based on the type of value we are looking for. For example,
1640 /// if the neighboring lane has a load, we need to look for a load that is
1641 /// accessing a consecutive address. These strategies are summarized in the
1642 /// 'ReorderingMode' enumerator.
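/// For example, if the operand in the previous lane is the load A[0], then
/// under ReorderingMode::Load we look for a load of the consecutive element
/// A[1] in the current lane, while under ReorderingMode::Splat we look for
/// the very same value again.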
1643 enum class ReorderingMode {
1644 Load, ///< Matching loads to consecutive memory addresses
1645 Opcode, ///< Matching instructions based on opcode (same or alternate)
1646 Constant, ///< Matching constants
1647 Splat, ///< Matching the same instruction multiple times (broadcast)
1648 Failed, ///< We failed to create a vectorizable group
1649 };
1650
1651 using OperandDataVec = SmallVector<OperandData, 2>;
1652
1653 /// A vector of operand vectors.
1654 SmallVector<OperandDataVec, 4> OpsVec;
1655
1656 const TargetLibraryInfo &TLI;
1657 const DataLayout &DL;
1658 ScalarEvolution &SE;
1659 const BoUpSLP &R;
1660
1661 /// \returns the operand data at \p OpIdx and \p Lane.
1662 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1663 return OpsVec[OpIdx][Lane];
1664 }
1665
1666 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1667 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1668 return OpsVec[OpIdx][Lane];
1669 }
1670
1671 /// Clears the used flag for all entries.
1672 void clearUsed() {
1673 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1674 OpIdx != NumOperands; ++OpIdx)
1675 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1676 ++Lane)
1677 OpsVec[OpIdx][Lane].IsUsed = false;
1678 }
1679
1680 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1681 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1682 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1683 }
1684
1685 /// \param Lane lane of the operands under analysis.
1686 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1687 /// the best candidate.
1688 /// \param Idx operand index of the current candidate value.
1689 /// \returns The additional score due to possible broadcasting of the
1690 /// elements in the lane. It is more profitable to have a power-of-2 number
1691 /// of unique elements in the lane, since it will be vectorized with higher
1692 /// probability after removing duplicates. Currently the SLP vectorizer
1693 /// supports only vectorization of a power-of-2 number of unique scalars.
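/// For example, with 4 lanes where the operand at \p OpIdx in the other
/// three lanes already holds 3 unique instructions, a candidate that adds
/// a 4th unique value wastes no elements (PowerOf2Ceil(4) - 4 == 0), while
/// keeping a value that duplicates one of the 3 wastes one element
/// (PowerOf2Ceil(3) - 3 == 1), so the returned score is 1 in favor of the
/// candidate.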
1694 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1695 Value *IdxLaneV = getData(Idx, Lane).V;
1696 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1697 return 0;
1698 SmallPtrSet<Value *, 4> Uniques;
1699 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1700 if (Ln == Lane)
1701 continue;
1702 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1703 if (!isa<Instruction>(OpIdxLnV))
1704 return 0;
1705 Uniques.insert(OpIdxLnV);
1706 }
1707 int UniquesCount = Uniques.size();
1708 int UniquesCntWithIdxLaneV =
1709 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1710 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1711 int UniquesCntWithOpIdxLaneV =
1712 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1713 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1714 return 0;
1715 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1716 UniquesCntWithOpIdxLaneV) -
1717 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1718 }
1719
1720 /// \param Lane lane of the operands under analysis.
1721 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1722 /// the best candidate.
1723 /// \param Idx operand index of the current candidate value.
1724 /// \returns The additional score for the scalar whose users are all
1725 /// vectorized.
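/// For example, between two otherwise equally scored candidates, the one
/// whose scalar users have all been vectorized gets this extra bonus,
/// since vectorizing it will not require an extra extractelement for
/// external scalar users.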
1726 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1727 Value *IdxLaneV = getData(Idx, Lane).V;
1728 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1729 // Do not care about number of uses for vector-like instructions
1730 // (extractelement/extractvalue with constant indices), they are extracts
1731 // themselves and already externally used. Vectorization of such
1732 // instructions does not add extra extractelement instruction, just may
1733 // remove it.
1734 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1735 isVectorLikeInstWithConstOps(OpIdxLaneV))
1736 return LookAheadHeuristics::ScoreAllUserVectorized;
1737 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1738 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1739 return 0;
1740 return R.areAllUsersVectorized(IdxLaneI)
1741 ? LookAheadHeuristics::ScoreAllUserVectorized
1742 : 0;
1743 }
1744
1745 /// Score scaling factor for fully compatible instructions but with
1746 /// different number of external uses. Allows better selection of the
1747 /// instructions with less external uses.
1748 static const int ScoreScaleFactor = 10;
1749
1750 /// \Returns the look-ahead score, which tells us how much the sub-trees
1751 /// rooted at \p LHS and \p RHS match; the more they match, the higher the
1752 /// score. This helps break ties in an informed way when we cannot decide on
1753 /// the order of the operands by just considering the immediate
1754 /// predecessors.
1755 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1756 int Lane, unsigned OpIdx, unsigned Idx,
1757 bool &IsUsed) {
1758 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1759 LookAheadMaxDepth);
1760 // Keep track of the instruction stack as we recurse into the operands
1761 // during the look-ahead score exploration.
1762 int Score =
1763 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1764 /*CurrLevel=*/1, MainAltOps);
1765 if (Score) {
1766 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1767 if (Score <= -SplatScore) {
1768 // Set the minimum score for splat-like sequence to avoid setting
1769 // failed state.
1770 Score = 1;
1771 } else {
1772 Score += SplatScore;
1773 // Scale score to see the difference between different operands
1774 // and similar operands but all vectorized/not all vectorized
1775 // uses. It does not affect actual selection of the best
1776 // compatible operand in general, just allows to select the
1777 // operand with all vectorized uses.
1778 Score *= ScoreScaleFactor;
1779 Score += getExternalUseScore(Lane, OpIdx, Idx);
1780 IsUsed = true;
1781 }
1782 }
1783 return Score;
1784 }
1785
1786 /// Best defined scores per lanes between the passes. Used to choose the
1787 /// best operand (with the highest score) between the passes.
1788 /// The key - {Operand Index, Lane}.
1789 /// The value - the best score between the passes for the lane and the
1790 /// operand.
1791 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1792 BestScoresPerLanes;
1793
1794 // Search all operands in Ops[*][Lane] for the one that best matches
1795 // Ops[OpIdx][LastLane] and return its operand index.
1796 // If no good match can be found, return std::nullopt.
1797 std::optional<unsigned>
1798 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1799 ArrayRef<ReorderingMode> ReorderingModes,
1800 ArrayRef<Value *> MainAltOps) {
1801 unsigned NumOperands = getNumOperands();
1802
1803 // The operand of the previous lane at OpIdx.
1804 Value *OpLastLane = getData(OpIdx, LastLane).V;
1805
1806 // Our strategy mode for OpIdx.
1807 ReorderingMode RMode = ReorderingModes[OpIdx];
1808 if (RMode == ReorderingMode::Failed)
1809 return std::nullopt;
1810
1811 // The linearized opcode of the operand at OpIdx, Lane.
1812 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1813
1814 // The best operand index and its score.
1815 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1816 // are using the score to differentiate between the two.
1817 struct BestOpData {
1818 std::optional<unsigned> Idx;
1819 unsigned Score = 0;
1820 } BestOp;
1821 BestOp.Score =
1822 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1823 .first->second;
1824
1825 // Track if the operand must be marked as used. If the operand is set to
1826 // Score 1 explicitly (because of a non-power-of-2 number of unique
1827 // scalars), we may want to re-estimate the operands on later iterations.
1828 bool IsUsed =
1829 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1830 // Iterate through all unused operands and look for the best.
1831 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1832 // Get the operand at Idx and Lane.
1833 OperandData &OpData = getData(Idx, Lane);
1834 Value *Op = OpData.V;
1835 bool OpAPO = OpData.APO;
1836
1837 // Skip already selected operands.
1838 if (OpData.IsUsed)
1839 continue;
1840
1841 // Skip if we are trying to move the operand to a position with a
1842 // different opcode in the linearized tree form. This would break the
1843 // semantics.
1844 if (OpAPO != OpIdxAPO)
1845 continue;
1846
1847 // Look for an operand that matches the current mode.
1848 switch (RMode) {
1849 case ReorderingMode::Load:
1850 case ReorderingMode::Constant:
1851 case ReorderingMode::Opcode: {
1852 bool LeftToRight = Lane > LastLane;
1853 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1854 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1855 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1856 OpIdx, Idx, IsUsed);
1857 if (Score > static_cast<int>(BestOp.Score)) {
1858 BestOp.Idx = Idx;
1859 BestOp.Score = Score;
1860 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1861 }
1862 break;
1863 }
1864 case ReorderingMode::Splat:
1865 if (Op == OpLastLane)
1866 BestOp.Idx = Idx;
1867 break;
1868 case ReorderingMode::Failed:
1869 llvm_unreachable("Not expected Failed reordering mode.");
1870 }
1871 }
1872
1873 if (BestOp.Idx) {
1874 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1875 return BestOp.Idx;
1876 }
1877 // If we could not find a good match return std::nullopt.
1878 return std::nullopt;
1879 }
1880
1881 /// Helper for reorderOperandVecs.
1882 /// \returns the lane that we should start reordering from. This is the one
1883 /// that has the fewest operands that can freely move about, or that is less
1884 /// profitable because it already has the most optimal set of operands.
1885 unsigned getBestLaneToStartReordering() const {
1886 unsigned Min = UINT_MAX;
1887 unsigned SameOpNumber = 0;
1888 // std::pair<unsigned, unsigned> is used to implement a simple voting
1889 // algorithm and choose the lane with the fewest operands that can freely
1890 // move about, or that is less profitable because it already has the most
1891 // optimal set of operands. The first unsigned is a counter for voting, the
1892 // second unsigned is the counter of lanes with instructions with
1893 // same/alternate opcodes and same parent basic block.
1894 SmallDenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
1895 // Try to be closer to the original results, if we have multiple lanes
1896 // with same cost. If 2 lanes have the same cost, use the one with the
1897 // lowest index.
1898 for (int I = getNumLanes(); I > 0; --I) {
1899 unsigned Lane = I - 1;
1900 OperandsOrderData NumFreeOpsHash =
1901 getMaxNumOperandsThatCanBeReordered(Lane);
1902 // Compare the number of operands that can move and choose the one with
1903 // the least number.
1904 if (NumFreeOpsHash.NumOfAPOs < Min) {
1905 Min = NumFreeOpsHash.NumOfAPOs;
1906 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1907 HashMap.clear();
1908 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1909 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1910 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1911 // Select the most optimal lane in terms of number of operands that
1912 // should be moved around.
1913 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1914 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1915 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1916 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1917 auto *It = HashMap.find(NumFreeOpsHash.Hash);
1918 if (It == HashMap.end())
1919 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1920 else
1921 ++It->second.first;
1922 }
1923 }
1924 // Select the lane with the minimum counter.
1925 unsigned BestLane = 0;
1926 unsigned CntMin = UINT_MAX;
1927 for (const auto &Data : reverse(HashMap)) {
1928 if (Data.second.first < CntMin) {
1929 CntMin = Data.second.first;
1930 BestLane = Data.second.second;
1931 }
1932 }
1933 return BestLane;
1934 }
1935
1936 /// Data structure that helps to reorder operands.
1937 struct OperandsOrderData {
1938 /// The best number of operands with the same APOs, which can be
1939 /// reordered.
1940 unsigned NumOfAPOs = UINT_MAX;
1941 /// Number of operands with the same/alternate instruction opcode and
1942 /// parent.
1943 unsigned NumOpsWithSameOpcodeParent = 0;
1944 /// Hash for the actual operands ordering.
1945 /// Used to count operands, actually their position id and opcode
1946 /// value. It is used in the voting mechanism to find the lane with the
1947 /// fewest operands that can freely move about, or that is less profitable
1948 /// because it already has the most optimal set of operands. Could be
1949 /// replaced with a SmallVector<unsigned> instead, but the hash code is
1950 /// faster and requires less memory.
1951 unsigned Hash = 0;
1952 };
1953 /// \returns the maximum number of operands that are allowed to be reordered
1954 /// for \p Lane and the number of compatible instructions (with the same
1955 /// parent/opcode). This is used as a heuristic for selecting the first lane
1956 /// to start operand reordering.
1957 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1958 unsigned CntTrue = 0;
1959 unsigned NumOperands = getNumOperands();
1960 // Operands with the same APO can be reordered. We therefore need to count
1961 // how many of them we have for each APO, like this: Cnt[APO] = x.
1962 // Since we only have two APOs, namely true and false, we can avoid using
1963 // a map. Instead we can simply count the number of operands that
1964 // correspond to one of them (in this case the 'true' APO), and calculate
1965 // the other by subtracting it from the total number of operands.
1966 // Operands with the same instruction opcode and parent are more
1967 // profitable since we don't need to move them in many cases; with high
1968 // probability such a lane can already be vectorized effectively.
1969 bool AllUndefs = true;
1970 unsigned NumOpsWithSameOpcodeParent = 0;
1971 Instruction *OpcodeI = nullptr;
1972 BasicBlock *Parent = nullptr;
1973 unsigned Hash = 0;
1974 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1975 const OperandData &OpData = getData(OpIdx, Lane);
1976 if (OpData.APO)
1977 ++CntTrue;
1978 // Use Boyer-Moore majority voting for finding the majority opcode and
1979 // the number of times it occurs.
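// For example, for a lane whose operands have opcodes (add, add, load,
// add), all in the same basic block, the counter evolves 1, 2, 1, 2 and
// OpcodeI ends up pointing at the first add.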
1980 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1981 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
1982 I->getParent() != Parent) {
1983 if (NumOpsWithSameOpcodeParent == 0) {
1984 NumOpsWithSameOpcodeParent = 1;
1985 OpcodeI = I;
1986 Parent = I->getParent();
1987 } else {
1988 --NumOpsWithSameOpcodeParent;
1989 }
1990 } else {
1991 ++NumOpsWithSameOpcodeParent;
1992 }
1993 }
1994 Hash = hash_combine(
1995 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1996 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1997 }
1998 if (AllUndefs)
1999 return {};
2000 OperandsOrderData Data;
2001 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2002 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2003 Data.Hash = Hash;
2004 return Data;
2005 }
2006
2007 /// Go through the instructions in VL and append their operands.
2008 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2009 assert(!VL.empty() && "Bad VL");
2010 assert((empty() || VL.size() == getNumLanes()) &&
2011 "Expected same number of lanes");
2012 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2013 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2014 constexpr unsigned IntrinsicNumOperands = 2;
2015 if (isa<IntrinsicInst>(VL[0]))
2016 NumOperands = IntrinsicNumOperands;
2017 OpsVec.resize(NumOperands);
2018 unsigned NumLanes = VL.size();
2019 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2020 OpsVec[OpIdx].resize(NumLanes);
2021 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2022 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2023 // Our tree has just 3 nodes: the root and two operands.
2024 // It is therefore trivial to get the APO. We only need to check the
2025 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2026 // RHS operand. The LHS operand of both add and sub is never attached
2027 // to an inverse operation in the linearized form, therefore its APO
2028 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2029
2030 // Since operand reordering is performed on groups of commutative
2031 // operations or alternating sequences (e.g., +, -), we can safely
2032 // tell the inverse operations by checking commutativity.
2033 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2034 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2035 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2036 APO, false};
2037 }
2038 }
2039 }
2040
2041 /// \returns the number of operands.
2042 unsigned getNumOperands() const { return OpsVec.size(); }
2043
2044 /// \returns the number of lanes.
2045 unsigned getNumLanes() const { return OpsVec[0].size(); }
2046
2047 /// \returns the operand value at \p OpIdx and \p Lane.
2048 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2049 return getData(OpIdx, Lane).V;
2050 }
2051
2052 /// \returns true if the data structure is empty.
2053 bool empty() const { return OpsVec.empty(); }
2054
2055 /// Clears the data.
2056 void clear() { OpsVec.clear(); }
2057
2058 /// \Returns true if there are enough operands identical to \p Op to fill
2059 /// the whole vector.
2060 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
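/// For example, with 4 lanes and Op == %x, this returns true only if each
/// of the other 3 lanes still has an unused operand equal to %x with the
/// same APO, so that %x could fill the whole vector as a broadcast.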
2061 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2062 bool OpAPO = getData(OpIdx, Lane).APO;
2063 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2064 if (Ln == Lane)
2065 continue;
2066 // This is set to true if we found a candidate for broadcast at Lane.
2067 bool FoundCandidate = false;
2068 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2069 OperandData &Data = getData(OpI, Ln);
2070 if (Data.APO != OpAPO || Data.IsUsed)
2071 continue;
2072 if (Data.V == Op) {
2073 FoundCandidate = true;
2074 Data.IsUsed = true;
2075 break;
2076 }
2077 }
2078 if (!FoundCandidate)
2079 return false;
2080 }
2081 return true;
2082 }
2083
2084 public:
2085 /// Initialize with all the operands of the instruction vector \p RootVL.
2086 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2087 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
2088 // Append all the operands of RootVL.
2089 appendOperandsOfVL(RootVL);
2090 }
2091
2092 /// \Returns a value vector with the operands across all lanes for the
2093 /// operand at \p OpIdx.
2094 ValueList getVL(unsigned OpIdx) const {
2095 ValueList OpVL(OpsVec[OpIdx].size());
2096 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2097 "Expected same num of lanes across all operands");
2098 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2099 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2100 return OpVL;
2101 }
2102
2103 // Performs operand reordering for 2 or more operands.
2104 // The original operands are in OrigOps[OpIdx][Lane].
2105 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2106 void reorder() {
2107 unsigned NumOperands = getNumOperands();
2108 unsigned NumLanes = getNumLanes();
2109 // Each operand has its own mode. We are using this mode to help us select
2110 // the instructions for each lane, so that they match best with the ones
2111 // we have selected so far.
2112 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2113
2114 // This is a greedy single-pass algorithm. We are going over each lane
2115 // once and deciding on the best order right away with no back-tracking.
2116 // However, in order to increase its effectiveness, we start with the lane
2117 // that has operands that can move the least. For example, given the
2118 // following lanes:
2119 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2120 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2121 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2122 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2123 // we will start at Lane 1, since the operands of the subtraction cannot
2124 // be reordered. Then we will visit the rest of the lanes in a circular
2125 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2126
2127 // Find the first lane that we will start our search from.
2128 unsigned FirstLane = getBestLaneToStartReordering();
2129
2130 // Initialize the modes.
2131 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2132 Value *OpLane0 = getValue(OpIdx, FirstLane);
2133 // Keep track if we have instructions with all the same opcode on one
2134 // side.
2135 if (isa<LoadInst>(OpLane0))
2136 ReorderingModes[OpIdx] = ReorderingMode::Load;
2137 else if (isa<Instruction>(OpLane0)) {
2138 // Check if OpLane0 should be broadcast.
2139 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2140 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2141 else
2142 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2143 }
2144 else if (isa<Constant>(OpLane0))
2145 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2146 else if (isa<Argument>(OpLane0))
2147 // Our best hope is a Splat. It may save some cost in some cases.
2148 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2149 else
2150 // NOTE: This should be unreachable.
2151 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2152 }
2153
2154 // Check that we don't have the same operands. There is no need to reorder
2155 // if the operands are just a perfect diamond or shuffled diamond match. Do
2156 // not do it only for possible broadcasts or a non-power-of-2 number of
2157 // scalars (just for now).
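// For example, if the 4 lanes compute (a + b), (b + a), (c + d) and
// (d + c), both operand columns draw from the same four unique values
// {a, b, c, d}, so the lanes already form a (shuffled) diamond match and
// reordering is skipped.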
2158 auto &&SkipReordering = [this]() {
2159 SmallPtrSet<Value *, 4> UniqueValues;
2160 ArrayRef<OperandData> Op0 = OpsVec.front();
2161 for (const OperandData &Data : Op0)
2162 UniqueValues.insert(Data.V);
2163 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2164 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2165 return !UniqueValues.contains(Data.V);
2166 }))
2167 return false;
2168 }
2169 // TODO: Check if we can remove a check for non-power-2 number of
2170 // scalars after full support of non-power-2 vectorization.
2171 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2172 };
2173
2174 // If the initial strategy fails for any of the operand indexes, then we
2175 // perform reordering again in a second pass. This helps avoid assigning
2176 // high priority to the failed strategy, and should improve reordering for
2177 // the non-failed operand indexes.
2178 for (int Pass = 0; Pass != 2; ++Pass) {
2179 // Check if there is no need to reorder the operands, since they are a
2180 // perfect or shuffled diamond match.
2181 // Need to do it to avoid extra external use cost counting for
2182 // shuffled matches, which may cause regressions.
2183 if (SkipReordering())
2184 break;
2185 // Skip the second pass if the first pass did not fail.
2186 bool StrategyFailed = false;
2187 // Mark all operand data as free to use.
2188 clearUsed();
2189 // We keep the original operand order for the FirstLane, so reorder the
2190 // rest of the lanes. We are visiting the nodes in a circular fashion,
2191 // using FirstLane as the center point and increasing the radius
2192 // distance.
2193 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2194 for (unsigned I = 0; I < NumOperands; ++I)
2195 MainAltOps[I].push_back(getData(I, FirstLane).V);
2196
2197 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2198 // Visit the lane on the right and then the lane on the left.
2199 for (int Direction : {+1, -1}) {
2200 int Lane = FirstLane + Direction * Distance;
2201 if (Lane < 0 || Lane >= (int)NumLanes)
2202 continue;
2203 int LastLane = Lane - Direction;
2204 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2205 "Out of bounds");
2206 // Look for a good match for each operand.
2207 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2208 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2209 std::optional<unsigned> BestIdx = getBestOperand(
2210 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2211 // By not selecting a value, we allow the operands that follow to
2212 // select a better matching value. We will get a non-null value in
2213 // the next run of getBestOperand().
2214 if (BestIdx) {
2215 // Swap the current operand with the one returned by
2216 // getBestOperand().
2217 swap(OpIdx, *BestIdx, Lane);
2218 } else {
2219 // We failed to find a best operand, set mode to 'Failed'.
2220 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2221 // Enable the second pass.
2222 StrategyFailed = true;
2223 }
2224 // Try to get the alternate opcode and follow it during analysis.
2225 if (MainAltOps[OpIdx].size() != 2) {
2226 OperandData &AltOp = getData(OpIdx, Lane);
2227 InstructionsState OpS =
2228 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2229 if (OpS.getOpcode() && OpS.isAltShuffle())
2230 MainAltOps[OpIdx].push_back(AltOp.V);
2231 }
2232 }
2233 }
2234 }
2235 // Skip second pass if the strategy did not fail.
2236 if (!StrategyFailed)
2237 break;
2238 }
2239 }
2240
2241#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2242 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2243 switch (RMode) {
2244 case ReorderingMode::Load:
2245 return "Load";
2246 case ReorderingMode::Opcode:
2247 return "Opcode";
2248 case ReorderingMode::Constant:
2249 return "Constant";
2250 case ReorderingMode::Splat:
2251 return "Splat";
2252 case ReorderingMode::Failed:
2253 return "Failed";
2254 }
2255 llvm_unreachable("Unimplemented Reordering Type");
2256 }
2257
2258 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2259 raw_ostream &OS) {
2260 return OS << getModeStr(RMode);
2261 }
2262
2263 /// Debug print.
2264 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2265 printMode(RMode, dbgs());
2266 }
2267
2268 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2269 return printMode(RMode, OS);
2270 }
2271
2272 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2273 const unsigned Indent = 2;
2274 unsigned Cnt = 0;
2275 for (const OperandDataVec &OpDataVec : OpsVec) {
2276 OS << "Operand " << Cnt++ << "\n";
2277 for (const OperandData &OpData : OpDataVec) {
2278 OS.indent(Indent) << "{";
2279 if (Value *V = OpData.V)
2280 OS << *V;
2281 else
2282 OS << "null";
2283 OS << ", APO:" << OpData.APO << "}\n";
2284 }
2285 OS << "\n";
2286 }
2287 return OS;
2288 }
2289
2290 /// Debug print.
2291 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2292#endif
2293 };
2294
2295 /// Evaluate each pair in \p Candidates and return the index into
2296 /// \p Candidates of the pair with the highest score, deemed to have the best
2297 /// chance to form the root of a profitable tree to vectorize. Return
2298 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2299 /// \param Limit Lower limit of the score considered to be good enough.
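/// For example, among the candidate pairs {(A[0], A[1]), (A[0], X)} with
/// A[0] and A[1] being consecutive loads and X an unrelated value, the
/// first pair would normally get the higher look-ahead score and its index
/// would be returned.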
2300 std::optional<int>
2301 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2302 int Limit = LookAheadHeuristics::ScoreFail) const {
2303 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2304 RootLookAheadMaxDepth);
2305 int BestScore = Limit;
2306 std::optional<int> Index;
2307 for (int I : seq<int>(0, Candidates.size())) {
2308 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2309 Candidates[I].second,
2310 /*U1=*/nullptr, /*U2=*/nullptr,
2311 /*Level=*/1, std::nullopt);
2312 if (Score > BestScore) {
2313 BestScore = Score;
2314 Index = I;
2315 }
2316 }
2317 return Index;
2318 }
2319
2320 /// Checks if the instruction is marked for deletion.
2321 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2322
2323 /// Removes an instruction from its block and eventually deletes it.
2324 /// It's like Instruction::eraseFromParent() except that the actual deletion
2325 /// is delayed until BoUpSLP is destructed.
2326 void eraseInstruction(Instruction *I) {
2327 DeletedInstructions.insert(I);
2328 }
2329
2330 /// Checks if the instruction was already analyzed for being possible
2331 /// reduction root.
2332 bool isAnalyzedReductionRoot(Instruction *I) const {
2333 return AnalyzedReductionsRoots.count(I);
2334 }
2335 /// Register given instruction as already analyzed for being possible
2336 /// reduction root.
2337 void analyzedReductionRoot(Instruction *I) {
2338 AnalyzedReductionsRoots.insert(I);
2339 }
2340 /// Checks if the provided list of reduced values was checked already for
2341 /// vectorization.
2342 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2343 return AnalyzedReductionVals.contains(hash_value(VL));
2344 }
2345 /// Adds the list of reduced values to list of already checked values for the
2346 /// vectorization.
2347 void analyzedReductionVals(ArrayRef<Value *> VL) {
2348 AnalyzedReductionVals.insert(hash_value(VL));
2349 }
2350 /// Clear the list of the analyzed reduction root instructions.
2351 void clearReductionData() {
2352 AnalyzedReductionsRoots.clear();
2353 AnalyzedReductionVals.clear();
2354 AnalyzedMinBWVals.clear();
2355 }
2356 /// Checks if the given value is gathered in one of the nodes.
2357 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2358 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2359 }
2360 /// Checks if the given value is gathered in one of the nodes.
2361 bool isGathered(const Value *V) const {
2362 return MustGather.contains(V);
2363 }
2364 /// Checks if the specified value was not scheduled.
2365 bool isNotScheduled(const Value *V) const {
2366 return NonScheduledFirst.contains(V);
2367 }
2368
2369 /// Check if the value is vectorized in the tree.
2370 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2371
2372 ~BoUpSLP();
2373
2374private:
2375 /// Determine if a node \p E can be demoted to a smaller type with a
2376 /// truncation. We collect the entries that will be demoted in ToDemote.
2377 /// \param E Node for analysis
2378 /// \param ToDemote indices of the nodes to be demoted.
2379 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2380 unsigned &BitWidth,
2381 SmallVectorImpl<unsigned> &ToDemote,
2382 DenseSet<const TreeEntry *> &Visited,
2383 unsigned &MaxDepthLevel,
2384 bool &IsProfitableToDemote,
2385 bool IsTruncRoot) const;
2386
2387 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2388 /// reordering (i.e. the operands can be reordered because they have only one
2389 /// user and are reorderable).
2390 /// \param ReorderableGathers List of all gather nodes that require reordering
2391 /// (e.g., gathers of extractelements or partially vectorizable loads).
2392 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2393 /// reordering, subset of \p NonVectorized.
2394 bool
2395 canReorderOperands(TreeEntry *UserTE,
2396 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2397 ArrayRef<TreeEntry *> ReorderableGathers,
2398 SmallVectorImpl<TreeEntry *> &GatherOps);
2399
2400 /// Checks if the given \p TE is a gather node with clustered reused scalars
2401 /// and reorders it per given \p Mask.
2402 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2403
2404 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2405 /// if any. If it is not vectorized (gather node), returns nullptr.
2406 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2407 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2408 TreeEntry *TE = nullptr;
2409 const auto *It = find_if(VL, [&](Value *V) {
2410 TE = getTreeEntry(V);
2411 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2412 return true;
2413 auto It = MultiNodeScalars.find(V);
2414 if (It != MultiNodeScalars.end()) {
2415 for (TreeEntry *E : It->second) {
2416 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2417 TE = E;
2418 return true;
2419 }
2420 }
2421 }
2422 return false;
2423 });
2424 if (It != VL.end()) {
2425 assert(TE->isSame(VL) && "Expected same scalars.");
2426 return TE;
2427 }
2428 return nullptr;
2429 }
2430
2431 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2432 /// if any. If it is not vectorized (gather node), returns nullptr.
2433 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2434 unsigned OpIdx) const {
2435 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2436 const_cast<TreeEntry *>(UserTE), OpIdx);
2437 }
2438
2439 /// Checks if all users of \p I are the part of the vectorization tree.
2440 bool areAllUsersVectorized(
2441 Instruction *I,
2442 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2443
2444 /// Return information about the vector formed for the specified index
2445 /// of a vector of (the same) instruction.
2446 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2447
2448 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2449 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2450
2451 /// \returns Cast context for the given graph node.
2452 TargetTransformInfo::CastContextHint
2453 getCastContextHint(const TreeEntry &TE) const;
2454
2455 /// \returns the cost of the vectorizable entry.
2456 InstructionCost getEntryCost(const TreeEntry *E,
2457 ArrayRef<Value *> VectorizedVals,
2458 SmallPtrSetImpl<Value *> &CheckedExtracts);
2459
2460 /// This is the recursive part of buildTree.
2461 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2462 const EdgeInfo &EI);
2463
2464 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2465 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2466 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2467 /// returns false, setting \p CurrentOrder to either an empty vector or a
2468 /// non-identity permutation that allows reuse of the extract instructions.
2469 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2470 /// extract order.
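/// For example, if \p VL extracts lanes {0, 1, 2, 3} of the same 4-element
/// vector, \p CurrentOrder is the identity permutation and the function
/// returns true; if it extracts lanes {1, 0, 3, 2} instead, the function
/// may return false and set \p CurrentOrder to that non-identity
/// permutation so the original vector can still be reused.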
2471 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2472 SmallVectorImpl<unsigned> &CurrentOrder,
2473 bool ResizeAllowed = false) const;
2474
2475 /// Vectorize a single entry in the tree.
2476 /// \param PostponedPHIs true if the emission of phi nodes needs to be
2477 /// postponed to avoid issues with def-use order.
2478 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2479
2480 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2481 /// \p E.
2482 /// \param PostponedPHIs true if the emission of phi nodes needs to be
2483 /// postponed to avoid issues with def-use order.
2484 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2485
2486 /// Create a new vector from a list of scalar values. Produces a sequence
2487 /// which exploits values reused across lanes, and arranges the inserts
2488 /// for ease of later optimization.
2489 template <typename BVTy, typename ResTy, typename... Args>
2490 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2491
2492 /// Create a new vector from a list of scalar values. Produces a sequence
2493 /// which exploits values reused across lanes, and arranges the inserts
2494 /// for ease of later optimization.
2495 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2496
2497 /// Returns the instruction in the bundle, which can be used as a base point
2498 /// for scheduling. Usually it is the last instruction in the bundle, except
2499 /// for the case when all operands are external (in this case, it is the first
2500 /// instruction in the list).
2501 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2502
2503 /// Tries to find extractelement instructions with constant indices from a
2504 /// fixed vector type and gathers such instructions into a bunch, which is
2505 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
2506 /// attempt was successful, the matched scalars are replaced by poison values
2507 /// in \p VL for future analysis.
2508 std::optional<TargetTransformInfo::ShuffleKind>
2509 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2510 SmallVectorImpl<int> &Mask) const;
2511
2512 /// Tries to find extractelement instructions with constant indices from a
2513 /// fixed vector type and gathers such instructions into a bunch, which is
2514 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
2515 /// attempt was successful, the matched scalars are replaced by poison values
2516 /// in \p VL for future analysis.
2517 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2518 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2519 SmallVectorImpl<int> &Mask,
2520 unsigned NumParts) const;
2521
2522 /// Checks if the gathered \p VL can be represented as a single register
2523 /// shuffle(s) of previous tree entries.
2524 /// \param TE Tree entry checked for permutation.
2525 /// \param VL List of scalars (a subset of the TE scalar), checked for
2526 /// permutations. Must form single-register vector.
2527 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2528 /// commands to build the mask using the original vector value, without
2529 /// relying on the potential reordering.
2530 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2531 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2532 std::optional<TargetTransformInfo::ShuffleKind>
2533 isGatherShuffledSingleRegisterEntry(
2534 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2535 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2536 bool ForOrder);
2537
2538 /// Checks if the gathered \p VL can be represented as multi-register
2539 /// shuffle(s) of previous tree entries.
2540 /// \param TE Tree entry checked for permutation.
2541 /// \param VL List of scalars (a subset of the TE scalar), checked for
2542 /// permutations.
2543 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2544 /// commands to build the mask using the original vector value, without
2545 /// relying on the potential reordering.
2546 /// \returns per-register series of ShuffleKind, if gathered values can be
2547 /// represented as shuffles of previous tree entries. \p Mask is filled with
2548 /// the shuffle mask (also on per-register base).
2549 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2550 isGatherShuffledEntry(
2551 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2552 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2553 unsigned NumParts, bool ForOrder = false);
2554
2555 /// \returns the scalarization cost for this list of values. Assuming that
2556 /// this subtree gets vectorized, we may need to extract the values from the
2557 /// roots. This method calculates the cost of extracting the values.
2558 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2559 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2560 Type *ScalarTy) const;
2561
2562 /// Set the Builder insert point to one after the last instruction in
2563 /// the bundle
2564 void setInsertPointAfterBundle(const TreeEntry *E);
2565
2566 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
2567 /// specified, the starting vector value is poison.
2568 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2569
2570 /// \returns whether the VectorizableTree is fully vectorizable and will
2571 /// be beneficial even if the tree height is tiny.
2572 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2573
2574 /// Reorder commutative or alt operands to get better probability of
2575 /// generating vectorized code.
2576 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2577 SmallVectorImpl<Value *> &Left,
2578 SmallVectorImpl<Value *> &Right,
2579 const BoUpSLP &R);
2580
2581 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2582 /// users of \p TE and collects the stores. It returns the map from the store
2583 /// pointers to the collected stores.
2584 DenseMap<Value *, SmallVector<StoreInst *>>
2585 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2586
2587 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2588 /// stores in \p StoresVec can form a vector instruction. If so it returns
2589 /// true and populates \p ReorderIndices with the shuffle indices of the
2590 /// stores when compared to the sorted vector.
2591 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2592 OrdersType &ReorderIndices) const;
2593
2594 /// Iterates through the users of \p TE, looking for scalar stores that can be
2595 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2596 /// their order and builds an order index vector for each store bundle. It
2597 /// returns all these order vectors found.
2598 /// We run this after the tree has formed, otherwise we may come across user
2599 /// instructions that are not yet in the tree.
2600 SmallVector<OrdersType, 1>
2601 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2602
2603 struct TreeEntry {
2604 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2605 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2606
2607 /// \returns Common mask for reorder indices and reused scalars.
2608 SmallVector<int> getCommonMask() const {
2609 SmallVector<int> Mask;
2610 inversePermutation(ReorderIndices, Mask);
2611 ::addMask(Mask, ReuseShuffleIndices);
2612 return Mask;
2613 }
2614
2615 /// \returns true if the scalars in VL are equal to this entry.
2616 bool isSame(ArrayRef<Value *> VL) const {
2617 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2618 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2619 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2620 return VL.size() == Mask.size() &&
2621 std::equal(VL.begin(), VL.end(), Mask.begin(),
2622 [Scalars](Value *V, int Idx) {
2623 return (isa<UndefValue>(V) &&
2624 Idx == PoisonMaskElem) ||
2625 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2626 });
2627 };
2628 if (!ReorderIndices.empty()) {
2629 // TODO: implement matching if the nodes are just reordered, still can
2630 // treat the vector as the same if the list of scalars matches VL
2631 // directly, without reordering.
2632 SmallVector<int> Mask;
2633 inversePermutation(ReorderIndices, Mask);
2634 if (VL.size() == Scalars.size())
2635 return IsSame(Scalars, Mask);
2636 if (VL.size() == ReuseShuffleIndices.size()) {
2637 ::addMask(Mask, ReuseShuffleIndices);
2638 return IsSame(Scalars, Mask);
2639 }
2640 return false;
2641 }
2642 return IsSame(Scalars, ReuseShuffleIndices);
2643 }
2644
2645 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2646 return State == TreeEntry::NeedToGather &&
2647 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2648 UserTreeIndices.front().UserTE == UserEI.UserTE;
2649 }
2650
2651 /// \returns true if current entry has same operands as \p TE.
2652 bool hasEqualOperands(const TreeEntry &TE) const {
2653 if (TE.getNumOperands() != getNumOperands())
2654 return false;
2655 SmallBitVector Used(getNumOperands());
2656 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2657 unsigned PrevCount = Used.count();
2658 for (unsigned K = 0; K < E; ++K) {
2659 if (Used.test(K))
2660 continue;
2661 if (getOperand(K) == TE.getOperand(I)) {
2662 Used.set(K);
2663 break;
2664 }
2665 }
2666 // Check if we actually found the matching operand.
2667 if (PrevCount == Used.count())
2668 return false;
2669 }
2670 return true;
2671 }
2672
2673 /// \return Final vectorization factor for the node. Defined by the total
2674 /// number of vectorized scalars, including those used several times in the
2675 /// entry and counted in the \a ReuseShuffleIndices, if any.
2676 unsigned getVectorFactor() const {
2677 if (!ReuseShuffleIndices.empty())
2678 return ReuseShuffleIndices.size();
2679 return Scalars.size();
2680 };
2681
2682 /// A vector of scalars.
2683 ValueList Scalars;
2684
2685 /// The Scalars are vectorized into this value. It is initialized to Null.
2686 WeakTrackingVH VectorizedValue = nullptr;
2687
2688 /// New vector phi instructions emitted for the vectorized phi nodes.
2689 PHINode *PHI = nullptr;
2690
2691 /// Do we need to gather this sequence or vectorize it
2692 /// (either with vector instruction or with scatter/gather
2693 /// intrinsics for store/load)?
2694 enum EntryState {
2695 Vectorize,
2696 ScatterVectorize,
2697 StridedVectorize,
2698 NeedToGather
2699 };
2700 EntryState State;
2701
2702 /// Does this sequence require some shuffling?
2703 SmallVector<int, 4> ReuseShuffleIndices;
2704
2705 /// Does this entry require reordering?
2706 SmallVector<unsigned, 4> ReorderIndices;
2707
2708 /// Points back to the VectorizableTree.
2709 ///
2710 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2711 /// to be a pointer and needs to be able to initialize the child iterator.
2712 /// Thus we need a reference back to the container to translate the indices
2713 /// to entries.
2714 VecTreeTy &Container;
2715
2716 /// The TreeEntry index containing the user of this entry. We can actually
2717 /// have multiple users so the data structure is not truly a tree.
2718 SmallVector<EdgeInfo, 1> UserTreeIndices;
2719
2720 /// The index of this treeEntry in VectorizableTree.
2721 int Idx = -1;
2722
2723 private:
2724 /// The operands of each instruction in each lane Operands[op_index][lane].
2725 /// Note: This helps avoid the replication of the code that performs the
2726 /// reordering of operands during buildTree_rec() and vectorizeTree().
2727 SmallVector<ValueList, 2> Operands;
2728
2729 /// The main/alternate instruction.
2730 Instruction *MainOp = nullptr;
2731 Instruction *AltOp = nullptr;
2732
2733 public:
2734 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2735 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2736 if (Operands.size() < OpIdx + 1)
2737 Operands.resize(OpIdx + 1);
2738 assert(Operands[OpIdx].empty() && "Already resized?");
2739 assert(OpVL.size() <= Scalars.size() &&
2740 "Number of operands is greater than the number of scalars.");
2741 Operands[OpIdx].resize(OpVL.size());
2742 copy(OpVL, Operands[OpIdx].begin());
2743 }
2744
2745 /// Set the operands of this bundle in their original order.
2746 void setOperandsInOrder() {
2747 assert(Operands.empty() && "Already initialized?");
2748 auto *I0 = cast<Instruction>(Scalars[0]);
2749 Operands.resize(I0->getNumOperands());
2750 unsigned NumLanes = Scalars.size();
2751 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2752 OpIdx != NumOperands; ++OpIdx) {
2753 Operands[OpIdx].resize(NumLanes);
2754 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2755 auto *I = cast<Instruction>(Scalars[Lane]);
2756 assert(I->getNumOperands() == NumOperands &&
2757 "Expected same number of operands");
2758 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2759 }
2760 }
2761 }
2762
2763 /// Reorders operands of the node to the given mask \p Mask.
2764 void reorderOperands(ArrayRef<int> Mask) {
2765 for (ValueList &Operand : Operands)
2766 reorderScalars(Operand, Mask);
2767 }
2768
2769 /// \returns the \p OpIdx operand of this TreeEntry.
2770 ValueList &getOperand(unsigned OpIdx) {
2771 assert(OpIdx < Operands.size() && "Off bounds");
2772 return Operands[OpIdx];
2773 }
2774
2775 /// \returns the \p OpIdx operand of this TreeEntry.
2776 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2777 assert(OpIdx < Operands.size() && "Off bounds");
2778 return Operands[OpIdx];
2779 }
2780
2781 /// \returns the number of operands.
2782 unsigned getNumOperands() const { return Operands.size(); }
2783
2784 /// \return the single \p OpIdx operand.
2785 Value *getSingleOperand(unsigned OpIdx) const {
2786 assert(OpIdx < Operands.size() && "Off bounds");
2787 assert(!Operands[OpIdx].empty() && "No operand available");
2788 return Operands[OpIdx][0];
2789 }
2790
2791 /// Some of the instructions in the list have alternate opcodes.
2792 bool isAltShuffle() const { return MainOp != AltOp; }
2793
2794 bool isOpcodeOrAlt(Instruction *I) const {
2795 unsigned CheckedOpcode = I->getOpcode();
2796 return (getOpcode() == CheckedOpcode ||
2797 getAltOpcode() == CheckedOpcode);
2798 }
2799
2800 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2801 /// alternate) opcode as the main operation of this entry, the key is \p Op.
2802 /// Otherwise the key is the main operation (MainOp).
2803 Value *isOneOf(Value *Op) const {
2804 auto *I = dyn_cast<Instruction>(Op);
2805 if (I && isOpcodeOrAlt(I))
2806 return Op;
2807 return MainOp;
2808 }
2809
2810 void setOperations(const InstructionsState &S) {
2811 MainOp = S.MainOp;
2812 AltOp = S.AltOp;
2813 }
2814
2815 Instruction *getMainOp() const {
2816 return MainOp;
2817 }
2818
2819 Instruction *getAltOp() const {
2820 return AltOp;
2821 }
2822
2823 /// The main/alternate opcodes for the list of instructions.
2824 unsigned getOpcode() const {
2825 return MainOp ? MainOp->getOpcode() : 0;
2826 }
2827
2828 unsigned getAltOpcode() const {
2829 return AltOp ? AltOp->getOpcode() : 0;
2830 }
2831
2832 /// When ReuseShuffleIndices is empty it just returns the position of \p V
2833 /// within the vector of Scalars. Otherwise, tries to remap it to its reuse index.
2834 int findLaneForValue(Value *V) const {
2835 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2836 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2837 if (!ReorderIndices.empty())
2838 FoundLane = ReorderIndices[FoundLane];
2839 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2840 if (!ReuseShuffleIndices.empty()) {
2841 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2842 find(ReuseShuffleIndices, FoundLane));
2843 }
2844 return FoundLane;
2845 }
2846
2847 /// Build a shuffle mask for graph entry which represents a merge of main
2848 /// and alternate operations.
2849 void
2850 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2851 SmallVectorImpl<int> &Mask,
2852 SmallVectorImpl<Value *> *OpScalars = nullptr,
2853 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2854
2855 /// Return true if this is a non-power-of-2 node.
2856 bool isNonPowOf2Vec() const {
2857 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
2858 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
2859 "Reshuffling not supported with non-power-of-2 vectors yet.");
2860 return IsNonPowerOf2;
2861 }
2862
2863#ifndef NDEBUG
2864 /// Debug printer.
2865 LLVM_DUMP_METHOD void dump() const {
2866 dbgs() << Idx << ".\n";
2867 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2868 dbgs() << "Operand " << OpI << ":\n";
2869 for (const Value *V : Operands[OpI])
2870 dbgs().indent(2) << *V << "\n";
2871 }
2872 dbgs() << "Scalars: \n";
2873 for (Value *V : Scalars)
2874 dbgs().indent(2) << *V << "\n";
2875 dbgs() << "State: ";
2876 switch (State) {
2877 case Vectorize:
2878 dbgs() << "Vectorize\n";
2879 break;
2880 case ScatterVectorize:
2881 dbgs() << "ScatterVectorize\n";
2882 break;
2883 case StridedVectorize:
2884 dbgs() << "StridedVectorize\n";
2885 break;
2886 case NeedToGather:
2887 dbgs() << "NeedToGather\n";
2888 break;
2889 }
2890 dbgs() << "MainOp: ";
2891 if (MainOp)
2892 dbgs() << *MainOp << "\n";
2893 else
2894 dbgs() << "NULL\n";
2895 dbgs() << "AltOp: ";
2896 if (AltOp)
2897 dbgs() << *AltOp << "\n";
2898 else
2899 dbgs() << "NULL\n";
2900 dbgs() << "VectorizedValue: ";
2901 if (VectorizedValue)
2902 dbgs() << *VectorizedValue << "\n";
2903 else
2904 dbgs() << "NULL\n";
2905 dbgs() << "ReuseShuffleIndices: ";
2906 if (ReuseShuffleIndices.empty())
2907 dbgs() << "Empty";
2908 else
2909 for (int ReuseIdx : ReuseShuffleIndices)
2910 dbgs() << ReuseIdx << ", ";
2911 dbgs() << "\n";
2912 dbgs() << "ReorderIndices: ";
2913 for (unsigned ReorderIdx : ReorderIndices)
2914 dbgs() << ReorderIdx << ", ";
2915 dbgs() << "\n";
2916 dbgs() << "UserTreeIndices: ";
2917 for (const auto &EInfo : UserTreeIndices)
2918 dbgs() << EInfo << ", ";
2919 dbgs() << "\n";
2920 }
2921#endif
2922 };
2923
2924#ifndef NDEBUG
2925 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2926 InstructionCost VecCost, InstructionCost ScalarCost,
2927 StringRef Banner) const {
2928 dbgs() << "SLP: " << Banner << ":\n";
2929 E->dump();
2930 dbgs() << "SLP: Costs:\n";
2931 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2932 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2933 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2934 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2935 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2936 }
2937#endif
2938
2939 /// Create a new VectorizableTree entry.
2940 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2941 std::optional<ScheduleData *> Bundle,
2942 const InstructionsState &S,
2943 const EdgeInfo &UserTreeIdx,
2944 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2945 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2946 TreeEntry::EntryState EntryState =
2947 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2948 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2949 ReuseShuffleIndices, ReorderIndices);
2950 }
2951
2952 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2953 TreeEntry::EntryState EntryState,
2954 std::optional<ScheduleData *> Bundle,
2955 const InstructionsState &S,
2956 const EdgeInfo &UserTreeIdx,
2957 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2958 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2959 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2960 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2961 "Need to vectorize gather entry?");
2962 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2963 TreeEntry *Last = VectorizableTree.back().get();
2964 Last->Idx = VectorizableTree.size() - 1;
2965 Last->State = EntryState;
2966 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2967 ReuseShuffleIndices.end());
2968 if (ReorderIndices.empty()) {
2969 Last->Scalars.assign(VL.begin(), VL.end());
2970 Last->setOperations(S);
2971 } else {
2972 // Reorder scalars and build final mask.
2973 Last->Scalars.assign(VL.size(), nullptr);
2974 transform(ReorderIndices, Last->Scalars.begin(),
2975 [VL](unsigned Idx) -> Value * {
2976 if (Idx >= VL.size())
2977 return UndefValue::get(VL.front()->getType());
2978 return VL[Idx];
2979 });
2980 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
2981 Last->setOperations(S);
2982 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2983 }
2984 if (Last->State != TreeEntry::NeedToGather) {
2985 for (Value *V : VL) {
2986 const TreeEntry *TE = getTreeEntry(V);
2987 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
2988 "Scalar already in tree!");
2989 if (TE) {
2990 if (TE != Last)
2991 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
2992 continue;
2993 }
2994 ScalarToTreeEntry[V] = Last;
2995 }
2996 // Update the scheduler bundle to point to this TreeEntry.
2997 ScheduleData *BundleMember = *Bundle;
2998 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2999 isVectorLikeInstWithConstOps(S.MainOp) ||
3000 doesNotNeedToSchedule(VL)) &&
3001 "Bundle and VL out of sync");
3002 if (BundleMember) {
3003 for (Value *V : VL) {
3004 if (doesNotNeedToBeScheduled(V))
3005 continue;
3006 if (!BundleMember)
3007 continue;
3008 BundleMember->TE = Last;
3009 BundleMember = BundleMember->NextInBundle;
3010 }
3011 }
3012 assert(!BundleMember && "Bundle and VL out of sync");
3013 } else {
3014 // Build a map for gathered scalars to the nodes where they are used.
3015 bool AllConstsOrCasts = true;
3016 for (Value *V : VL)
3017 if (!isConstant(V)) {
3018 auto *I = dyn_cast<CastInst>(V);
3019 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3020 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3021 }
3022 if (AllConstsOrCasts)
3023 CastMaxMinBWSizes =
3024 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3025 MustGather.insert(VL.begin(), VL.end());
3026 }
3027
3028 if (UserTreeIdx.UserTE) {
3029 Last->UserTreeIndices.push_back(UserTreeIdx);
3030 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3031 "Reordering isn't implemented for non-power-of-2 nodes yet");
3032 }
3033 return Last;
3034 }
3035
3036 /// -- Vectorization State --
3037 /// Holds all of the tree entries.
3038 TreeEntry::VecTreeTy VectorizableTree;
3039
3040#ifndef NDEBUG
3041 /// Debug printer.
3042 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3043 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3044 VectorizableTree[Id]->dump();
3045 dbgs() << "\n";
3046 }
3047 }
3048#endif
3049
3050 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3051
3052 const TreeEntry *getTreeEntry(Value *V) const {
3053 return ScalarToTreeEntry.lookup(V);
3054 }
3055
3056 /// Check that the operand node of an alternate node does not generate a
3057 /// buildvector sequence. If it does, it is probably not worth building an
3058 /// alternate shuffle, i.e. when the number of buildvector operands plus the
3059 /// alternate instruction exceeds the number of buildvector instructions.
3060 /// \param S the instructions state of the analyzed values.
3061 /// \param VL list of the instructions with alternate opcodes.
3062 bool areAltOperandsProfitable(const InstructionsState &S,
3063 ArrayRef<Value *> VL) const;
3064
3065 /// Checks if the specified list of the instructions/values can be vectorized
3066 /// and fills required data before actual scheduling of the instructions.
3067 TreeEntry::EntryState getScalarsVectorizationState(
3068 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3069 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3070
3071 /// Maps a specific scalar to its tree entry.
3072 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3073
3074 /// List of scalars used in several vectorized nodes, together with the list
3075 /// of those nodes.
3076 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3077
3078 /// Maps a value to the proposed vectorizable size.
3079 SmallDenseMap<Value *, unsigned> InstrElementSize;
3080
3081 /// A list of scalars that we found that we need to keep as scalars.
3082 ValueSet MustGather;
3083
3084 /// A set of first non-schedulable values.
3085 ValueSet NonScheduledFirst;
3086
3087 /// A map between the vectorized entries and the last instructions in the
3088 /// bundles. The bundles are built in use order, not in the def order of the
3089 /// instructions, so we cannot rely on the last instruction in the bundle
3090 /// being the last instruction in program order during the vectorization
3091 /// process (the basic blocks are modified); the last instructions need to be
3092 /// pre-gathered beforehand.
3093 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3094
3095 /// List of gather nodes, depending on other gather/vector nodes, which should
3096 /// be emitted after the vector instruction emission process to correctly
3097 /// handle order of the vector instructions and shuffles.
3098 SetVector<const TreeEntry *> PostponedGathers;
3099
3100 using ValueToGatherNodesMap =
3101 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3102 ValueToGatherNodesMap ValueToGatherNodes;
3103
3104 /// This POD struct describes one external user in the vectorized tree.
3105 struct ExternalUser {
3106 ExternalUser(Value *S, llvm::User *U, int L)
3107 : Scalar(S), User(U), Lane(L) {}
3108
3109 // Which scalar in our function.
3110 Value *Scalar;
3111
3112 // Which user uses the scalar.
3113 llvm::User *User;
3114
3115 // Which lane does the scalar belong to.
3116 int Lane;
3117 };
3118 using UserList = SmallVector<ExternalUser, 16>;
3119
3120 /// Checks if two instructions may access the same memory.
3121 ///
3122 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3123 /// is invariant in the calling loop.
3124 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3125 Instruction *Inst2) {
3126 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3127 return true;
3128 // First check if the result is already in the cache.
3129 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3130 auto It = AliasCache.find(Key);
3131 if (It != AliasCache.end())
3132 return It->second;
3133 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3134 // Store the result in the cache.
3135 AliasCache.try_emplace(Key, Aliased);
3136 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3137 return Aliased;
3138 }
3139
3140 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3141
3142 /// Cache for alias results.
3143 /// TODO: consider moving this to the AliasAnalysis itself.
3144 DenseMap<AliasCacheKey, bool> AliasCache;
3145
3146 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3147 // globally through SLP because we don't perform any action which
3148 // invalidates capture results.
3149 BatchAAResults BatchAA;
3150
3151 /// Temporary store for deleted instructions. Instructions will be deleted
3152 /// eventually when the BoUpSLP is destructed. The deferral is required to
3153 /// ensure that there are no incorrect collisions in the AliasCache, which
3154 /// can happen if a new instruction is allocated at the same address as a
3155 /// previously deleted instruction.
3156 DenseSet<Instruction *> DeletedInstructions;
3157
3158 /// Set of the instructions already analyzed for reductions.
3159 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3160
3161 /// Set of hashes for the list of reduction values already being analyzed.
3162 DenseSet<size_t> AnalyzedReductionVals;
3163
3164 /// Values that have already been analyzed for minimal bitwidth and found to
3165 /// be non-profitable.
3166 DenseSet<Value *> AnalyzedMinBWVals;
3167
3168 /// A list of values that need to be extracted out of the tree.
3169 /// This list holds pairs of (Internal Scalar : External User). External User
3170 /// can be nullptr, meaning that this Internal Scalar will be used later,
3171 /// after vectorization.
3172 UserList ExternalUses;
3173
3174 /// A list of GEPs which can be replaced by scalar GEPs instead of
3175 /// extractelement instructions.
3176 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3177
3178 /// Values used only by @llvm.assume calls.
3179 SmallPtrSet<const Value *, 32> EphValues;
3180
3181 /// Holds all of the instructions that we gathered, shuffle instructions and
3182 /// extractelements.
3183 SetVector<Instruction *> GatherShuffleExtractSeq;
3184
3185 /// A list of blocks that we are going to CSE.
3186 DenseSet<BasicBlock *> CSEBlocks;
3187
3188 /// Contains all scheduling relevant data for an instruction.
3189 /// A ScheduleData either represents a single instruction or a member of an
3190 /// instruction bundle (= a group of instructions which is combined into a
3191 /// vector instruction).
3192 struct ScheduleData {
3193 // The initial value for the dependency counters. It means that the
3194 // dependencies are not calculated yet.
3195 enum { InvalidDeps = -1 };
3196
3197 ScheduleData() = default;
3198
3199 void init(int BlockSchedulingRegionID, Value *OpVal) {
3200 FirstInBundle = this;
3201 NextInBundle = nullptr;
3202 NextLoadStore = nullptr;
3203 IsScheduled = false;
3204 SchedulingRegionID = BlockSchedulingRegionID;
3205 clearDependencies();
3206 OpValue = OpVal;
3207 TE = nullptr;
3208 }
3209
3210 /// Verify basic self consistency properties
3211 void verify() {
3212 if (hasValidDependencies()) {
3213 assert(UnscheduledDeps <= Dependencies && "invariant");
3214 } else {
3215 assert(UnscheduledDeps == Dependencies && "invariant");
3216 }
3217
3218 if (IsScheduled) {
3219 assert(isSchedulingEntity() &&
3220 "unexpected scheduled state");
3221 for (const ScheduleData *BundleMember = this; BundleMember;
3222 BundleMember = BundleMember->NextInBundle) {
3223 assert(BundleMember->hasValidDependencies() &&
3224 BundleMember->UnscheduledDeps == 0 &&
3225 "unexpected scheduled state");
3226 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3227 "only bundle is marked scheduled");
3228 }
3229 }
3230
3231 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3232 "all bundle members must be in same basic block");
3233 }
3234
3235 /// Returns true if the dependency information has been calculated.
3236 /// Note that dependency validity can vary between instructions within
3237 /// a single bundle.
3238 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3239
3240 /// Returns true for single instructions and for bundle representatives
3241 /// (= the head of a bundle).
3242 bool isSchedulingEntity() const { return FirstInBundle == this; }
3243
3244 /// Returns true if it represents an instruction bundle and not only a
3245 /// single instruction.
3246 bool isPartOfBundle() const {
3247 return NextInBundle != nullptr || FirstInBundle != this || TE;
3248 }
3249
3250 /// Returns true if it is ready for scheduling, i.e. it has no more
3251 /// unscheduled depending instructions/bundles.
3252 bool isReady() const {
3253 assert(isSchedulingEntity() &&
3254 "can't consider non-scheduling entity for ready list");
3255 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3256 }
3257
3258 /// Modifies the number of unscheduled dependencies for this instruction,
3259 /// and returns the number of remaining dependencies for the containing
3260 /// bundle.
3261 int incrementUnscheduledDeps(int Incr) {
3262 assert(hasValidDependencies() &&
3263 "increment of unscheduled deps would be meaningless");
3264 UnscheduledDeps += Incr;
3265 return FirstInBundle->unscheduledDepsInBundle();
3266 }
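 // Usage note (sketch): the scheduler calls incrementUnscheduledDeps(-1) on an
 // operand's ScheduleData whenever one of its dependencies has just been
 // scheduled; once the containing bundle's count reaches zero, the bundle is
 // ready and can be moved to the ready list (see schedule() below).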
3267
3268 /// Sets the number of unscheduled dependencies to the number of
3269 /// dependencies.
3270 void resetUnscheduledDeps() {
3271 UnscheduledDeps = Dependencies;
3272 }
3273
3274 /// Clears all dependency information.
3275 void clearDependencies() {
3276 Dependencies = InvalidDeps;
3277 resetUnscheduledDeps();
3278 MemoryDependencies.clear();
3279 ControlDependencies.clear();
3280 }
3281
3282 int unscheduledDepsInBundle() const {
3283 assert(isSchedulingEntity() && "only meaningful on the bundle");
3284 int Sum = 0;
3285 for (const ScheduleData *BundleMember = this; BundleMember;
3286 BundleMember = BundleMember->NextInBundle) {
3287 if (BundleMember->UnscheduledDeps == InvalidDeps)
3288 return InvalidDeps;
3289 Sum += BundleMember->UnscheduledDeps;
3290 }
3291 return Sum;
3292 }
3293
3294 void dump(raw_ostream &os) const {
3295 if (!isSchedulingEntity()) {
3296 os << "/ " << *Inst;
3297 } else if (NextInBundle) {
3298 os << '[' << *Inst;
3299 ScheduleData *SD = NextInBundle;
3300 while (SD) {
3301 os << ';' << *SD->Inst;
3302 SD = SD->NextInBundle;
3303 }
3304 os << ']';
3305 } else {
3306 os << *Inst;
3307 }
3308 }
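 // Illustrative output of dump(): a stand-alone instruction prints as itself,
 // e.g. "%x = load i32, ptr %p"; the head of a bundle prints all members in
 // brackets separated by ';', e.g. "[%x = load ...;%y = load ...]"; and a
 // non-head bundle member is prefixed with "/ ".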
3309
3310 Instruction *Inst = nullptr;
3311
3312 /// Opcode of the current instruction in the schedule data.
3313 Value *OpValue = nullptr;
3314
3315 /// The TreeEntry that this instruction corresponds to.
3316 TreeEntry *TE = nullptr;
3317
3318 /// Points to the head in an instruction bundle (and always to this for
3319 /// single instructions).
3320 ScheduleData *FirstInBundle = nullptr;
3321
3322 /// Singly linked list of all instructions in a bundle. Null if it is a
3323 /// single instruction.
3324 ScheduleData *NextInBundle = nullptr;
3325
3326 /// Singly linked list of all memory instructions (e.g. load, store, call)
3327 /// in the block - until the end of the scheduling region.
3328 ScheduleData *NextLoadStore = nullptr;
3329
3330 /// The dependent memory instructions.
3331 /// This list is derived on demand in calculateDependencies().
3332 SmallVector<ScheduleData *, 4> MemoryDependencies;
3333
3334 /// List of instructions which this instruction could be control dependent
3335 /// on. Allowing such nodes to be scheduled below this one could introduce
3336 /// a runtime fault which didn't exist in the original program.
3337 /// ex: this is a load or udiv following a readonly call which inf loops
3338 SmallVector<ScheduleData *, 4> ControlDependencies;
3339
3340 /// This ScheduleData is in the current scheduling region if this matches
3341 /// the current SchedulingRegionID of BlockScheduling.
3342 int SchedulingRegionID = 0;
3343
3344 /// Used for getting a "good" final ordering of instructions.
3345 int SchedulingPriority = 0;
3346
3347 /// The number of dependencies. Consists of the number of users of the
3348 /// instruction plus the number of dependent memory instructions (if any).
3349 /// This value is calculated on demand.
3350 /// If InvalidDeps, the number of dependencies is not calculated yet.
3351 int Dependencies = InvalidDeps;
3352
3353 /// The number of dependencies minus the number of dependencies of scheduled
3354 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3355 /// for scheduling.
3356 /// Note that this is negative as long as Dependencies is not calculated.
3357 int UnscheduledDeps = InvalidDeps;
3358
3359 /// True if this instruction is scheduled (or considered as scheduled in the
3360 /// dry-run).
3361 bool IsScheduled = false;
3362 };
3363
3364#ifndef NDEBUG
3365 friend inline raw_ostream &operator<<(raw_ostream &os,
3366 const BoUpSLP::ScheduleData &SD) {
3367 SD.dump(os);
3368 return os;
3369 }
3370#endif
3371
3372 friend struct GraphTraits<BoUpSLP *>;
3373 friend struct DOTGraphTraits<BoUpSLP *>;
3374
3375 /// Contains all scheduling data for a basic block.
3376 /// It does not schedule instructions which are not memory read/write
3377 /// instructions and whose operands are either constants, or arguments, or
3378 /// phis, or instructions from other blocks, or whose users are phis or live
3379 /// in other blocks. The resulting vector instructions can be placed at the
3380 /// beginning of the basic block without scheduling (if the operands do not
3381 /// need to be scheduled) or at the end of the block (if the users are
3382 /// outside of the block). This saves some compile time and memory otherwise
3383 /// used by the compiler.
3384 /// ScheduleData is assigned to each instruction between the boundaries of
3385 /// the tree entry, even to those which are not part of the graph. This is
3386 /// required to correctly follow the dependencies between the instructions
3387 /// and to schedule them correctly. ScheduleData is not allocated for
3388 /// instructions which do not require scheduling, like phis, nodes with only
3389 /// extractelements/insertelements, or nodes whose instructions have
3390 /// uses/operands outside of the block.
3391 struct BlockScheduling {
3392 BlockScheduling(BasicBlock *BB)
3393 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3394
3395 void clear() {
3396 ReadyInsts.clear();
3397 ScheduleStart = nullptr;
3398 ScheduleEnd = nullptr;
3399 FirstLoadStoreInRegion = nullptr;
3400 LastLoadStoreInRegion = nullptr;
3401 RegionHasStackSave = false;
3402
3403 // Reduce the maximum schedule region size by the size of the
3404 // previous scheduling run.
3405 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3406 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3407 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3408 ScheduleRegionSize = 0;
3409
3410 // Make a new scheduling region, i.e. all existing ScheduleData is not
3411 // in the new region yet.
3412 ++SchedulingRegionID;
3413 }
3414
3415 ScheduleData *getScheduleData(Instruction *I) {
3416 if (BB != I->getParent())
3417 // Avoid lookup if can't possibly be in map.
3418 return nullptr;
3419 ScheduleData *SD = ScheduleDataMap.lookup(I);
3420 if (SD && isInSchedulingRegion(SD))
3421 return SD;
3422 return nullptr;
3423 }
3424
3425 ScheduleData *getScheduleData(Value *V) {
3426 if (auto *I = dyn_cast<Instruction>(V))
3427 return getScheduleData(I);
3428 return nullptr;
3429 }
3430
3431 ScheduleData *getScheduleData(Value *V, Value *Key) {
3432 if (V == Key)
3433 return getScheduleData(V);
3434 auto I = ExtraScheduleDataMap.find(V);
3435 if (I != ExtraScheduleDataMap.end()) {
3436 ScheduleData *SD = I->second.lookup(Key);
3437 if (SD && isInSchedulingRegion(SD))
3438 return SD;
3439 }
3440 return nullptr;
3441 }
3442
3443 bool isInSchedulingRegion(ScheduleData *SD) const {
3444 return SD->SchedulingRegionID == SchedulingRegionID;
3445 }
3446
3447 /// Marks an instruction as scheduled and puts all dependent ready
3448 /// instructions into the ready-list.
3449 template <typename ReadyListType>
3450 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3451 SD->IsScheduled = true;
3452 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3453
3454 for (ScheduleData *BundleMember = SD; BundleMember;
3455 BundleMember = BundleMember->NextInBundle) {
3456 if (BundleMember->Inst != BundleMember->OpValue)
3457 continue;
3458
3459 // Handle the def-use chain dependencies.
3460
3461 // Decrement the unscheduled counter and insert to ready list if ready.
3462 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3463 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3464 if (OpDef && OpDef->hasValidDependencies() &&
3465 OpDef->incrementUnscheduledDeps(-1) == 0) {
3466 // There are no more unscheduled dependencies after
3467 // decrementing, so we can put the dependent instruction
3468 // into the ready list.
3469 ScheduleData *DepBundle = OpDef->FirstInBundle;
3470 assert(!DepBundle->IsScheduled &&
3471 "already scheduled bundle gets ready");
3472 ReadyList.insert(DepBundle);
3473 LLVM_DEBUG(dbgs()
3474 << "SLP: gets ready (def): " << *DepBundle << "\n");
3475 }
3476 });
3477 };
3478
3479 // If BundleMember is a vector bundle, its operands may have been
3480 // reordered during buildTree(). We therefore need to get its operands
3481 // through the TreeEntry.
3482 if (TreeEntry *TE = BundleMember->TE) {
3483 // Need to search for the lane since the tree entry can be reordered.
3484 int Lane = std::distance(TE->Scalars.begin(),
3485 find(TE->Scalars, BundleMember->Inst));
3486 assert(Lane >= 0 && "Lane not set");
3487
3488 // Since vectorization tree is being built recursively this assertion
3489 // ensures that the tree entry has all operands set before reaching
3490 // this code. Couple of exceptions known at the moment are extracts
3491 // where their second (immediate) operand is not added. Since
3492 // immediates do not affect scheduler behavior this is considered
3493 // okay.
3494 auto *In = BundleMember->Inst;
3495 assert(
3496 In &&
3497 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3498 In->getNumOperands() == TE->getNumOperands()) &&
3499 "Missed TreeEntry operands?");
3500 (void)In; // fake use to avoid build failure when assertions disabled
3501
3502 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3503 OpIdx != NumOperands; ++OpIdx)
3504 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3505 DecrUnsched(I);
3506 } else {
3507 // If BundleMember is a stand-alone instruction, no operand reordering
3508 // has taken place, so we directly access its operands.
3509 for (Use &U : BundleMember->Inst->operands())
3510 if (auto *I = dyn_cast<Instruction>(U.get()))
3511 DecrUnsched(I);
3512 }
3513 // Handle the memory dependencies.
3514 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3515 if (MemoryDepSD->hasValidDependencies() &&
3516 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3517 // There are no more unscheduled dependencies after decrementing,
3518 // so we can put the dependent instruction into the ready list.
3519 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3520 assert(!DepBundle->IsScheduled &&
3521 "already scheduled bundle gets ready");
3522 ReadyList.insert(DepBundle);
3523 LLVM_DEBUG(dbgs()
3524 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3525 }
3526 }
3527 // Handle the control dependencies.
3528 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3529 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3530 // There are no more unscheduled dependencies after decrementing,
3531 // so we can put the dependent instruction into the ready list.
3532 ScheduleData *DepBundle = DepSD->FirstInBundle;
3533 assert(!DepBundle->IsScheduled &&
3534 "already scheduled bundle gets ready");
3535 ReadyList.insert(DepBundle);
3536 LLVM_DEBUG(dbgs()
3537 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3538 }
3539 }
3540 }
3541 }
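 // In short, schedule() releases three kinds of dependencies for each bundle
 // member: def-use operand dependencies (looked up through the TreeEntry when
 // the operands were reordered), memory dependencies, and control
 // dependencies. Any dependent bundle whose unscheduled-dependency count drops
 // to zero is inserted into the ready list.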
3542
3543 /// Verify basic self consistency properties of the data structure.
3544 void verify() {
3545 if (!ScheduleStart)
3546 return;
3547
3548 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3549 ScheduleStart->comesBefore(ScheduleEnd) &&
3550 "Not a valid scheduling region?");
3551
3552 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3553 auto *SD = getScheduleData(I);
3554 if (!SD)
3555 continue;
3556 assert(isInSchedulingRegion(SD) &&
3557 "primary schedule data not in window?");
3558 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3559 "entire bundle in window!");
3560 (void)SD;
3561 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3562 }
3563
3564 for (auto *SD : ReadyInsts) {
3565 assert(SD->isSchedulingEntity() && SD->isReady() &&
3566 "item in ready list not ready?");
3567 (void)SD;
3568 }
3569 }
3570
3571 void doForAllOpcodes(Value *V,
3572 function_ref<void(ScheduleData *SD)> Action) {
3573 if (ScheduleData *SD = getScheduleData(V))
3574 Action(SD);
3575 auto I = ExtraScheduleDataMap.find(V);
3576 if (I != ExtraScheduleDataMap.end())
3577 for (auto &P : I->second)
3578 if (isInSchedulingRegion(P.second))
3579 Action(P.second);
3580 }
3581
3582 /// Put all instructions into the ReadyList which are ready for scheduling.
3583 template <typename ReadyListType>
3584 void initialFillReadyList(ReadyListType &ReadyList) {
3585 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3586 doForAllOpcodes(I, [&](ScheduleData *SD) {
3587 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3588 SD->isReady()) {
3589 ReadyList.insert(SD);
3590 LLVM_DEBUG(dbgs()
3591 << "SLP: initially in ready list: " << *SD << "\n");
3592 }
3593 });
3594 }
3595 }
3596
3597 /// Build a bundle from the ScheduleData nodes corresponding to the
3598 /// scalar instruction for each lane.
3599 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3600
3601 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3602 /// cyclic dependencies. This is only a dry-run, no instructions are
3603 /// actually moved at this stage.
3604 /// \returns the scheduling bundle. The returned Optional value is not
3605 /// std::nullopt if \p VL is allowed to be scheduled.
3606 std::optional<ScheduleData *>
3607 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3608 const InstructionsState &S);
3609
3610 /// Un-bundles a group of instructions.
3611 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3612
3613 /// Allocates schedule data chunk.
3614 ScheduleData *allocateScheduleDataChunks();
3615
3616 /// Extends the scheduling region so that V is inside the region.
3617 /// \returns true if the region size is within the limit.
3618 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3619
3620 /// Initialize the ScheduleData structures for new instructions in the
3621 /// scheduling region.
3622 void initScheduleData(Instruction *FromI, Instruction *ToI,
3623 ScheduleData *PrevLoadStore,
3624 ScheduleData *NextLoadStore);
3625
3626 /// Updates the dependency information of a bundle and of all instructions/
3627 /// bundles which depend on the original bundle.
3628 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3629 BoUpSLP *SLP);
3630
3631 /// Sets all instructions in the scheduling region to un-scheduled.
3632 void resetSchedule();
3633
3634 BasicBlock *BB;
3635
3636 /// Simple memory allocation for ScheduleData.
3637 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3638
3639 /// The size of a ScheduleData array in ScheduleDataChunks.
3640 int ChunkSize;
3641
3642 /// The allocator position in the current chunk, which is the last entry
3643 /// of ScheduleDataChunks.
3644 int ChunkPos;
3645
3646 /// Attaches ScheduleData to Instruction.
3647 /// Note that the mapping survives during all vectorization iterations, i.e.
3648 /// ScheduleData structures are recycled.
3649 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3650
3651 /// Attaches ScheduleData to Instruction with the leading key.
3652 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3653 ExtraScheduleDataMap;
3654
3655 /// The ready-list for scheduling (only used for the dry-run).
3656 SetVector<ScheduleData *> ReadyInsts;
3657
3658 /// The first instruction of the scheduling region.
3659 Instruction *ScheduleStart = nullptr;
3660
3661 /// The first instruction _after_ the scheduling region.
3662 Instruction *ScheduleEnd = nullptr;
3663
3664 /// The first memory accessing instruction in the scheduling region
3665 /// (can be null).
3666 ScheduleData *FirstLoadStoreInRegion = nullptr;
3667
3668 /// The last memory accessing instruction in the scheduling region
3669 /// (can be null).
3670 ScheduleData *LastLoadStoreInRegion = nullptr;
3671
3672 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3673 /// region? Used to optimize the dependence calculation for the
3674 /// common case where there isn't.
3675 bool RegionHasStackSave = false;
3676
3677 /// The current size of the scheduling region.
3678 int ScheduleRegionSize = 0;
3679
3680 /// The maximum size allowed for the scheduling region.
3681 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3682
3683 /// The ID of the scheduling region. For a new vectorization iteration this
3684 /// is incremented which "removes" all ScheduleData from the region.
3685 /// Make sure that the initial SchedulingRegionID is greater than the
3686 /// initial SchedulingRegionID in ScheduleData (which is 0).
3687 int SchedulingRegionID = 1;
3688 };
3689
3690 /// Attaches the BlockScheduling structures to basic blocks.
3691 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3692
3693 /// Performs the "real" scheduling. Done before vectorization is actually
3694 /// performed in a basic block.
3695 void scheduleBlock(BlockScheduling *BS);
3696
3697 /// List of users to ignore during scheduling and that don't need extracting.
3698 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3699
3700 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3701 /// sorted SmallVectors of unsigned.
3702 struct OrdersTypeDenseMapInfo {
3703 static OrdersType getEmptyKey() {
3704 OrdersType V;
3705 V.push_back(~1U);
3706 return V;
3707 }
3708
3709 static OrdersType getTombstoneKey() {
3710 OrdersType V;
3711 V.push_back(~2U);
3712 return V;
3713 }
3714
3715 static unsigned getHashValue(const OrdersType &V) {
3716 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3717 }
3718
3719 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3720 return LHS == RHS;
3721 }
3722 };
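 // A minimal usage sketch (assumed, not verbatim from this file): the custom
 // DenseMapInfo lets sorted index vectors be used directly as DenseMap keys,
 // e.g. to count how often a candidate order is requested:
 //   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrdersUses;
 //   ++OrdersUses[CandidateOrder];   // CandidateOrder is a hypothetical name
 // The reserved keys ~1U and ~2U cannot collide with real orders, because real
 // orders only contain indices smaller than the number of scalars.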
3723
3724 // Analysis and block reference.
3725 Function *F;
3726 ScalarEvolution *SE;
3727 TargetTransformInfo *TTI;
3728 TargetLibraryInfo *TLI;
3729 LoopInfo *LI;
3730 DominatorTree *DT;
3731 AssumptionCache *AC;
3732 DemandedBits *DB;
3733 const DataLayout *DL;
3734 OptimizationRemarkEmitter *ORE;
3735
3736 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3737 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3738
3739 /// Instruction builder to construct the vectorized tree.
3740 IRBuilder<TargetFolder> Builder;
3741
3742 /// A map of scalar integer values to the smallest bit width with which they
3743 /// can legally be represented. The values map to (width, signed) pairs,
3744 /// where "width" indicates the minimum bit width and "signed" is True if the
3745 /// value must be sign-extended, rather than zero-extended, back to its
3746 /// original width.
3748
3749 /// Final size of the reduced vector, if the current graph represents the
3750 /// input for the reduction and it was possible to narrow the size of the
3751 /// reduction.
3752 unsigned ReductionBitWidth = 0;
3753
3754 /// If the tree contains any zext/sext/trunc nodes, this contains the max-min
3755 /// pair of type sizes used in the tree.
3756 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3757
3758 /// Indices of the vectorized nodes which are supposed to be the roots of a
3759 /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
3760 DenseSet<unsigned> ExtraBitWidthNodes;
3761};
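// Rough driver sketch (illustrative; the actual drivers appear later in this
// file): a client builds the tree for a group of seed scalars, asks for the
// cost, and vectorizes only if the cost model says it pays off.
//   R.buildTree(Scalars);                    // Scalars: ArrayRef<Value *>
//   InstructionCost Cost = R.getTreeCost();
//   if (Cost < -SLPCostThreshold)
//     R.vectorizeTree();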
3762
3763} // end namespace slpvectorizer
3764
3765template <> struct GraphTraits<BoUpSLP *> {
3766 using TreeEntry = BoUpSLP::TreeEntry;
3767
3768 /// NodeRef has to be a pointer per the GraphWriter.
3769 using NodeRef = TreeEntry *;
3770
3771 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3772
3773 /// Add the VectorizableTree to the index iterator to be able to return
3774 /// TreeEntry pointers.
3775 struct ChildIteratorType
3776 : public iterator_adaptor_base<
3777 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3778 ContainerTy &VectorizableTree;
3779
3780 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3781 ContainerTy &VT)
3782 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3783
3784 NodeRef operator*() { return I->UserTE; }
3785 };
3786
3787 static NodeRef getEntryNode(BoUpSLP &R) {
3788 return R.VectorizableTree[0].get();
3789 }
3790
3791 static ChildIteratorType child_begin(NodeRef N) {
3792 return {N->UserTreeIndices.begin(), N->Container};
3793 }
3794
3795 static ChildIteratorType child_end(NodeRef N) {
3796 return {N->UserTreeIndices.end(), N->Container};
3797 }
3798
3799 /// For the node iterator we just need to turn the TreeEntry iterator into a
3800 /// TreeEntry* iterator so that it dereferences to NodeRef.
3801 class nodes_iterator {
3802 using ItTy = ContainerTy::iterator;
3803 ItTy It;
3804
3805 public:
3806 nodes_iterator(const ItTy &It2) : It(It2) {}
3807 NodeRef operator*() { return It->get(); }
3808 nodes_iterator operator++() {
3809 ++It;
3810 return *this;
3811 }
3812 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3813 };
3814
3815 static nodes_iterator nodes_begin(BoUpSLP *R) {
3816 return nodes_iterator(R->VectorizableTree.begin());
3817 }
3818
3819 static nodes_iterator nodes_end(BoUpSLP *R) {
3820 return nodes_iterator(R->VectorizableTree.end());
3821 }
3822
3823 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3824};
3825
3826template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3827 using TreeEntry = BoUpSLP::TreeEntry;
3828
3829 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3830
3831 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3832 std::string Str;
3833 raw_string_ostream OS(Str);
3834 OS << Entry->Idx << ".\n";
3835 if (isSplat(Entry->Scalars))
3836 OS << "<splat> ";
3837 for (auto *V : Entry->Scalars) {
3838 OS << *V;
3839 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3840 return EU.Scalar == V;
3841 }))
3842 OS << " <extract>";
3843 OS << "\n";
3844 }
3845 return Str;
3846 }
3847
3848 static std::string getNodeAttributes(const TreeEntry *Entry,
3849 const BoUpSLP *) {
3850 if (Entry->State == TreeEntry::NeedToGather)
3851 return "color=red";
3852 if (Entry->State == TreeEntry::ScatterVectorize ||
3853 Entry->State == TreeEntry::StridedVectorize)
3854 return "color=blue";
3855 return "";
3856 }
3857};
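// These GraphTraits/DOTGraphTraits specializations make the SLP tree usable
// with LLVM's generic graph utilities. For example (sketch, assuming a BoUpSLP
// instance R is available):
//   ViewGraph(&R, "slp-tree");                                // render graph
//   WriteGraph(dbgs(), &R, /*ShortNames=*/false, "SLP tree"); // emit DOT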
3858
3859} // end namespace llvm
3860
3861 BoUpSLP::~BoUpSLP() {
3862 SmallVector<WeakTrackingVH> DeadInsts;
3863 for (auto *I : DeletedInstructions) {
3864 for (Use &U : I->operands()) {
3865 auto *Op = dyn_cast<Instruction>(U.get());
3866 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3867 wouldInstructionBeTriviallyDead(Op, TLI))
3868 DeadInsts.emplace_back(Op);
3869 }
3870 I->dropAllReferences();
3871 }
3872 for (auto *I : DeletedInstructions) {
3873 assert(I->use_empty() &&
3874 "trying to erase instruction with users.");
3875 I->eraseFromParent();
3876 }
3877
3878 // Cleanup any dead scalar code feeding the vectorized instructions
3879 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3880
3881#ifdef EXPENSIVE_CHECKS
3882 // If we could guarantee that this call is not extremely slow, we could
3883 // remove the ifdef limitation (see PR47712).
3884 assert(!verifyFunction(*F, &dbgs()));
3885#endif
3886}
3887
3888/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3889 /// contains the original mask for the scalars reused in the node. The
3890 /// procedure transforms this mask in accordance with the given \p Mask.
3891 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3892 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3893 "Expected non-empty mask.");
3894 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3895 Prev.swap(Reuses);
3896 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3897 if (Mask[I] != PoisonMaskElem)
3898 Reuses[Mask[I]] = Prev[I];
3899}
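// Worked example (illustrative): with Reuses = {1, 0, 3, 2} and
// Mask = {2, 3, 0, 1}, each previous element Prev[I] is written to slot
// Mask[I], giving Reuses = {3, 2, 1, 0}.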
3900
3901 /// Reorders the given \p Order according to the given \p Mask. \p Order is
3902 /// the original order of the scalars. The procedure transforms the provided
3903 /// order in accordance with the given \p Mask. If the resulting \p Order is
3904 /// just an identity order, \p Order is cleared.
3905 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
3906 bool BottomOrder = false) {
3907 assert(!Mask.empty() && "Expected non-empty mask.");
3908 unsigned Sz = Mask.size();
3909 if (BottomOrder) {
3910 SmallVector<unsigned> PrevOrder;
3911 if (Order.empty()) {
3912 PrevOrder.resize(Sz);
3913 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
3914 } else {
3915 PrevOrder.swap(Order);
3916 }
3917 Order.assign(Sz, Sz);
3918 for (unsigned I = 0; I < Sz; ++I)
3919 if (Mask[I] != PoisonMaskElem)
3920 Order[I] = PrevOrder[Mask[I]];
3921 if (all_of(enumerate(Order), [&](const auto &Data) {
3922 return Data.value() == Sz || Data.index() == Data.value();
3923 })) {
3924 Order.clear();
3925 return;
3926 }
3927 fixupOrderingIndices(Order);
3928 return;
3929 }
3930 SmallVector<int> MaskOrder;
3931 if (Order.empty()) {
3932 MaskOrder.resize(Sz);
3933 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3934 } else {
3935 inversePermutation(Order, MaskOrder);
3936 }
3937 reorderReuses(MaskOrder, Mask);
3938 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
3939 Order.clear();
3940 return;
3941 }
3942 Order.assign(Sz, Sz);
3943 for (unsigned I = 0; I < Sz; ++I)
3944 if (MaskOrder[I] != PoisonMaskElem)
3945 Order[MaskOrder[I]] = I;
3946 fixupOrderingIndices(Order);
3947}
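// Worked example (illustrative): with an empty Order (identity) and a reverse
// Mask = {3, 2, 1, 0} in top-down mode (BottomOrder == false), the function
// produces Order = {3, 2, 1, 0}. If the transformation yields an identity
// order instead, Order is cleared to signal that no reordering is needed.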
3948
3949std::optional<BoUpSLP::OrdersType>
3950BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3951 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3952 // Try to find subvector extract/insert patterns and reorder only such
3953 // patterns.
3954 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
3955 Type *ScalarTy = GatheredScalars.front()->getType();
3956 int NumScalars = GatheredScalars.size();
3957 if (!isValidElementType(ScalarTy))
3958 return std::nullopt;
3959 auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
3960 int NumParts = TTI->getNumberOfParts(VecTy);
3961 if (NumParts == 0 || NumParts >= NumScalars)
3962 NumParts = 1;
3963 SmallVector<int> ExtractMask;
3964 SmallVector<int> Mask;
3965 SmallVector<SmallVector<const TreeEntry *>> Entries;
3966 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
3967 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
3968 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
3969 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
3970 /*ForOrder=*/true);
3971 // No shuffled operands - ignore.
3972 if (GatherShuffles.empty() && ExtractShuffles.empty())
3973 return std::nullopt;
3974 OrdersType CurrentOrder(NumScalars, NumScalars);
3975 if (GatherShuffles.size() == 1 &&
3976 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
3977 Entries.front().front()->isSame(TE.Scalars)) {
3978 // Perfect match in the graph, will reuse the previously vectorized
3979 // node. Cost is 0.
3980 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
3981 return CurrentOrder;
3982 }
3983 auto IsSplatMask = [](ArrayRef<int> Mask) {
3984 int SingleElt = PoisonMaskElem;
3985 return all_of(Mask, [&](int I) {
3986 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
3987 SingleElt = I;
3988 return I == PoisonMaskElem || I == SingleElt;
3989 });
3990 };
3991 // Exclusive broadcast mask - ignore.
3992 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
3993 (Entries.size() != 1 ||
3994 Entries.front().front()->ReorderIndices.empty())) ||
3995 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
3996 return std::nullopt;
3997 SmallBitVector ShuffledSubMasks(NumParts);
3998 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
3999 ArrayRef<int> Mask, int PartSz, int NumParts,
4000 function_ref<unsigned(unsigned)> GetVF) {
4001 for (int I : seq<int>(0, NumParts)) {
4002 if (ShuffledSubMasks.test(I))
4003 continue;
4004 const int VF = GetVF(I);
4005 if (VF == 0)
4006 continue;
4007 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
4008 // Shuffle of at least 2 vectors - ignore.
4009 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4010 std::fill(Slice.begin(), Slice.end(), NumScalars);
4011 ShuffledSubMasks.set(I);
4012 continue;
4013 }
4014 // Try to include as many elements from the mask as possible.
4015 int FirstMin = INT_MAX;
4016 int SecondVecFound = false;
4017 for (int K : seq<int>(0, PartSz)) {
4018 int Idx = Mask[I * PartSz + K];
4019 if (Idx == PoisonMaskElem) {
4020 Value *V = GatheredScalars[I * PartSz + K];
4021 if (isConstant(V) && !isa<PoisonValue>(V)) {
4022 SecondVecFound = true;
4023 break;
4024 }
4025 continue;
4026 }
4027 if (Idx < VF) {
4028 if (FirstMin > Idx)
4029 FirstMin = Idx;
4030 } else {
4031 SecondVecFound = true;
4032 break;
4033 }
4034 }
4035 FirstMin = (FirstMin / PartSz) * PartSz;
4036 // Shuffle of at least 2 vectors - ignore.
4037 if (SecondVecFound) {
4038 std::fill(Slice.begin(), Slice.end(), NumScalars);
4039 ShuffledSubMasks.set(I);
4040 continue;
4041 }
4042 for (int K : seq<int>(0, PartSz)) {
4043 int Idx = Mask[I * PartSz + K];
4044 if (Idx == PoisonMaskElem)
4045 continue;
4046 Idx -= FirstMin;
4047 if (Idx >= PartSz) {
4048 SecondVecFound = true;
4049 break;
4050 }
4051 if (CurrentOrder[I * PartSz + Idx] >
4052 static_cast<unsigned>(I * PartSz + K) &&
4053 CurrentOrder[I * PartSz + Idx] !=
4054 static_cast<unsigned>(I * PartSz + Idx))
4055 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4056 }
4057 // Shuffle of at least 2 vectors - ignore.
4058 if (SecondVecFound) {
4059 std::fill(Slice.begin(), Slice.end(), NumScalars);
4060 ShuffledSubMasks.set(I);
4061 continue;
4062 }
4063 }
4064 };
4065 int PartSz = NumScalars / NumParts;
4066 if (!ExtractShuffles.empty())
4067 TransformMaskToOrder(
4068 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4069 if (!ExtractShuffles[I])
4070 return 0U;
4071 unsigned VF = 0;
4072 for (unsigned Idx : seq<unsigned>(0, PartSz)) {
4073 int K = I * PartSz + Idx;
4074 if (ExtractMask[K] == PoisonMaskElem)
4075 continue;
4076 if (!TE.ReuseShuffleIndices.empty())
4077 K = TE.ReuseShuffleIndices[K];
4078 if (!TE.ReorderIndices.empty())
4079 K = std::distance(TE.ReorderIndices.begin(),
4080 find(TE.ReorderIndices, K));
4081 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4082 if (!EI)
4083 continue;
4084 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4085 ->getElementCount()
4086 .getKnownMinValue());
4087 }
4088 return VF;
4089 });
4090 // Check special corner case - single shuffle of the same entry.
4091 if (GatherShuffles.size() == 1 && NumParts != 1) {
4092 if (ShuffledSubMasks.any())
4093 return std::nullopt;
4094 PartSz = NumScalars;
4095 NumParts = 1;
4096 }
4097 if (!Entries.empty())
4098 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4099 if (!GatherShuffles[I])
4100 return 0U;
4101 return std::max(Entries[I].front()->getVectorFactor(),
4102 Entries[I].back()->getVectorFactor());
4103 });
4104 int NumUndefs =
4105 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4106 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4107 return std::nullopt;
4108 return std::move(CurrentOrder);
4109}
4110
4111static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4112 const TargetLibraryInfo &TLI,
4113 bool CompareOpcodes = true) {
4114 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4115 return false;
4116 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4117 if (!GEP1)
4118 return false;
4119 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4120 if (!GEP2)
4121 return false;
4122 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4123 ((isConstant(GEP1->getOperand(1)) &&
4124 isConstant(GEP2->getOperand(1))) ||
4125 !CompareOpcodes ||
4126 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4127 .getOpcode());
4128}
4129
4130/// Calculates minimal alignment as a common alignment.
4131template <typename T>
4132 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4133 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4134 for (Value *V : VL.drop_front())
4135 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4136 return CommonAlignment;
4137}
4138
4139/// Check if \p Order represents reverse order.
4140 static bool isReverseOrder(ArrayRef<unsigned> Order) {
4141 unsigned Sz = Order.size();
4142 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4143 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4144 });
4145}
4146
4147 /// Checks if the provided list of pointers \p PointerOps represents strided
4148 /// pointers for type \p ElemTy. If they do not, std::nullopt is returned.
4149 /// Otherwise, if \p Inst is not specified, a just-initialized optional value
4150 /// is returned to show that the pointers represent strided pointers. If
4151 /// \p Inst is specified, the runtime stride is materialized before the given
4152 /// \p Inst. \returns std::nullopt if the pointers do not have a runtime
4153 /// stride, and nullptr or the actual stride value otherwise.
4154 static std::optional<Value *>
4155 calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4156 const DataLayout &DL, ScalarEvolution &SE,
4157 SmallVectorImpl<unsigned> &SortedIndices,
4158 Instruction *Inst = nullptr) {
4159 SmallVector<const SCEV *> SCEVs;
4160 const SCEV *PtrSCEVLowest = nullptr;
4161 const SCEV *PtrSCEVHighest = nullptr;
4162 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4163 // addresses).
4164 for (Value *Ptr : PointerOps) {
4165 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4166 if (!PtrSCEV)
4167 return std::nullopt;
4168 SCEVs.push_back(PtrSCEV);
4169 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4170 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4171 continue;
4172 }
4173 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4174 if (isa<SCEVCouldNotCompute>(Diff))
4175 return std::nullopt;
4176 if (Diff->isNonConstantNegative()) {
4177 PtrSCEVLowest = PtrSCEV;
4178 continue;
4179 }
4180 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4181 if (isa<SCEVCouldNotCompute>(Diff1))
4182 return std::nullopt;
4183 if (Diff1->isNonConstantNegative()) {
4184 PtrSCEVHighest = PtrSCEV;
4185 continue;
4186 }
4187 }
4188 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4189 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4190 if (isa<SCEVCouldNotCompute>(Dist))
4191 return std::nullopt;
4192 int Size = DL.getTypeStoreSize(ElemTy);
4193 auto TryGetStride = [&](const SCEV *Dist,
4194 const SCEV *Multiplier) -> const SCEV * {
4195 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4196 if (M->getOperand(0) == Multiplier)
4197 return M->getOperand(1);
4198 if (M->getOperand(1) == Multiplier)
4199 return M->getOperand(0);
4200 return nullptr;
4201 }
4202 if (Multiplier == Dist)
4203 return SE.getConstant(Dist->getType(), 1);
4204 return SE.getUDivExactExpr(Dist, Multiplier);
4205 };
4206 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4207 const SCEV *Stride = nullptr;
4208 if (Size != 1 || SCEVs.size() > 2) {
4209 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4210 Stride = TryGetStride(Dist, Sz);
4211 if (!Stride)
4212 return std::nullopt;
4213 }
4214 if (!Stride || isa<SCEVConstant>(Stride))
4215 return std::nullopt;
4216 // Iterate through all pointers and check if all distances are
4217 // unique multiples of Stride.
4218 using DistOrdPair = std::pair<int64_t, int>;
4219 auto Compare = llvm::less_first();
4220 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4221 int Cnt = 0;
4222 bool IsConsecutive = true;
4223 for (const SCEV *PtrSCEV : SCEVs) {
4224 unsigned Dist = 0;
4225 if (PtrSCEV != PtrSCEVLowest) {
4226 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4227 const SCEV *Coeff = TryGetStride(Diff, Stride);
4228 if (!Coeff)
4229 return std::nullopt;
4230 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4231 if (!SC || isa<SCEVCouldNotCompute>(SC))
4232 return std::nullopt;
4233 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4234 SE.getMulExpr(Stride, SC)))
4235 ->isZero())
4236 return std::nullopt;
4237 Dist = SC->getAPInt().getZExtValue();
4238 }
4239 // If the strides are not the same or repeated, we can't vectorize.
4240 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4241 return std::nullopt;
4242 auto Res = Offsets.emplace(Dist, Cnt);
4243 if (!Res.second)
4244 return std::nullopt;
4245 // Consecutive order if the inserted element is the last one.
4246 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4247 ++Cnt;
4248 }
4249 if (Offsets.size() != SCEVs.size())
4250 return std::nullopt;
4251 SortedIndices.clear();
4252 if (!IsConsecutive) {
4253 // Fill SortedIndices array only if it is non-consecutive.
4254 SortedIndices.resize(PointerOps.size());
4255 Cnt = 0;
4256 for (const std::pair<int64_t, int> &Pair : Offsets) {
4257 SortedIndices[Cnt] = Pair.second;
4258 ++Cnt;
4259 }
4260 }
4261 if (!Inst)
4262 return nullptr;
4263 SCEVExpander Expander(SE, DL, "strided-load-vec");
4264 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4265}
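// Example (illustrative): for pointer operands %p, %p + %s, %p + 2*%s and
// %p + 3*%s, where %s is a loop-invariant value only known at runtime, the
// SCEVs share a common non-constant stride, so the function succeeds; if
// \p Inst is given, the stride is additionally materialized as an instruction
// right before \p Inst via SCEVExpander. Purely constant strides return
// std::nullopt and are handled elsewhere via constant pointer differences.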
4266
4267 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4268 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4269 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4270 // Check that a vectorized load would load the same memory as a scalar
4271 // load. For example, we don't want to vectorize loads that are smaller
4272 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4273 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4274 // from such a struct, we read/write packed bits disagreeing with the
4275 // unvectorized version.
4276 Type *ScalarTy = VL0->getType();
4277
4278 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4279 return LoadsState::Gather;
4280
4281 // Make sure all loads in the bundle are simple - we can't vectorize
4282 // atomic or volatile loads.
4283 PointerOps.clear();
4284 const unsigned Sz = VL.size();
4285 PointerOps.resize(Sz);
4286 auto *POIter = PointerOps.begin();
4287 for (Value *V : VL) {
4288 auto *L = cast<LoadInst>(V);
4289 if (!L->isSimple())
4290 return LoadsState::Gather;
4291 *POIter = L->getPointerOperand();
4292 ++POIter;
4293 }
4294
4295 Order.clear();
4296 auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
4297 // Check the order of pointer operands or that all pointers are the same.
4298 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4299 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4300 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4301 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4302 "supported with VectorizeNonPowerOf2");
4303 return LoadsState::Gather;
4304 }
4305
4306 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4307 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4308 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4309 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4310 return LoadsState::StridedVectorize;
4311 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4312 return arePointersCompatible(P, PointerOps.front(), *TLI);
4313 })) {
4314 if (IsSorted) {
4315 Value *Ptr0;
4316 Value *PtrN;
4317 if (Order.empty()) {
4318 Ptr0 = PointerOps.front();
4319 PtrN = PointerOps.back();
4320 } else {
4321 Ptr0 = PointerOps[Order.front()];
4322 PtrN = PointerOps[Order.back()];
4323 }
4324 std::optional<int> Diff =
4325 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4326 // Check that the sorted loads are consecutive.
4327 if (static_cast<unsigned>(*Diff) == Sz - 1)
4328 return LoadsState::Vectorize;
4329 // Simple check if not a strided access - clear order.
4330 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4331 // Try to generate strided load node if:
4332 // 1. Target with strided load support is detected.
4333 // 2. The number of loads is greater than MinProfitableStridedLoads,
4334 // or the potential stride <= MaxProfitableLoadStride and the
4335 // potential stride is power-of-2 (to avoid perf regressions for the very
4336 // small number of loads) and max distance > number of loads, or potential
4337 // stride is -1.
4338 // 3. The loads are ordered, or number of unordered loads <=
4339 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4340 // (this check is to avoid extra costs for very expensive shuffles).
4341 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4342 (static_cast<unsigned>(std::abs(*Diff)) <=
4343 MaxProfitableLoadStride * Sz &&
4344 isPowerOf2_32(std::abs(*Diff)))) &&
4345 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4346 *Diff == -(static_cast<int>(Sz) - 1))) {
4347 int Stride = *Diff / static_cast<int>(Sz - 1);
4348 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4349 Align Alignment =
4350 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4351 ->getAlign();
4352 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4353 // Iterate through all pointers and check if all distances are
4354 // unique multiples of Stride.
4355 SmallSet<int, 4> Dists;
4356 for (Value *Ptr : PointerOps) {
4357 int Dist = 0;
4358 if (Ptr == PtrN)
4359 Dist = *Diff;
4360 else if (Ptr != Ptr0)
4361 Dist =
4362 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4363 // If the strides are not the same or repeated, we can't
4364 // vectorize.
4365 if (((Dist / Stride) * Stride) != Dist ||
4366 !Dists.insert(Dist).second)
4367 break;
4368 }
4369 if (Dists.size() == Sz)
4370 return LoadsState::StridedVectorize;
4371 }
4372 }
4373 }
4374 }
4375 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4376 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4377 unsigned MinVF = getMinVF(Sz);
4378 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4379 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4380 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4381 unsigned VectorizedCnt = 0;
4382 SmallVector<LoadsState> States;
4383 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4384 Cnt += VF, ++VectorizedCnt) {
4385 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4386 SmallVector<unsigned> Order;
4387 SmallVector<Value *> PointerOps;
4388 LoadsState LS =
4389 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4390 /*TryRecursiveCheck=*/false);
4391 // Check that the sorted loads are consecutive.
4392 if (LS == LoadsState::Gather)
4393 break;
4394 // If the reorder is needed - consider it as a high-cost masked gather for now.
4395 if ((LS == LoadsState::Vectorize ||
4396 LS == LoadsState::StridedVectorize) &&
4397 !Order.empty() && !isReverseOrder(Order))
4398 LS = LoadsState::ScatterVectorize;
4399 States.push_back(LS);
4400 }
4401 // Can be vectorized later as a series of loads/insertelements.
4402 if (VectorizedCnt == VL.size() / VF) {
4403 // Compare masked gather cost and loads + insertsubvector costs.
4404 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4405 InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
4406 Instruction::Load, VecTy,
4407 cast<LoadInst>(VL0)->getPointerOperand(),
4408 /*VariableMask=*/false, CommonAlignment, CostKind);
4409 InstructionCost VecLdCost = 0;
4410 auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
4411 for (auto [I, LS] : enumerate(States)) {
4412 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4413 switch (LS) {
4414 case LoadsState::Vectorize:
4415 VecLdCost += TTI.getMemoryOpCost(
4416 Instruction::Load, SubVecTy, LI0->getAlign(),
4417 LI0->getPointerAddressSpace(), CostKind,
4418 TTI::OperandValueInfo());
4419 break;
4420 case LoadsState::StridedVectorize:
4421 VecLdCost += TTI.getStridedMemoryOpCost(
4422 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4423 /*VariableMask=*/false, CommonAlignment, CostKind);
4424 break;
4425 case LoadsState::ScatterVectorize:
4426 VecLdCost += TTI.getGatherScatterOpCost(
4427 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4428 /*VariableMask=*/false, CommonAlignment, CostKind);
4429 break;
4430 case LoadsState::Gather:
4431 llvm_unreachable(
4432 "Expected only consecutive, strided or masked gather loads.");
4433 }
4434 SmallVector<int> ShuffleMask(VL.size());
4435 for (int Idx : seq<int>(0, VL.size()))
4436 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4437 VecLdCost +=
4438 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
4439 ShuffleMask, CostKind, I * VF, SubVecTy);
4440 }
4441 // If masked gather cost is higher - better to vectorize, so
4442 // consider it as a gather node. It will be better estimated
4443 // later.
4444 if (MaskedGatherCost > VecLdCost)
4445 return true;
4446 }
4447 }
4448 return false;
4449 };
4450 // TODO: need to improve analysis of the pointers, if not all of them are
4451 // GEPs or have > 2 operands, we end up with a gather node, which just
4452 // increases the cost.
4453 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4454 bool ProfitableGatherPointers =
4455 L && Sz > 2 &&
4456 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4457 return L->isLoopInvariant(V);
4458 })) <= Sz / 2;
4459 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4460 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4461 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4462 (GEP && GEP->getNumOperands() == 2 &&
4463 isa<Constant, Instruction>(GEP->getOperand(1)));
4464 })) {
4465 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4466 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4467 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4468 // Check if potential masked gather can be represented as series
4469 // of loads + insertsubvectors.
4470 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4471 // If masked gather cost is higher - better to vectorize, so
4472 // consider it as a gather node. It will be better estimated
4473 // later.
4474 return LoadsState::Gather;
4475 }
4476 return LoadsState::ScatterVectorize;
4477 }
4478 }
4479 }
4480
4481 return LoadsState::Gather;
4482}
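// Summary of the decision above (descriptive): the bundle is classified as
//  * LoadsState::Vectorize        - the sorted pointers are consecutive;
//  * LoadsState::StridedVectorize - a constant or runtime stride was found and
//                                   the target supports strided loads;
//  * LoadsState::ScatterVectorize - a masked gather is legal, not forced to be
//                                   scalarized, and not beaten by the
//                                   loads + insertsubvector alternative;
//  * LoadsState::Gather           - everything else.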
4483
4484 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4485 const DataLayout &DL, ScalarEvolution &SE,
4486 SmallVectorImpl<unsigned> &SortedIndices) {
4487 assert(llvm::all_of(
4488 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4489 "Expected list of pointer operands.");
4490 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4491 // Ptr into, sort and return the sorted indices with values next to one
4492 // another.
4493 DenseMap<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4494 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4495
4496 unsigned Cnt = 1;
4497 for (Value *Ptr : VL.drop_front()) {
4498 bool Found = any_of(Bases, [&](auto &Base) {
4499 std::optional<int> Diff =
4500 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4501 /*StrictCheck=*/true);
4502 if (!Diff)
4503 return false;
4504
4505 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4506 return true;
4507 });
4508
4509 if (!Found) {
4510 // If we haven't found enough to usefully cluster, return early.
4511 if (Bases.size() > VL.size() / 2 - 1)
4512 return false;
4513
4514 // Not found already - add a new Base
4515 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4516 }
4517 }
4518
4519 // For each of the bases sort the pointers by Offset and check if any of the
4520 // bases becomes consecutively allocated.
4521 bool AnyConsecutive = false;
4522 for (auto &Base : Bases) {
4523 auto &Vec = Base.second;
4524 if (Vec.size() > 1) {
4525 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4526 const std::tuple<Value *, int, unsigned> &Y) {
4527 return std::get<1>(X) < std::get<1>(Y);
4528 });
4529 int InitialOffset = std::get<1>(Vec[0]);
4530 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4531 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4532 });
4533 }
4534 }
4535
4536 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4537 SortedIndices.clear();
4538 if (!AnyConsecutive)
4539 return false;
4540
4541 for (auto &Base : Bases) {
4542 for (auto &T : Base.second)
4543 SortedIndices.push_back(std::get<2>(T));
4544 }
4545
4546 assert(SortedIndices.size() == VL.size() &&
4547 "Expected SortedIndices to be the size of VL");
4548 return true;
4549}
4550
4551std::optional<BoUpSLP::OrdersType>
4552BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4553 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4554 Type *ScalarTy = TE.Scalars[0]->getType();
4555
4556 SmallVector<Value *> Ptrs;
4557 Ptrs.reserve(TE.Scalars.size());
4558 for (Value *V : TE.Scalars) {
4559 auto *L = dyn_cast<LoadInst>(V);
4560 if (!L || !L->isSimple())
4561 return std::nullopt;
4562 Ptrs.push_back(L->getPointerOperand());
4563 }
4564
4565 BoUpSLP::OrdersType Order;
4566 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4567 return std::move(Order);
4568 return std::nullopt;
4569}
4570
4571/// Check if two insertelement instructions are from the same buildvector.
4572static bool areTwoInsertFromSameBuildVector(
4573 InsertElementInst *VU, InsertElementInst *V,
4574 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4575 // Instructions must be from the same basic blocks.
4576 if (VU->getParent() != V->getParent())
4577 return false;
4578 // Checks if 2 insertelements are from the same buildvector.
4579 if (VU->getType() != V->getType())
4580 return false;
4581 // Multiple used inserts are separate nodes.
4582 if (!VU->hasOneUse() && !V->hasOneUse())
4583 return false;
4584 auto *IE1 = VU;
4585 auto *IE2 = V;
4586 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4587 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4588 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4589 return false;
4590 // Go through the vector operand of insertelement instructions trying to find
4591 // either VU as the original vector for IE2 or V as the original vector for
4592 // IE1.
4593 SmallBitVector ReusedIdx(
4594 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
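 // ReusedIdx records the insert positions seen while walking both chains;
 // seeing the same position twice means the two chains cannot belong to a
 // single buildvector sequence.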
4595 bool IsReusedIdx = false;
4596 do {
4597 if (IE2 == VU && !IE1)
4598 return VU->hasOneUse();
4599 if (IE1 == V && !IE2)
4600 return V->hasOneUse();
4601 if (IE1 && IE1 != V) {
4602 unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4603 IsReusedIdx |= ReusedIdx.test(Idx1);
4604 ReusedIdx.set(Idx1);
4605 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4606 IE1 = nullptr;
4607 else
4608 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4609 }
4610 if (IE2 && IE2 != VU) {
4611 unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4612 IsReusedIdx |= ReusedIdx.test(Idx2);
4613 ReusedIdx.set(Idx2);
4614 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4615 IE2 = nullptr;
4616 else
4617 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4618 }
4619 } while (!IsReusedIdx && (IE1 || IE2));
4620 return false;
4621}
4622
4623std::optional<BoUpSLP::OrdersType>
4624BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4625 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4626 if (TE.isNonPowOf2Vec())
4627 return std::nullopt;
4628
4629 // No need to reorder if we need to shuffle reuses; we still need to shuffle the
4630 // node.
4631 if (!TE.ReuseShuffleIndices.empty()) {
4632 if (isSplat(TE.Scalars))
4633 return std::nullopt;
4634 // Check if reuse shuffle indices can be improved by reordering.
4635 // For this, check that the reuse mask is "clustered", i.e. each scalar value
4636 // is used once in each submask of size <number_of_scalars>.
4637 // Example: 4 scalar values.
4638 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4639 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4640 // element 3 is used twice in the second submask.
4641 unsigned Sz = TE.Scalars.size();
4642 if (TE.State == TreeEntry::NeedToGather) {
4643 if (std::optional<OrdersType> CurrentOrder =
4644 findReusedOrderedScalars(TE)) {
4645 SmallVector<int> Mask;
4646 fixupOrderingIndices(*CurrentOrder);
4647 inversePermutation(*CurrentOrder, Mask);
4648 ::addMask(Mask, TE.ReuseShuffleIndices);
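 // Replicate the found order into every submask of size Sz so the whole
 // reuse mask of getVectorFactor() elements is permuted consistently.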
4649 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4650 unsigned Sz = TE.Scalars.size();
4651 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4652 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4653 if (Idx != PoisonMaskElem)
4654 Res[Idx + K * Sz] = I + K * Sz;
4655 }
4656 return std::move(Res);
4657 }
4658 }
4659 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4660 TTI->getNumberOfParts(FixedVectorType::get(
4661 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4662 return std::nullopt;
4663 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4664 Sz)) {
4665 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4666 if (TE.ReorderIndices.empty())
4667 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4668 else
4669 inversePermutation(TE.ReorderIndices, ReorderMask);
4670 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4671 unsigned VF = ReorderMask.size();
4672 OrdersType ResOrder(VF, VF);
4673 unsigned NumParts = VF / Sz;
4674 SmallBitVector UsedVals(NumParts);
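 // Each submask of size Sz must reference a single source part, every part
 // may be used only once, and at most half of a submask may be undef;
 // otherwise the reuse mask cannot be expressed as an order.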
4675 for (unsigned I = 0; I < VF; I += Sz) {
4676 int Val = PoisonMaskElem;
4677 unsigned UndefCnt = 0;
4678 if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
4679 [&](int Idx) {
4680 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4681 Val = Idx;
4682 if (Idx == PoisonMaskElem)
4683 ++UndefCnt;
4684 return Idx != PoisonMaskElem && Idx != Val;
4685 }) ||
4686 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4687 UndefCnt > Sz / 2)
4688 return std::nullopt;
4689 UsedVals.set(Val);
4690 for (unsigned K = 0; K < NumParts; ++K)
4691 ResOrder[Val + Sz * K] = I + K;
4692 }
4693 return std::move(ResOrder);
4694 }
4695 unsigned VF = TE.getVectorFactor();
4696 // Try to build the correct order for extractelement instructions.
4697 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4698 TE.ReuseShuffleIndices.end());
4699 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4700 all_of(TE.Scalars, [Sz](Value *V) {
4701 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4702 return Idx && *Idx < Sz;
4703 })) {
4704 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4705 if (TE.ReorderIndices.empty())
4706 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4707 else
4708 inversePermutation(TE.ReorderIndices, ReorderMask);
4709 for (unsigned I = 0; I < VF; ++I) {
4710 int &Idx = ReusedMask[I];
4711 if (Idx == PoisonMaskElem)
4712 continue;
4713 Value *V = TE.Scalars[ReorderMask[Idx]];
4714 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4715 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4716 }
4717 }
4718 // Build the order of the VF size; need to reorder the reuses shuffles, as
4719 // they are always of VF size.
4720 OrdersType ResOrder(VF);
4721 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4722 auto *It = ResOrder.begin();
4723 for (unsigned K = 0; K < VF; K += Sz) {
4724 OrdersType CurrentOrder(TE.ReorderIndices);
4725 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4726 if (SubMask.front() == PoisonMaskElem)
4727 std::iota(SubMask.begin(), SubMask.end(), 0);
4728 reorderOrder(CurrentOrder, SubMask);
4729 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4730 std::advance(It, Sz);
4731 }
4732 if (TE.State == TreeEntry::NeedToGather &&
4733 all_of(enumerate(ResOrder),
4734 [](const auto &Data) { return Data.index() == Data.value(); }))
4735 return std::nullopt; // No need to reorder.
4736 return std::move(ResOrder);
4737 }
4738 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4739 any_of(TE.UserTreeIndices,
4740 [](const EdgeInfo &EI) {
4741 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4742 }) &&
4743 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
4744 return std::nullopt;
4745 if ((TE.State == TreeEntry::Vectorize ||
4746 TE.State == TreeEntry::StridedVectorize) &&
4747 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4748 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4749 !TE.isAltShuffle())
4750 return TE.ReorderIndices;
4751 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4752 auto PHICompare = [&](unsigned I1, unsigned I2) {
4753 Value *V1 = TE.Scalars[I1];
4754 Value *V2 = TE.Scalars[I2];
4755 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4756 return false;
4757 if (V1->getNumUses() < V2->getNumUses())
4758 return true;
4759 if (V1->getNumUses() > V2->getNumUses())
4760 return false;
4761 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4762 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4763 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4764 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4765 if (areTwoInsertFromSameBuildVector(
4766 IE1, IE2,
4767 [](InsertElementInst *II) { return II->getOperand(0); }))
4768 return I1 < I2;
4769 return getInsertIndex(IE1) < getInsertIndex(IE2);
4770 }
4771 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4772 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4773 if (EE1->getOperand(0) != EE2->getOperand(0))
4774 return I1 < I2;
4775 return getInsertIndex(EE1) < getInsertIndex(EE2);
4776 }
4777 return I1 < I2;
4778 };
4779 auto IsIdentityOrder = [](const OrdersType &Order) {
4780 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4781 if (Idx != Order[Idx])
4782 return false;
4783 return true;
4784 };
4785 if (!TE.ReorderIndices.empty())
4786 return TE.ReorderIndices;
4787 DenseMap<unsigned, unsigned> PhiToId;
4788 SmallVector<unsigned> Phis(TE.Scalars.size());
4789 std::iota(Phis.begin(), Phis.end(), 0);
4790 OrdersType ResOrder(TE.Scalars.size());
4791 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4792 PhiToId[Id] = Id;
4793 stable_sort(Phis, PHICompare);
4794 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4795 ResOrder[Id] = PhiToId[Phis[Id]];
4796 if (IsIdentityOrder(ResOrder))
4797 return std::nullopt; // No need to reorder.
4798 return std::move(ResOrder);
4799 }
4800 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4801 allSameType(TE.Scalars)) {
4802 // TODO: add analysis of other gather nodes with extractelement
4803 // instructions and other values/instructions, not only undefs.
4804 if ((TE.getOpcode() == Instruction::ExtractElement ||
4805 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4806 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4807 all_of(TE.Scalars, [](Value *V) {
4808 auto *EE = dyn_cast<ExtractElementInst>(V);
4809 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4810 })) {
4811 // Check that gather of extractelements can be represented as
4812 // just a shuffle of a single vector.
4813 OrdersType CurrentOrder;
4814 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4815 /*ResizeAllowed=*/true);
4816 if (Reuse || !CurrentOrder.empty())
4817 return std::move(CurrentOrder);
4818 }
4819 // If the gather node is <undef, v, .., poison> and
4820 // insertelement poison, v, 0 [+ permute]
4821 // is cheaper than
4822 // insertelement poison, v, n - try to reorder.
4823 // If rotating the whole graph, exclude the permute cost, the whole graph
4824 // might be transformed.
4825 int Sz = TE.Scalars.size();
4826 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4827 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4828 const auto *It =
4829 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4830 if (It == TE.Scalars.begin())
4831 return OrdersType();
4832 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4833 if (It != TE.Scalars.end()) {
4834 OrdersType Order(Sz, Sz);
4835 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4836 Order[Idx] = 0;
4837 fixupOrderingIndices(Order);
4838 SmallVector<int> Mask;
4839 inversePermutation(Order, Mask);
4840 InstructionCost PermuteCost =
4841 TopToBottom
4842 ? 0
4843 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
4844 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4845 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4846 PoisonValue::get(Ty), *It);
4847 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4848 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4849 PoisonValue::get(Ty), *It);
4850 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4851 OrdersType Order(Sz, Sz);
4852 Order[Idx] = 0;
4853 return std::move(Order);
4854 }
4855 }
4856 }
4857 if (isSplat(TE.Scalars))
4858 return std::nullopt;
4859 if (TE.Scalars.size() >= 4)
4860 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4861 return Order;
4862 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4863 return CurrentOrder;
4864 }
4865 return std::nullopt;
4866}
4867
4868/// Checks if the given mask is a "clustered" mask with the same clusters of
4869/// size \p Sz, which are not identity submasks.
4870static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4871 unsigned Sz) {
4872 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4873 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
4874 return false;
4875 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4876 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4877 if (Cluster != FirstCluster)
4878 return false;
4879 }
4880 return true;
4881}
4882
4883void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4884 // Reorder reuses mask.
4885 reorderReuses(TE.ReuseShuffleIndices, Mask);
4886 const unsigned Sz = TE.Scalars.size();
4887 // For vectorized and non-clustered reuses, no need to do anything else.
4888 if (TE.State != TreeEntry::NeedToGather ||
4889 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4890 Sz) ||
4891 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4892 return;
4893 SmallVector<int> NewMask;
4894 inversePermutation(TE.ReorderIndices, NewMask);
4895 addMask(NewMask, TE.ReuseShuffleIndices);
4896 // Clear reorder since it is going to be applied to the new mask.
4897 TE.ReorderIndices.clear();
4898 // Try to improve gathered nodes with clustered reuses, if possible.
4899 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4900 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4901 inversePermutation(NewOrder, NewMask);
4902 reorderScalars(TE.Scalars, NewMask);
4903 // Fill the reuses mask with the identity submasks.
4904 for (auto *It = TE.ReuseShuffleIndices.begin(),
4905 *End = TE.ReuseShuffleIndices.end();
4906 It != End; std::advance(It, Sz))
4907 std::iota(It, std::next(It, Sz), 0);
4908}
4909
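/// Fills the undefined (equal to the size) positions of \p Order either with
/// the identity index or, if \p SecondaryOrder is provided, with the matching
/// secondary index, as long as the chosen index is not used already.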
4910static void combineOrders(MutableArrayRef<unsigned> Order,
4911 ArrayRef<unsigned> SecondaryOrder) {
4912 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
4913 "Expected same size of orders");
4914 unsigned Sz = Order.size();
4915 SmallBitVector UsedIndices(Sz);
4916 for (unsigned Idx : seq<unsigned>(0, Sz)) {
4917 if (Order[Idx] != Sz)
4918 UsedIndices.set(Order[Idx]);
4919 }
4920 if (SecondaryOrder.empty()) {
4921 for (unsigned Idx : seq<unsigned>(0, Sz))
4922 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
4923 Order[Idx] = Idx;
4924 } else {
4925 for (unsigned Idx : seq<unsigned>(0, Sz))
4926 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
4927 !UsedIndices.test(SecondaryOrder[Idx]))
4928 Order[Idx] = SecondaryOrder[Idx];
4929 }
4930}
4931
4932void BoUpSLP::reorderTopToBottom() {
4933 // Maps VF to the graph nodes.
4934 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
4935 // ExtractElement gather nodes which can be vectorized and need to handle
4936 // their ordering.
4937 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4938
4939 // Phi nodes can have preferred ordering based on their result users.
4940 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
4941
4942 // AltShuffles can also have a preferred ordering that leads to fewer
4943 // instructions, e.g., the addsub instruction in x86.
4944 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4945
4946 // Maps a TreeEntry to the reorder indices of external users.
4947 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
4948 ExternalUserReorderMap;
4949 // Find all reorderable nodes with the given VF.
4950 // Currently these are vectorized stores, loads, extracts + some gathering
4951 // of extracts.
4952 for_each(VectorizableTree, [&, &TTIRef = *TTI](
4953 const std::unique_ptr<TreeEntry> &TE) {
4954 // Look for external users that will probably be vectorized.
4955 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4956 findExternalStoreUsersReorderIndices(TE.get());
4957 if (!ExternalUserReorderIndices.empty()) {
4958 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4959 ExternalUserReorderMap.try_emplace(TE.get(),
4960 std::move(ExternalUserReorderIndices));
4961 }
4962
4963 // Patterns like [fadd,fsub] can be combined into a single instruction in
4964 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4965 // to take into account their order when looking for the most used order.
4966 if (TE->isAltShuffle()) {
4967 VectorType *VecTy =
4968 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
4969 unsigned Opcode0 = TE->getOpcode();
4970 unsigned Opcode1 = TE->getAltOpcode();
4971 // The opcode mask selects between the two opcodes.
4972 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4973 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4974 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4975 OpcodeMask.set(Lane);
4976 // If this pattern is supported by the target then we consider the order.
4977 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4978 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4979 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
4980 }
4981 // TODO: Check the reverse order too.
4982 }
4983
4984 if (std::optional<OrdersType> CurrentOrder =
4985 getReorderingData(*TE, /*TopToBottom=*/true)) {
4986 // Do not include ordering for nodes used in the alt opcode vectorization,
4987 // better to reorder them during the bottom-to-top stage. If we follow the
4988 // order here, it causes reordering of the whole graph, though actually it is
4989 // profitable just to reorder the subgraph that starts from the alternate
4990 // opcode vectorization node. Such nodes already end up with the shuffle
4991 // instruction and it is just enough to change this shuffle rather than
4992 // rotate the scalars for the whole graph.
4993 unsigned Cnt = 0;
4994 const TreeEntry *UserTE = TE.get();
4995 while (UserTE && Cnt < RecursionMaxDepth) {
4996 if (UserTE->UserTreeIndices.size() != 1)
4997 break;
4998 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
4999 return EI.UserTE->State == TreeEntry::Vectorize &&
5000 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5001 }))
5002 return;
5003 UserTE = UserTE->UserTreeIndices.back().UserTE;
5004 ++Cnt;
5005 }
5006 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5007 if (!(TE->State == TreeEntry::Vectorize ||
5008 TE->State == TreeEntry::StridedVectorize) ||
5009 !TE->ReuseShuffleIndices.empty())
5010 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5011 if (TE->State == TreeEntry::Vectorize &&
5012 TE->getOpcode() == Instruction::PHI)
5013 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5014 }
5015 });
5016
5017 // Reorder the graph nodes according to their vectorization factor.
5018 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5019 VF /= 2) {
5020 auto It = VFToOrderedEntries.find(VF);
5021 if (It == VFToOrderedEntries.end())
5022 continue;
5023 // Try to find the most profitable order. We are just looking for the most
5024 // used order and reorder scalar elements in the nodes according to this
5025 // most used order.
5026 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5027 // All operands are reordered and used only in this node - propagate the
5028 // most used order to the user node.
5029 MapVector<OrdersType, unsigned,
5030 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5031 OrdersUses;
5033 for (const TreeEntry *OpTE : OrderedEntries) {
5034 // No need to reorder these nodes; still need to extend and to use shuffle,
5035 // just need to merge reordering shuffle and the reuse shuffle.
5036 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5037 continue;
5038 // Count number of orders uses.
5039 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5040 &PhisToOrders]() -> const OrdersType & {
5041 if (OpTE->State == TreeEntry::NeedToGather ||
5042 !OpTE->ReuseShuffleIndices.empty()) {
5043 auto It = GathersToOrders.find(OpTE);
5044 if (It != GathersToOrders.end())
5045 return It->second;
5046 }
5047 if (OpTE->isAltShuffle()) {
5048 auto It = AltShufflesToOrders.find(OpTE);
5049 if (It != AltShufflesToOrders.end())
5050 return It->second;
5051 }
5052 if (OpTE->State == TreeEntry::Vectorize &&
5053 OpTE->getOpcode() == Instruction::PHI) {
5054 auto It = PhisToOrders.find(OpTE);
5055 if (It != PhisToOrders.end())
5056 return It->second;
5057 }
5058 return OpTE->ReorderIndices;
5059 }();
5060 // First consider the order of the external scalar users.
5061 auto It = ExternalUserReorderMap.find(OpTE);
5062 if (It != ExternalUserReorderMap.end()) {
5063 const auto &ExternalUserReorderIndices = It->second;
5064 // If the OpTE vector factor != number of scalars - use natural order,
5065 // it is an attempt to reorder node with reused scalars but with
5066 // external uses.
5067 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5068 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5069 ExternalUserReorderIndices.size();
5070 } else {
5071 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5072 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5073 }
5074 // No other useful reorder data in this entry.
5075 if (Order.empty())
5076 continue;
5077 }
5078 // Stores actually store the mask, not the order, need to invert.
5079 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5080 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5081 SmallVector<int> Mask;
5082 inversePermutation(Order, Mask);
5083 unsigned E = Order.size();
5084 OrdersType CurrentOrder(E, E);
5085 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5086 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5087 });
5088 fixupOrderingIndices(CurrentOrder);
5089 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5090 } else {
5091 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5092 }
5093 }
5094 if (OrdersUses.empty())
5095 continue;
5096 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5097 const unsigned Sz = Order.size();
5098 for (unsigned Idx : seq<unsigned>(0, Sz))
5099 if (Idx != Order[Idx] && Order[Idx] != Sz)
5100 return false;
5101 return true;
5102 };
5103 // Choose the most used order.
5104 unsigned IdentityCnt = 0;
5105 unsigned FilledIdentityCnt = 0;
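 // IdentityCnt counts uses of both the empty and the explicitly filled
 // identity orders; FilledIdentityCnt counts only the filled ones.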
5106 OrdersType IdentityOrder(VF, VF);
5107 for (auto &Pair : OrdersUses) {
5108 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5109 if (!Pair.first.empty())
5110 FilledIdentityCnt += Pair.second;
5111 IdentityCnt += Pair.second;
5112 combineOrders(IdentityOrder, Pair.first);
5113 }
5114 }
5115 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5116 unsigned Cnt = IdentityCnt;
5117 for (auto &Pair : OrdersUses) {
5118 // Prefer identity order. But, if a filled identity order (non-empty) was
5119 // found with the same number of uses as the new candidate order, we can
5120 // choose this candidate order.
5121 if (Cnt < Pair.second ||
5122 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5123 Cnt == Pair.second && !BestOrder.empty() &&
5124 IsIdentityOrder(BestOrder))) {
5125 combineOrders(Pair.first, BestOrder);
5126 BestOrder = Pair.first;
5127 Cnt = Pair.second;
5128 } else {
5129 combineOrders(BestOrder, Pair.first);
5130 }
5131 }
5132 // Set order of the user node.
5133 if (IsIdentityOrder(BestOrder))
5134 continue;
5135 fixupOrderingIndices(BestOrder);
5136 SmallVector<int> Mask;
5137 inversePermutation(BestOrder, Mask);
5138 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5139 unsigned E = BestOrder.size();
5140 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5141 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5142 });
5143 // Do an actual reordering, if profitable.
5144 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5145 // Just do the reordering for the nodes with the given VF.
5146 if (TE->Scalars.size() != VF) {
5147 if (TE->ReuseShuffleIndices.size() == VF) {
5148 // Need to reorder the reuses masks of the operands with smaller VF to
5149 // be able to find the match between the graph nodes and scalar
5150 // operands of the given node during vectorization/cost estimation.
5151 assert(all_of(TE->UserTreeIndices,
5152 [VF, &TE](const EdgeInfo &EI) {
5153 return EI.UserTE->Scalars.size() == VF ||
5154 EI.UserTE->Scalars.size() ==
5155 TE->Scalars.size();
5156 }) &&
5157 "All users must be of VF size.");
5158 // Update ordering of the operands with the smaller VF than the given
5159 // one.
5160 reorderNodeWithReuses(*TE, Mask);
5161 }
5162 continue;
5163 }
5164 if ((TE->State == TreeEntry::Vectorize ||
5165 TE->State == TreeEntry::StridedVectorize) &&
5166 isa<LoadInst, StoreInst, ExtractElementInst, ExtractValueInst,
5167 InsertElementInst>(TE->getMainOp()) &&
5168 !TE->isAltShuffle()) {
5169 // Build correct orders for extract{element,value}, loads and
5170 // stores.
5171 reorderOrder(TE->ReorderIndices, Mask);
5172 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5173 TE->reorderOperands(Mask);
5174 } else {
5175 // Reorder the node and its operands.
5176 TE->reorderOperands(Mask);
5177 assert(TE->ReorderIndices.empty() &&
5178 "Expected empty reorder sequence.");
5179 reorderScalars(TE->Scalars, Mask);
5180 }
5181 if (!TE->ReuseShuffleIndices.empty()) {
5182 // Apply reversed order to keep the original ordering of the reused
5183 // elements to avoid extra reorder indices shuffling.
5184 OrdersType CurrentOrder;
5185 reorderOrder(CurrentOrder, MaskOrder);
5186 SmallVector<int> NewReuses;
5187 inversePermutation(CurrentOrder, NewReuses);
5188 addMask(NewReuses, TE->ReuseShuffleIndices);
5189 TE->ReuseShuffleIndices.swap(NewReuses);
5190 }
5191 }
5192 }
5193}
5194
5195bool BoUpSLP::canReorderOperands(
5196 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5197 ArrayRef<TreeEntry *> ReorderableGathers,
5198 SmallVectorImpl<TreeEntry *> &GatherOps) {
5199 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5200 if (UserTE->isNonPowOf2Vec())
5201 return false;
5202
5203 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5204 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5205 return OpData.first == I &&
5206 (OpData.second->State == TreeEntry::Vectorize ||
5207 OpData.second->State == TreeEntry::StridedVectorize);
5208 }))
5209 continue;
5210 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5211 // Do not reorder if operand node is used by many user nodes.
5212 if (any_of(TE->UserTreeIndices,
5213 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5214 return false;
5215 // Add the node to the list of the ordered nodes with the identity
5216 // order.
5217 Edges.emplace_back(I, TE);
5218 // Add ScatterVectorize nodes to the list of operands, where just
5219 // reordering of the scalars is required. Similar to the gathers, so
5220 // simply add to the list of gathered ops.
5221 // If there are reused scalars, process this node as a regular vectorize
5222 // node, just reorder reuses mask.
5223 if (TE->State != TreeEntry::Vectorize &&
5224 TE->State != TreeEntry::StridedVectorize &&
5225 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5226 GatherOps.push_back(TE);
5227 continue;
5228 }
5229 TreeEntry *Gather = nullptr;
5230 if (count_if(ReorderableGathers,
5231 [&Gather, UserTE, I](TreeEntry *TE) {
5232 assert(TE->State != TreeEntry::Vectorize &&
5233 TE->State != TreeEntry::StridedVectorize &&
5234 "Only non-vectorized nodes are expected.");
5235 if (any_of(TE->UserTreeIndices,
5236 [UserTE, I](const EdgeInfo &EI) {
5237 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5238 })) {
5239 assert(TE->isSame(UserTE->getOperand(I)) &&
5240 "Operand entry does not match operands.");
5241 Gather = TE;
5242 return true;
5243 }
5244 return false;
5245 }) > 1 &&
5246 !allConstant(UserTE->getOperand(I)))
5247 return false;
5248 if (Gather)
5249 GatherOps.push_back(Gather);
5250 }
5251 return true;
5252}
5253
5254void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5255 SetVector<TreeEntry *> OrderedEntries;
5256 DenseSet<const TreeEntry *> GathersToOrders;
5257 // Find all reorderable leaf nodes with the given VF.
5258 // Currently these are vectorized loads, extracts without alternate operands +
5259 // some gathering of extracts.
5260 SmallVector<TreeEntry *> NonVectorized;
5261 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5262 if (TE->State != TreeEntry::Vectorize &&
5263 TE->State != TreeEntry::StridedVectorize)
5264 NonVectorized.push_back(TE.get());
5265 if (std::optional<OrdersType> CurrentOrder =
5266 getReorderingData(*TE, /*TopToBottom=*/false)) {
5267 OrderedEntries.insert(TE.get());
5268 if (!(TE->State == TreeEntry::Vectorize ||
5269 TE->State == TreeEntry::StridedVectorize) ||
5270 !TE->ReuseShuffleIndices.empty())
5271 GathersToOrders.insert(TE.get());
5272 }
5273 }
5274
5275 // 1. Propagate order to the graph nodes, which use only reordered nodes.
5276 // I.e., if the node has operands, that are reordered, try to make at least
5277 // one operand order in the natural order and reorder others + reorder the
5278 // user node itself.
5279 SmallPtrSet<const TreeEntry *, 4> Visited;
5280 while (!OrderedEntries.empty()) {
5281 // 1. Filter out only reordered nodes.
5282 // 2. If the entry has multiple uses - skip it and jump to the next node.
5283 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5284 SmallVector<TreeEntry *> Filtered;
5285 for (TreeEntry *TE : OrderedEntries) {
5286 if (!(TE->State == TreeEntry::Vectorize ||
5287 TE->State == TreeEntry::StridedVectorize ||
5288 (TE->State == TreeEntry::NeedToGather &&
5289 GathersToOrders.contains(TE))) ||
5290 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5291 !all_of(drop_begin(TE->UserTreeIndices),
5292 [TE](const EdgeInfo &EI) {
5293 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5294 }) ||
5295 !Visited.insert(TE).second) {
5296 Filtered.push_back(TE);
5297 continue;
5298 }
5299 // Build a map between user nodes and their operands order to speedup
5300 // search. The graph currently does not provide this dependency directly.
5301 for (EdgeInfo &EI : TE->UserTreeIndices) {
5302 TreeEntry *UserTE = EI.UserTE;
5303 auto It = Users.find(UserTE);
5304 if (It == Users.end())
5305 It = Users.insert({UserTE, {}}).first;
5306 It->second.emplace_back(EI.EdgeIdx, TE);
5307 }
5308 }
5309 // Erase filtered entries.
5310 for (TreeEntry *TE : Filtered)
5311 OrderedEntries.remove(TE);
5312 SmallVector<
5313 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5314 UsersVec(Users.begin(), Users.end());
5315 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5316 return Data1.first->Idx > Data2.first->Idx;
5317 });
5318 for (auto &Data : UsersVec) {
5319 // Check that operands are used only in the User node.
5320 SmallVector<TreeEntry *> GatherOps;
5321 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5322 GatherOps)) {
5323 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5324 OrderedEntries.remove(Op.second);
5325 continue;
5326 }
5327 // All operands are reordered and used only in this node - propagate the
5328 // most used order to the user node.
5329 MapVector<OrdersType, unsigned,
5330 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5331 OrdersUses;
5332 // Do the analysis for each tree entry only once, otherwise the order of
5333 // the same node may be considered several times, though it might not be
5334 // profitable.
5335 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5336 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
5337 for (const auto &Op : Data.second) {
5338 TreeEntry *OpTE = Op.second;
5339 if (!VisitedOps.insert(OpTE).second)
5340 continue;
5341 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5342 continue;
5343 const auto Order = [&]() -> const OrdersType {
5344 if (OpTE->State == TreeEntry::NeedToGather ||
5345 !OpTE->ReuseShuffleIndices.empty())
5346 return getReorderingData(*OpTE, /*TopToBottom=*/false)
5347 .value_or(OrdersType(1));
5348 return OpTE->ReorderIndices;
5349 }();
5350 // The order is partially ordered, skip it in favor of fully non-ordered
5351 // orders.
5352 if (Order.size() == 1)
5353 continue;
5354 unsigned NumOps = count_if(
5355 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5356 return P.second == OpTE;
5357 });
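 // NumOps is the number of operand slots of the user fed by this entry; the
 // entry's order gets that many votes below.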
5358 // Stores actually store the mask, not the order, need to invert.
5359 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5360 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5361 SmallVector<int> Mask;
5362 inversePermutation(Order, Mask);
5363 unsigned E = Order.size();
5364 OrdersType CurrentOrder(E, E);
5365 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5366 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5367 });
5368 fixupOrderingIndices(CurrentOrder);
5369 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5370 NumOps;
5371 } else {
5372 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5373 }
5374 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
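 // Res refers to the counter of the empty (natural) order; it is incremented
 // for every user that prefers to keep its operands in the natural order.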
5375 const auto AllowsReordering = [&](const TreeEntry *TE) {
5376 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5377 if (TE->isNonPowOf2Vec())
5378 return false;
5379 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5380 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5381 (IgnoreReorder && TE->Idx == 0))
5382 return true;
5383 if (TE->State == TreeEntry::NeedToGather) {
5384 if (GathersToOrders.contains(TE))
5385 return !getReorderingData(*TE, /*TopToBottom=*/false)
5386 .value_or(OrdersType(1))
5387 .empty();
5388 return true;
5389 }
5390 return false;
5391 };
5392 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5393 TreeEntry *UserTE = EI.UserTE;
5394 if (!VisitedUsers.insert(UserTE).second)
5395 continue;
5396 // May reorder user node if it requires reordering, has reused
5397 // scalars, is an alternate op vectorize node or its op nodes require
5398 // reordering.
5399 if (AllowsReordering(UserTE))
5400 continue;
5401 // Check if users allow reordering.
5402 // Currently look up just 1 level of operands to avoid increase of
5403 // the compile time.
5404 // Profitable to reorder if definitely more operands allow
5405 // reordering rather than those with natural order.
5406 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
5407 if (static_cast<unsigned>(count_if(
5408 Ops, [UserTE, &AllowsReordering](
5409 const std::pair<unsigned, TreeEntry *> &Op) {
5410 return AllowsReordering(Op.second) &&
5411 all_of(Op.second->UserTreeIndices,
5412 [UserTE](const EdgeInfo &EI) {
5413 return EI.UserTE == UserTE;
5414 });
5415 })) <= Ops.size() / 2)
5416 ++Res.first->second;
5417 }
5418 }
5419 if (OrdersUses.empty()) {
5420 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5421 OrderedEntries.remove(Op.second);
5422 continue;
5423 }
5424 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5425 const unsigned Sz = Order.size();
5426 for (unsigned Idx : seq<unsigned>(0, Sz))
5427 if (Idx != Order[Idx] && Order[Idx] != Sz)
5428 return false;
5429 return true;
5430 };
5431 // Choose the most used order.
5432 unsigned IdentityCnt = 0;
5433 unsigned VF = Data.second.front().second->getVectorFactor();
5434 OrdersType IdentityOrder(VF, VF);
5435 for (auto &Pair : OrdersUses) {
5436 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5437 IdentityCnt += Pair.second;
5438 combineOrders(IdentityOrder, Pair.first);
5439 }
5440 }
5441 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5442 unsigned Cnt = IdentityCnt;
5443 for (auto &Pair : OrdersUses) {
5444 // Prefer identity order. But, if a filled identity order (non-empty) was
5445 // found with the same number of uses as the new candidate order, we can
5446 // choose this candidate order.
5447 if (Cnt < Pair.second) {
5448 combineOrders(Pair.first, BestOrder);
5449 BestOrder = Pair.first;
5450 Cnt = Pair.second;
5451 } else {
5452 combineOrders(BestOrder, Pair.first);
5453 }
5454 }
5455 // Set order of the user node.
5456 if (IsIdentityOrder(BestOrder)) {
5457 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5458 OrderedEntries.remove(Op.second);
5459 continue;
5460 }
5461 fixupOrderingIndices(BestOrder);
5462 // Erase operands from OrderedEntries list and adjust their orders.
5463 VisitedOps.clear();
5464 SmallVector<int> Mask;
5465 inversePermutation(BestOrder, Mask);
5466 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5467 unsigned E = BestOrder.size();
5468 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5469 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5470 });
5471 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5472 TreeEntry *TE = Op.second;
5473 OrderedEntries.remove(TE);
5474 if (!VisitedOps.insert(TE).second)
5475 continue;
5476 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5477 reorderNodeWithReuses(*TE, Mask);
5478 continue;
5479 }
5480 // Gathers are processed separately.
5481 if (TE->State != TreeEntry::Vectorize &&
5482 TE->State != TreeEntry::StridedVectorize &&
5483 (TE->State != TreeEntry::ScatterVectorize ||
5484 TE->ReorderIndices.empty()))
5485 continue;
5486 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5487 TE->ReorderIndices.empty()) &&
5488 "Non-matching sizes of user/operand entries.");
5489 reorderOrder(TE->ReorderIndices, Mask);
5490 if (IgnoreReorder && TE == VectorizableTree.front().get())
5491 IgnoreReorder = false;
5492 }
5493 // For gathers just need to reorder its scalars.
5494 for (TreeEntry *Gather : GatherOps) {
5495 assert(Gather->ReorderIndices.empty() &&
5496 "Unexpected reordering of gathers.");
5497 if (!Gather->ReuseShuffleIndices.empty()) {
5498 // Just reorder reuses indices.
5499 reorderReuses(Gather->ReuseShuffleIndices, Mask);
5500 continue;
5501 }
5502 reorderScalars(Gather->Scalars, Mask);
5503 OrderedEntries.remove(Gather);
5504 }
5505 // Reorder operands of the user node and set the ordering for the user
5506 // node itself.
5507 if (Data.first->State != TreeEntry::Vectorize ||
5508 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5509 Data.first->getMainOp()) ||
5510 Data.first->isAltShuffle())
5511 Data.first->reorderOperands(Mask);
5512 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5513 Data.first->isAltShuffle() ||
5514 Data.first->State == TreeEntry::StridedVectorize) {
5515 reorderScalars(Data.first->Scalars, Mask);
5516 reorderOrder(Data.first->ReorderIndices, MaskOrder,
5517 /*BottomOrder=*/true);
5518 if (Data.first->ReuseShuffleIndices.empty() &&
5519 !Data.first->ReorderIndices.empty() &&
5520 !Data.first->isAltShuffle()) {
5521 // Insert user node to the list to try to sink reordering deeper in
5522 // the graph.
5523 OrderedEntries.insert(Data.first);
5524 }
5525 } else {
5526 reorderOrder(Data.first->ReorderIndices, Mask);
5527 }
5528 }
5529 }
5530 // If the reordering is unnecessary, just remove the reorder.
5531 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5532 VectorizableTree.front()->ReuseShuffleIndices.empty())
5533 VectorizableTree.front()->ReorderIndices.clear();
5534}
5535
5536void BoUpSLP::buildExternalUses(
5537 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5538 DenseMap<Value *, unsigned> ScalarToExtUses;
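 // ScalarToExtUses maps a scalar to the index of its entry in ExternalUses,
 // so repeated external users of the same scalar share a single slot.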
5539 // Collect the values that we need to extract from the tree.
5540 for (auto &TEPtr : VectorizableTree) {
5541 TreeEntry *Entry = TEPtr.get();
5542
5543 // No need to handle users of gathered values.
5544 if (Entry->State == TreeEntry::NeedToGather)
5545 continue;
5546
5547 // For each lane:
5548 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5549 Value *Scalar = Entry->Scalars[Lane];
5550 if (!isa<Instruction>(Scalar))
5551 continue;
5552 // All uses must be replaced already? No need to do it again.
5553 auto It = ScalarToExtUses.find(Scalar);
5554 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5555 continue;
5556
5557 // Check if the scalar is externally used as an extra arg.
5558 const auto *ExtI = ExternallyUsedValues.find(Scalar);
5559 if (ExtI != ExternallyUsedValues.end()) {
5560 int FoundLane = Entry->findLaneForValue(Scalar);
5561 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5562 << FoundLane << " from " << *Scalar << ".\n");
5563 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
5564 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5565 continue;
5566 }
5567 for (User *U : Scalar->users()) {
5568 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5569
5570 Instruction *UserInst = dyn_cast<Instruction>(U);
5571 if (!UserInst || isDeleted(UserInst))
5572 continue;
5573
5574 // Ignore users in the user ignore list.
5575 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5576 continue;
5577
5578 // Skip in-tree scalars that become vectors
5579 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5580 // Some in-tree scalars will remain as scalar in vectorized
5581 // instructions. If that is the case, the one in FoundLane will
5582 // be used.
5583 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5584 !doesInTreeUserNeedToExtract(
5585 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5586 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5587 << ".\n");
5588 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
5589 continue;
5590 }
5591 U = nullptr;
5592 if (It != ScalarToExtUses.end()) {
5593 ExternalUses[It->second].User = nullptr;
5594 break;
5595 }
5596 }
5597
5598 int FoundLane = Entry->findLaneForValue(Scalar);
5599 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5600 << " from lane " << FoundLane << " from " << *Scalar
5601 << ".\n");
5602 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
5603 ExternalUses.emplace_back(Scalar, U, FoundLane);
5604 if (!U)
5605 break;
5606 }
5607 }
5608 }
5609}
5610
5611DenseMap<Value *, SmallVector<StoreInst *>>
5612BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5613 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5614 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5615 Value *V = TE->Scalars[Lane];
5616 // To save compilation time we don't visit if we have too many users.
5617 if (V->hasNUsesOrMore(UsesLimit))
5618 break;
5619
5620 // Collect stores per pointer object.
5621 for (User *U : V->users()) {
5622 auto *SI = dyn_cast<StoreInst>(U);
5623 if (SI == nullptr || !SI->isSimple() ||
5624 !isValidElementType(SI->getValueOperand()->getType()))
5625 continue;
5626 // Skip entry if the store is already part of the tree.
5627 if (getTreeEntry(U))
5628 continue;
5629
5630 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5631 auto &StoresVec = PtrToStoresMap[Ptr];
5632 // For now just keep one store per pointer object per lane.
5633 // TODO: Extend this to support multiple stores per pointer per lane
5634 if (StoresVec.size() > Lane)
5635 continue;
5636 // Skip if in different BBs.
5637 if (!StoresVec.empty() &&
5638 SI->getParent() != StoresVec.back()->getParent())
5639 continue;
5640 // Make sure that the stores are of the same type.
5641 if (!StoresVec.empty() &&
5642 SI->getValueOperand()->getType() !=
5643 StoresVec.back()->getValueOperand()->getType())
5644 continue;
5645 StoresVec.push_back(SI);
5646 }
5647 }
5648 return PtrToStoresMap;
5649}
5650
5651bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5652 OrdersType &ReorderIndices) const {
5653 // We check whether the stores in StoresVec can form a vector by sorting them
5654 // and checking whether they are consecutive.
5655
5656 // To avoid calling getPointersDiff() while sorting we create a vector of
5657 // pairs {store, offset from first} and sort this instead.
5658 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5659 StoreInst *S0 = StoresVec[0];
5660 StoreOffsetVec[0] = {S0, 0};
5661 Type *S0Ty = S0->getValueOperand()->getType();
5662 Value *S0Ptr = S0->getPointerOperand();
5663 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5664 StoreInst *SI = StoresVec[Idx];
5665 std::optional<int> Diff =
5666 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5667 SI->getPointerOperand(), *DL, *SE,
5668 /*StrictCheck=*/true);
5669 // We failed to compare the pointers so just abandon this StoresVec.
5670 if (!Diff)
5671 return false;
5672 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5673 }
5674
5675 // Sort the vector based on the pointers. We create a copy because we may
5676 // need the original later for calculating the reorder (shuffle) indices.
5677 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5678 const std::pair<StoreInst *, int> &Pair2) {
5679 int Offset1 = Pair1.second;
5680 int Offset2 = Pair2.second;
5681 return Offset1 < Offset2;
5682 });
5683
5684 // Check if the stores are consecutive by checking if their difference is 1.
5685 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5686 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5687 return false;
5688
5689 // Calculate the shuffle indices according to their offset against the sorted
5690 // StoreOffsetVec.
5691 ReorderIndices.reserve(StoresVec.size());
5692 for (StoreInst *SI : StoresVec) {
5693 unsigned Idx = find_if(StoreOffsetVec,
5694 [SI](const std::pair<StoreInst *, int> &Pair) {
5695 return Pair.first == SI;
5696 }) -
5697 StoreOffsetVec.begin();
5698 ReorderIndices.push_back(Idx);
5699 }
5700 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
5701 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
5702 // same convention here.
5703 auto IsIdentityOrder = [](const OrdersType &Order) {
5704 for (unsigned Idx : seq<unsigned>(0, Order.size()))
5705 if (Idx != Order[Idx])
5706 return false;
5707 return true;
5708 };
5709 if (IsIdentityOrder(ReorderIndices))
5710 ReorderIndices.clear();
5711
5712 return true;
5713}
5714
5715#ifndef NDEBUG
5716static void dumpOrder(const BoUpSLP::OrdersType &Order) {
5717 for (unsigned Idx : Order)
5718 dbgs() << Idx << ", ";
5719 dbgs() << "\n";
5720}
5721#endif
5722
5723SmallVector<BoUpSLP::OrdersType, 1>
5724BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
5725 unsigned NumLanes = TE->Scalars.size();
5726
5727 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
5728 collectUserStores(TE);
5729
5730 // Holds the reorder indices for each candidate store vector that is a user of
5731 // the current TreeEntry.
5732 SmallVector<OrdersType, 1> ExternalReorderIndices;
5733
5734 // Now inspect the stores collected per pointer and look for vectorization
5735 // candidates. For each candidate calculate the reorder index vector and push
5736 // it into `ExternalReorderIndices`
5737 for (const auto &Pair : PtrToStoresMap) {
5738 auto &StoresVec = Pair.second;
5739 // If we have fewer than NumLanes stores, then we can't form a vector.
5740 if (StoresVec.size() != NumLanes)
5741 continue;
5742
5743 // If the stores are not consecutive then abandon this StoresVec.
5744 OrdersType ReorderIndices;
5745 if (!canFormVector(StoresVec, ReorderIndices))
5746 continue;
5747
5748 // We now know that the scalars in StoresVec can form a vector instruction,
5749 // so set the reorder indices.
5750 ExternalReorderIndices.push_back(ReorderIndices);
5751 }
5752 return ExternalReorderIndices;
5753}
5754
5755void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
5756 const SmallDenseSet<Value *> &UserIgnoreLst) {
5757 deleteTree();
5758 UserIgnoreList = &UserIgnoreLst;
5759 if (!allSameType(Roots))
5760 return;
5761 buildTree_rec(Roots, 0, EdgeInfo());
5762}
5763
5764void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
5765 deleteTree();
5766 if (!allSameType(Roots))
5767 return;
5768 buildTree_rec(Roots, 0, EdgeInfo());
5769}
5770
5771/// \return true if the specified list of values has only one instruction that
5772/// requires scheduling, false otherwise.
5773#ifndef NDEBUG
5774static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
5775 Value *NeedsScheduling = nullptr;
5776 for (Value *V : VL) {
5777 if (doesNotNeedToBeScheduled(V))
5778 continue;
5779 if (!NeedsScheduling) {
5780 NeedsScheduling = V;
5781 continue;
5782 }
5783 return false;
5784 }
5785 return NeedsScheduling;
5786}
5787#endif
5788
5789/// Generates key/subkey pair for the given value to provide effective sorting
5790/// of the values and better detection of vectorizable value sequences. The
5791/// keys/subkeys can be used for better sorting of the values themselves (keys)
5792/// and within value subgroups (subkeys).
5793static std::pair<size_t, size_t> generateKeySubkey(
5794 Value *V, const TargetLibraryInfo *TLI,
5795 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
5796 bool AllowAlternate) {
5797 hash_code Key = hash_value(V->getValueID() + 2);
5798 hash_code SubKey = hash_value(0);
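 // Key provides the coarse grouping of values for sorting; SubKey refines the
 // grouping within a Key bucket (e.g. loads by pointer distance, compares by
 // predicate kind).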
5799 // Sort the loads by the distance between the pointers.
5800 if (auto *LI = dyn_cast<LoadInst>(V)) {
5801 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
5802 if (LI->isSimple())
5803 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
5804 else
5805 Key = SubKey = hash_value(LI);
5806 } else if (isVectorLikeInstWithConstOps(V)) {
5807 // Sort extracts by the vector operands.
5808 if (isa<ExtractElementInst, UndefValue>(V))
5809 Key = hash_value(Value::UndefValueVal + 1);
5810 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
5811 if (!isUndefVector(EI->getVectorOperand()).all() &&
5812 !isa<UndefValue>(EI->getIndexOperand()))
5813 SubKey = hash_value(EI->getVectorOperand());
5814 }
5815 } else if (auto *I = dyn_cast<Instruction>(V)) {
5816 // Sort other instructions just by the opcodes except for CMPInst.
5817 // For CMP also sort by the predicate kind.
5818 if ((isa<BinaryOperator, CastInst>(I)) &&
5819 isValidForAlternation(I->getOpcode())) {
5820 if (AllowAlternate)
5821 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
5822 else
5823 Key = hash_combine(hash_value(I->getOpcode()), Key);
5824 SubKey = hash_combine(
5825 hash_value(I->getOpcode()), hash_value(I->getType()),
5826 hash_value(isa<BinaryOperator>(I)
5827 ? I->getType()
5828 : cast<CastInst>(I)->getOperand(0)->getType()));
5829 // For casts, look through the only operand to improve compile time.
5830 if (isa<CastInst>(I)) {
5831 std::pair<size_t, size_t> OpVals =
5832 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
5833 /*AllowAlternate=*/true);
5834 Key = hash_combine(OpVals.first, Key);
5835 SubKey = hash_combine(OpVals.first, SubKey);
5836 }
5837 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
5838 CmpInst::Predicate Pred = CI->getPredicate();
5839 if (CI->isCommutative())
5840 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
5841 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
5842 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
5843 hash_value(SwapPred),
5844 hash_value(CI->getOperand(0)->getType()));
5845 } else if (auto *Call = dyn_cast<CallInst>(I)) {
5846 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
5847 if (isTriviallyVectorizable(ID)) {
5848 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
5849 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
5850 SubKey = hash_combine(hash_value(I->getOpcode()),
5851 hash_value(Call->getCalledFunction()));
5852 } else {
5853 Key = hash_combine(hash_value(Call), Key);
5854 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
5855 }
5856 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
5857 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
5858 hash_value(Op.Tag), SubKey);
5859 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
5860 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5861 SubKey = hash_value(Gep->getPointerOperand());
5862 else
5863 SubKey = hash_value(Gep);
5864 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
5865 !isa<ConstantInt>(I->getOperand(1))) {
5866 // Do not try to vectorize instructions with potentially high cost.
5867 SubKey = hash_value(I);
5868 } else {
5869 SubKey = hash_value(I->getOpcode());
5870 }
5871 Key = hash_combine(hash_value(I->getParent()), Key);
5872 }
5873 return std::make_pair(Key, SubKey);
5874}
5875
5876/// Checks if the specified instruction \p I is an alternate operation for
5877/// the given \p MainOp and \p AltOp instructions.
5878static bool isAlternateInstruction(const Instruction *I,
5879 const Instruction *MainOp,
5880 const Instruction *AltOp,
5881 const TargetLibraryInfo &TLI);
5882
5883bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
5884 ArrayRef<Value *> VL) const {
5885 unsigned Opcode0 = S.getOpcode();
5886 unsigned Opcode1 = S.getAltOpcode();
5887 // The opcode mask selects between the two opcodes.
5888 SmallBitVector OpcodeMask(VL.size(), false);
5889 for (unsigned Lane : seq<unsigned>(0, VL.size()))
5890 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
5891 OpcodeMask.set(Lane);
5892 // If this pattern is supported by the target then consider it profitable.
5893 if (TTI->isLegalAltInstr(FixedVectorType::get(S.MainOp->getType(), VL.size()),
5894 Opcode0, Opcode1, OpcodeMask))
5895 return true;
5896 SmallVector<ValueList> Operands;
5897 for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
5898 Operands.emplace_back();
5899 // Prepare the operand vector.
5900 for (Value *V : VL)
5901 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
5902 }
5903 if (Operands.size() == 2) {
5904 // Try to find the best operand candidates.
5905 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
5906 SmallVector<std::pair<Value *, Value *>> Candidates(3);
5907 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
5908 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
5909 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
5910 std::optional<int> Res = findBestRootPair(Candidates);
5911 switch (Res.value_or(0)) {
5912 case 0:
5913 break;
5914 case 1:
5915 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
5916 break;
5917 case 2:
5918 std::swap(Operands[0][I], Operands[1][I]);
5919 break;
5920 default:
5921 llvm_unreachable("Unexpected index.");
5922 }
5923 }
5924 }
5925 DenseSet<unsigned> UniqueOpcodes;
5926 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
5927 unsigned NonInstCnt = 0;
5928 // Estimate number of instructions, required for the vectorized node and for
5929 // the buildvector node.
5930 unsigned UndefCnt = 0;
5931 // Count the number of extra shuffles, required for vector nodes.
5932 unsigned ExtraShuffleInsts = 0;
5933 // Check that operands do not contain the same values and create either a
5934 // perfect diamond match or a shuffled match.
5935 if (Operands.size() == 2) {
5936 // Do not count same operands twice.
5937 if (Operands.front() == Operands.back()) {
5938 Operands.erase(Operands.begin());
5939 } else if (!allConstant(Operands.front()) &&
5940 all_of(Operands.front(), [&](Value *V) {
5941 return is_contained(Operands.back(), V);
5942 })) {
5943 Operands.erase(Operands.begin());
5944 ++ExtraShuffleInsts;
5945 }
5946 }
5947 const Loop *L = LI->getLoopFor(S.MainOp->getParent());
5948 // Vectorize the node if:
5949 // 1. At least a single operand is constant or splat.
5950 // 2. Operands have many loop invariants (the instructions are not loop
5951 // invariants).
5952 // 3. At least a single unique operand is supposed to be vectorized.
5953 return none_of(Operands,
5954 [&](ArrayRef<Value *> Op) {
5955 if (allConstant(Op) ||
5956 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
5957 getSameOpcode(Op, *TLI).MainOp))
5958 return false;
5959 DenseMap<Value *, unsigned> Uniques;
5960 for (Value *V : Op) {
5961 if (isa<Constant, ExtractElementInst>(V) ||
5962 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
5963 if (isa<UndefValue>(V))
5964 ++UndefCnt;
5965 continue;
5966 }
5967 auto Res = Uniques.try_emplace(V, 0);
5968 // Found first duplicate - need to add shuffle.
5969 if (!Res.second && Res.first->second == 1)
5970 ++ExtraShuffleInsts;
5971 ++Res.first->getSecond();
5972 if (auto *I = dyn_cast<Instruction>(V))
5973 UniqueOpcodes.insert(I->getOpcode());
5974 else if (Res.second)
5975 ++NonInstCnt;
5976 }
5977 return none_of(Uniques, [&](const auto &P) {
5978 return P.first->hasNUsesOrMore(P.second + 1) &&
5979 none_of(P.first->users(), [&](User *U) {
5980 return getTreeEntry(U) || Uniques.contains(U);
5981 });
5982 });
5983 }) ||
5984 // Do not vectorize node, if estimated number of vector instructions is
5985 // more than estimated number of buildvector instructions. Number of
5986 // vector operands is number of vector instructions + number of vector
5987 // instructions for operands (buildvectors). Number of buildvector
5988 // instructions is just number_of_operands * number_of_scalars.
5989 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
5990 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
5991 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
5992}
5993
5994BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
5995 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
5996 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
5997 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
5998
5999 unsigned ShuffleOrOp =
6000 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
6001 auto *VL0 = cast<Instruction>(S.OpValue);
6002 switch (ShuffleOrOp) {
6003 case Instruction::PHI: {
6004 // Check for terminator values (e.g. invoke).
6005 for (Value *V : VL)
6006 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
6007 Instruction *Term = dyn_cast<Instruction>(Incoming);
6008 if (Term && Term->isTerminator()) {
6010 << "SLP: Need to swizzle PHINodes (terminator use).\n");
6011 return TreeEntry::NeedToGather;
6012 }
6013 }
6014
6015 return TreeEntry::Vectorize;
6016 }
6017 case Instruction::ExtractValue:
6018 case Instruction::ExtractElement: {
6019 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6020 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6021 if (!isPowerOf2_32(VL.size()))
6022 return TreeEntry::NeedToGather;
6023 if (Reuse || !CurrentOrder.empty())
6024 return TreeEntry::Vectorize;
6025 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6026 return TreeEntry::NeedToGather;
6027 }
6028 case Instruction::InsertElement: {
6029 // Check that we have a buildvector and not a shuffle of 2 or more
6030 // different vectors.
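// For example, a vectorizable buildvector chain looks like:
//   %v0 = insertelement <4 x float> poison, float %a, i64 0
//   %v1 = insertelement <4 x float> %v0, float %b, i64 1
//   %v2 = insertelement <4 x float> %v1, float %c, i64 2
//   %v3 = insertelement <4 x float> %v2, float %d, i64 3
// Here each insert (except the last) is itself the source operand of the next
// one, so at most one value in VL is not in SourceVectors. Inserts pulling
// from two unrelated source vectors fail the check below and are gathered.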
6031 ValueSet SourceVectors;
6032 for (Value *V : VL) {
6033 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
6034 assert(getInsertIndex(V) != std::nullopt &&
6035 "Non-constant or undef index?");
6036 }
6037
6038 if (count_if(VL, [&SourceVectors](Value *V) {
6039 return !SourceVectors.contains(V);
6040 }) >= 2) {
6041 // Found 2nd source vector - cancel.
6042 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6043 "different source vectors.\n");
6044 return TreeEntry::NeedToGather;
6045 }
6046
6047 return TreeEntry::Vectorize;
6048 }
6049 case Instruction::Load: {
6050 // Check that a vectorized load would load the same memory as a scalar
6051 // load. For example, we don't want to vectorize loads that are smaller
6052 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6053 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6054 // from such a struct, we read/write packed bits disagreeing with the
6055 // unvectorized version.
6056 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
6057 case LoadsState::Vectorize:
6058 return TreeEntry::Vectorize;
6059 case LoadsState::ScatterVectorize:
6060 return TreeEntry::ScatterVectorize;
6061 case LoadsState::StridedVectorize:
6062 return TreeEntry::StridedVectorize;
6063 case LoadsState::Gather:
6064#ifndef NDEBUG
6065 Type *ScalarTy = VL0->getType();
6066 if (DL->getTypeSizeInBits(ScalarTy) !=
6067 DL->getTypeAllocSizeInBits(ScalarTy))
6068 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6069 else if (any_of(VL,
6070 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6071 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6072 else
6073 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6074#endif // NDEBUG
6075 return TreeEntry::NeedToGather;
6076 }
6077 llvm_unreachable("Unexpected state of loads");
6078 }
6079 case Instruction::ZExt:
6080 case Instruction::SExt:
6081 case Instruction::FPToUI:
6082 case Instruction::FPToSI:
6083 case Instruction::FPExt:
6084 case Instruction::PtrToInt:
6085 case Instruction::IntToPtr:
6086 case Instruction::SIToFP:
6087 case Instruction::UIToFP:
6088 case Instruction::Trunc:
6089 case Instruction::FPTrunc:
6090 case Instruction::BitCast: {
6091 Type *SrcTy = VL0->getOperand(0)->getType();
6092 for (Value *V : VL) {
6093 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6094 if (Ty != SrcTy || !isValidElementType(Ty)) {
6095 LLVM_DEBUG(
6096 dbgs() << "SLP: Gathering casts with different src types.\n");
6097 return TreeEntry::NeedToGather;
6098 }
6099 }
6100 return TreeEntry::Vectorize;
6101 }
6102 case Instruction::ICmp:
6103 case Instruction::FCmp: {
6104 // Check that all of the compares have the same predicate.
6105 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6106 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
6107 Type *ComparedTy = VL0->getOperand(0)->getType();
6108 for (Value *V : VL) {
6109 CmpInst *Cmp = cast<CmpInst>(V);
6110 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6111 Cmp->getOperand(0)->getType() != ComparedTy) {
6112 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6113 return TreeEntry::NeedToGather;
6114 }
6115 }
6116 return TreeEntry::Vectorize;
6117 }
6118 case Instruction::Select:
6119 case Instruction::FNeg:
6120 case Instruction::Add:
6121 case Instruction::FAdd:
6122 case Instruction::Sub:
6123 case Instruction::FSub:
6124 case Instruction::Mul:
6125 case Instruction::FMul:
6126 case Instruction::UDiv:
6127 case Instruction::SDiv:
6128 case Instruction::FDiv:
6129 case Instruction::URem:
6130 case Instruction::SRem:
6131 case Instruction::FRem:
6132 case Instruction::Shl:
6133 case Instruction::LShr:
6134 case Instruction::AShr:
6135 case Instruction::And:
6136 case Instruction::Or:
6137 case Instruction::Xor:
6138 return TreeEntry::Vectorize;
6139 case Instruction::GetElementPtr: {
6140 // We don't combine GEPs with complicated (nested) indexing.
6141 for (Value *V : VL) {
6142 auto *I = dyn_cast<GetElementPtrInst>(V);
6143 if (!I)
6144 continue;
6145 if (I->getNumOperands() != 2) {
6146 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6147 return TreeEntry::NeedToGather;
6148 }
6149 }
6150
6151 // We can't combine several GEPs into one vector if they operate on
6152 // different types.
6153 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6154 for (Value *V : VL) {
6155 auto *GEP = dyn_cast<GEPOperator>(V);
6156 if (!GEP)
6157 continue;
6158 Type *CurTy = GEP->getSourceElementType();
6159 if (Ty0 != CurTy) {
6160 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6161 return TreeEntry::NeedToGather;
6162 }
6163 }
6164
6165 // We don't combine GEPs with non-constant indexes.
6166 Type *Ty1 = VL0->getOperand(1)->getType();
6167 for (Value *V : VL) {
6168 auto *I = dyn_cast<GetElementPtrInst>(V);
6169 if (!I)
6170 continue;
6171 auto *Op = I->getOperand(1);
6172 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6173 (Op->getType() != Ty1 &&
6174 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6175 Op->getType()->getScalarSizeInBits() >
6176 DL->getIndexSizeInBits(
6177 V->getType()->getPointerAddressSpace())))) {
6178 LLVM_DEBUG(
6179 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6180 return TreeEntry::NeedToGather;
6181 }
6182 }
6183
6184 return TreeEntry::Vectorize;
6185 }
6186 case Instruction::Store: {
6187 // Check if the stores are consecutive or if we need to swizzle them.
6188 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6189 // Avoid types that are padded when being allocated as scalars, while
6190 // being packed together in a vector (such as i1).
6191 if (DL->getTypeSizeInBits(ScalarTy) !=
6192 DL->getTypeAllocSizeInBits(ScalarTy)) {
6193 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6194 return TreeEntry::NeedToGather;
6195 }
6196 // Make sure all stores in the bundle are simple - we can't vectorize
6197 // atomic or volatile stores.
6198 for (Value *V : VL) {
6199 auto *SI = cast<StoreInst>(V);
6200 if (!SI->isSimple()) {
6201 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6202 return TreeEntry::NeedToGather;
6203 }
6204 PointerOps.push_back(SI->getPointerOperand());
6205 }
6206
6207 // Check the order of pointer operands.
6208 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
6209 Value *Ptr0;
6210 Value *PtrN;
6211 if (CurrentOrder.empty()) {
6212 Ptr0 = PointerOps.front();
6213 PtrN = PointerOps.back();
6214 } else {
6215 Ptr0 = PointerOps[CurrentOrder.front()];
6216 PtrN = PointerOps[CurrentOrder.back()];
6217 }
6218 std::optional<int> Dist =
6219 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6220 // Check that the sorted pointer operands are consecutive.
6221 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6222 return TreeEntry::Vectorize;
6223 }
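// Worked example: for four i32 stores whose sorted pointers are %p, %p+4,
// %p+8 and %p+12 bytes, getPointersDiff returns the distance in elements
// between the first and last pointer, i.e. 3, which equals VL.size() - 1, so
// the bundle is treated as a single consecutive vector store.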
6224
6225 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6226 return TreeEntry::NeedToGather;
6227 }
6228 case Instruction::Call: {
6229 // Check if the calls are all to the same vectorizable intrinsic or
6230 // library function.
6231 CallInst *CI = cast<CallInst>(VL0);
6232 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6233
6234 VFShape Shape = VFShape::get(
6235 CI->getFunctionType(),
6236 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
6237 false /*HasGlobalPred*/);
6238 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6239
6240 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6241 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6242 return TreeEntry::NeedToGather;
6243 }
6244 Function *F = CI->getCalledFunction();
6245 unsigned NumArgs = CI->arg_size();
6246 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6247 for (unsigned J = 0; J != NumArgs; ++J)
6248 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
6249 ScalarArgs[J] = CI->getArgOperand(J);
6250 for (Value *V : VL) {
6251 CallInst *CI2 = dyn_cast<CallInst>(V);
6252 if (!CI2 || CI2->getCalledFunction() != F ||
6253 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
6254 (VecFunc &&
6255 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6257 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6258 << "\n");
6259 return TreeEntry::NeedToGather;
6260 }
6261 // Some intrinsics have scalar arguments and should be same in order for
6262 // them to be vectorized.
6263 for (unsigned J = 0; J != NumArgs; ++J) {
6264 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
6265 Value *A1J = CI2->getArgOperand(J);
6266 if (ScalarArgs[J] != A1J) {
6268 << "SLP: mismatched arguments in call:" << *CI
6269 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6270 return TreeEntry::NeedToGather;
6271 }
6272 }
6273 }
6274 // Verify that the bundle operands are identical between the two calls.
6275 if (CI->hasOperandBundles() &&
6276 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6277 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6278 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6279 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6280 << "!=" << *V << '\n');
6281 return TreeEntry::NeedToGather;
6282 }
6283 }
6284
6285 return TreeEntry::Vectorize;
6286 }
6287 case Instruction::ShuffleVector: {
6288 // If this is not an alternate sequence of opcode like add-sub
6289 // then do not vectorize this instruction.
6290 if (!S.isAltShuffle()) {
6291 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6292 return TreeEntry::NeedToGather;
6293 }
6294 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6295 LLVM_DEBUG(
6296 dbgs()
6297 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6298 "the whole alt sequence is not profitable.\n");
6299 return TreeEntry::NeedToGather;
6300 }
6301
6302 return TreeEntry::Vectorize;
6303 }
6304 default:
6305 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6306 return TreeEntry::NeedToGather;
6307 }
6308}
6309
6310void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6311 const EdgeInfo &UserTreeIdx) {
6312 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6313
6314 SmallVector<int> ReuseShuffleIndicies;
6315 SmallVector<Value *> UniqueValues;
6316 SmallVector<Value *> NonUniqueValueVL;
6317 auto TryToFindDuplicates = [&](const InstructionsState &S,
6318 bool DoNotFail = false) {
6319 // Check that every instruction appears once in this bundle.
6320 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6321 for (Value *V : VL) {
6322 if (isConstant(V)) {
6323 ReuseShuffleIndicies.emplace_back(
6324 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6325 UniqueValues.emplace_back(V);
6326 continue;
6327 }
6328 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6329 ReuseShuffleIndicies.emplace_back(Res.first->second);
6330 if (Res.second)
6331 UniqueValues.emplace_back(V);
6332 }
6333 size_t NumUniqueScalarValues = UniqueValues.size();
6334 if (NumUniqueScalarValues == VL.size()) {
6335 ReuseShuffleIndicies.clear();
6336 } else {
6337 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6338 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6339 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6340 "for nodes with padding.\n");
6341 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6342 return false;
6343 }
6344 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6345 if (NumUniqueScalarValues <= 1 ||
6346 (UniquePositions.size() == 1 && all_of(UniqueValues,
6347 [](Value *V) {
6348 return isa<UndefValue>(V) ||
6349 !isConstant(V);
6350 })) ||
6351 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6352 if (DoNotFail && UniquePositions.size() > 1 &&
6353 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6354 all_of(UniqueValues, [=](Value *V) {
6355 return isa<ExtractElementInst>(V) ||
6356 areAllUsersVectorized(cast<Instruction>(V),
6357 UserIgnoreList);
6358 })) {
6359 unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6360 if (PWSz == VL.size()) {
6361 ReuseShuffleIndicies.clear();
6362 } else {
6363 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6364 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6365 UniqueValues.back());
6366 VL = NonUniqueValueVL;
6367 }
6368 return true;
6369 }
6370 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6371 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6372 return false;
6373 }
6374 VL = UniqueValues;
6375 }
6376 return true;
6377 };
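// Illustrative example: for a bundle {a, b, a, b} the lambda above produces
// UniqueValues == {a, b} and ReuseShuffleIndicies == {0, 1, 0, 1}, i.e. the
// shuffle mask that rebuilds the original lane order from the deduplicated
// scalars; VL is then narrowed to the unique values.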
6378
6379 InstructionsState S = getSameOpcode(VL, *TLI);
6380
6381 // Don't vectorize ephemeral values.
6382 if (!EphValues.empty()) {
6383 for (Value *V : VL) {
6384 if (EphValues.count(V)) {
6385 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6386 << ") is ephemeral.\n");
6387 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6388 return;
6389 }
6390 }
6391 }
6392
6393 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6394 // a load), in which case peek through to include it in the tree, without
6395 // ballooning over-budget.
6396 if (Depth >= RecursionMaxDepth &&
6397 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6398 VL.size() >= 4 &&
6399 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6400 return match(I,
6401 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
6402 cast<Instruction>(I)->getOpcode() ==
6403 cast<Instruction>(S.MainOp)->getOpcode();
6404 })))) {
6405 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6406 if (TryToFindDuplicates(S))
6407 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6408 ReuseShuffleIndicies);
6409 return;
6410 }
6411
6412 // Don't handle scalable vectors
6413 if (S.getOpcode() == Instruction::ExtractElement &&
6414 isa<ScalableVectorType>(
6415 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6416 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6417 if (TryToFindDuplicates(S))
6418 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6419 ReuseShuffleIndicies);
6420 return;
6421 }
6422
6423 // Don't handle vectors.
6424 if (S.OpValue->getType()->isVectorTy() &&
6425 !isa<InsertElementInst>(S.OpValue)) {
6426 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6427 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6428 return;
6429 }
6430
6431 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6432 if (SI->getValueOperand()->getType()->isVectorTy()) {
6433 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6434 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6435 return;
6436 }
6437
6438 // If all of the operands are identical or constant we have a simple solution.
6439 // If we deal with insert/extract instructions, they all must have constant
6440 // indices, otherwise we should gather them, not try to vectorize.
6441 // If alternate op node with 2 elements with gathered operands - do not
6442 // vectorize.
6443 auto &&NotProfitableForVectorization = [&S, this,
6444 Depth](ArrayRef<Value *> VL) {
6445 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6446 return false;
6447 if (VectorizableTree.size() < MinTreeSize)
6448 return false;
6449 if (Depth >= RecursionMaxDepth - 1)
6450 return true;
6451 // Check if all operands are extracts, part of vector node or can build a
6452 // regular vectorize node.
6453 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6454 for (Value *V : VL) {
6455 auto *I = cast<Instruction>(V);
6456 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6457 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6458 }));
6459 }
6460 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6461 if ((IsCommutative &&
6462 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6463 (!IsCommutative &&
6464 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6465 return true;
6466 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6467 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6468 auto *I1 = cast<Instruction>(VL.front());
6469 auto *I2 = cast<Instruction>(VL.back());
6470 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6471 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6472 I2->getOperand(Op));
6473 if (static_cast<unsigned>(count_if(
6474 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6475 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6476 })) >= S.MainOp->getNumOperands() / 2)
6477 return false;
6478 if (S.MainOp->getNumOperands() > 2)
6479 return true;
6480 if (IsCommutative) {
6481 // Check permuted operands.
6482 Candidates.clear();
6483 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6484 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6485 I2->getOperand((Op + 1) % E));
6486 if (any_of(
6487 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6488 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6489 }))
6490 return false;
6491 }
6492 return true;
6493 };
6494 SmallVector<unsigned> SortedIndices;
6495 BasicBlock *BB = nullptr;
6496 bool IsScatterVectorizeUserTE =
6497 UserTreeIdx.UserTE &&
6498 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6499 bool AreAllSameInsts =
6500 (S.getOpcode() && allSameBlock(VL)) ||
6501 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6502 VL.size() > 2 &&
6503 all_of(VL,
6504 [&BB](Value *V) {
6505 auto *I = dyn_cast<GetElementPtrInst>(V);
6506 if (!I)
6507 return doesNotNeedToBeScheduled(V);
6508 if (!BB)
6509 BB = I->getParent();
6510 return BB == I->getParent() && I->getNumOperands() == 2;
6511 }) &&
6512 BB &&
6513 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6514 SortedIndices));
6515 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6516 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6517 S.OpValue) &&
6518 !all_of(VL, isVectorLikeInstWithConstOps)) ||
6519 NotProfitableForVectorization(VL)) {
6520 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6521 if (TryToFindDuplicates(S))
6522 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6523 ReuseShuffleIndicies);
6524 return;
6525 }
6526
6527 // We now know that this is a vector of instructions of the same type from
6528 // the same block.
6529
6530 // Check if this is a duplicate of another entry.
6531 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6532 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6533 if (!E->isSame(VL)) {
6534 auto It = MultiNodeScalars.find(S.OpValue);
6535 if (It != MultiNodeScalars.end()) {
6536 auto *TEIt = find_if(It->getSecond(),
6537 [&](TreeEntry *ME) { return ME->isSame(VL); });
6538 if (TEIt != It->getSecond().end())
6539 E = *TEIt;
6540 else
6541 E = nullptr;
6542 } else {
6543 E = nullptr;
6544 }
6545 }
6546 if (!E) {
6547 if (!doesNotNeedToBeScheduled(S.OpValue)) {
6548 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6549 if (TryToFindDuplicates(S))
6550 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6551 ReuseShuffleIndicies);
6552 return;
6553 }
6554 } else {
6555 // Record the reuse of the tree node. FIXME, currently this is only used
6556 // to properly draw the graph rather than for the actual vectorization.
6557 E->UserTreeIndices.push_back(UserTreeIdx);
6558 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6559 << ".\n");
6560 return;
6561 }
6562 }
6563
6564 // Check that none of the instructions in the bundle are already in the tree.
6565 for (Value *V : VL) {
6566 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6567 doesNotNeedToBeScheduled(V))
6568 continue;
6569 if (getTreeEntry(V)) {
6570 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6571 << ") is already in tree.\n");
6572 if (TryToFindDuplicates(S))
6573 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6574 ReuseShuffleIndicies);
6575 return;
6576 }
6577 }
6578
6579 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6580 if (UserIgnoreList && !UserIgnoreList->empty()) {
6581 for (Value *V : VL) {
6582 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6583 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6584 if (TryToFindDuplicates(S))
6585 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6586 ReuseShuffleIndicies);
6587 return;
6588 }
6589 }
6590 }
6591
6592 // Special processing for sorted pointers for ScatterVectorize node with
6593 // constant indices only.
6594 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6595 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6596 !(S.getOpcode() && allSameBlock(VL))) {
6597 assert(S.OpValue->getType()->isPointerTy() &&
6598 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6599 "Expected pointers only.");
6600 // Reset S to make it GetElementPtr kind of node.
6601 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
6602 assert(It != VL.end() && "Expected at least one GEP.");
6603 S = getSameOpcode(*It, *TLI);
6604 }
6605
6606 // Check that all of the users of the scalars that we want to vectorize are
6607 // schedulable.
6608 auto *VL0 = cast<Instruction>(S.OpValue);
6609 BB = VL0->getParent();
6610
6611 if (!DT->isReachableFromEntry(BB)) {
6612 // Don't go into unreachable blocks. They may contain instructions with
6613 // dependency cycles which confuse the final scheduling.
6614 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6615 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6616 return;
6617 }
6618
6619 // Don't go into catchswitch blocks, which can happen with PHIs.
6620 // Such blocks can only have PHIs and the catchswitch. There is no
6621 // place to insert a shuffle if we need to, so just avoid that issue.
6622 if (isa<CatchSwitchInst>(BB->getTerminator())) {
6623 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
6624 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6625 return;
6626 }
6627
6628 // Check that every instruction appears once in this bundle.
6629 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
6630 return;
6631
6632 // Perform specific checks for each particular instruction kind.
6633 OrdersType CurrentOrder;
6634 SmallVector<Value *> PointerOps;
6635 TreeEntry::EntryState State = getScalarsVectorizationState(
6636 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6637 if (State == TreeEntry::NeedToGather) {
6638 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6639 ReuseShuffleIndicies);
6640 return;
6641 }
6642
6643 auto &BSRef = BlocksSchedules[BB];
6644 if (!BSRef)
6645 BSRef = std::make_unique<BlockScheduling>(BB);
6646
6647 BlockScheduling &BS = *BSRef;
6648
6649 std::optional<ScheduleData *> Bundle =
6650 BS.tryScheduleBundle(UniqueValues, this, S);
6651#ifdef EXPENSIVE_CHECKS
6652 // Make sure we didn't break any internal invariants
6653 BS.verify();
6654#endif
6655 if (!Bundle) {
6656 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
6657 assert((!BS.getScheduleData(VL0) ||
6658 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6659 "tryScheduleBundle should cancelScheduling on failure");
6660 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6661 ReuseShuffleIndicies);
6662 NonScheduledFirst.insert(VL.front());
6663 return;
6664 }
6665 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
6666
6667 unsigned ShuffleOrOp = S.isAltShuffle() ?
6668 (unsigned) Instruction::ShuffleVector : S.getOpcode();
6669 switch (ShuffleOrOp) {
6670 case Instruction::PHI: {
6671 auto *PH = cast<PHINode>(VL0);
6672
6673 TreeEntry *TE =
6674 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
6675 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
6676
6677 // Keeps the reordered operands to avoid code duplication.
6678 SmallVector<ValueList, 2> OperandsVec;
6679 for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
6680 if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
6681 ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));
6682 TE->setOperand(I, Operands);
6683 OperandsVec.push_back(Operands);
6684 continue;
6685 }
6686 ValueList Operands;
6687 // Prepare the operand vector.
6688 for (Value *V : VL)
6689 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
6690 PH->getIncomingBlock(I)));
6691 TE->setOperand(I, Operands);
6692 OperandsVec.push_back(Operands);
6693 }
6694 for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
6695 buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
6696 return;
6697 }
6698 case Instruction::ExtractValue:
6699 case Instruction::ExtractElement: {
6700 if (CurrentOrder.empty()) {
6701 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
6702 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6703 ReuseShuffleIndicies);
6704 // This is a special case, as it does not gather, but at the same time
6705 // we are not extending buildTree_rec() towards the operands.
6706 ValueList Op0;
6707 Op0.assign(VL.size(), VL0->getOperand(0));
6708 VectorizableTree.back()->setOperand(0, Op0);
6709 return;
6710 }
6711 LLVM_DEBUG({
6712 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
6713 "with order";
6714 for (unsigned Idx : CurrentOrder)
6715 dbgs() << " " << Idx;
6716 dbgs() << "\n";
6717 });
6718 fixupOrderingIndices(CurrentOrder);
6719 // Insert new order with initial value 0, if it does not exist,
6720 // otherwise return the iterator to the existing one.
6721 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6722 ReuseShuffleIndicies, CurrentOrder);
6723 // This is a special case, as it does not gather, but at the same time
6724 // we are not extending buildTree_rec() towards the operands.
6725 ValueList Op0;
6726 Op0.assign(VL.size(), VL0->getOperand(0));
6727 VectorizableTree.back()->setOperand(0, Op0);
6728 return;
6729 }
6730 case Instruction::InsertElement: {
6731 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
6732
6733 auto OrdCompare = [](const std::pair<int, int> &P1,
6734 const std::pair<int, int> &P2) {
6735 return P1.first > P2.first;
6736 };
6737 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
6738 decltype(OrdCompare)>
6739 Indices(OrdCompare);
6740 for (int I = 0, E = VL.size(); I < E; ++I) {
6741 unsigned Idx = *getInsertIndex(VL[I]);
6742 Indices.emplace(Idx, I);
6743 }
6744 OrdersType CurrentOrder(VL.size(), VL.size());
6745 bool IsIdentity = true;
6746 for (int I = 0, E = VL.size(); I < E; ++I) {
6747 CurrentOrder[Indices.top().second] = I;
6748 IsIdentity &= Indices.top().second == I;
6749 Indices.pop();
6750 }
6751 if (IsIdentity)
6752 CurrentOrder.clear();
6753 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6754 std::nullopt, CurrentOrder);
6755 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
6756
6757 constexpr int NumOps = 2;
6758 ValueList VectorOperands[NumOps];
6759 for (int I = 0; I < NumOps; ++I) {
6760 for (Value *V : VL)
6761 VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
6762
6763 TE->setOperand(I, VectorOperands[I]);
6764 }
6765 buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
6766 return;
6767 }
6768 case Instruction::Load: {
6769 // Check that a vectorized load would load the same memory as a scalar
6770 // load. For example, we don't want to vectorize loads that are smaller
6771 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6772 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6773 // from such a struct, we read/write packed bits disagreeing with the
6774 // unvectorized version.
6775 TreeEntry *TE = nullptr;
6776 fixupOrderingIndices(CurrentOrder);
6777 switch (State) {
6778 case TreeEntry::Vectorize:
6779 if (CurrentOrder.empty()) {
6780 // Original loads are consecutive and do not require reordering.
6781 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6782 ReuseShuffleIndicies);
6783 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
6784 } else {
6785 // Need to reorder.
6786 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6787 ReuseShuffleIndicies, CurrentOrder);
6788 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
6789 }
6790 TE->setOperandsInOrder();
6791 break;
6792 case TreeEntry::StridedVectorize:
6793 // Vectorizing non-consecutive loads as strided loads.
6794 if (CurrentOrder.empty()) {
6795 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6796 UserTreeIdx, ReuseShuffleIndicies);
6797 } else {
6798 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6799 UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
6800 }
6801 TE->setOperandsInOrder();
6802 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
6803 break;
6804 case TreeEntry::ScatterVectorize:
6805 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
6806 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6807 UserTreeIdx, ReuseShuffleIndicies);
6808 TE->setOperandsInOrder();
6809 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
6810 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
6811 break;
6812 case TreeEntry::NeedToGather:
6813 llvm_unreachable("Unexpected loads state.");
6814 }
6815 return;
6816 }
6817 case Instruction::ZExt:
6818 case Instruction::SExt:
6819 case Instruction::FPToUI:
6820 case Instruction::FPToSI:
6821 case Instruction::FPExt:
6822 case Instruction::PtrToInt:
6823 case Instruction::IntToPtr:
6824 case Instruction::SIToFP:
6825 case Instruction::UIToFP:
6826 case Instruction::Trunc:
6827 case Instruction::FPTrunc:
6828 case Instruction::BitCast: {
6829 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
6830 std::make_pair(std::numeric_limits<unsigned>::min(),
6831 std::numeric_limits<unsigned>::max()));
6832 if (ShuffleOrOp == Instruction::ZExt ||
6833 ShuffleOrOp == Instruction::SExt) {
6834 CastMaxMinBWSizes = std::make_pair(
6835 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
6836 PrevMaxBW),
6837 std::min<unsigned>(
6838 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
6839 PrevMinBW));
6840 } else if (ShuffleOrOp == Instruction::Trunc) {
6841 CastMaxMinBWSizes = std::make_pair(
6842 std::max<unsigned>(
6843 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
6844 PrevMaxBW),
6845 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
6846 PrevMinBW));
6847 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
6848 } else if (ShuffleOrOp == Instruction::SIToFP ||
6849 ShuffleOrOp == Instruction::UIToFP) {
6850 unsigned NumSignBits =
6851 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
6852 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
6853 APInt Mask = DB->getDemandedBits(OpI);
6854 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
6855 }
6856 if (NumSignBits * 2 >=
6857 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6858 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
6859 }
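// Worked example: for a bundle of "zext i8 %x to i32" instructions the pair
// becomes (max(32, PrevMaxBW), min(8, PrevMinBW)); for "trunc i32 %x to i8"
// the source and destination widths swap roles and the node is also recorded
// in ExtraBitWidthNodes for later bitwidth minimization.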
6860 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6861 ReuseShuffleIndicies);
6862 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
6863
6864 TE->setOperandsInOrder();
6865 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6866 ValueList Operands;
6867 // Prepare the operand vector.
6868 for (Value *V : VL)
6869 Operands.push_back(cast<Instruction>(V)->getOperand(I));
6870
6871 buildTree_rec(Operands, Depth + 1, {TE, I});
6872 }
6873 return;
6874 }
6875 case Instruction::ICmp:
6876 case Instruction::FCmp: {
6877 // Check that all of the compares have the same predicate.
6878 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6879 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6880 ReuseShuffleIndicies);
6881 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
6882
6883 ValueList Left, Right;
6884 if (cast<CmpInst>(VL0)->isCommutative()) {
6885 // Commutative predicate - collect + sort operands of the instructions
6886 // so that each side is more likely to have the same opcode.
6888 "Commutative Predicate mismatch");
6889 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
6890 } else {
6891 // Collect operands - commute if it uses the swapped predicate.
6892 for (Value *V : VL) {
6893 auto *Cmp = cast<CmpInst>(V);
6894 Value *LHS = Cmp->getOperand(0);
6895 Value *RHS = Cmp->getOperand(1);
6896 if (Cmp->getPredicate() != P0)
6897 std::swap(LHS, RHS);
6898 Left.push_back(LHS);
6899 Right.push_back(RHS);
6900 }
6901 }
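// For example, if P0 is "slt", a lane holding "icmp sgt %a, %b" is stored as
// the operand pair (%b, %a), since "icmp sgt %a, %b" is equivalent to
// "icmp slt %b, %a"; all lanes then share the single predicate P0.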
6902 TE->setOperand(0, Left);
6903 TE->setOperand(1, Right);
6904 buildTree_rec(Left, Depth + 1, {TE, 0});
6905 buildTree_rec(Right, Depth + 1, {TE, 1});
6906 if (ShuffleOrOp == Instruction::ICmp) {
6907 unsigned NumSignBits0 =
6908 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
6909 if (NumSignBits0 * 2 >=
6910 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6911 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
6912 unsigned NumSignBits1 =
6913 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
6914 if (NumSignBits1 * 2 >=
6915 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
6916 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
6917 }
6918 return;
6919 }
6920 case Instruction::Select:
6921 case Instruction::FNeg:
6922 case Instruction::Add:
6923 case Instruction::FAdd:
6924 case Instruction::Sub:
6925 case Instruction::FSub:
6926 case Instruction::Mul:
6927 case Instruction::FMul:
6928 case Instruction::UDiv:
6929 case Instruction::SDiv:
6930 case Instruction::FDiv:
6931 case Instruction::URem:
6932 case Instruction::SRem:
6933 case Instruction::FRem:
6934 case Instruction::Shl:
6935 case Instruction::LShr:
6936 case Instruction::AShr:
6937 case Instruction::And:
6938 case Instruction::Or:
6939 case Instruction::Xor: {
6940 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6941 ReuseShuffleIndicies);
6942 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
6943
6944 // Sort operands of the instructions so that each side is more likely to
6945 // have the same opcode.
6946 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
6947 ValueList Left, Right;
6948 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
6949 TE->setOperand(0, Left);
6950 TE->setOperand(1, Right);
6951 buildTree_rec(Left, Depth + 1, {TE, 0});
6952 buildTree_rec(Right, Depth + 1, {TE, 1});
6953 return;
6954 }
6955
6956 TE->setOperandsInOrder();
6957 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6958 ValueList Operands;
6959 // Prepare the operand vector.
6960 for (Value *V : VL)
6961 Operands.push_back(cast<Instruction>(V)->getOperand(I));
6962
6963 buildTree_rec(Operands, Depth + 1, {TE, I});
6964 }
6965 return;
6966 }
6967 case Instruction::GetElementPtr: {
6968 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6969 ReuseShuffleIndicies);
6970 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
6971 SmallVector<ValueList, 2> Operands(2);
6972 // Prepare the operand vector for pointer operands.
6973 for (Value *V : VL) {
6974 auto *GEP = dyn_cast<GetElementPtrInst>(V);
6975 if (!GEP) {
6976 Operands.front().push_back(V);
6977 continue;
6978 }
6979 Operands.front().push_back(GEP->getPointerOperand());
6980 }
6981 TE->setOperand(0, Operands.front());
6982 // Need to cast all indices to the same type before vectorization to
6983 // avoid crash.
6984 // Required to be able to find correct matches between different gather
6985 // nodes and reuse the vectorized values rather than trying to gather them
6986 // again.
6987 int IndexIdx = 1;
6988 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
6989 Type *Ty = all_of(VL,
6990 [VL0Ty, IndexIdx](Value *V) {
6991 auto *GEP = dyn_cast<GetElementPtrInst>(V);
6992 if (!GEP)
6993 return true;
6994 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
6995 })
6996 ? VL0Ty
6997 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
6998 ->getPointerOperandType()
6999 ->getScalarType());
7000 // Prepare the operand vector.
7001 for (Value *V : VL) {
7002 auto *I = dyn_cast<GetElementPtrInst>(V);
7003 if (!I) {
7004 Operands.back().push_back(
7005 ConstantInt::get(Ty, 0, /*isSigned=*/false));
7006 continue;
7007 }
7008 auto *Op = I->getOperand(IndexIdx);
7009 auto *CI = dyn_cast<ConstantInt>(Op);
7010 if (!CI)
7011 Operands.back().push_back(Op);
7012 else
7013 Operands.back().push_back(ConstantFoldIntegerCast(
7014 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7015 }
7016 TE->setOperand(IndexIdx, Operands.back());
7017
7018 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7019 buildTree_rec(Operands[I], Depth + 1, {TE, I});
7020 return;
7021 }
7022 case Instruction::Store: {
7023 // Check if the stores are consecutive or if we need to swizzle them.
7024 ValueList Operands(VL.size());
7025 auto *OIter = Operands.begin();
7026 for (Value *V : VL) {
7027 auto *SI = cast<StoreInst>(V);
7028 *OIter = SI->getValueOperand();
7029 ++OIter;
7030 }
7031 // Check that the sorted pointer operands are consecutive.
7032 if (CurrentOrder.empty()) {
7033 // Original stores are consecutive and do not require reordering.
7034 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7035 ReuseShuffleIndicies);
7036 TE->setOperandsInOrder();
7037 buildTree_rec(Operands, Depth + 1, {TE, 0});
7038 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7039 } else {
7040 fixupOrderingIndices(CurrentOrder);
7041 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7042 ReuseShuffleIndicies, CurrentOrder);
7043 TE->setOperandsInOrder();
7044 buildTree_rec(Operands, Depth + 1, {TE, 0});
7045 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7046 }
7047 return;
7048 }
7049 case Instruction::Call: {
7050 // Check if the calls are all to the same vectorizable intrinsic or
7051 // library function.
7052 CallInst *CI = cast<CallInst>(VL0);
7053 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7054
7055 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7056 ReuseShuffleIndicies);
7057 // Sort operands of the instructions so that each side is more likely to
7058 // have the same opcode.
7059 if (isCommutative(VL0)) {
7060 ValueList Left, Right;
7061 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7062 TE->setOperand(0, Left);
7063 TE->setOperand(1, Right);
7064 SmallVector<ValueList> Operands;
7065 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7066 Operands.emplace_back();
7067 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7068 continue;
7069 for (Value *V : VL) {
7070 auto *CI2 = cast<CallInst>(V);
7071 Operands.back().push_back(CI2->getArgOperand(I));
7072 }
7073 TE->setOperand(I, Operands.back());
7074 }
7075 buildTree_rec(Left, Depth + 1, {TE, 0});
7076 buildTree_rec(Right, Depth + 1, {TE, 1});
7077 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7078 if (Operands[I - 2].empty())
7079 continue;
7080 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7081 }
7082 return;
7083 }
7084 TE->setOperandsInOrder();
7085 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7086 // For scalar operands no need to create an entry since no need to
7087 // vectorize it.
7088 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7089 continue;
7090 ValueList Operands;
7091 // Prepare the operand vector.
7092 for (Value *V : VL) {
7093 auto *CI2 = cast<CallInst>(V);
7094 Operands.push_back(CI2->getArgOperand(I));
7095 }
7096 buildTree_rec(Operands, Depth + 1, {TE, I});
7097 }
7098 return;
7099 }
7100 case Instruction::ShuffleVector: {
7101 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7102 ReuseShuffleIndicies);
7103 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7104
7105 // Reorder operands if reordering would enable vectorization.
7106 auto *CI = dyn_cast<CmpInst>(VL0);
7107 if (isa<BinaryOperator>(VL0) || CI) {
7108 ValueList Left, Right;
7109 if (!CI || all_of(VL, [](Value *V) {
7110 return cast<CmpInst>(V)->isCommutative();
7111 })) {
7112 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7113 } else {
7114 auto *MainCI = cast<CmpInst>(S.MainOp);
7115 auto *AltCI = cast<CmpInst>(S.AltOp);
7116 CmpInst::Predicate MainP = MainCI->getPredicate();
7117 CmpInst::Predicate AltP = AltCI->getPredicate();
7118 assert(MainP != AltP &&
7119 "Expected different main/alternate predicates.");
7120 // Collect operands - commute if it uses the swapped predicate or
7121 // alternate operation.
7122 for (Value *V : VL) {
7123 auto *Cmp = cast<CmpInst>(V);
7124 Value *LHS = Cmp->getOperand(0);
7125 Value *RHS = Cmp->getOperand(1);
7126
7127 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7128 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7129 std::swap(LHS, RHS);
7130 } else {
7131 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7132 std::swap(LHS, RHS);
7133 }
7134 Left.push_back(LHS);
7135 Right.push_back(RHS);
7136 }
7137 }
7138 TE->setOperand(0, Left);
7139 TE->setOperand(1, Right);
7140 buildTree_rec(Left, Depth + 1, {TE, 0});
7141 buildTree_rec(Right, Depth + 1, {TE, 1});
7142 return;
7143 }
7144
7145 TE->setOperandsInOrder();
7146 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7147 ValueList Operands;
7148 // Prepare the operand vector.
7149 for (Value *V : VL)
7150 Operands.push_back(cast<Instruction>(V)->getOperand(I));
7151
7152 buildTree_rec(Operands, Depth + 1, {TE, I});
7153 }
7154 return;
7155 }
7156 default:
7157 break;
7158 }
7159 llvm_unreachable("Unexpected vectorization of the instructions.");
7160}
7161
7162 unsigned BoUpSLP::canMapToVector(Type *T) const {
7163 unsigned N = 1;
7164 Type *EltTy = T;
7165
7166 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7167 if (auto *ST = dyn_cast<StructType>(EltTy)) {
7168 // Check that struct is homogeneous.
7169 for (const auto *Ty : ST->elements())
7170 if (Ty != *ST->element_begin())
7171 return 0;
7172 N *= ST->getNumElements();
7173 EltTy = *ST->element_begin();
7174 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7175 N *= AT->getNumElements();
7176 EltTy = AT->getElementType();
7177 } else {
7178 auto *VT = cast<FixedVectorType>(EltTy);
7179 N *= VT->getNumElements();
7180 EltTy = VT->getElementType();
7181 }
7182 }
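// At this point, for a homogeneous aggregate such as [4 x float] (or a struct
// of four floats), EltTy is float and N is 4; the checks below additionally
// require that <4 x float> falls within the target's vector register size
// range and has the same store size as the original aggregate type.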
7183
7184 if (!isValidElementType(EltTy))
7185 return 0;
7186 uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
7187 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7188 VTSize != DL->getTypeStoreSizeInBits(T))
7189 return 0;
7190 return N;
7191}
7192
7193bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7194 SmallVectorImpl<unsigned> &CurrentOrder,
7195 bool ResizeAllowed) const {
7196 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7197 assert(It != VL.end() && "Expected at least one extract instruction.");
7198 auto *E0 = cast<Instruction>(*It);
7199 assert(
7200 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7201 "Invalid opcode");
7202 // Check if all of the extracts come from the same vector and from the
7203 // correct offset.
7204 Value *Vec = E0->getOperand(0);
7205
7206 CurrentOrder.clear();
7207
7208 // We have to extract from a vector/aggregate with the same number of elements.
7209 unsigned NElts;
7210 if (E0->getOpcode() == Instruction::ExtractValue) {
7211 NElts = canMapToVector(Vec->getType());
7212 if (!NElts)
7213 return false;
7214 // Check if load can be rewritten as load of vector.
7215 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7216 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7217 return false;
7218 } else {
7219 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7220 }
7221
7222 unsigned E = VL.size();
7223 if (!ResizeAllowed && NElts != E)
7224 return false;
7225 SmallVector<unsigned> Indices(E, PoisonMaskElem);
7226 unsigned MinIdx = NElts, MaxIdx = 0;
7227 for (auto [I, V] : enumerate(VL)) {
7228 auto *Inst = dyn_cast<Instruction>(V);
7229 if (!Inst)
7230 continue;
7231 if (Inst->getOperand(0) != Vec)
7232 return false;
7233 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7234 if (isa<UndefValue>(EE->getIndexOperand()))
7235 continue;
7236 std::optional<unsigned> Idx = getExtractIndex(Inst);
7237 if (!Idx)
7238 return false;
7239 const unsigned ExtIdx = *Idx;
7240 if (ExtIdx >= NElts)
7241 continue;
7242 Indices[I] = ExtIdx;
7243 if (MinIdx > ExtIdx)
7244 MinIdx = ExtIdx;
7245 if (MaxIdx < ExtIdx)
7246 MaxIdx = ExtIdx;
7247 }
7248 if (MaxIdx - MinIdx + 1 > E)
7249 return false;
7250 if (MaxIdx + 1 <= E)
7251 MinIdx = 0;
7252
7253 // Check that all of the indices extract from the correct offset.
7254 bool ShouldKeepOrder = true;
7255 // Assign to all items the initial value E so we can check if the extract
7256 // instruction index was used already.
7257 // Also, later we can check that all the indices are used and we have a
7258 // consecutive access in the extract instructions, by checking that no
7259 // element of CurrentOrder still has value E.
7260 CurrentOrder.assign(E, E);
7261 for (unsigned I = 0; I < E; ++I) {
7262 if (Indices[I] == PoisonMaskElem)
7263 continue;
7264 const unsigned ExtIdx = Indices[I] - MinIdx;
7265 if (CurrentOrder[ExtIdx] != E) {
7266 CurrentOrder.clear();
7267 return false;
7268 }
7269 ShouldKeepOrder &= ExtIdx == I;
7270 CurrentOrder[ExtIdx] = I;
7271 }
7272 if (ShouldKeepOrder)
7273 CurrentOrder.clear();
7274
7275 return ShouldKeepOrder;
7276}
7277
7278bool BoUpSLP::areAllUsersVectorized(
7279 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7280 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7281 all_of(I->users(), [this](User *U) {
7282 return ScalarToTreeEntry.contains(U) ||
7283 isVectorLikeInstWithConstOps(U) ||
7284 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7285 });
7286}
7287
7288static std::pair<InstructionCost, InstructionCost>
7289 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7290 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7291 ArrayRef<Type *> ArgTys) {
7292 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7293
7294 // Calculate the cost of the scalar and vector calls.
7295 FastMathFlags FMF;
7296 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7297 FMF = FPCI->getFastMathFlags();
7298 SmallVector<const Value *> Arguments(CI->args());
7299 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7300 dyn_cast<IntrinsicInst>(CI));
7301 auto IntrinsicCost =
7302 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
7303
7304 auto Shape = VFShape::get(CI->getFunctionType(),
7305 ElementCount::getFixed(VecTy->getNumElements()),
7306 false /*HasGlobalPred*/);
7307 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7308 auto LibCost = IntrinsicCost;
7309 if (!CI->isNoBuiltin() && VecFunc) {
7310 // Calculate the cost of the vector library call.
7311 // If the corresponding vector call is cheaper, return its cost.
7312 LibCost =
7313 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7314 }
7315 return {IntrinsicCost, LibCost};
7316}
7317
7318void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7319 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7320 SmallVectorImpl<Value *> *OpScalars,
7321 SmallVectorImpl<Value *> *AltScalars) const {
7322 unsigned Sz = Scalars.size();
7323 Mask.assign(Sz, PoisonMaskElem);
7324 SmallVector<int> OrderMask;
7325 if (!ReorderIndices.empty())
7326 inversePermutation(ReorderIndices, OrderMask);
7327 for (unsigned I = 0; I < Sz; ++I) {
7328 unsigned Idx = I;
7329 if (!ReorderIndices.empty())
7330 Idx = OrderMask[I];
7331 auto *OpInst = cast<Instruction>(Scalars[Idx]);
7332 if (IsAltOp(OpInst)) {
7333 Mask[I] = Sz + Idx;
7334 if (AltScalars)
7335 AltScalars->push_back(OpInst);
7336 } else {
7337 Mask[I] = Idx;
7338 if (OpScalars)
7339 OpScalars->push_back(OpInst);
7340 }
7341 }
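// Worked example: for an add/sub bundle of 4 scalars where lanes 1 and 3 are
// the alternate (sub) opcode, the loop above builds the mask <0, 5, 2, 7>:
// main-opcode lanes keep their index into the first vector, alternate lanes
// select from the second vector at Sz + index.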
7342 if (!ReuseShuffleIndices.empty()) {
7343 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7344 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7345 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7346 });
7347 Mask.swap(NewMask);
7348 }
7349}
7350
7351 static bool isAlternateInstruction(const Instruction *I,
7352 const Instruction *MainOp,
7353 const Instruction *AltOp,
7354 const TargetLibraryInfo &TLI) {
7355 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7356 auto *AltCI = cast<CmpInst>(AltOp);
7357 CmpInst::Predicate MainP = MainCI->getPredicate();
7358 CmpInst::Predicate AltP = AltCI->getPredicate();
7359 assert(MainP != AltP && "Expected different main/alternate predicates.");
7360 auto *CI = cast<CmpInst>(I);
7361 if (isCmpSameOrSwapped(MainCI, CI, TLI))
7362 return false;
7363 if (isCmpSameOrSwapped(AltCI, CI, TLI))
7364 return true;
7365 CmpInst::Predicate P = CI->getPredicate();
7366 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7367
7368 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7369 "CmpInst expected to match either main or alternate predicate or "
7370 "their swap.");
7371 (void)AltP;
7372 return MainP != P && MainP != SwappedP;
7373 }
7374 return I->getOpcode() == AltOp->getOpcode();
7375}
7376
7377TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7378 assert(!Ops.empty());
7379 const auto *Op0 = Ops.front();
7380
7381 const bool IsConstant = all_of(Ops, [](Value *V) {
7382 // TODO: We should allow undef elements here
7383 return isConstant(V) && !isa<UndefValue>(V);
7384 });
7385 const bool IsUniform = all_of(Ops, [=](Value *V) {
7386 // TODO: We should allow undef elements here
7387 return V == Op0;
7388 });
7389 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7390 // TODO: We should allow undef elements here
7391 if (auto *CI = dyn_cast<ConstantInt>(V))
7392 return CI->getValue().isPowerOf2();
7393 return false;
7394 });
7395 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7396 // TODO: We should allow undef elements here
7397 if (auto *CI = dyn_cast<ConstantInt>(V))
7398 return CI->getValue().isNegatedPowerOf2();
7399 return false;
7400 });
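// For example, if every operand is the same constant 4, the result is
// {OK_UniformConstantValue, OP_PowerOf2}; a mix of distinct non-constant
// values yields {OK_AnyValue, OP_None}.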
7401
7402 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7403 if (IsConstant && IsUniform)
7404 VK = TTI::OK_UniformConstantValue;
7405 else if (IsConstant)
7406 VK = TTI::OK_NonUniformConstantValue;
7407 else if (IsUniform)
7408 VK = TTI::OK_UniformValue;
7409
7410 TTI::OperandValueProperties VP = TTI::OP_None;
7411 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7412 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7413
7414 return {VK, VP};
7415}
7416
7417namespace {
7418/// The base class for shuffle instruction emission and shuffle cost estimation.
7419class BaseShuffleAnalysis {
7420protected:
7421 /// Checks if the mask is an identity mask.
7422 /// \param IsStrict if it is true, the function returns false if the mask size
7423 /// does not match the vector size.
7424 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7425 bool IsStrict) {
7426 int Limit = Mask.size();
7427 int VF = VecTy->getNumElements();
7428 int Index = -1;
7429 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7430 return true;
7431 if (!IsStrict) {
7432 // Consider extract subvector starting from index 0.
7433 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7434 Index == 0)
7435 return true;
7436 // All VF-size submasks are identity (e.g.
7437 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7438 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7439 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7440 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7441 ShuffleVectorInst::isIdentityMask(Slice, VF);
7442 }))
7443 return true;
7444 }
7445 return false;
7446 }
7447
7448 /// Tries to combine 2 different masks into single one.
7449 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7450 /// change the size of the vector, \p LocalVF is the original size of the
7451 /// shuffled vector.
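/// For example, with LocalVF == 2, Mask == <1, 0> and ExtMask == <1, 0>, the
/// combined mask is <0, 1>: applying the outer permutation on top of the
/// inner one cancels out, so the two shuffles fold into an identity.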
7452 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7453 ArrayRef<int> ExtMask) {
7454 unsigned VF = Mask.size();
7455 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7456 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7457 if (ExtMask[I] == PoisonMaskElem)
7458 continue;
7459 int MaskedIdx = Mask[ExtMask[I] % VF];
7460 NewMask[I] =
7461 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7462 }
7463 Mask.swap(NewMask);
7464 }
7465
7466 /// Looks through shuffles trying to reduce final number of shuffles in the
7467 /// code. The function looks through the previously emitted shuffle
7468 /// instructions and properly marks indices in the mask as undef.
7469 /// For example, given the code
7470 /// \code
7471 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7472 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7473 /// \endcode
7474 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7475 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7476 /// <0, 1, 2, 3> for the shuffle.
7477 /// If 2 operands are of different size, the smallest one will be resized and
7478 /// the mask recalculated properly.
7479 /// For example, given the code
7480 /// \code
7481 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7482 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7483 /// \endcode
7484 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
7485 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7486 /// <0, 1, 2, 3> for the shuffle.
7487 /// So, it tries to transform permutations to simple vector merge, if
7488 /// possible.
7489 /// \param V The input vector which must be shuffled using the given \p Mask.
7490 /// If the better candidate is found, \p V is set to this best candidate
7491 /// vector.
7492 /// \param Mask The input mask for the shuffle. If the best candidate is found
7493 /// during looking-through-shuffles attempt, it is updated accordingly.
7494 /// \param SinglePermute true if the shuffle operation is originally a
7495 /// single-value-permutation. In this case the look-through-shuffles procedure
7496 /// may look for resizing shuffles as the best candidates.
7497 /// \return true if the shuffle results in the non-resizing identity shuffle
7498 /// (and thus can be ignored), false - otherwise.
7499 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7500 bool SinglePermute) {
7501 Value *Op = V;
7502 ShuffleVectorInst *IdentityOp = nullptr;
7503 SmallVector<int> IdentityMask;
7504 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7505 // Exit if not a fixed vector type or changing size shuffle.
7506 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7507 if (!SVTy)
7508 break;
7509 // Remember the identity or broadcast mask, if it is not a resizing
7510 // shuffle. If no better candidates are found, this Op and Mask will be
7511 // used in the final shuffle.
7512 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7513 if (!IdentityOp || !SinglePermute ||
7514 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7515 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7516 IdentityMask.size()))) {
7517 IdentityOp = SV;
7518 // Store the current mask in IdentityMask so that we do not lose
7519 // this info later if IdentityOp is selected as the best candidate for the
7520 // permutation.
7521 IdentityMask.assign(Mask);
7522 }
7523 }
7524 // Remember the broadcast mask. If no better candidates are found, this Op
7525 // and Mask will be used in the final shuffle.
7526 // Zero splat can be used as identity too, since it might be used with
7527 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7528 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
7529 // expensive, and the analysis finds out that the source vector is just a
7530 // broadcast, this original mask can be transformed to the identity mask <0,
7531 // 1, 2, 3>.
7532 // \code
7533 // %0 = shuffle %v, poison, zeroinitalizer
7534 // %res = shuffle %0, poison, <3, 1, 2, 0>
7535 // \endcode
7536 // may be transformed to
7537 // \code
7538 // %0 = shuffle %v, poison, zeroinitalizer
7539 // %res = shuffle %0, poison, <0, 1, 2, 3>
7540 // \endcode
7541 if (SV->isZeroEltSplat()) {
7542 IdentityOp = SV;
7543 IdentityMask.assign(Mask);
7544 }
7545 int LocalVF = Mask.size();
7546 if (auto *SVOpTy =
7547 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7548 LocalVF = SVOpTy->getNumElements();
7549 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7550 for (auto [Idx, I] : enumerate(Mask)) {
7551 if (I == PoisonMaskElem ||
7552 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7553 continue;
7554 ExtMask[Idx] = SV->getMaskValue(I);
7555 }
7556 bool IsOp1Undef =
7557 isUndefVector(SV->getOperand(0),
7558 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7559 .all();
7560 bool IsOp2Undef =
7561 isUndefVector(SV->getOperand(1),
7562 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7563 .all();
7564 if (!IsOp1Undef && !IsOp2Undef) {
7565 // Update mask and mark undef elems.
7566 for (int &I : Mask) {
7567 if (I == PoisonMaskElem)
7568 continue;
7569 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
7570 PoisonMaskElem)
7571 I = PoisonMaskElem;
7572 }
7573 break;
7574 }
7575 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7576 SV->getShuffleMask().end());
7577 combineMasks(LocalVF, ShuffleMask, Mask);
7578 Mask.swap(ShuffleMask);
7579 if (IsOp2Undef)
7580 Op = SV->getOperand(0);
7581 else
7582 Op = SV->getOperand(1);
7583 }
7584 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
7585 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7586 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
7587 if (IdentityOp) {
7588 V = IdentityOp;
7589 assert(Mask.size() == IdentityMask.size() &&
7590 "Expected masks of same sizes.");
7591 // Clear known poison elements.
7592 for (auto [I, Idx] : enumerate(Mask))
7593 if (Idx == PoisonMaskElem)
7594 IdentityMask[I] = PoisonMaskElem;
7595 Mask.swap(IdentityMask);
7596 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7597 return SinglePermute &&
7598 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
7599 /*IsStrict=*/true) ||
7600 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7601 Shuffle->isZeroEltSplat() &&
7602 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
7603 }
7604 V = Op;
7605 return false;
7606 }
7607 V = Op;
7608 return true;
7609 }
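// For illustration (with a hypothetical value %v): asking to shuffle
// \code
// %s1 = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// \endcode
// with the mask <3, 2, 1, 0> combines the two reverse masks into <0, 1, 2, 3>,
// so peekThroughShuffles replaces V with %v and returns true - no shuffle
// instruction needs to be emitted for this request.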
7610
7611 /// Smart shuffle instruction emission, walks through shuffle trees and
7612 /// tries to find the best matching vector for the actual shuffle
7613 /// instruction.
7614 template <typename T, typename ShuffleBuilderTy>
7615 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7616 ShuffleBuilderTy &Builder) {
7617 assert(V1 && "Expected at least one vector value.");
7618 if (V2)
7619 Builder.resizeToMatch(V1, V2);
7620 int VF = Mask.size();
7621 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
7622 VF = FTy->getNumElements();
7623 if (V2 &&
7624 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
7625 // Peek through shuffles.
7626 Value *Op1 = V1;
7627 Value *Op2 = V2;
7628 int VF =
7629 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7630 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7631 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7632 for (int I = 0, E = Mask.size(); I < E; ++I) {
7633 if (Mask[I] < VF)
7634 CombinedMask1[I] = Mask[I];
7635 else
7636 CombinedMask2[I] = Mask[I] - VF;
7637 }
7638 Value *PrevOp1;
7639 Value *PrevOp2;
7640 do {
7641 PrevOp1 = Op1;
7642 PrevOp2 = Op2;
7643 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
7644 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
7645 // Check if we have 2 resizing shuffles - need to peek through operands
7646 // again.
7647 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7648 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7649 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7650 for (auto [Idx, I] : enumerate(CombinedMask1)) {
7651 if (I == PoisonMaskElem)
7652 continue;
7653 ExtMask1[Idx] = SV1->getMaskValue(I);
7654 }
7655 SmallBitVector UseMask1 = buildUseMask(
7656 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7657 ->getNumElements(),
7658 ExtMask1, UseMask::SecondArg);
7659 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7660 for (auto [Idx, I] : enumerate(CombinedMask2)) {
7661 if (I == PoisonMaskElem)
7662 continue;
7663 ExtMask2[Idx] = SV2->getMaskValue(I);
7664 }
7665 SmallBitVector UseMask2 = buildUseMask(
7666 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7667 ->getNumElements(),
7668 ExtMask2, UseMask::SecondArg);
7669 if (SV1->getOperand(0)->getType() ==
7670 SV2->getOperand(0)->getType() &&
7671 SV1->getOperand(0)->getType() != SV1->getType() &&
7672 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
7673 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
7674 Op1 = SV1->getOperand(0);
7675 Op2 = SV2->getOperand(0);
7676 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7677 SV1->getShuffleMask().end());
7678 int LocalVF = ShuffleMask1.size();
7679 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
7680 LocalVF = FTy->getNumElements();
7681 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7682 CombinedMask1.swap(ShuffleMask1);
7683 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7684 SV2->getShuffleMask().end());
7685 LocalVF = ShuffleMask2.size();
7686 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
7687 LocalVF = FTy->getNumElements();
7688 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7689 CombinedMask2.swap(ShuffleMask2);
7690 }
7691 }
7692 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
7693 Builder.resizeToMatch(Op1, Op2);
7694 VF = std::max(cast<VectorType>(Op1->getType())
7695 ->getElementCount()
7696 .getKnownMinValue(),
7697 cast<VectorType>(Op2->getType())
7698 ->getElementCount()
7699 .getKnownMinValue());
7700 for (int I = 0, E = Mask.size(); I < E; ++I) {
7701 if (CombinedMask2[I] != PoisonMaskElem) {
7702 assert(CombinedMask1[I] == PoisonMaskElem &&
7703 "Expected undefined mask element");
7704 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
7705 }
7706 }
7707 if (Op1 == Op2 &&
7708 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
7709 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
7710 isa<ShuffleVectorInst>(Op1) &&
7711 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7712 ArrayRef(CombinedMask1))))
7713 return Builder.createIdentity(Op1);
7714 return Builder.createShuffleVector(
7715 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
7716 CombinedMask1);
7717 }
7718 if (isa<PoisonValue>(V1))
7719 return Builder.createPoison(
7720 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
7721 SmallVector<int> NewMask(Mask.begin(), Mask.end());
7722 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
7723 assert(V1 && "Expected non-null value after looking through shuffles.");
7724
7725 if (!IsIdentity)
7726 return Builder.createShuffleVector(V1, NewMask);
7727 return Builder.createIdentity(V1);
7728 }
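// For illustration (with hypothetical <4 x i32> inputs V1 and V2): a request
// for Mask = <0, 4, 1, 5> is split into CombinedMask1 = <0, poison, 1, poison>
// for V1 and CombinedMask2 = <poison, 0, poison, 1> for V2, each operand is
// peeked through independently, and the two masks are merged back into a
// single two-source mask before the builder emits the final shuffle.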
7729};
7730} // namespace
7731
7732/// Returns the cost of the shuffle instructions with the given \p Kind, vector
7733/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
7734/// subvector pattern.
7735static InstructionCost
7736 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
7737 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
7738 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
7739 int Index = 0, VectorType *SubTp = nullptr,
7740 ArrayRef<const Value *> Args = std::nullopt) {
7741 if (Kind != TTI::SK_PermuteTwoSrc)
7742 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7743 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7744 int NumSubElts;
7745 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
7746 Mask, NumSrcElts, NumSubElts, Index)) {
7747 if (Index + NumSubElts > NumSrcElts &&
7748 Index + NumSrcElts <= static_cast<int>(Mask.size()))
7749 return TTI.getShuffleCost(
7750 TTI::SK_InsertSubvector,
7751 FixedVectorType::get(Tp->getElementType(), Mask.size()), Mask,
7752 TTI::TCK_RecipThroughput, Index, Tp);
7753 }
7754 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7755}
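// For illustration: with Tp = <4 x i32> and the two-source mask
// <0, 1, 2, 3, 4, 5, 6, 7>, the mask is recognized as inserting the four
// elements of the second source at index 4 of a wider <8 x i32> result, so the
// cost is queried as an SK_InsertSubvector of a <4 x i32> subvector into
// <8 x i32> instead of a generic two-source permutation.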
7756
7757/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
7758static std::pair<InstructionCost, InstructionCost>
7759 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
7760 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
7761 Type *ScalarTy, VectorType *VecTy) {
7762 InstructionCost ScalarCost = 0;
7763 InstructionCost VecCost = 0;
7764 // Here we differentiate two cases: (1) when Ptrs represent a regular
7765 // vectorization tree node (as they are pointer arguments of scattered
7766 // loads) or (2) when Ptrs are the arguments of loads or stores being
7767 // vectorized as plain wide unit-stride load/store since all the
7768 // loads/stores are known to be from/to adjacent locations.
7769 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7770 // Case 2: estimate costs for pointer related costs when vectorizing to
7771 // a wide load/store.
7772 // Scalar cost is estimated as a set of pointers with known relationship
7773 // between them.
7774 // For vector code we will use BasePtr as argument for the wide load/store
7775 // but we also need to account all the instructions which are going to
7776 // stay in vectorized code due to uses outside of these scalar
7777 // loads/stores.
7778 ScalarCost = TTI.getPointersChainCost(
7779 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7780 CostKind);
7781
7782 SmallVector<const Value *> PtrsRetainedInVecCode;
7783 for (Value *V : Ptrs) {
7784 if (V == BasePtr) {
7785 PtrsRetainedInVecCode.push_back(V);
7786 continue;
7787 }
7788 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7789 // For simplicity assume Ptr to stay in vectorized code if it's not a
7790 // GEP instruction. We don't care since its cost is considered free.
7791 // TODO: We should check for any uses outside of vectorizable tree
7792 // rather than just a single use.
7793 if (!Ptr || !Ptr->hasOneUse())
7794 PtrsRetainedInVecCode.push_back(V);
7795 }
7796
7797 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
7798 // If all pointers stay in vectorized code then we don't have
7799 // any savings on that.
7800 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
7801 }
7802 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
7803 TTI::PointersChainInfo::getKnownStride(),
7804 VecTy, CostKind);
7805 } else {
7806 // Case 1: Ptrs are the arguments of loads that we are going to transform
7807 // into masked gather load intrinsic.
7808 // All the scalar GEPs will be removed as a result of vectorization.
7809 // For any external uses of some lanes, extractelement instructions will
7810 // be generated (their cost is estimated separately).
7811 TTI::PointersChainInfo PtrsInfo =
7812 all_of(Ptrs,
7813 [](const Value *V) {
7814 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7815 return Ptr && !Ptr->hasAllConstantIndices();
7816 })
7817 ? TTI::PointersChainInfo::getUnknownStride()
7818 : TTI::PointersChainInfo::getKnownStride();
7819
7820 ScalarCost =
7821 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
7822 if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
7823 SmallVector<const Value *> Indices(BaseGEP->indices());
7824 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
7825 BaseGEP->getPointerOperand(), Indices, VecTy,
7826 CostKind);
7827 }
7828 }
7829
7830 return std::make_pair(ScalarCost, VecCost);
7831}
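// For illustration (hypothetical pointers): when four adjacent loads from
// %p, %p+1, %p+2, %p+3 are vectorized as one wide load, only the base pointer
// survives in vector code, so the savings are the scalar pointer-chain cost
// minus the cost of whichever GEPs still have outside users (case 2 above).
// When the same pointers feed a masked gather instead, the scalar GEPs are
// replaced by a single vector GEP, and case 1 compares those two chains.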
7832
7833void BoUpSLP::transformNodes() {
7834 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7835 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7836 TreeEntry &E = *TE.get();
7837 switch (E.getOpcode()) {
7838 case Instruction::Load: {
7839 Type *ScalarTy = E.getMainOp()->getType();
7840 auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
7841 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
7842 // Check if profitable to represent consecutive load + reverse as strided
7843 // load with stride -1.
7844 if (isReverseOrder(E.ReorderIndices) &&
7845 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
7846 SmallVector<int> Mask;
7847 inversePermutation(E.ReorderIndices, Mask);
7848 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
7849 InstructionCost OriginalVecCost =
7850 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
7851 BaseLI->getPointerAddressSpace(), CostKind,
7852 TTI::OperandValueInfo()) +
7853 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
7854 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
7855 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
7856 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
7857 if (StridedCost < OriginalVecCost)
7858 // Strided load is more profitable than consecutive load + reverse -
7859 // transform the node to strided load.
7860 E.State = TreeEntry::StridedVectorize;
7861 }
7862 break;
7863 }
7864 default:
7865 break;
7866 }
7867 }
7868}
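// For illustration (hypothetical IR): a node whose scalars are
// \code
// %a = load i32, ptr %p3
// %b = load i32, ptr %p2
// %c = load i32, ptr %p1
// %d = load i32, ptr %p0
// \endcode
// with %p0..%p3 adjacent is a consecutive load plus a reverse shuffle; if the
// target reports the strided form as cheaper, the node is re-tagged as
// StridedVectorize so codegen can emit one stride -1 strided load from %p3.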
7869
7870/// Merges shuffle masks and emits the final shuffle instruction, if required. It
7871/// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
7872/// the actual shuffle instruction is generated only if it is actually
7873/// required. Otherwise, the shuffle instruction emission is delayed till the
7874/// end of the process, to reduce the number of emitted instructions and to
7875/// simplify further analysis/transformations.
7876class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7877 bool IsFinalized = false;
7878 SmallVector<int> CommonMask;
7879 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
7880 Type *ScalarTy = nullptr;
7881 const TargetTransformInfo &TTI;
7882 InstructionCost Cost = 0;
7883 SmallDenseSet<Value *> VectorizedVals;
7884 BoUpSLP &R;
7885 SmallPtrSetImpl<Value *> &CheckedExtracts;
7886 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7887 /// While set, we are still trying to estimate the cost for the same nodes and
7888 /// can delay the actual cost estimation (virtual shuffle instruction emission).
7889 /// This may help to better estimate the cost if the same nodes must be permuted
7890 /// and allows moving most of the long shuffle cost estimation to TTI.
7891 bool SameNodesEstimated = true;
7892
7893 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
7894 if (Ty->getScalarType()->isPointerTy()) {
7895 Constant *Res = ConstantExpr::getIntToPtr(
7896 ConstantInt::getAllOnesValue(IntegerType::get(
7897 Ty->getContext(),
7898 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
7899 Ty->getScalarType());
7900 if (auto *VTy = dyn_cast<VectorType>(Ty))
7901 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
7902 return Res;
7903 }
7904 return Constant::getAllOnesValue(Ty);
7905 }
7906
7907 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
7908 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
7909 return TTI::TCC_Free;
7910 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
7911 InstructionCost GatherCost = 0;
7912 SmallVector<Value *> Gathers(VL.begin(), VL.end());
7913 // Improve gather cost for gather of loads, if we can group some of the
7914 // loads into vector loads.
7915 InstructionsState S = getSameOpcode(VL, *R.TLI);
7916 const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
7917 unsigned MinVF = R.getMinVF(2 * Sz);
7918 if (VL.size() > 2 &&
7919 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
7920 (InVectors.empty() &&
7921 any_of(seq<unsigned>(0, VL.size() / MinVF),
7922 [&](unsigned Idx) {
7923 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
7924 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
7925 return S.getOpcode() == Instruction::Load &&
7926 !S.isAltShuffle();
7927 }))) &&
7928 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
7929 !isSplat(Gathers)) {
7930 InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
7931 SetVector<Value *> VectorizedLoads;
7932 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
7933 SmallVector<unsigned> ScatterVectorized;
7934 unsigned StartIdx = 0;
7935 unsigned VF = VL.size() / 2;
7936 for (; VF >= MinVF; VF /= 2) {
7937 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
7938 Cnt += VF) {
7939 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
7940 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
7941 InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
7942 if (SliceS.getOpcode() != Instruction::Load ||
7943 SliceS.isAltShuffle())
7944 continue;
7945 }
7946 if (!VectorizedLoads.count(Slice.front()) &&
7947 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
7948 SmallVector<Value *> PointerOps;
7949 OrdersType CurrentOrder;
7950 LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
7951 CurrentOrder, PointerOps);
7952 switch (LS) {
7953 case LoadsState::Vectorize:
7954 case LoadsState::ScatterVectorize:
7955 case LoadsState::StridedVectorize:
7956 // Mark the vectorized loads so that we don't vectorize them
7957 // again.
7958 // TODO: better handling of loads with reorders.
7959 if (((LS == LoadsState::Vectorize ||
7960 LS == LoadsState::StridedVectorize) &&
7961 CurrentOrder.empty()) ||
7962 (LS == LoadsState::StridedVectorize &&
7963 isReverseOrder(CurrentOrder)))
7964 VectorizedStarts.emplace_back(Cnt, LS);
7965 else
7966 ScatterVectorized.push_back(Cnt);
7967 VectorizedLoads.insert(Slice.begin(), Slice.end());
7968 // If we vectorized initial block, no need to try to vectorize
7969 // it again.
7970 if (Cnt == StartIdx)
7971 StartIdx += VF;
7972 break;
7973 case LoadsState::Gather:
7974 break;
7975 }
7976 }
7977 }
7978 // Check if the whole array was vectorized already - exit.
7979 if (StartIdx >= VL.size())
7980 break;
7981 // Found vectorizable parts - exit.
7982 if (!VectorizedLoads.empty())
7983 break;
7984 }
7985 if (!VectorizedLoads.empty()) {
7986 unsigned NumParts = TTI.getNumberOfParts(VecTy);
7987 bool NeedInsertSubvectorAnalysis =
7988 !NumParts || (VL.size() / VF) > NumParts;
7989 // Get the cost for gathered loads.
7990 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
7991 if (VectorizedLoads.contains(VL[I]))
7992 continue;
7993 GatherCost +=
7994 getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
7995 }
7996 // Exclude potentially vectorized loads from list of gathered
7997 // scalars.
7998 Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
7999 // The cost for vectorized loads.
8000 InstructionCost ScalarsCost = 0;
8001 for (Value *V : VectorizedLoads) {
8002 auto *LI = cast<LoadInst>(V);
8003 ScalarsCost +=
8004 TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
8005 LI->getAlign(), LI->getPointerAddressSpace(),
8006 CostKind, TTI::OperandValueInfo(), LI);
8007 }
8008 auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
8009 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
8010 auto *LI = cast<LoadInst>(VL[P.first]);
8011 Align Alignment = LI->getAlign();
8012 GatherCost +=
8013 P.second == LoadsState::Vectorize
8014 ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
8015 LI->getPointerAddressSpace(), CostKind,
8016 TTI::OperandValueInfo(), LI)
8017 : TTI.getStridedMemoryOpCost(
8018 Instruction::Load, LoadTy, LI->getPointerOperand(),
8019 /*VariableMask=*/false, Alignment, CostKind, LI);
8020 // Estimate GEP cost.
8021 SmallVector<Value *> PointerOps(VF);
8022 for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
8023 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8024 auto [ScalarGEPCost, VectorGEPCost] =
8025 getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
8026 Instruction::Load, CostKind, LI->getType(), LoadTy);
8027 GatherCost += VectorGEPCost - ScalarGEPCost;
8028 }
8029 for (unsigned P : ScatterVectorized) {
8030 auto *LI0 = cast<LoadInst>(VL[P]);
8031 ArrayRef<Value *> Slice = VL.slice(P, VF);
8032 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8033 GatherCost += TTI.getGatherScatterOpCost(
8034 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8035 /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
8036 // Estimate GEP cost.
8037 SmallVector<Value *> PointerOps(VF);
8038 for (auto [I, V] : enumerate(Slice))
8039 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8040 OrdersType Order;
8041 if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
8042 Order)) {
8043 // TODO: improve checks if GEPs can be vectorized.
8044 Value *Ptr0 = PointerOps.front();
8045 Type *ScalarTy = Ptr0->getType();
8046 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
8047 auto [ScalarGEPCost, VectorGEPCost] =
8048 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
8049 CostKind, ScalarTy, VecTy);
8050 GatherCost += VectorGEPCost - ScalarGEPCost;
8051 if (!Order.empty()) {
8052 SmallVector<int> Mask;
8053 inversePermutation(Order, Mask);
8054 GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8055 VecTy, Mask, CostKind);
8056 }
8057 } else {
8058 GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
8059 PointerOps.front()->getType());
8060 }
8061 }
8062 if (NeedInsertSubvectorAnalysis) {
8063 // Add the cost for the subvectors insert.
8064 SmallVector<int> ShuffleMask(VL.size());
8065 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8066 for (unsigned Idx : seq<unsigned>(0, E))
8067 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8068 GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
8069 ShuffleMask, CostKind, I, LoadTy);
8070 }
8071 }
8072 GatherCost -= ScalarsCost;
8073 }
8074 GatherCost = std::min(BaseCost, GatherCost);
8075 } else if (!Root && isSplat(VL)) {
8076 // Found a broadcast of a single scalar; calculate the cost as
8077 // a broadcast.
8078 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8079 assert(It != VL.end() && "Expected at least one non-undef value.");
8080 // Add broadcast for non-identity shuffle only.
8081 bool NeedShuffle =
8082 count(VL, *It) > 1 &&
8083 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
8084 if (!NeedShuffle)
8085 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
8086 CostKind, std::distance(VL.begin(), It),
8087 PoisonValue::get(VecTy), *It);
8088
8089 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8090 transform(VL, ShuffleMask.begin(), [](Value *V) {
8091 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8092 });
8093 InstructionCost InsertCost =
8094 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
8095 PoisonValue::get(VecTy), *It);
8096 return InsertCost + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
8097 VecTy, ShuffleMask, CostKind,
8098 /*Index=*/0, /*SubTp=*/nullptr,
8099 /*Args=*/*It);
8100 }
8101 return GatherCost +
8102 (all_of(Gathers, IsaPred<UndefValue>)
8103 ? TTI::TCC_Free
8104 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
8105 ScalarTy));
8106 };
8107
8108 /// Compute the cost of creating a vector containing the extracted values from
8109 /// \p VL.
8110 InstructionCost
8111 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8112 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8113 unsigned NumParts) {
8114 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8115 unsigned NumElts =
8116 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8117 auto *EE = dyn_cast<ExtractElementInst>(V);
8118 if (!EE)
8119 return Sz;
8120 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8121 if (!VecTy)
8122 return Sz;
8123 return std::max(Sz, VecTy->getNumElements());
8124 });
8125 unsigned NumSrcRegs =
8126 TTI.getNumberOfParts(FixedVectorType::get(ScalarTy, NumElts));
8127 if (NumSrcRegs == 0)
8128 NumSrcRegs = 1;
8129 // FIXME: this must be moved to TTI for better estimation.
8130 unsigned EltsPerVector = PowerOf2Ceil(std::max(
8131 divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
8132 auto CheckPerRegistersShuffle =
8133 [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
8134 DenseSet<int> RegIndices;
8135 // Check if we are trying to permute the same single or 2 input vectors.
8136 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8137 int FirstRegId = -1;
8138 for (int &I : Mask) {
8139 if (I == PoisonMaskElem)
8140 continue;
8141 int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
8142 if (FirstRegId < 0)
8143 FirstRegId = RegId;
8144 RegIndices.insert(RegId);
8145 if (RegIndices.size() > 2)
8146 return std::nullopt;
8147 if (RegIndices.size() == 2)
8148 ShuffleKind = TTI::SK_PermuteTwoSrc;
8149 I = (I % NumElts) % EltsPerVector +
8150 (RegId == FirstRegId ? 0 : EltsPerVector);
8151 }
8152 return ShuffleKind;
8153 };
8154 InstructionCost Cost = 0;
8155
8156 // Process extracts in blocks of EltsPerVector to check if the source vector
8157 // operand can be re-used directly. If not, add the cost of creating a
8158 // shuffle to extract the values into a vector register.
8159 for (unsigned Part = 0; Part < NumParts; ++Part) {
8160 if (!ShuffleKinds[Part])
8161 continue;
8162 ArrayRef<int> MaskSlice =
8163 Mask.slice(Part * EltsPerVector,
8164 (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8165 ? Mask.size() % EltsPerVector
8166 : EltsPerVector);
8167 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8168 copy(MaskSlice, SubMask.begin());
8169 std::optional<TTI::ShuffleKind> RegShuffleKind =
8170 CheckPerRegistersShuffle(SubMask);
8171 if (!RegShuffleKind) {
8172 Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
8173 FixedVectorType::get(ScalarTy, NumElts),
8174 MaskSlice);
8175 continue;
8176 }
8177 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8178 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
8179 Cost += ::getShuffleCost(TTI, *RegShuffleKind,
8180 FixedVectorType::get(ScalarTy, EltsPerVector),
8181 SubMask);
8182 }
8183 }
8184 return Cost;
8185 }
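// For illustration of the per-register check in computeExtractCost above:
// with NumElts = 8 and EltsPerVector = 4, the sub-mask <0, 1, 2, 3> stays
// inside one source register in identity order and adds no cost, while
// <5, 4, 7, 6> is remapped to the per-register mask <1, 0, 3, 2> and is costed
// as a single-source permutation of a 4-element vector.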
8186 /// Transforms the mask \p CommonMask according to the given \p Mask to make a
8187 /// proper mask after the shuffle emission.
8188 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8189 ArrayRef<int> Mask) {
8190 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8191 if (Mask[Idx] != PoisonMaskElem)
8192 CommonMask[Idx] = Idx;
8193 }
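// E.g., calling transformMaskAfterShuffle(CommonMask, CommonMask) with
// CommonMask = <2, poison, 0, poison> yields <0, poison, 2, poison>: once the
// shuffle has been emitted, every produced lane refers to itself in the
// emitted vector and the poison lanes are left untouched.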
8194 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
8195 /// mask \p Mask, register number \p Part, that includes \p SliceSize
8196 /// elements.
8197 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8198 ArrayRef<int> Mask, unsigned Part,
8199 unsigned SliceSize) {
8200 if (SameNodesEstimated) {
8201 // Delay the cost estimation if the same nodes are reshuffling.
8202 // If we already requested the cost of reshuffling of E1 and E2 before, no
8203 // need to estimate another cost with the sub-Mask, instead include this
8204 // sub-Mask into the CommonMask to estimate it later and avoid double cost
8205 // estimation.
8206 if ((InVectors.size() == 2 &&
8207 InVectors.front().get<const TreeEntry *>() == &E1 &&
8208 InVectors.back().get<const TreeEntry *>() == E2) ||
8209 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8210 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
8211 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8212 "Expected all poisoned elements.");
8213 ArrayRef<int> SubMask =
8214 ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
8215 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8216 return;
8217 }
8218 // Found non-matching nodes - need to estimate the cost for the matched
8219 // and transform mask.
8220 Cost += createShuffle(InVectors.front(),
8221 InVectors.size() == 1 ? nullptr : InVectors.back(),
8222 CommonMask);
8223 transformMaskAfterShuffle(CommonMask, CommonMask);
8224 }
8225 SameNodesEstimated = false;
8226 if (!E2 && InVectors.size() == 1) {
8227 unsigned VF = E1.getVectorFactor();
8228 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8229 VF = std::max(VF,
8230 cast<FixedVectorType>(V1->getType())->getNumElements());
8231 } else {
8232 const auto *E = InVectors.front().get<const TreeEntry *>();
8233 VF = std::max(VF, E->getVectorFactor());
8234 }
8235 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8236 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8237 CommonMask[Idx] = Mask[Idx] + VF;
8238 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8239 transformMaskAfterShuffle(CommonMask, CommonMask);
8240 } else {
8241 Cost += createShuffle(&E1, E2, Mask);
8242 transformMaskAfterShuffle(CommonMask, Mask);
8243 }
8244 }
8245
8246 class ShuffleCostBuilder {
8247 const TargetTransformInfo &TTI;
8248
8249 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8250 int Index = -1;
8251 return Mask.empty() ||
8252 (VF == Mask.size() &&
8253 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
8254 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
8255 Index == 0);
8256 }
8257
8258 public:
8259 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8260 ~ShuffleCostBuilder() = default;
8261 InstructionCost createShuffleVector(Value *V1, Value *,
8262 ArrayRef<int> Mask) const {
8263 // Empty mask or identity mask are free.
8264 unsigned VF =
8265 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8266 if (isEmptyOrIdentity(Mask, VF))
8267 return TTI::TCC_Free;
8268 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8269 cast<VectorType>(V1->getType()), Mask);
8270 }
8271 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8272 // Empty mask or identity mask are free.
8273 unsigned VF =
8274 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8275 if (isEmptyOrIdentity(Mask, VF))
8276 return TTI::TCC_Free;
8277 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8278 cast<VectorType>(V1->getType()), Mask);
8279 }
8280 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8281 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8282 return TTI::TCC_Free;
8283 }
8284 void resizeToMatch(Value *&, Value *&) const {}
8285 };
8286
8287 /// Smart shuffle instruction emission, walks through shuffle trees and
8288 /// tries to find the best matching vector for the actual shuffle
8289 /// instruction.
8290 InstructionCost
8291 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8292 const PointerUnion<Value *, const TreeEntry *> &P2,
8293 ArrayRef<int> Mask) {
8294 ShuffleCostBuilder Builder(TTI);
8295 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8296 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8297 unsigned CommonVF = Mask.size();
8298 InstructionCost ExtraCost = 0;
8299 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
8300 unsigned VF) -> InstructionCost {
8301 if (E.State == TreeEntry::NeedToGather && allConstant(E.Scalars))
8302 return TTI::TCC_Free;
8303 Type *EScalarTy = E.Scalars.front()->getType();
8304 bool IsSigned = true;
8305 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8306 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
8307 IsSigned = It->second.second;
8308 }
8309 if (EScalarTy != ScalarTy) {
8310 unsigned CastOpcode = Instruction::Trunc;
8311 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8312 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8313 if (DstSz > SrcSz)
8314 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8315 return TTI.getCastInstrCost(CastOpcode,
8316 FixedVectorType::get(ScalarTy, VF),
8317 FixedVectorType::get(EScalarTy, VF),
8318 TTI::CastContextHint::None, CostKind);
8319 }
8320 return TTI::TCC_Free;
8321 };
8322 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
8323 if (isa<Constant>(V))
8324 return TTI::TCC_Free;
8325 auto *VecTy = cast<VectorType>(V->getType());
8326 Type *EScalarTy = VecTy->getElementType();
8327 if (EScalarTy != ScalarTy) {
8328 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
8329 unsigned CastOpcode = Instruction::Trunc;
8330 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8331 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8332 if (DstSz > SrcSz)
8333 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8334 return TTI.getCastInstrCost(
8335 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
8336 VecTy, TTI::CastContextHint::None, CostKind);
8337 }
8338 return TTI::TCC_Free;
8339 };
8340 if (!V1 && !V2 && !P2.isNull()) {
8341 // Shuffle 2 entry nodes.
8342 const TreeEntry *E = P1.get<const TreeEntry *>();
8343 unsigned VF = E->getVectorFactor();
8344 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8345 CommonVF = std::max(VF, E2->getVectorFactor());
8346 assert(all_of(Mask,
8347 [=](int Idx) {
8348 return Idx < 2 * static_cast<int>(CommonVF);
8349 }) &&
8350 "All elements in mask must be less than 2 * CommonVF.");
8351 if (E->Scalars.size() == E2->Scalars.size()) {
8352 SmallVector<int> EMask = E->getCommonMask();
8353 SmallVector<int> E2Mask = E2->getCommonMask();
8354 if (!EMask.empty() || !E2Mask.empty()) {
8355 for (int &Idx : CommonMask) {
8356 if (Idx == PoisonMaskElem)
8357 continue;
8358 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8359 Idx = EMask[Idx];
8360 else if (Idx >= static_cast<int>(CommonVF))
8361 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8362 E->Scalars.size();
8363 }
8364 }
8365 CommonVF = E->Scalars.size();
8366 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8367 GetNodeMinBWAffectedCost(*E2, CommonVF);
8368 } else {
8369 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8370 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8371 }
8372 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8373 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8374 } else if (!V1 && P2.isNull()) {
8375 // Shuffle single entry node.
8376 const TreeEntry *E = P1.get<const TreeEntry *>();
8377 unsigned VF = E->getVectorFactor();
8378 CommonVF = VF;
8379 assert(
8380 all_of(Mask,
8381 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8382 "All elements in mask must be less than CommonVF.");
8383 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8384 SmallVector<int> EMask = E->getCommonMask();
8385 assert(!EMask.empty() && "Expected non-empty common mask.");
8386 for (int &Idx : CommonMask) {
8387 if (Idx != PoisonMaskElem)
8388 Idx = EMask[Idx];
8389 }
8390 CommonVF = E->Scalars.size();
8391 }
8392 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8393 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8394 // Not identity/broadcast? Try to see if the original vector is better.
8395 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8396 CommonVF == CommonMask.size() &&
8397 any_of(enumerate(CommonMask),
8398 [](const auto &&P) {
8399 return P.value() != PoisonMaskElem &&
8400 static_cast<unsigned>(P.value()) != P.index();
8401 }) &&
8402 any_of(CommonMask,
8403 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8404 SmallVector<int> ReorderMask;
8405 inversePermutation(E->ReorderIndices, ReorderMask);
8406 ::addMask(CommonMask, ReorderMask);
8407 }
8408 } else if (V1 && P2.isNull()) {
8409 // Shuffle single vector.
8410 ExtraCost += GetValueMinBWAffectedCost(V1);
8411 CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
8412 assert(
8413 all_of(Mask,
8414 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8415 "All elements in mask must be less than CommonVF.");
8416 } else if (V1 && !V2) {
8417 // Shuffle vector and tree node.
8418 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8419 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8420 CommonVF = std::max(VF, E2->getVectorFactor());
8421 assert(all_of(Mask,
8422 [=](int Idx) {
8423 return Idx < 2 * static_cast<int>(CommonVF);
8424 }) &&
8425 "All elements in mask must be less than 2 * CommonVF.");
8426 if (E2->Scalars.size() == VF && VF != CommonVF) {
8427 SmallVector<int> E2Mask = E2->getCommonMask();
8428 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8429 for (int &Idx : CommonMask) {
8430 if (Idx == PoisonMaskElem)
8431 continue;
8432 if (Idx >= static_cast<int>(CommonVF))
8433 Idx = E2Mask[Idx - CommonVF] + VF;
8434 }
8435 CommonVF = VF;
8436 }
8437 ExtraCost += GetValueMinBWAffectedCost(V1);
8438 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8439 ExtraCost += GetNodeMinBWAffectedCost(
8440 *E2, std::min(CommonVF, E2->getVectorFactor()));
8441 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8442 } else if (!V1 && V2) {
8443 // Shuffle vector and tree node.
8444 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8445 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8446 CommonVF = std::max(VF, E1->getVectorFactor());
8447 assert(all_of(Mask,
8448 [=](int Idx) {
8449 return Idx < 2 * static_cast<int>(CommonVF);
8450 }) &&
8451 "All elements in mask must be less than 2 * CommonVF.");
8452 if (E1->Scalars.size() == VF && VF != CommonVF) {
8453 SmallVector<int> E1Mask = E1->getCommonMask();
8454 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8455 for (int &Idx : CommonMask) {
8456 if (Idx == PoisonMaskElem)
8457 continue;
8458 if (Idx >= static_cast<int>(CommonVF))
8459 Idx = E1Mask[Idx - CommonVF] + VF;
8460 else
8461 Idx = E1Mask[Idx];
8462 }
8463 CommonVF = VF;
8464 }
8465 ExtraCost += GetNodeMinBWAffectedCost(
8466 *E1, std::min(CommonVF, E1->getVectorFactor()));
8467 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8468 ExtraCost += GetValueMinBWAffectedCost(V2);
8469 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8470 } else {
8471 assert(V1 && V2 && "Expected both vectors.");
8472 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8473 CommonVF =
8474 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8475 assert(all_of(Mask,
8476 [=](int Idx) {
8477 return Idx < 2 * static_cast<int>(CommonVF);
8478 }) &&
8479 "All elements in mask must be less than 2 * CommonVF.");
8480 ExtraCost +=
8481 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8482 if (V1->getType() != V2->getType()) {
8483 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8484 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8485 } else {
8486 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
8487 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8488 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
8489 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8490 }
8491 }
8492 InVectors.front() = Constant::getNullValue(
8493 FixedVectorType::get(ScalarTy, CommonMask.size()));
8494 if (InVectors.size() == 2)
8495 InVectors.pop_back();
8496 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8497 V1, V2, CommonMask, Builder);
8498 }
8499
8500public:
8501 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
8502 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8503 SmallPtrSetImpl<Value *> &CheckedExtracts)
8504 : ScalarTy(ScalarTy), TTI(TTI),
8505 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8506 CheckedExtracts(CheckedExtracts) {}
8507 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8508 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8509 unsigned NumParts, bool &UseVecBaseAsInput) {
8510 UseVecBaseAsInput = false;
8511 if (Mask.empty())
8512 return nullptr;
8513 Value *VecBase = nullptr;
8514 ArrayRef<Value *> VL = E->Scalars;
8515 // If the resulting type is scalarized, do not adjust the cost.
8516 if (NumParts == VL.size())
8517 return nullptr;
8518 // Check if it can be considered reused if same extractelements were
8519 // vectorized already.
8520 bool PrevNodeFound = any_of(
8521 ArrayRef(R.VectorizableTree).take_front(E->Idx),
8522 [&](const std::unique_ptr<TreeEntry> &TE) {
8523 return ((!TE->isAltShuffle() &&
8524 TE->getOpcode() == Instruction::ExtractElement) ||
8525 TE->State == TreeEntry::NeedToGather) &&
8526 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8527 return VL.size() > Data.index() &&
8528 (Mask[Data.index()] == PoisonMaskElem ||
8529 isa<UndefValue>(VL[Data.index()]) ||
8530 Data.value() == VL[Data.index()]);
8531 });
8532 });
8533 SmallPtrSet<Value *, 4> UniqueBases;
8534 unsigned SliceSize = VL.size() / NumParts;
8535 for (unsigned Part = 0; Part < NumParts; ++Part) {
8536 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8537 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
8538 // Ignore non-extractelement scalars.
8539 if (isa<UndefValue>(V) ||
8540 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8541 continue;
8542 // If all users of instruction are going to be vectorized and this
8543 // instruction itself is not going to be vectorized, consider this
8544 // instruction as dead and remove its cost from the final cost of the
8545 // vectorized tree.
8546 // Also, avoid adjusting the cost for extractelements with multiple uses
8547 // in different graph entries.
8548 auto *EE = cast<ExtractElementInst>(V);
8549 VecBase = EE->getVectorOperand();
8550 UniqueBases.insert(VecBase);
8551 const TreeEntry *VE = R.getTreeEntry(V);
8552 if (!CheckedExtracts.insert(V).second ||
8553 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8554 any_of(EE->users(),
8555 [&](User *U) {
8556 return isa<GetElementPtrInst>(U) &&
8557 !R.areAllUsersVectorized(cast<Instruction>(U),
8558 &VectorizedVals);
8559 }) ||
8560 (VE && VE != E))
8561 continue;
8562 std::optional<unsigned> EEIdx = getExtractIndex(EE);
8563 if (!EEIdx)
8564 continue;
8565 unsigned Idx = *EEIdx;
8566 // Take credit for instruction that will become dead.
8567 if (EE->hasOneUse() || !PrevNodeFound) {
8568 Instruction *Ext = EE->user_back();
8569 if (isa<SExtInst, ZExtInst>(Ext) &&
8570 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8571 // Use getExtractWithExtendCost() to calculate the cost of
8572 // extractelement/ext pair.
8573 Cost -=
8574 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
8575 EE->getVectorOperandType(), Idx);
8576 // Add back the cost of s|zext which is subtracted separately.
8577 Cost += TTI.getCastInstrCost(
8578 Ext->getOpcode(), Ext->getType(), EE->getType(),
8579 TTI::getCastContextHint(Ext), CostKind, Ext);
8580 continue;
8581 }
8582 }
8583 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
8584 CostKind, Idx);
8585 }
8586 }
8587 // Check that the gather of extractelements can be represented as just a
8588 // shuffle of a single vector or of the two vectors the scalars are extracted from.
8589 // We found a bunch of extractelement instructions that must be gathered into
8590 // a vector and can be represented as a permutation of elements of a single
8591 // input vector or of 2 input vectors.
8592 // The cost is not added again if the same extractelements were vectorized already.
8593 if (!PrevNodeFound)
8594 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8595 InVectors.assign(1, E);
8596 CommonMask.assign(Mask.begin(), Mask.end());
8597 transformMaskAfterShuffle(CommonMask, CommonMask);
8598 SameNodesEstimated = false;
8599 if (NumParts != 1 && UniqueBases.size() != 1) {
8600 UseVecBaseAsInput = true;
8601 VecBase = Constant::getNullValue(
8602 FixedVectorType::get(ScalarTy, CommonMask.size()));
8603 }
8604 return VecBase;
8605 }
8606 /// Checks if the specified entry \p E needs to be delayed because of its
8607 /// dependency nodes.
8608 std::optional<InstructionCost>
8609 needToDelay(const TreeEntry *,
8610 ArrayRef<SmallVector<const TreeEntry *>>) const {
8611 // No need to delay the cost estimation during analysis.
8612 return std::nullopt;
8613 }
8614 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
8615 if (&E1 == &E2) {
8616 assert(all_of(Mask,
8617 [&](int Idx) {
8618 return Idx < static_cast<int>(E1.getVectorFactor());
8619 }) &&
8620 "Expected single vector shuffle mask.");
8621 add(E1, Mask);
8622 return;
8623 }
8624 if (InVectors.empty()) {
8625 CommonMask.assign(Mask.begin(), Mask.end());
8626 InVectors.assign({&E1, &E2});
8627 return;
8628 }
8629 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8630 auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
8631 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8632 if (NumParts == 0 || NumParts >= Mask.size())
8633 NumParts = 1;
8634 unsigned SliceSize = Mask.size() / NumParts;
8635 const auto *It =
8636 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8637 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8638 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8639 }
8640 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
8641 if (InVectors.empty()) {
8642 CommonMask.assign(Mask.begin(), Mask.end());
8643 InVectors.assign(1, &E1);
8644 return;
8645 }
8646 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8647 auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
8648 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8649 if (NumParts == 0 || NumParts >= Mask.size())
8650 NumParts = 1;
8651 unsigned SliceSize = Mask.size() / NumParts;
8652 const auto *It =
8653 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8654 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8655 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
8656 if (!SameNodesEstimated && InVectors.size() == 1)
8657 InVectors.emplace_back(&E1);
8658 }
8659 /// Adds 2 input vectors and the mask for their shuffling.
8660 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
8661 // May come only for shuffling of 2 vectors with extractelements, already
8662 // handled in adjustExtracts.
8663 assert(InVectors.size() == 1 &&
8664 all_of(enumerate(CommonMask),
8665 [&](auto P) {
8666 if (P.value() == PoisonMaskElem)
8667 return Mask[P.index()] == PoisonMaskElem;
8668 auto *EI =
8669 cast<ExtractElementInst>(InVectors.front()
8670 .get<const TreeEntry *>()
8671 ->Scalars[P.index()]);
8672 return EI->getVectorOperand() == V1 ||
8673 EI->getVectorOperand() == V2;
8674 }) &&
8675 "Expected extractelement vectors.");
8676 }
8677 /// Adds another one input vector and the mask for the shuffling.
8678 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
8679 if (InVectors.empty()) {
8680 assert(CommonMask.empty() && !ForExtracts &&
8681 "Expected empty input mask/vectors.");
8682 CommonMask.assign(Mask.begin(), Mask.end());
8683 InVectors.assign(1, V1);
8684 return;
8685 }
8686 if (ForExtracts) {
8687 // No need to add vectors here, already handled them in adjustExtracts.
8688 assert(InVectors.size() == 1 &&
8689 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
8690 all_of(enumerate(CommonMask),
8691 [&](auto P) {
8692 Value *Scalar = InVectors.front()
8693 .get<const TreeEntry *>()
8694 ->Scalars[P.index()];
8695 if (P.value() == PoisonMaskElem)
8696 return P.value() == Mask[P.index()] ||
8697 isa<UndefValue>(Scalar);
8698 if (isa<Constant>(V1))
8699 return true;
8700 auto *EI = cast<ExtractElementInst>(Scalar);
8701 return EI->getVectorOperand() == V1;
8702 }) &&
8703 "Expected only tree entry for extractelement vectors.");
8704 return;
8705 }
8706 assert(!InVectors.empty() && !CommonMask.empty() &&
8707 "Expected only tree entries from extracts/reused buildvectors.");
8708 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8709 if (InVectors.size() == 2) {
8710 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
8711 transformMaskAfterShuffle(CommonMask, CommonMask);
8712 VF = std::max<unsigned>(VF, CommonMask.size());
8713 } else if (const auto *InTE =
8714 InVectors.front().dyn_cast<const TreeEntry *>()) {
8715 VF = std::max(VF, InTE->getVectorFactor());
8716 } else {
8717 VF = std::max(
8718 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
8719 ->getNumElements());
8720 }
8721 InVectors.push_back(V1);
8722 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8723 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8724 CommonMask[Idx] = Mask[Idx] + VF;
8725 }
8726 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
8727 Value *Root = nullptr) {
8728 Cost += getBuildVectorCost(VL, Root);
8729 if (!Root) {
8730 // FIXME: Need to find a way to avoid use of getNullValue here.
8731 SmallVector<Constant *> Vals;
8732 unsigned VF = VL.size();
8733 if (MaskVF != 0)
8734 VF = std::min(VF, MaskVF);
8735 for (Value *V : VL.take_front(VF)) {
8736 if (isa<UndefValue>(V)) {
8737 Vals.push_back(cast<Constant>(V));
8738 continue;
8739 }
8740 Vals.push_back(Constant::getNullValue(V->getType()));
8741 }
8742 return ConstantVector::get(Vals);
8743 }
8744 return ConstantVector::getSplat(
8745 ElementCount::getFixed(
8746 cast<FixedVectorType>(Root->getType())->getNumElements()),
8747 getAllOnesValue(*R.DL, ScalarTy));
8748 }
8749 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
8750 /// Finalize emission of the shuffles.
8751 InstructionCost
8752 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
8753 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
8754 IsFinalized = true;
8755 if (Action) {
8756 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
8757 if (InVectors.size() == 2)
8758 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
8759 else
8760 Cost += createShuffle(Vec, nullptr, CommonMask);
8761 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8762 if (CommonMask[Idx] != PoisonMaskElem)
8763 CommonMask[Idx] = Idx;
8764 assert(VF > 0 &&
8765 "Expected vector length for the final value before action.");
8766 Value *V = Vec.get<Value *>();
8767 Action(V, CommonMask);
8768 InVectors.front() = V;
8769 }
8770 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
8771 if (CommonMask.empty()) {
8772 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
8773 return Cost;
8774 }
8775 return Cost +
8776 createShuffle(InVectors.front(),
8777 InVectors.size() == 2 ? InVectors.back() : nullptr,
8778 CommonMask);
8779 }
8780
8781 ~ShuffleCostEstimator() {
8782 assert((IsFinalized || CommonMask.empty()) &&
8783 "Shuffle construction must be finalized.");
8784 }
8785};
8786
8787const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
8788 unsigned Idx) const {
8789 Value *Op = E->getOperand(Idx).front();
8790 if (const TreeEntry *TE = getTreeEntry(Op)) {
8791 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8792 return EI.EdgeIdx == Idx && EI.UserTE == E;
8793 }) != TE->UserTreeIndices.end())
8794 return TE;
8795 auto MIt = MultiNodeScalars.find(Op);
8796 if (MIt != MultiNodeScalars.end()) {
8797 for (const TreeEntry *TE : MIt->second) {
8798 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8799 return EI.EdgeIdx == Idx && EI.UserTE == E;
8800 }) != TE->UserTreeIndices.end())
8801 return TE;
8802 }
8803 }
8804 }
8805 const auto *It =
8806 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8807 return TE->State == TreeEntry::NeedToGather &&
8808 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8809 return EI.EdgeIdx == Idx && EI.UserTE == E;
8810 }) != TE->UserTreeIndices.end();
8811 });
8812 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
8813 return It->get();
8814}
8815
8816TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
8817 if (TE.State == TreeEntry::ScatterVectorize ||
8818 TE.State == TreeEntry::StridedVectorize)
8819 return TTI::CastContextHint::GatherScatter;
8820 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
8821 !TE.isAltShuffle()) {
8822 if (TE.ReorderIndices.empty())
8823 return TTI::CastContextHint::Normal;
8824 SmallVector<int> Mask;
8825 inversePermutation(TE.ReorderIndices, Mask);
8826 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
8827 return TTI::CastContextHint::Reversed;
8828 }
8829 return TTI::CastContextHint::None;
8830}
8831
8832/// Builds the arguments types vector for the given call instruction with the
8833/// given \p ID for the specified vector factor.
8834static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
8835 const Intrinsic::ID ID,
8836 const unsigned VF,
8837 unsigned MinBW) {
8838 SmallVector<Type *> ArgTys;
8839 for (auto [Idx, Arg] : enumerate(CI->args())) {
8840 if (ID != Intrinsic::not_intrinsic) {
8841 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
8842 ArgTys.push_back(Arg->getType());
8843 continue;
8844 }
8845 if (MinBW > 0) {
8846 ArgTys.push_back(FixedVectorType::get(
8847 IntegerType::get(CI->getContext(), MinBW), VF));
8848 continue;
8849 }
8850 }
8851 ArgTys.push_back(FixedVectorType::get(Arg->getType(), VF));
8852 }
8853 return ArgTys;
8854}
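// For illustration: for a call to llvm.smax.i32 with VF = 4 this produces
// {<4 x i32>, <4 x i32>}, or {<4 x i16>, <4 x i16>} if MinBW = 16, while an
// operand that the intrinsic requires to stay scalar (e.g. the exponent of
// llvm.powi) keeps its original scalar type.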
8855
8856InstructionCost
8857BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
8858 SmallPtrSetImpl<Value *> &CheckedExtracts) {
8859 ArrayRef<Value *> VL = E->Scalars;
8860
8861 Type *ScalarTy = VL[0]->getType();
8862 if (E->State != TreeEntry::NeedToGather) {
8863 if (auto *SI = dyn_cast<StoreInst>(VL[0]))
8864 ScalarTy = SI->getValueOperand()->getType();
8865 else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
8866 ScalarTy = CI->getOperand(0)->getType();
8867 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
8868 ScalarTy = IE->getOperand(1)->getType();
8869 }
8870 if (!isValidElementType(ScalarTy))
8871 return InstructionCost::getInvalid();
8872 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
8873 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8874
8875 // If we have computed a smaller type for the expression, update VecTy so
8876 // that the costs will be accurate.
8877 auto It = MinBWs.find(E);
8878 Type *OrigScalarTy = ScalarTy;
8879 if (It != MinBWs.end()) {
8880 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
8881 VecTy = FixedVectorType::get(ScalarTy, VL.size());
8882 }
8883 unsigned EntryVF = E->getVectorFactor();
8884 auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
8885
8886 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
8887 if (E->State == TreeEntry::NeedToGather) {
8888 if (allConstant(VL))
8889 return 0;
8890 if (isa<InsertElementInst>(VL[0]))
8891 return InstructionCost::getInvalid();
8892 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
8893 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
8894 }
8895 InstructionCost CommonCost = 0;
8896 SmallVector<int> Mask;
8897 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
8898 if (!E->ReorderIndices.empty() &&
8899 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
8900 SmallVector<int> NewMask;
8901 if (E->getOpcode() == Instruction::Store) {
8902 // For stores the order is actually a mask.
8903 NewMask.resize(E->ReorderIndices.size());
8904 copy(E->ReorderIndices, NewMask.begin());
8905 } else {
8906 inversePermutation(E->ReorderIndices, NewMask);
8907 }
8908 ::addMask(Mask, NewMask);
8909 }
8910 if (NeedToShuffleReuses)
8911 ::addMask(Mask, E->ReuseShuffleIndices);
8912 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
8913 CommonCost =
8914 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
8915 assert((E->State == TreeEntry::Vectorize ||
8916 E->State == TreeEntry::ScatterVectorize ||
8917 E->State == TreeEntry::StridedVectorize) &&
8918 "Unhandled state");
8919 assert(E->getOpcode() &&
8920 ((allSameType(VL) && allSameBlock(VL)) ||
8921 (E->getOpcode() == Instruction::GetElementPtr &&
8922 E->getMainOp()->getType()->isPointerTy())) &&
8923 "Invalid VL");
8924 Instruction *VL0 = E->getMainOp();
8925 unsigned ShuffleOrOp =
8926 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
8927 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
8928 const unsigned Sz = UniqueValues.size();
8929 SmallBitVector UsedScalars(Sz, false);
8930 for (unsigned I = 0; I < Sz; ++I) {
8931 if (getTreeEntry(UniqueValues[I]) == E)
8932 continue;
8933 UsedScalars.set(I);
8934 }
8935 auto GetCastContextHint = [&](Value *V) {
8936 if (const TreeEntry *OpTE = getTreeEntry(V))
8937 return getCastContextHint(*OpTE);
8938 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
8939 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
8940 return TTI::CastContextHint::GatherScatter;
8941 return TTI::CastContextHint::None;
8942 };
8943 auto GetCostDiff =
8944 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
8945 function_ref<InstructionCost(InstructionCost)> VectorCost) {
8946 // Calculate the cost of this instruction.
8947 InstructionCost ScalarCost = 0;
8948 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
8949 // For some of the instructions there is no need to calculate the cost for
8950 // each particular instruction; we can use the cost of a single
8951 // instruction multiplied by the total number of scalar instructions.
8952 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
8953 } else {
8954 for (unsigned I = 0; I < Sz; ++I) {
8955 if (UsedScalars.test(I))
8956 continue;
8957 ScalarCost += ScalarEltCost(I);
8958 }
8959 }
8960
8961 InstructionCost VecCost = VectorCost(CommonCost);
8962 // Check if the current node must be resized, if the parent node is not
8963 // resized.
8964 if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
8965 const EdgeInfo &EI = E->UserTreeIndices.front();
8966 if ((EI.UserTE->getOpcode() != Instruction::Select ||
8967 EI.EdgeIdx != 0) &&
8968 It != MinBWs.end()) {
8969 auto UserBWIt = MinBWs.find(EI.UserTE);
8970 Type *UserScalarTy =
8971 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
8972 if (UserBWIt != MinBWs.end())
8973 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
8974 UserBWIt->second.first);
8975 if (ScalarTy != UserScalarTy) {
8976 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
8977 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
8978 unsigned VecOpcode;
8979 auto *UserVecTy =
8980 FixedVectorType::get(UserScalarTy, E->getVectorFactor());
8981 if (BWSz > SrcBWSz)
8982 VecOpcode = Instruction::Trunc;
8983 else
8984 VecOpcode =
8985 It->second.second ? Instruction::SExt : Instruction::ZExt;
8986 TTI::CastContextHint CCH = GetCastContextHint(VL0);
8987 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
8988 CostKind);
8989 }
8990 }
8991 }
8992 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
8993 ScalarCost, "Calculated costs for Tree"));
8994 return VecCost - ScalarCost;
8995 };
8996 // Calculate cost difference from vectorizing set of GEPs.
8997 // Negative value means vectorizing is profitable.
8998 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
8999 assert((E->State == TreeEntry::Vectorize ||
9000 E->State == TreeEntry::StridedVectorize) &&
9001 "Entry state expected to be Vectorize or StridedVectorize here.");
9002 InstructionCost ScalarCost = 0;
9003 InstructionCost VecCost = 0;
9004 std::tie(ScalarCost, VecCost) = getGEPCosts(
9005 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
9006 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9007 "Calculated GEPs cost for Tree"));
9008
9009 return VecCost - ScalarCost;
9010 };
9011
9012 switch (ShuffleOrOp) {
9013 case Instruction::PHI: {
9014 // Count reused scalars.
9015 InstructionCost ScalarCost = 0;
9016 SmallPtrSet<const TreeEntry *, 4> CountedOps;
9017 for (Value *V : UniqueValues) {
9018 auto *PHI = dyn_cast<PHINode>(V);
9019 if (!PHI)
9020 continue;
9021
9022 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
9023 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
9024 Value *Op = PHI->getIncomingValue(I);
9025 Operands[I] = Op;
9026 }
9027 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
9028 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
9029 if (!OpTE->ReuseShuffleIndices.empty())
9030 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9031 OpTE->Scalars.size());
9032 }
9033
9034 return CommonCost - ScalarCost;
9035 }
9036 case Instruction::ExtractValue:
9037 case Instruction::ExtractElement: {
9038 auto GetScalarCost = [&](unsigned Idx) {
9039 auto *I = cast<Instruction>(UniqueValues[Idx]);
9040 VectorType *SrcVecTy;
9041 if (ShuffleOrOp == Instruction::ExtractElement) {
9042 auto *EE = cast<ExtractElementInst>(I);
9043 SrcVecTy = EE->getVectorOperandType();
9044 } else {
9045 auto *EV = cast<ExtractValueInst>(I);
9046 Type *AggregateTy = EV->getAggregateOperand()->getType();
9047 unsigned NumElts;
9048 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9049 NumElts = ATy->getNumElements();
9050 else
9051 NumElts = AggregateTy->getStructNumElements();
9052 SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts);
9053 }
9054 if (I->hasOneUse()) {
9055 Instruction *Ext = I->user_back();
9056 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9057 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9058 // Use getExtractWithExtendCost() to calculate the cost of
9059 // extractelement/ext pair.
9061 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
9062 // Subtract the cost of s|zext which is subtracted separately.
9064 Ext->getOpcode(), Ext->getType(), I->getType(),
9066 return Cost;
9067 }
9068 }
9069 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
9071 };
9072 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9073 return GetCostDiff(GetScalarCost, GetVectorCost);
9074 }
9075 case Instruction::InsertElement: {
9076 assert(E->ReuseShuffleIndices.empty() &&
9077 "Unique insertelements only are expected.");
9078 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
9079 unsigned const NumElts = SrcVecTy->getNumElements();
9080 unsigned const NumScalars = VL.size();
9081
9082 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
9083
9084 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9085 unsigned OffsetBeg = *getInsertIndex(VL.front());
9086 unsigned OffsetEnd = OffsetBeg;
9087 InsertMask[OffsetBeg] = 0;
9088 for (auto [I, V] : enumerate(VL.drop_front())) {
9089 unsigned Idx = *getInsertIndex(V);
9090 if (OffsetBeg > Idx)
9091 OffsetBeg = Idx;
9092 else if (OffsetEnd < Idx)
9093 OffsetEnd = Idx;
9094 InsertMask[Idx] = I + 1;
9095 }
9096 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
9097 if (NumOfParts > 0)
9098 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9099 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9100 VecScalarsSz;
9101 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9102 unsigned InsertVecSz = std::min<unsigned>(
9103 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
9104 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9105 bool IsWholeSubvector =
9106 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9107 // Check if we can safely insert a subvector. If it is not possible, just
9108 // generate a whole-sized vector and shuffle the source vector and the new
9109 // subvector.
9110 if (OffsetBeg + InsertVecSz > VecSz) {
9111 // Align OffsetBeg to generate correct mask.
9112 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
9113 InsertVecSz = VecSz;
9114 }
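// Editorial worked example (not in the original source), assuming NumElts == 8
// and NumOfParts == 2, with inserts at indices 2..5: VecScalarsSz becomes
// PowerOf2Ceil((8 + 2 - 1) / 2) == 4, OffsetBeg == 2 and OffsetEnd == 5, so
// VecSz == (1 + 5/4 - 2/4) * 4 == 8, Offset == 0 and
// InsertVecSz == min(PowerOf2Ceil(4), ((5 - 2 + 4) / 4) * 4) == 4. Since
// OffsetBeg + InsertVecSz == 6 <= VecSz, no realignment of OffsetBeg is needed.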
9115
9116 APInt DemandedElts = APInt::getZero(NumElts);
9117 // TODO: Add support for Instruction::InsertValue.
9119 if (!E->ReorderIndices.empty()) {
9120 inversePermutation(E->ReorderIndices, Mask);
9121 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
9122 } else {
9123 Mask.assign(VecSz, PoisonMaskElem);
9124 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9125 }
9126 bool IsIdentity = true;
9127 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9128 Mask.swap(PrevMask);
9129 for (unsigned I = 0; I < NumScalars; ++I) {
9130 unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
9131 DemandedElts.setBit(InsertIdx);
9132 IsIdentity &= InsertIdx - OffsetBeg == I;
9133 Mask[InsertIdx - OffsetBeg] = I;
9134 }
9135 assert(Offset < NumElts && "Failed to find vector index offset");
9136
9138 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
9139 /*Insert*/ true, /*Extract*/ false,
9140 CostKind);
9141
9142 // First cost - resize to actual vector size if not identity shuffle or
9143 // need to shift the vector.
9144 // Do not calculate the cost if the actual size is the register size and
9145 // we can merge this shuffle with the following SK_Select.
9146 auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz);
9147 if (!IsIdentity)
9149 InsertVecTy, Mask);
9150 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9151 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9152 }));
9153 // Second cost - permutation with subvector, if some elements are from the
9154 // initial vector or inserting a subvector.
9155 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9156 // subvector of ActualVecTy.
9157 SmallBitVector InMask =
9158 isUndefVector(FirstInsert->getOperand(0),
9159 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9160 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9161 if (InsertVecSz != VecSz) {
9162 auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz);
9164 std::nullopt, CostKind, OffsetBeg - Offset,
9165 InsertVecTy);
9166 } else {
9167 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9168 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
9169 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9170 I <= End; ++I)
9171 if (Mask[I] != PoisonMaskElem)
9172 Mask[I] = I + VecSz;
9173 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9174 Mask[I] =
9175 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
9176 Cost +=
9177 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
9178 }
9179 }
9180 return Cost;
9181 }
9182 case Instruction::ZExt:
9183 case Instruction::SExt:
9184 case Instruction::FPToUI:
9185 case Instruction::FPToSI:
9186 case Instruction::FPExt:
9187 case Instruction::PtrToInt:
9188 case Instruction::IntToPtr:
9189 case Instruction::SIToFP:
9190 case Instruction::UIToFP:
9191 case Instruction::Trunc:
9192 case Instruction::FPTrunc:
9193 case Instruction::BitCast: {
9194 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9195 Type *SrcScalarTy = VL0->getOperand(0)->getType();
9196 auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9197 unsigned Opcode = ShuffleOrOp;
9198 unsigned VecOpcode = Opcode;
9199 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9200 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9201 // Check if the values are candidates to demote.
9202 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
9203 if (SrcIt != MinBWs.end()) {
9204 SrcBWSz = SrcIt->second.first;
9205 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
9206 SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9207 }
9208 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9209 if (BWSz == SrcBWSz) {
9210 VecOpcode = Instruction::BitCast;
9211 } else if (BWSz < SrcBWSz) {
9212 VecOpcode = Instruction::Trunc;
9213 } else if (It != MinBWs.end()) {
9214 assert(BWSz > SrcBWSz && "Invalid cast!");
9215 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9216 } else if (SrcIt != MinBWs.end()) {
9217 assert(BWSz > SrcBWSz && "Invalid cast!");
9218 VecOpcode =
9219 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9220 }
9221 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9222 !SrcIt->second.second) {
9223 VecOpcode = Instruction::UIToFP;
9224 }
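// Editorial note (illustrative, not in the original source): if, for example,
// the scalar cast is zext i8 -> i32 but MinBWs demoted this node to i16, then
// BWSz(16) > SrcBWSz(8) and the vector op stays an extension (SExt or ZExt per
// the recorded signedness); if both sides were demoted to i8, BWSz == SrcBWSz
// and the vector cast degenerates to a BitCast, treated as a free noop below.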
9225 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9226 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9227 return TTI->getCastInstrCost(Opcode, VL0->getType(),
9228 VL0->getOperand(0)->getType(),
9230 };
9231 auto GetVectorCost = [=](InstructionCost CommonCost) {
9232 // Do not count cost here if minimum bitwidth is in effect and it is just
9233 // a bitcast (here it is just a noop).
9234 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9235 return CommonCost;
9236 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9237 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
9238 return CommonCost +
9239 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
9240 VecOpcode == Opcode ? VI : nullptr);
9241 };
9242 return GetCostDiff(GetScalarCost, GetVectorCost);
9243 }
9244 case Instruction::FCmp:
9245 case Instruction::ICmp:
9246 case Instruction::Select: {
9247 CmpInst::Predicate VecPred, SwappedVecPred;
9248 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
9249 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
9250 match(VL0, MatchCmp))
9251 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
9252 else
9253 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9256 auto GetScalarCost = [&](unsigned Idx) {
9257 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9258 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9261 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
9262 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
9263 !match(VI, MatchCmp)) ||
9264 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9265 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9268
9269 return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy,
9270 Builder.getInt1Ty(), CurrentPred, CostKind,
9271 VI);
9272 };
9273 auto GetVectorCost = [&](InstructionCost CommonCost) {
9274 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9275
9277 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9278 // Check if it is possible and profitable to use min/max for selects
9279 // in VL.
9280 //
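// Editorial example (illustrative): a scalar pattern such as
//   %c = icmp slt i32 %a, %b
//   %s = select i1 %c, i32 %a, i32 %b
// may be cheaper as the llvm.smin intrinsic on the whole vector, so the
// intrinsic cost is compared against the cmp+select cost below.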
9281 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
9282 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
9283 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
9284 {VecTy, VecTy});
9285 InstructionCost IntrinsicCost =
9286 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9287 // If the selects are the only uses of the compares, they will be
9288 // dead and we can adjust the cost by removing their cost.
9289 if (IntrinsicAndUse.second)
9290 IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
9291 MaskTy, VecPred, CostKind);
9292 VecCost = std::min(VecCost, IntrinsicCost);
9293 }
9294 return VecCost + CommonCost;
9295 };
9296 return GetCostDiff(GetScalarCost, GetVectorCost);
9297 }
9298 case Instruction::FNeg:
9299 case Instruction::Add:
9300 case Instruction::FAdd:
9301 case Instruction::Sub:
9302 case Instruction::FSub:
9303 case Instruction::Mul:
9304 case Instruction::FMul:
9305 case Instruction::UDiv:
9306 case Instruction::SDiv:
9307 case Instruction::FDiv:
9308 case Instruction::URem:
9309 case Instruction::SRem:
9310 case Instruction::FRem:
9311 case Instruction::Shl:
9312 case Instruction::LShr:
9313 case Instruction::AShr:
9314 case Instruction::And:
9315 case Instruction::Or:
9316 case Instruction::Xor: {
9317 auto GetScalarCost = [&](unsigned Idx) {
9318 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9319 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9320 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
9321 TTI::OperandValueInfo Op2Info =
9322 TTI::getOperandInfo(VI->getOperand(OpIdx));
9323 SmallVector<const Value *> Operands(VI->operand_values());
9324 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
9325 Op1Info, Op2Info, Operands, VI);
9326 };
9327 auto GetVectorCost = [=](InstructionCost CommonCost) {
9328 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9329 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
9330 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
9331 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
9332 Op2Info, std::nullopt, nullptr, TLI) +
9333 CommonCost;
9334 };
9335 return GetCostDiff(GetScalarCost, GetVectorCost);
9336 }
9337 case Instruction::GetElementPtr: {
9338 return CommonCost + GetGEPCostDiff(VL, VL0);
9339 }
9340 case Instruction::Load: {
9341 auto GetScalarCost = [&](unsigned Idx) {
9342 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
9343 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
9344 VI->getAlign(), VI->getPointerAddressSpace(),
9346 };
9347 auto *LI0 = cast<LoadInst>(VL0);
9348 auto GetVectorCost = [&](InstructionCost CommonCost) {
9349 InstructionCost VecLdCost;
9350 if (E->State == TreeEntry::Vectorize) {
9351 VecLdCost = TTI->getMemoryOpCost(
9352 Instruction::Load, VecTy, LI0->getAlign(),
9353 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
9354 } else if (E->State == TreeEntry::StridedVectorize) {
9355 Align CommonAlignment =
9356 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9357 VecLdCost = TTI->getStridedMemoryOpCost(
9358 Instruction::Load, VecTy, LI0->getPointerOperand(),
9359 /*VariableMask=*/false, CommonAlignment, CostKind);
9360 } else {
9361 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9362 Align CommonAlignment =
9363 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9364 VecLdCost = TTI->getGatherScatterOpCost(
9365 Instruction::Load, VecTy, LI0->getPointerOperand(),
9366 /*VariableMask=*/false, CommonAlignment, CostKind);
9367 }
9368 return VecLdCost + CommonCost;
9369 };
9370
9371 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9372 // If this node generates masked gather load then it is not a terminal node.
9373 // Hence address operand cost is estimated separately.
9374 if (E->State == TreeEntry::ScatterVectorize)
9375 return Cost;
9376
9377 // Estimate cost of GEPs since this tree node is a terminator.
9378 SmallVector<Value *> PointerOps(VL.size());
9379 for (auto [I, V] : enumerate(VL))
9380 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
9381 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9382 }
9383 case Instruction::Store: {
9384 bool IsReorder = !E->ReorderIndices.empty();
9385 auto GetScalarCost = [=](unsigned Idx) {
9386 auto *VI = cast<StoreInst>(VL[Idx]);
9387 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
9388 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
9389 VI->getAlign(), VI->getPointerAddressSpace(),
9390 CostKind, OpInfo, VI);
9391 };
9392 auto *BaseSI =
9393 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9394 auto GetVectorCost = [=](InstructionCost CommonCost) {
9395 // We know that we can merge the stores. Calculate the cost.
9396 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
9397 return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
9398 BaseSI->getPointerAddressSpace(), CostKind,
9399 OpInfo) +
9400 CommonCost;
9401 };
9402 SmallVector<Value *> PointerOps(VL.size());
9403 for (auto [I, V] : enumerate(VL)) {
9404 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9405 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
9406 }
9407
9408 return GetCostDiff(GetScalarCost, GetVectorCost) +
9409 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9410 }
9411 case Instruction::Call: {
9412 auto GetScalarCost = [&](unsigned Idx) {
9413 auto *CI = cast<CallInst>(UniqueValues[Idx]);
9416 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9417 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9418 }
9421 CI->getFunctionType()->params(), CostKind);
9422 };
9423 auto GetVectorCost = [=](InstructionCost CommonCost) {
9424 auto *CI = cast<CallInst>(VL0);
9426 SmallVector<Type *> ArgTys =
9428 It != MinBWs.end() ? It->second.first : 0);
9429 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9430 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9431 };
9432 return GetCostDiff(GetScalarCost, GetVectorCost);
9433 }
9434 case Instruction::ShuffleVector: {
9435 assert(E->isAltShuffle() &&
9436 ((Instruction::isBinaryOp(E->getOpcode()) &&
9437 Instruction::isBinaryOp(E->getAltOpcode())) ||
9438 (Instruction::isCast(E->getOpcode()) &&
9439 Instruction::isCast(E->getAltOpcode())) ||
9440 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9441 "Invalid Shuffle Vector Operand");
9442 // Try to find the previous shuffle node with the same operands and same
9443 // main/alternate ops.
9444 auto TryFindNodeWithEqualOperands = [=]() {
9445 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9446 if (TE.get() == E)
9447 break;
9448 if (TE->isAltShuffle() &&
9449 ((TE->getOpcode() == E->getOpcode() &&
9450 TE->getAltOpcode() == E->getAltOpcode()) ||
9451 (TE->getOpcode() == E->getAltOpcode() &&
9452 TE->getAltOpcode() == E->getOpcode())) &&
9453 TE->hasEqualOperands(*E))
9454 return true;
9455 }
9456 return false;
9457 };
9458 auto GetScalarCost = [&](unsigned Idx) {
9459 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9460 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9461 (void)E;
9462 return TTI->getInstructionCost(VI, CostKind);
9463 };
9464 // Need to clear CommonCost since the final shuffle cost is included into
9465 // vector cost.
9466 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9467 // VecCost is equal to sum of the cost of creating 2 vectors
9468 // and the cost of creating shuffle.
9469 InstructionCost VecCost = 0;
9470 if (TryFindNodeWithEqualOperands()) {
9471 LLVM_DEBUG({
9472 dbgs() << "SLP: diamond match for alternate node found.\n";
9473 E->dump();
9474 });
9475 // No need to add new vector costs here since we're going to reuse
9476 // same main/alternate vector ops, just do different shuffling.
9477 } else if (Instruction::isBinaryOp(E->getOpcode())) {
9478 VecCost =
9479 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
9480 VecCost +=
9481 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
9482 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9483 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9484 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9485 CI0->getPredicate(), CostKind, VL0);
9486 VecCost += TTIRef.getCmpSelInstrCost(
9487 E->getOpcode(), VecTy, MaskTy,
9488 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
9489 E->getAltOp());
9490 } else {
9491 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9492 auto *SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9493 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9494 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9495 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9496 unsigned SrcBWSz =
9497 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9498 if (SrcIt != MinBWs.end()) {
9499 SrcBWSz = SrcIt->second.first;
9500 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
9501 SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9502 }
9503 if (BWSz <= SrcBWSz) {
9504 if (BWSz < SrcBWSz)
9505 VecCost =
9506 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9508 LLVM_DEBUG({
9509 dbgs()
9510 << "SLP: alternate extension, which should be truncated.\n";
9511 E->dump();
9512 });
9513 return VecCost;
9514 }
9515 }
9516 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9518 VecCost +=
9519 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9521 }
9523 E->buildAltOpShuffleMask(
9524 [E](Instruction *I) {
9525 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9526 return I->getOpcode() == E->getAltOpcode();
9527 },
9528 Mask);
9530 FinalVecTy, Mask);
9531 // Patterns like [fadd,fsub] can be combined into a single instruction
9532 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9533 // need to take into account their order when looking for the most used
9534 // order.
9535 unsigned Opcode0 = E->getOpcode();
9536 unsigned Opcode1 = E->getAltOpcode();
9537 // The opcode mask selects between the two opcodes.
9538 SmallBitVector OpcodeMask(E->Scalars.size(), false);
9539 for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
9540 if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
9541 OpcodeMask.set(Lane);
9542 // If this pattern is supported by the target then we consider the
9543 // order.
9544 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9545 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9546 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9547 return AltVecCost < VecCost ? AltVecCost : VecCost;
9548 }
9549 // TODO: Check the reverse order too.
9550 return VecCost;
9551 };
9552 return GetCostDiff(GetScalarCost, GetVectorCost);
9553 }
9554 default:
9555 llvm_unreachable("Unknown instruction");
9556 }
9557}
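// Editorial sketch (not part of the original file): the kind of scalar pattern
// the Instruction::ShuffleVector (alternate-opcode) case above is costing.
// Interleaved add/sub lanes become one vector add, one vector sub and a blend,
// or a single addsub-style instruction where the target reports the alternate
// pattern as legal.
static void altOpcodeExample(float *Out, const float *A, const float *B) {
  Out[0] = A[0] + B[0];
  Out[1] = A[1] - B[1];
  Out[2] = A[2] + B[2];
  Out[3] = A[3] - B[3];
}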
9558
9559bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9560 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
9561 << VectorizableTree.size() << " is fully vectorizable.\n");
9562
9563 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
9565 return TE->State == TreeEntry::NeedToGather &&
9566 !any_of(TE->Scalars,
9567 [this](Value *V) { return EphValues.contains(V); }) &&
9568 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
9569 TE->Scalars.size() < Limit ||
9570 ((TE->getOpcode() == Instruction::ExtractElement ||
9571 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
9572 isFixedVectorShuffle(TE->Scalars, Mask)) ||
9573 (TE->State == TreeEntry::NeedToGather &&
9574 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
9575 };
9576
9577 // We only handle trees of heights 1 and 2.
9578 if (VectorizableTree.size() == 1 &&
9579 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9580 (ForReduction &&
9581 AreVectorizableGathers(VectorizableTree[0].get(),
9582 VectorizableTree[0]->Scalars.size()) &&
9583 VectorizableTree[0]->getVectorFactor() > 2)))
9584 return true;
9585
9586 if (VectorizableTree.size() != 2)
9587 return false;
9588
9589 // Handle splat and all-constants stores. Also try to vectorize tiny trees
9590 // whose second gather node has fewer scalar operands than the initial tree
9591 // element (it may be profitable to shuffle the second gather), or whose
9592 // scalars are extractelements that form a shuffle.
9594 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9595 AreVectorizableGathers(VectorizableTree[1].get(),
9596 VectorizableTree[0]->Scalars.size()))
9597 return true;
9598
9599 // Gathering cost would be too much for tiny trees.
9600 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9601 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9602 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9603 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9604 return false;
9605
9606 return true;
9607}
9608
9609static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
9611 bool MustMatchOrInst) {
9612 // Look past the root to find a source value. Arbitrarily follow the
9613 // path through operand 0 of any 'or'. Also, peek through optional
9614 // shift-left-by-multiple-of-8-bits.
9615 Value *ZextLoad = Root;
9616 const APInt *ShAmtC;
9617 bool FoundOr = false;
9618 while (!isa<ConstantExpr>(ZextLoad) &&
9619 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
9620 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
9621 ShAmtC->urem(8) == 0))) {
9622 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9623 ZextLoad = BinOp->getOperand(0);
9624 if (BinOp->getOpcode() == Instruction::Or)
9625 FoundOr = true;
9626 }
9627 // Check if the input is an extended load of the required or/shift expression.
9628 Value *Load;
9629 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9630 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
9631 return false;
9632
9633 // Require that the total load bit width is a legal integer type.
9634 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
9635 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
9636 Type *SrcTy = Load->getType();
9637 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
9638 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
9639 return false;
9640
9641 // Everything matched - assume that we can fold the whole sequence using
9642 // load combining.
9643 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
9644 << *(cast<Instruction>(Root)) << "\n");
9645
9646 return true;
9647}
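// Editorial sketch (not part of the original file): the source-level shape
// isLoadCombineCandidateImpl is looking for -- zext'ed byte loads shifted by
// multiples of 8 and 'or'ed together, which the backend can usually fold into
// a single wide load, making SLP vectorization of the bytes counterproductive.
static uint32_t loadCombineShapeExample(const uint8_t *P) {
  return (uint32_t)P[0] | ((uint32_t)P[1] << 8) | ((uint32_t)P[2] << 16) |
         ((uint32_t)P[3] << 24);
}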
9648
9650 if (RdxKind != RecurKind::Or)
9651 return false;
9652
9653 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9654 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9655 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
9656 /* MatchOr */ false);
9657}
9658
9660 // Peek through a final sequence of stores and check if all operations are
9661 // likely to be load-combined.
9662 unsigned NumElts = Stores.size();
9663 for (Value *Scalar : Stores) {
9664 Value *X;
9665 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
9666 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
9667 return false;
9668 }
9669 return true;
9670}
9671
9672bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
9673 // No need to vectorize inserts of gathered values.
9674 if (VectorizableTree.size() == 2 &&
9675 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9676 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9677 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9678 !(isSplat(VectorizableTree[1]->Scalars) ||
9679 allConstant(VectorizableTree[1]->Scalars))))
9680 return true;
9681
9682 // If the graph includes only PHI nodes and gathers, it is definitely not
9683 // profitable for vectorization and we can skip it, provided the cost
9684 // threshold is the default. The cost of vectorized PHI nodes is almost always
9685 // 0 plus the cost of gathers/buildvectors.
9686 constexpr int Limit = 4;
9687 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
9688 !VectorizableTree.empty() &&
9689 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9690 return (TE->State == TreeEntry::NeedToGather &&
9691 TE->getOpcode() != Instruction::ExtractElement &&
9692 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9693 TE->getOpcode() == Instruction::PHI;
9694 }))
9695 return true;
9696
9697 // We can vectorize the tree if its size is greater than or equal to the
9698 // minimum size specified by the MinTreeSize command line option.
9699 if (VectorizableTree.size() >= MinTreeSize)
9700 return false;
9701
9702 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
9703 // can vectorize it if we can prove it fully vectorizable.
9704 if (isFullyVectorizableTinyTree(ForReduction))
9705 return false;
9706
9707 // Check if any of the gather nodes forms an insertelement buildvector
9708 // somewhere.
9709 bool IsAllowedSingleBVNode =
9710 VectorizableTree.size() > 1 ||
9711 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9712 !VectorizableTree.front()->isAltShuffle() &&
9713 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9714 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9715 allSameBlock(VectorizableTree.front()->Scalars));
9716 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9717 return TE->State == TreeEntry::NeedToGather &&
9718 all_of(TE->Scalars, [&](Value *V) {
9719 return isa<ExtractElementInst, UndefValue>(V) ||
9720 (IsAllowedSingleBVNode &&
9721 !V->hasNUsesOrMore(UsesLimit) &&
9722 any_of(V->users(), IsaPred<InsertElementInst>));
9723 });
9724 }))
9725 return false;
9726
9727 assert(VectorizableTree.empty()
9728 ? ExternalUses.empty()
9729 : true && "We shouldn't have any external users");
9730
9731 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
9732 // vectorizable.
9733 return true;
9734}
9735
9737 // Walk from the bottom of the tree to the top, tracking which values are
9738 // live. When we see a call instruction that is not part of our tree,
9739 // query TTI to see if there is a cost to keeping values live over it
9740 // (for example, if spills and fills are required).
9741 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9743
9745 Instruction *PrevInst = nullptr;
9746
9747 // The entries in VectorizableTree are not necessarily ordered by their
9748 // position in basic blocks. Collect them and order them by dominance so later
9749 // instructions are guaranteed to be visited first. For instructions in
9750 // different basic blocks, we only scan to the beginning of the block, so
9751 // their order does not matter, as long as all instructions in a basic block
9752 // are grouped together. Using dominance ensures a deterministic order.
9753 SmallVector<Instruction *, 16> OrderedScalars;
9754 for (const auto &TEPtr : VectorizableTree) {
9755 if (TEPtr->State != TreeEntry::Vectorize)
9756 continue;
9757 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9758 if (!Inst)
9759 continue;
9760 OrderedScalars.push_back(Inst);
9761 }
9762 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
9763 auto *NodeA = DT->getNode(A->getParent());
9764 auto *NodeB = DT->getNode(B->getParent());
9765 assert(NodeA && "Should only process reachable instructions");
9766 assert(NodeB && "Should only process reachable instructions");
9767 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9768 "Different nodes should have different DFS numbers");
9769 if (NodeA != NodeB)
9770 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9771 return B->comesBefore(A);
9772 });
9773
9774 for (Instruction *Inst : OrderedScalars) {
9775 if (!PrevInst) {
9776 PrevInst = Inst;
9777 continue;
9778 }
9779
9780 // Update LiveValues.
9781 LiveValues.erase(PrevInst);
9782 for (auto &J : PrevInst->operands()) {
9783 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
9784 LiveValues.insert(cast<Instruction>(&*J));
9785 }
9786
9787 LLVM_DEBUG({
9788 dbgs() << "SLP: #LV: " << LiveValues.size();
9789 for (auto *X : LiveValues)
9790 dbgs() << " " << X->getName();
9791 dbgs() << ", Looking at ";
9792 Inst->dump();
9793 });
9794
9795 // Now find the sequence of instructions between PrevInst and Inst.
9796 unsigned NumCalls = 0;
9797 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
9798 PrevInstIt =
9799 PrevInst->getIterator().getReverse();
9800 while (InstIt != PrevInstIt) {
9801 if (PrevInstIt == PrevInst->getParent()->rend()) {
9802 PrevInstIt = Inst->getParent()->rbegin();
9803 continue;
9804 }
9805
9806 auto NoCallIntrinsic = [this](Instruction *I) {
9807 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
9808 if (II->isAssumeLikeIntrinsic())
9809 return true;
9810 FastMathFlags FMF;
9812 for (auto &ArgOp : II->args())
9813 Tys.push_back(ArgOp->getType());
9814 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
9815 FMF = FPMO->getFastMathFlags();
9816 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
9817 FMF);
9818 InstructionCost IntrCost =
9821 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
9822 if (IntrCost < CallCost)
9823 return true;
9824 }
9825 return false;
9826 };
9827
9828 // Debug information does not impact spill cost.
9829 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
9830 &*PrevInstIt != PrevInst)
9831 NumCalls++;
9832
9833 ++PrevInstIt;
9834 }
9835
9836 if (NumCalls) {
9838 for (auto *II : LiveValues) {
9839 auto *ScalarTy = II->getType();
9840 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
9841 ScalarTy = VectorTy->getElementType();
9842 V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
9843 }
9844 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
9845 }
9846
9847 PrevInst = Inst;
9848 }
9849
9850 return Cost;
9851}
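// Editorial sketch (not part of the original file): a shape where getSpillCost
// charges extra. The two multiplies can form a <2 x float> value, but the
// opaque call in between may force that vector to be spilled and reloaded, so
// TTI::getCostOfKeepingLiveOverCall is added once per such call.
// 'opaqueHelper' is a hypothetical external function used only for
// illustration.
extern void opaqueHelper();
static float spillShapeExample(const float *A, const float *B) {
  float X = A[0] * B[0];
  float Y = A[1] * B[1];
  opaqueHelper(); // call between the vectorizable defs and their uses
  return X + Y;
}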
9852
9853/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in the
9854/// buildvector sequence.
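/// For example, given, say, the buildvector chain
/// \code
///   %v0 = insertelement <4 x float> poison, float %a, i64 0
///   %v1 = insertelement <4 x float> %v0, float %b, i64 1
/// \endcode
/// isFirstInsertElement(%v0, %v1) returns true: walking the operand-0 chain
/// from %v1 reaches %v0, i.e. %v0 appears earlier in the sequence.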
9856 const InsertElementInst *IE2) {
9857 if (IE1 == IE2)
9858 return false;
9859 const auto *I1 = IE1;
9860 const auto *I2 = IE2;
9861 const InsertElementInst *PrevI1;
9862 const InsertElementInst *PrevI2;
9863 unsigned Idx1 = *getInsertIndex(IE1);
9864 unsigned Idx2 = *getInsertIndex(IE2);
9865 do {
9866 if (I2 == IE1)
9867 return true;
9868 if (I1 == IE2)
9869 return false;
9870 PrevI1 = I1;
9871 PrevI2 = I2;
9872 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
9873 getInsertIndex(I1).value_or(Idx2) != Idx2)
9874 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
9875 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
9876 getInsertIndex(I2).value_or(Idx1) != Idx1)
9877 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
9878 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
9879 llvm_unreachable("Two different buildvectors not expected.");
9880}
9881
9882namespace {
9883/// Returns the incoming Value * if the requested type is Value * too, or a
9884/// default value otherwise.
9885struct ValueSelect {
9886 template <typename U>
9887 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
9888 return V;
9889 }
9890 template <typename U>
9891 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
9892 return U();
9893 }
9894};
9895} // namespace
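// Editorial note (not in the original source): ValueSelect lets the template
// below forward the Base vector only when it is instantiated over Value *;
// for any other type it yields a default-constructed value. For instance,
// ValueSelect::get<Value *>(Base) returns Base, while
// ValueSelect::get<const TreeEntry *>(Base) returns nullptr.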
9896
9897/// Does the analysis of the provided shuffle masks and performs the requested
9898/// actions on the vectors with the given shuffle masks. It tries to do it in
9899/// several steps.
9900/// 1. If the Base vector is not an undef vector, resize the very first mask to
9901/// have a common VF and perform the action for 2 input vectors (including the
9902/// non-undef Base). The other shuffle masks are combined with the result of the
9903/// first stage and processed as a shuffle of 2 elements.
9904/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
9905/// the action only for 1 vector with the given mask, if it is not the identity
9906/// mask.
9907/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
9908/// vectors, combining the masks properly between the steps.
9909template <typename T>
9911 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
9912 function_ref<unsigned(T *)> GetVF,
9913 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
9915 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
9916 SmallVector<int> Mask(ShuffleMask.begin()->second);
9917 auto VMIt = std::next(ShuffleMask.begin());
9918 T *Prev = nullptr;
9919 SmallBitVector UseMask =
9920 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
9921 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
9922 if (!IsBaseUndef.all()) {
9923 // Base is not undef, need to combine it with the next subvectors.
9924 std::pair<T *, bool> Res =
9925 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
9926 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
9927 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
9928 if (Mask[Idx] == PoisonMaskElem)
9929 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
9930 else
9931 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
9932 }
9933 auto *V = ValueSelect::get<T *>(Base);
9934 (void)V;
9935 assert((!V || GetVF(V) == Mask.size()) &&
9936 "Expected base vector of VF number of elements.");
9937 Prev = Action(Mask, {nullptr, Res.first});
9938 } else if (ShuffleMask.size() == 1) {
9939 // Base is undef and only 1 vector is shuffled - perform the action only for
9940 // a single vector, if the mask is not the identity mask.
9941 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
9942 /*ForSingleMask=*/true);
9943 if (Res.second)
9944 // Identity mask is found.
9945 Prev = Res.first;
9946 else
9947 Prev = Action(Mask, {ShuffleMask.begin()->first});
9948 } else {
9949 // Base is undef and at least 2 input vectors are shuffled - perform
9950 // two-vector shuffles step by step, combining the shuffles between the steps.
9951 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
9952 unsigned Vec2VF = GetVF(VMIt->first);
9953 if (Vec1VF == Vec2VF) {
9954 // No need to resize the input vectors since they are of the same size, we
9955 // can shuffle them directly.
9956 ArrayRef<int> SecMask = VMIt->second;
9957 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9958 if (SecMask[I] != PoisonMaskElem) {
9959 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9960 Mask[I] = SecMask[I] + Vec1VF;
9961 }
9962 }
9963 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
9964 } else {
9965 // Vectors of different sizes - resize and reshuffle.
9966 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
9967 /*ForSingleMask=*/false);
9968 std::pair<T *, bool> Res2 =
9969 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9970 ArrayRef<int> SecMask = VMIt->second;
9971 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9972 if (Mask[I] != PoisonMaskElem) {
9973 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9974 if (Res1.second)
9975 Mask[I] = I;
9976 } else if (SecMask[I] != PoisonMaskElem) {
9977 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9978 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
9979 }
9980 }
9981 Prev = Action(Mask, {Res1.first, Res2.first});
9982 }
9983 VMIt = std::next(VMIt);
9984 }
9985 bool IsBaseNotUndef = !IsBaseUndef.all();
9986 (void)IsBaseNotUndef;
9987 // Perform requested actions for the remaining masks/vectors.
9988 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
9989 // Shuffle other input vectors, if any.
9990 std::pair<T *, bool> Res =
9991 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9992 ArrayRef<int> SecMask = VMIt->second;
9993 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9994 if (SecMask[I] != PoisonMaskElem) {
9995 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
9996 "Multiple uses of scalars.");
9997 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
9998 } else if (Mask[I] != PoisonMaskElem) {
9999 Mask[I] = I;
10000 }
10001 }
10002 Prev = Action(Mask, {Prev, Res.first});
10003 }
10004 return Prev;
10005}
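// Editorial sketch (not part of the original file): the mask-combining rule
// used above when the Base is undef and the first two shuffled inputs already
// have the same VF -- lanes taken from the second source are rebased by the
// first source's VF, exactly as in the Vec1VF == Vec2VF branch.
static void combineEqualVFMasks(MutableArrayRef<int> Mask,
                                ArrayRef<int> SecMask, unsigned Vec1VF) {
  for (unsigned I = 0, VF = Mask.size(); I < VF; ++I)
    if (SecMask[I] != PoisonMaskElem)
      Mask[I] = SecMask[I] + Vec1VF; // lane comes from the second source
}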
10006
10009 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
10010 << VectorizableTree.size() << ".\n");
10011
10012 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10013
10014 SmallPtrSet<Value *, 4> CheckedExtracts;
10015 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10016 TreeEntry &TE = *VectorizableTree[I];
10017 if (TE.State == TreeEntry::NeedToGather) {
10018 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
10019 E && E->getVectorFactor() == TE.getVectorFactor() &&
10020 E->isSame(TE.Scalars)) {
10021 // Some gather nodes might be absolutely the same as some vectorizable
10022 // nodes after reordering; this case needs to be handled.
10023 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
10024 << shortBundleName(TE.Scalars) << ".\n"
10025 << "SLP: Current total cost = " << Cost << "\n");
10026 continue;
10027 }
10028 }
10029
10030 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
10031 Cost += C;
10032 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
10033 << shortBundleName(TE.Scalars) << ".\n"
10034 << "SLP: Current total cost = " << Cost << "\n");
10035 }
10036
10037 SmallPtrSet<Value *, 16> ExtractCostCalculated;
10038 InstructionCost ExtractCost = 0;
10041 SmallVector<APInt> DemandedElts;
10042 SmallDenseSet<Value *, 4> UsedInserts;
10044 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10045 for (ExternalUser &EU : ExternalUses) {
10046 // We only add extract cost once for the same scalar.
10047 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10048 !ExtractCostCalculated.insert(EU.Scalar).second)
10049 continue;
10050
10051 // Uses by ephemeral values are free (because the ephemeral value will be
10052 // removed prior to code generation, and so the extraction will be
10053 // removed as well).
10054 if (EphValues.count(EU.User))
10055 continue;
10056
10057 // No extract cost for vector "scalar"
10058 if (isa<FixedVectorType>(EU.Scalar->getType()))
10059 continue;
10060
10061 // If the found user is an insertelement, do not calculate the extract cost
10062 // but try to detect it as a final shuffled/identity match.
10063 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
10064 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
10065 if (!UsedInserts.insert(VU).second)
10066 continue;
10067 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
10068 if (InsertIdx) {
10069 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10070 auto *It = find_if(
10071 FirstUsers,
10072 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10074 VU, cast<InsertElementInst>(Pair.first),
10075 [this](InsertElementInst *II) -> Value * {
10076 Value *Op0 = II->getOperand(0);
10077 if (getTreeEntry(II) && !getTreeEntry(Op0))
10078 return nullptr;
10079 return Op0;
10080 });
10081 });
10082 int VecId = -1;
10083 if (It == FirstUsers.end()) {
10084 (void)ShuffleMasks.emplace_back();
10085 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10086 if (Mask.empty())
10087 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10088 // Find the insertvector, vectorized in tree, if any.
10089 Value *Base = VU;
10090 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10091 if (IEBase != EU.User &&
10092 (!IEBase->hasOneUse() ||
10093 getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10094 break;
10095 // Build the mask for the vectorized insertelement instructions.
10096 if (const TreeEntry *E = getTreeEntry(IEBase)) {
10097 VU = IEBase;
10098 do {
10099 IEBase = cast<InsertElementInst>(Base);
10100 int Idx = *getInsertIndex(IEBase);
10101 assert(Mask[Idx] == PoisonMaskElem &&
10102 "InsertElementInstruction used already.");
10103 Mask[Idx] = Idx;
10104 Base = IEBase->getOperand(0);
10105 } while (E == getTreeEntry(Base));
10106 break;
10107 }
10108 Base = cast<InsertElementInst>(Base)->getOperand(0);
10109 }
10110 FirstUsers.emplace_back(VU, ScalarTE);
10111 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10112 VecId = FirstUsers.size() - 1;
10113 auto It = MinBWs.find(ScalarTE);
10114 if (It != MinBWs.end() &&
10115 VectorCasts
10116 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10117 .second) {
10118 unsigned BWSz = It->second.first;
10119 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10120 unsigned VecOpcode;
10121 if (DstBWSz < BWSz)
10122 VecOpcode = Instruction::Trunc;
10123 else
10124 VecOpcode =
10125 It->second.second ? Instruction::SExt : Instruction::ZExt;
10128 VecOpcode, FTy,
10130 IntegerType::get(FTy->getContext(), BWSz),
10131 FTy->getNumElements()),
10133 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10134 << " for extending externally used vector with "
10135 "non-equal minimum bitwidth.\n");
10136 Cost += C;
10137 }
10138 } else {
10139 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10140 It->first = VU;
10141 VecId = std::distance(FirstUsers.begin(), It);
10142 }
10143 int InIdx = *InsertIdx;
10144 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10145 if (Mask.empty())
10146 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10147 Mask[InIdx] = EU.Lane;
10148 DemandedElts[VecId].setBit(InIdx);
10149 continue;
10150 }
10151 }
10152 }
10153 // Leave the GEPs as is, they are free in most cases and better to keep them
10154 // as GEPs.
10156 if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10157 if (!ValueToExtUses) {
10158 ValueToExtUses.emplace();
10159 for_each(enumerate(ExternalUses), [&](const auto &P) {
10160 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10161 });
10162 }
10163 // Can use original GEP, if no operands vectorized or they are marked as
10164 // externally used already.
10165 bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10166 if (!getTreeEntry(V))
10167 return true;
10168 auto It = ValueToExtUses->find(V);
10169 if (It != ValueToExtUses->end()) {
10170 // Replace all uses to avoid compiler crash.
10171 ExternalUses[It->second].User = nullptr;
10172 return true;
10173 }
10174 return false;
10175 });
10176 if (CanBeUsedAsGEP) {
10177 ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10178 ExternalUsesAsGEPs.insert(EU.Scalar);
10179 continue;
10180 }
10181 }
10182
10183 // If we plan to rewrite the tree in a smaller type, we will need to sign
10184 // extend the extracted value back to the original type. Here, we account
10185 // for the extract and the added cost of the sign extend if needed.
10186 auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
10187 auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10188 if (It != MinBWs.end()) {
10189 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10190 unsigned Extend =
10191 It->second.second ? Instruction::SExt : Instruction::ZExt;
10192 VecTy = FixedVectorType::get(MinTy, BundleWidth);
10193 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10194 VecTy, EU.Lane);
10195 } else {
10196 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10197 CostKind, EU.Lane);
10198 }
10199 }
10200 // Add reduced value cost, if resized.
10201 if (!VectorizedVals.empty()) {
10202 const TreeEntry &Root = *VectorizableTree.front().get();
10203 auto BWIt = MinBWs.find(&Root);
10204 if (BWIt != MinBWs.end()) {
10205 Type *DstTy = Root.Scalars.front()->getType();
10206 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10207 unsigned SrcSz =
10208 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10209 if (OriginalSz != SrcSz) {
10210 unsigned Opcode = Instruction::Trunc;
10211 if (OriginalSz > SrcSz)
10212 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10213 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10214 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10217 }
10218 }
10219 }
10220
10221 InstructionCost SpillCost = getSpillCost();
10222 Cost += SpillCost + ExtractCost;
10223 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10224 bool) {
10225 InstructionCost C = 0;
10226 unsigned VF = Mask.size();
10227 unsigned VecVF = TE->getVectorFactor();
10228 if (VF != VecVF &&
10229 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10231 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10232 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10233 OrigMask.begin());
10234 C = TTI->getShuffleCost(
10236 FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
10237 LLVM_DEBUG(
10238 dbgs() << "SLP: Adding cost " << C
10239 << " for final shuffle of insertelement external users.\n";
10240 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10241 Cost += C;
10242 return std::make_pair(TE, true);
10243 }
10244 return std::make_pair(TE, false);
10245 };
10246 // Calculate the cost of the reshuffled vectors, if any.
10247 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10248 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10249 auto Vector = ShuffleMasks[I].takeVector();
10250 unsigned VF = 0;
10251 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10253 assert((TEs.size() == 1 || TEs.size() == 2) &&
10254 "Expected exactly 1 or 2 tree entries.");
10255 if (TEs.size() == 1) {
10256 if (VF == 0)
10257 VF = TEs.front()->getVectorFactor();
10258 auto *FTy =
10259 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10260 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
10261 !all_of(enumerate(Mask), [=](const auto &Data) {
10262 return Data.value() == PoisonMaskElem ||
10263 (Data.index() < VF &&
10264 static_cast<int>(Data.index()) == Data.value());
10265 })) {
10268 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10269 << " for final shuffle of insertelement "
10270 "external users.\n";
10271 TEs.front()->dump();
10272 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10273 Cost += C;
10274 }
10275 } else {
10276 if (VF == 0) {
10277 if (TEs.front() &&
10278 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10279 VF = TEs.front()->getVectorFactor();
10280 else
10281 VF = Mask.size();
10282 }
10283 auto *FTy =
10284 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10287 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10288 << " for final shuffle of vector node and external "
10289 "insertelement users.\n";
10290 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10291 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10292 Cost += C;
10293 }
10294 VF = Mask.size();
10295 return TEs.back();
10296 };
10297 (void)performExtractsShuffleAction<const TreeEntry>(
10298 MutableArrayRef(Vector.data(), Vector.size()), Base,
10299 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10300 EstimateShufflesCost);
10302 cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10303 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
10304 Cost -= InsertCost;
10305 }
10306
10307 // Add the cost for reduced value resize (if required).
10308 if (ReductionBitWidth != 0) {
10309 assert(UserIgnoreList && "Expected reduction tree.");
10310 const TreeEntry &E = *VectorizableTree.front().get();
10311 auto It = MinBWs.find(&E);
10312 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10313 unsigned SrcSize = It->second.first;
10314 unsigned DstSize = ReductionBitWidth;
10315 unsigned Opcode = Instruction::Trunc;
10316 if (SrcSize < DstSize)
10317 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10318 auto *SrcVecTy =
10319 FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor());
10320 auto *DstVecTy =
10321 FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor());
10322 TTI::CastContextHint CCH = getCastContextHint(E);
10323 InstructionCost CastCost;
10324 switch (E.getOpcode()) {
10325 case Instruction::SExt:
10326 case Instruction::ZExt:
10327 case Instruction::Trunc: {
10328 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10329 CCH = getCastContextHint(*OpTE);
10330 break;
10331 }
10332 default:
10333 break;
10334 }
10335 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
10337 Cost += CastCost;
10338 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10339 << " for final resize for reduction from " << SrcVecTy
10340 << " to " << DstVecTy << "\n";
10341 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10342 }
10343 }
10344
10345#ifndef NDEBUG
10346 SmallString<256> Str;
10347 {
10349 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10350 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10351 << "SLP: Total Cost = " << Cost << ".\n";
10352 }
10353 LLVM_DEBUG(dbgs() << Str);
10354 if (ViewSLPTree)
10355 ViewGraph(this, "SLP" + F->getName(), false, Str);
10356#endif
10357
10358 return Cost;
10359}
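// Editorial sketch (hypothetical driver, not from this file): how the cost
// computed by getTreeCost is typically consumed -- compared against the
// -slp-threshold option, with vectorization performed only when the (possibly
// negative) cost beats it. 'R' stands for a configured BoUpSLP instance and
// 'Chain' for the candidate scalars.
//
//   R.buildTree(Chain);
//   InstructionCost Cost = R.getTreeCost();
//   if (Cost < -SLPCostThreshold)
//     R.vectorizeTree();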
10360
10361/// Tries to find extractelement instructions with constant indices from a fixed
10362/// vector type and gathers such instructions into a group that is highly likely
10363/// to be matched as a shuffle of 1 or 2 input vectors. If this attempt was
10364/// successful, the matched scalars are replaced by poison values in \p VL for
10365/// future analysis.
10366std::optional<TTI::ShuffleKind>
10367BoUpSLP::tryToGatherSingleRegisterExtractElements(
10369 // Scan list of gathered scalars for extractelements that can be represented
10370 // as shuffles.
10372 SmallVector<int> UndefVectorExtracts;
10373 for (int I = 0, E = VL.size(); I < E; ++I) {
10374 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10375 if (!EI) {
10376 if (isa<UndefValue>(VL[I]))
10377 UndefVectorExtracts.push_back(I);
10378 continue;
10379 }
10380 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10381 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10382 continue;
10383 std::optional<unsigned> Idx = getExtractIndex(EI);
10384 // Undefined index.
10385 if (!Idx) {
10386 UndefVectorExtracts.push_back(I);
10387 continue;
10388 }
10389 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10390 ExtractMask.reset(*Idx);
10391 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
10392 UndefVectorExtracts.push_back(I);
10393 continue;
10394 }
10395 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10396 }
10397 // Sort the vector operands by the maximum number of uses in extractelements.
10399 for (const auto &Data : VectorOpToIdx)
10400 VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
10401 .push_back(Data.first);
10402 for (auto &Data : VFToVector) {
10403 stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
10404 return VectorOpToIdx.find(V1)->second.size() >
10405 VectorOpToIdx.find(V2)->second.size();
10406 });
10407 }
10408 // Find the best pair of the vectors with the same number of elements or a
10409 // single vector.
10410 const int UndefSz = UndefVectorExtracts.size();
10411 unsigned SingleMax = 0;
10412 Value *SingleVec = nullptr;
10413 unsigned PairMax = 0;
10414 std::pair<Value *, Value *> PairVec(nullptr, nullptr);
10415 for (auto &Data : VFToVector) {
10416 Value *V1 = Data.second.front();
10417 if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
10418 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10419 SingleVec = V1;
10420 }
10421 Value *V2 = nullptr;
10422 if (Data.second.size() > 1)
10423 V2 = *std::next(Data.second.begin());
10424 if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
10425 UndefSz) {
10426 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
10427 PairVec = std::make_pair(V1, V2);
10428 }
10429 }
10430 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10431 return std::nullopt;
10432 // Check if better to perform a shuffle of 2 vectors or just of a single
10433 // vector.
10434 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10435 SmallVector<Value *> GatheredExtracts(
10436 VL.size(), PoisonValue::get(VL.front()->getType()));
10437 if (SingleMax >= PairMax && SingleMax) {
10438 for (int Idx : VectorOpToIdx[SingleVec])
10439 std::swap(GatheredExtracts[Idx], VL[Idx]);
10440 } else {
10441 for (Value *V : {PairVec.first, PairVec.second})
10442 for (int Idx : VectorOpToIdx[V])
10443 std::swap(GatheredExtracts[Idx], VL[Idx]);
10444 }
10445 // Add extracts from undefs too.
10446 for (int Idx : UndefVectorExtracts)
10447 std::swap(GatheredExtracts[Idx], VL[Idx]);
10448 // Check that gather of extractelements can be represented as just a
10449 // shuffle of a single/two vectors the scalars are extracted from.
10450 std::optional<TTI::ShuffleKind> Res =
10451 isFixedVectorShuffle(GatheredExtracts, Mask);
10452 if (!Res) {
10453 // TODO: try to check other subsets if possible.
10454 // Restore the original VL if attempt was not successful.
10455 copy(SavedVL, VL.begin());
10456 return std::nullopt;
10457 }
10458 // Restore unused scalars from mask, if some of the extractelements were not
10459 // selected for shuffle.
10460 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10461 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
10462 isa<UndefValue>(GatheredExtracts[I])) {
10463 std::swap(VL[I], GatheredExtracts[I]);
10464 continue;
10465 }
10466 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10467 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10468 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10469 is_contained(UndefVectorExtracts, I))
10470 continue;
10471 }
10472 return Res;
10473}
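// Editorial example (illustrative IR, not from this file): a gather of
//   %e0 = extractelement <4 x float> %v, i64 2
//   %e1 = extractelement <4 x float> %v, i64 0
//   undef
//   %e3 = extractelement <4 x float> %v, i64 1
// all drawn from the single source %v can be re-expressed as a
// PermuteSingleSrc shuffle of %v with mask <2, 0, poison, 1>, and the matched
// scalars are then replaced by poison in VL, as described above.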
10474
10475/// Tries to find extractelement instructions with constant indices from a fixed
10476/// vector type and gathers such instructions into a group that is highly likely
10477/// to be matched as a shuffle of 1 or 2 input vectors. If this attempt was
10478/// successful, the matched scalars are replaced by poison values in \p VL for
10479/// future analysis.
10481BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10483 unsigned NumParts) const {
10484 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
10485 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10486 Mask.assign(VL.size(), PoisonMaskElem);
10487 unsigned SliceSize = VL.size() / NumParts;
10488 for (unsigned Part = 0; Part < NumParts; ++Part) {
10489 // Scan list of gathered scalars for extractelements that can be represented
10490 // as shuffles.
10492 MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
10493 SmallVector<int> SubMask;
10494 std::optional<TTI::ShuffleKind> Res =
10495 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10496 ShufflesRes[Part] = Res;
10497 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
10498 }
10499 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
10500 return Res.has_value();
10501 }))
10502 ShufflesRes.clear();
10503 return ShufflesRes;
10504}
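// Editorial note (illustrative): with VL.size() == 8 and NumParts == 2 (the
// gather spans two vector registers), SliceSize is 4, each 4-element slice of
// VL is matched independently above, and its SubMask is copied into the
// corresponding half of Mask; if no slice matches, ShufflesRes is cleared.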
10505
10506std::optional<TargetTransformInfo::ShuffleKind>
10507BoUpSLP::isGatherShuffledSingleRegisterEntry(
10508 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10509 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10510 Entries.clear();
10511 // TODO: currently checking only for Scalars in the tree entry, need to count
10512 // reused elements too for better cost estimation.
10513 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10514 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10515 const BasicBlock *TEInsertBlock = nullptr;
10516 // Main node of PHI entries keeps the correct order of operands/incoming
10517 // blocks.
10518 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10519 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10520 TEInsertPt = TEInsertBlock->getTerminator();
10521 } else {
10522 TEInsertBlock = TEInsertPt->getParent();
10523 }
10524 if (!DT->isReachableFromEntry(TEInsertBlock))
10525 return std::nullopt;
10526 auto *NodeUI = DT->getNode(TEInsertBlock);
10527 assert(NodeUI && "Should only process reachable instructions");
10528 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10529 auto CheckOrdering = [&](const Instruction *InsertPt) {
10530 // Argument InsertPt is an instruction where vector code for some other
10531 // tree entry (one that shares one or more scalars with TE) is going to be
10532 // generated. This lambda returns true if insertion point of vector code
10533 // for the TE dominates that point (otherwise dependency is the other way
10534 // around). The other node is not limited to be of a gather kind. Gather
10535 // nodes are not scheduled and their vector code is inserted before their
10536 // first user. If user is PHI, that is supposed to be at the end of a
10537 // predecessor block. Otherwise it is the last instruction among scalars of
10538 // the user node. So, instead of checking dependency between instructions
10539 // themselves, we check dependency between their insertion points for vector
10540 // code (since each scalar instruction ends up as a lane of a vector
10541 // instruction).
10542 const BasicBlock *InsertBlock = InsertPt->getParent();
10543 auto *NodeEUI = DT->getNode(InsertBlock);
10544 if (!NodeEUI)
10545 return false;
10546 assert((NodeUI == NodeEUI) ==
10547 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10548 "Different nodes should have different DFS numbers");
10549 // Check the order of the gather nodes users.
10550 if (TEInsertPt->getParent() != InsertBlock &&
10551 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
10552 return false;
10553 if (TEInsertPt->getParent() == InsertBlock &&
10554 TEInsertPt->comesBefore(InsertPt))
10555 return false;
10556 return true;
10557 };
10558 // Find all tree entries used by the gathered values. If no common entries
10559 // found - not a shuffle.
10560 // Here we build a set of tree nodes for each gathered value and try to
10561 // find the intersection between these sets. If we have at least one common
10562 // tree node for each gathered value - we have just a permutation of a
10563 // single vector. If we have 2 different sets, we're in a situation where we
10564 // have a permutation of 2 input vectors.
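 // For example, with VL = {a, b, c}: if a, b and c all appear in tree entry
 // E1, the gather is just a permutation of E1 (single source); if a and b
 // come from E1 but c only from E2, the two sources E1 and E2 are recorded;
 // if a third entry would be required, the code falls back to a regular
 // gather.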
10565 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10566 DenseMap<Value *, int> UsedValuesEntry;
10567 for (Value *V : VL) {
10568 if (isConstant(V))
10569 continue;
10570 // Build a list of tree entries where V is used.
10571 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10572 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10573 if (TEPtr == TE)
10574 continue;
10575 assert(any_of(TEPtr->Scalars,
10576 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10577 "Must contain at least single gathered value.");
10578 assert(TEPtr->UserTreeIndices.size() == 1 &&
10579 "Expected only single user of a gather node.");
10580 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10581
10582 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10583 const Instruction *InsertPt =
10584 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
10585 : &getLastInstructionInBundle(UseEI.UserTE);
10586 if (TEInsertPt == InsertPt) {
10587 // If 2 gathers are operands of the same entry (regardless of whether
10588 // the user is a PHI or not), compare operand indices and use the earlier
10589 // one as the base.
10590 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10591 continue;
10592 // If the user instruction is used for some reason in different
10593 // vectorized nodes - make it depend on index.
10594 if (TEUseEI.UserTE != UseEI.UserTE &&
10595 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10596 continue;
10597 }
10598
10599 // Check if the user node of the TE comes after user node of TEPtr,
10600 // otherwise TEPtr depends on TE.
10601 if ((TEInsertBlock != InsertPt->getParent() ||
10602 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10603 !CheckOrdering(InsertPt))
10604 continue;
10605 VToTEs.insert(TEPtr);
10606 }
10607 if (const TreeEntry *VTE = getTreeEntry(V)) {
10608 if (ForOrder) {
10609 if (VTE->State != TreeEntry::Vectorize) {
10610 auto It = MultiNodeScalars.find(V);
10611 if (It == MultiNodeScalars.end())
10612 continue;
10613 VTE = *It->getSecond().begin();
10614 // Iterate through all vectorized nodes.
10615 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
10616 return MTE->State == TreeEntry::Vectorize;
10617 });
10618 if (MIt == It->getSecond().end())
10619 continue;
10620 VTE = *MIt;
10621 }
10622 }
10623 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10624 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10625 continue;
10626 VToTEs.insert(VTE);
10627 }
10628 if (VToTEs.empty())
10629 continue;
10630 if (UsedTEs.empty()) {
10631 // The first iteration, just insert the list of nodes to vector.
10632 UsedTEs.push_back(VToTEs);
10633 UsedValuesEntry.try_emplace(V, 0);
10634 } else {
10635 // Need to check if there are any previously used tree nodes which use V.
10636 // If there are no such nodes, consider that we have another input
10637 // vector.
10638 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
10639 unsigned Idx = 0;
10640 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
10641 // Do we have a non-empty intersection of previously listed tree entries
10642 // and tree entries using current V?
10643 set_intersect(VToTEs, Set);
10644 if (!VToTEs.empty()) {
10645 // Yes, write the new subset and continue analysis for the next
10646 // scalar.
10647 Set.swap(VToTEs);
10648 break;
10649 }
10650 VToTEs = SavedVToTEs;
10651 ++Idx;
10652 }
10653 // No non-empty intersection found - need to add a second set of possible
10654 // source vectors.
10655 if (Idx == UsedTEs.size()) {
10656 // If the number of input vectors is greater than 2 - not a permutation,
10657 // fall back to the regular gather.
10658 // TODO: support multiple reshuffled nodes.
10659 if (UsedTEs.size() == 2)
10660 continue;
10661 UsedTEs.push_back(SavedVToTEs);
10662 Idx = UsedTEs.size() - 1;
10663 }
10664 UsedValuesEntry.try_emplace(V, Idx);
10665 }
10666 }
10667
10668 if (UsedTEs.empty()) {
10669 Entries.clear();
10670 return std::nullopt;
10671 }
10672
10673 unsigned VF = 0;
10674 if (UsedTEs.size() == 1) {
10675 // Keep the order to avoid non-determinism.
10676 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
10677 UsedTEs.front().end());
10678 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10679 return TE1->Idx < TE2->Idx;
10680 });
10681 // Try to find the perfect match in another gather node at first.
10682 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
10683 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
10684 });
10685 if (It != FirstEntries.end() &&
10686 ((*It)->getVectorFactor() == VL.size() ||
10687 ((*It)->getVectorFactor() == TE->Scalars.size() &&
10688 TE->ReuseShuffleIndices.size() == VL.size() &&
10689 (*It)->isSame(TE->Scalars)))) {
10690 Entries.push_back(*It);
10691 if ((*It)->getVectorFactor() == VL.size()) {
10692 std::iota(std::next(Mask.begin(), Part * VL.size()),
10693 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
10694 } else {
10695 SmallVector<int> CommonMask = TE->getCommonMask();
10696 copy(CommonMask, Mask.begin());
10697 }
10698 // Clear undef scalars.
10699 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10700 if (isa<PoisonValue>(VL[I]))
10701 Mask[I] = PoisonMaskElem;
10702 return TargetTransformInfo::SK_PermuteSingleSrc;
10703 }
10704 // No perfect match, just shuffle, so choose the first tree node from the
10705 // tree.
10706 Entries.push_back(FirstEntries.front());
10707 } else {
10708 // Try to find nodes with the same vector factor.
10709 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
10710 // Keep the order of tree nodes to avoid non-determinism.
10711 DenseMap<unsigned, const TreeEntry *> VFToTE;
10712 for (const TreeEntry *TE : UsedTEs.front()) {
10713 unsigned VF = TE->getVectorFactor();
10714 auto It = VFToTE.find(VF);
10715 if (It != VFToTE.end()) {
10716 if (It->second->Idx > TE->Idx)
10717 It->getSecond() = TE;
10718 continue;
10719 }
10720 VFToTE.try_emplace(VF, TE);
10721 }
10722 // Same, keep the order to avoid non-determinism.
10723 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
10724 UsedTEs.back().end());
10725 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10726 return TE1->Idx < TE2->Idx;
10727 });
10728 for (const TreeEntry *TE : SecondEntries) {
10729 auto It = VFToTE.find(TE->getVectorFactor());
10730 if (It != VFToTE.end()) {
10731 VF = It->first;
10732 Entries.push_back(It->second);
10733 Entries.push_back(TE);
10734 break;
10735 }
10736 }
10737 // No 2 source vectors with the same vector factor - just choose 2 with max
10738 // index.
10739 if (Entries.empty()) {
10740 Entries.push_back(*llvm::max_element(
10741 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
10742 return TE1->Idx < TE2->Idx;
10743 }));
10744 Entries.push_back(SecondEntries.front());
10745 VF = std::max(Entries.front()->getVectorFactor(),
10746 Entries.back()->getVectorFactor());
10747 }
10748 }
10749
10750 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
10751 // Checks if the 2 PHIs are compatible in terms of their likelihood of
10752 // being vectorized together.
10753 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
10754 auto *PHI = cast<PHINode>(V);
10755 auto *PHI1 = cast<PHINode>(V1);
10756 // Check that all incoming values are compatible/from same parent (if they
10757 // are instructions).
10758 // The incoming values are compatible if they all are constants, or
10759 // instructions with the same/alternate opcodes from the same basic block.
10760 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
10761 Value *In = PHI->getIncomingValue(I);
10762 Value *In1 = PHI1->getIncomingValue(I);
10763 if (isConstant(In) && isConstant(In1))
10764 continue;
10765 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
10766 return false;
10767 if (cast<Instruction>(In)->getParent() !=
10768 cast<Instruction>(In1)->getParent())
10769 return false;
10770 }
10771 return true;
10772 };
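 // For example, %p = phi [%add1, %bb1], [5, %bb2] and
 // %q = phi [%add2, %bb1], [7, %bb2] are compatible: the pair of constants
 // and the pair of same-opcode instructions from the same block both pass
 // the checks above.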
10773 // Check if the value can be ignored during analysis for shuffled gathers.
10774 // We suppose it is better to ignore instructions which do not form splats,
10775 // are not vectorized/not extractelements (these instructions will be handled
10776 // by extractelements processing) or may form a vector node in the future.
10777 auto MightBeIgnored = [=](Value *V) {
10778 auto *I = dyn_cast<Instruction>(V);
10779 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
10780 !isVectorLikeInstWithConstOps(I) &&
10781 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
10782 };
10783 // Check that the neighbor instruction may form a full vector node with the
10784 // current instruction V. This is possible if they have the same/alternate
10785 // opcode and the same parent basic block.
10786 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
10787 Value *V1 = VL[Idx];
10788 bool UsedInSameVTE = false;
10789 auto It = UsedValuesEntry.find(V1);
10790 if (It != UsedValuesEntry.end())
10791 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
10792 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
10793 getSameOpcode({V, V1}, *TLI).getOpcode() &&
10794 cast<Instruction>(V)->getParent() ==
10795 cast<Instruction>(V1)->getParent() &&
10796 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
10797 };
10798 // Build a shuffle mask for better cost estimation and vector emission.
10799 SmallBitVector UsedIdxs(Entries.size());
10800 SmallVector<std::pair<unsigned, int>> EntryLanes;
10801 for (int I = 0, E = VL.size(); I < E; ++I) {
10802 Value *V = VL[I];
10803 auto It = UsedValuesEntry.find(V);
10804 if (It == UsedValuesEntry.end())
10805 continue;
10806 // Do not try to shuffle scalars if they are constants, or instructions
10807 // that can be vectorized as a result of a subsequent buildvector
10808 // vectorization.
10809 if (isConstant(V) || (MightBeIgnored(V) &&
10810 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
10811 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
10812 continue;
10813 unsigned Idx = It->second;
10814 EntryLanes.emplace_back(Idx, I);
10815 UsedIdxs.set(Idx);
10816 }
10817 // Iterate through all shuffled scalars and select entries, which can be used
10818 // for final shuffle.
10819 SmallVector<const TreeEntry *> TempEntries;
10820 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
10821 if (!UsedIdxs.test(I))
10822 continue;
10823 // Fix the entry number for the given scalar. If it is the first entry, set
10824 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
10825 // These indices are used when calculating final shuffle mask as the vector
10826 // offset.
10827 for (std::pair<unsigned, int> &Pair : EntryLanes)
10828 if (Pair.first == I)
10829 Pair.first = TempEntries.size();
10830 TempEntries.push_back(Entries[I]);
10831 }
10832 Entries.swap(TempEntries);
10833 if (EntryLanes.size() == Entries.size() &&
10834 !VL.equals(ArrayRef(TE->Scalars)
10835 .slice(Part * VL.size(),
10836 std::min<int>(VL.size(), TE->Scalars.size())))) {
10837 // We may have here 1 or 2 entries only. If the number of scalars is equal
10838 // to the number of entries, no need to do the analysis, it is not very
10839 // profitable. Since VL is not the same as TE->Scalars, it means we already
10840 // have some shuffles before. Cut off not profitable case.
10841 Entries.clear();
10842 return std::nullopt;
10843 }
10844 // Build the final mask, check for the identity shuffle, if possible.
10845 bool IsIdentity = Entries.size() == 1;
10846 // Pair.first is the offset to the vector, while Pair.second is the index of
10847 // scalar in the list.
10848 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
10849 unsigned Idx = Part * VL.size() + Pair.second;
10850 Mask[Idx] =
10851 Pair.first * VF +
10852 (ForOrder ? std::distance(
10853 Entries[Pair.first]->Scalars.begin(),
10854 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
10855 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
10856 IsIdentity &= Mask[Idx] == Pair.second;
10857 }
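 // For example, with two selected entries of vector factor VF == 4, a
 // scalar taken from lane 2 of the second entry gets Mask[Idx] = 1 * 4 + 2,
 // i.e. 6, following the two-operand shufflevector mask convention.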
10858 switch (Entries.size()) {
10859 case 1:
10860 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
10861 return TargetTransformInfo::SK_PermuteSingleSrc;
10862 break;
10863 case 2:
10864 if (EntryLanes.size() > 2 || VL.size() <= 2)
10865 return TargetTransformInfo::SK_PermuteTwoSrc;
10866 break;
10867 default:
10868 break;
10869 }
10870 Entries.clear();
10871 // Clear the corresponding mask elements.
10872 std::fill(std::next(Mask.begin(), Part * VL.size()),
10873 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
10874 return std::nullopt;
10875}
10876
10877SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
10878BoUpSLP::isGatherShuffledEntry(
10879 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
10880 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
10881 bool ForOrder) {
10882 assert(NumParts > 0 && NumParts < VL.size() &&
10883 "Expected positive number of registers.");
10884 Entries.clear();
10885 // No need to check for the topmost gather node.
10886 if (TE == VectorizableTree.front().get())
10887 return {};
10888 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
10889 if (TE->isNonPowOf2Vec())
10890 return {};
10891 Mask.assign(VL.size(), PoisonMaskElem);
10892 assert(TE->UserTreeIndices.size() == 1 &&
10893 "Expected only single user of the gather node.");
10894 assert(VL.size() % NumParts == 0 &&
10895 "Number of scalars must be divisible by NumParts.");
10896 unsigned SliceSize = VL.size() / NumParts;
10897 SmallVector<std::optional<TTI::ShuffleKind>> Res;
10898 for (unsigned Part = 0; Part < NumParts; ++Part) {
10899 ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
10900 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
10901 std::optional<TTI::ShuffleKind> SubRes =
10902 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
10903 ForOrder);
10904 if (!SubRes)
10905 SubEntries.clear();
10906 Res.push_back(SubRes);
10907 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
10908 SubEntries.front()->getVectorFactor() == VL.size() &&
10909 (SubEntries.front()->isSame(TE->Scalars) ||
10910 SubEntries.front()->isSame(VL))) {
10911 SmallVector<const TreeEntry *> LocalSubEntries;
10912 LocalSubEntries.swap(SubEntries);
10913 Entries.clear();
10914 Res.clear();
10915 std::iota(Mask.begin(), Mask.end(), 0);
10916 // Clear undef scalars.
10917 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10918 if (isa<PoisonValue>(VL[I]))
10919 Mask[I] = PoisonMaskElem;
10920 Entries.emplace_back(1, LocalSubEntries.front());
10921 Res.push_back(TTI::SK_PermuteSingleSrc);
10922 return Res;
10923 }
10924 }
10925 if (all_of(Res,
10926 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
10927 Entries.clear();
10928 return {};
10929 }
10930 return Res;
10931}
10932
10933InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
10934 Type *ScalarTy) const {
10935 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
10936 bool DuplicateNonConst = false;
10937 // Find the cost of inserting/extracting values from the vector.
10938 // Check if the same elements are inserted several times and count them as
10939 // shuffle candidates.
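 // For example, for VL = {a, b, a, c} lane 2 repeats 'a', so it is recorded
 // in ShuffledElements with ShuffleMask[2] = 0 and is covered by the final
 // single-source shuffle instead of a second insertelement.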
10940 APInt ShuffledElements = APInt::getZero(VL.size());
10941 DenseMap<Value *, unsigned> UniqueElements;
10942 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10943 InstructionCost Cost;
10944 auto EstimateInsertCost = [&](unsigned I, Value *V) {
10945 if (V->getType() != ScalarTy) {
10946 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
10947 TTI::CastContextHint::None, CostKind);
10948 V = nullptr;
10949 }
10950 if (!ForPoisonSrc)
10951 Cost +=
10952 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
10953 I, Constant::getNullValue(VecTy), V);
10954 };
10955 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10956 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
10957 Value *V = VL[I];
10958 // No need to shuffle duplicates for constants.
10959 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
10960 ShuffledElements.setBit(I);
10961 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
10962 continue;
10963 }
10964
10965 auto Res = UniqueElements.try_emplace(V, I);
10966 if (Res.second) {
10967 EstimateInsertCost(I, V);
10968 ShuffleMask[I] = I;
10969 continue;
10970 }
10971
10972 DuplicateNonConst = true;
10973 ShuffledElements.setBit(I);
10974 ShuffleMask[I] = Res.first->second;
10975 }
10976 if (ForPoisonSrc)
10977 Cost =
10978 TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
10979 /*Extract*/ false, CostKind);
10980 if (DuplicateNonConst)
10981 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
10982 VecTy, ShuffleMask);
10983 return Cost;
10984}
10985
10986// Perform operand reordering on the instructions in VL and return the reordered
10987// operands in Left and Right.
10988void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
10989 SmallVectorImpl<Value *> &Left,
10990 SmallVectorImpl<Value *> &Right,
10991 const BoUpSLP &R) {
10992 if (VL.empty())
10993 return;
10994 VLOperands Ops(VL, R);
10995 // Reorder the operands in place.
10996 Ops.reorder();
10997 Left = Ops.getVL(0);
10998 Right = Ops.getVL(1);
10999}
11000
11001Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
11002 auto &Res = EntryToLastInstruction.FindAndConstruct(E);
11003 if (Res.second)
11004 return *Res.second;
11005 // Get the basic block this bundle is in. All instructions in the bundle
11006 // should be in this block (except for extractelement-like instructions with
11007 // constant indices).
11008 auto *Front = E->getMainOp();
11009 auto *BB = Front->getParent();
11010 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
11011 if (E->getOpcode() == Instruction::GetElementPtr &&
11012 !isa<GetElementPtrInst>(V))
11013 return true;
11014 auto *I = cast<Instruction>(V);
11015 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11016 isVectorLikeInstWithConstOps(I);
11017 }));
11018
11019 auto FindLastInst = [&]() {
11020 Instruction *LastInst = Front;
11021 for (Value *V : E->Scalars) {
11022 auto *I = dyn_cast<Instruction>(V);
11023 if (!I)
11024 continue;
11025 if (LastInst->getParent() == I->getParent()) {
11026 if (LastInst->comesBefore(I))
11027 LastInst = I;
11028 continue;
11029 }
11030 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11031 !isa<GetElementPtrInst>(I)) ||
11032 (isVectorLikeInstWithConstOps(LastInst) &&
11034 "Expected vector-like or non-GEP in GEP node insts only.");
11035 if (!DT->isReachableFromEntry(LastInst->getParent())) {
11036 LastInst = I;
11037 continue;
11038 }
11039 if (!DT->isReachableFromEntry(I->getParent()))
11040 continue;
11041 auto *NodeA = DT->getNode(LastInst->getParent());
11042 auto *NodeB = DT->getNode(I->getParent());
11043 assert(NodeA && "Should only process reachable instructions");
11044 assert(NodeB && "Should only process reachable instructions");
11045 assert((NodeA == NodeB) ==
11046 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11047 "Different nodes should have different DFS numbers");
11048 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11049 LastInst = I;
11050 }
11051 BB = LastInst->getParent();
11052 return LastInst;
11053 };
11054
11055 auto FindFirstInst = [&]() {
11056 Instruction *FirstInst = Front;
11057 for (Value *V : E->Scalars) {
11058 auto *I = dyn_cast<Instruction>(V);
11059 if (!I)
11060 continue;
11061 if (FirstInst->getParent() == I->getParent()) {
11062 if (I->comesBefore(FirstInst))
11063 FirstInst = I;
11064 continue;
11065 }
11066 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11067 !isa<GetElementPtrInst>(I)) ||
11068 (isVectorLikeInstWithConstOps(FirstInst) &&
11070 "Expected vector-like or non-GEP in GEP node insts only.");
11071 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
11072 FirstInst = I;
11073 continue;
11074 }
11075 if (!DT->isReachableFromEntry(I->getParent()))
11076 continue;
11077 auto *NodeA = DT->getNode(FirstInst->getParent());
11078 auto *NodeB = DT->getNode(I->getParent());
11079 assert(NodeA && "Should only process reachable instructions");
11080 assert(NodeB && "Should only process reachable instructions");
11081 assert((NodeA == NodeB) ==
11082 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11083 "Different nodes should have different DFS numbers");
11084 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11085 FirstInst = I;
11086 }
11087 return FirstInst;
11088 };
11089
11090 // Set the insert point to the beginning of the basic block if the entry
11091 // should not be scheduled.
11092 if (doesNotNeedToSchedule(E->Scalars) ||
11093 (E->State != TreeEntry::NeedToGather &&
11094 all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
11095 if ((E->getOpcode() == Instruction::GetElementPtr &&
11096 any_of(E->Scalars,
11097 [](Value *V) {
11098 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11099 })) ||
11100 all_of(E->Scalars,
11101 [](Value *V) {
11102 return !isVectorLikeInstWithConstOps(V) &&
11103 isUsedOutsideBlock(V);
11104 }) ||
11105 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
11106 all_of(E->Scalars, [](Value *V) {
11107 return isa<ExtractElementInst, UndefValue>(V) ||
11108 areAllOperandsNonInsts(V);
11109 })))
11110 Res.second = FindLastInst();
11111 else
11112 Res.second = FindFirstInst();
11113 return *Res.second;
11114 }
11115
11116 // Find the last instruction. The common case should be that BB has been
11117 // scheduled, and the last instruction is VL.back(). So we start with
11118 // VL.back() and iterate over schedule data until we reach the end of the
11119 // bundle. The end of the bundle is marked by null ScheduleData.
11120 if (BlocksSchedules.count(BB)) {
11121 Value *V = E->isOneOf(E->Scalars.back());
11122 if (doesNotNeedToBeScheduled(V))
11123 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11124 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11125 if (Bundle && Bundle->isPartOfBundle())
11126 for (; Bundle; Bundle = Bundle->NextInBundle)
11127 if (Bundle->OpValue == Bundle->Inst)
11128 Res.second = Bundle->Inst;
11129 }
11130
11131 // LastInst can still be null at this point if there's either not an entry
11132 // for BB in BlocksSchedules or there's no ScheduleData available for
11133 // VL.back(). This can be the case if buildTree_rec aborts for various
11134 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11135 // size is reached, etc.). ScheduleData is initialized in the scheduling
11136 // "dry-run".
11137 //
11138 // If this happens, we can still find the last instruction by brute force. We
11139 // iterate forwards from Front (inclusive) until we either see all
11140 // instructions in the bundle or reach the end of the block. If Front is the
11141 // last instruction in program order, LastInst will be set to Front, and we
11142 // will visit all the remaining instructions in the block.
11143 //
11144 // One of the reasons we exit early from buildTree_rec is to place an upper
11145 // bound on compile-time. Thus, taking an additional compile-time hit here is
11146 // not ideal. However, this should be exceedingly rare since it requires that
11147 // we both exit early from buildTree_rec and that the bundle be out-of-order
11148 // (causing us to iterate all the way to the end of the block).
11149 if (!Res.second)
11150 Res.second = FindLastInst();
11151 assert(Res.second && "Failed to find last instruction in bundle");
11152 return *Res.second;
11153}
11154
11155void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11156 auto *Front = E->getMainOp();
11157 Instruction *LastInst = &getLastInstructionInBundle(E);
11158 assert(LastInst && "Failed to find last instruction in bundle");
11159 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11160 // If the instruction is PHI, set the insert point after all the PHIs.
11161 bool IsPHI = isa<PHINode>(LastInst);
11162 if (IsPHI)
11163 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11164 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
11165 doesNotNeedToSchedule(E->Scalars))) {
11166 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11167 } else {
11168 // Set the insertion point after the last instruction in the bundle. Set the
11169 // debug location to Front.
11170 Builder.SetInsertPoint(
11171 LastInst->getParent(),
11172 LastInst->getNextNonDebugInstruction()->getIterator());
11173 }
11174 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11175}
11176
11177Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
11178 // List of instructions/lanes from current block and/or the blocks which are
11179 // part of the current loop. These instructions will be inserted at the end to
11180 // make it possible to optimize loops and hoist invariant instructions out of
11181 // the loop's body with better chances of success.
11182 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
11183 SmallSet<int, 4> PostponedIndices;
11184 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11185 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11186 SmallPtrSet<BasicBlock *, 4> Visited;
11187 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11188 InsertBB = InsertBB->getSinglePredecessor();
11189 return InsertBB && InsertBB == InstBB;
11190 };
11191 for (int I = 0, E = VL.size(); I < E; ++I) {
11192 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11193 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11194 getTreeEntry(Inst) ||
11195 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11196 PostponedIndices.insert(I).second)
11197 PostponedInsts.emplace_back(Inst, I);
11198 }
11199
11200 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11201 Type *Ty) {
11202 Value *Scalar = V;
11203 if (Scalar->getType() != Ty) {
11204 assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11205 "Expected integer types only.");
11206 Scalar = Builder.CreateIntCast(
11207 Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
11208 }
11209
11210 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11211 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11212 if (!InsElt)
11213 return Vec;
11214 GatherShuffleExtractSeq.insert(InsElt);
11215 CSEBlocks.insert(InsElt->getParent());
11216 // Add to our 'need-to-extract' list.
11217 if (isa<Instruction>(V)) {
11218 if (TreeEntry *Entry = getTreeEntry(V)) {
11219 // Find which lane we need to extract.
11220 User *UserOp = nullptr;
11221 if (Scalar != V) {
11222 if (auto *SI = dyn_cast<Instruction>(Scalar))
11223 UserOp = SI;
11224 } else {
11225 UserOp = InsElt;
11226 }
11227 if (UserOp) {
11228 unsigned FoundLane = Entry->findLaneForValue(V);
11229 ExternalUses.emplace_back(V, UserOp, FoundLane);
11230 }
11231 }
11232 }
11233 return Vec;
11234 };
11235 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
11236 Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11237 SmallVector<int> NonConsts;
11238 // Insert constant values at first.
11239 for (int I = 0, E = VL.size(); I < E; ++I) {
11240 if (PostponedIndices.contains(I))
11241 continue;
11242 if (!isConstant(VL[I])) {
11243 NonConsts.push_back(I);
11244 continue;
11245 }
11246 if (Root) {
11247 if (!isa<UndefValue>(VL[I])) {
11248 NonConsts.push_back(I);
11249 continue;
11250 }
11251 if (isa<PoisonValue>(VL[I]))
11252 continue;
11253 if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11254 if (SV->getMaskValue(I) == PoisonMaskElem)
11255 continue;
11256 }
11257 }
11258 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11259 }
11260 // Insert non-constant values.
11261 for (int I : NonConsts)
11262 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11263 // Append instructions which are/may be part of the loop at the end to make
11264 // it possible to hoist non-loop-based instructions.
11265 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11266 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11267
11268 return Vec;
11269}
11270
11271/// Merges shuffle masks and emits final shuffle instruction, if required. It
11272/// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
11273/// the actual shuffle instruction is generated only if it is actually
11274/// required. Otherwise, the shuffle instruction emission is delayed till the
11275/// end of the process, to reduce the number of emitted instructions and to ease
11276/// further analysis/transformations.
11277/// The class will also look through the previously emitted shuffle instructions
11278/// and properly mark indices in the mask as undef.
11279/// For example, given the code
11280/// \code
11281/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11282/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11283/// \endcode
11284/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
11285/// look through %s1 and %s2 and emit
11286/// \code
11287/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11288/// \endcode
11289/// instead.
11290/// If 2 operands are of different size, the smallest one will be resized and
11291/// the mask recalculated properly.
11292/// For example, given the code
11293/// \code
11294/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11295/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11296/// \endcode
11297/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
11298/// look through %s1 and %s2 and emit
11299/// \code
11300/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11301/// \endcode
11302/// instead.
11303class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11304 bool IsFinalized = false;
11305 /// Combined mask for all applied operands and masks. It is built during
11306 /// analysis and actual emission of shuffle vector instructions.
11307 SmallVector<int> CommonMask;
11308 /// List of operands for the shuffle vector instruction. It holds at most 2
11309 /// operands; if a 3rd is going to be added, the first 2 are combined into a
11310 /// shuffle with the \p CommonMask mask, the first operand is set to be the
11311 /// resulting shuffle and the second operand is set to be the newly added
11312 /// operand. The \p CommonMask is transformed in the proper way after that.
11313 SmallVector<Value *, 2> InVectors;
11314 Type *ScalarTy = nullptr;
11315 IRBuilderBase &Builder;
11316 BoUpSLP &R;
11317
11318 class ShuffleIRBuilder {
11319 IRBuilderBase &Builder;
11320 /// Holds all of the instructions that we gathered.
11321 SetVector<Instruction *> &GatherShuffleExtractSeq;
11322 /// A list of blocks that we are going to CSE.
11323 DenseSet<BasicBlock *> &CSEBlocks;
11324 /// Data layout.
11325 const DataLayout &DL;
11326
11327 public:
11328 ShuffleIRBuilder(IRBuilderBase &Builder,
11329 SetVector<Instruction *> &GatherShuffleExtractSeq,
11330 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11331 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11332 CSEBlocks(CSEBlocks), DL(DL) {}
11333 ~ShuffleIRBuilder() = default;
11334 /// Creates shufflevector for the 2 operands with the given mask.
11335 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11336 if (V1->getType() != V2->getType()) {
11337 assert(V2->getType()->isIntOrIntVectorTy() &&
11338 V1->getType()->isIntOrIntVectorTy() &&
11339 "Expected integer vector types only.");
11340 if (V1->getType() != V2->getType()) {
11341 if (cast<VectorType>(V2->getType())
11342 ->getElementType()
11343 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
11344 ->getElementType()
11345 ->getIntegerBitWidth())
11346 V2 = Builder.CreateIntCast(
11347 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
11348 else
11349 V1 = Builder.CreateIntCast(
11350 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
11351 }
11352 }
11353 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11354 if (auto *I = dyn_cast<Instruction>(Vec)) {
11355 GatherShuffleExtractSeq.insert(I);
11356 CSEBlocks.insert(I->getParent());
11357 }
11358 return Vec;
11359 }
11360 /// Creates permutation of the single vector operand with the given mask, if
11361 /// it is not identity mask.
11362 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11363 if (Mask.empty())
11364 return V1;
11365 unsigned VF = Mask.size();
11366 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
11367 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
11368 return V1;
11369 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
11370 if (auto *I = dyn_cast<Instruction>(Vec)) {
11371 GatherShuffleExtractSeq.insert(I);
11372 CSEBlocks.insert(I->getParent());
11373 }
11374 return Vec;
11375 }
11376 Value *createIdentity(Value *V) { return V; }
11377 Value *createPoison(Type *Ty, unsigned VF) {
11378 return PoisonValue::get(FixedVectorType::get(Ty, VF));
11379 }
11380 /// Resizes 2 input vectors to matching sizes, if they are not equal
11381 /// yet. The smaller vector is resized to the size of the larger vector.
11382 void resizeToMatch(Value *&V1, Value *&V2) {
11383 if (V1->getType() == V2->getType())
11384 return;
11385 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
11386 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11387 int VF = std::max(V1VF, V2VF);
11388 int MinVF = std::min(V1VF, V2VF);
11389 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11390 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
11391 0);
11392 Value *&Op = MinVF == V1VF ? V1 : V2;
11393 Op = Builder.CreateShuffleVector(Op, IdentityMask);
11394 if (auto *I = dyn_cast<Instruction>(Op)) {
11395 GatherShuffleExtractSeq.insert(I);
11396 CSEBlocks.insert(I->getParent());
11397 }
11398 if (MinVF == V1VF)
11399 V1 = Op;
11400 else
11401 V2 = Op;
11402 }
11403 };
11404
11405 /// Smart shuffle instruction emission, walks through shuffle trees and
11406 /// tries to find the best matching vector for the actual shuffle
11407 /// instruction.
11408 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11409 assert(V1 && "Expected at least one vector value.");
11410 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11411 R.CSEBlocks, *R.DL);
11412 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11413 ShuffleBuilder);
11414 }
11415
11416 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
11417 /// shuffle emission.
11418 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11419 ArrayRef<int> Mask) {
11420 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11421 if (Mask[Idx] != PoisonMaskElem)
11422 CommonMask[Idx] = Idx;
11423 }
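 // For example, after a shuffle for Mask = {3, 1, poison, 0} has been
 // emitted, the selected elements live at their own positions in the newly
 // created vector, so the lanes that Mask defines are reset to the identity
 // (CommonMask becomes {0, 1, <unchanged>, 3}).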
11424
11425 /// Cast value \p V to the vector type with the same number of elements, but
11426 /// the base type \p ScalarTy.
11427 Value *castToScalarTyElem(Value *V) {
11428 auto *VecTy = cast<VectorType>(V->getType());
11429 if (VecTy->getElementType() == ScalarTy)
11430 return V;
11431 return Builder.CreateIntCast(
11432 V, VectorType::get(ScalarTy, VecTy->getElementCount()),
11433 !isKnownNonNegative(V, SimplifyQuery(*R.DL)));
11434 }
11435
11436public:
11437 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
11438 : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11439
11440 /// Adjusts extractelements after reusing them.
11441 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11442 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11443 unsigned NumParts, bool &UseVecBaseAsInput) {
11444 UseVecBaseAsInput = false;
11445 SmallPtrSet<Value *, 4> UniqueBases;
11446 Value *VecBase = nullptr;
11447 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11448 int Idx = Mask[I];
11449 if (Idx == PoisonMaskElem)
11450 continue;
11451 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
11452 VecBase = EI->getVectorOperand();
11453 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
11454 VecBase = TE->VectorizedValue;
11455 assert(VecBase && "Expected vectorized value.");
11456 UniqueBases.insert(VecBase);
11457 // If the only use is vectorized - we can delete the extractelement
11458 // itself.
11459 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
11460 any_of(EI->users(), [&](User *U) {
11461 const TreeEntry *UTE = R.getTreeEntry(U);
11462 return !UTE || R.MultiNodeScalars.contains(U) ||
11463 (isa<GetElementPtrInst>(U) &&
11464 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
11465 count_if(R.VectorizableTree,
11466 [&](const std::unique_ptr<TreeEntry> &TE) {
11467 return any_of(TE->UserTreeIndices,
11468 [&](const EdgeInfo &Edge) {
11469 return Edge.UserTE == UTE;
11470 }) &&
11471 is_contained(TE->Scalars, EI);
11472 }) != 1;
11473 }))
11474 continue;
11475 R.eraseInstruction(EI);
11476 }
11477 if (NumParts == 1 || UniqueBases.size() == 1) {
11478 VecBase = castToScalarTyElem(VecBase);
11479 return VecBase;
11480 }
11481 UseVecBaseAsInput = true;
11482 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11483 for (auto [I, Idx] : enumerate(Mask))
11484 if (Idx != PoisonMaskElem)
11485 Idx = I;
11486 };
11487 // Perform a multi-register vector shuffle, joining the parts into a single
11488 // virtual long vector.
11489 // Need to shuffle each part independently and then insert all these parts
11490 // into a long virtual vector register, forming the original vector.
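 // For example, with NumParts == 2 over 8 extracted scalars, lanes 0-3 and
 // 4-7 are first shuffled from their own extract bases; the second half's
 // indices are then offset by the running width and both halves are merged
 // with one more two-source shuffle.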
11491 Value *Vec = nullptr;
11492 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11493 unsigned SliceSize = E->Scalars.size() / NumParts;
11494 for (unsigned Part = 0; Part < NumParts; ++Part) {
11495 ArrayRef<Value *> VL =
11496 ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
11497 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
11498 constexpr int MaxBases = 2;
11499 SmallVector<Value *, MaxBases> Bases(MaxBases);
11500#ifndef NDEBUG
11501 int PrevSize = 0;
11502#endif // NDEBUG
11503 for (const auto [I, V]: enumerate(VL)) {
11504 if (SubMask[I] == PoisonMaskElem)
11505 continue;
11506 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11507 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11508 VecOp = TE->VectorizedValue;
11509 assert(VecOp && "Expected vectorized value.");
11510 const int Size =
11511 cast<FixedVectorType>(VecOp->getType())->getNumElements();
11512#ifndef NDEBUG
11513 assert((PrevSize == Size || PrevSize == 0) &&
11514 "Expected vectors of the same size.");
11515 PrevSize = Size;
11516#endif // NDEBUG
11517 VecOp = castToScalarTyElem(VecOp);
11518 Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
11519 }
11520 if (!Bases.front())
11521 continue;
11522 Value *SubVec;
11523 if (Bases.back()) {
11524 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11525 TransformToIdentity(SubMask);
11526 } else {
11527 SubVec = Bases.front();
11528 }
11529 if (!Vec) {
11530 Vec = SubVec;
11531 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11532 [&](unsigned P) {
11533 ArrayRef<int> SubMask =
11534 Mask.slice(P * SliceSize, SliceSize);
11535 return all_of(SubMask, [](int Idx) {
11536 return Idx == PoisonMaskElem;
11537 });
11538 })) &&
11539 "Expected first part or all previous parts masked.");
11540 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11541 } else {
11542 unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11543 if (Vec->getType() != SubVec->getType()) {
11544 unsigned SubVecVF =
11545 cast<FixedVectorType>(SubVec->getType())->getNumElements();
11546 VF = std::max(VF, SubVecVF);
11547 }
11548 // Adjust SubMask.
11549 for (int &Idx : SubMask)
11550 if (Idx != PoisonMaskElem)
11551 Idx += VF;
11552 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11553 Vec = createShuffle(Vec, SubVec, VecMask);
11554 TransformToIdentity(VecMask);
11555 }
11556 }
11557 copy(VecMask, Mask.begin());
11558 return Vec;
11559 }
11560 /// Checks if the specified entry \p E needs to be delayed because of its
11561 /// dependency nodes.
11562 std::optional<Value *>
11563 needToDelay(const TreeEntry *E,
11564 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
11565 // No need to delay emission if all deps are ready.
11566 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
11567 return all_of(
11568 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
11569 }))
11570 return std::nullopt;
11571 // Postpone gather emission, will be emitted after the end of the
11572 // process to keep correct order.
11573 auto *ResVecTy = FixedVectorType::get(ScalarTy, E->getVectorFactor());
11574 return Builder.CreateAlignedLoad(
11575 ResVecTy,
11576 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
11577 MaybeAlign());
11578 }
11579 /// Adds 2 input vectors (in form of tree entries) and the mask for their
11580 /// shuffling.
11581 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
11582 add(E1.VectorizedValue, E2.VectorizedValue, Mask);
11583 }
11584 /// Adds single input vector (in form of tree entry) and the mask for its
11585 /// shuffling.
11586 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
11587 add(E1.VectorizedValue, Mask);
11588 }
11589 /// Adds 2 input vectors and the mask for their shuffling.
11590 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
11591 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
11592 V1 = castToScalarTyElem(V1);
11593 V2 = castToScalarTyElem(V2);
11594 if (InVectors.empty()) {
11595 InVectors.push_back(V1);
11596 InVectors.push_back(V2);
11597 CommonMask.assign(Mask.begin(), Mask.end());
11598 return;
11599 }
11600 Value *Vec = InVectors.front();
11601 if (InVectors.size() == 2) {
11602 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11603 transformMaskAfterShuffle(CommonMask, CommonMask);
11604 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
11605 Mask.size()) {
11606 Vec = createShuffle(Vec, nullptr, CommonMask);
11607 transformMaskAfterShuffle(CommonMask, CommonMask);
11608 }
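 // The accumulated operands were folded into Vec above when needed; the new
 // (V1, V2) pair is folded into a single value as well, and the lanes taken
 // from it are tagged with an offset of CommonMask.size() so the final
 // two-source shuffle can tell the two inputs apart.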
11609 V1 = createShuffle(V1, V2, Mask);
11610 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11611 if (Mask[Idx] != PoisonMaskElem)
11612 CommonMask[Idx] = Idx + Sz;
11613 InVectors.front() = Vec;
11614 if (InVectors.size() == 2)
11615 InVectors.back() = V1;
11616 else
11617 InVectors.push_back(V1);
11618 }
11619 /// Adds another input vector and the mask for its shuffling.
11620 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
11621 V1 = castToScalarTyElem(V1);
11622 if (InVectors.empty()) {
11623 if (!isa<FixedVectorType>(V1->getType())) {
11624 V1 = createShuffle(V1, nullptr, CommonMask);
11625 CommonMask.assign(Mask.size(), PoisonMaskElem);
11626 transformMaskAfterShuffle(CommonMask, Mask);
11627 }
11628 InVectors.push_back(V1);
11629 CommonMask.assign(Mask.begin(), Mask.end());
11630 return;
11631 }
11632 const auto *It = find(InVectors, V1);
11633 if (It == InVectors.end()) {
11634 if (InVectors.size() == 2 ||
11635 InVectors.front()->getType() != V1->getType() ||
11636 !isa<FixedVectorType>(V1->getType())) {
11637 Value *V = InVectors.front();
11638 if (InVectors.size() == 2) {
11639 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11640 transformMaskAfterShuffle(CommonMask, CommonMask);
11641 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11642 CommonMask.size()) {
11643 V = createShuffle(InVectors.front(), nullptr, CommonMask);
11644 transformMaskAfterShuffle(CommonMask, CommonMask);
11645 }
11646 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11647 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
11648 CommonMask[Idx] =
11649 V->getType() != V1->getType()
11650 ? Idx + Sz
11651 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
11652 ->getNumElements();
11653 if (V->getType() != V1->getType())
11654 V1 = createShuffle(V1, nullptr, Mask);
11655 InVectors.front() = V;
11656 if (InVectors.size() == 2)
11657 InVectors.back() = V1;
11658 else
11659 InVectors.push_back(V1);
11660 return;
11661 }
11662 // Check if second vector is required if the used elements are already
11663 // used from the first one.
11664 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11665 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
11666 InVectors.push_back(V1);
11667 break;
11668 }
11669 }
11670 int VF = CommonMask.size();
11671 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
11672 VF = FTy->getNumElements();
11673 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11674 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
11675 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
11676 }
11677 /// Adds another input vector and the mask for its shuffling.
11678 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
11679 SmallVector<int> NewMask;
11680 inversePermutation(Order, NewMask);
11681 add(V1, NewMask);
11682 }
11683 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
11684 Value *Root = nullptr) {
11685 return R.gather(VL, Root, ScalarTy);
11686 }
11687 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
11688 /// Finalize emission of the shuffles.
11689 /// \param Action the action (if any) to be performed before the final
11690 /// application of the \p ExtMask mask.
11691 Value *
11692 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
11693 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
11694 IsFinalized = true;
11695 if (Action) {
11696 Value *Vec = InVectors.front();
11697 if (InVectors.size() == 2) {
11698 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11699 InVectors.pop_back();
11700 } else {
11701 Vec = createShuffle(Vec, nullptr, CommonMask);
11702 }
11703 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11704 if (CommonMask[Idx] != PoisonMaskElem)
11705 CommonMask[Idx] = Idx;
11706 assert(VF > 0 &&
11707 "Expected vector length for the final value before action.");
11708 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11709 if (VecVF < VF) {
11710 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
11711 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11712 Vec = createShuffle(Vec, nullptr, ResizeMask);
11713 }
11714 Action(Vec, CommonMask);
11715 InVectors.front() = Vec;
11716 }
11717 if (!ExtMask.empty()) {
11718 if (CommonMask.empty()) {
11719 CommonMask.assign(ExtMask.begin(), ExtMask.end());
11720 } else {
11721 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11722 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11723 if (ExtMask[I] == PoisonMaskElem)
11724 continue;
11725 NewMask[I] = CommonMask[ExtMask[I]];
11726 }
11727 CommonMask.swap(NewMask);
11728 }
11729 }
11730 if (CommonMask.empty()) {
11731 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11732 return InVectors.front();
11733 }
11734 if (InVectors.size() == 2)
11735 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11736 return createShuffle(InVectors.front(), nullptr, CommonMask);
11737 }
11738
11739 ~ShuffleInstructionBuilder() {
11740 assert((IsFinalized || CommonMask.empty()) &&
11741 "Shuffle construction must be finalized.");
11742 }
11743};
11744
11745Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
11746 bool PostponedPHIs) {
11747 ValueList &VL = E->getOperand(NodeIdx);
11748 const unsigned VF = VL.size();
11749 InstructionsState S = getSameOpcode(VL, *TLI);
11750 // Special processing for GEPs bundle, which may include non-gep values.
11751 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11752 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11753 if (It != VL.end())
11754 S = getSameOpcode(*It, *TLI);
11755 }
11756 if (S.getOpcode()) {
11757 auto CheckSameVE = [&](const TreeEntry *VE) {
11758 return VE->isSame(VL) &&
11759 (any_of(VE->UserTreeIndices,
11760 [E, NodeIdx](const EdgeInfo &EI) {
11761 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11762 }) ||
11763 any_of(VectorizableTree,
11764 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
11765 return TE->isOperandGatherNode({E, NodeIdx}) &&
11766 VE->isSame(TE->Scalars);
11767 }));
11768 };
11769 TreeEntry *VE = getTreeEntry(S.OpValue);
11770 bool IsSameVE = VE && CheckSameVE(VE);
11771 if (!IsSameVE) {
11772 auto It = MultiNodeScalars.find(S.OpValue);
11773 if (It != MultiNodeScalars.end()) {
11774 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
11775 return TE != VE && CheckSameVE(TE);
11776 });
11777 if (I != It->getSecond().end()) {
11778 VE = *I;
11779 IsSameVE = true;
11780 }
11781 }
11782 }
11783 if (IsSameVE) {
11784 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
11785 ShuffleInstructionBuilder ShuffleBuilder(
11786 cast<VectorType>(V->getType())->getElementType(), Builder, *this);
11787 ShuffleBuilder.add(V, Mask);
11788 return ShuffleBuilder.finalize(std::nullopt);
11789 };
11790 Value *V = vectorizeTree(VE, PostponedPHIs);
11791 if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
11792 if (!VE->ReuseShuffleIndices.empty()) {
11793 // Reshuffle to get only unique values.
11794 // If some of the scalars are duplicated in the vectorization
11795 // tree entry, we do not vectorize them but instead generate a
11796 // mask for the reuses. But if there are several users of the
11797 // same entry, they may have different vectorization factors.
11798 // This is especially important for PHI nodes. In this case, we
11799 // need to adapt the resulting instruction for the user
11800 // vectorization factor and have to reshuffle it again to take
11801 // only unique elements of the vector. Without this code the
11802 // function incorrectly returns reduced vector instruction with
11803 // the same elements, not with the unique ones.
11804
11805 // block:
11806 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
11807 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
11808 // ... (use %2)
11809 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
11810 // br %block
11811 SmallVector<int> Mask(VF, PoisonMaskElem);
11812 for (auto [I, V] : enumerate(VL)) {
11813 if (isa<PoisonValue>(V))
11814 continue;
11815 Mask[I] = VE->findLaneForValue(V);
11816 }
11817 V = FinalShuffle(V, Mask);
11818 } else {
11819 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
11820 "Expected vectorization factor less "
11821 "than original vector size.");
11822 SmallVector<int> UniformMask(VF, 0);
11823 std::iota(UniformMask.begin(), UniformMask.end(), 0);
11824 V = FinalShuffle(V, UniformMask);
11825 }
11826 }
11827 // Need to update the operand gather node, if the operand is actually not a
11828 // vectorized node, but a buildvector/gather node which matches one of
11829 // the vectorized nodes.
11830 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
11831 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11832 }) == VE->UserTreeIndices.end()) {
11833 auto *It = find_if(
11834 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11835 return TE->State == TreeEntry::NeedToGather &&
11836 TE->UserTreeIndices.front().UserTE == E &&
11837 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
11838 });
11839 assert(It != VectorizableTree.end() && "Expected gather node operand.");
11840 (*It)->VectorizedValue = V;
11841 }
11842 return V;
11843 }
11844 }
11845
11846 // Find the corresponding gather entry and vectorize it.
11847 // This allows us to be more accurate with tree/graph transformations and
11848 // checks the correctness of the transformations in many cases.
11849 auto *I = find_if(VectorizableTree,
11850 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
11851 return TE->isOperandGatherNode({E, NodeIdx});
11852 });
11853 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
11854 assert(I->get()->UserTreeIndices.size() == 1 &&
11855 "Expected only single user for the gather node.");
11856 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
11857 return vectorizeTree(I->get(), PostponedPHIs);
11858}
11859
11860template <typename BVTy, typename ResTy, typename... Args>
11861ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
11862 Args &...Params) {
11863 assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
11864 unsigned VF = E->getVectorFactor();
11865
11866 bool NeedFreeze = false;
11867 SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
11868 E->ReuseShuffleIndices.end());
11869 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
11870 // Build a mask out of the reorder indices and reorder scalars per this
11871 // mask.
11872 SmallVector<int> ReorderMask;
11873 inversePermutation(E->ReorderIndices, ReorderMask);
11874 if (!ReorderMask.empty())
11875 reorderScalars(GatheredScalars, ReorderMask);
11876 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
11877 unsigned I, unsigned SliceSize) {
11878 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
11879 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
11880 }))
11881 return false;
11882 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
11883 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
11884 if (UserTE->getNumOperands() != 2)
11885 return false;
11886 auto *It =
11887 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
11888 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
11889 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
11890 }) != TE->UserTreeIndices.end();
11891 });
11892 if (It == VectorizableTree.end())
11893 return false;
11894 int Idx;
11895 if ((Mask.size() < InputVF &&
11896 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
11897 Idx == 0) ||
11898 (Mask.size() == InputVF &&
11899 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
11900 std::iota(std::next(Mask.begin(), I * SliceSize),
11901 std::next(Mask.begin(), (I + 1) * SliceSize), 0);
11902 } else {
11903 unsigned IVal =
11904 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
11905 std::fill(std::next(Mask.begin(), I * SliceSize),
11906 std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
11907 }
11908 return true;
11909 };
11910 BVTy ShuffleBuilder(ScalarTy, Params...);
11911 ResTy Res = ResTy();
11912 SmallVector<int> Mask;
11913 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
11914 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
11915 Value *ExtractVecBase = nullptr;
11916 bool UseVecBaseAsInput = false;
11917 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
11918 SmallVector<SmallVector<const TreeEntry *>> Entries;
11919 Type *OrigScalarTy = GatheredScalars.front()->getType();
11920 auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
11921 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11922 if (NumParts == 0 || NumParts >= GatheredScalars.size())
11923 NumParts = 1;
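 // NumParts approximates how many vector registers the gathered sequence
 // spans; the extract- and gather-shuffle analysis below runs once per part.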
11924 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
11925 // Check for gathered extracts.
11926 bool Resized = false;
11927 ExtractShuffles =
11928 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
11929 if (!ExtractShuffles.empty()) {
11930 SmallVector<const TreeEntry *> ExtractEntries;
11931 for (auto [Idx, I] : enumerate(ExtractMask)) {
11932 if (I == PoisonMaskElem)
11933 continue;
11934 if (const auto *TE = getTreeEntry(
11935 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
11936 ExtractEntries.push_back(TE);
11937 }
11938 if (std::optional<ResTy> Delayed =
11939 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
11940 // Delay emission of gathers which are not ready yet.
11941 PostponedGathers.insert(E);
11942 // Postpone gather emission; it will be emitted after the end of the
11943 // process to keep the correct order.
11944 return *Delayed;
11945 }
11946 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
11947 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
11948 ExtractVecBase = VecBase;
11949 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
11950 if (VF == VecBaseTy->getNumElements() &&
11951 GatheredScalars.size() != VF) {
11952 Resized = true;
11953 GatheredScalars.append(VF - GatheredScalars.size(),
11954 PoisonValue::get(OrigScalarTy));
11955 }
11956 }
11957 }
11958 // Gather extracts only after we have checked for fully matched gathers.
11959 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
11960 E->isAltShuffle() ||
11961 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
11962 isSplat(E->Scalars) ||
11963 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
11964 GatherShuffles =
11965 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
11966 }
11967 if (!GatherShuffles.empty()) {
11968 if (std::optional<ResTy> Delayed =
11969 ShuffleBuilder.needToDelay(E, Entries)) {
11970 // Delay emission of gathers which are not ready yet.
11971 PostponedGathers.insert(E);
11972 // Postpone gather emission; it will be emitted after the end of the
11973 // process to keep the correct order.
11974 return *Delayed;
11975 }
11976 if (GatherShuffles.size() == 1 &&
11977 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
11978 Entries.front().front()->isSame(E->Scalars)) {
11979 // Perfect match in the graph, will reuse the previously vectorized
11980 // node. Cost is 0.
11981 LLVM_DEBUG(
11982 dbgs()
11983 << "SLP: perfect diamond match for gather bundle "
11984 << shortBundleName(E->Scalars) << ".\n");
11985 // Restore the mask for previous partially matched values.
11986 Mask.resize(E->Scalars.size());
11987 const TreeEntry *FrontTE = Entries.front().front();
11988 if (FrontTE->ReorderIndices.empty() &&
11989 ((FrontTE->ReuseShuffleIndices.empty() &&
11990 E->Scalars.size() == FrontTE->Scalars.size()) ||
11991 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
11992 std::iota(Mask.begin(), Mask.end(), 0);
11993 } else {
11994 for (auto [I, V] : enumerate(E->Scalars)) {
11995 if (isa<PoisonValue>(V)) {
11996 Mask[I] = PoisonMaskElem;
11997 continue;
11998 }
11999 Mask[I] = FrontTE->findLaneForValue(V);
12000 }
12001 }
12002 ShuffleBuilder.add(*FrontTE, Mask);
12003 Res = ShuffleBuilder.finalize(E->getCommonMask());
12004 return Res;
12005 }
12006 if (!Resized) {
12007 if (GatheredScalars.size() != VF &&
12008 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
12009 return any_of(TEs, [&](const TreeEntry *TE) {
12010 return TE->getVectorFactor() == VF;
12011 });
12012 }))
12013 GatheredScalars.append(VF - GatheredScalars.size(),
12014 PoisonValue::get(OrigScalarTy));
12015 }
12016 // Remove shuffled elements from list of gathers.
12017 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12018 if (Mask[I] != PoisonMaskElem)
12019 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12020 }
12021 }
12022 }
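// Helper that packs the remaining scalars into a single build vector:
// duplicate values are folded through ReuseMask, splats become a broadcast of
// the first lane, and non-poison undefs are either redirected to a known
// non-poisonous scalar or replaced by poison (with a freeze emitted at the end).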
12023 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
12024 SmallVectorImpl<int> &ReuseMask,
12025 bool IsRootPoison) {
12026 // For splats we can emit broadcasts instead of gathers, so try to find
12027 // such sequences.
12028 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
12029 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
12030 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
12031 SmallVector<int> UndefPos;
12032 DenseMap<Value *, unsigned> UniquePositions;
12033 // Gather unique non-const values and all constant values.
12034 // For repeated values, just shuffle them.
12035 int NumNonConsts = 0;
12036 int SinglePos = 0;
12037 for (auto [I, V] : enumerate(Scalars)) {
12038 if (isa<UndefValue>(V)) {
12039 if (!isa<PoisonValue>(V)) {
12040 ReuseMask[I] = I;
12041 UndefPos.push_back(I);
12042 }
12043 continue;
12044 }
12045 if (isConstant(V)) {
12046 ReuseMask[I] = I;
12047 continue;
12048 }
12049 ++NumNonConsts;
12050 SinglePos = I;
12051 Value *OrigV = V;
12052 Scalars[I] = PoisonValue::get(OrigScalarTy);
12053 if (IsSplat) {
12054 Scalars.front() = OrigV;
12055 ReuseMask[I] = 0;
12056 } else {
12057 const auto Res = UniquePositions.try_emplace(OrigV, I);
12058 Scalars[Res.first->second] = OrigV;
12059 ReuseMask[I] = Res.first->second;
12060 }
12061 }
12062 if (NumNonConsts == 1) {
12063 // Restore single insert element.
12064 if (IsSplat) {
12065 ReuseMask.assign(VF, PoisonMaskElem);
12066 std::swap(Scalars.front(), Scalars[SinglePos]);
12067 if (!UndefPos.empty() && UndefPos.front() == 0)
12068 Scalars.front() = UndefValue::get(OrigScalarTy);
12069 }
12070 ReuseMask[SinglePos] = SinglePos;
12071 } else if (!UndefPos.empty() && IsSplat) {
12072 // For undef values, try to replace them with a simple broadcast.
12073 // We can do this if the broadcast value is guaranteed to be
12074 // non-poisonous, or by freezing the incoming scalar value first.
12075 auto *It = find_if(Scalars, [this, E](Value *V) {
12076 return !isa<UndefValue>(V) &&
12077 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12078 (E->UserTreeIndices.size() == 1 &&
12079 any_of(V->uses(), [E](const Use &U) {
12080 // Check if the value is already used in the same operation in
12081 // one of the nodes.
12082 return E->UserTreeIndices.front().EdgeIdx !=
12083 U.getOperandNo() &&
12084 is_contained(
12085 E->UserTreeIndices.front().UserTE->Scalars,
12086 U.getUser());
12087 })));
12088 });
12089 if (It != Scalars.end()) {
12090 // Replace undefs by the non-poisoned scalars and emit broadcast.
12091 int Pos = std::distance(Scalars.begin(), It);
12092 for (int I : UndefPos) {
12093 // Set the undef position to the non-poisoned scalar.
12094 ReuseMask[I] = Pos;
12095 // Replace the undef with poison; in the mask it has already been
12096 // replaced by the non-poisoned scalar.
12097 if (I != Pos)
12098 Scalars[I] = PoisonValue::get(OrigScalarTy);
12099 }
12100 } else {
12101 // Replace the undefs with poison, emit the broadcast and then emit
12102 // a freeze.
12103 for (int I : UndefPos) {
12104 ReuseMask[I] = PoisonMaskElem;
12105 if (isa<UndefValue>(Scalars[I]))
12106 Scalars[I] = PoisonValue::get(OrigScalarTy);
12107 }
12108 NeedFreeze = true;
12109 }
12110 }
12111 };
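// Combine the available sources: shuffles of the vectors feeding the gathered
// extractelements, shuffles of previously vectorized tree entries, and a
// build vector for whatever constants/scalars remain.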
12112 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12113 bool IsNonPoisoned = true;
12114 bool IsUsedInExpr = true;
12115 Value *Vec1 = nullptr;
12116 if (!ExtractShuffles.empty()) {
12117 // Gather of extractelements can be represented as just a shuffle of
12118 // a single/two vectors the scalars are extracted from.
12119 // Find input vectors.
12120 Value *Vec2 = nullptr;
12121 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12122 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12123 ExtractMask[I] = PoisonMaskElem;
12124 }
12125 if (UseVecBaseAsInput) {
12126 Vec1 = ExtractVecBase;
12127 } else {
12128 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12129 if (ExtractMask[I] == PoisonMaskElem)
12130 continue;
12131 if (isa<UndefValue>(E->Scalars[I]))
12132 continue;
12133 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12134 Value *VecOp = EI->getVectorOperand();
12135 if (const auto *TE = getTreeEntry(VecOp))
12136 if (TE->VectorizedValue)
12137 VecOp = TE->VectorizedValue;
12138 if (!Vec1) {
12139 Vec1 = VecOp;
12140 } else if (Vec1 != VecOp) {
12141 assert((!Vec2 || Vec2 == VecOp) &&
12142 "Expected only 1 or 2 vectors shuffle.");
12143 Vec2 = VecOp;
12144 }
12145 }
12146 }
12147 if (Vec2) {
12148 IsUsedInExpr = false;
12149 IsNonPoisoned &=
12150 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12151 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12152 } else if (Vec1) {
12153 IsUsedInExpr &= FindReusedSplat(
12154 ExtractMask,
12155 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12156 ExtractMask.size());
12157 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12158 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12159 } else {
12160 IsUsedInExpr = false;
12161 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
12162 /*ForExtracts=*/true);
12163 }
12164 }
12165 if (!GatherShuffles.empty()) {
12166 unsigned SliceSize = E->Scalars.size() / NumParts;
12167 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12168 for (const auto [I, TEs] : enumerate(Entries)) {
12169 if (TEs.empty()) {
12170 assert(!GatherShuffles[I] &&
12171 "No shuffles with empty entries list expected.");
12172 continue;
12173 }
12174 assert((TEs.size() == 1 || TEs.size() == 2) &&
12175 "Expected shuffle of 1 or 2 entries.");
12176 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
12177 VecMask.assign(VecMask.size(), PoisonMaskElem);
12178 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12179 if (TEs.size() == 1) {
12180 IsUsedInExpr &= FindReusedSplat(
12181 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12182 ShuffleBuilder.add(*TEs.front(), VecMask);
12183 if (TEs.front()->VectorizedValue)
12184 IsNonPoisoned &=
12185 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12186 } else {
12187 IsUsedInExpr = false;
12188 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12189 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12190 IsNonPoisoned &=
12191 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12192 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12193 }
12194 }
12195 }
12196 // Try to figure out the best way to combine the values: build a shuffle
12197 // and insert elements, or just build several shuffles.
12198 // Insert non-constant scalars.
12199 SmallVector<Value *> NonConstants(GatheredScalars);
12200 int EMSz = ExtractMask.size();
12201 int MSz = Mask.size();
12202 // Try to build a constant vector and shuffle with it only if we currently
12203 // have a single permutation and more than 1 scalar constant.
12204 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12205 bool IsIdentityShuffle =
12206 ((UseVecBaseAsInput ||
12207 all_of(ExtractShuffles,
12208 [](const std::optional<TTI::ShuffleKind> &SK) {
12209 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12210 TTI::SK_PermuteSingleSrc;
12211 })) &&
12212 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12213 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
12214 (!GatherShuffles.empty() &&
12215 all_of(GatherShuffles,
12216 [](const std::optional<TTI::ShuffleKind> &SK) {
12217 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12218 TTI::SK_PermuteSingleSrc;
12219 }) &&
12220 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12221 ShuffleVectorInst::isIdentityMask(Mask, MSz));
12222 bool EnoughConstsForShuffle =
12223 IsSingleShuffle &&
12224 (none_of(GatheredScalars,
12225 [](Value *V) {
12226 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12227 }) ||
12228 any_of(GatheredScalars,
12229 [](Value *V) {
12230 return isa<Constant>(V) && !isa<UndefValue>(V);
12231 })) &&
12232 (!IsIdentityShuffle ||
12233 (GatheredScalars.size() == 2 &&
12234 any_of(GatheredScalars,
12235 [](Value *V) { return !isa<UndefValue>(V); })) ||
12236 count_if(GatheredScalars, [](Value *V) {
12237 return isa<Constant>(V) && !isa<PoisonValue>(V);
12238 }) > 1);
12239 // The NonConstants array contains just the non-constant values; GatheredScalars
12240 // contains only the constants used to build the final vector, which is then shuffled.
12241 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12242 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
12243 NonConstants[I] = PoisonValue::get(OrigScalarTy);
12244 else
12245 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12246 }
12247 // Generate constants for final shuffle and build a mask for them.
12248 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12249 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12250 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12251 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12252 ShuffleBuilder.add(BV, BVMask);
12253 }
12254 if (all_of(NonConstants, [=](Value *V) {
12255 return isa<PoisonValue>(V) ||
12256 (IsSingleShuffle && ((IsIdentityShuffle &&
12257 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12258 }))
12259 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12260 else
12261 Res = ShuffleBuilder.finalize(
12262 E->ReuseShuffleIndices, E->Scalars.size(),
12263 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12264 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12265 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12266 });
12267 } else if (!allConstant(GatheredScalars)) {
12268 // Gather unique scalars and all constants.
12269 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12270 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12271 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12272 ShuffleBuilder.add(BV, ReuseMask);
12273 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12274 } else {
12275 // Gather all constants.
12276 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12277 for (auto [I, V] : enumerate(E->Scalars)) {
12278 if (!isa<PoisonValue>(V))
12279 Mask[I] = I;
12280 }
12281 Value *BV = ShuffleBuilder.gather(E->Scalars);
12282 ShuffleBuilder.add(BV, Mask);
12283 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12284 }
12285
12286 if (NeedFreeze)
12287 Res = ShuffleBuilder.createFreeze(Res);
12288 return Res;
12289}
12290
12291Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
12292 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12293 Builder, *this);
12294}
12295
12296Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12297 IRBuilderBase::InsertPointGuard Guard(Builder);
12298
12299 if (E->VectorizedValue &&
12300 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12301 E->isAltShuffle())) {
12302 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12303 return E->VectorizedValue;
12304 }
12305
12306 Value *V = E->Scalars.front();
12307 Type *ScalarTy = V->getType();
12308 if (auto *Store = dyn_cast<StoreInst>(V))
12309 ScalarTy = Store->getValueOperand()->getType();
12310 else if (auto *IE = dyn_cast<InsertElementInst>(V))
12311 ScalarTy = IE->getOperand(1)->getType();
12312 auto It = MinBWs.find(E);
12313 if (It != MinBWs.end())
12314 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
12315 auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
12316 if (E->State == TreeEntry::NeedToGather) {
12317 // Set insert point for non-reduction initial nodes.
12318 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12319 setInsertPointAfterBundle(E);
12320 Value *Vec = createBuildVector(E, ScalarTy);
12321 E->VectorizedValue = Vec;
12322 return Vec;
12323 }
12324
12325 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
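// Applies the node's reorder and reuse-shuffle indices to a freshly created
// vector value before it is recorded as the vectorized value of the node.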
12326 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12327 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
12328 if (E->getOpcode() == Instruction::Store) {
12329 ArrayRef<int> Mask =
12330 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12331 E->ReorderIndices.size());
12332 ShuffleBuilder.add(V, Mask);
12333 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12334 ShuffleBuilder.addOrdered(V, std::nullopt);
12335 } else {
12336 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12337 }
12338 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12339 };
12340
12341 assert((E->State == TreeEntry::Vectorize ||
12342 E->State == TreeEntry::ScatterVectorize ||
12343 E->State == TreeEntry::StridedVectorize) &&
12344 "Unhandled state");
12345 unsigned ShuffleOrOp =
12346 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12347 Instruction *VL0 = E->getMainOp();
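// Returns true if an operand emitted with a minimized bitwidth has to be
// sign-extended (some scalar may be negative) rather than zero-extended when
// it is cast back to the expected vector type.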
12348 auto GetOperandSignedness = [&](unsigned Idx) {
12349 const TreeEntry *OpE = getOperandEntry(E, Idx);
12350 bool IsSigned = false;
12351 auto It = MinBWs.find(OpE);
12352 if (It != MinBWs.end())
12353 IsSigned = It->second.second;
12354 else
12355 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
12356 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12357 });
12358 return IsSigned;
12359 };
12360 switch (ShuffleOrOp) {
12361 case Instruction::PHI: {
12362 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12363 E != VectorizableTree.front().get() ||
12364 !E->UserTreeIndices.empty()) &&
12365 "PHI reordering is free.");
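// PHIs are emitted in two phases: first an empty vector PHI is created (and
// returned while PostponedPHIs is true), then the incoming values are filled
// in once all PHIs of the tree exist, to break cyclic dependencies.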
12366 if (PostponedPHIs && E->VectorizedValue)
12367 return E->VectorizedValue;
12368 auto *PH = cast<PHINode>(VL0);
12369 Builder.SetInsertPoint(PH->getParent(),
12370 PH->getParent()->getFirstNonPHIIt());
12371 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12372 if (PostponedPHIs || !E->VectorizedValue) {
12373 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
12374 E->PHI = NewPhi;
12375 Value *V = NewPhi;
12376
12377 // Adjust insertion point once all PHI's have been generated.
12378 Builder.SetInsertPoint(PH->getParent(),
12379 PH->getParent()->getFirstInsertionPt());
12380 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12381
12382 V = FinalShuffle(V, E, VecTy);
12383
12384 E->VectorizedValue = V;
12385 if (PostponedPHIs)
12386 return V;
12387 }
12388 PHINode *NewPhi = cast<PHINode>(E->PHI);
12389 // If phi node is fully emitted - exit.
12390 if (NewPhi->getNumIncomingValues() != 0)
12391 return NewPhi;
12392
12393 // PHINodes may have multiple entries from the same block. We want to
12394 // visit every block once.
12395 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12396
12397 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12399 BasicBlock *IBB = PH->getIncomingBlock(I);
12400
12401 // Stop emission if all incoming values are generated.
12402 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12403 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12404 return NewPhi;
12405 }
12406
12407 if (!VisitedBBs.insert(IBB).second) {
12408 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
12409 continue;
12410 }
12411
12412 Builder.SetInsertPoint(IBB->getTerminator());
12413 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12414 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
12415 if (VecTy != Vec->getType()) {
12416 assert((It != MinBWs.end() ||
12417 getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
12418 MinBWs.contains(getOperandEntry(E, I))) &&
12419 "Expected item in MinBWs.");
12420 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
12421 }
12422 NewPhi->addIncoming(Vec, IBB);
12423 }
12424
12425 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12426 "Invalid number of incoming values");
12427 return NewPhi;
12428 }
12429
12430 case Instruction::ExtractElement: {
12431 Value *V = E->getSingleOperand(0);
12432 if (const TreeEntry *TE = getTreeEntry(V))
12433 V = TE->VectorizedValue;
12434 setInsertPointAfterBundle(E);
12435 V = FinalShuffle(V, E, VecTy);
12436 E->VectorizedValue = V;
12437 return V;
12438 }
12439 case Instruction::ExtractValue: {
12440 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12441 Builder.SetInsertPoint(LI);
12442 Value *Ptr = LI->getPointerOperand();
12443 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
12444 Value *NewV = propagateMetadata(V, E->Scalars);
12445 NewV = FinalShuffle(NewV, E, VecTy);
12446 E->VectorizedValue = NewV;
12447 return NewV;
12448 }
12449 case Instruction::InsertElement: {
12450 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12451 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
12452 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
12453 ArrayRef<Value *> Op = E->getOperand(1);
12454 Type *ScalarTy = Op.front()->getType();
12455 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
12456 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12457 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
12458 assert(Res.first > 0 && "Expected item in MinBWs.");
12459 V = Builder.CreateIntCast(
12460 V,
12461 FixedVectorType::get(
12462 ScalarTy,
12463 cast<FixedVectorType>(V->getType())->getNumElements()),
12464 Res.second);
12465 }
12466
12467 // Create InsertVector shuffle if necessary
12468 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
12469 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12470 }));
12471 const unsigned NumElts =
12472 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12473 const unsigned NumScalars = E->Scalars.size();
12474
12475 unsigned Offset = *getInsertIndex(VL0);
12476 assert(Offset < NumElts && "Failed to find vector index offset");
12477
12478 // Create shuffle to resize vector
12479 SmallVector<int> Mask;
12480 if (!E->ReorderIndices.empty()) {
12481 inversePermutation(E->ReorderIndices, Mask);
12482 Mask.append(NumElts - NumScalars, PoisonMaskElem);
12483 } else {
12484 Mask.assign(NumElts, PoisonMaskElem);
12485 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
12486 }
12487 // Create InsertVector shuffle if necessary
12488 bool IsIdentity = true;
12489 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12490 Mask.swap(PrevMask);
12491 for (unsigned I = 0; I < NumScalars; ++I) {
12492 Value *Scalar = E->Scalars[PrevMask[I]];
12493 unsigned InsertIdx = *getInsertIndex(Scalar);
12494 IsIdentity &= InsertIdx - Offset == I;
12495 Mask[InsertIdx - Offset] = I;
12496 }
12497 if (!IsIdentity || NumElts != NumScalars) {
12498 Value *V2 = nullptr;
12499 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12500 SmallVector<int> InsertMask(Mask);
12501 if (NumElts != NumScalars && Offset == 0) {
12502 // Follow all insert element instructions from the current buildvector
12503 // sequence.
12504 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
12505 do {
12506 std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
12507 if (!InsertIdx)
12508 break;
12509 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12510 InsertMask[*InsertIdx] = *InsertIdx;
12511 if (!Ins->hasOneUse())
12512 break;
12513 Ins = dyn_cast_or_null<InsertElementInst>(
12514 Ins->getUniqueUndroppableUser());
12515 } while (Ins);
12516 SmallBitVector UseMask =
12517 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12518 SmallBitVector IsFirstPoison =
12519 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12520 SmallBitVector IsFirstUndef =
12521 isUndefVector(FirstInsert->getOperand(0), UseMask);
12522 if (!IsFirstPoison.all()) {
12523 unsigned Idx = 0;
12524 for (unsigned I = 0; I < NumElts; I++) {
12525 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
12526 IsFirstUndef.test(I)) {
12527 if (IsVNonPoisonous) {
12528 InsertMask[I] = I < NumScalars ? I : 0;
12529 continue;
12530 }
12531 if (!V2)
12532 V2 = UndefValue::get(V->getType());
12533 if (Idx >= NumScalars)
12534 Idx = NumScalars - 1;
12535 InsertMask[I] = NumScalars + Idx;
12536 ++Idx;
12537 } else if (InsertMask[I] != PoisonMaskElem &&
12538 Mask[I] == PoisonMaskElem) {
12539 InsertMask[I] = PoisonMaskElem;
12540 }
12541 }
12542 } else {
12543 InsertMask = Mask;
12544 }
12545 }
12546 if (!V2)
12547 V2 = PoisonValue::get(V->getType());
12548 V = Builder.CreateShuffleVector(V, V2, InsertMask);
12549 if (auto *I = dyn_cast<Instruction>(V)) {
12550 GatherShuffleExtractSeq.insert(I);
12551 CSEBlocks.insert(I->getParent());
12552 }
12553 }
12554
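// If the vectorized value does not cover the whole destination vector, blend
// it with the original incoming vector of the first insertelement through one
// more shuffle.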
12555 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
12556 for (unsigned I = 0; I < NumElts; I++) {
12557 if (Mask[I] != PoisonMaskElem)
12558 InsertMask[Offset + I] = I;
12559 }
12560 SmallBitVector UseMask =
12561 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12562 SmallBitVector IsFirstUndef =
12563 isUndefVector(FirstInsert->getOperand(0), UseMask);
12564 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
12565 NumElts != NumScalars) {
12566 if (IsFirstUndef.all()) {
12567 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
12568 SmallBitVector IsFirstPoison =
12569 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12570 if (!IsFirstPoison.all()) {
12571 for (unsigned I = 0; I < NumElts; I++) {
12572 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
12573 InsertMask[I] = I + NumElts;
12574 }
12575 }
12576 V = Builder.CreateShuffleVector(
12577 V,
12578 IsFirstPoison.all() ? PoisonValue::get(V->getType())
12579 : FirstInsert->getOperand(0),
12580 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
12581 if (auto *I = dyn_cast<Instruction>(V)) {
12582 GatherShuffleExtractSeq.insert(I);
12583 CSEBlocks.insert(I->getParent());
12584 }
12585 }
12586 } else {
12587 SmallBitVector IsFirstPoison =
12588 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12589 for (unsigned I = 0; I < NumElts; I++) {
12590 if (InsertMask[I] == PoisonMaskElem)
12591 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
12592 else
12593 InsertMask[I] += NumElts;
12594 }
12595 V = Builder.CreateShuffleVector(
12596 FirstInsert->getOperand(0), V, InsertMask,
12597 cast<Instruction>(E->Scalars.back())->getName());
12598 if (auto *I = dyn_cast<Instruction>(V)) {
12599 GatherShuffleExtractSeq.insert(I);
12600 CSEBlocks.insert(I->getParent());
12601 }
12602 }
12603 }
12604
12605 ++NumVectorInstructions;
12606 E->VectorizedValue = V;
12607 return V;
12608 }
12609 case Instruction::ZExt:
12610 case Instruction::SExt:
12611 case Instruction::FPToUI:
12612 case Instruction::FPToSI:
12613 case Instruction::FPExt:
12614 case Instruction::PtrToInt:
12615 case Instruction::IntToPtr:
12616 case Instruction::SIToFP:
12617 case Instruction::UIToFP:
12618 case Instruction::Trunc:
12619 case Instruction::FPTrunc:
12620 case Instruction::BitCast: {
12621 setInsertPointAfterBundle(E);
12622
12623 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
12624 if (E->VectorizedValue) {
12625 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12626 return E->VectorizedValue;
12627 }
12628
12629 auto *CI = cast<CastInst>(VL0);
12630 Instruction::CastOps VecOpcode = CI->getOpcode();
12631 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
12632 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
12633 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
12634 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
12635 SrcScalarTy != CI->getOperand(0)->getType())) {
12636 // Check if the values are candidates to demote.
12637 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
12638 if (SrcIt != MinBWs.end())
12639 SrcBWSz = SrcIt->second.first;
12640 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
12641 if (BWSz == SrcBWSz) {
12642 VecOpcode = Instruction::BitCast;
12643 } else if (BWSz < SrcBWSz) {
12644 VecOpcode = Instruction::Trunc;
12645 } else if (It != MinBWs.end()) {
12646 assert(BWSz > SrcBWSz && "Invalid cast!");
12647 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12648 } else if (SrcIt != MinBWs.end()) {
12649 assert(BWSz > SrcBWSz && "Invalid cast!");
12650 VecOpcode =
12651 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12652 }
12653 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
12654 !SrcIt->second.second) {
12655 VecOpcode = Instruction::UIToFP;
12656 }
12657 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12658 ? InVec
12659 : Builder.CreateCast(VecOpcode, InVec, VecTy);
12660 V = FinalShuffle(V, E, VecTy);
12661
12662 E->VectorizedValue = V;
12663 ++NumVectorInstructions;
12664 return V;
12665 }
12666 case Instruction::FCmp:
12667 case Instruction::ICmp: {
12668 setInsertPointAfterBundle(E);
12669
12670 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
12671 if (E->VectorizedValue) {
12672 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12673 return E->VectorizedValue;
12674 }
12675 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
12676 if (E->VectorizedValue) {
12677 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12678 return E->VectorizedValue;
12679 }
12680 if (L->getType() != R->getType()) {
12681 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12682 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12683 MinBWs.contains(getOperandEntry(E, 0)) ||
12684 MinBWs.contains(getOperandEntry(E, 1))) &&
12685 "Expected item in MinBWs.");
12686 if (cast<VectorType>(L->getType())
12687 ->getElementType()
12688 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
12689 ->getElementType()
12690 ->getIntegerBitWidth()) {
12691 Type *CastTy = R->getType();
12692 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
12693 } else {
12694 Type *CastTy = L->getType();
12695 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
12696 }
12697 }
12698
12699 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
12700 Value *V = Builder.CreateCmp(P0, L, R);
12701 propagateIRFlags(V, E->Scalars, VL0);
12702 // Do not cast for cmps.
12703 VecTy = cast<FixedVectorType>(V->getType());
12704 V = FinalShuffle(V, E, VecTy);
12705
12706 E->VectorizedValue = V;
12707 ++NumVectorInstructions;
12708 return V;
12709 }
12710 case Instruction::Select: {
12711 setInsertPointAfterBundle(E);
12712
12713 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
12714 if (E->VectorizedValue) {
12715 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12716 return E->VectorizedValue;
12717 }
12718 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
12719 if (E->VectorizedValue) {
12720 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12721 return E->VectorizedValue;
12722 }
12723 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
12724 if (E->VectorizedValue) {
12725 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12726 return E->VectorizedValue;
12727 }
12728 if (True->getType() != VecTy || False->getType() != VecTy) {
12729 assert((It != MinBWs.end() ||
12730 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12731 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
12732 MinBWs.contains(getOperandEntry(E, 1)) ||
12733 MinBWs.contains(getOperandEntry(E, 2))) &&
12734 "Expected item in MinBWs.");
12735 if (True->getType() != VecTy)
12736 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
12737 if (False->getType() != VecTy)
12738 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
12739 }
12740
12741 Value *V = Builder.CreateSelect(Cond, True, False);
12742 V = FinalShuffle(V, E, VecTy);
12743
12744 E->VectorizedValue = V;
12745 ++NumVectorInstructions;
12746 return V;
12747 }
12748 case Instruction::FNeg: {
12749 setInsertPointAfterBundle(E);
12750
12751 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
12752
12753 if (E->VectorizedValue) {
12754 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12755 return E->VectorizedValue;
12756 }
12757
12758 Value *V = Builder.CreateUnOp(
12759 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
12760 propagateIRFlags(V, E->Scalars, VL0);
12761 if (auto *I = dyn_cast<Instruction>(V))
12762 V = propagateMetadata(I, E->Scalars);
12763
12764 V = FinalShuffle(V, E, VecTy);
12765
12766 E->VectorizedValue = V;
12767 ++NumVectorInstructions;
12768
12769 return V;
12770 }
12771 case Instruction::Add:
12772 case Instruction::FAdd:
12773 case Instruction::Sub:
12774 case Instruction::FSub:
12775 case Instruction::Mul:
12776 case Instruction::FMul:
12777 case Instruction::UDiv:
12778 case Instruction::SDiv:
12779 case Instruction::FDiv:
12780 case Instruction::URem:
12781 case Instruction::SRem:
12782 case Instruction::FRem:
12783 case Instruction::Shl:
12784 case Instruction::LShr:
12785 case Instruction::AShr:
12786 case Instruction::And:
12787 case Instruction::Or:
12788 case Instruction::Xor: {
12789 setInsertPointAfterBundle(E);
12790
12791 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
12792 if (E->VectorizedValue) {
12793 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12794 return E->VectorizedValue;
12795 }
12796 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
12797 if (E->VectorizedValue) {
12798 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12799 return E->VectorizedValue;
12800 }
12801 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
12802 assert((It != MinBWs.end() ||
12803 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12804 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12805 MinBWs.contains(getOperandEntry(E, 0)) ||
12806 MinBWs.contains(getOperandEntry(E, 1))) &&
12807 "Expected item in MinBWs.");
12808 if (LHS->getType() != VecTy)
12809 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
12810 if (RHS->getType() != VecTy)
12811 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
12812 }
12813
12814 Value *V = Builder.CreateBinOp(
12815 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
12816 RHS);
12817 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
12818 if (auto *I = dyn_cast<Instruction>(V)) {
12819 V = propagateMetadata(I, E->Scalars);
12820 // Drop nuw flags for abs(sub(commutative), true).
12821 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
12822 any_of(E->Scalars, [](Value *V) {
12823 return isCommutative(cast<Instruction>(V));
12824 }))
12825 I->setHasNoUnsignedWrap(/*b=*/false);
12826 }
12827
12828 V = FinalShuffle(V, E, VecTy);
12829
12830 E->VectorizedValue = V;
12831 ++NumVectorInstructions;
12832
12833 return V;
12834 }
12835 case Instruction::Load: {
12836 // Loads are inserted at the head of the tree because we don't want to
12837 // sink them all the way down past store instructions.
12838 setInsertPointAfterBundle(E);
12839
12840 LoadInst *LI = cast<LoadInst>(VL0);
12841 Instruction *NewLI;
12842 Value *PO = LI->getPointerOperand();
12843 if (E->State == TreeEntry::Vectorize) {
12844 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
12845 } else if (E->State == TreeEntry::StridedVectorize) {
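// Strided loads are emitted as a single llvm.experimental.vp.strided.load
// call; the byte stride is either a constant derived from the distance between
// the first and last pointers or a value computed at runtime, and it is
// negated when the access order is reversed.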
12846 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
12847 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
12848 PO = IsReverseOrder ? PtrN : Ptr0;
12849 std::optional<int> Diff = getPointersDiff(
12850 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
12851 Type *StrideTy = DL->getIndexType(PO->getType());
12852 Value *StrideVal;
12853 if (Diff) {
12854 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
12855 StrideVal =
12856 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
12857 DL->getTypeAllocSize(ScalarTy));
12858 } else {
12859 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
12860 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
12861 return cast<LoadInst>(V)->getPointerOperand();
12862 });
12863 OrdersType Order;
12864 std::optional<Value *> Stride =
12865 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
12866 &*Builder.GetInsertPoint());
12867 Value *NewStride =
12868 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
12869 StrideVal = Builder.CreateMul(
12870 NewStride,
12871 ConstantInt::get(
12872 StrideTy,
12873 (IsReverseOrder ? -1 : 1) *
12874 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
12875 }
12876 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
12877 auto *Inst = Builder.CreateIntrinsic(
12878 Intrinsic::experimental_vp_strided_load,
12879 {VecTy, PO->getType(), StrideTy},
12880 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
12881 Builder.getInt32(E->Scalars.size())});
12882 Inst->addParamAttr(
12883 /*ArgNo=*/0,
12884 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
12885 NewLI = Inst;
12886 } else {
12887 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
12888 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
12889 if (E->VectorizedValue) {
12890 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12891 return E->VectorizedValue;
12892 }
12893 // Use the minimum alignment of the gathered loads.
12894 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
12895 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
12896 }
12897 Value *V = propagateMetadata(NewLI, E->Scalars);
12898
12899 V = FinalShuffle(V, E, VecTy);
12900 E->VectorizedValue = V;
12901 ++NumVectorInstructions;
12902 return V;
12903 }
12904 case Instruction::Store: {
12905 auto *SI = cast<StoreInst>(VL0);
12906
12907 setInsertPointAfterBundle(E);
12908
12909 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
12910 if (VecValue->getType() != VecTy)
12911 VecValue =
12912 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
12913 VecValue = FinalShuffle(VecValue, E, VecTy);
12914
12915 Value *Ptr = SI->getPointerOperand();
12916 StoreInst *ST =
12917 Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
12918
12919 Value *V = propagateMetadata(ST, E->Scalars);
12920
12921 E->VectorizedValue = V;
12922 ++NumVectorInstructions;
12923 return V;
12924 }
12925 case Instruction::GetElementPtr: {
12926 auto *GEP0 = cast<GetElementPtrInst>(VL0);
12927 setInsertPointAfterBundle(E);
12928
12929 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
12930 if (E->VectorizedValue) {
12931 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12932 return E->VectorizedValue;
12933 }
12934
12935 SmallVector<Value *> OpVecs;
12936 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
12937 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
12938 if (E->VectorizedValue) {
12939 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12940 return E->VectorizedValue;
12941 }
12942 OpVecs.push_back(OpVec);
12943 }
12944
12945 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
12946 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
12947 SmallVector<Value *> GEPs;
12948 for (Value *V : E->Scalars) {
12949 if (isa<GetElementPtrInst>(V))
12950 GEPs.push_back(V);
12951 }
12952 V = propagateMetadata(I, GEPs);
12953 }
12954
12955 V = FinalShuffle(V, E, VecTy);
12956
12957 E->VectorizedValue = V;
12958 ++NumVectorInstructions;
12959
12960 return V;
12961 }
12962 case Instruction::Call: {
12963 CallInst *CI = cast<CallInst>(VL0);
12964 setInsertPointAfterBundle(E);
12965
12966 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12967
12968 SmallVector<Type *> ArgTys =
12969 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
12970 It != MinBWs.end() ? It->second.first : 0);
12971 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
12972 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
12973 VecCallCosts.first <= VecCallCosts.second;
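// Prefer the intrinsic form when it is not more expensive than the vector
// library call; otherwise the call is emitted through the vector function
// provided by the VFDatabase below.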
12974
12975 Value *ScalarArg = nullptr;
12976 SmallVector<Value *> OpVecs;
12977 SmallVector<Type *, 2> TysForDecl;
12978 // Add return type if intrinsic is overloaded on it.
12979 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
12980 TysForDecl.push_back(VecTy);
12981 auto *CEI = cast<CallInst>(VL0);
12982 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
12983 ValueList OpVL;
12984 // Some intrinsics have scalar arguments. This argument should not be
12985 // vectorized.
12986 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
12987 ScalarArg = CEI->getArgOperand(I);
12988 // If we decided to reduce the bitwidth of the abs intrinsic, its second
12989 // argument must be set to false (do not return poison if the value is the signed min).
12990 if (ID == Intrinsic::abs && It != MinBWs.end() &&
12991 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
12992 ScalarArg = Builder.getFalse();
12993 OpVecs.push_back(ScalarArg);
12994 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
12995 TysForDecl.push_back(ScalarArg->getType());
12996 continue;
12997 }
12998
12999 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
13000 if (E->VectorizedValue) {
13001 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13002 return E->VectorizedValue;
13003 }
13004 ScalarArg = CEI->getArgOperand(I);
13005 if (cast<VectorType>(OpVec->getType())->getElementType() !=
13006 ScalarArg->getType() &&
13007 It == MinBWs.end()) {
13008 auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
13009 VecTy->getNumElements());
13010 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
13011 } else if (It != MinBWs.end()) {
13012 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
13013 }
13014 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
13015 OpVecs.push_back(OpVec);
13016 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13017 TysForDecl.push_back(OpVec->getType());
13018 }
13019
13020 Function *CF;
13021 if (!UseIntrinsic) {
13022 VFShape Shape =
13023 VFShape::get(CI->getFunctionType(),
13024 ElementCount::getFixed(
13025 static_cast<unsigned>(VecTy->getNumElements())),
13026 false /*HasGlobalPred*/);
13027 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
13028 } else {
13029 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
13030 }
13031
13032 SmallVector<OperandBundleDef, 1> OpBundles;
13033 CI->getOperandBundlesAsDefs(OpBundles);
13034 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
13035
13036 propagateIRFlags(V, E->Scalars, VL0);
13037 V = FinalShuffle(V, E, VecTy);
13038
13039 E->VectorizedValue = V;
13040 ++NumVectorInstructions;
13041 return V;
13042 }
13043 case Instruction::ShuffleVector: {
13044 assert(E->isAltShuffle() &&
13045 ((Instruction::isBinaryOp(E->getOpcode()) &&
13046 Instruction::isBinaryOp(E->getAltOpcode())) ||
13047 (Instruction::isCast(E->getOpcode()) &&
13048 Instruction::isCast(E->getAltOpcode())) ||
13049 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13050 "Invalid Shuffle Vector Operand");
13051
13052 Value *LHS = nullptr, *RHS = nullptr;
13053 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
13054 setInsertPointAfterBundle(E);
13055 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13056 if (E->VectorizedValue) {
13057 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13058 return E->VectorizedValue;
13059 }
13060 RHS = vectorizeOperand(E, 1, PostponedPHIs);
13061 } else {
13062 setInsertPointAfterBundle(E);
13063 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13064 }
13065 if (E->VectorizedValue) {
13066 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13067 return E->VectorizedValue;
13068 }
13069 if (LHS && RHS &&
13070 ((Instruction::isBinaryOp(E->getOpcode()) &&
13071 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13072 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
13073 assert((It != MinBWs.end() ||
13074 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13075 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13076 MinBWs.contains(getOperandEntry(E, 0)) ||
13077 MinBWs.contains(getOperandEntry(E, 1))) &&
13078 "Expected item in MinBWs.");
13079 Type *CastTy = VecTy;
13080 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
13081 if (cast<VectorType>(LHS->getType())
13082 ->getElementType()
13083 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
13084 ->getElementType()
13085 ->getIntegerBitWidth())
13086 CastTy = RHS->getType();
13087 else
13088 CastTy = LHS->getType();
13089 }
13090 if (LHS->getType() != CastTy)
13091 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
13092 if (RHS->getType() != CastTy)
13093 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
13094 }
13095
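// Emit both the main and the alternate operation over the full vector; the
// per-lane choice between the two results is made by the shuffle built from
// buildAltOpShuffleMask below. For example (illustrative), for the scalar
// bundle {a0+b0, a1-b1} this yields V0 = <a0+b0, a1+b1>, V1 = <a0-b0, a1-b1>
// and a blend mask of {0, 3}.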
13096 Value *V0, *V1;
13097 if (Instruction::isBinaryOp(E->getOpcode())) {
13098 V0 = Builder.CreateBinOp(
13099 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13100 V1 = Builder.CreateBinOp(
13101 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13102 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13103 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
13104 auto *AltCI = cast<CmpInst>(E->getAltOp());
13105 CmpInst::Predicate AltPred = AltCI->getPredicate();
13106 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
13107 } else {
13108 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13109 unsigned SrcBWSz = DL->getTypeSizeInBits(
13110 cast<VectorType>(LHS->getType())->getElementType());
13111 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13112 if (BWSz <= SrcBWSz) {
13113 if (BWSz < SrcBWSz)
13114 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
13115 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13116 if (auto *I = dyn_cast<Instruction>(LHS))
13117 LHS = propagateMetadata(I, E->Scalars);
13118 E->VectorizedValue = LHS;
13119 ++NumVectorInstructions;
13120 return LHS;
13121 }
13122 }
13123 V0 = Builder.CreateCast(
13124 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
13125 V1 = Builder.CreateCast(
13126 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
13127 }
13128 // Add V0 and V1 to later analysis to try to find and remove matching
13129 // instruction, if any.
13130 for (Value *V : {V0, V1}) {
13131 if (auto *I = dyn_cast<Instruction>(V)) {
13132 GatherShuffleExtractSeq.insert(I);
13133 CSEBlocks.insert(I->getParent());
13134 }
13135 }
13136
13137 // Create shuffle to take alternate operations from the vector.
13138 // Also, gather up main and alt scalar ops to propagate IR flags to
13139 // each vector operation.
13140 ValueList OpScalars, AltScalars;
13141 SmallVector<int> Mask;
13142 E->buildAltOpShuffleMask(
13143 [E, this](Instruction *I) {
13144 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13145 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
13146 *TLI);
13147 },
13148 Mask, &OpScalars, &AltScalars);
13149
13150 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
13151 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
13152 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13153 // Drop nuw flags for abs(sub(commutative), true).
13154 if (auto *I = dyn_cast<Instruction>(Vec);
13155 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
13156 any_of(E->Scalars, [](Value *V) {
13157 auto *IV = cast<Instruction>(V);
13158 return IV->getOpcode() == Instruction::Sub &&
13159 isCommutative(cast<Instruction>(IV));
13160 }))
13161 I->setHasNoUnsignedWrap(/*b=*/false);
13162 };
13163 DropNuwFlag(V0, E->getOpcode());
13164 DropNuwFlag(V1, E->getAltOpcode());
13165
13166 Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
13167 if (auto *I = dyn_cast<Instruction>(V)) {
13168 V = propagateMetadata(I, E->Scalars);
13169 GatherShuffleExtractSeq.insert(I);
13170 CSEBlocks.insert(I->getParent());
13171 }
13172
13173 E->VectorizedValue = V;
13174 ++NumVectorInstructions;
13175
13176 return V;
13177 }
13178 default:
13179 llvm_unreachable("unknown inst");
13180 }
13181 return nullptr;
13182}
13183
13184 Value *BoUpSLP::vectorizeTree() {
13185 ExtraValueToDebugLocsMap ExternallyUsedValues;
13186 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13187 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13188}
13189
13190namespace {
13191/// Data type for handling buildvector sequences with the reused scalars from
13192/// other tree entries.
13193struct ShuffledInsertData {
13194 /// List of insertelements to be replaced by shuffles.
13195 SmallVector<InsertElementInst *> InsertElements;
13196 /// The parent vectors and shuffle mask for the given list of inserts.
13197 MapVector<Value *, SmallVector<int>> ValueMasks;
13198};
13199} // namespace
13200
13201 Value *BoUpSLP::vectorizeTree(
13202 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13203 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13204 Instruction *ReductionRoot) {
13205 // All blocks must be scheduled before any instructions are inserted.
13206 for (auto &BSIter : BlocksSchedules) {
13207 scheduleBlock(BSIter.second.get());
13208 }
13209 // Clear the Entry-to-LastInstruction table; it can be invalidated by
13210 // scheduling, so it needs to be rebuilt.
13211 EntryToLastInstruction.clear();
13212
13213 if (ReductionRoot)
13214 Builder.SetInsertPoint(ReductionRoot->getParent(),
13215 ReductionRoot->getIterator());
13216 else
13217 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13218
13219 // Postpone emission of PHI operands to avoid cyclic dependency issues.
13220 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13221 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13222 if (TE->State == TreeEntry::Vectorize &&
13223 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13224 TE->VectorizedValue)
13225 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
13226 // Run through the list of postponed gathers and emit them, replacing the temp
13227 // emitted allocas with actual vector instructions.
13228 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13229 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13230 for (const TreeEntry *E : PostponedNodes) {
13231 auto *TE = const_cast<TreeEntry *>(E);
13232 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
13233 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13234 TE->UserTreeIndices.front().EdgeIdx)) &&
13235 VecTE->isSame(TE->Scalars))
13236 // Found a gather node that is exactly the same as one of the
13237 // vectorized nodes. This may happen after reordering.
13238 continue;
13239 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13240 TE->VectorizedValue = nullptr;
13241 auto *UserI =
13242 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13243 // If the user is a PHI node, its vector code has to be inserted right before
13244 // the block terminator. Since the node was delayed, there were some unresolved
13245 // dependencies at the moment the stub instruction was emitted. If any of these
13246 // dependencies turns out to be an operand of another PHI coming from this same
13247 // block, the position of the stub instruction becomes invalid. This is because
13248 // the source vector that is supposed to feed this gather node was inserted at
13249 // the end of the block [after the stub instruction]. So we need to adjust the
13250 // insertion point again to the end of the block.
13251 if (isa<PHINode>(UserI)) {
13252 // Insert before all users.
13253 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13254 for (User *U : PrevVec->users()) {
13255 if (U == UserI)
13256 continue;
13257 auto *UI = dyn_cast<Instruction>(U);
13258 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
13259 continue;
13260 if (UI->comesBefore(InsertPt))
13261 InsertPt = UI;
13262 }
13263 Builder.SetInsertPoint(InsertPt);
13264 } else {
13265 Builder.SetInsertPoint(PrevVec);
13266 }
13267 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13268 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
13269 if (Vec->getType() != PrevVec->getType()) {
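// The stub and the re-emitted gather may disagree on the minimized integer
// width; recover the signedness from any related tree entry (or, failing
// that, from the user node) and cast back to the stub's type.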
13270 assert(Vec->getType()->isIntOrIntVectorTy() &&
13271 PrevVec->getType()->isIntOrIntVectorTy() &&
13272 "Expected integer vector types only.");
13273 std::optional<bool> IsSigned;
13274 for (Value *V : TE->Scalars) {
13275 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13276 auto It = MinBWs.find(BaseTE);
13277 if (It != MinBWs.end()) {
13278 IsSigned = IsSigned.value_or(false) || It->second.second;
13279 if (*IsSigned)
13280 break;
13281 }
13282 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
13283 auto It = MinBWs.find(MNTE);
13284 if (It != MinBWs.end()) {
13285 IsSigned = IsSigned.value_or(false) || It->second.second;
13286 if (*IsSigned)
13287 break;
13288 }
13289 }
13290 if (IsSigned.value_or(false))
13291 break;
13292 // Scan through gather nodes.
13293 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13294 auto It = MinBWs.find(BVE);
13295 if (It != MinBWs.end()) {
13296 IsSigned = IsSigned.value_or(false) || It->second.second;
13297 if (*IsSigned)
13298 break;
13299 }
13300 }
13301 if (IsSigned.value_or(false))
13302 break;
13303 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
13304 IsSigned =
13305 IsSigned.value_or(false) ||
13306 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
13307 continue;
13308 }
13309 if (IsSigned.value_or(false))
13310 break;
13311 }
13312 }
13313 if (IsSigned.value_or(false)) {
13314 // Final attempt - check user node.
13315 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
13316 if (It != MinBWs.end())
13317 IsSigned = It->second.second;
13318 }
13319 assert(IsSigned &&
13320 "Expected user node or perfect diamond match in MinBWs.");
13321 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
13322 }
13323 PrevVec->replaceAllUsesWith(Vec);
13324 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
13325 // Replace the stub vector node if it was already used for one of the
13326 // buildvector nodes.
13327 auto It = PostponedValues.find(PrevVec);
13328 if (It != PostponedValues.end()) {
13329 for (TreeEntry *VTE : It->getSecond())
13330 VTE->VectorizedValue = Vec;
13331 }
13332 eraseInstruction(PrevVec);
13333 }
13334
13335 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13336 << " values .\n");
13337
13338 SmallVector<ShuffledInsertData> ShuffledInserts;
13339 // Maps vector instruction to original insertelement instruction
13340 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13341 // Maps extract Scalar to the corresponding extractelement instruction in the
13342 // basic block. Only one extractelement per block should be emitted.
13343 DenseMap<Value *,
13344 DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13345 ScalarToEEs;
13346 SmallDenseSet<Value *, 4> UsedInserts;
13347 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13348 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13349 // Extract all of the elements with the external uses.
13350 for (const auto &ExternalUse : ExternalUses) {
13351 Value *Scalar = ExternalUse.Scalar;
13352 llvm::User *User = ExternalUse.User;
13353
13354 // Skip users that we have already RAUWed. This happens when one instruction
13355 // has multiple uses of the same value.
13356 if (User && !is_contained(Scalar->users(), User))
13357 continue;
13358 TreeEntry *E = getTreeEntry(Scalar);
13359 assert(E && "Invalid scalar");
13360 assert(E->State != TreeEntry::NeedToGather &&
13361 "Extracting from a gather list");
13362 // Non-instruction pointers are not deleted, just skip them.
13363 if (E->getOpcode() == Instruction::GetElementPtr &&
13364 !isa<GetElementPtrInst>(Scalar))
13365 continue;
13366
13367 Value *Vec = E->VectorizedValue;
13368 assert(Vec && "Can't find vectorizable value");
13369
13370 Value *Lane = Builder.getInt32(ExternalUse.Lane);
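// Emits (or reuses, at most one per basic block) an extractelement for the
// externally used lane and, if the entry was vectorized with a minimized
// bitwidth, extends the result back to the original scalar type.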
13371 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13372 if (Scalar->getType() != Vec->getType()) {
13373 Value *Ex = nullptr;
13374 Value *ExV = nullptr;
13375 auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
13376 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
13377 auto It = ScalarToEEs.find(Scalar);
13378 if (It != ScalarToEEs.end()) {
13379 // No need to emit many extracts, just move the only one in the
13380 // current block.
13381 auto EEIt = It->second.find(Builder.GetInsertBlock());
13382 if (EEIt != It->second.end()) {
13383 Instruction *I = EEIt->second.first;
13384 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13385 Builder.GetInsertPoint()->comesBefore(I)) {
13386 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
13387 Builder.GetInsertPoint());
13388 if (auto *CI = EEIt->second.second)
13389 CI->moveAfter(I);
13390 }
13391 Ex = I;
13392 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13393 }
13394 }
13395 if (!Ex) {
13396 // "Reuse" the existing extract to improve final codegen.
13397 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13398 Value *V = ES->getVectorOperand();
13399 if (const TreeEntry *ETE = getTreeEntry(V))
13400 V = ETE->VectorizedValue;
13401 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
13402 } else if (ReplaceGEP) {
13403 // Leave the GEPs as is, they are free in most cases and better to
13404 // keep them as GEPs.
13405 auto *CloneGEP = GEP->clone();
13406 if (isa<Instruction>(Vec))
13407 CloneGEP->insertBefore(*Builder.GetInsertBlock(),
13408 Builder.GetInsertPoint());
13409 else
13410 CloneGEP->insertBefore(GEP);
13411 if (GEP->hasName())
13412 CloneGEP->takeName(GEP);
13413 Ex = CloneGEP;
13414 } else {
13415 Ex = Builder.CreateExtractElement(Vec, Lane);
13416 }
13417 // If necessary, sign-extend or zero-extend ScalarRoot
13418 // to the larger type.
13419 ExV = Ex;
13420 if (Scalar->getType() != Ex->getType())
13421 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
13422 MinBWs.find(E)->second.second);
13423 if (auto *I = dyn_cast<Instruction>(Ex))
13424 ScalarToEEs[Scalar].try_emplace(
13425 Builder.GetInsertBlock(),
13426 std::make_pair(I, cast<Instruction>(ExV)));
13427 }
13428 // The 'then' branch of the previous 'if' may produce constants, since
13429 // operand 0 might be a constant.
13430 if (auto *ExI = dyn_cast<Instruction>(Ex)) {
13431 GatherShuffleExtractSeq.insert(ExI);
13432 CSEBlocks.insert(ExI->getParent());
13433 }
13434 return ExV;
13435 }
13436 assert(isa<FixedVectorType>(Scalar->getType()) &&
13437 isa<InsertElementInst>(Scalar) &&
13438 "In-tree scalar of vector type is not insertelement?");
13439 auto *IE = cast<InsertElementInst>(Scalar);
13440 VectorToInsertElement.try_emplace(Vec, IE);
13441 return Vec;
13442 };
13443 // If User == nullptr, the Scalar remains as scalar in vectorized
13444 // instructions or is used as extra arg. Generate ExtractElement instruction
13445 // and update the record for this scalar in ExternallyUsedValues.
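// Illustrative sketch (an assumption, not from the original source): if %s
// was vectorized into lane 1 of %vec but is still needed as a scalar (e.g.
// as an extra reduction argument), the code below emits roughly
//   %ex = extractelement <4 x i32> %vec, i32 1
// right after %vec (or in the function entry block for non-instruction
// vectors), RAUWs %s with %ex, and records the pair in ReplacedExternals.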
13446 if (!User) {
13447 if (!ScalarsWithNullptrUser.insert(Scalar).second)
13448 continue;
13449 assert((ExternallyUsedValues.count(Scalar) ||
13450 any_of(Scalar->users(),
13451 [&](llvm::User *U) {
13452 if (ExternalUsesAsGEPs.contains(U))
13453 return true;
13454 TreeEntry *UseEntry = getTreeEntry(U);
13455 return UseEntry &&
13456 (UseEntry->State == TreeEntry::Vectorize ||
13457 UseEntry->State ==
13458 TreeEntry::StridedVectorize) &&
13459 (E->State == TreeEntry::Vectorize ||
13460 E->State == TreeEntry::StridedVectorize) &&
13461 doesInTreeUserNeedToExtract(
13462 Scalar,
13463 cast<Instruction>(UseEntry->Scalars.front()),
13464 TLI);
13465 })) &&
13466 "Scalar with nullptr User must be registered in "
13467 "ExternallyUsedValues map or remain as scalar in vectorized "
13468 "instructions");
13469 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13470 if (auto *PHI = dyn_cast<PHINode>(VecI))
13471 Builder.SetInsertPoint(PHI->getParent(),
13472 PHI->getParent()->getFirstNonPHIIt());
13473 else
13474 Builder.SetInsertPoint(VecI->getParent(),
13475 std::next(VecI->getIterator()));
13476 } else {
13477 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13478 }
13479 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13480 // Required to update internally referenced instructions.
13481 Scalar->replaceAllUsesWith(NewInst);
13482 ReplacedExternals.emplace_back(Scalar, NewInst);
13483 continue;
13484 }
13485
13486 if (auto *VU = dyn_cast<InsertElementInst>(User)) {
13487 // Skip if the scalar is another vector op or Vec is not an instruction.
13488 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13489 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
13490 if (!UsedInserts.insert(VU).second)
13491 continue;
13492 // Need to use original vector, if the root is truncated.
13493 auto BWIt = MinBWs.find(E);
13494 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13495 auto *ScalarTy = FTy->getElementType();
13496 auto Key = std::make_pair(Vec, ScalarTy);
13497 auto VecIt = VectorCasts.find(Key);
13498 if (VecIt == VectorCasts.end()) {
13499 IRBuilderBase::InsertPointGuard Guard(Builder);
13500 if (auto *IVec = dyn_cast<Instruction>(Vec))
13501 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
13502 Vec = Builder.CreateIntCast(
13503 Vec,
13504 FixedVectorType::get(
13505 ScalarTy,
13506 cast<FixedVectorType>(Vec->getType())->getNumElements()),
13507 BWIt->second.second);
13508 VectorCasts.try_emplace(Key, Vec);
13509 } else {
13510 Vec = VecIt->second;
13511 }
13512 }
13513
13514 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
13515 if (InsertIdx) {
13516 auto *It =
13517 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
13518 // Checks if 2 insertelements are from the same buildvector.
13519 InsertElementInst *VecInsert = Data.InsertElements.front();
13520 return areTwoInsertFromSameBuildVector(
13521 VU, VecInsert,
13522 [](InsertElementInst *II) { return II->getOperand(0); });
13523 });
13524 unsigned Idx = *InsertIdx;
13525 if (It == ShuffledInserts.end()) {
13526 (void)ShuffledInserts.emplace_back();
13527 It = std::next(ShuffledInserts.begin(),
13528 ShuffledInserts.size() - 1);
13529 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13530 if (Mask.empty())
13531 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13532 // Find the insertvector, vectorized in tree, if any.
13533 Value *Base = VU;
13534 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
13535 if (IEBase != User &&
13536 (!IEBase->hasOneUse() ||
13537 getInsertIndex(IEBase).value_or(Idx) == Idx))
13538 break;
13539 // Build the mask for the vectorized insertelement instructions.
13540 if (const TreeEntry *E = getTreeEntry(IEBase)) {
13541 do {
13542 IEBase = cast<InsertElementInst>(Base);
13543 int IEIdx = *getInsertIndex(IEBase);
13544 assert(Mask[IEIdx] == PoisonMaskElem &&
13545 "InsertElementInstruction used already.");
13546 Mask[IEIdx] = IEIdx;
13547 Base = IEBase->getOperand(0);
13548 } while (E == getTreeEntry(Base));
13549 break;
13550 }
13551 Base = cast<InsertElementInst>(Base)->getOperand(0);
13552 // After vectorization the def-use chain has changed, so we need to look
13553 // through the original insertelement instructions, if they were replaced
13554 // by vector instructions.
13555 auto It = VectorToInsertElement.find(Base);
13556 if (It != VectorToInsertElement.end())
13557 Base = It->second;
13558 }
13559 }
13560 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13561 if (Mask.empty())
13562 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13563 Mask[Idx] = ExternalUse.Lane;
13564 It->InsertElements.push_back(cast<InsertElementInst>(User));
13565 continue;
13566 }
13567 }
13568 }
13569 }
13570
13571 // Generate extracts for out-of-tree users.
13572 // Find the insertion point for the extractelement lane.
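// Illustrative note (an assumption, not from the original source): for a PHI
// user the extract must be available on the incoming edge, so it is emitted
// just before the incoming block's terminator, e.g.
//   pred:
//     %ex = extractelement <4 x i32> %vec, i32 2
//     br label %merge
//   merge:
//     %phi = phi i32 [ %ex, %pred ], ...
// unless that terminator is a catchswitch, in which case the extract is
// placed right after the vectorized value instead.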
13573 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13574 if (PHINode *PH = dyn_cast<PHINode>(User)) {
13575 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13576 if (PH->getIncomingValue(I) == Scalar) {
13577 Instruction *IncomingTerminator =
13578 PH->getIncomingBlock(I)->getTerminator();
13579 if (isa<CatchSwitchInst>(IncomingTerminator)) {
13580 Builder.SetInsertPoint(VecI->getParent(),
13581 std::next(VecI->getIterator()));
13582 } else {
13583 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
13584 }
13585 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13586 PH->setOperand(I, NewInst);
13587 }
13588 }
13589 } else {
13590 Builder.SetInsertPoint(cast<Instruction>(User));
13591 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13592 User->replaceUsesOfWith(Scalar, NewInst);
13593 }
13594 } else {
13595 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13596 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13597 User->replaceUsesOfWith(Scalar, NewInst);
13598 }
13599
13600 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
13601 }
13602
13603 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
13604 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
13605 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
13606 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
13607 for (int I = 0, E = Mask.size(); I < E; ++I) {
13608 if (Mask[I] < VF)
13609 CombinedMask1[I] = Mask[I];
13610 else
13611 CombinedMask2[I] = Mask[I] - VF;
13612 }
13613 ShuffleInstructionBuilder ShuffleBuilder(
13614 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
13615 ShuffleBuilder.add(V1, CombinedMask1);
13616 if (V2)
13617 ShuffleBuilder.add(V2, CombinedMask2);
13618 return ShuffleBuilder.finalize(std::nullopt);
13619 };
13620
13621 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
13622 bool ForSingleMask) {
13623 unsigned VF = Mask.size();
13624 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
13625 if (VF != VecVF) {
13626 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
13627 Vec = CreateShuffle(Vec, nullptr, Mask);
13628 return std::make_pair(Vec, true);
13629 }
13630 if (!ForSingleMask) {
13631 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
13632 for (unsigned I = 0; I < VF; ++I) {
13633 if (Mask[I] != PoisonMaskElem)
13634 ResizeMask[Mask[I]] = Mask[I];
13635 }
13636 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
13637 }
13638 }
13639
13640 return std::make_pair(Vec, false);
13641 };
13642 // Perform shuffling of the vectorized tree entries for better handling of
13643 // external extracts.
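// Illustrative sketch (an assumption, not from the original source): a scalar
// buildvector chain such as
//   %i0 = insertelement <4 x float> poison, float %a, i32 0
//   %i1 = insertelement <4 x float> %i0, float %b, i32 1
// whose elements were all vectorized is rebuilt below as a single
// shufflevector of the vectorized value(s); the original insertelement
// instructions are then poisoned out and erased.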
13644 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
13645 // Find the first and the last instruction in the list of insertelements.
13646 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
13647 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
13648 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
13649 Builder.SetInsertPoint(LastInsert);
13650 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
13651 Value *NewInst = performExtractsShuffleAction<Value>(
13652 MutableArrayRef(Vector.data(), Vector.size()),
13653 FirstInsert->getOperand(0),
13654 [](Value *Vec) {
13655 return cast<VectorType>(Vec->getType())
13656 ->getElementCount()
13657 .getKnownMinValue();
13658 },
13659 ResizeToVF,
13660 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
13661 ArrayRef<Value *> Vals) {
13662 assert((Vals.size() == 1 || Vals.size() == 2) &&
13663 "Expected exactly 1 or 2 input values.");
13664 if (Vals.size() == 1) {
13665 // Do not create shuffle if the mask is a simple identity
13666 // non-resizing mask.
13667 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
13668 ->getNumElements() ||
13669 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
13670 return CreateShuffle(Vals.front(), nullptr, Mask);
13671 return Vals.front();
13672 }
13673 return CreateShuffle(Vals.front() ? Vals.front()
13674 : FirstInsert->getOperand(0),
13675 Vals.back(), Mask);
13676 });
13677 auto It = ShuffledInserts[I].InsertElements.rbegin();
13678 // Rebuild buildvector chain.
13679 InsertElementInst *II = nullptr;
13680 if (It != ShuffledInserts[I].InsertElements.rend())
13681 II = *It;
13682 SmallVector<Instruction *> Inserts;
13683 while (It != ShuffledInserts[I].InsertElements.rend()) {
13684 assert(II && "Must be an insertelement instruction.");
13685 if (*It == II)
13686 ++It;
13687 else
13688 Inserts.push_back(cast<Instruction>(II));
13689 II = dyn_cast<InsertElementInst>(II->getOperand(0));
13690 }
13691 for (Instruction *II : reverse(Inserts)) {
13692 II->replaceUsesOfWith(II->getOperand(0), NewInst);
13693 if (auto *NewI = dyn_cast<Instruction>(NewInst))
13694 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
13695 II->moveAfter(NewI);
13696 NewInst = II;
13697 }
13698 LastInsert->replaceAllUsesWith(NewInst);
13699 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
13700 IE->replaceUsesOfWith(IE->getOperand(0),
13701 PoisonValue::get(IE->getOperand(0)->getType()));
13702 IE->replaceUsesOfWith(IE->getOperand(1),
13703 PoisonValue::get(IE->getOperand(1)->getType()));
13704 eraseInstruction(IE);
13705 }
13706 CSEBlocks.insert(LastInsert->getParent());
13707 }
13708
13709 SmallVector<Instruction *> RemovedInsts;
13710 // For each vectorized value:
13711 for (auto &TEPtr : VectorizableTree) {
13712 TreeEntry *Entry = TEPtr.get();
13713
13714 // No need to handle users of gathered values.
13715 if (Entry->State == TreeEntry::NeedToGather)
13716 continue;
13717
13718 assert(Entry->VectorizedValue && "Can't find vectorizable value");
13719
13720 // For each lane:
13721 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
13722 Value *Scalar = Entry->Scalars[Lane];
13723
13724 if (Entry->getOpcode() == Instruction::GetElementPtr &&
13725 !isa<GetElementPtrInst>(Scalar))
13726 continue;
13727#ifndef NDEBUG
13728 Type *Ty = Scalar->getType();
13729 if (!Ty->isVoidTy()) {
13730 for (User *U : Scalar->users()) {
13731 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
13732
13733 // It is legal to delete users in the ignorelist.
13734 assert((getTreeEntry(U) ||
13735 (UserIgnoreList && UserIgnoreList->contains(U)) ||
13736 (isa_and_nonnull<Instruction>(U) &&
13737 isDeleted(cast<Instruction>(U)))) &&
13738 "Deleting out-of-tree value");
13739 }
13740 }
13741#endif
13742 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
13743 eraseInstruction(cast<Instruction>(Scalar));
13744 // Retain to-be-deleted instructions for some debug-info
13745 // bookkeeping. NOTE: eraseInstruction only marks the instruction for
13746 // deletion - instructions are not deleted until later.
13747 RemovedInsts.push_back(cast<Instruction>(Scalar));
13748 }
13749 }
13750
13751 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
13752 // new vector instruction.
13753 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
13754 V->mergeDIAssignID(RemovedInsts);
13755
13756 Builder.ClearInsertionPoint();
13757 InstrElementSize.clear();
13758
13759 const TreeEntry &RootTE = *VectorizableTree.front().get();
13760 Value *Vec = RootTE.VectorizedValue;
13761 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
13762 It != MinBWs.end() &&
13763 ReductionBitWidth != It->second.first) {
13764 IRBuilder<>::InsertPointGuard Guard(Builder);
13765 Builder.SetInsertPoint(ReductionRoot->getParent(),
13766 ReductionRoot->getIterator());
13767 Vec = Builder.CreateIntCast(
13768 Vec,
13769 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
13770 cast<VectorType>(Vec->getType())->getElementCount()),
13771 It->second.second);
13772 }
13773 return Vec;
13774}
13775
13776void BoUpSLP::optimizeGatherSequence() {
13777 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
13778 << " gather sequences instructions.\n");
13779 // LICM InsertElementInst sequences.
13780 for (Instruction *I : GatherShuffleExtractSeq) {
13781 if (isDeleted(I))
13782 continue;
13783
13784 // Check if this block is inside a loop.
13785 Loop *L = LI->getLoopFor(I->getParent());
13786 if (!L)
13787 continue;
13788
13789 // Check if it has a preheader.
13790 BasicBlock *PreHeader = L->getLoopPreheader();
13791 if (!PreHeader)
13792 continue;
13793
13794 // If the vector or the element that we insert into it are
13795 // instructions that are defined in this basic block then we can't
13796 // hoist this instruction.
13797 if (any_of(I->operands(), [L](Value *V) {
13798 auto *OpI = dyn_cast<Instruction>(V);
13799 return OpI && L->contains(OpI);
13800 }))
13801 continue;
13802
13803 // We can hoist this instruction. Move it to the pre-header.
13804 I->moveBefore(PreHeader->getTerminator());
13805 CSEBlocks.insert(PreHeader);
13806 }
13807
13808 // Make a list of all reachable blocks in our CSE queue.
13809 SmallVector<const DomTreeNode *, 8> CSEWorkList;
13810 CSEWorkList.reserve(CSEBlocks.size());
13811 for (BasicBlock *BB : CSEBlocks)
13812 if (DomTreeNode *N = DT->getNode(BB)) {
13813 assert(DT->isReachableFromEntry(N));
13814 CSEWorkList.push_back(N);
13815 }
13816
13817 // Sort blocks by domination. This ensures we visit a block after all blocks
13818 // dominating it are visited.
13819 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
13820 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
13821 "Different nodes should have different DFS numbers");
13822 return A->getDFSNumIn() < B->getDFSNumIn();
13823 });
13824
13825 // Less defined shuffles can be replaced by the more defined copies.
13826 // Between two shuffles one is less defined if it has the same vector operands
13827 // and its mask indices are the same as in the first one or undefs. E.g.
13828 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
13829 // poison, <0, 0, 0, 0>.
13830 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
13831 SmallVectorImpl<int> &NewMask) {
13832 if (I1->getType() != I2->getType())
13833 return false;
13834 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
13835 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
13836 if (!SI1 || !SI2)
13837 return I1->isIdenticalTo(I2);
13838 if (SI1->isIdenticalTo(SI2))
13839 return true;
13840 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
13841 if (SI1->getOperand(I) != SI2->getOperand(I))
13842 return false;
13843 // Check if the second instruction is more defined than the first one.
13844 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
13845 ArrayRef<int> SM1 = SI1->getShuffleMask();
13846 // Count trailing undefs in the mask to check the final number of used
13847 // registers.
13848 unsigned LastUndefsCnt = 0;
13849 for (int I = 0, E = NewMask.size(); I < E; ++I) {
13850 if (SM1[I] == PoisonMaskElem)
13851 ++LastUndefsCnt;
13852 else
13853 LastUndefsCnt = 0;
13854 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
13855 NewMask[I] != SM1[I])
13856 return false;
13857 if (NewMask[I] == PoisonMaskElem)
13858 NewMask[I] = SM1[I];
13859 }
13860 // Check if the last undefs actually change the final number of used vector
13861 // registers.
13862 return SM1.size() - LastUndefsCnt > 1 &&
13863 TTI->getNumberOfParts(SI1->getType()) ==
13864 TTI->getNumberOfParts(
13865 FixedVectorType::get(SI1->getType()->getElementType(),
13866 SM1.size() - LastUndefsCnt));
13867 };
13868 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
13869 // instructions. TODO: We can further optimize this scan if we split the
13870 // instructions into different buckets based on the insert lane.
13871 SmallVector<Instruction *, 16> Visited;
13872 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
13873 assert(*I &&
13874 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
13875 "Worklist not sorted properly!");
13876 BasicBlock *BB = (*I)->getBlock();
13877 // For all instructions in blocks containing gather sequences:
13878 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
13879 if (isDeleted(&In))
13880 continue;
13881 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
13882 !GatherShuffleExtractSeq.contains(&In))
13883 continue;
13884
13885 // Check if we can replace this instruction with any of the
13886 // visited instructions.
13887 bool Replaced = false;
13888 for (Instruction *&V : Visited) {
13889 SmallVector<int> NewMask;
13890 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
13891 DT->dominates(V->getParent(), In.getParent())) {
13892 In.replaceAllUsesWith(V);
13893 eraseInstruction(&In);
13894 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
13895 if (!NewMask.empty())
13896 SI->setShuffleMask(NewMask);
13897 Replaced = true;
13898 break;
13899 }
13900 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
13901 GatherShuffleExtractSeq.contains(V) &&
13902 IsIdenticalOrLessDefined(V, &In, NewMask) &&
13903 DT->dominates(In.getParent(), V->getParent())) {
13904 In.moveAfter(V);
13905 V->replaceAllUsesWith(&In);
13906 eraseInstruction(V);
13907 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
13908 if (!NewMask.empty())
13909 SI->setShuffleMask(NewMask);
13910 V = &In;
13911 Replaced = true;
13912 break;
13913 }
13914 }
13915 if (!Replaced) {
13916 assert(!is_contained(Visited, &In));
13917 Visited.push_back(&In);
13918 }
13919 }
13920 }
13921 CSEBlocks.clear();
13922 GatherShuffleExtractSeq.clear();
13923}
13924
13925BoUpSLP::ScheduleData *
13926BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
13927 ScheduleData *Bundle = nullptr;
13928 ScheduleData *PrevInBundle = nullptr;
13929 for (Value *V : VL) {
13930 if (doesNotNeedToBeScheduled(V))
13931 continue;
13932 ScheduleData *BundleMember = getScheduleData(V);
13933 assert(BundleMember &&
13934 "no ScheduleData for bundle member "
13935 "(maybe not in same basic block)");
13936 assert(BundleMember->isSchedulingEntity() &&
13937 "bundle member already part of other bundle");
13938 if (PrevInBundle) {
13939 PrevInBundle->NextInBundle = BundleMember;
13940 } else {
13941 Bundle = BundleMember;
13942 }
13943
13944 // Group the instructions to a bundle.
13945 BundleMember->FirstInBundle = Bundle;
13946 PrevInBundle = BundleMember;
13947 }
13948 assert(Bundle && "Failed to find schedule bundle");
13949 return Bundle;
13950}
13951
13952// Groups the instructions into a bundle (which is then a single scheduling
13953// entity) and schedules instructions until the bundle gets ready.
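// Sketch of the bundle representation built by buildBundle above
// (illustrative, not from the original source): for a three-member bundle
// the ScheduleData nodes are chained as
//   member0 -> member1 -> member2        (via NextInBundle)
// and every member's FirstInBundle points back at member0, which acts as the
// single scheduling entity for the whole group.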
13954std::optional<BoUpSLP::ScheduleData *>
13955BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
13956 const InstructionsState &S) {
13957 // No need to schedule PHIs, insertelement, extractelement and extractvalue
13958 // instructions.
13959 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
13960 doesNotNeedToSchedule(VL))
13961 return nullptr;
13962
13963 // Initialize the instruction bundle.
13964 Instruction *OldScheduleEnd = ScheduleEnd;
13965 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
13966
13967 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
13968 ScheduleData *Bundle) {
13969 // The scheduling region got new instructions at the lower end (or it is a
13970 // new region for the first bundle). This makes it necessary to
13971 // recalculate all dependencies.
13972 // It is seldom that this needs to be done a second time after adding the
13973 // initial bundle to the region.
13974 if (ScheduleEnd != OldScheduleEnd) {
13975 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
13976 doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
13977 ReSchedule = true;
13978 }
13979 if (Bundle) {
13980 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
13981 << " in block " << BB->getName() << "\n");
13982 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
13983 }
13984
13985 if (ReSchedule) {
13986 resetSchedule();
13987 initialFillReadyList(ReadyInsts);
13988 }
13989
13990 // Now try to schedule the new bundle or (if no bundle) just calculate
13991 // dependencies. As soon as the bundle is "ready" it means that there are no
13992 // cyclic dependencies and we can schedule it. Note that it's important that we
13993 // don't "schedule" the bundle yet (see cancelScheduling).
13994 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
13995 !ReadyInsts.empty()) {
13996 ScheduleData *Picked = ReadyInsts.pop_back_val();
13997 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
13998 "must be ready to schedule");
13999 schedule(Picked, ReadyInsts);
14000 }
14001 };
14002
14003 // Make sure that the scheduling region contains all
14004 // instructions of the bundle.
14005 for (Value *V : VL) {
14006 if (doesNotNeedToBeScheduled(V))
14007 continue;
14008 if (!extendSchedulingRegion(V, S)) {
14009 // If the scheduling region got new instructions at the lower end (or it
14010 // is a new region for the first bundle), it is necessary to recalculate
14011 // all dependencies.
14012 // Otherwise the compiler may crash trying to incorrectly calculate
14013 // dependencies and emit instructions in the wrong order during the actual
14014 // scheduling.
14015 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
14016 return std::nullopt;
14017 }
14018 }
14019
14020 bool ReSchedule = false;
14021 for (Value *V : VL) {
14022 if (doesNotNeedToBeScheduled(V))
14023 continue;
14024 ScheduleData *BundleMember = getScheduleData(V);
14025 assert(BundleMember &&
14026 "no ScheduleData for bundle member (maybe not in same basic block)");
14027
14028 // Make sure we don't leave the pieces of the bundle in the ready list when
14029 // the whole bundle might not be ready.
14030 ReadyInsts.remove(BundleMember);
14031
14032 if (!BundleMember->IsScheduled)
14033 continue;
14034 // A bundle member was scheduled as single instruction before and now
14035 // needs to be scheduled as part of the bundle. We just get rid of the
14036 // existing schedule.
14037 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
14038 << " was already scheduled\n");
14039 ReSchedule = true;
14040 }
14041
14042 auto *Bundle = buildBundle(VL);
14043 TryScheduleBundleImpl(ReSchedule, Bundle);
14044 if (!Bundle->isReady()) {
14045 cancelScheduling(VL, S.OpValue);
14046 return std::nullopt;
14047 }
14048 return Bundle;
14049}
14050
14051void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
14052 Value *OpValue) {
14053 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
14054 doesNotNeedToSchedule(VL))
14055 return;
14056
14057 if (doesNotNeedToBeScheduled(OpValue))
14058 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
14059 ScheduleData *Bundle = getScheduleData(OpValue);
14060 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
14061 assert(!Bundle->IsScheduled &&
14062 "Can't cancel bundle which is already scheduled");
14063 assert(Bundle->isSchedulingEntity() &&
14064 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
14065 "tried to unbundle something which is not a bundle");
14066
14067 // Remove the bundle from the ready list.
14068 if (Bundle->isReady())
14069 ReadyInsts.remove(Bundle);
14070
14071 // Un-bundle: make single instructions out of the bundle.
14072 ScheduleData *BundleMember = Bundle;
14073 while (BundleMember) {
14074 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
14075 BundleMember->FirstInBundle = BundleMember;
14076 ScheduleData *Next = BundleMember->NextInBundle;
14077 BundleMember->NextInBundle = nullptr;
14078 BundleMember->TE = nullptr;
14079 if (BundleMember->unscheduledDepsInBundle() == 0) {
14080 ReadyInsts.insert(BundleMember);
14081 }
14082 BundleMember = Next;
14083 }
14084}
14085
14086BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14087 // Allocate a new ScheduleData for the instruction.
14088 if (ChunkPos >= ChunkSize) {
14089 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14090 ChunkPos = 0;
14091 }
14092 return &(ScheduleDataChunks.back()[ChunkPos++]);
14093}
14094
14095bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14096 const InstructionsState &S) {
14097 if (getScheduleData(V, isOneOf(S, V)))
14098 return true;
14099 Instruction *I = dyn_cast<Instruction>(V);
14100 assert(I && "bundle member must be an instruction");
14101 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14103 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14104 "be scheduled");
14105 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14106 ScheduleData *ISD = getScheduleData(I);
14107 if (!ISD)
14108 return false;
14109 assert(isInSchedulingRegion(ISD) &&
14110 "ScheduleData not in scheduling region");
14111 ScheduleData *SD = allocateScheduleDataChunks();
14112 SD->Inst = I;
14113 SD->init(SchedulingRegionID, S.OpValue);
14114 ExtraScheduleDataMap[I][S.OpValue] = SD;
14115 return true;
14116 };
14117 if (CheckScheduleForI(I))
14118 return true;
14119 if (!ScheduleStart) {
14120 // It's the first instruction in the new region.
14121 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
14122 ScheduleStart = I;
14123 ScheduleEnd = I->getNextNode();
14124 if (isOneOf(S, I) != I)
14125 CheckScheduleForI(I);
14126 assert(ScheduleEnd && "tried to vectorize a terminator?");
14127 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
14128 return true;
14129 }
14130 // Search up and down at the same time, because we don't know if the new
14131 // instruction is above or below the existing scheduling region.
14132 // Ignore debug info (and other "AssumeLike" intrinsics) so that they are
14133 // not counted against the budget. Otherwise debug info could affect codegen.
14134 BasicBlock::reverse_iterator UpIter =
14135 ++ScheduleStart->getIterator().getReverse();
14136 BasicBlock::reverse_iterator UpperEnd = BB->rend();
14137 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14138 BasicBlock::iterator LowerEnd = BB->end();
14139 auto IsAssumeLikeIntr = [](const Instruction &I) {
14140 if (auto *II = dyn_cast<IntrinsicInst>(&I))
14141 return II->isAssumeLikeIntrinsic();
14142 return false;
14143 };
14144 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14145 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14146 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14147 &*DownIter != I) {
14148 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14149 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
14150 return false;
14151 }
14152
14153 ++UpIter;
14154 ++DownIter;
14155
14156 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14157 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14158 }
14159 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14160 assert(I->getParent() == ScheduleStart->getParent() &&
14161 "Instruction is in wrong basic block.");
14162 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
14163 ScheduleStart = I;
14164 if (isOneOf(S, I) != I)
14165 CheckScheduleForI(I);
14166 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
14167 << "\n");
14168 return true;
14169 }
14170 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14171 "Expected to reach top of the basic block or instruction down the "
14172 "lower end.");
14173 assert(I->getParent() == ScheduleEnd->getParent() &&
14174 "Instruction is in wrong basic block.");
14175 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
14176 nullptr);
14177 ScheduleEnd = I->getNextNode();
14178 if (isOneOf(S, I) != I)
14179 CheckScheduleForI(I);
14180 assert(ScheduleEnd && "tried to vectorize a terminator?");
14181 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14182 return true;
14183}
14184
14185void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14186 Instruction *ToI,
14187 ScheduleData *PrevLoadStore,
14188 ScheduleData *NextLoadStore) {
14189 ScheduleData *CurrentLoadStore = PrevLoadStore;
14190 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14191 // No need to allocate data for non-schedulable instructions.
14192 if (doesNotNeedToBeScheduled(I))
14193 continue;
14194 ScheduleData *SD = ScheduleDataMap.lookup(I);
14195 if (!SD) {
14196 SD = allocateScheduleDataChunks();
14197 ScheduleDataMap[I] = SD;
14198 SD->Inst = I;
14199 }
14200 assert(!isInSchedulingRegion(SD) &&
14201 "new ScheduleData already in scheduling region");
14202 SD->init(SchedulingRegionID, I);
14203
14204 if (I->mayReadOrWriteMemory() &&
14205 (!isa<IntrinsicInst>(I) ||
14206 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14207 cast<IntrinsicInst>(I)->getIntrinsicID() !=
14208 Intrinsic::pseudoprobe))) {
14209 // Update the linked list of memory accessing instructions.
14210 if (CurrentLoadStore) {
14211 CurrentLoadStore->NextLoadStore = SD;
14212 } else {
14213 FirstLoadStoreInRegion = SD;
14214 }
14215 CurrentLoadStore = SD;
14216 }
14217
14218 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14219 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14220 RegionHasStackSave = true;
14221 }
14222 if (NextLoadStore) {
14223 if (CurrentLoadStore)
14224 CurrentLoadStore->NextLoadStore = NextLoadStore;
14225 } else {
14226 LastLoadStoreInRegion = CurrentLoadStore;
14227 }
14228}
14229
14230void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14231 bool InsertInReadyList,
14232 BoUpSLP *SLP) {
14233 assert(SD->isSchedulingEntity());
14234
14235 SmallVector<ScheduleData *, 10> WorkList;
14236 WorkList.push_back(SD);
14237
14238 while (!WorkList.empty()) {
14239 ScheduleData *SD = WorkList.pop_back_val();
14240 for (ScheduleData *BundleMember = SD; BundleMember;
14241 BundleMember = BundleMember->NextInBundle) {
14242 assert(isInSchedulingRegion(BundleMember));
14243 if (BundleMember->hasValidDependencies())
14244 continue;
14245
14246 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14247 << "\n");
14248 BundleMember->Dependencies = 0;
14249 BundleMember->resetUnscheduledDeps();
14250
14251 // Handle def-use chain dependencies.
14252 if (BundleMember->OpValue != BundleMember->Inst) {
14253 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14254 BundleMember->Dependencies++;
14255 ScheduleData *DestBundle = UseSD->FirstInBundle;
14256 if (!DestBundle->IsScheduled)
14257 BundleMember->incrementUnscheduledDeps(1);
14258 if (!DestBundle->hasValidDependencies())
14259 WorkList.push_back(DestBundle);
14260 }
14261 } else {
14262 for (User *U : BundleMember->Inst->users()) {
14263 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14264 BundleMember->Dependencies++;
14265 ScheduleData *DestBundle = UseSD->FirstInBundle;
14266 if (!DestBundle->IsScheduled)
14267 BundleMember->incrementUnscheduledDeps(1);
14268 if (!DestBundle->hasValidDependencies())
14269 WorkList.push_back(DestBundle);
14270 }
14271 }
14272 }
14273
14274 auto MakeControlDependent = [&](Instruction *I) {
14275 auto *DepDest = getScheduleData(I);
14276 assert(DepDest && "must be in schedule window");
14277 DepDest->ControlDependencies.push_back(BundleMember);
14278 BundleMember->Dependencies++;
14279 ScheduleData *DestBundle = DepDest->FirstInBundle;
14280 if (!DestBundle->IsScheduled)
14281 BundleMember->incrementUnscheduledDeps(1);
14282 if (!DestBundle->hasValidDependencies())
14283 WorkList.push_back(DestBundle);
14284 };
14285
14286 // Any instruction which isn't safe to speculate at the beginning of the
14287 // block is control dependent on any early exit or non-willreturn call
14288 // which precedes it.
14289 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
14290 for (Instruction *I = BundleMember->Inst->getNextNode();
14291 I != ScheduleEnd; I = I->getNextNode()) {
14292 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
14293 continue;
14294
14295 // Add the dependency
14296 MakeControlDependent(I);
14297
14298 if (!isGuaranteedToTransferExecutionToSuccessor(I))
14299 // Everything past here must be control dependent on I.
14300 break;
14301 }
14302 }
14303
14304 if (RegionHasStackSave) {
14305 // If we have an inalloca alloca instruction, it needs to be scheduled
14306 // after any preceding stacksave. We also need to prevent any alloca
14307 // from reordering above a preceding stackrestore.
14308 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14309 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14310 for (Instruction *I = BundleMember->Inst->getNextNode();
14311 I != ScheduleEnd; I = I->getNextNode()) {
14312 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14313 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14314 // Any allocas past here must be control dependent on I, and I
14315 // must be memory dependent on BundleMember->Inst.
14316 break;
14317
14318 if (!isa<AllocaInst>(I))
14319 continue;
14320
14321 // Add the dependency
14322 MakeControlDependent(I);
14323 }
14324 }
14325
14326 // In addition to the cases handled just above, we need to prevent
14327 // allocas and loads/stores from moving below a stacksave or a
14328 // stackrestore. Avoiding moving allocas below stackrestore is currently
14329 // thought to be conservatism. Moving loads/stores below a stackrestore
14330 // can lead to incorrect code.
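// Hypothetical example of the hazard (simplified IR, not from the source):
//   %ss = call ptr @llvm.stacksave()
//   %a  = alloca i32
//   store i32 1, ptr %a
//   call void @llvm.stackrestore(ptr %ss)
// Sinking the store below the stackrestore would write to stack memory that
// the restore has already released, so a control dependency on the
// stacksave/stackrestore is recorded instead.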
14331 if (isa<AllocaInst>(BundleMember->Inst) ||
14332 BundleMember->Inst->mayReadOrWriteMemory()) {
14333 for (Instruction *I = BundleMember->Inst->getNextNode();
14334 I != ScheduleEnd; I = I->getNextNode()) {
14335 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14336 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14337 continue;
14338
14339 // Add the dependency
14340 MakeControlDependent(I);
14341 break;
14342 }
14343 }
14344 }
14345
14346 // Handle the memory dependencies (if any).
14347 ScheduleData *DepDest = BundleMember->NextLoadStore;
14348 if (!DepDest)
14349 continue;
14350 Instruction *SrcInst = BundleMember->Inst;
14351 assert(SrcInst->mayReadOrWriteMemory() &&
14352 "NextLoadStore list for non memory effecting bundle?");
14353 MemoryLocation SrcLoc = getLocation(SrcInst);
14354 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14355 unsigned NumAliased = 0;
14356 unsigned DistToSrc = 1;
14357
14358 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14359 assert(isInSchedulingRegion(DepDest));
14360
14361 // We have two limits to reduce the complexity:
14362 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14363 // SLP->isAliased (which is the expensive part in this loop).
14364 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14365 // the whole loop (even if the loop is fast, it's quadratic).
14366 // It's important for the loop break condition (see below) to
14367 // check this limit even between two read-only instructions.
14368 if (DistToSrc >= MaxMemDepDistance ||
14369 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14370 (NumAliased >= AliasedCheckLimit ||
14371 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14372
14373 // We increment the counter only if the locations are aliased
14374 // (instead of counting all alias checks). This gives a better
14375 // balance between reduced runtime and accurate dependencies.
14376 NumAliased++;
14377
14378 DepDest->MemoryDependencies.push_back(BundleMember);
14379 BundleMember->Dependencies++;
14380 ScheduleData *DestBundle = DepDest->FirstInBundle;
14381 if (!DestBundle->IsScheduled) {
14382 BundleMember->incrementUnscheduledDeps(1);
14383 }
14384 if (!DestBundle->hasValidDependencies()) {
14385 WorkList.push_back(DestBundle);
14386 }
14387 }
14388
14389 // Example, explaining the loop break condition: Let's assume our
14390 // starting instruction is i0 and MaxMemDepDistance = 3.
14391 //
14392 // +--------v--v--v
14393 // i0,i1,i2,i3,i4,i5,i6,i7,i8
14394 // +--------^--^--^
14395 //
14396 // MaxMemDepDistance let us stop alias-checking at i3 and we add
14397 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14398 // Previously we already added dependencies from i3 to i6,i7,i8
14399 // (because of MaxMemDepDistance). As we added a dependency from
14400 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14401 // and we can abort this loop at i6.
14402 if (DistToSrc >= 2 * MaxMemDepDistance)
14403 break;
14404 DistToSrc++;
14405 }
14406 }
14407 if (InsertInReadyList && SD->isReady()) {
14408 ReadyInsts.insert(SD);
14409 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
14410 << "\n");
14411 }
14412 }
14413}
14414
14415void BoUpSLP::BlockScheduling::resetSchedule() {
14416 assert(ScheduleStart &&
14417 "tried to reset schedule on block which has not been scheduled");
14418 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14419 doForAllOpcodes(I, [&](ScheduleData *SD) {
14420 assert(isInSchedulingRegion(SD) &&
14421 "ScheduleData not in scheduling region");
14422 SD->IsScheduled = false;
14423 SD->resetUnscheduledDeps();
14424 });
14425 }
14426 ReadyInsts.clear();
14427}
14428
14429void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14430 if (!BS->ScheduleStart)
14431 return;
14432
14433 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14434
14435 // A key point - if we got here, pre-scheduling was able to find a valid
14436 // scheduling of the sub-graph of the scheduling window which consists
14437 // of all vector bundles and their transitive users. As such, we do not
14438 // need to reschedule anything *outside of* that subgraph.
14439
14440 BS->resetSchedule();
14441
14442 // For the real scheduling we use a more sophisticated ready-list: it is
14443 // sorted by the original instruction location. This lets the final schedule
14444 // be as close as possible to the original instruction order.
14445 // WARNING: If changing this order causes a correctness issue, that means
14446 // there is some missing dependence edge in the schedule data graph.
14447 struct ScheduleDataCompare {
14448 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14449 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14450 }
14451 };
14452 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14453
14454 // Ensure that all dependency data is updated (for nodes in the sub-graph)
14455 // and fill the ready-list with initial instructions.
14456 int Idx = 0;
14457 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14458 I = I->getNextNode()) {
14459 BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
14460 TreeEntry *SDTE = getTreeEntry(SD->Inst);
14461 (void)SDTE;
14462 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14463 SD->isPartOfBundle() ==
14464 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14465 "scheduler and vectorizer bundle mismatch");
14466 SD->FirstInBundle->SchedulingPriority = Idx++;
14467
14468 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14469 BS->calculateDependencies(SD, false, this);
14470 });
14471 }
14472 BS->initialFillReadyList(ReadyInsts);
14473
14474 Instruction *LastScheduledInst = BS->ScheduleEnd;
14475
14476 // Do the "real" scheduling.
14477 while (!ReadyInsts.empty()) {
14478 ScheduleData *Picked = *ReadyInsts.begin();
14479 ReadyInsts.erase(ReadyInsts.begin());
14480
14481 // Move the scheduled instruction(s) to their dedicated places, if not
14482 // there yet.
14483 for (ScheduleData *BundleMember = Picked; BundleMember;
14484 BundleMember = BundleMember->NextInBundle) {
14485 Instruction *PickedInst = BundleMember->Inst;
14486 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
14487 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
14488 LastScheduledInst = PickedInst;
14489 }
14490
14491 BS->schedule(Picked, ReadyInsts);
14492 }
14493
14494 // Check that we didn't break any of our invariants.
14495#ifdef EXPENSIVE_CHECKS
14496 BS->verify();
14497#endif
14498
14499#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
14500 // Check that all schedulable entities got scheduled
14501 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
14502 BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
14503 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
14504 assert(SD->IsScheduled && "must be scheduled at this point");
14505 }
14506 });
14507 }
14508#endif
14509
14510 // Avoid duplicate scheduling of the block.
14511 BS->ScheduleStart = nullptr;
14512}
14513
14514unsigned BoUpSLP::getVectorElementSize(Value *V) {
14515 // If V is a store, just return the width of the stored value (or value
14516 // truncated just before storing) without traversing the expression tree.
14517 // This is the common case.
14518 if (auto *Store = dyn_cast<StoreInst>(V))
14519 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
14520
14521 if (auto *IEI = dyn_cast<InsertElementInst>(V))
14522 return getVectorElementSize(IEI->getOperand(1));
14523
14524 auto E = InstrElementSize.find(V);
14525 if (E != InstrElementSize.end())
14526 return E->second;
14527
14528 // If V is not a store, we can traverse the expression tree to find loads
14529 // that feed it. The type of the loaded value may indicate a more suitable
14530 // width than V's type. We want to base the vector element size on the width
14531 // of memory operations where possible.
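// Worked example (illustrative, not from the original source): for
//   %l = load i8, ptr %p
//   %e = zext i8 %l to i32
// in the same block, querying %e walks down to the load and reports 8 bits
// rather than the 32 bits of %e's own type, so the element size is based on
// the narrow memory access.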
14532 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
14533 SmallPtrSet<Instruction *, 16> Visited;
14534 if (auto *I = dyn_cast<Instruction>(V)) {
14535 Worklist.emplace_back(I, I->getParent(), 0);
14536 Visited.insert(I);
14537 }
14538
14539 // Traverse the expression tree in bottom-up order looking for loads. If we
14540 // encounter an instruction we don't yet handle, we give up.
14541 auto Width = 0u;
14542 Value *FirstNonBool = nullptr;
14543 while (!Worklist.empty()) {
14544 auto [I, Parent, Level] = Worklist.pop_back_val();
14545
14546 // We should only be looking at scalar instructions here. If the current
14547 // instruction has a vector type, skip.
14548 auto *Ty = I->getType();
14549 if (isa<VectorType>(Ty))
14550 continue;
14551 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
14552 FirstNonBool = I;
14553 if (Level > RecursionMaxDepth)
14554 continue;
14555
14556 // If the current instruction is a load, update Width to reflect the
14557 // width of the loaded value.
14558 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
14559 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
14560
14561 // Otherwise, we need to visit the operands of the instruction. We only
14562 // handle the interesting cases from buildTree here. If an operand is an
14563 // instruction we haven't yet visited and from the same basic block as the
14564 // user or the use is a PHI node, we add it to the worklist.
14565 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
14566 BinaryOperator, UnaryOperator>(I)) {
14567 for (Use &U : I->operands()) {
14568 if (auto *J = dyn_cast<Instruction>(U.get()))
14569 if (Visited.insert(J).second &&
14570 (isa<PHINode>(I) || J->getParent() == Parent)) {
14571 Worklist.emplace_back(J, J->getParent(), Level + 1);
14572 continue;
14573 }
14574 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
14575 FirstNonBool = U.get();
14576 }
14577 } else {
14578 break;
14579 }
14580 }
14581
14582 // If we didn't encounter a memory access in the expression tree, or if we
14583 // gave up for some reason, just return the width of V. Otherwise, return the
14584 // maximum width we found.
14585 if (!Width) {
14586 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
14587 V = FirstNonBool;
14588 Width = DL->getTypeSizeInBits(V->getType());
14589 }
14590
14591 for (Instruction *I : Visited)
14592 InstrElementSize[I] = Width;
14593
14594 return Width;
14595}
14596
14597bool BoUpSLP::collectValuesToDemote(
14598 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
14599 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
14600 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
14601 bool IsTruncRoot) const {
14602 // We can always demote constants.
14603 if (all_of(E.Scalars, IsaPred<Constant>))
14604 return true;
14605
14606 unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
14607 if (OrigBitWidth == BitWidth) {
14608 MaxDepthLevel = 1;
14609 return true;
14610 }
14611
14612 // If the value is not a vectorized instruction in the expression, is not
14613 // used by an insertelement instruction and is not used in multiple vector
14614 // nodes, it cannot be demoted.
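// Illustrative example of demotion (an assumption, not from the source): in
//   %a = add i32 %x, %y
//   %t = trunc i32 %a to i16
// only the low 16 bits of %a are demanded, so the add can be performed in
// i16 and BitWidth for this entry can shrink accordingly. The helper below
// checks, per scalar, whether such a narrower width is safe.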
14615 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
14616 if (MultiNodeScalars.contains(V))
14617 return false;
14618 if (OrigBitWidth > BitWidth) {
14619 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14620 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14621 return true;
14622 }
14623 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
14624 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14625 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*DL));
14626 if (IsSigned)
14627 ++BitWidth1;
14628 if (auto *I = dyn_cast<Instruction>(V)) {
14629 APInt Mask = DB->getDemandedBits(I);
14630 unsigned BitWidth2 =
14631 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
14632 while (!IsSigned && BitWidth2 < OrigBitWidth) {
14633 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
14634 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14635 break;
14636 BitWidth2 *= 2;
14637 }
14638 BitWidth1 = std::min(BitWidth1, BitWidth2);
14639 }
14640 BitWidth = std::max(BitWidth, BitWidth1);
14641 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
14642 };
14643 using namespace std::placeholders;
14644 auto FinalAnalysis = [&]() {
14645 if (!IsProfitableToDemote)
14646 return false;
14647 bool Res = all_of(
14648 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
14649 // Demote gathers.
14650 if (Res && E.State == TreeEntry::NeedToGather) {
14651 // Check possible extractelement instructions bases and final vector
14652 // length.
14653 SmallPtrSet<Value *, 4> UniqueBases;
14654 for (Value *V : E.Scalars) {
14655 auto *EE = dyn_cast<ExtractElementInst>(V);
14656 if (!EE)
14657 continue;
14658 UniqueBases.insert(EE->getVectorOperand());
14659 }
14660 const unsigned VF = E.Scalars.size();
14661 Type *OrigScalarTy = E.Scalars.front()->getType();
14662 if (UniqueBases.size() <= 2 ||
14663 TTI->getNumberOfParts(FixedVectorType::get(OrigScalarTy, VF)) ==
14664 TTI->getNumberOfParts(FixedVectorType::get(
14665 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
14666 ToDemote.push_back(E.Idx);
14667 }
14668 return Res;
14669 };
14670 if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
14671 any_of(E.Scalars, [&](Value *V) {
14672 return all_of(V->users(), [&](User *U) {
14673 return isa<InsertElementInst>(U) && !getTreeEntry(U);
14674 });
14675 }))
14676 return FinalAnalysis();
14677
14678 if (any_of(E.Scalars, [&](Value *V) {
14679 return !all_of(V->users(), [=](User *U) {
14680 return getTreeEntry(U) ||
14681 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14682 (!isa<CmpInst>(U) && U->getType()->isSized() &&
14683 !U->getType()->isScalableTy() &&
14684 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
14685 }) && !IsPotentiallyTruncated(V, BitWidth);
14686 }))
14687 return false;
14688
14689 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
14690 bool &NeedToExit) {
14691 NeedToExit = false;
14692 unsigned InitLevel = MaxDepthLevel;
14693 for (const TreeEntry *Op : Operands) {
14694 unsigned Level = InitLevel;
14695 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
14696 ToDemote, Visited, Level, IsProfitableToDemote,
14697 IsTruncRoot)) {
14698 if (!IsProfitableToDemote)
14699 return false;
14700 NeedToExit = true;
14701 if (!FinalAnalysis())
14702 return false;
14703 continue;
14704 }
14705 MaxDepthLevel = std::max(MaxDepthLevel, Level);
14706 }
14707 return true;
14708 };
14709 auto AttemptCheckBitwidth =
14710 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
14711 // Try all bitwidth < OrigBitWidth.
14712 NeedToExit = false;
14713 unsigned BestFailBitwidth = 0;
14714 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
14715 if (Checker(BitWidth, OrigBitWidth))
14716 return true;
14717 if (BestFailBitwidth == 0 && FinalAnalysis())
14718 BestFailBitwidth = BitWidth;
14719 }
14720 if (BitWidth >= OrigBitWidth) {
14721 if (BestFailBitwidth == 0) {
14722 BitWidth = OrigBitWidth;
14723 return false;
14724 }
14725 MaxDepthLevel = 1;
14726 BitWidth = BestFailBitwidth;
14727 NeedToExit = true;
14728 return true;
14729 }
14730 return false;
14731 };
14732 auto TryProcessInstruction =
14733 [&](unsigned &BitWidth,
14734 ArrayRef<const TreeEntry *> Operands = std::nullopt,
14735 function_ref<bool(unsigned, unsigned)> Checker = {}) {
14736 if (Operands.empty()) {
14737 if (!IsTruncRoot)
14738 MaxDepthLevel = 1;
14739 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
14740 std::ref(BitWidth)));
14741 } else {
14742 // Several vectorized uses? Check if we can truncate it, otherwise -
14743 // exit.
14744 if (E.UserTreeIndices.size() > 1 &&
14745 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
14746 std::ref(BitWidth))))
14747 return false;
14748 bool NeedToExit = false;
14749 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
14750 return false;
14751 if (NeedToExit)
14752 return true;
14753 if (!ProcessOperands(Operands, NeedToExit))
14754 return false;
14755 if (NeedToExit)
14756 return true;
14757 }
14758
14759 ++MaxDepthLevel;
14760 // Record the entry that we can demote.
14761 ToDemote.push_back(E.Idx);
14762 return IsProfitableToDemote;
14763 };
14764 switch (E.getOpcode()) {
14765
14766 // We can always demote truncations and extensions. Since truncations can
14767 // seed additional demotion, we save the truncated value.
14768 case Instruction::Trunc:
14769 if (IsProfitableToDemoteRoot)
14770 IsProfitableToDemote = true;
14771 return TryProcessInstruction(BitWidth);
14772 case Instruction::ZExt:
14773 case Instruction::SExt:
14774 IsProfitableToDemote = true;
14775 return TryProcessInstruction(BitWidth);
14776
14777 // We can demote certain binary operations if we can demote both of their
14778 // operands.
14779 case Instruction::Add:
14780 case Instruction::Sub:
14781 case Instruction::Mul:
14782 case Instruction::And:
14783 case Instruction::Or:
14784 case Instruction::Xor: {
14785 return TryProcessInstruction(
14786 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
14787 }
14788 case Instruction::Shl: {
14789 // If we are truncating the result of this SHL, and if it's a shift of an
14790 // in-range amount, we can always perform a SHL in a smaller type.
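// Hypothetical example (not from the original source): for
//   %s = shl i32 %x, 3
//   %t = trunc i32 %s to i16
// the shift amount 3 is known to be < 16, so the shift can instead be done as
//   %x16 = trunc i32 %x to i16
//   %s16 = shl i16 %x16, 3
// which is exactly what the known-bits check on the shift amount guards.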
14791 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
14792 return all_of(E.Scalars, [&](Value *V) {
14793 auto *I = cast<Instruction>(V);
14794 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14795 return AmtKnownBits.getMaxValue().ult(BitWidth);
14796 });
14797 };
14798 return TryProcessInstruction(
14799 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
14800 }
14801 case Instruction::LShr: {
14802 // If this is a truncate of a logical shr, we can truncate it to a smaller
14803 // lshr iff we know that the bits we would otherwise be shifting in are
14804 // already zeros.
14805 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14806 return all_of(E.Scalars, [&](Value *V) {
14807 auto *I = cast<Instruction>(V);
14808 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14809 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14810 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
14811 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
14812 SimplifyQuery(*DL));
14813 });
14814 };
14815 return TryProcessInstruction(
14816 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
14817 LShrChecker);
14818 }
14819 case Instruction::AShr: {
14820 // If this is a truncate of an arithmetic shr, we can truncate it to a
14821 // smaller ashr iff we know that all the bits from the sign bit of the
14822 // original type and the sign bit of the truncate type are similar.
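// Hypothetical example (not from the original source): an
//   ashr i32 %x, 2
// can be performed as an i16 ashr only if the bits shifted in from the i32
// sign bit match the i16 sign bit, i.e. %x has more than
// OrigBitWidth - BitWidth = 16 known sign bits, which is what the
// ComputeNumSignBits check below verifies.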
14823 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14824 return all_of(E.Scalars, [&](Value *V) {
14825 auto *I = cast<Instruction>(V);
14826 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14827 unsigned ShiftedBits = OrigBitWidth - BitWidth;
14828 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
14829 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
14830 nullptr, DT);
14831 });
14832 };
14833 return TryProcessInstruction(
14834 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
14835 AShrChecker);
14836 }
14837 case Instruction::UDiv:
14838 case Instruction::URem: {
14839 // UDiv and URem can be truncated if all the truncated bits are zero.
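// Hypothetical example (not from the original source): a udiv of two i32
// values whose upper 16 bits are known zero (e.g. both masked by
// "and i32 %v, 65535") produces the same result when performed as an i16
// udiv, which is what the MaskedValueIsZero checks on both operands verify.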
14840 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14841 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14842 return all_of(E.Scalars, [&](Value *V) {
14843 auto *I = cast<Instruction>(V);
14844 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14845 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
14846 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
14847 });
14848 };
14849 return TryProcessInstruction(
14850 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
14851 }
14852
14853 // We can demote selects if we can demote their true and false values.
14854 case Instruction::Select: {
14855 return TryProcessInstruction(
14856 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
14857 }
14858
14859 // We can demote phis if we can demote all their incoming operands. Note that
14860 // we don't need to worry about cycles since we ensure single use above.
14861 case Instruction::PHI: {
14862 const unsigned NumOps = E.getNumOperands();
14863 SmallVector<const TreeEntry *> Ops(NumOps);
14864 transform(seq<unsigned>(0, NumOps), Ops.begin(),
14865 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
14866
14867 return TryProcessInstruction(BitWidth, Ops);
14868 }
14869
14870 case Instruction::Call: {
14871 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
14872 if (!IC)
14873 break;
14874 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
14875 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
14876 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
14877 break;
14878 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
14879 function_ref<bool(unsigned, unsigned)> CallChecker;
14880 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14881 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14882 return all_of(E.Scalars, [&](Value *V) {
14883 auto *I = cast<Instruction>(V);
14884 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
14885 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14886 return MaskedValueIsZero(I->getOperand(0), Mask,
14887 SimplifyQuery(*DL)) &&
14888 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
14889 }
14890 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
14891 "Expected min/max intrinsics only.");
14892 unsigned SignBits = OrigBitWidth - BitWidth;
14893 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
14894 return SignBits <= ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
14895 nullptr, DT) &&
14896 (!isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL)) ||
14897 MaskedValueIsZero(I->getOperand(0), Mask,
14898 SimplifyQuery(*DL))) &&
14899 SignBits <= ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
14900 nullptr, DT) &&
14901 (!isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL)) ||
14902 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
14903 });
14904 };
14905 if (ID != Intrinsic::abs) {
14906 Operands.push_back(getOperandEntry(&E, 1));
14907 CallChecker = CompChecker;
14908 }
14909 InstructionCost BestCost =
14910 std::numeric_limits<InstructionCost::CostType>::max();
14911 unsigned BestBitWidth = BitWidth;
14912 unsigned VF = E.Scalars.size();
14913 // Choose the best bitwidth based on cost estimations.
14914 auto Checker = [&](unsigned BitWidth, unsigned) {
14915 unsigned MinBW = PowerOf2Ceil(BitWidth);
14916 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
14917 auto VecCallCosts = getVectorCallCosts(
14918 IC,
14919 FixedVectorType::get(IntegerType::get(IC->getContext(), MinBW), VF),
14920 TTI, TLI, ArgTys);
14921 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
14922 if (Cost < BestCost) {
14923 BestCost = Cost;
14924 BestBitWidth = BitWidth;
14925 }
14926 return false;
14927 };
14928 [[maybe_unused]] bool NeedToExit;
14929 (void)AttemptCheckBitwidth(Checker, NeedToExit);
14930 BitWidth = BestBitWidth;
14931 return TryProcessInstruction(BitWidth, Operands, CallChecker);
14932 }
14933
14934 // Otherwise, conservatively give up.
14935 default:
14936 break;
14937 }
14938 MaxDepthLevel = 1;
14939 return FinalAnalysis();
14940}
14941
14942static RecurKind getRdxKind(Value *V);
14943
14944void BoUpSLP::computeMinimumValueSizes() {
14945 // We only attempt to truncate integer expressions.
14946 bool IsStoreOrInsertElt =
14947 VectorizableTree.front()->getOpcode() == Instruction::Store ||
14948 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
14949 if ((IsStoreOrInsertElt || UserIgnoreList) &&
14950 ExtraBitWidthNodes.size() <= 1 &&
14951 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
14952 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
14953 return;
14954
14955 unsigned NodeIdx = 0;
14956 if (IsStoreOrInsertElt &&
14957 VectorizableTree.front()->State != TreeEntry::NeedToGather)
14958 NodeIdx = 1;
14959
14960 // Ensure the roots of the vectorizable tree don't form a cycle.
14961 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
14962 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
14963 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
14964 [NodeIdx](const EdgeInfo &EI) {
14965 return EI.UserTE->Idx >
14966 static_cast<int>(NodeIdx);
14967 })))
14968 return;
14969
14970 // If the first value node for store/insertelement is sext/zext/trunc, skip it
14971 // and resize to the final type.
14972 bool IsTruncRoot = false;
14973 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
14974 SmallVector<unsigned> RootDemotes;
14975 if (NodeIdx != 0 &&
14976 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
14977 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
14978 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
14979 IsTruncRoot = true;
14980 RootDemotes.push_back(NodeIdx);
14981 IsProfitableToDemoteRoot = true;
14982 ++NodeIdx;
14983 }
14984
14985 // The reduction was already analyzed and found not profitable - exit.
14986 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
14987 return;
14988
14989 SmallVector<unsigned> ToDemote;
14990 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
14991 bool IsProfitableToDemoteRoot, unsigned Opcode,
14992 unsigned Limit, bool IsTruncRoot,
14993 bool IsSignedCmp) {
14994 ToDemote.clear();
14995 unsigned VF = E.getVectorFactor();
14996 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
14997 if (!TreeRootIT || !Opcode)
14998 return 0u;
14999
15000 if (any_of(E.Scalars,
15001 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
15002 return 0u;
15003
15004 unsigned NumParts =
15005 TTI->getNumberOfParts(FixedVectorType::get(TreeRootIT, VF));
15006
15007 // The maximum bit width required to represent all the values that can be
15008 // demoted without loss of precision. It would be safe to truncate the roots
15009 // of the expression to this width.
15010 unsigned MaxBitWidth = 1u;
15011
15012 // True if the roots can be zero-extended back to their original type,
15013 // rather than sign-extended. We know that if the leading bits are not
15014 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
15015 // True.
15016 // Determine if the sign bit of all the roots is known to be zero. If not,
15017 // IsKnownPositive is set to False.
15018 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
15019 KnownBits Known = computeKnownBits(R, *DL);
15020 return Known.isNonNegative();
15021 });
15022
15023 // We first check if all the bits of the roots are demanded. If they're not,
15024 // we can truncate the roots to this narrower type.
15025 for (Value *Root : E.Scalars) {
15026 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
15027 TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
15028 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15029 // If we can't prove that the sign bit is zero, we must add one to the
15030 // maximum bit width to account for the unknown sign bit. This preserves
15031 // the existing sign bit so we can safely sign-extend the root back to the
15032 // original type. Otherwise, if we know the sign bit is zero, we will
15033 // zero-extend the root instead.
15034 //
15035 // FIXME: This is somewhat suboptimal, as there will be cases where adding
15036 // one to the maximum bit width will yield a larger-than-necessary
15037 // type. In general, we need to add an extra bit only if we can't
15038 // prove that the upper bit of the original type is equal to the
15039 // upper bit of the proposed smaller type. If these two bits are
15040 // the same (either zero or one) we know that sign-extending from
15041 // the smaller type will result in the same value. Here, since we
15042 // can't yet prove this, we are just making the proposed smaller
15043 // type larger to ensure correctness.
15044 if (!IsKnownPositive)
15045 ++BitWidth1;
15046
15047 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
15048 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15049 MaxBitWidth =
15050 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
15051 }
15052
15053 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15054 MaxBitWidth = 8;
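// Note: widths in the 2..7 range are rounded up to 8 here because sub-byte
// integer vectors are generally not legal for targets and would be promoted
// anyway; a width of 1 (i1) is left untouched.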
15055
15056 // If the original type is large but the reduced type does not improve the
15057 // register use - ignore it.
15058 if (NumParts > 1 &&
15059 NumParts ==
15060 TTI->getNumberOfParts(FixedVectorType::get(
15061 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
15062 return 0u;
15063
15064 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15065 Opcode == Instruction::SExt ||
15066 Opcode == Instruction::ZExt || NumParts > 1;
15067 // Conservatively determine if we can actually truncate the roots of the
15068 // expression. Collect the values that can be demoted in ToDemote and
15069 // additional roots that require investigating in Roots.
15070 DenseSet<const TreeEntry *> Visited;
15071 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15072 bool NeedToDemote = IsProfitableToDemote;
15073
15074 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
15075 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
15076 IsTruncRoot) ||
15077 (MaxDepthLevel <= Limit &&
15078 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15079 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15080 DL->getTypeSizeInBits(TreeRootIT) /
15081 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
15082 ->getOperand(0)
15083 ->getType()) >
15084 2)))))
15085 return 0u;
15086 // Round MaxBitWidth up to the next power-of-two.
15087 MaxBitWidth = bit_ceil(MaxBitWidth);
15088
15089 return MaxBitWidth;
15090 };
15091
15092 // If we can truncate the root, we must collect additional values that might
15093 // be demoted as a result. That is, those seeded by truncations we will
15094 // modify.
15095 // Add reduction ops sizes, if any.
15096 if (UserIgnoreList &&
15097 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
15098 for (Value *V : *UserIgnoreList) {
15099 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15100 auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
15101 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15102 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
15103 ++BitWidth1;
15104 unsigned BitWidth2 = BitWidth1;
15105 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
15106 auto Mask = DB->getDemandedBits(cast<Instruction>(V));
15107 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15108 }
15109 ReductionBitWidth =
15110 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15111 }
15112 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15113 ReductionBitWidth = 8;
15114
15115 ReductionBitWidth = bit_ceil(ReductionBitWidth);
15116 }
15117 bool IsTopRoot = NodeIdx == 0;
15118 while (NodeIdx < VectorizableTree.size() &&
15119 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15120 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15121 RootDemotes.push_back(NodeIdx);
15122 ++NodeIdx;
15123 IsTruncRoot = true;
15124 }
15125 bool IsSignedCmp = false;
15126 while (NodeIdx < VectorizableTree.size()) {
15127 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15128 unsigned Limit = 2;
15129 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15130 if (IsTopRoot &&
15131 ReductionBitWidth ==
15132 DL->getTypeSizeInBits(
15133 VectorizableTree.front()->Scalars.front()->getType()))
15134 Limit = 3;
15135 unsigned MaxBitWidth = ComputeMaxBitWidth(
15136 *VectorizableTree[NodeIdx].get(), IsTopRoot, IsProfitableToDemoteRoot,
15137 Opcode, Limit, IsTruncRoot, IsSignedCmp);
15138 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15139 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15140 ReductionBitWidth = bit_ceil(MaxBitWidth);
15141 else if (MaxBitWidth == 0)
15142 ReductionBitWidth = 0;
15143 }
15144
15145 for (unsigned Idx : RootDemotes) {
15146 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
15147 uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
15148 if (OrigBitWidth > MaxBitWidth) {
15149 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
15150 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
15151 }
15152 return false;
15153 }))
15154 ToDemote.push_back(Idx);
15155 }
15156 RootDemotes.clear();
15157 IsTopRoot = false;
15158 IsProfitableToDemoteRoot = true;
15159
15160 if (ExtraBitWidthNodes.empty()) {
15161 NodeIdx = VectorizableTree.size();
15162 } else {
15163 unsigned NewIdx = 0;
15164 do {
15165 NewIdx = *ExtraBitWidthNodes.begin();
15166 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
15167 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15168 NodeIdx = NewIdx;
15169 IsTruncRoot =
15170 NodeIdx < VectorizableTree.size() &&
15171 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15172 [](const EdgeInfo &EI) {
15173 return EI.EdgeIdx == 0 &&
15174 EI.UserTE->getOpcode() == Instruction::Trunc &&
15175 !EI.UserTE->isAltShuffle();
15176 });
15177 IsSignedCmp =
15178 NodeIdx < VectorizableTree.size() &&
15179 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15180 [&](const EdgeInfo &EI) {
15181 return EI.UserTE->getOpcode() == Instruction::ICmp &&
15182 any_of(EI.UserTE->Scalars, [&](Value *V) {
15183 auto *IC = dyn_cast<ICmpInst>(V);
15184 return IC &&
15185 (IC->isSigned() ||
15186 !isKnownNonNegative(IC->getOperand(0),
15187 SimplifyQuery(*DL)) ||
15188 !isKnownNonNegative(IC->getOperand(1),
15189 SimplifyQuery(*DL)));
15190 });
15191 });
15192 }
15193
15194 // If the maximum bit width we compute is less than the width of the roots'
15195 // type, we can proceed with the narrowing. Otherwise, do nothing.
15196 if (MaxBitWidth == 0 ||
15197 MaxBitWidth >=
15198 cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
15199 if (UserIgnoreList)
15200 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
15201 continue;
15202 }
15203
15204 // Finally, map the values we can demote to the maximum bit width we
15205 // computed.
15206 for (unsigned Idx : ToDemote) {
15207 TreeEntry *TE = VectorizableTree[Idx].get();
15208 if (MinBWs.contains(TE))
15209 continue;
15210 bool IsSigned = TE->getOpcode() == Instruction::SExt ||
15211 any_of(TE->Scalars, [&](Value *R) {
15212 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15213 });
15214 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
15215 }
15216 }
15217}
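// A rough end-to-end sketch of the effect of the analysis above (hypothetical
// IR, not taken from this file): for a store-rooted tree such as
//   %z = zext i8 %a to i32
//   %s = add i32 %z, 42
//   %t = trunc i32 %s to i8
//   store i8 %t, ptr %p
// the demotion walk can record an 8-bit entry in MinBWs for the add node, so
// later codegen may emit the whole chain as i8 vector arithmetic instead of
// widening to i32.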
15218
15219PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15220 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
15221 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
15222 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
15223 auto *AA = &AM.getResult<AAManager>(F);
15224 auto *LI = &AM.getResult<LoopAnalysis>(F);
15225 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
15226 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
15227 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
15228 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
15229
15230 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
15231 if (!Changed)
15232 return PreservedAnalyses::all();
15233
15234 PreservedAnalyses PA;
15235 PA.preserveSet<CFGAnalyses>();
15236 return PA;
15237}
15238
15239bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15240 TargetTransformInfo *TTI_,
15241 TargetLibraryInfo *TLI_, AAResults *AA_,
15242 LoopInfo *LI_, DominatorTree *DT_,
15243 AssumptionCache *AC_, DemandedBits *DB_,
15244 OptimizationRemarkEmitter *ORE_) {
15245 if (!RunSLPVectorization)
15246 return false;
15247 SE = SE_;
15248 TTI = TTI_;
15249 TLI = TLI_;
15250 AA = AA_;
15251 LI = LI_;
15252 DT = DT_;
15253 AC = AC_;
15254 DB = DB_;
15255 DL = &F.getParent()->getDataLayout();
15256
15257 Stores.clear();
15258 GEPs.clear();
15259 bool Changed = false;
15260
15261 // If the target claims to have no vector registers don't attempt
15262 // vectorization.
15263 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
15264 LLVM_DEBUG(
15265 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15266 return false;
15267 }
15268
15269 // Don't vectorize when the attribute NoImplicitFloat is used.
15270 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
15271 return false;
15272
15273 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15274
15275 // Use the bottom up slp vectorizer to construct chains that start with
15276 // store instructions.
15277 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15278
15279 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15280 // delete instructions.
15281
15282 // Update DFS numbers now so that we can use them for ordering.
15283 DT->updateDFSNumbers();
15284
15285 // Scan the blocks in the function in post order.
15286 for (auto *BB : post_order(&F.getEntryBlock())) {
15287 // Start new block - clear the list of reduction roots.
15288 R.clearReductionData();
15289 collectSeedInstructions(BB);
15290
15291 // Vectorize trees that end at stores.
15292 if (!Stores.empty()) {
15293 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15294 << " underlying objects.\n");
15295 Changed |= vectorizeStoreChains(R);
15296 }
15297
15298 // Vectorize trees that end at reductions.
15299 Changed |= vectorizeChainsInBlock(BB, R);
15300
15301 // Vectorize the index computations of getelementptr instructions. This
15302 // is primarily intended to catch gather-like idioms ending at
15303 // non-consecutive loads.
15304 if (!GEPs.empty()) {
15305 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15306 << " underlying objects.\n");
15307 Changed |= vectorizeGEPIndices(BB, R);
15308 }
15309 }
15310
15311 if (Changed) {
15312 R.optimizeGatherSequence();
15313 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15314 }
15315 return Changed;
15316}
15317
15318std::optional<bool>
15319SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15320 unsigned Idx, unsigned MinVF,
15321 unsigned &Size) {
15322 Size = 0;
15323 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15324 << "\n");
15325 const unsigned Sz = R.getVectorElementSize(Chain[0]);
15326 unsigned VF = Chain.size();
15327
15328 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
15329 // Check if vectorizing with a non-power-of-2 VF should be considered. At
15330 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15331 // all vector lanes are used.
15332 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15333 return false;
15334 }
15335
15336 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15337 << "\n");
15338
15339 SetVector<Value *> ValOps;
15340 for (Value *V : Chain)
15341 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
15342 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
15343 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
15344 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
15345 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
15346 bool IsPowerOf2 =
15347 isPowerOf2_32(ValOps.size()) ||
15348 (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
15349 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15350 (!S.MainOp->isSafeToRemove() ||
15351 any_of(ValOps.getArrayRef(),
15352 [&](Value *V) {
15353 return !isa<ExtractElementInst>(V) &&
15354 (V->getNumUses() > Chain.size() ||
15355 any_of(V->users(), [&](User *U) {
15356 return !Stores.contains(U);
15357 }));
15358 }))) ||
15359 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
15360 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15361 return false;
15362 }
15363 }
15364 if (R.isLoadCombineCandidate(Chain))
15365 return true;
15366 R.buildTree(Chain);
15367 // Check if the tree is tiny and the store itself or its value is not vectorized.
15368 if (R.isTreeTinyAndNotFullyVectorizable()) {
15369 if (R.isGathered(Chain.front()) ||
15370 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
15371 return std::nullopt;
15372 Size = R.getTreeSize();
15373 return false;
15374 }
15375 R.reorderTopToBottom();
15376 R.reorderBottomToTop();
15377 R.buildExternalUses();
15378
15379 R.computeMinimumValueSizes();
15380 R.transformNodes();
15381
15382 Size = R.getTreeSize();
15383 if (S.getOpcode() == Instruction::Load)
15384 Size = 2; // cut off masked gather small trees
15385 InstructionCost Cost = R.getTreeCost();
15386
15387 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15388 if (Cost < -SLPCostThreshold) {
15389 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15390
15391 using namespace ore;
15392
15393 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
15394 cast<StoreInst>(Chain[0]))
15395 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15396 << " and with tree size "
15397 << NV("TreeSize", R.getTreeSize()));
15398
15399 R.vectorizeTree();
15400 return true;
15401 }
15402
15403 return false;
15404}
15405
15406/// Checks if the quadratic mean deviation is less than 90% of the mean size.
15407static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
15408 bool First) {
15409 unsigned Num = 0;
15410 uint64_t Sum = std::accumulate(
15411 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
15412 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15413 unsigned Size = First ? Val.first : Val.second;
15414 if (Size == 1)
15415 return V;
15416 ++Num;
15417 return V + Size;
15418 });
15419 if (Num == 0)
15420 return true;
15421 uint64_t Mean = Sum / Num;
15422 if (Mean == 0)
15423 return true;
15424 uint64_t Dev = std::accumulate(
15425 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
15426 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15427 unsigned P = First ? Val.first : Val.second;
15428 if (P == 1)
15429 return V;
15430 return V + (P - Mean) * (P - Mean);
15431 }) /
15432 Num;
15433 return Dev * 81 / (Mean * Mean) == 0;
15434}
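// Worked example with made-up sizes: for {4, 4, 5, 4} the function computes
// Num = 4, Sum = 17, Mean = 17 / 4 = 4 (integer division), and the summed
// squared deviation is 0 + 0 + 1 + 0 = 1, giving Dev = 1 / 4 = 0. Since
// Dev * 81 / (Mean * Mean) == 0, the check passes and the caller keeps trying
// to vectorize this slice with the current VF.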
15435
15436bool SLPVectorizerPass::vectorizeStores(
15437 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
15438 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
15439 &Visited) {
15440 // We may run into multiple chains that merge into a single chain. We mark the
15441 // stores that we vectorized so that we don't visit the same store twice.
15442 BoUpSLP::ValueSet VectorizedStores;
15443 bool Changed = false;
15444
15445 struct StoreDistCompare {
15446 bool operator()(const std::pair<unsigned, int> &Op1,
15447 const std::pair<unsigned, int> &Op2) const {
15448 return Op1.second < Op2.second;
15449 }
15450 };
15451 // A set of pairs (index of store in Stores array ref, Distance of the store
15452 // address relative to base store address in units).
15453 using StoreIndexToDistSet =
15454 std::set<std::pair<unsigned, int>, StoreDistCompare>;
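// For example, with a base store to %p, a store to %p+3 (distance measured in
// units of the stored type) is recorded as the pair {its index in Stores, 3};
// ordering the set by distance lets TryToVectorize() below read off
// consecutive chains directly.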
15455 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
15456 int PrevDist = -1;
15457 SmallVector<Value *> Operands;
15458 // Collect the chain into a list.
15459 for (auto [Idx, Data] : enumerate(Set)) {
15460 if (Operands.empty() || Data.second - PrevDist == 1) {
15461 Operands.push_back(Stores[Data.first]);
15462 PrevDist = Data.second;
15463 if (Idx != Set.size() - 1)
15464 continue;
15465 }
15466 auto E = make_scope_exit([&, &DataVar = Data]() {
15467 Operands.clear();
15468 Operands.push_back(Stores[DataVar.first]);
15469 PrevDist = DataVar.second;
15470 });
15471
15472 if (Operands.size() <= 1 ||
15473 !Visited
15474 .insert({Operands.front(),
15475 cast<StoreInst>(Operands.front())->getValueOperand(),
15476 Operands.back(),
15477 cast<StoreInst>(Operands.back())->getValueOperand(),
15478 Operands.size()})
15479 .second)
15480 continue;
15481
15482 unsigned MaxVecRegSize = R.getMaxVecRegSize();
15483 unsigned EltSize = R.getVectorElementSize(Operands[0]);
15484 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
15485
15486 unsigned MaxVF =
15487 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
15488 unsigned MaxRegVF = MaxVF;
15489 auto *Store = cast<StoreInst>(Operands[0]);
15490 Type *StoreTy = Store->getValueOperand()->getType();
15491 Type *ValueTy = StoreTy;
15492 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
15493 ValueTy = Trunc->getSrcTy();
15494 if (ValueTy == StoreTy &&
15495 R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
15496 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
15497 unsigned MinVF = std::max<unsigned>(
15498 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
15499 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
15500 ValueTy)));
15501
15502 if (MaxVF < MinVF) {
15503 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
15504 << ") < "
15505 << "MinVF (" << MinVF << ")\n");
15506 continue;
15507 }
15508
15509 unsigned NonPowerOf2VF = 0;
15510 if (VectorizeNonPowerOf2) {
15511 // First try vectorizing with a non-power-of-2 VF. At the moment, only
15512 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
15513 // lanes are used.
15514 unsigned CandVF = Operands.size();
15515 if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
15516 NonPowerOf2VF = CandVF;
15517 }
15518
15519 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
15520 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
15521 unsigned Size = MinVF;
15522 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
15523 VF = Size > MaxVF ? NonPowerOf2VF : Size;
15524 Size *= 2;
15525 });
15526 unsigned End = Operands.size();
15527 unsigned Repeat = 0;
15528 constexpr unsigned MaxAttempts = 4;
15529 SmallVector<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
15530 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
15531 P.first = P.second = 1;
15532 });
15533 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
15534 auto IsNotVectorized = [](bool First,
15535 const std::pair<unsigned, unsigned> &P) {
15536 return First ? P.first > 0 : P.second > 0;
15537 };
15538 auto IsVectorized = [](bool First,
15539 const std::pair<unsigned, unsigned> &P) {
15540 return First ? P.first == 0 : P.second == 0;
15541 };
15542 auto VFIsProfitable = [](bool First, unsigned Size,
15543 const std::pair<unsigned, unsigned> &P) {
15544 return First ? Size >= P.first : Size >= P.second;
15545 };
15546 auto FirstSizeSame = [](unsigned Size,
15547 const std::pair<unsigned, unsigned> &P) {
15548 return Size == P.first;
15549 };
15550 while (true) {
15551 ++Repeat;
15552 bool RepeatChanged = false;
15553 bool AnyProfitableGraph;
15554 for (unsigned Size : CandidateVFs) {
15555 AnyProfitableGraph = false;
15556 unsigned StartIdx = std::distance(
15557 RangeSizes.begin(),
15558 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
15559 std::placeholders::_1)));
15560 while (StartIdx < End) {
15561 unsigned EndIdx =
15562 std::distance(RangeSizes.begin(),
15563 find_if(RangeSizes.drop_front(StartIdx),
15564 std::bind(IsVectorized, Size >= MaxRegVF,
15565 std::placeholders::_1)));
15566 unsigned Sz = EndIdx >= End ? End : EndIdx;
15567 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
15568 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
15569 Size >= MaxRegVF)) {
15570 ++Cnt;
15571 continue;
15572 }
15573 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
15574 assert(all_of(Slice,
15575 [&](Value *V) {
15576 return cast<StoreInst>(V)
15577 ->getValueOperand()
15578 ->getType() ==
15579 cast<StoreInst>(Slice.front())
15580 ->getValueOperand()
15581 ->getType();
15582 }) &&
15583 "Expected all operands of same type.");
15584 if (!NonSchedulable.empty()) {
15585 auto [NonSchedSizeMax, NonSchedSizeMin] =
15586 NonSchedulable.lookup(Slice.front());
15587 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
15588 Cnt += NonSchedSizeMax;
15589 continue;
15590 }
15591 }
15592 unsigned TreeSize;
15593 std::optional<bool> Res =
15594 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
15595 if (!Res) {
15596 NonSchedulable
15597 .try_emplace(Slice.front(), std::make_pair(Size, Size))
15598 .first->getSecond()
15599 .second = Size;
15600 } else if (*Res) {
15601 // Mark the vectorized stores so that we don't vectorize them
15602 // again.
15603 VectorizedStores.insert(Slice.begin(), Slice.end());
15606 AnyProfitableGraph = RepeatChanged = Changed = true;
15607 // If we vectorized initial block, no need to try to vectorize
15608 // it again.
15609 for_each(RangeSizes.slice(Cnt, Size),
15610 [](std::pair<unsigned, unsigned> &P) {
15611 P.first = P.second = 0;
15612 });
15613 if (Cnt < StartIdx + MinVF) {
15614 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
15615 [](std::pair<unsigned, unsigned> &P) {
15616 P.first = P.second = 0;
15617 });
15618 StartIdx = Cnt + Size;
15619 }
15620 if (Cnt > Sz - Size - MinVF) {
15621 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
15622 [](std::pair<unsigned, unsigned> &P) {
15623 P.first = P.second = 0;
15624 });
15625 if (Sz == End)
15626 End = Cnt;
15627 Sz = Cnt;
15628 }
15629 Cnt += Size;
15630 continue;
15631 }
15632 if (Size > 2 && Res &&
15633 !all_of(RangeSizes.slice(Cnt, Size),
15634 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
15635 std::placeholders::_1))) {
15636 Cnt += Size;
15637 continue;
15638 }
15639 // For very big VFs, check that we are not rebuilding the same
15640 // trees, just with a larger number of elements.
15641 if (Size > MaxRegVF && TreeSize > 1 &&
15642 all_of(RangeSizes.slice(Cnt, Size),
15643 std::bind(FirstSizeSame, TreeSize,
15644 std::placeholders::_1))) {
15645 Cnt += Size;
15646 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
15647 ++Cnt;
15648 continue;
15649 }
15650 if (TreeSize > 1)
15651 for_each(RangeSizes.slice(Cnt, Size),
15652 [&](std::pair<unsigned, unsigned> &P) {
15653 if (Size >= MaxRegVF)
15654 P.second = std::max(P.second, TreeSize);
15655 else
15656 P.first = std::max(P.first, TreeSize);
15657 });
15658 ++Cnt;
15659 AnyProfitableGraph = true;
15660 }
15661 if (StartIdx >= End)
15662 break;
15663 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
15664 AnyProfitableGraph = true;
15665 StartIdx = std::distance(
15666 RangeSizes.begin(),
15667 find_if(RangeSizes.drop_front(Sz),
15668 std::bind(IsNotVectorized, Size >= MaxRegVF,
15669 std::placeholders::_1)));
15670 }
15671 if (!AnyProfitableGraph && Size >= MaxRegVF)
15672 break;
15673 }
15674 // All values vectorized - exit.
15675 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
15676 return P.first == 0 && P.second == 0;
15677 }))
15678 break;
15679 // Check if tried all attempts or no need for the last attempts at all.
15680 if (Repeat >= MaxAttempts ||
15681 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
15682 break;
15683 constexpr unsigned StoresLimit = 64;
15684 const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
15685 Operands.size(),
15686 static_cast<unsigned>(
15687 End -
15688 std::distance(
15689 RangeSizes.begin(),
15690 find_if(RangeSizes, std::bind(IsNotVectorized, true,
15691 std::placeholders::_1))) +
15692 1)));
15693 unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
15694 if (VF > MaxTotalNum || VF >= StoresLimit)
15695 break;
15696 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
15697 if (P.first != 0)
15698 P.first = std::max(P.second, P.first);
15699 });
15700 // Last attempt to vectorize max number of elements, if all previous
15701 // attempts were unsuccessful because of the cost issues.
15702 CandidateVFs.clear();
15703 CandidateVFs.push_back(VF);
15704 }
15705 }
15706 };
15707
15708 // Stores pair (first: index of the store into Stores array ref, address of
15709 // which taken as base, second: sorted set of pairs {index, dist}, which are
15710 // indices of stores in the set and their store location distances relative to
15711 // the base address).
15712
15713 // Need to store the index of the very first store separately, since the set
15714 // may be reordered after the insertion and the first store may be moved. This
15715 // container allows us to reduce the number of getPointersDiff() calls.
15716 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
15717 // Inserts the specified store SI with the given index Idx to the set of the
15718 // stores. If a store with the same distance is already present - stop the
15719 // insertion and try to vectorize the stores found so far. If some stores from
15720 // this sequence were not vectorized - try to vectorize them together with the
15721 // new store later. But this logic is applied only to the stores that come
15722 // before the previous store with the same distance.
15723 // Example:
15724 // 1. store x, %p
15725 // 2. store y, %p+1
15726 // 3. store z, %p+2
15727 // 4. store a, %p
15728 // 5. store b, %p+3
15729 // - Scan this from the last to first store. The very first bunch of stores is
15730 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
15731 // vector).
15732 // - The next store in the list - #1 - has the same distance from store #5 as
15733 // the store #4.
15734 // - Try to vectorize sequence of stores 4,2,3,5.
15735 // - If all these stores are vectorized - just drop them.
15736 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
15737 // - Start new stores sequence.
15738 // The new bunch of stores is {1, {1, 0}}.
15739 // - Add the stores from the previous sequence that were not vectorized.
15740 // Here we consider the stores in the reversed order, rather than the order
15741 // they are used in the IR (Stores are reversed already, see vectorizeStoreChains()).
15742 // Store #3 can be added -> comes after store #4 with the same distance as
15743 // store #1.
15744 // Store #5 cannot be added - comes before store #4.
15745 // This logic improves the compile time; we assume that the stores after a
15746 // previous store with the same distance most likely have memory dependencies,
15747 // so there is no need to waste compile time trying to vectorize them.
15748 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
15749 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
15750 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
15751 std::optional<int> Diff = getPointersDiff(
15752 Stores[Set.first]->getValueOperand()->getType(),
15753 Stores[Set.first]->getPointerOperand(),
15754 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
15755 /*StrictCheck=*/true);
15756 if (!Diff)
15757 continue;
15758 auto It = Set.second.find(std::make_pair(Idx, *Diff));
15759 if (It == Set.second.end()) {
15760 Set.second.emplace(Idx, *Diff);
15761 return;
15762 }
15763 // Try to vectorize the first found set to avoid duplicate analysis.
15764 TryToVectorize(Set.second);
15765 StoreIndexToDistSet PrevSet;
15766 PrevSet.swap(Set.second);
15767 Set.first = Idx;
15768 Set.second.emplace(Idx, 0);
15769 // Insert stores that followed previous match to try to vectorize them
15770 // with this store.
15771 unsigned StartIdx = It->first + 1;
15772 SmallBitVector UsedStores(Idx - StartIdx);
15773 // Distances to previously found dup store (or this store, since they
15774 // store to the same addresses).
15775 SmallVector<int> Dists(Idx - StartIdx, 0);
15776 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
15777 // Do not try to vectorize sequences, we already tried.
15778 if (Pair.first <= It->first ||
15779 VectorizedStores.contains(Stores[Pair.first]))
15780 break;
15781 unsigned BI = Pair.first - StartIdx;
15782 UsedStores.set(BI);
15783 Dists[BI] = Pair.second - It->second;
15784 }
15785 for (unsigned I = StartIdx; I < Idx; ++I) {
15786 unsigned BI = I - StartIdx;
15787 if (UsedStores.test(BI))
15788 Set.second.emplace(I, Dists[BI]);
15789 }
15790 return;
15791 }
15792 auto &Res = SortedStores.emplace_back();
15793 Res.first = Idx;
15794 Res.second.emplace(Idx, 0);
15795 };
15796 StoreInst *PrevStore = Stores.front();
15797 for (auto [I, SI] : enumerate(Stores)) {
15798 // Check that we do not try to vectorize stores of different types.
15799 if (PrevStore->getValueOperand()->getType() !=
15800 SI->getValueOperand()->getType()) {
15801 for (auto &Set : SortedStores)
15802 TryToVectorize(Set.second);
15803 SortedStores.clear();
15804 PrevStore = SI;
15805 }
15806 FillStoresSet(I, SI);
15807 }
15808
15809 // Final vectorization attempt.
15810 for (auto &Set : SortedStores)
15811 TryToVectorize(Set.second);
15812
15813 return Changed;
15814}
15815
15816void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
15817 // Initialize the collections. We will make a single pass over the block.
15818 Stores.clear();
15819 GEPs.clear();
15820
15821 // Visit the store and getelementptr instructions in BB and organize them in
15822 // Stores and GEPs according to the underlying objects of their pointer
15823 // operands.
15824 for (Instruction &I : *BB) {
15825 // Ignore store instructions that are volatile or have a pointer operand
15826 // that doesn't point to a scalar type.
15827 if (auto *SI = dyn_cast<StoreInst>(&I)) {
15828 if (!SI->isSimple())
15829 continue;
15830 if (!isValidElementType(SI->getValueOperand()->getType()))
15831 continue;
15832 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
15833 }
15834
15835 // Ignore getelementptr instructions that have more than one index, a
15836 // constant index, or a pointer operand that doesn't point to a scalar
15837 // type.
15838 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
15839 if (GEP->getNumIndices() != 1)
15840 continue;
15841 Value *Idx = GEP->idx_begin()->get();
15842 if (isa<Constant>(Idx))
15843 continue;
15844 if (!isValidElementType(Idx->getType()))
15845 continue;
15846 if (GEP->getType()->isVectorTy())
15847 continue;
15848 GEPs[GEP->getPointerOperand()].push_back(GEP);
15849 }
15850 }
15851}
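// For illustration (hypothetical IR): the two stores
//   store i32 %a, ptr %p
//   store i32 %b, ptr %q     ; %q = getelementptr i32, ptr %p, i64 1
// land in the same Stores bucket because getUnderlyingObject() resolves both
// pointer operands to %p, which is what later lets vectorizeStoreChains() try
// them as one chain.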
15852
15853bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
15854 bool MaxVFOnly) {
15855 if (VL.size() < 2)
15856 return false;
15857
15858 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
15859 << VL.size() << ".\n");
15860
15861 // Check that all of the parts are instructions of the same type,
15862 // we permit an alternate opcode via InstructionsState.
15863 InstructionsState S = getSameOpcode(VL, *TLI);
15864 if (!S.getOpcode())
15865 return false;
15866
15867 Instruction *I0 = cast<Instruction>(S.OpValue);
15868 // Make sure invalid types (including vector type) are rejected before
15869 // determining vectorization factor for scalar instructions.
15870 for (Value *V : VL) {
15871 Type *Ty = V->getType();
15872 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
15873 // NOTE: the following will give user internal llvm type name, which may
15874 // not be useful.
15875 R.getORE()->emit([&]() {
15876 std::string TypeStr;
15877 llvm::raw_string_ostream rso(TypeStr);
15878 Ty->print(rso);
15879 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
15880 << "Cannot SLP vectorize list: type "
15881 << rso.str() + " is unsupported by vectorizer";
15882 });
15883 return false;
15884 }
15885 }
15886
15887 unsigned Sz = R.getVectorElementSize(I0);
15888 unsigned MinVF = R.getMinVF(Sz);
15889 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
15890 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
15891 if (MaxVF < 2) {
15892 R.getORE()->emit([&]() {
15893 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
15894 << "Cannot SLP vectorize list: vectorization factor "
15895 << "less than 2 is not supported";
15896 });
15897 return false;
15898 }
15899
15900 bool Changed = false;
15901 bool CandidateFound = false;
15902 InstructionCost MinCost = SLPCostThreshold.getValue();
15903 Type *ScalarTy = VL[0]->getType();
15904 if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
15905 ScalarTy = IE->getOperand(1)->getType();
15906
15907 unsigned NextInst = 0, MaxInst = VL.size();
15908 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
15909 // No actual vectorization should happen if the number of parts is the same as
15910 // the provided vectorization factor (i.e. the scalar type is used for vector
15911 // code during codegen).
15912 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
15913 if (TTI->getNumberOfParts(VecTy) == VF)
15914 continue;
15915 for (unsigned I = NextInst; I < MaxInst; ++I) {
15916 unsigned ActualVF = std::min(MaxInst - I, VF);
15917
15918 if (!isPowerOf2_32(ActualVF))
15919 continue;
15920
15921 if (MaxVFOnly && ActualVF < MaxVF)
15922 break;
15923 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
15924 break;
15925
15926 ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
15927 // Check that a previous iteration of this loop did not delete the Value.
15928 if (llvm::any_of(Ops, [&R](Value *V) {
15929 auto *I = dyn_cast<Instruction>(V);
15930 return I && R.isDeleted(I);
15931 }))
15932 continue;
15933
15934 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
15935 << "\n");
15936
15937 R.buildTree(Ops);
15938 if (R.isTreeTinyAndNotFullyVectorizable())
15939 continue;
15940 R.reorderTopToBottom();
15941 R.reorderBottomToTop(
15942 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
15943 !R.doesRootHaveInTreeUses());
15944 R.buildExternalUses();
15945
15946 R.computeMinimumValueSizes();
15947 R.transformNodes();
15948 InstructionCost Cost = R.getTreeCost();
15949 CandidateFound = true;
15950 MinCost = std::min(MinCost, Cost);
15951
15952 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
15953 << " for VF=" << ActualVF << "\n");
15954 if (Cost < -SLPCostThreshold) {
15955 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
15956 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
15957 cast<Instruction>(Ops[0]))
15958 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
15959 << " and with tree size "
15960 << ore::NV("TreeSize", R.getTreeSize()));
15961
15962 R.vectorizeTree();
15963 // Move to the next bundle.
15964 I += VF - 1;
15965 NextInst = I + 1;
15966 Changed = true;
15967 }
15968 }
15969 }
15970
15971 if (!Changed && CandidateFound) {
15972 R.getORE()->emit([&]() {
15973 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
15974 << "List vectorization was possible but not beneficial with cost "
15975 << ore::NV("Cost", MinCost) << " >= "
15976 << ore::NV("Treshold", -SLPCostThreshold);
15977 });
15978 } else if (!Changed) {
15979 R.getORE()->emit([&]() {
15980 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
15981 << "Cannot SLP vectorize list: vectorization was impossible"
15982 << " with available vectorization factors";
15983 });
15984 }
15985 return Changed;
15986}
15987
15988bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
15989 if (!I)
15990 return false;
15991
15992 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
15993 return false;
15994
15995 Value *P = I->getParent();
15996
15997 // Vectorize in current basic block only.
15998 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
15999 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
16000 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
16001 return false;
16002
16003 // First collect all possible candidates
16004 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
16005 Candidates.emplace_back(Op0, Op1);
16006
16007 auto *A = dyn_cast<BinaryOperator>(Op0);
16008 auto *B = dyn_cast<BinaryOperator>(Op1);
16009 // Try to skip B.
16010 if (A && B && B->hasOneUse()) {
16011 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
16012 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
16013 if (B0 && B0->getParent() == P)
16014 Candidates.emplace_back(A, B0);
16015 if (B1 && B1->getParent() == P)
16016 Candidates.emplace_back(A, B1);
16017 }
16018 // Try to skip A.
16019 if (B && A && A->hasOneUse()) {
16020 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
16021 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
16022 if (A0 && A0->getParent() == P)
16023 Candidates.emplace_back(A0, B);
16024 if (A1 && A1->getParent() == P)
16025 Candidates.emplace_back(A1, B);
16026 }
16027
16028 if (Candidates.size() == 1)
16029 return tryToVectorizeList({Op0, Op1}, R);
16030
16031 // We have multiple options. Try to pick the single best.
16032 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
16033 if (!BestCandidate)
16034 return false;
16035 return tryToVectorizeList(
16036 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
16037}
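// In other words, for a root like ((a0 + a1) + (b0 + b1)) the candidate pairs
// are the two immediate operands plus pairs formed by skipping one single-use
// binary operator on either side; findBestRootPair() then picks whichever pair
// its operand-scoring heuristic rates best as the starting point of a tree.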
16038
16039namespace {
16040
16041/// Model horizontal reductions.
16042///
16043/// A horizontal reduction is a tree of reduction instructions that has values
16044/// that can be put into a vector as its leaves. For example:
16045///
16046/// mul mul mul mul
16047/// \ / \ /
16048/// + +
16049/// \ /
16050/// +
16051/// This tree has "mul" as its leaf values and "+" as its reduction
16052/// instructions. A reduction can feed into a store or a binary operation
16053/// feeding a phi.
16054/// ...
16055/// \ /
16056/// +
16057/// |
16058/// phi +=
16059///
16060/// Or:
16061/// ...
16062/// \ /
16063/// +
16064/// |
16065/// *p =
16066///
16067class HorizontalReduction {
16068 using ReductionOpsType = SmallVector<Value *, 16>;
16069 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
16070 ReductionOpsListType ReductionOps;
16071 /// List of possibly reduced values.
16072 SmallVector<SmallVector<Value *>> ReducedVals;
16073 /// Maps reduced value to the corresponding reduction operation.
16074 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
16075 // Use map vector to make stable output.
16076 MapVector<Instruction *, Value *> ExtraArgs;
16077 WeakTrackingVH ReductionRoot;
16078 /// The type of reduction operation.
16079 RecurKind RdxKind;
16080 /// Checks if the optimization of original scalar identity operations on
16081 /// matched horizontal reductions is enabled and allowed.
16082 bool IsSupportedHorRdxIdentityOp = false;
16083
16084 static bool isCmpSelMinMax(Instruction *I) {
16085 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
16086 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
16087 }
16088
16089 // And/or are potentially poison-safe logical patterns like:
16090 // select x, y, false
16091 // select x, true, y
16092 static bool isBoolLogicOp(Instruction *I) {
16093 return isa<SelectInst>(I) &&
16094 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
16095 }
16096
16097 /// Checks if instruction is associative and can be vectorized.
16098 static bool isVectorizable(RecurKind Kind, Instruction *I) {
16099 if (Kind == RecurKind::None)
16100 return false;
16101
16102 // Integer ops that map to select instructions or intrinsics are fine.
16103 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
16104 isBoolLogicOp(I))
16105 return true;
16106
16107 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16108 // FP min/max are associative except for NaN and -0.0. We do not
16109 // have to rule out -0.0 here because the intrinsic semantics do not
16110 // specify a fixed result for it.
16111 return I->getFastMathFlags().noNaNs();
16112 }
16113
16114 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16115 return true;
16116
16117 return I->isAssociative();
16118 }
16119
16120 static Value *getRdxOperand(Instruction *I, unsigned Index) {
16121 // Poison-safe 'or' takes the form: select X, true, Y
16122 // To make that work with the normal operand processing, we skip the
16123 // true value operand.
16124 // TODO: Change the code and data structures to handle this without a hack.
16125 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
16126 return I->getOperand(2);
16127 return I->getOperand(Index);
16128 }
16129
16130 /// Creates reduction operation with the current opcode.
16131 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
16132 Value *RHS, const Twine &Name, bool UseSelect) {
16133 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
16134 switch (Kind) {
16135 case RecurKind::Or:
16136 if (UseSelect &&
16137 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16138 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
16139 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16140 Name);
16141 case RecurKind::And:
16142 if (UseSelect &&
16143 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16144 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
16145 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16146 Name);
16147 case RecurKind::Add:
16148 case RecurKind::Mul:
16149 case RecurKind::Xor:
16150 case RecurKind::FAdd:
16151 case RecurKind::FMul:
16152 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16153 Name);
16154 case RecurKind::FMax:
16155 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
16156 case RecurKind::FMin:
16157 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
16158 case RecurKind::FMaximum:
16159 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
16160 case RecurKind::FMinimum:
16161 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
16162 case RecurKind::SMax:
16163 if (UseSelect) {
16164 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
16165 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16166 }
16167 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
16168 case RecurKind::SMin:
16169 if (UseSelect) {
16170 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
16171 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16172 }
16173 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
16174 case RecurKind::UMax:
16175 if (UseSelect) {
16176 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
16177 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16178 }
16179 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
16180 case RecurKind::UMin:
16181 if (UseSelect) {
16182 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
16183 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16184 }
16185 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
16186 default:
16187 llvm_unreachable("Unknown reduction operation.");
16188 }
16189 }
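// For example, an SMax reduction step with UseSelect == true is emitted as
//   %cmp = icmp sgt <ty> %lhs, %rhs
//   %red = select i1 %cmp, <ty> %lhs, <ty> %rhs
// whereas with UseSelect == false a single llvm.smax call is created instead.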
16190
16191 /// Creates reduction operation with the current opcode with the IR flags
16192 /// from \p ReductionOps, dropping nuw/nsw flags.
16193 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
16194 Value *RHS, const Twine &Name,
16195 const ReductionOpsListType &ReductionOps) {
16196 bool UseSelect = ReductionOps.size() == 2 ||
16197 // Logical or/and.
16198 (ReductionOps.size() == 1 &&
16199 any_of(ReductionOps.front(), IsaPred<SelectInst>));
16200 assert((!UseSelect || ReductionOps.size() != 2 ||
16201 isa<SelectInst>(ReductionOps[1][0])) &&
16202 "Expected cmp + select pairs for reduction");
16203 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
16204 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
16205 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
16206 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
16207 /*IncludeWrapFlags=*/false);
16208 propagateIRFlags(Op, ReductionOps[1], nullptr,
16209 /*IncludeWrapFlags=*/false);
16210 return Op;
16211 }
16212 }
16213 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
16214 return Op;
16215 }
16216
16217public:
16218 static RecurKind getRdxKind(Value *V) {
16219 auto *I = dyn_cast<Instruction>(V);
16220 if (!I)
16221 return RecurKind::None;
16222 if (match(I, m_Add(m_Value(), m_Value())))
16223 return RecurKind::Add;
16224 if (match(I, m_Mul(m_Value(), m_Value())))
16225 return RecurKind::Mul;
16226 if (match(I, m_And(m_Value(), m_Value())) ||
16227 match(I, m_LogicalAnd(m_Value(), m_Value())))
16228 return RecurKind::And;
16229 if (match(I, m_Or(m_Value(), m_Value())) ||
16230 match(I, m_LogicalOr(m_Value(), m_Value())))
16231 return RecurKind::Or;
16232 if (match(I, m_Xor(m_Value(), m_Value())))
16233 return RecurKind::Xor;
16234 if (match(I, m_FAdd(m_Value(), m_Value())))
16235 return RecurKind::FAdd;
16236 if (match(I, m_FMul(m_Value(), m_Value())))
16237 return RecurKind::FMul;
16238
16239 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
16240 return RecurKind::FMax;
16241 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
16242 return RecurKind::FMin;
16243
16244 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
16245 return RecurKind::FMaximum;
16246 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
16247 return RecurKind::FMinimum;
16248 // This matches either cmp+select or intrinsics. SLP is expected to handle
16249 // either form.
16250 // TODO: If we are canonicalizing to intrinsics, we can remove several
16251 // special-case paths that deal with selects.
16252 if (match(I, m_SMax(m_Value(), m_Value())))
16253 return RecurKind::SMax;
16254 if (match(I, m_SMin(m_Value(), m_Value())))
16255 return RecurKind::SMin;
16256 if (match(I, m_UMax(m_Value(), m_Value())))
16257 return RecurKind::UMax;
16258 if (match(I, m_UMin(m_Value(), m_Value())))
16259 return RecurKind::UMin;
16260
16261 if (auto *Select = dyn_cast<SelectInst>(I)) {
16262 // Try harder: look for min/max pattern based on instructions producing
16263 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
16264 // During the intermediate stages of SLP, it's very common to have
16265 // pattern like this (since optimizeGatherSequence is run only once
16266 // at the end):
16267 // %1 = extractelement <2 x i32> %a, i32 0
16268 // %2 = extractelement <2 x i32> %a, i32 1
16269 // %cond = icmp sgt i32 %1, %2
16270 // %3 = extractelement <2 x i32> %a, i32 0
16271 // %4 = extractelement <2 x i32> %a, i32 1
16272 // %select = select i1 %cond, i32 %3, i32 %4
16273 CmpInst::Predicate Pred;
16274 Instruction *L1;
16275 Instruction *L2;
16276
16277 Value *LHS = Select->getTrueValue();
16278 Value *RHS = Select->getFalseValue();
16279 Value *Cond = Select->getCondition();
16280
16281 // TODO: Support inverse predicates.
16282 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
16283 if (!isa<ExtractElementInst>(RHS) ||
16284 !L2->isIdenticalTo(cast<Instruction>(RHS)))
16285 return RecurKind::None;
16286 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
16287 if (!isa<ExtractElementInst>(LHS) ||
16288 !L1->isIdenticalTo(cast<Instruction>(LHS)))
16289 return RecurKind::None;
16290 } else {
16291 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
16292 return RecurKind::None;
16293 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
16294 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
16295 !L2->isIdenticalTo(cast<Instruction>(RHS)))
16296 return RecurKind::None;
16297 }
16298
16299 switch (Pred) {
16300 default:
16301 return RecurKind::None;
16302 case CmpInst::ICMP_SGT:
16303 case CmpInst::ICMP_SGE:
16304 return RecurKind::SMax;
16305 case CmpInst::ICMP_SLT:
16306 case CmpInst::ICMP_SLE:
16307 return RecurKind::SMin;
16308 case CmpInst::ICMP_UGT:
16309 case CmpInst::ICMP_UGE:
16310 return RecurKind::UMax;
16311 case CmpInst::ICMP_ULT:
16312 case CmpInst::ICMP_ULE:
16313 return RecurKind::UMin;
16314 }
16315 }
16316 return RecurKind::None;
16317 }
16318
16319 /// Get the index of the first operand.
16320 static unsigned getFirstOperandIndex(Instruction *I) {
16321 return isCmpSelMinMax(I) ? 1 : 0;
16322 }
16323
16324private:
16325 /// Total number of operands in the reduction operation.
16326 static unsigned getNumberOfOperands(Instruction *I) {
16327 return isCmpSelMinMax(I) ? 3 : 2;
16328 }
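// Together with getFirstOperandIndex() this means a cmp+select min/max is
// walked over operands 1 and 2 (the select's true/false values), skipping the
// compare condition at operand 0, while a plain binary reduction op is walked
// over operands 0 and 1.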
16329
16330 /// Checks if the instruction is in basic block \p BB.
16331 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16332 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16333 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16334 auto *Sel = cast<SelectInst>(I);
16335 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
16336 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16337 }
16338 return I->getParent() == BB;
16339 }
16340
16341 /// Expected number of uses for reduction operations/reduced values.
16342 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16343 if (IsCmpSelMinMax) {
16344 // SelectInst must be used twice while the condition op must have single
16345 // use only.
16346 if (auto *Sel = dyn_cast<SelectInst>(I))
16347 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
16348 return I->hasNUses(2);
16349 }
16350
16351 // Arithmetic reduction operation must be used once only.
16352 return I->hasOneUse();
16353 }
16354
16355 /// Initializes the list of reduction operations.
16356 void initReductionOps(Instruction *I) {
16357 if (isCmpSelMinMax(I))
16358 ReductionOps.assign(2, ReductionOpsType());
16359 else
16360 ReductionOps.assign(1, ReductionOpsType());
16361 }
16362
16363 /// Add all reduction operations for the reduction instruction \p I.
16364 void addReductionOps(Instruction *I) {
16365 if (isCmpSelMinMax(I)) {
16366 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
16367 ReductionOps[1].emplace_back(I);
16368 } else {
16369 ReductionOps[0].emplace_back(I);
16370 }
16371 }
16372
16373 static bool isGoodForReduction(ArrayRef<Value *> Data) {
16374 int Sz = Data.size();
16375 auto *I = dyn_cast<Instruction>(Data.front());
16376 return Sz > 1 || isConstant(Data.front()) ||
16377 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
16378 }
16379
16380public:
16381 HorizontalReduction() = default;
16382
16383 /// Try to find a reduction tree.
16384 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16385 ScalarEvolution &SE, const DataLayout &DL,
16386 const TargetLibraryInfo &TLI) {
16387 RdxKind = HorizontalReduction::getRdxKind(Root);
16388 if (!isVectorizable(RdxKind, Root))
16389 return false;
16390
16391 // Analyze "regular" integer/FP types for reductions - no target-specific
16392 // types or pointers.
16393 Type *Ty = Root->getType();
16394 if (!isValidElementType(Ty) || Ty->isPointerTy())
16395 return false;
16396
16397 // Though the ultimate reduction may have multiple uses, its condition must
16398 // have only single use.
16399 if (auto *Sel = dyn_cast<SelectInst>(Root))
16400 if (!Sel->getCondition()->hasOneUse())
16401 return false;
16402
16403 ReductionRoot = Root;
16404
16405 // Iterate through all the operands of the possible reduction tree and
16406 // gather all the reduced values, sorting them by their value id.
16407 BasicBlock *BB = Root->getParent();
16408 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
16409 SmallVector<Instruction *> Worklist(1, Root);
16410 // Checks if the operands of the \p TreeN instruction are also reduction
16411 // operations or should be treated as reduced values or an extra argument,
16412 // which is not part of the reduction.
16413 auto CheckOperands = [&](Instruction *TreeN,
16414 SmallVectorImpl<Value *> &ExtraArgs,
16415 SmallVectorImpl<Value *> &PossibleReducedVals,
16416 SmallVectorImpl<Instruction *> &ReductionOps) {
16417 for (int I = getFirstOperandIndex(TreeN),
16418 End = getNumberOfOperands(TreeN);
16419 I < End; ++I) {
16420 Value *EdgeVal = getRdxOperand(TreeN, I);
16421 ReducedValsToOps[EdgeVal].push_back(TreeN);
16422 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
16423 // Edge has wrong parent - mark as an extra argument.
16424 if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
16425 !hasSameParent(EdgeInst, BB)) {
16426 ExtraArgs.push_back(EdgeVal);
16427 continue;
16428 }
16429 // If the edge is not an instruction, or it is different from the main
16430 // reduction opcode or has too many uses - possible reduced value.
16431 // Also, do not try to reduce const values, if the operation is not
16432 // foldable.
16433 if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
16434 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
16435 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
16436 !isVectorizable(RdxKind, EdgeInst) ||
16437 (R.isAnalyzedReductionRoot(EdgeInst) &&
16438 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
16439 PossibleReducedVals.push_back(EdgeVal);
16440 continue;
16441 }
16442 ReductionOps.push_back(EdgeInst);
16443 }
16444 };
16445 // Try to regroup reduced values so that it gets more profitable to try to
16446 // reduce them. Values are grouped by their value ids, instructions - by
16447 // instruction op id and/or alternate op id, plus do extra analysis for
16448 // loads (grouping them by the distance between pointers) and cmp
16449 // instructions (grouping them by the predicate).
16450 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
16451 PossibleReducedVals;
16452 initReductionOps(Root);
16453 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
16454 SmallSet<size_t, 2> LoadKeyUsed;
16455 SmallPtrSet<Value *, 4> DoNotReverseVals;
16456
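// Sub-key generator for loads: a load whose pointer has a computable constant
// distance to (or is otherwise compatible with) an already-seen load reuses
// that load's pointer hash as its sub-key, so possibly-consecutive loads end
// up grouped into the same reduced-value bucket.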
16457 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
16458 Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
16459 if (LoadKeyUsed.contains(Key)) {
16460 auto LIt = LoadsMap.find(Ptr);
16461 if (LIt != LoadsMap.end()) {
16462 for (LoadInst *RLI : LIt->second) {
16463 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
16464 LI->getType(), LI->getPointerOperand(), DL, SE,
16465 /*StrictCheck=*/true))
16466 return hash_value(RLI->getPointerOperand());
16467 }
16468 for (LoadInst *RLI : LIt->second) {
16469 if (arePointersCompatible(RLI->getPointerOperand(),
16470 LI->getPointerOperand(), TLI)) {
16471 hash_code SubKey = hash_value(RLI->getPointerOperand());
16472 DoNotReverseVals.insert(RLI);
16473 return SubKey;
16474 }
16475 }
16476 if (LIt->second.size() > 2) {
16477 hash_code SubKey =
16478 hash_value(LIt->second.back()->getPointerOperand());
16479 DoNotReverseVals.insert(LIt->second.back());
16480 return SubKey;
16481 }
16482 }
16483 }
16484 LoadKeyUsed.insert(Key);
16485 LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
16486 return hash_value(LI->getPointerOperand());
16487 };
16488
16489 while (!Worklist.empty()) {
16490 Instruction *TreeN = Worklist.pop_back_val();
16491 SmallVector<Value *> Args;
16492 SmallVector<Value *> PossibleRedVals;
16493 SmallVector<Instruction *> PossibleReductionOps;
16494 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16495 // If too many extra args - mark the instruction itself as a reduction
16496 // value, not a reduction operation.
16497 if (Args.size() < 2) {
16498 addReductionOps(TreeN);
16499 // Add extra args.
16500 if (!Args.empty()) {
16501 assert(Args.size() == 1 && "Expected only single argument.");
16502 ExtraArgs[TreeN] = Args.front();
16503 }
16504 // Add reduction values. The values are sorted for better vectorization
16505 // results.
16506 for (Value *V : PossibleRedVals) {
16507 size_t Key, Idx;
16508 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
16509 /*AllowAlternate=*/false);
16510 ++PossibleReducedVals[Key][Idx]
16511 .insert(std::make_pair(V, 0))
16512 .first->second;
16513 }
16514 Worklist.append(PossibleReductionOps.rbegin(),
16515 PossibleReductionOps.rend());
16516 } else {
16517 size_t Key, Idx;
16518 std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
16519 /*AllowAlternate=*/false);
16520 ++PossibleReducedVals[Key][Idx]
16521 .insert(std::make_pair(TreeN, 0))
16522 .first->second;
16523 }
16524 }
16525 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
16526 // Sort values by the total number of value kinds to start the reduction
16527 // from the longest possible reduced value sequences.
16528 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
16529 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
16530 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
16531 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
16532 It != E; ++It) {
16533 PossibleRedValsVect.emplace_back();
16534 auto RedValsVect = It->second.takeVector();
16535 stable_sort(RedValsVect, llvm::less_second());
16536 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
16537 PossibleRedValsVect.back().append(Data.second, Data.first);
16538 }
16539 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
16540 return P1.size() > P2.size();
16541 });
16542 int NewIdx = -1;
16543 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
16544 if (isGoodForReduction(Data) ||
16545 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
16546 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
16547 getUnderlyingObject(
16548 cast<LoadInst>(Data.front())->getPointerOperand()) ==
16549 getUnderlyingObject(cast<LoadInst>(ReducedVals[NewIdx].front())
16550 ->getPointerOperand()))) {
16551 if (NewIdx < 0) {
16552 NewIdx = ReducedVals.size();
16553 ReducedVals.emplace_back();
16554 }
16555 if (DoNotReverseVals.contains(Data.front()))
16556 ReducedVals[NewIdx].append(Data.begin(), Data.end());
16557 else
16558 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
16559 } else {
16560 ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
16561 }
16562 }
16563 }
16564 // Sort the reduced values by number of same/alternate opcode and/or pointer
16565 // operand.
16566 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
16567 return P1.size() > P2.size();
16568 });
16569 return true;
16570 }
16571
16572 /// Attempt to vectorize the tree found by matchAssociativeReduction.
16573 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
16574 const TargetLibraryInfo &TLI) {
16575 constexpr int ReductionLimit = 4;
16576 constexpr unsigned RegMaxNumber = 4;
16577 constexpr unsigned RedValsMaxNumber = 128;
16578 // If there are a sufficient number of reduction values, reduce
16579 // to a nearby power-of-2. We can safely generate oversized
16580 // vectors and rely on the backend to split them to legal sizes.
16581 unsigned NumReducedVals =
16582 std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
16583 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
16584 if (!isGoodForReduction(Vals))
16585 return Num;
16586 return Num + Vals.size();
16587 });
16588 if (NumReducedVals < ReductionLimit &&
16589 (!AllowHorRdxIdenityOptimization ||
16590 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
16591 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
16592 }))) {
16593 for (ReductionOpsType &RdxOps : ReductionOps)
16594 for (Value *RdxOp : RdxOps)
16595 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16596 return nullptr;
16597 }
16598
16599 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
16600 TargetFolder(DL));
16601 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
16602
16603 // Track the reduced values in case they are replaced by extractelement
16604 // instructions during vectorization.
16605 DenseMap<Value *, WeakTrackingVH> TrackedVals(
16606 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
16607 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
16608 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
16609 ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
16610 // The same extra argument may be used several times, so log each attempt
16611 // to use it.
16612 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16613 assert(Pair.first && "DebugLoc must be set.");
16614 ExternallyUsedValues[Pair.second].push_back(Pair.first);
16615 TrackedVals.try_emplace(Pair.second, Pair.second);
16616 }
16617
16618 // The compare instruction of a min/max is the insertion point for new
16619 // instructions and may be replaced with a new compare instruction.
16620 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
16621 assert(isa<SelectInst>(RdxRootInst) &&
16622 "Expected min/max reduction to have select root instruction");
16623 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
16624 assert(isa<Instruction>(ScalarCond) &&
16625 "Expected min/max reduction to have compare condition");
16626 return cast<Instruction>(ScalarCond);
16627 };
16628
16629 // Return new VectorizedTree, based on previous value.
16630 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
16631 if (VectorizedTree) {
16632 // Update the final value in the reduction.
16633 Builder.SetCurrentDebugLocation(
16634 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
16635 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
16636 (isGuaranteedNotToBePoison(Res) &&
16637 !isGuaranteedNotToBePoison(VectorizedTree))) {
16638 auto It = ReducedValsToOps.find(Res);
16639 if (It != ReducedValsToOps.end() &&
16640 any_of(It->getSecond(),
16641 [](Instruction *I) { return isBoolLogicOp(I); }))
16642 std::swap(VectorizedTree, Res);
16643 }
16644
16645 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
16646 ReductionOps);
16647 }
16648 // Initialize the final value in the reduction.
16649 return Res;
16650 };
16651 bool AnyBoolLogicOp =
16652 any_of(ReductionOps.back(), [](Value *V) {
16653 return isBoolLogicOp(cast<Instruction>(V));
16654 });
16655 // The reduction root is used as the insertion point for new instructions,
16656 // so set it as externally used to prevent it from being deleted.
16657 ExternallyUsedValues[ReductionRoot];
16658 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
16659 ReductionOps.front().size());
16660 for (ReductionOpsType &RdxOps : ReductionOps)
16661 for (Value *RdxOp : RdxOps) {
16662 if (!RdxOp)
16663 continue;
16664 IgnoreList.insert(RdxOp);
16665 }
16666 // Intersect the fast-math-flags from all reduction operations.
16667 FastMathFlags RdxFMF;
16668 RdxFMF.set();
16669 for (Value *U : IgnoreList)
16670 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
16671 RdxFMF &= FPMO->getFastMathFlags();
16672 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
16673
16674 // Need to track reduced vals, they may be changed during vectorization of
16675 // subvectors.
16676 for (ArrayRef<Value *> Candidates : ReducedVals)
16677 for (Value *V : Candidates)
16678 TrackedVals.try_emplace(V, V);
16679
16680 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
16681 // List of the values that were reduced in other trees as part of gather
16682 // nodes and thus requiring extract if fully vectorized in other trees.
16683 SmallPtrSet<Value *, 4> RequiredExtract;
16684 Value *VectorizedTree = nullptr;
16685 bool CheckForReusedReductionOps = false;
16686 // Try to vectorize elements based on their type.
16687 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
16688 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
16689 InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
16690 SmallVector<Value *> Candidates;
16691 Candidates.reserve(2 * OrigReducedVals.size());
16692 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
16693 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
16694 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
16695 // Check if the reduction value was not overridden by the extractelement
16696 // instruction because of the vectorization and exclude it, if it is not
16697 // compatible with other values.
16698 // Also check if the instruction was folded to constant/other value.
16699 auto *Inst = dyn_cast<Instruction>(RdxVal);
16700 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
16701 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
16702 (S.getOpcode() && !Inst))
16703 continue;
16704 Candidates.push_back(RdxVal);
16705 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
16706 }
16707 bool ShuffledExtracts = false;
16708 // Try to handle shuffled extractelements.
16709 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
16710 I + 1 < E) {
16711 InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
16712 if (NextS.getOpcode() == Instruction::ExtractElement &&
16713 !NextS.isAltShuffle()) {
16714 SmallVector<Value *> CommonCandidates(Candidates);
16715 for (Value *RV : ReducedVals[I + 1]) {
16716 Value *RdxVal = TrackedVals.find(RV)->second;
16717 // Check if the reduction value was not overridden by the
16718 // extractelement instruction because of the vectorization and
16719 // exclude it, if it is not compatible with other values.
16720 if (auto *Inst = dyn_cast<Instruction>(RdxVal))
16721 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
16722 continue;
16723 CommonCandidates.push_back(RdxVal);
16724 TrackedToOrig.try_emplace(RdxVal, RV);
16725 }
16726 SmallVector<int> Mask;
16727 if (isFixedVectorShuffle(CommonCandidates, Mask)) {
16728 ++I;
16729 Candidates.swap(CommonCandidates);
16730 ShuffledExtracts = true;
16731 }
16732 }
16733 }
16734
16735 // Emit code for constant values.
16736 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
16737 allConstant(Candidates)) {
16738 Value *Res = Candidates.front();
16739 ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
16740 for (Value *VC : ArrayRef(Candidates).drop_front()) {
16741 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
16742 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
16743 if (auto *ResI = dyn_cast<Instruction>(Res))
16744 V.analyzedReductionRoot(ResI);
16745 }
16746 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
16747 continue;
16748 }
16749
16750 unsigned NumReducedVals = Candidates.size();
16751 if (NumReducedVals < ReductionLimit &&
16752 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
16753 !isSplat(Candidates)))
16754 continue;
16755
16756 // Check if we support repeated scalar values processing (optimization of
16757 // original scalar identity operations on matched horizontal reductions).
16758 IsSupportedHorRdxIdentityOp =
16759 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
16760 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
16761 // Gather same values.
16762 MapVector<Value *, unsigned> SameValuesCounter;
16763 if (IsSupportedHorRdxIdentityOp)
16764 for (Value *V : Candidates)
16765 ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
16766 // Used to check if the reduced values are used the same number of times. In this
16767 // case the compiler may produce better code. E.g. if reduced values are
16768 // aabbccdd (8 x values), then the first node of the tree will have a node
16769 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
16770 // Plus, the final reduction will be performed on <8 x aabbccdd>.
16771 // Instead, the compiler may build a <4 x abcd> tree immediately, + reduction (4
16772 // x abcd) * 2.
16773 // Currently it only handles add/fadd/xor. and/or/min/max do not require
16774 // this analysis, other operations may require an extra estimation of
16775 // the profitability.
16776 bool SameScaleFactor = false;
16777 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
16778 SameValuesCounter.size() != Candidates.size();
16779 if (OptReusedScalars) {
16780 SameScaleFactor =
16781 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
16782 RdxKind == RecurKind::Xor) &&
16783 all_of(drop_begin(SameValuesCounter),
16784 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
16785 return P.second == SameValuesCounter.front().second;
16786 });
16787 Candidates.resize(SameValuesCounter.size());
16788 transform(SameValuesCounter, Candidates.begin(),
16789 [](const auto &P) { return P.first; });
16790 NumReducedVals = Candidates.size();
16791 // Have a reduction of the same element.
16792 if (NumReducedVals == 1) {
16793 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
16794 unsigned Cnt = SameValuesCounter.lookup(OrigV);
16795 Value *RedVal =
16796 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
16797 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16798 VectorizedVals.try_emplace(OrigV, Cnt);
16799 continue;
16800 }
16801 }
16802
16803 unsigned MaxVecRegSize = V.getMaxVecRegSize();
16804 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
16805 unsigned MaxElts =
16806 RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
16807
16808 unsigned ReduxWidth = std::min<unsigned>(
16809 llvm::bit_floor(NumReducedVals),
16810 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
16811 RegMaxNumber * RedValsMaxNumber));
16812 unsigned Start = 0;
16813 unsigned Pos = Start;
16814 // Restarts vectorization attempt with lower vector factor.
16815 unsigned PrevReduxWidth = ReduxWidth;
16816 bool CheckForReusedReductionOpsLocal = false;
16817 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
16818 &CheckForReusedReductionOpsLocal,
16819 &PrevReduxWidth, &V,
16820 &IgnoreList](bool IgnoreVL = false) {
16821 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
16822 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
16823 // Check if any of the reduction ops are gathered. If so, it is worth
16824 // trying again with a smaller number of reduction ops.
16825 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
16826 }
16827 ++Pos;
16828 if (Pos < NumReducedVals - ReduxWidth + 1)
16829 return IsAnyRedOpGathered;
16830 Pos = Start;
16831 ReduxWidth /= 2;
16832 return IsAnyRedOpGathered;
16833 };
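// Slide a window of ReduxWidth candidates over the reduced values. Each failed
// position advances the window via AdjustReducedVals; once every position for
// the current width has been tried, the width is halved and the scan restarts,
// until the width drops below ReductionLimit.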
16834 bool AnyVectorized = false;
16835 while (Pos < NumReducedVals - ReduxWidth + 1 &&
16836 ReduxWidth >= ReductionLimit) {
16837 // Dependency in tree of the reduction ops - drop this attempt, try
16838 // later.
16839 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
16840 Start == 0) {
16841 CheckForReusedReductionOps = true;
16842 break;
16843 }
16844 PrevReduxWidth = ReduxWidth;
16845 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
16846 // Already analyzed - skip.
16847 if (V.areAnalyzedReductionVals(VL)) {
16848 (void)AdjustReducedVals(/*IgnoreVL=*/true);
16849 continue;
16850 }
16851 // Early exit if any of the reduction values were deleted during
16852 // previous vectorization attempts.
16853 if (any_of(VL, [&V](Value *RedVal) {
16854 auto *RedValI = dyn_cast<Instruction>(RedVal);
16855 if (!RedValI)
16856 return false;
16857 return V.isDeleted(RedValI);
16858 }))
16859 break;
16860 V.buildTree(VL, IgnoreList);
16861 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
16862 if (!AdjustReducedVals())
16863 V.analyzedReductionVals(VL);
16864 continue;
16865 }
16866 if (V.isLoadCombineReductionCandidate(RdxKind)) {
16867 if (!AdjustReducedVals())
16868 V.analyzedReductionVals(VL);
16869 continue;
16870 }
16871 V.reorderTopToBottom();
16872 // No need to reorder the root node at all.
16873 V.reorderBottomToTop(/*IgnoreReorder=*/true);
16874 // Keep extracted other reduction values, if they are used in the
16875 // vectorization trees.
16876 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
16877 ExternallyUsedValues);
16878 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
16879 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
16880 continue;
16881 for (Value *V : ReducedVals[Cnt])
16882 if (isa<Instruction>(V))
16883 LocalExternallyUsedValues[TrackedVals[V]];
16884 }
16885 if (!IsSupportedHorRdxIdentityOp) {
16886 // Number of uses of the candidates in the vector of values.
16887 assert(SameValuesCounter.empty() &&
16888 "Reused values counter map is not empty");
16889 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16890 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16891 continue;
16892 Value *V = Candidates[Cnt];
16893 Value *OrigV = TrackedToOrig.find(V)->second;
16894 ++SameValuesCounter[OrigV];
16895 }
16896 }
16897 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
16898 // Gather externally used values.
16899 SmallPtrSet<Value *, 4> Visited;
16900 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16901 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16902 continue;
16903 Value *RdxVal = Candidates[Cnt];
16904 if (!Visited.insert(RdxVal).second)
16905 continue;
16906 // Check if the scalar was vectorized as part of the vectorization
16907 // tree but not the top node.
16908 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
16909 LocalExternallyUsedValues[RdxVal];
16910 continue;
16911 }
16912 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
16913 unsigned NumOps =
16914 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
16915 if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
16916 LocalExternallyUsedValues[RdxVal];
16917 }
16918 // Do not need the list of reused scalars in regular mode anymore.
16919 if (!IsSupportedHorRdxIdentityOp)
16920 SameValuesCounter.clear();
16921 for (Value *RdxVal : VL)
16922 if (RequiredExtract.contains(RdxVal))
16923 LocalExternallyUsedValues[RdxVal];
16924 // Update LocalExternallyUsedValues for the scalar, replaced by
16925 // extractelement instructions.
16926 DenseMap<Value *, Value *> ReplacementToExternal;
16927 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
16928 ReplacementToExternal.try_emplace(Pair.second, Pair.first);
16929 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
16930 Value *Ext = Pair.first;
16931 auto RIt = ReplacementToExternal.find(Ext);
16932 while (RIt != ReplacementToExternal.end()) {
16933 Ext = RIt->second;
16934 RIt = ReplacementToExternal.find(Ext);
16935 }
16936 auto *It = ExternallyUsedValues.find(Ext);
16937 if (It == ExternallyUsedValues.end())
16938 continue;
16939 LocalExternallyUsedValues[Pair.second].append(It->second);
16940 }
16941 V.buildExternalUses(LocalExternallyUsedValues);
16942
16943 V.computeMinimumValueSizes();
16944 V.transformNodes();
16945
16946 // Estimate cost.
16947 InstructionCost TreeCost = V.getTreeCost(VL);
16948 InstructionCost ReductionCost =
16949 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
16950 InstructionCost Cost = TreeCost + ReductionCost;
16951 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16952 << " for reduction\n");
16953 if (!Cost.isValid())
16954 break;
16955 if (Cost >= -SLPCostThreshold) {
16956 V.getORE()->emit([&]() {
16957 return OptimizationRemarkMissed(
16958 SV_NAME, "HorSLPNotBeneficial",
16959 ReducedValsToOps.find(VL[0])->second.front())
16960 << "Vectorizing horizontal reduction is possible "
16961 << "but not beneficial with cost " << ore::NV("Cost", Cost)
16962 << " and threshold "
16963 << ore::NV("Threshold", -SLPCostThreshold);
16964 });
16965 if (!AdjustReducedVals())
16966 V.analyzedReductionVals(VL);
16967 continue;
16968 }
16969
16970 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
16971 << Cost << ". (HorRdx)\n");
16972 V.getORE()->emit([&]() {
16973 return OptimizationRemark(
16974 SV_NAME, "VectorizedHorizontalReduction",
16975 ReducedValsToOps.find(VL[0])->second.front())
16976 << "Vectorized horizontal reduction with cost "
16977 << ore::NV("Cost", Cost) << " and with tree size "
16978 << ore::NV("TreeSize", V.getTreeSize());
16979 });
16980
16981 Builder.setFastMathFlags(RdxFMF);
16982
16983 // Emit a reduction. If the root is a select (min/max idiom), the insert
16984 // point is the compare condition of that select.
16985 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
16986 Instruction *InsertPt = RdxRootInst;
16987 if (IsCmpSelMinMax)
16988 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
16989
16990 // Vectorize a tree.
16991 Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
16992 ReplacedExternals, InsertPt);
16993
16994 Builder.SetInsertPoint(InsertPt);
16995
16996 // To prevent poison from leaking across what used to be sequential,
16997 // safe, scalar boolean logic operations, the reduction operand must be
16998 // frozen.
16999 if ((isBoolLogicOp(RdxRootInst) ||
17000 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17001 !isGuaranteedNotToBePoison(VectorizedRoot))
17002 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
17003
17004 // Emit code to correctly handle reused reduced values, if required.
17005 if (OptReusedScalars && !SameScaleFactor) {
17006 VectorizedRoot =
17007 emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
17008 SameValuesCounter, TrackedToOrig);
17009 }
17010
17011 Value *ReducedSubTree =
17012 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
17013 if (ReducedSubTree->getType() != VL.front()->getType()) {
17014 ReducedSubTree = Builder.CreateIntCast(
17015 ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
17016 KnownBits Known = computeKnownBits(
17017 R, cast<Instruction>(ReductionOps.front().front())
17018 ->getModule()
17019 ->getDataLayout());
17020 return !Known.isNonNegative();
17021 }));
17022 }
17023
17024 // Improved analysis for add/fadd/xor reductions with same scale factor
17025 // for all operands of reductions. We can emit scalar ops for them
17026 // instead.
17027 if (OptReusedScalars && SameScaleFactor)
17028 ReducedSubTree = emitScaleForReusedOps(
17029 ReducedSubTree, Builder, SameValuesCounter.front().second);
17030
17031 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17032 // Count vectorized reduced values to exclude them from final reduction.
17033 for (Value *RdxVal : VL) {
17034 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17035 if (IsSupportedHorRdxIdentityOp) {
17036 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17037 continue;
17038 }
17039 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17040 if (!V.isVectorized(RdxVal))
17041 RequiredExtract.insert(RdxVal);
17042 }
17043 Pos += ReduxWidth;
17044 Start = Pos;
17045 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
17046 AnyVectorized = true;
17047 }
17048 if (OptReusedScalars && !AnyVectorized) {
17049 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
17050 Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
17051 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17052 Value *OrigV = TrackedToOrig.find(P.first)->second;
17053 VectorizedVals.try_emplace(OrigV, P.second);
17054 }
17055 continue;
17056 }
17057 }
17058 if (VectorizedTree) {
17059 // Reorder operands of bool logical op in the natural order to avoid
17060 // possible problem with poison propagation. If not possible to reorder
17061 // (both operands are originally RHS), emit an extra freeze instruction
17062 // for the LHS operand.
17063 // I.e., if we have original code like this:
17064 // RedOp1 = select i1 ?, i1 LHS, i1 false
17065 // RedOp2 = select i1 RHS, i1 ?, i1 false
17066
17067 // Then, we swap LHS/RHS to create a new op that matches the poison
17068 // semantics of the original code.
17069
17070 // If we have original code like this and both values could be poison:
17071 // RedOp1 = select i1 ?, i1 LHS, i1 false
17072 // RedOp2 = select i1 ?, i1 RHS, i1 false
17073
17074 // Then, we must freeze LHS in the new op.
17075 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
17076 Instruction *RedOp1,
17077 Instruction *RedOp2,
17078 bool InitStep) {
17079 if (!AnyBoolLogicOp)
17080 return;
17081 if (isBoolLogicOp(RedOp1) &&
17082 ((!InitStep && LHS == VectorizedTree) ||
17083 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
17084 return;
17085 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
17086 getRdxOperand(RedOp2, 0) == RHS ||
17088 std::swap(LHS, RHS);
17089 return;
17090 }
17091 if (LHS != VectorizedTree)
17092 LHS = Builder.CreateFreeze(LHS);
17093 };
17094 // Finish the reduction.
17095 // Need to add extra arguments and not vectorized possible reduction
17096 // values.
17097 // Try to avoid dependencies between the scalar remainders after
17098 // reductions.
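// E.g., for scalar remainders r0, r1, r2, r3 this forms (r0 op r1) op (r2 op r3)
// over two passes rather than the serial chain ((r0 op r1) op r2) op r3.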
17099 auto FinalGen =
17100 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
17101 bool InitStep) {
17102 unsigned Sz = InstVals.size();
17103 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
17104 Sz % 2);
17105 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
17106 Instruction *RedOp = InstVals[I + 1].first;
17107 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
17108 Value *RdxVal1 = InstVals[I].second;
17109 Value *StableRdxVal1 = RdxVal1;
17110 auto It1 = TrackedVals.find(RdxVal1);
17111 if (It1 != TrackedVals.end())
17112 StableRdxVal1 = It1->second;
17113 Value *RdxVal2 = InstVals[I + 1].second;
17114 Value *StableRdxVal2 = RdxVal2;
17115 auto It2 = TrackedVals.find(RdxVal2);
17116 if (It2 != TrackedVals.end())
17117 StableRdxVal2 = It2->second;
17118 // To prevent poison from leaking across what used to be
17119 // sequential, safe, scalar boolean logic operations, the
17120 // reduction operand must be frozen.
17121 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
17122 RedOp, InitStep);
17123 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17124 StableRdxVal2, "op.rdx", ReductionOps);
17125 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
17126 }
17127 if (Sz % 2 == 1)
17128 ExtraReds[Sz / 2] = InstVals.back();
17129 return ExtraReds;
17130 };
17131 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
17132 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
17133 VectorizedTree);
17134 SmallPtrSet<Value *, 8> Visited;
17135 for (ArrayRef<Value *> Candidates : ReducedVals) {
17136 for (Value *RdxVal : Candidates) {
17137 if (!Visited.insert(RdxVal).second)
17138 continue;
17139 unsigned NumOps = VectorizedVals.lookup(RdxVal);
17140 for (Instruction *RedOp :
17141 ArrayRef(ReducedValsToOps.find(RdxVal)->second)
17142 .drop_back(NumOps))
17143 ExtraReductions.emplace_back(RedOp, RdxVal);
17144 }
17145 }
17146 for (auto &Pair : ExternallyUsedValues) {
17147 // Add each externally used value to the final reduction.
17148 for (auto *I : Pair.second)
17149 ExtraReductions.emplace_back(I, Pair.first);
17150 }
17151 // Iterate through all not-vectorized reduction values/extra arguments.
17152 bool InitStep = true;
17153 while (ExtraReductions.size() > 1) {
17154 VectorizedTree = ExtraReductions.front().second;
17155 SmallVector<std::pair<Instruction *, Value *>> NewReds =
17156 FinalGen(ExtraReductions, InitStep);
17157 ExtraReductions.swap(NewReds);
17158 InitStep = false;
17159 }
17160 VectorizedTree = ExtraReductions.front().second;
17161
17162 ReductionRoot->replaceAllUsesWith(VectorizedTree);
17163
17164 // The original scalar reduction is expected to have no remaining
17165 // uses outside the reduction tree itself. Assert that we got this
17166 // correct, replace internal uses with undef, and mark for eventual
17167 // deletion.
17168#ifndef NDEBUG
17169 SmallSet<Value *, 4> IgnoreSet;
17170 for (ArrayRef<Value *> RdxOps : ReductionOps)
17171 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
17172#endif
17173 for (ArrayRef<Value *> RdxOps : ReductionOps) {
17174 for (Value *Ignore : RdxOps) {
17175 if (!Ignore)
17176 continue;
17177#ifndef NDEBUG
17178 for (auto *U : Ignore->users()) {
17179 assert(IgnoreSet.count(U) &&
17180 "All users must be either in the reduction ops list.");
17181 }
17182#endif
17183 if (!Ignore->use_empty()) {
17184 Value *Undef = UndefValue::get(Ignore->getType());
17185 Ignore->replaceAllUsesWith(Undef);
17186 }
17187 V.eraseInstruction(cast<Instruction>(Ignore));
17188 }
17189 }
17190 } else if (!CheckForReusedReductionOps) {
17191 for (ReductionOpsType &RdxOps : ReductionOps)
17192 for (Value *RdxOp : RdxOps)
17193 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17194 }
17195 return VectorizedTree;
17196 }
17197
17198private:
17199 /// Calculate the cost of a reduction.
17200 InstructionCost getReductionCost(TargetTransformInfo *TTI,
17201 ArrayRef<Value *> ReducedVals,
17202 bool IsCmpSelMinMax, unsigned ReduxWidth,
17203 FastMathFlags FMF) {
17204 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17205 Type *ScalarTy = ReducedVals.front()->getType();
17206 FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
17207 InstructionCost VectorCost = 0, ScalarCost;
17208 // If all of the reduced values are constant, the vector cost is 0, since
17209 // the reduction value can be calculated at the compile time.
17210 bool AllConsts = allConstant(ReducedVals);
17211 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
17212 InstructionCost Cost = 0;
17213 // Scalar cost is repeated for N-1 elements.
17214 int Cnt = ReducedVals.size();
17215 for (Value *RdxVal : ReducedVals) {
17216 if (Cnt == 1)
17217 break;
17218 --Cnt;
17219 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
17220 Cost += GenCostFn();
17221 continue;
17222 }
17223 InstructionCost ScalarCost = 0;
17224 for (User *U : RdxVal->users()) {
17225 auto *RdxOp = cast<Instruction>(U);
17226 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17227 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
17228 continue;
17229 }
17230 ScalarCost = InstructionCost::getInvalid();
17231 break;
17232 }
17233 if (ScalarCost.isValid())
17234 Cost += ScalarCost;
17235 else
17236 Cost += GenCostFn();
17237 }
17238 return Cost;
17239 };
17240 switch (RdxKind) {
17241 case RecurKind::Add:
17242 case RecurKind::Mul:
17243 case RecurKind::Or:
17244 case RecurKind::And:
17245 case RecurKind::Xor:
17246 case RecurKind::FAdd:
17247 case RecurKind::FMul: {
17248 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
17249 if (!AllConsts)
17250 VectorCost =
17251 TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
17252 ScalarCost = EvaluateScalarCost([&]() {
17253 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
17254 });
17255 break;
17256 }
17257 case RecurKind::FMax:
17258 case RecurKind::FMin:
17259 case RecurKind::FMaximum:
17260 case RecurKind::FMinimum:
17261 case RecurKind::SMax:
17262 case RecurKind::SMin:
17263 case RecurKind::UMax:
17264 case RecurKind::UMin: {
17265 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
17266 if (!AllConsts)
17267 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
17268 ScalarCost = EvaluateScalarCost([&]() {
17269 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
17270 return TTI->getIntrinsicInstrCost(ICA, CostKind);
17271 });
17272 break;
17273 }
17274 default:
17275 llvm_unreachable("Expected arithmetic or min/max reduction operation");
17276 }
17277
17278 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
17279 << " for reduction of " << shortBundleName(ReducedVals)
17280 << " (It is a splitting reduction)\n");
17281 return VectorCost - ScalarCost;
17282 }
17283
17284 /// Emit a horizontal reduction of the vectorized value.
17285 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
17286 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
17287 assert(VectorizedValue && "Need to have a vectorized tree node");
17288 assert(isPowerOf2_32(ReduxWidth) &&
17289 "We only handle power-of-two reductions for now");
17290 assert(RdxKind != RecurKind::FMulAdd &&
17291 "A call to the llvm.fmuladd intrinsic is not handled yet");
17292
17293 ++NumVectorInstructions;
17294 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
17295 }
17296
17297 /// Emits optimized code for unique scalar value reused \p Cnt times.
17298 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17299 unsigned Cnt) {
17300 assert(IsSupportedHorRdxIdentityOp &&
17301 "The optimization of matched scalar identity horizontal reductions "
17302 "must be supported.");
17303 switch (RdxKind) {
17304 case RecurKind::Add: {
17305 // res = mul vv, n
17306 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
17307 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
17308 << VectorizedValue << ". (HorRdx)\n");
17309 return Builder.CreateMul(VectorizedValue, Scale);
17310 }
17311 case RecurKind::Xor: {
17312 // res = n % 2 ? 0 : vv
17313 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
17314 << ". (HorRdx)\n");
17315 if (Cnt % 2 == 0)
17316 return Constant::getNullValue(VectorizedValue->getType());
17317 return VectorizedValue;
17318 }
17319 case RecurKind::FAdd: {
17320 // res = fmul v, n
17321 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
17322 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
17323 << VectorizedValue << ". (HorRdx)\n");
17324 return Builder.CreateFMul(VectorizedValue, Scale);
17325 }
17326 case RecurKind::And:
17327 case RecurKind::Or:
17328 case RecurKind::SMax:
17329 case RecurKind::SMin:
17330 case RecurKind::UMax:
17331 case RecurKind::UMin:
17332 case RecurKind::FMax:
17333 case RecurKind::FMin:
17334 case RecurKind::FMaximum:
17335 case RecurKind::FMinimum:
17336 // res = vv
17337 return VectorizedValue;
17338 case RecurKind::Mul:
17339 case RecurKind::FMul:
17340 case RecurKind::FMulAdd:
17341 case RecurKind::IAnyOf:
17342 case RecurKind::FAnyOf:
17343 case RecurKind::None:
17344 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17345 }
17346 return nullptr;
17347 }
17348
17349 /// Emits actual operation for the scalar identity values, found during
17350 /// horizontal reduction analysis.
17351 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17352 ArrayRef<Value *> VL,
17353 const MapVector<Value *, unsigned> &SameValuesCounter,
17354 const DenseMap<Value *, Value *> &TrackedToOrig) {
17355 assert(IsSupportedHorRdxIdentityOp &&
17356 "The optimization of matched scalar identity horizontal reductions "
17357 "must be supported.");
17358 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
17359 if (VTy->getElementType() != VL.front()->getType()) {
17360 VectorizedValue = Builder.CreateIntCast(
17361 VectorizedValue,
17362 FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
17363 any_of(VL, [&](Value *R) {
17364 KnownBits Known = computeKnownBits(
17365 R, cast<Instruction>(ReductionOps.front().front())
17366 ->getModule()
17367 ->getDataLayout());
17368 return !Known.isNonNegative();
17369 }));
17370 }
17371 switch (RdxKind) {
17372 case RecurKind::Add: {
17373 // root = mul prev_root, <1, 1, n, 1>
17374 SmallVector<Constant *> Vals;
17375 for (Value *V : VL) {
17376 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17377 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
17378 }
17379 auto *Scale = ConstantVector::get(Vals);
17380 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
17381 << VectorizedValue << ". (HorRdx)\n");
17382 return Builder.CreateMul(VectorizedValue, Scale);
17383 }
17384 case RecurKind::And:
17385 case RecurKind::Or:
17386 // No need for multiple or/and(s).
17387 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17388 << ". (HorRdx)\n");
17389 return VectorizedValue;
17390 case RecurKind::SMax:
17391 case RecurKind::SMin:
17392 case RecurKind::UMax:
17393 case RecurKind::UMin:
17394 case RecurKind::FMax:
17395 case RecurKind::FMin:
17396 case RecurKind::FMaximum:
17397 case RecurKind::FMinimum:
17398 // No need for multiple min/max(s) of the same value.
17399 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17400 << ". (HorRdx)\n");
17401 return VectorizedValue;
17402 case RecurKind::Xor: {
17403 // Replace values with even number of repeats with 0, since
17404 // x xor x = 0.
17405 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
17406 // 7>, if the 4th and 6th elements have an even number of repeats.
17407 SmallVector<int> Mask(
17408 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
17409 PoisonMaskElem);
17410 std::iota(Mask.begin(), Mask.end(), 0);
17411 bool NeedShuffle = false;
17412 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17413 Value *V = VL[I];
17414 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17415 if (Cnt % 2 == 0) {
17416 Mask[I] = VF;
17417 NeedShuffle = true;
17418 }
17419 }
17420 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
17421 : Mask) dbgs()
17422 << I << " ";
17423 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17424 if (NeedShuffle)
17425 VectorizedValue = Builder.CreateShuffleVector(
17426 VectorizedValue,
17427 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
17428 return VectorizedValue;
17429 }
17430 case RecurKind::FAdd: {
17431 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
17432 SmallVector<Constant *> Vals;
17433 for (Value *V : VL) {
17434 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17435 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
17436 }
17437 auto *Scale = ConstantVector::get(Vals);
17438 return Builder.CreateFMul(VectorizedValue, Scale);
17439 }
17440 case RecurKind::Mul:
17441 case RecurKind::FMul:
17442 case RecurKind::FMulAdd:
17443 case RecurKind::IAnyOf:
17444 case RecurKind::FAnyOf:
17445 case RecurKind::None:
17446 llvm_unreachable("Unexpected reduction kind for reused scalars.");
17447 }
17448 return nullptr;
17449 }
17450};
17451} // end anonymous namespace
17452
17453/// Gets recurrence kind from the specified value.
17454 static RecurKind getRdxKind(Value *V) {
17455 return HorizontalReduction::getRdxKind(V);
17456}
17457static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
17458 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
17459 return cast<FixedVectorType>(IE->getType())->getNumElements();
17460
17461 unsigned AggregateSize = 1;
17462 auto *IV = cast<InsertValueInst>(InsertInst);
17463 Type *CurrentType = IV->getType();
17464 do {
17465 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
17466 for (auto *Elt : ST->elements())
17467 if (Elt != ST->getElementType(0)) // check homogeneity
17468 return std::nullopt;
17469 AggregateSize *= ST->getNumElements();
17470 CurrentType = ST->getElementType(0);
17471 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
17472 AggregateSize *= AT->getNumElements();
17473 CurrentType = AT->getElementType();
17474 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
17475 AggregateSize *= VT->getNumElements();
17476 return AggregateSize;
17477 } else if (CurrentType->isSingleValueType()) {
17478 return AggregateSize;
17479 } else {
17480 return std::nullopt;
17481 }
17482 } while (true);
17483}
17484
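/// Walks a chain of insertelement/insertvalue instructions bottom-up from
/// \p LastInsertInst, recording each inserted scalar in \p BuildVectorOpds and
/// the corresponding insert instruction in \p InsertElts at its aggregate index.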
17485static void findBuildAggregate_rec(Instruction *LastInsertInst,
17486 TargetTransformInfo *TTI,
17487 SmallVectorImpl<Value *> &BuildVectorOpds,
17488 SmallVectorImpl<Value *> &InsertElts,
17489 unsigned OperandOffset) {
17490 do {
17491 Value *InsertedOperand = LastInsertInst->getOperand(1);
17492 std::optional<unsigned> OperandIndex =
17493 getInsertIndex(LastInsertInst, OperandOffset);
17494 if (!OperandIndex)
17495 return;
17496 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
17497 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
17498 BuildVectorOpds, InsertElts, *OperandIndex);
17499
17500 } else {
17501 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17502 InsertElts[*OperandIndex] = LastInsertInst;
17503 }
17504 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
17505 } while (LastInsertInst != nullptr &&
17506 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
17507 LastInsertInst->hasOneUse());
17508}
17509
17510/// Recognize construction of vectors like
17511/// %ra = insertelement <4 x float> poison, float %s0, i32 0
17512/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
17513/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
17514/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
17515/// starting from the last insertelement or insertvalue instruction.
17516///
17517/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
17518/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
17519/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
17520///
17521/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
17522///
17523/// \return true if it matches.
17524static bool findBuildAggregate(Instruction *LastInsertInst,
17525 TargetTransformInfo *TTI,
17526 SmallVectorImpl<Value *> &BuildVectorOpds,
17527 SmallVectorImpl<Value *> &InsertElts) {
17528
17529 assert((isa<InsertElementInst>(LastInsertInst) ||
17530 isa<InsertValueInst>(LastInsertInst)) &&
17531 "Expected insertelement or insertvalue instruction!");
17532
17533 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
17534 "Expected empty result vectors!");
17535
17536 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
17537 if (!AggregateSize)
17538 return false;
17539 BuildVectorOpds.resize(*AggregateSize);
17540 InsertElts.resize(*AggregateSize);
17541
17542 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
17543 llvm::erase(BuildVectorOpds, nullptr);
17544 llvm::erase(InsertElts, nullptr);
17545 if (BuildVectorOpds.size() >= 2)
17546 return true;
17547
17548 return false;
17549}
17550
17551/// Try and get a reduction instruction from a phi node.
17552///
17553/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
17554/// if they come from either \p ParentBB or a containing loop latch.
17555///
17556/// \returns A candidate reduction value if possible, or \code nullptr \endcode
17557/// if not possible.
17558 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
17559 BasicBlock *ParentBB, LoopInfo *LI) {
17560 // There are situations where the reduction value is not dominated by the
17561 // reduction phi. Vectorizing such cases has been reported to cause
17562 // miscompiles. See PR25787.
17563 auto DominatedReduxValue = [&](Value *R) {
17564 return isa<Instruction>(R) &&
17565 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
17566 };
17567
17568 Instruction *Rdx = nullptr;
17569
17570 // Return the incoming value if it comes from the same BB as the phi node.
17571 if (P->getIncomingBlock(0) == ParentBB) {
17572 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17573 } else if (P->getIncomingBlock(1) == ParentBB) {
17574 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17575 }
17576
17577 if (Rdx && DominatedReduxValue(Rdx))
17578 return Rdx;
17579
17580 // Otherwise, check whether we have a loop latch to look at.
17581 Loop *BBL = LI->getLoopFor(ParentBB);
17582 if (!BBL)
17583 return nullptr;
17584 BasicBlock *BBLatch = BBL->getLoopLatch();
17585 if (!BBLatch)
17586 return nullptr;
17587
17588 // There is a loop latch, return the incoming value if it comes from
17589 // that. This reduction pattern occasionally turns up.
17590 if (P->getIncomingBlock(0) == BBLatch) {
17591 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17592 } else if (P->getIncomingBlock(1) == BBLatch) {
17593 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17594 }
17595
17596 if (Rdx && DominatedReduxValue(Rdx))
17597 return Rdx;
17598
17599 return nullptr;
17600}
17601
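/// Matches a reduction operation: any binary operator, or one of the
/// two-operand min/max intrinsics (maxnum, minnum, maximum, minimum, smax,
/// smin, umax, umin), capturing its operands in \p V0 and \p V1.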
17602static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
17603 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
17604 return true;
17605 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
17606 return true;
17607 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
17608 return true;
17609 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
17610 return true;
17611 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
17612 return true;
17613 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
17614 return true;
17615 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
17616 return true;
17617 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
17618 return true;
17619 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
17620 return true;
17621 return false;
17622}
17623
17624/// We could have an initial reduction that is not an add.
17625/// r *= v1 + v2 + v3 + v4
17626/// In such a case start looking for a tree rooted in the first '+'.
17627 /// \returns the new root if found, which may be nullptr if not an instruction.
17628 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
17629 Instruction *Root) {
17630 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17631 isa<IntrinsicInst>(Root)) &&
17632 "Expected binop, select, or intrinsic for reduction matching");
17633 Value *LHS =
17634 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
17635 Value *RHS =
17636 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
17637 if (LHS == Phi)
17638 return dyn_cast<Instruction>(RHS);
17639 if (RHS == Phi)
17640 return dyn_cast<Instruction>(LHS);
17641 return nullptr;
17642}
17643
17644 /// \returns the first operand of \p I that does not match \p Phi. If the
17645 /// operand is not an instruction it returns nullptr.
17646 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
17647 Value *Op0 = nullptr;
17648 Value *Op1 = nullptr;
17649 if (!matchRdxBop(I, Op0, Op1))
17650 return nullptr;
17651 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
17652}
17653
17654 /// \returns true if \p I is a candidate instruction for reduction vectorization.
17655 static bool isReductionCandidate(Instruction *I) {
17656 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
17657 Value *B0 = nullptr, *B1 = nullptr;
17658 bool IsBinop = matchRdxBop(I, B0, B1);
17659 return IsBinop || IsSelect;
17660}
17661
17662bool SLPVectorizerPass::vectorizeHorReduction(
17663 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
17664 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
17665 if (!ShouldVectorizeHor)
17666 return false;
17667 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
17668
17669 if (Root->getParent() != BB || isa<PHINode>(Root))
17670 return false;
17671
17672 // If we can find a secondary reduction root, use that instead.
17673 auto SelectRoot = [&]() {
17674 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
17675 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
17676 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
17677 return NewRoot;
17678 return Root;
17679 };
17680
17681 // Start the analysis from the Root instruction. If a horizontal reduction is
17682 // found, try to vectorize it. If it is not a horizontal reduction or
17683 // vectorization is not possible or not effective, and currently analyzed
17684 // instruction is a binary operation, try to vectorize the operands, using
17685 // pre-order DFS traversal order. If the operands were not vectorized, repeat
17686 // the same procedure considering each operand as a possible root of the
17687 // horizontal reduction.
17688 // Interrupt the process if the Root instruction itself was vectorized or all
17689 // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
17690 // If a horizontal reduction was not matched or vectorized, we collect
17691 // instructions for possible later vectorization attempts.
17692 std::queue<std::pair<Instruction *, unsigned>> Stack;
17693 Stack.emplace(SelectRoot(), 0);
17694 SmallPtrSet<Value *, 8> VisitedInstrs;
17695 bool Res = false;
17696 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
17697 if (R.isAnalyzedReductionRoot(Inst))
17698 return nullptr;
17699 if (!isReductionCandidate(Inst))
17700 return nullptr;
17701 HorizontalReduction HorRdx;
17702 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
17703 return nullptr;
17704 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
17705 };
17706 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
17707 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
17708 FutureSeed = getNonPhiOperand(Root, P);
17709 if (!FutureSeed)
17710 return false;
17711 }
17712 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
17713 // analysis is done separately.
17714 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
17715 PostponedInsts.push_back(FutureSeed);
17716 return true;
17717 };
17718
17719 while (!Stack.empty()) {
17720 Instruction *Inst;
17721 unsigned Level;
17722 std::tie(Inst, Level) = Stack.front();
17723 Stack.pop();
17724 // Do not try to analyze an instruction that has already been vectorized.
17725 // This may happen when we vectorize instruction operands on a previous
17726 // iteration while stack was populated before that happened.
17727 if (R.isDeleted(Inst))
17728 continue;
17729 if (Value *VectorizedV = TryToReduce(Inst)) {
17730 Res = true;
17731 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
17732 // Try to find another reduction.
17733 Stack.emplace(I, Level);
17734 continue;
17735 }
17736 } else {
17737 // We could not vectorize `Inst` so try to use it as a future seed.
17738 if (!TryAppendToPostponedInsts(Inst)) {
17739 assert(Stack.empty() && "Expected empty stack");
17740 break;
17741 }
17742 }
17743
17744 // Try to vectorize operands.
17745 // Continue analysis for the instruction from the same basic block only to
17746 // save compile time.
17747 if (++Level < RecursionMaxDepth)
17748 for (auto *Op : Inst->operand_values())
17749 if (VisitedInstrs.insert(Op).second)
17750 if (auto *I = dyn_cast<Instruction>(Op))
17751 // Do not try to vectorize CmpInst operands, this is done
17752 // separately.
17753 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
17754 !R.isDeleted(I) && I->getParent() == BB)
17755 Stack.emplace(I, Level);
17756 }
17757 return Res;
17758}
17759
17760bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
17761 BasicBlock *BB, BoUpSLP &R,
17762 TargetTransformInfo *TTI) {
17763 SmallVector<WeakTrackingVH> PostponedInsts;
17764 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
17765 Res |= tryToVectorize(PostponedInsts, R);
17766 return Res;
17767}
17768
17769bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
17770 BoUpSLP &R) {
17771 bool Res = false;
17772 for (Value *V : Insts)
17773 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
17774 Res |= tryToVectorize(Inst, R);
17775 return Res;
17776}
17777
17778bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
17779 BasicBlock *BB, BoUpSLP &R) {
17780 if (!R.canMapToVector(IVI->getType()))
17781 return false;
17782
17783 SmallVector<Value *, 16> BuildVectorOpds;
17784 SmallVector<Value *, 16> BuildVectorInsts;
17785 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
17786 return false;
17787
17788 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
17789 // Aggregate value is unlikely to be processed in vector register.
17790 return tryToVectorizeList(BuildVectorOpds, R);
17791}
17792
17793bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
17794 BasicBlock *BB, BoUpSLP &R) {
17795 SmallVector<Value *, 16> BuildVectorInsts;
17796 SmallVector<Value *, 16> BuildVectorOpds;
17797 SmallVector<int> Mask;
17798 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
17799 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
17800 isFixedVectorShuffle(BuildVectorOpds, Mask)))
17801 return false;
17802
17803 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
17804 return tryToVectorizeList(BuildVectorInsts, R);
17805}
17806
17807template <typename T>
17808 static bool tryToVectorizeSequence(
17809 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
17810 function_ref<bool(T *, T *)> AreCompatible,
17811 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
17812 bool MaxVFOnly, BoUpSLP &R) {
17813 bool Changed = false;
17814 // Sort by type, parent, operands.
17815 stable_sort(Incoming, Comparator);
17816
17817 // Try to vectorize elements based on their type.
17818 SmallVector<T *> Candidates;
17819 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
17820 // Look for the next elements with the same type, parent and operand
17821 // kinds.
17822 auto *SameTypeIt = IncIt;
17823 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
17824 ++SameTypeIt;
17825
17826 // Try to vectorize them.
17827 unsigned NumElts = (SameTypeIt - IncIt);
17828 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
17829 << NumElts << ")\n");
17830 // The vectorization is a 3-state attempt:
17831 // 1. Try to vectorize instructions with the same/alternate opcodes with the
17832 // size of maximal register at first.
17833 // 2. Try to vectorize remaining instructions with the same type, if
17834 // possible. This may result in better vectorization results than trying
17835 // to vectorize just the instructions with the same/alternate opcodes.
17836 // 3. Final attempt to try to vectorize all instructions with the
17837 // same/alternate ops only, this may result in some extra final
17838 // vectorization.
17839 if (NumElts > 1 &&
17840 TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
17841 // Success: start over because instructions might have been changed.
17842 Changed = true;
17843 } else {
17844 /// \Returns the minimum number of elements that we will attempt to
17845 /// vectorize.
17846 auto GetMinNumElements = [&R](Value *V) {
17847 unsigned EltSize = R.getVectorElementSize(V);
17848 return std::max(2U, R.getMaxVecRegSize() / EltSize);
17849 };
17850 if (NumElts < GetMinNumElements(*IncIt) &&
17851 (Candidates.empty() ||
17852 Candidates.front()->getType() == (*IncIt)->getType())) {
17853 Candidates.append(IncIt, std::next(IncIt, NumElts));
17854 }
17855 }
17856 // Final attempt to vectorize instructions with the same types.
17857 if (Candidates.size() > 1 &&
17858 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
17859 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
17860 // Success: start over because instructions might have been changed.
17861 Changed = true;
17862 } else if (MaxVFOnly) {
17863 // Try to vectorize using small vectors.
17864 for (auto *It = Candidates.begin(), *End = Candidates.end();
17865 It != End;) {
17866 auto *SameTypeIt = It;
17867 while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
17868 ++SameTypeIt;
17869 unsigned NumElts = (SameTypeIt - It);
17870 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
17871 /*MaxVFOnly=*/false))
17872 Changed = true;
17873 It = SameTypeIt;
17874 }
17875 }
17876 Candidates.clear();
17877 }
17878
17879 // Start over at the next instruction of a different type (or the end).
17880 IncIt = SameTypeIt;
17881 }
17882 return Changed;
17883}
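// --- Illustrative sketch, not part of SLPVectorizer.cpp: the control flow of
// tryToVectorizeSequence above, reduced to a standalone example. Every name
// here (Item, tryGroup, processSequence) is hypothetical; only the strategy is
// mirrored: sort by a comparator, walk runs of mutually compatible elements,
// try each full run first, and give the accumulated leftovers one more chance
// once the grouping key changes.
#include <algorithm>
#include <cstddef>
#include <vector>

namespace sketch {
struct Item {
  int Kind; // stand-in for the "type/parent/opcode" compatibility key
  int Id;
};

// Stand-in for TryToVectorizeHelper: pretend any run of two or more items can
// be handled.
inline bool tryGroup(const std::vector<Item> &Run) { return Run.size() > 1; }

inline bool processSequence(std::vector<Item> &Items) {
  bool Changed = false;
  std::stable_sort(Items.begin(), Items.end(),
                   [](const Item &A, const Item &B) { return A.Kind < B.Kind; });
  std::vector<Item> Candidates;
  for (std::size_t I = 0, E = Items.size(); I != E;) {
    std::size_t J = I;
    while (J != E && Items[J].Kind == Items[I].Kind) // AreCompatible stand-in
      ++J;
    std::vector<Item> Run(Items.begin() + I, Items.begin() + J);
    if (Run.size() > 1 && tryGroup(Run))
      Changed = true; // whole run handled at the preferred width
    else
      Candidates.insert(Candidates.end(), Run.begin(), Run.end());
    // When the key changes (or the input ends), retry the leftovers together.
    if (Candidates.size() > 1 && (J == E || Items[J].Kind != Items[I].Kind)) {
      Changed |= tryGroup(Candidates);
      Candidates.clear();
    }
    I = J;
  }
  return Changed;
}
} // namespace sketch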
17884
17885/// Compare two cmp instructions. If IsCompatibility is true, function returns
17886/// true if 2 cmps have same/swapped predicates and compatible corresponding
17887/// operands. If IsCompatibility is false, function implements strict weak
17888/// ordering relation between two cmp instructions, returning true if the first
17889/// instruction is "less" than the second, i.e. its predicate is less than the
17890/// predicate of the second, or the operand IDs are less than the operand IDs
17891/// of the second cmp instruction.
17892template <bool IsCompatibility>
17893static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
17894 const DominatorTree &DT) {
17895 assert(isValidElementType(V->getType()) &&
17896 isValidElementType(V2->getType()) &&
17897 "Expected valid element types only.");
17898 if (V == V2)
17899 return IsCompatibility;
17900 auto *CI1 = cast<CmpInst>(V);
17901 auto *CI2 = cast<CmpInst>(V2);
17902 if (CI1->getOperand(0)->getType()->getTypeID() <
17903 CI2->getOperand(0)->getType()->getTypeID())
17904 return !IsCompatibility;
17905 if (CI1->getOperand(0)->getType()->getTypeID() >
17906 CI2->getOperand(0)->getType()->getTypeID())
17907 return false;
17908 CmpInst::Predicate Pred1 = CI1->getPredicate();
17909 CmpInst::Predicate Pred2 = CI2->getPredicate();
17910 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
17911 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
17912 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
17913 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
17914 if (BasePred1 < BasePred2)
17915 return !IsCompatibility;
17916 if (BasePred1 > BasePred2)
17917 return false;
17918 // Compare operands.
17919 bool CI1Preds = Pred1 == BasePred1;
17920 bool CI2Preds = Pred2 == BasePred1;
17921 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
17922 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
17923 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
17924 if (Op1 == Op2)
17925 continue;
17926 if (Op1->getValueID() < Op2->getValueID())
17927 return !IsCompatibility;
17928 if (Op1->getValueID() > Op2->getValueID())
17929 return false;
17930 if (auto *I1 = dyn_cast<Instruction>(Op1))
17931 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
17932 if (IsCompatibility) {
17933 if (I1->getParent() != I2->getParent())
17934 return false;
17935 } else {
17936 // Try to compare nodes with same parent.
17937 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
17938 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
17939 if (!NodeI1)
17940 return NodeI2 != nullptr;
17941 if (!NodeI2)
17942 return false;
17943 assert((NodeI1 == NodeI2) ==
17944 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
17945 "Different nodes should have different DFS numbers");
17946 if (NodeI1 != NodeI2)
17947 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
17948 }
17949 InstructionsState S = getSameOpcode({I1, I2}, TLI);
17950 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
17951 continue;
17952 if (IsCompatibility)
17953 return false;
17954 if (I1->getOpcode() != I2->getOpcode())
17955 return I1->getOpcode() < I2->getOpcode();
17956 }
17957 }
17958 return IsCompatibility;
17959}
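// --- Illustrative sketch, not part of SLPVectorizer.cpp: the predicate
// canonicalization compareCmp relies on. A compare and its operand-swapped
// form ("sgt a, b" vs. "slt b, a") map to the same base predicate via
// std::min(Pred, SwappedPred), so they sort next to each other and are treated
// as compatible; compareCmp then walks the operands in reversed order for the
// compare whose predicate is not already the base form. The enum and swap
// table below are hypothetical stand-ins for CmpInst::Predicate and
// CmpInst::getSwappedPredicate().
#include <algorithm>

namespace sketch {
enum Pred { SLT = 0, SGT = 1, SLE = 2, SGE = 3 };

inline Pred swapped(Pred P) {
  switch (P) {
  case SLT: return SGT;
  case SGT: return SLT;
  case SLE: return SGE;
  case SGE: return SLE;
  }
  return P;
}

// Both SGT and its swapped form SLT canonicalize to SLT here.
inline Pred basePred(Pred P) { return std::min(P, swapped(P)); }
} // namespace sketch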
17960
17961template <typename ItT>
17962bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
17963 BasicBlock *BB, BoUpSLP &R) {
17964 bool Changed = false;
17965 // Try to find reductions first.
17966 for (CmpInst *I : CmpInsts) {
17967 if (R.isDeleted(I))
17968 continue;
17969 for (Value *Op : I->operands())
17970 if (auto *RootOp = dyn_cast<Instruction>(Op))
17971 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
17972 }
17973 // Try to vectorize operands as vector bundles.
17974 for (CmpInst *I : CmpInsts) {
17975 if (R.isDeleted(I))
17976 continue;
17977 Changed |= tryToVectorize(I, R);
17978 }
17979 // Try to vectorize list of compares.
17980 // Sort by type, compare predicate, etc.
17981 auto CompareSorter = [&](Value *V, Value *V2) {
17982 if (V == V2)
17983 return false;
17984 return compareCmp<false>(V, V2, *TLI, *DT);
17985 };
17986
17987 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
17988 if (V1 == V2)
17989 return true;
17990 return compareCmp<true>(V1, V2, *TLI, *DT);
17991 };
17992
17993 SmallVector<Value *> Vals;
17994 for (Instruction *V : CmpInsts)
17995 if (!R.isDeleted(V) && isValidElementType(V->getType()))
17996 Vals.push_back(V);
17997 if (Vals.size() <= 1)
17998 return Changed;
17999 Changed |= tryToVectorizeSequence<Value>(
18000 Vals, CompareSorter, AreCompatibleCompares,
18001 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18002 // Exclude possible reductions from other blocks.
18003 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
18004 return any_of(V->users(), [V](User *U) {
18005 auto *Select = dyn_cast<SelectInst>(U);
18006 return Select &&
18007 Select->getParent() != cast<Instruction>(V)->getParent();
18008 });
18009 });
18010 if (ArePossiblyReducedInOtherBlock)
18011 return false;
18012 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18013 },
18014 /*MaxVFOnly=*/true, R);
18015 return Changed;
18016}
18017
18018bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18019 BasicBlock *BB, BoUpSLP &R) {
18020 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18021 "This function only accepts Insert instructions");
18022 bool OpsChanged = false;
18023 SmallVector<WeakTrackingVH> PostponedInsts;
18024 // Pass 1: try to vectorize reductions only.
18025 for (auto *I : reverse(Instructions)) {
18026 if (R.isDeleted(I))
18027 continue;
18028 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
18029 }
18030 // Pass 2: try to match and vectorize a buildvector sequence.
18031 for (auto *I : reverse(Instructions)) {
18032 if (R.isDeleted(I) || isa<CmpInst>(I))
18033 continue;
18034 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18035 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
18036 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18037 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
18038 }
18039 }
18040 // Now try to vectorize postponed instructions.
18041 OpsChanged |= tryToVectorize(PostponedInsts, R);
18042
18043 Instructions.clear();
18044 return OpsChanged;
18045}
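// --- Illustrative source pattern, not part of SLPVectorizer.cpp: the kind of
// buildvector sequence pass 2 of vectorizeInserts looks for. Writing every
// lane of a small aggregate from independent scalar computations lowers to a
// chain of insertelement/insertvalue instructions that findBuildAggregate
// recognizes bottom-up from the last insert. The struct and function names
// below (Vec4, buildVec4) are hypothetical.
struct Vec4 {
  float X, Y, Z, W;
};

inline Vec4 buildVec4(const float *A, const float *B) {
  // Four independent lane computations feeding one aggregate: a natural seed
  // for tryToVectorizeList on the inserted operands.
  Vec4 V;
  V.X = A[0] + B[0];
  V.Y = A[1] + B[1];
  V.Z = A[2] + B[2];
  V.W = A[3] + B[3];
  return V;
}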
18046
18047bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
18048 bool Changed = false;
18049 SmallVector<Value *, 4> Incoming;
18050 SmallPtrSet<Value *, 16> VisitedInstrs;
18051 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
18052 // node. This allows us to better identify the chains that can be
18053 // vectorized in the best way.
18054 DenseMap<Value *, SmallVector<Value *>> PHIToOpcodes;
18055 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
18056 assert(isValidElementType(V1->getType()) &&
18057 isValidElementType(V2->getType()) &&
18058 "Expected vectorizable types only.");
18059 // It is fine to compare type IDs here, since we expect only vectorizable
18060 // types, like ints, floats and pointers; we don't care about other types.
18061 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
18062 return true;
18063 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
18064 return false;
18065 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18066 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18067 if (Opcodes1.size() < Opcodes2.size())
18068 return true;
18069 if (Opcodes1.size() > Opcodes2.size())
18070 return false;
18071 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18072 {
18073 // Instructions come first.
18074 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
18075 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
18076 if (I1 && I2) {
18077 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
18078 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
18079 if (!NodeI1)
18080 return NodeI2 != nullptr;
18081 if (!NodeI2)
18082 return false;
18083 assert((NodeI1 == NodeI2) ==
18084 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18085 "Different nodes should have different DFS numbers");
18086 if (NodeI1 != NodeI2)
18087 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18088 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18089 if (S.getOpcode() && !S.isAltShuffle())
18090 continue;
18091 return I1->getOpcode() < I2->getOpcode();
18092 }
18093 if (I1)
18094 return true;
18095 if (I2)
18096 return false;
18097 }
18098 {
18099 // Non-undef constants come next.
18100 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
18101 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
18102 if (C1 && C2)
18103 continue;
18104 if (C1)
18105 return true;
18106 if (C2)
18107 return false;
18108 }
18109 bool U1 = isa<UndefValue>(Opcodes1[I]);
18110 bool U2 = isa<UndefValue>(Opcodes2[I]);
18111 {
18112 // Non-constant non-instructions come next.
18113 if (!U1 && !U2) {
18114 auto ValID1 = Opcodes1[I]->getValueID();
18115 auto ValID2 = Opcodes2[I]->getValueID();
18116 if (ValID1 == ValID2)
18117 continue;
18118 if (ValID1 < ValID2)
18119 return true;
18120 if (ValID1 > ValID2)
18121 return false;
18122 }
18123 if (!U1)
18124 return true;
18125 if (!U2)
18126 return false;
18127 }
18128 // Undefs come last.
18129 assert(U1 && U2 && "The only thing left should be undef & undef.");
18130 continue;
18131 }
18132 return false;
18133 };
18134 auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
18135 if (V1 == V2)
18136 return true;
18137 if (V1->getType() != V2->getType())
18138 return false;
18139 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18140 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18141 if (Opcodes1.size() != Opcodes2.size())
18142 return false;
18143 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18144 // Undefs are compatible with any other value.
18145 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
18146 continue;
18147 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
18148 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
18149 if (I1->getParent() != I2->getParent())
18150 return false;
18151 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18152 if (S.getOpcode())
18153 continue;
18154 return false;
18155 }
18156 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
18157 continue;
18158 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
18159 return false;
18160 }
18161 return true;
18162 };
18163
18164 bool HaveVectorizedPhiNodes = false;
18165 do {
18166 // Collect the incoming values from the PHIs.
18167 Incoming.clear();
18168 for (Instruction &I : *BB) {
18169 PHINode *P = dyn_cast<PHINode>(&I);
18170 if (!P)
18171 break;
18172
18173 // No need to analyze deleted, vectorized and non-vectorizable
18174 // instructions.
18175 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
18176 isValidElementType(P->getType()))
18177 Incoming.push_back(P);
18178 }
18179
18180 if (Incoming.size() <= 1)
18181 break;
18182
18183 // Find the corresponding non-phi nodes for better matching when trying to
18184 // build the tree.
18185 for (Value *V : Incoming) {
18186 SmallVectorImpl<Value *> &Opcodes =
18187 PHIToOpcodes.try_emplace(V).first->getSecond();
18188 if (!Opcodes.empty())
18189 continue;
18190 SmallVector<Value *, 4> Nodes(1, V);
18191 SmallPtrSet<Value *, 4> Visited;
18192 while (!Nodes.empty()) {
18193 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
18194 if (!Visited.insert(PHI).second)
18195 continue;
18196 for (Value *V : PHI->incoming_values()) {
18197 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
18198 Nodes.push_back(PHI1);
18199 continue;
18200 }
18201 Opcodes.emplace_back(V);
18202 }
18203 }
18204 }
18205
18206 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18207 Incoming, PHICompare, AreCompatiblePHIs,
18208 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18209 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18210 },
18211 /*MaxVFOnly=*/true, R);
18212 Changed |= HaveVectorizedPhiNodes;
18213 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
18214 } while (HaveVectorizedPhiNodes);
18215
18216 VisitedInstrs.clear();
18217
18218 InstSetVector PostProcessInserts;
18219 SmallSetVector<CmpInst *, 8> PostProcessCmps;
18220 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
18221 // true, also vectorizes `PostProcessCmps`.
18222 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
18223 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
18224 if (VectorizeCmps) {
18225 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
18226 PostProcessCmps.clear();
18227 }
18228 PostProcessInserts.clear();
18229 return Changed;
18230 };
18231 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
18232 auto IsInPostProcessInstrs = [&](Instruction *I) {
18233 if (auto *Cmp = dyn_cast<CmpInst>(I))
18234 return PostProcessCmps.contains(Cmp);
18235 return isa<InsertElementInst, InsertValueInst>(I) &&
18236 PostProcessInserts.contains(I);
18237 };
18238 // Returns true if `I` is an instruction without users, such as a terminator,
18239 // a store, or a function call whose return value is ignored. Only void-typed
18240 // instructions qualify, except for CallInst and InvokeInst.
18241 auto HasNoUsers = [](Instruction *I) {
18242 return I->use_empty() &&
18243 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
18244 };
18245 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
18246 // Skip instructions with scalable type. The number of elements is unknown
18247 // at compile time for scalable types.
18248 if (isa<ScalableVectorType>(It->getType()))
18249 continue;
18250
18251 // Skip instructions marked for deletion.
18252 if (R.isDeleted(&*It))
18253 continue;
18254 // We may go through BB multiple times, so skip the ones already checked.
18255 if (!VisitedInstrs.insert(&*It).second) {
18256 if (HasNoUsers(&*It) &&
18257 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
18258 // We would like to start over since some instructions are deleted
18259 // and the iterator may become invalid.
18260 Changed = true;
18261 It = BB->begin();
18262 E = BB->end();
18263 }
18264 continue;
18265 }
18266
18267 if (isa<DbgInfoIntrinsic>(It))
18268 continue;
18269
18270 // Try to vectorize reductions that use PHINodes.
18271 if (PHINode *P = dyn_cast<PHINode>(It)) {
18272 // Check that the PHI is a reduction PHI.
18273 if (P->getNumIncomingValues() == 2) {
18274 // Try to match and vectorize a horizontal reduction.
18275 Instruction *Root = getReductionInstr(DT, P, BB, LI);
18276 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
18277 Changed = true;
18278 It = BB->begin();
18279 E = BB->end();
18280 continue;
18281 }
18282 }
18283 // Try to vectorize the incoming values of the PHI, to catch reductions
18284 // that feed into PHIs.
18285 for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
18286 // Skip if the incoming block is the current BB for now. Also, bypass
18287 // unreachable IR for efficiency and to avoid crashing.
18288 // TODO: Collect the skipped incoming values and try to vectorize them
18289 // after processing BB.
18290 if (BB == P->getIncomingBlock(I) ||
18291 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
18292 continue;
18293
18294 // Postponed instructions should not be vectorized here; delay their
18295 // vectorization.
18296 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
18297 PI && !IsInPostProcessInstrs(PI))
18298 Changed |= vectorizeRootInstruction(nullptr, PI,
18299 P->getIncomingBlock(I), R, TTI);
18300 }
18301 continue;
18302 }
18303
18304 if (HasNoUsers(&*It)) {
18305 bool OpsChanged = false;
18306 auto *SI = dyn_cast<StoreInst>(It);
18307 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
18308 if (SI) {
18309 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
18310 // Try to vectorize chain in store, if this is the only store to the
18311 // address in the block.
18312 // TODO: This is just a temporary solution to save compile time. Need
18313 // to investigate if we can safely turn on slp-vectorize-hor-store
18314 // instead to allow lookup for reduction chains in all non-vectorized
18315 // stores (need to check side effects and compile time).
18316 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
18317 SI->getValueOperand()->hasOneUse();
18318 }
18319 if (TryToVectorizeRoot) {
18320 for (auto *V : It->operand_values()) {
18321 // Postponed instructions should not be vectorized here; delay their
18322 // vectorization.
18323 if (auto *VI = dyn_cast<Instruction>(V);
18324 VI && !IsInPostProcessInstrs(VI))
18325 // Try to match and vectorize a horizontal reduction.
18326 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
18327 }
18328 }
18329 // Start vectorization of post-process list of instructions from the
18330 // top-tree instructions to try to vectorize as many instructions as
18331 // possible.
18332 OpsChanged |=
18333 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
18334 if (OpsChanged) {
18335 // We would like to start over since some instructions are deleted
18336 // and the iterator may become invalid.
18337 Changed = true;
18338 It = BB->begin();
18339 E = BB->end();
18340 continue;
18341 }
18342 }
18343
18344 if (isa<InsertElementInst, InsertValueInst>(It))
18345 PostProcessInserts.insert(&*It);
18346 else if (isa<CmpInst>(It))
18347 PostProcessCmps.insert(cast<CmpInst>(&*It));
18348 }
18349
18350 return Changed;
18351}
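// --- Illustrative source pattern, not part of SLPVectorizer.cpp: the
// two-incoming-value reduction PHI that vectorizeChainsInBlock probes with
// getReductionInstr/vectorizeRootInstruction. An unrolled accumulation gives
// the PHI one incoming value from the preheader and one from the latch, with
// the latch value rooted at a reduction tree of adds. The function name and
// unroll factor below are hypothetical.
inline int sumUnrolledBy4(const int *A, int N) {
  int Sum = 0; // loop-carried accumulator: a PHI with two incoming values
  for (int I = 0; I + 4 <= N; I += 4)
    // The add tree feeding the accumulator is the horizontal-reduction root
    // that the pass tries to match and vectorize.
    Sum += A[I] + A[I + 1] + A[I + 2] + A[I + 3];
  return Sum;
}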
18352
18353bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
18354 auto Changed = false;
18355 for (auto &Entry : GEPs) {
18356 // If the getelementptr list has fewer than two elements, there's nothing
18357 // to do.
18358 if (Entry.second.size() < 2)
18359 continue;
18360
18361 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
18362 << Entry.second.size() << ".\n");
18363
18364 // Process the GEP list in chunks suitable for the target's supported
18365 // vector size. If a vector register can't hold 1 element, we are done. We
18366 // are trying to vectorize the index computations, so the maximum number of
18367 // elements is based on the size of the index expression, rather than the
18368 // size of the GEP itself (the target's pointer size).
18369 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18370 unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
18371 if (MaxVecRegSize < EltSize)
18372 continue;
18373
18374 unsigned MaxElts = MaxVecRegSize / EltSize;
18375 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
18376 auto Len = std::min<unsigned>(BE - BI, MaxElts);
18377 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
18378
18379 // Initialize a set of candidate getelementptrs. Note that we use a
18380 // SetVector here to preserve program order. If the index computations
18381 // are vectorizable and begin with loads, we want to minimize the chance
18382 // of having to reorder them later.
18383 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
18384
18385 // Some of the candidates may have already been vectorized after we
18386 // initially collected them, or their index was optimized to a constant
18387 // value. If so, they are marked as deleted, so remove them from the set
18388 // of candidates.
18389 Candidates.remove_if([&R](Value *I) {
18390 return R.isDeleted(cast<Instruction>(I)) ||
18391 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
18392 });
18393
18394 // Remove from the set of candidates all pairs of getelementptrs with
18395 // constant differences. Such getelementptrs are likely not good
18396 // candidates for vectorization in a bottom-up phase since one can be
18397 // computed from the other. We also ensure all candidate getelementptr
18398 // indices are unique.
18399 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
18400 auto *GEPI = GEPList[I];
18401 if (!Candidates.count(GEPI))
18402 continue;
18403 auto *SCEVI = SE->getSCEV(GEPList[I]);
18404 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
18405 auto *GEPJ = GEPList[J];
18406 auto *SCEVJ = SE->getSCEV(GEPList[J]);
18407 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
18408 Candidates.remove(GEPI);
18409 Candidates.remove(GEPJ);
18410 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
18411 Candidates.remove(GEPJ);
18412 }
18413 }
18414 }
18415
18416 // We break out of the above computation as soon as we know there are
18417 // fewer than two candidates remaining.
18418 if (Candidates.size() < 2)
18419 continue;
18420
18421 // Add the single, non-constant index of each candidate to the bundle. We
18422 // ensured the indices met these constraints when we originally collected
18423 // the getelementptrs.
18424 SmallVector<Value *, 16> Bundle(Candidates.size());
18425 auto BundleIndex = 0u;
18426 for (auto *V : Candidates) {
18427 auto *GEP = cast<GetElementPtrInst>(V);
18428 auto *GEPIdx = GEP->idx_begin()->get();
18429 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
18430 Bundle[BundleIndex++] = GEPIdx;
18431 }
18432
18433 // Try and vectorize the indices. We are currently only interested in
18434 // gather-like cases of the form:
18435 //
18436 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
18437 //
18438 // where the loads of "a", the loads of "b", and the subtractions can be
18439 // performed in parallel. It's likely that detecting this pattern in a
18440 // bottom-up phase will be simpler and less costly than building a
18441 // full-blown top-down phase beginning at the consecutive loads.
18442 Changed |= tryToVectorizeList(Bundle, R);
18443 }
18444 }
18445 return Changed;
18446}
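// --- Illustrative source pattern, not part of SLPVectorizer.cpp: the
// gather-like shape whose *index* computations vectorizeGEPIndices targets,
// matching the "g[a[0] - b[0]] + g[a[1] - b[1]] + ..." comment above. The
// loads of a[], the loads of b[], and the subtractions can be vectorized even
// though the g[...] accesses themselves stay scalar (or become a gather).
// All names here (gatherSum, g, a, b) are hypothetical.
inline int gatherSum(const int *g, const int *a, const int *b) {
  return g[a[0] - b[0]] + g[a[1] - b[1]] + g[a[2] - b[2]] + g[a[3] - b[3]];
}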
18447
18448bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
18449 bool Changed = false;
18450 // Sort by type, base pointers and value operands. Value operands must be
18451 // compatible (have the same opcode and the same parent), otherwise it is
18452 // definitely not profitable to try to vectorize them.
18453 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
18454 if (V->getValueOperand()->getType()->getTypeID() <
18455 V2->getValueOperand()->getType()->getTypeID())
18456 return true;
18457 if (V->getValueOperand()->getType()->getTypeID() >
18458 V2->getValueOperand()->getType()->getTypeID())
18459 return false;
18460 if (V->getPointerOperandType()->getTypeID() <
18461 V2->getPointerOperandType()->getTypeID())
18462 return true;
18463 if (V->getPointerOperandType()->getTypeID() >
18464 V2->getPointerOperandType()->getTypeID())
18465 return false;
18466 // UndefValues are compatible with all other values.
18467 if (isa<UndefValue>(V->getValueOperand()) ||
18468 isa<UndefValue>(V2->getValueOperand()))
18469 return false;
18470 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
18471 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18472 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
18473 DT->getNode(I1->getParent());
18474 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
18475 DT->getNode(I2->getParent());
18476 assert(NodeI1 && "Should only process reachable instructions");
18477 assert(NodeI2 && "Should only process reachable instructions");
18478 assert((NodeI1 == NodeI2) ==
18479 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18480 "Different nodes should have different DFS numbers");
18481 if (NodeI1 != NodeI2)
18482 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18483 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18484 if (S.getOpcode())
18485 return false;
18486 return I1->getOpcode() < I2->getOpcode();
18487 }
18488 if (isa<Constant>(V->getValueOperand()) &&
18489 isa<Constant>(V2->getValueOperand()))
18490 return false;
18491 return V->getValueOperand()->getValueID() <
18492 V2->getValueOperand()->getValueID();
18493 };
18494
18495 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
18496 if (V1 == V2)
18497 return true;
18498 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
18499 return false;
18500 if (V1->getPointerOperandType() != V2->getPointerOperandType())
18501 return false;
18502 // Undefs are compatible with any other value.
18503 if (isa<UndefValue>(V1->getValueOperand()) ||
18504 isa<UndefValue>(V2->getValueOperand()))
18505 return true;
18506 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
18507 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18508 if (I1->getParent() != I2->getParent())
18509 return false;
18510 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18511 return S.getOpcode() > 0;
18512 }
18513 if (isa<Constant>(V1->getValueOperand()) &&
18514 isa<Constant>(V2->getValueOperand()))
18515 return true;
18516 return V1->getValueOperand()->getValueID() ==
18517 V2->getValueOperand()->getValueID();
18518 };
18519
18520 // Attempt to sort and vectorize each of the store-groups.
18521 DenseSet<std::pair<Value *, Value *>> Attempted;
18522 for (auto &Pair : Stores) {
18523 if (Pair.second.size() < 2)
18524 continue;
18525
18526 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
18527 << Pair.second.size() << ".\n");
18528
18529 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
18530 continue;
18531
18532 // Reverse stores to do bottom-to-top analysis. This is important if the
18533 // same addresses are stored to several times; in that case we need to
18534 // follow the store order (reversed to meet the memory dependencies).
18535 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
18536 Pair.second.rend());
18537 Changed |= tryToVectorizeSequence<StoreInst>(
18538 ReversedStores, StoreSorter, AreCompatibleStores,
18539 [&](ArrayRef<StoreInst *> Candidates, bool) {
18540 return vectorizeStores(Candidates, R, Attempted);
18541 },
18542 /*MaxVFOnly=*/false, R);
18543 }
18544 return Changed;
18545}
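// --- Illustrative source pattern, not part of SLPVectorizer.cpp: why
// vectorizeStoreChains walks each store group bottom-to-top. When the same
// addresses are stored to more than once, the later stores are the ones whose
// values must survive, so the analysis starts from the last stores and works
// upwards, respecting the memory dependencies noted in the comment above.
// The function name below is hypothetical.
inline void storePairTwice(int *P, int X) {
  P[0] = X;     // earlier stores: only vectorizable if ordering is respected
  P[1] = X + 1;
  P[0] = X + 2; // later stores to the same addresses must keep their values
  P[1] = X + 3;
}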
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:529
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1497
Loop::LoopBounds::Direction Direction
Definition: LoopInfo.cpp:230
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
#define T
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(VerifyEach)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:76
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:264
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:492
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:187
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:174
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:228
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:194
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:443
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:430
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:367
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:167
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
reverse_iterator rend()
Definition: BasicBlock.h:448
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:165
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1494
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2332
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:2227
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2469
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2326
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1600
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1678
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2323
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:601
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:983
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1362
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:1023
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:1017
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:1021
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:1019
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:1167
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:1129
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:1105
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2126
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:154
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1449
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:101
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
value_type & FindAndConstruct(const KeyT &Key)
Definition: DenseMap.h:348
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Type * getReturnType() const
Definition: DerivedTypes.h:124
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2257
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:511
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2460
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2265
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1110
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2535
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:311
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:220
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:848
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1753
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:486
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2366
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2249
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:471
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1666
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:169
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2161
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2196
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1826
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1587
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:630
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
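A brief illustrative sketch (not taken from this file; the helper name and value names are hypothetical) of how the IRBuilderBase methods listed above are typically combined: compare two lane indices, select the smaller one, and extract that lane from a vector.
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Emit: %cmp = icmp ult IdxA, IdxB; %min.idx = select %cmp, IdxA, IdxB;
//       %lane = extractelement Vec, %min.idx
static Value *emitMinIndexExtract(IRBuilderBase &Builder, Value *Vec,
                                  Value *IdxA, Value *IdxB) {
  Value *Cmp = Builder.CreateICmpULT(IdxA, IdxB, "cmp");
  Value *Idx = Builder.CreateSelect(Cmp, IdxA, IdxB, "min.idx");
  return Builder.CreateExtractElement(Vec, Idx, "lane");
}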
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:260
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:742
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:257
const BasicBlock * getParent() const
Definition: Instruction.h:152
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:258
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
Value * getPointerOperand()
Definition: Instructions.h:280
bool isSimple() const
Definition: Instructions.h:272
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:236
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
Definition: MapVector.h:64
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
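A small sketch (hypothetical helper, not from this file) of the MapVector interface above; unlike DenseMap it preserves insertion order, which gives deterministic iteration.
#include "llvm/ADT/MapVector.h"
#include "llvm/IR/Value.h"
// Count how often each value is seen, inserting a zero entry on first sight.
static void noteUse(llvm::MapVector<llvm::Value *, unsigned> &UseCounts,
                    llvm::Value *V) {
  auto Res = UseCounts.try_emplace(V, 0u); // no-op if V is already a key
  ++Res.first->second;
}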
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory referenced by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
T & front() const
front - Get the first element.
Definition: ArrayRef.h:363
iterator end() const
Definition: ArrayRef.h:357
iterator begin() const
Definition: ArrayRef.h:356
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:376
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:449
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:155
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1827
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
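A hedged sketch of the static mask predicates above applied to hand-written masks; the masks and the helper name are illustrative only.
#include "llvm/IR/Instructions.h"
static bool masksClassifyAsExpected() {
  int Identity[] = {0, 1, 2, 3};
  int Reverse[] = {3, 2, 1, 0};
  int Splat[] = {0, 0, 0, 0};
  return llvm::ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4) &&
         llvm::ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4) &&
         llvm::ShuffleVectorInst::isZeroEltSplatMask(Splat, /*NumSrcElts=*/4);
}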
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
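A minimal sketch (hypothetical helper) of iterating the set bits of a SmallBitVector with the find_first/find_next entries above.
#include "llvm/ADT/SmallBitVector.h"
// Count the set bits that sit at even positions.
static unsigned countEvenSetBits(const llvm::SmallBitVector &Bits) {
  unsigned Even = 0;
  for (int I = Bits.find_first(); I != -1; I = Bits.find_next(I))
    if (I % 2 == 0)
      ++Even;
  return Even;
}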
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:366
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:981
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
Type * getPointerOperandType() const
Definition: Instructions.h:420
Value * getValueOperand()
Definition: Instructions.h:414
Value * getPointerOperand()
Definition: Instructions.h:417
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
OperandValueKind
Additional information about an operand's possible values.
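A hedged sketch of the cost-model query pattern behind the TargetTransformInfo entries above: compare VF scalar adds against one add on the corresponding fixed vector type. The helper name and the bare comparison are simplifications, not the pass's actual heuristic.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
static bool vectorAddLooksProfitable(const llvm::TargetTransformInfo &TTI,
                                     llvm::Type *ScalarTy, unsigned VF) {
  auto Kind = llvm::TargetTransformInfo::TCK_RecipThroughput;
  llvm::InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(llvm::Instruction::Add, ScalarTy, Kind) * VF;
  llvm::InstructionCost VecCost = TTI.getArithmeticInstrCost(
      llvm::Instruction::Add, llvm::FixedVectorType::get(ScalarTy, VF), Kind);
  return VecCost < ScalarCost; // widen only when the model predicts a saving
}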
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:160
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:287
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:166
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Definition: User.h:73
op_iterator op_begin()
Definition: User.h:234
Value * getOperand(unsigned i) const
Definition: User.h:169
iterator_range< value_op_iterator > operand_values()
Definition: User.h:266
The Vector Function Database.
Definition: VectorUtils.h:29
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:74
self_iterator getIterator()
Definition: ilist_node.h:109
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:690
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register the given instruction as already analyzed for being a possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being a possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for a pair which has the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
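A hedged sketch of the usual driver sequence over the BoUpSLP interface listed above, assuming it is written inside this file where BoUpSLP and the LLVM types are already visible; the real pass adds reordering heuristics, diagnostics, and a cost threshold instead of the bare comparison against zero used here.
static bool trySLPVectorize(BoUpSLP &R, ArrayRef<Value *> Roots,
                            const SmallDenseSet<Value *> &Ignored) {
  R.buildTree(Roots, Ignored);               // build the vectorizable use-def tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();                    // pick profitable operand orders
  R.reorderBottomToTop();
  R.buildExternalUses();                     // record scalars still used outside the tree
  R.computeMinimumValueSizes();              // try to shrink element bit widths
  InstructionCost Cost = R.getTreeCost();
  if (!(Cost < 0))                           // simplified profitability check
    return false;
  R.vectorizeTree();
  return true;
}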
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:103
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1469
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:777
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:836
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
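A minimal sketch (hypothetical helper) of the PatternMatch combinators listed above: recognize "(X + Y) * Z" where the add has a single use, the style of matching used throughout this pass.
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
static bool isMulOfOneUseAdd(llvm::Value *V, llvm::Value *&X, llvm::Value *&Y,
                             llvm::Value *&Z) {
  using namespace llvm::PatternMatch;
  return match(V, m_Mul(m_OneUse(m_Add(m_Value(X), m_Value(Y))), m_Value(Z)));
}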
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
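A hedged sketch of getPointersDiff from the entry above, used to decide whether two loads touch adjacent elements; the helper name is hypothetical, and a distance of exactly one element means consecutive accesses.
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
#include <optional>
static bool areConsecutiveLoads(llvm::LoadInst *L0, llvm::LoadInst *L1,
                                const llvm::DataLayout &DL,
                                llvm::ScalarEvolution &SE) {
  std::optional<int> Diff = llvm::getPointersDiff(
      L0->getType(), L0->getPointerOperand(), L1->getType(),
      L1->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1; // L1 reads the element right after L0
}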
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:456
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1715
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:128
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:950
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:540
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
Definition: LoopUtils.cpp:1166
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:40
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7062
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2059
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1754
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:419
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1235
constexpr int PoisonMaskElem
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Definition: STLExtras.h:1986
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
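A small sketch (illustrative only) of the range helpers referenced above applied to a plain container, the same way this file applies them to operand lists.
#include "llvm/ADT/STLExtras.h"
#include <vector>
static bool allPositiveAndContainsSeven(const std::vector<int> &Xs) {
  return llvm::all_of(Xs, [](int X) { return X > 0; }) &&
         llvm::is_contained(Xs, 7);
}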
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
InstructionCost Cost
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:439
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:613
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2490
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:220
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1459
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const