SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<int>
118 cl::desc("Only vectorize if you gain more than this "
119 "number "));
120
122 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
123 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
124 "heuristics and makes vectorization decision via cost modeling."));
125
126static cl::opt<bool>
127ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
128 cl::desc("Attempt to vectorize horizontal reductions"));
129
131 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
132 cl::desc(
133 "Attempt to vectorize horizontal reductions feeding into a store"));
134
135// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
136// even if we match a reduction but do not vectorize in the end.
138 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
139 cl::desc("Allow optimization of original scalar identity operations on "
140 "matched horizontal reductions."));
141
142static cl::opt<int>
144 cl::desc("Attempt to vectorize for this register size in bits"));
145
148 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
149
150/// Limits the size of scheduling regions in a block.
151/// It avoids long compile times for _very_ large blocks where vector
152/// instructions are spread over a wide range.
153/// This limit is way higher than needed by real-world functions.
154static cl::opt<int>
155ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
156 cl::desc("Limit the size of the SLP scheduling region per block"));
157
159 "slp-min-reg-size", cl::init(128), cl::Hidden,
160 cl::desc("Attempt to vectorize for this register size in bits"));
161
163 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
164 cl::desc("Limit the recursion depth when building a vectorizable tree"));
165
167 "slp-min-tree-size", cl::init(3), cl::Hidden,
168 cl::desc("Only vectorize small trees if they are fully vectorizable"));
169
170// The maximum depth that the look-ahead score heuristic will explore.
171// The higher this value, the higher the compilation time overhead.
173 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
174 cl::desc("The maximum look-ahead depth for operand reordering scores"));
175
176// The maximum depth that the look-ahead score heuristic will explore
177// when probing among candidates for vectorization tree roots.
178// The higher this value, the higher the compilation-time overhead; but unlike
179// the similar limit for operand ordering, this one is used less frequently, so
180// the impact of a higher value is less noticeable.
181static cl::opt<int> RootLookAheadMaxDepth(
182 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
183 cl::desc("The maximum look-ahead depth for searching best rooting option"));
184
186 "slp-min-strided-loads", cl::init(2), cl::Hidden,
187 cl::desc("The minimum number of loads, which should be considered strided, "
188 "if the stride is > 1 or is runtime value"));
189
191 "slp-max-stride", cl::init(8), cl::Hidden,
192 cl::desc("The maximum stride, considered to be profitable."));
193
194static cl::opt<bool>
195 ViewSLPTree("view-slp-tree", cl::Hidden,
196 cl::desc("Display the SLP trees with Graphviz"));
197
199 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
200 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
201
202// Limit the number of alias checks. The limit is chosen so that
203// it has no negative effect on the llvm benchmarks.
204static const unsigned AliasedCheckLimit = 10;
205
206// Limit of the number of uses for potentially transformed instructions/values,
207// used in checks to avoid compile-time explosion.
208static constexpr int UsesLimit = 8;
209
210// Another limit for the alias checks: The maximum distance between load/store
211// instructions where alias checks are done.
212// This limit is useful for very large basic blocks.
213static const unsigned MaxMemDepDistance = 160;
214
215/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
216/// regions to be handled.
217static const int MinScheduleRegionSize = 16;
218
219/// Maximum allowed number of operands in the PHI nodes.
220static const unsigned MaxPHINumOperands = 128;
221
222/// Predicate for the element types that the SLP vectorizer supports.
223///
224/// The most important thing to filter here are types which are invalid in LLVM
225/// vectors. We also filter target specific types which have absolutely no
226/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
227/// avoids spending time checking the cost model and realizing that they will
228/// be inevitably scalarized.
229static bool isValidElementType(Type *Ty) {
230 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
231 !Ty->isPPC_FP128Ty();
232}
233
234/// \returns True if the value is a constant (but not globals/constant
235/// expressions).
236static bool isConstant(Value *V) {
237 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
238}
239
240/// Checks if \p V is one of vector-like instructions, i.e. undef,
241/// insertelement/extractelement with constant indices for fixed vector type or
242/// extractvalue instruction.
243static bool isVectorLikeInstWithConstOps(Value *V) {
244 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
245 !isa<ExtractValueInst, UndefValue>(V))
246 return false;
247 auto *I = dyn_cast<Instruction>(V);
248 if (!I || isa<ExtractValueInst>(I))
249 return true;
250 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
251 return false;
252 if (isa<ExtractElementInst>(I))
253 return isConstant(I->getOperand(1));
254 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
255 return isConstant(I->getOperand(2));
256}
257
258/// Returns power-of-2 number of elements in a single register (part), given the
259/// total number of elements \p Size and number of registers (parts) \p
260/// NumParts.
261static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
262 return PowerOf2Ceil(divideCeil(Size, NumParts));
263}
264
265/// Returns correct remaining number of elements, considering total amount \p
266/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
267/// and current register (part) \p Part.
268static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
269 unsigned Part) {
270 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
271}
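// Illustrative worked example (editor's addition): splitting Size = 6 scalars
// over NumParts = 2 registers gives
//   getPartNumElems(6, 2) == PowerOf2Ceil(divideCeil(6, 2)) == PowerOf2Ceil(3) == 4,
//   getNumElems(6, 4, /*Part=*/0) == 4 and getNumElems(6, 4, /*Part=*/1) == 2,
// i.e. the first register covers four elements and the second the remaining two.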
272
273#if !defined(NDEBUG)
274/// Print a short descriptor of the instruction bundle suitable for debug output.
275static std::string shortBundleName(ArrayRef<Value *> VL) {
276 std::string Result;
277 raw_string_ostream OS(Result);
278 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
279 OS.flush();
280 return Result;
281}
282#endif
283
284/// \returns true if all of the instructions in \p VL are in the same block or
285/// false otherwise.
286static bool allSameBlock(ArrayRef<Value *> VL) {
287 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
288 if (!I0)
289 return false;
290 if (all_of(VL, isVectorLikeInstWithConstOps))
291 return true;
292
293 BasicBlock *BB = I0->getParent();
294 for (int I = 1, E = VL.size(); I < E; I++) {
295 auto *II = dyn_cast<Instruction>(VL[I]);
296 if (!II)
297 return false;
298
299 if (BB != II->getParent())
300 return false;
301 }
302 return true;
303}
304
305/// \returns True if all of the values in \p VL are constants (but not
306/// globals/constant expressions).
307static bool allConstant(ArrayRef<Value *> VL) {
308 // Constant expressions and globals can't be vectorized like normal integer/FP
309 // constants.
310 return all_of(VL, isConstant);
311}
312
313/// \returns True if all of the values in \p VL are identical or some of them
314/// are UndefValue.
315static bool isSplat(ArrayRef<Value *> VL) {
316 Value *FirstNonUndef = nullptr;
317 for (Value *V : VL) {
318 if (isa<UndefValue>(V))
319 continue;
320 if (!FirstNonUndef) {
321 FirstNonUndef = V;
322 continue;
323 }
324 if (V != FirstNonUndef)
325 return false;
326 }
327 return FirstNonUndef != nullptr;
328}
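// Illustrative example (editor's addition): isSplat({%a, undef, %a, %a}) is true
// because undefs are skipped, isSplat({%a, %b, %a}) is false, and an all-undef
// list is not considered a splat since no non-undef value is found.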
329
330/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
331static bool isCommutative(Instruction *I) {
332 if (auto *Cmp = dyn_cast<CmpInst>(I))
333 return Cmp->isCommutative();
334 if (auto *BO = dyn_cast<BinaryOperator>(I))
335 return BO->isCommutative() ||
336 (BO->getOpcode() == Instruction::Sub &&
337 !BO->hasNUsesOrMore(UsesLimit) &&
338 all_of(
339 BO->uses(),
340 [](const Use &U) {
341 // Commutative, if icmp eq/ne sub, 0
342 ICmpInst::Predicate Pred;
343 if (match(U.getUser(),
344 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
345 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
346 return true;
347 // Commutative, if abs(sub nsw, true) or abs(sub, false).
348 ConstantInt *Flag;
349 return match(U.getUser(),
350 m_Intrinsic<Intrinsic::abs>(
351 m_Specific(U.get()), m_ConstantInt(Flag))) &&
352 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
353 Flag->isOne());
354 })) ||
355 (BO->getOpcode() == Instruction::FSub &&
356 !BO->hasNUsesOrMore(UsesLimit) &&
357 all_of(BO->uses(), [](const Use &U) {
358 return match(U.getUser(),
359 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
360 }));
361 return I->isCommutative();
362}
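// Illustrative example (editor's addition, hypothetical IR): a non-commutative
// 'sub' is still reported as commutative here when every use only compares the
// result against zero or takes its absolute value, e.g.
//   %d = sub i32 %x, %y
//   %c = icmp eq i32 %d, 0
// since icmp eq (%x - %y), 0 is equivalent to icmp eq (%y - %x), 0.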
363
364/// \returns inserting index of InsertElement or InsertValue instruction,
365/// using Offset as base offset for index.
366static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
367 unsigned Offset = 0) {
368 int Index = Offset;
369 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
370 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
371 if (!VT)
372 return std::nullopt;
373 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
374 if (!CI)
375 return std::nullopt;
376 if (CI->getValue().uge(VT->getNumElements()))
377 return std::nullopt;
378 Index *= VT->getNumElements();
379 Index += CI->getZExtValue();
380 return Index;
381 }
382
383 const auto *IV = cast<InsertValueInst>(InsertInst);
384 Type *CurrentType = IV->getType();
385 for (unsigned I : IV->indices()) {
386 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
387 Index *= ST->getNumElements();
388 CurrentType = ST->getElementType(I);
389 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
390 Index *= AT->getNumElements();
391 CurrentType = AT->getElementType();
392 } else {
393 return std::nullopt;
394 }
395 Index += I;
396 }
397 return Index;
398}
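// Illustrative example (editor's addition, hypothetical IR):
//   %i = insertelement <4 x float> %v, float %f, i32 2    ; returns 2
//   %s = insertvalue [2 x [3 x i32]] %agg, i32 %x, 1, 2   ; returns 1 * 3 + 2 == 5
// A non-constant or out-of-range insertelement index yields std::nullopt.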
399
400namespace {
401/// Specifies the way the mask should be analyzed for undefs/poisonous elements
402/// in the shuffle mask.
403enum class UseMask {
404 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
405 ///< check for the mask elements for the first argument (mask
406 ///< indices are in range [0:VF)).
407 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
408 ///< for the mask elements for the second argument (mask indices
409 ///< are in range [VF:2*VF))
410 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
411 ///< future shuffle elements and mark them as ones as being used
412 ///< in future. Non-undef elements are considered as unused since
413 ///< they're already marked as used in the mask.
414};
415} // namespace
416
417/// Prepares a use bitset for the given mask either for the first argument or
418/// for the second.
419static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
420 UseMask MaskArg) {
421 SmallBitVector UseMask(VF, true);
422 for (auto [Idx, Value] : enumerate(Mask)) {
423 if (Value == PoisonMaskElem) {
424 if (MaskArg == UseMask::UndefsAsMask)
425 UseMask.reset(Idx);
426 continue;
427 }
428 if (MaskArg == UseMask::FirstArg && Value < VF)
429 UseMask.reset(Value);
430 else if (MaskArg == UseMask::SecondArg && Value >= VF)
431 UseMask.reset(Value - VF);
432 }
433 return UseMask;
434}
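// Illustrative example (editor's addition): for VF = 4 and
// Mask = {0, 5, 1, PoisonMaskElem}, UseMask::FirstArg clears bits 0 and 1 (the
// lanes of the first operand referenced by the mask), UseMask::SecondArg clears
// bit 1 (mask element 5 maps to lane 5 - VF == 1), and UseMask::UndefsAsMask
// clears only bit 3, where the poison placeholder sits.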
435
436/// Checks if the given value is actually an undefined constant vector.
437/// Also, if the \p UseMask is not empty, tries to check if the non-masked
438/// elements actually mask the insertelement buildvector, if any.
439template <bool IsPoisonOnly = false>
440static SmallBitVector isUndefVector(const Value *V,
441 const SmallBitVector &UseMask = {}) {
442 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
443 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
444 if (isa<T>(V))
445 return Res;
446 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
447 if (!VecTy)
448 return Res.reset();
449 auto *C = dyn_cast<Constant>(V);
450 if (!C) {
451 if (!UseMask.empty()) {
452 const Value *Base = V;
453 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
454 Base = II->getOperand(0);
455 if (isa<T>(II->getOperand(1)))
456 continue;
457 std::optional<unsigned> Idx = getInsertIndex(II);
458 if (!Idx) {
459 Res.reset();
460 return Res;
461 }
462 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
463 Res.reset(*Idx);
464 }
465 // TODO: Add analysis for shuffles here too.
466 if (V == Base) {
467 Res.reset();
468 } else {
469 SmallBitVector SubMask(UseMask.size(), false);
470 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
471 }
472 } else {
473 Res.reset();
474 }
475 return Res;
476 }
477 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
478 if (Constant *Elem = C->getAggregateElement(I))
479 if (!isa<T>(Elem) &&
480 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
481 Res.reset(I);
482 }
483 return Res;
484}
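// Illustrative example (editor's addition): isUndefVector(UndefValue::get(VecTy))
// returns an all-ones bitset, isUndefVector<true>(V) accepts only poison, and for
// a constant like <4 x i32> <i32 1, i32 undef, i32 2, i32 undef> the bits of the
// non-undef lanes (0 and 2) are cleared in the result.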
485
486/// Checks if the vector of instructions can be represented as a shuffle, like:
487/// %x0 = extractelement <4 x i8> %x, i32 0
488/// %x3 = extractelement <4 x i8> %x, i32 3
489/// %y1 = extractelement <4 x i8> %y, i32 1
490/// %y2 = extractelement <4 x i8> %y, i32 2
491/// %x0x0 = mul i8 %x0, %x0
492/// %x3x3 = mul i8 %x3, %x3
493/// %y1y1 = mul i8 %y1, %y1
494/// %y2y2 = mul i8 %y2, %y2
495/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
496/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
497/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
498/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
499/// ret <4 x i8> %ins4
500/// can be transformed into:
501/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
502/// i32 6>
503/// %2 = mul <4 x i8> %1, %1
504/// ret <4 x i8> %2
505/// Mask will return the Shuffle Mask equivalent to the extracted elements.
506/// TODO: Can we split off and reuse the shuffle mask detection from
507/// ShuffleVectorInst/getShuffleCost?
508static std::optional<TargetTransformInfo::ShuffleKind>
509isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
510 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
511 if (It == VL.end())
512 return std::nullopt;
513 auto *EI0 = cast<ExtractElementInst>(*It);
514 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
515 return std::nullopt;
516 unsigned Size =
517 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
518 Value *Vec1 = nullptr;
519 Value *Vec2 = nullptr;
520 bool HasNonUndefVec = any_of(VL, [](Value *V) {
521 auto *EE = dyn_cast<ExtractElementInst>(V);
522 if (!EE)
523 return false;
524 Value *Vec = EE->getVectorOperand();
525 if (isa<UndefValue>(Vec))
526 return false;
527 return isGuaranteedNotToBePoison(Vec);
528 });
529 enum ShuffleMode { Unknown, Select, Permute };
530 ShuffleMode CommonShuffleMode = Unknown;
531 Mask.assign(VL.size(), PoisonMaskElem);
532 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
533 // Undef can be represented as an undef element in a vector.
534 if (isa<UndefValue>(VL[I]))
535 continue;
536 auto *EI = cast<ExtractElementInst>(VL[I]);
537 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
538 return std::nullopt;
539 auto *Vec = EI->getVectorOperand();
540 // We can extractelement from undef or poison vector.
541 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
542 continue;
543 // All vector operands must have the same number of vector elements.
544 if (isa<UndefValue>(Vec)) {
545 Mask[I] = I;
546 } else {
547 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
548 return std::nullopt;
549 if (isa<UndefValue>(EI->getIndexOperand()))
550 continue;
551 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
552 if (!Idx)
553 return std::nullopt;
554 // Undefined behavior if Idx is negative or >= Size.
555 if (Idx->getValue().uge(Size))
556 continue;
557 unsigned IntIdx = Idx->getValue().getZExtValue();
558 Mask[I] = IntIdx;
559 }
560 if (isUndefVector(Vec).all() && HasNonUndefVec)
561 continue;
562 // For correct shuffling we have to have at most 2 different vector operands
563 // in all extractelement instructions.
564 if (!Vec1 || Vec1 == Vec) {
565 Vec1 = Vec;
566 } else if (!Vec2 || Vec2 == Vec) {
567 Vec2 = Vec;
568 Mask[I] += Size;
569 } else {
570 return std::nullopt;
571 }
572 if (CommonShuffleMode == Permute)
573 continue;
574 // If the extract index is not the same as the operation number, it is a
575 // permutation.
576 if (Mask[I] % Size != I) {
577 CommonShuffleMode = Permute;
578 continue;
579 }
580 CommonShuffleMode = Select;
581 }
582 // If we're not crossing lanes in different vectors, consider it as blending.
583 if (CommonShuffleMode == Select && Vec2)
584 return TargetTransformInfo::SK_Select;
585 // If Vec2 was never used, we have a permutation of a single vector, otherwise
586 // we have permutation of 2 vectors.
587 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
588 : TargetTransformInfo::SK_PermuteSingleSrc;
589}
590
591/// \returns True if Extract{Value,Element} instruction extracts element Idx.
592static std::optional<unsigned> getExtractIndex(Instruction *E) {
593 unsigned Opcode = E->getOpcode();
594 assert((Opcode == Instruction::ExtractElement ||
595 Opcode == Instruction::ExtractValue) &&
596 "Expected extractelement or extractvalue instruction.");
597 if (Opcode == Instruction::ExtractElement) {
598 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
599 if (!CI)
600 return std::nullopt;
601 return CI->getZExtValue();
602 }
603 auto *EI = cast<ExtractValueInst>(E);
604 if (EI->getNumIndices() != 1)
605 return std::nullopt;
606 return *EI->idx_begin();
607}
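// Illustrative example (editor's addition, hypothetical IR):
//   %e = extractelement <4 x i32> %v, i32 3       ; returns 3
//   %f = extractvalue { i32, float } %agg, 1      ; returns 1
// A variable extractelement index or a multi-index extractvalue yields
// std::nullopt.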
608
609namespace {
610
611/// Main data required for vectorization of instructions.
612struct InstructionsState {
613 /// The very first instruction in the list with the main opcode.
614 Value *OpValue = nullptr;
615
616 /// The main/alternate instruction.
617 Instruction *MainOp = nullptr;
618 Instruction *AltOp = nullptr;
619
620 /// The main/alternate opcodes for the list of instructions.
621 unsigned getOpcode() const {
622 return MainOp ? MainOp->getOpcode() : 0;
623 }
624
625 unsigned getAltOpcode() const {
626 return AltOp ? AltOp->getOpcode() : 0;
627 }
628
629 /// Some of the instructions in the list have alternate opcodes.
630 bool isAltShuffle() const { return AltOp != MainOp; }
631
632 bool isOpcodeOrAlt(Instruction *I) const {
633 unsigned CheckedOpcode = I->getOpcode();
634 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
635 }
636
637 InstructionsState() = delete;
638 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
639 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
640};
641
642} // end anonymous namespace
643
644/// Chooses the correct key for scheduling data. If \p Op has the same (or
645/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
646/// OpValue.
647static Value *isOneOf(const InstructionsState &S, Value *Op) {
648 auto *I = dyn_cast<Instruction>(Op);
649 if (I && S.isOpcodeOrAlt(I))
650 return Op;
651 return S.OpValue;
652}
653
654/// \returns true if \p Opcode is allowed as part of the main/alternate
655/// instruction for SLP vectorization.
656///
657/// Example of unsupported opcode is SDIV that can potentially cause UB if the
658/// "shuffled out" lane would result in division by zero.
659static bool isValidForAlternation(unsigned Opcode) {
660 if (Instruction::isIntDivRem(Opcode))
661 return false;
662
663 return true;
664}
665
666static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
667 const TargetLibraryInfo &TLI,
668 unsigned BaseIndex = 0);
669
670/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
671/// compatible instructions or constants, or just some other regular values.
672static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
673 Value *Op1, const TargetLibraryInfo &TLI) {
674 return (isConstant(BaseOp0) && isConstant(Op0)) ||
675 (isConstant(BaseOp1) && isConstant(Op1)) ||
676 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
677 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
678 BaseOp0 == Op0 || BaseOp1 == Op1 ||
679 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
680 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
681}
682
683/// \returns true if a compare instruction \p CI has similar "look" and
684/// same predicate as \p BaseCI, "as is" or with its operands and predicate
685/// swapped, false otherwise.
686static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
687 const TargetLibraryInfo &TLI) {
688 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
689 "Assessing comparisons of different types?");
690 CmpInst::Predicate BasePred = BaseCI->getPredicate();
691 CmpInst::Predicate Pred = CI->getPredicate();
692 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
693
694 Value *BaseOp0 = BaseCI->getOperand(0);
695 Value *BaseOp1 = BaseCI->getOperand(1);
696 Value *Op0 = CI->getOperand(0);
697 Value *Op1 = CI->getOperand(1);
698
699 return (BasePred == Pred &&
700 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
701 (BasePred == SwappedPred &&
702 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
703}
704
705/// \returns analysis of the Instructions in \p VL described in
706/// InstructionsState, the Opcode that we suppose the whole list
707/// could be vectorized even if its structure is diverse.
708static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
709 const TargetLibraryInfo &TLI,
710 unsigned BaseIndex) {
711 // Make sure these are all Instructions.
712 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
713 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
714
715 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
716 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
717 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
718 CmpInst::Predicate BasePred =
719 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
720 : CmpInst::BAD_ICMP_PREDICATE;
721 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
722 unsigned AltOpcode = Opcode;
723 unsigned AltIndex = BaseIndex;
724
725 bool SwappedPredsCompatible = [&]() {
726 if (!IsCmpOp)
727 return false;
728 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
729 UniquePreds.insert(BasePred);
730 UniqueNonSwappedPreds.insert(BasePred);
731 for (Value *V : VL) {
732 auto *I = dyn_cast<CmpInst>(V);
733 if (!I)
734 return false;
735 CmpInst::Predicate CurrentPred = I->getPredicate();
736 CmpInst::Predicate SwappedCurrentPred =
737 CmpInst::getSwappedPredicate(CurrentPred);
738 UniqueNonSwappedPreds.insert(CurrentPred);
739 if (!UniquePreds.contains(CurrentPred) &&
740 !UniquePreds.contains(SwappedCurrentPred))
741 UniquePreds.insert(CurrentPred);
742 }
743 // Total number of predicates > 2, but if consider swapped predicates
744 // compatible only 2, consider swappable predicates as compatible opcodes,
745 // not alternate.
746 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
747 }();
748 // Check for one alternate opcode from another BinaryOperator.
749 // TODO - generalize to support all operators (types, calls etc.).
750 auto *IBase = cast<Instruction>(VL[BaseIndex]);
751 Intrinsic::ID BaseID = 0;
752 SmallVector<VFInfo> BaseMappings;
753 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
754 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
755 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
756 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
757 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
758 }
759 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
760 auto *I = cast<Instruction>(VL[Cnt]);
761 unsigned InstOpcode = I->getOpcode();
762 if (IsBinOp && isa<BinaryOperator>(I)) {
763 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
764 continue;
765 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
766 isValidForAlternation(Opcode)) {
767 AltOpcode = InstOpcode;
768 AltIndex = Cnt;
769 continue;
770 }
771 } else if (IsCastOp && isa<CastInst>(I)) {
772 Value *Op0 = IBase->getOperand(0);
773 Type *Ty0 = Op0->getType();
774 Value *Op1 = I->getOperand(0);
775 Type *Ty1 = Op1->getType();
776 if (Ty0 == Ty1) {
777 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
778 continue;
779 if (Opcode == AltOpcode) {
780 assert(isValidForAlternation(Opcode) &&
781 isValidForAlternation(InstOpcode) &&
782 "Cast isn't safe for alternation, logic needs to be updated!");
783 AltOpcode = InstOpcode;
784 AltIndex = Cnt;
785 continue;
786 }
787 }
788 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
789 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
790 Type *Ty0 = BaseInst->getOperand(0)->getType();
791 Type *Ty1 = Inst->getOperand(0)->getType();
792 if (Ty0 == Ty1) {
793 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
794 // Check for compatible operands. If the corresponding operands are not
795 // compatible - need to perform alternate vectorization.
796 CmpInst::Predicate CurrentPred = Inst->getPredicate();
797 CmpInst::Predicate SwappedCurrentPred =
798 CmpInst::getSwappedPredicate(CurrentPred);
799
800 if ((E == 2 || SwappedPredsCompatible) &&
801 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
802 continue;
803
804 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
805 continue;
806 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
807 if (AltIndex != BaseIndex) {
808 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
809 continue;
810 } else if (BasePred != CurrentPred) {
811 assert(
812 isValidForAlternation(InstOpcode) &&
813 "CmpInst isn't safe for alternation, logic needs to be updated!");
814 AltIndex = Cnt;
815 continue;
816 }
817 CmpInst::Predicate AltPred = AltInst->getPredicate();
818 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
819 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
820 continue;
821 }
822 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
823 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
824 if (Gep->getNumOperands() != 2 ||
825 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
826 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
827 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
828 if (!isVectorLikeInstWithConstOps(EI))
829 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
830 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
831 auto *BaseLI = cast<LoadInst>(IBase);
832 if (!LI->isSimple() || !BaseLI->isSimple())
833 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
834 } else if (auto *Call = dyn_cast<CallInst>(I)) {
835 auto *CallBase = cast<CallInst>(IBase);
836 if (Call->getCalledFunction() != CallBase->getCalledFunction())
837 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
838 if (Call->hasOperandBundles() &&
839 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
840 Call->op_begin() + Call->getBundleOperandsEndIndex(),
841 CallBase->op_begin() +
842 CallBase->getBundleOperandsStartIndex()))
843 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
844 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
845 if (ID != BaseID)
846 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
847 if (!ID) {
848 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
849 if (Mappings.size() != BaseMappings.size() ||
850 Mappings.front().ISA != BaseMappings.front().ISA ||
851 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
852 Mappings.front().VectorName != BaseMappings.front().VectorName ||
853 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
854 Mappings.front().Shape.Parameters !=
855 BaseMappings.front().Shape.Parameters)
856 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
857 }
858 }
859 continue;
860 }
861 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
862 }
863
864 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
865 cast<Instruction>(VL[AltIndex]));
866}
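// Illustrative example (editor's addition): for VL = {add, sub, add, sub} over
// the same type, the returned state has MainOp set to the first add and AltOp to
// the first sub, so isAltShuffle() is true and the bundle can later be emitted as
// an add and a sub blended by a shufflevector. A mix like {add, load} produces a
// state with a null MainOp, i.e. no common (or alternate) opcode was found.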
867
868/// \returns true if all of the values in \p VL have the same type or false
869/// otherwise.
870static bool allSameType(ArrayRef<Value *> VL) {
871 Type *Ty = VL.front()->getType();
872 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
873}
874
875/// \returns True if in-tree use also needs extract. This refers to
876/// possible scalar operand in vectorized instruction.
877static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
878 TargetLibraryInfo *TLI) {
879 unsigned Opcode = UserInst->getOpcode();
880 switch (Opcode) {
881 case Instruction::Load: {
882 LoadInst *LI = cast<LoadInst>(UserInst);
883 return (LI->getPointerOperand() == Scalar);
884 }
885 case Instruction::Store: {
886 StoreInst *SI = cast<StoreInst>(UserInst);
887 return (SI->getPointerOperand() == Scalar);
888 }
889 case Instruction::Call: {
890 CallInst *CI = cast<CallInst>(UserInst);
891 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
892 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
893 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
894 Arg.value().get() == Scalar;
895 });
896 }
897 default:
898 return false;
899 }
900}
901
902/// \returns the AA location that is being accessed by the instruction.
903static MemoryLocation getLocation(Instruction *I) {
904 if (StoreInst *SI = dyn_cast<StoreInst>(I))
905 return MemoryLocation::get(SI);
906 if (LoadInst *LI = dyn_cast<LoadInst>(I))
907 return MemoryLocation::get(LI);
908 return MemoryLocation();
909}
910
911/// \returns True if the instruction is not a volatile or atomic load/store.
912static bool isSimple(Instruction *I) {
913 if (LoadInst *LI = dyn_cast<LoadInst>(I))
914 return LI->isSimple();
915 if (StoreInst *SI = dyn_cast<StoreInst>(I))
916 return SI->isSimple();
917 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
918 return !MI->isVolatile();
919 return true;
920}
921
922/// Shuffles \p Mask in accordance with the given \p SubMask.
923/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
924/// one but two input vectors.
925static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
926 bool ExtendingManyInputs = false) {
927 if (SubMask.empty())
928 return;
929 assert(
930 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
931 // Check if input scalars were extended to match the size of other node.
932 (SubMask.size() == Mask.size() &&
933 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
934 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
935 "SubMask with many inputs support must be larger than the mask.");
936 if (Mask.empty()) {
937 Mask.append(SubMask.begin(), SubMask.end());
938 return;
939 }
940 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
941 int TermValue = std::min(Mask.size(), SubMask.size());
942 for (int I = 0, E = SubMask.size(); I < E; ++I) {
943 if (SubMask[I] == PoisonMaskElem ||
944 (!ExtendingManyInputs &&
945 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
946 continue;
947 NewMask[I] = Mask[SubMask[I]];
948 }
949 Mask.swap(NewMask);
950}
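// Illustrative example (editor's addition): with Mask = {3, 2, 1, 0} and
// SubMask = {1, 3, 0, 2}, addMask composes the two permutations into
// Mask = {2, 0, 3, 1}, i.e. NewMask[I] == Mask[SubMask[I]] for every in-bounds,
// non-poison SubMask element.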
951
952/// Order may have elements assigned special value (size) which is out of
953/// bounds. Such indices only appear on places which correspond to undef values
954/// (see canReuseExtract for details) and are used in order to avoid undef
955/// values having an effect on operand ordering.
956/// The first loop below simply finds all unused indices and then the next loop
957/// nest assigns these indices for undef values positions.
958/// As an example below Order has two undef positions and they have assigned
959/// values 3 and 7 respectively:
960/// before: 6 9 5 4 9 2 1 0
961/// after: 6 3 5 4 7 2 1 0
962static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
963 const unsigned Sz = Order.size();
964 SmallBitVector UnusedIndices(Sz, /*t=*/true);
965 SmallBitVector MaskedIndices(Sz);
966 for (unsigned I = 0; I < Sz; ++I) {
967 if (Order[I] < Sz)
968 UnusedIndices.reset(Order[I]);
969 else
970 MaskedIndices.set(I);
971 }
972 if (MaskedIndices.none())
973 return;
974 assert(UnusedIndices.count() == MaskedIndices.count() &&
975 "Non-synced masked/available indices.");
976 int Idx = UnusedIndices.find_first();
977 int MIdx = MaskedIndices.find_first();
978 while (MIdx >= 0) {
979 assert(Idx >= 0 && "Indices must be synced.");
980 Order[MIdx] = Idx;
981 Idx = UnusedIndices.find_next(Idx);
982 MIdx = MaskedIndices.find_next(MIdx);
983 }
984}
985
986/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
987/// Opcode1.
988static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
989 unsigned Opcode1) {
990 SmallBitVector OpcodeMask(VL.size(), false);
991 for (unsigned Lane : seq<unsigned>(VL.size()))
992 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
993 OpcodeMask.set(Lane);
994 return OpcodeMask;
995}
996
997namespace llvm {
998
999static void inversePermutation(ArrayRef<unsigned> Indices,
1000 SmallVectorImpl<int> &Mask) {
1001 Mask.clear();
1002 const unsigned E = Indices.size();
1003 Mask.resize(E, PoisonMaskElem);
1004 for (unsigned I = 0; I < E; ++I)
1005 Mask[Indices[I]] = I;
1006}
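// Illustrative example (editor's addition): for Indices = {2, 0, 1} the resulting
// Mask is {1, 2, 0}, because Mask[Indices[I]] = I places each element back at its
// original position.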
1007
1008/// Reorders the list of scalars in accordance with the given \p Mask.
1009static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1010 ArrayRef<int> Mask) {
1011 assert(!Mask.empty() && "Expected non-empty mask.");
1012 SmallVector<Value *> Prev(Scalars.size(),
1013 PoisonValue::get(Scalars.front()->getType()));
1014 Prev.swap(Scalars);
1015 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1016 if (Mask[I] != PoisonMaskElem)
1017 Scalars[Mask[I]] = Prev[I];
1018}
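// Illustrative example (editor's addition): for Scalars = {%a, %b, %c} and
// Mask = {2, 0, 1}, reorderScalars rewrites Scalars to {%b, %c, %a}, since each
// previous element Prev[I] is moved to position Mask[I].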
1019
1020/// Checks if the provided value does not require scheduling. It does not
1021/// require scheduling if this is not an instruction or it is an instruction
1022/// that does not read/write memory and all operands are either not instructions
1023/// or phi nodes or instructions from different blocks.
1024static bool areAllOperandsNonInsts(Value *V) {
1025 auto *I = dyn_cast<Instruction>(V);
1026 if (!I)
1027 return true;
1028 return !mayHaveNonDefUseDependency(*I) &&
1029 all_of(I->operands(), [I](Value *V) {
1030 auto *IO = dyn_cast<Instruction>(V);
1031 if (!IO)
1032 return true;
1033 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1034 });
1035}
1036
1037/// Checks if the provided value does not require scheduling. It does not
1038/// require scheduling if this is not an instruction or it is an instruction
1039/// that does not read/write memory and all users are phi nodes or instructions
1040/// from the different blocks.
1041static bool isUsedOutsideBlock(Value *V) {
1042 auto *I = dyn_cast<Instruction>(V);
1043 if (!I)
1044 return true;
1045 // Limits the number of uses to save compile time.
1046 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1047 all_of(I->users(), [I](User *U) {
1048 auto *IU = dyn_cast<Instruction>(U);
1049 if (!IU)
1050 return true;
1051 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1052 });
1053}
1054
1055/// Checks if the specified value does not require scheduling. It does not
1056/// require scheduling if all operands and all users do not need to be scheduled
1057/// in the current basic block.
1058static bool doesNotNeedToBeScheduled(Value *V) {
1059 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1060}
1061
1062/// Checks if the specified array of instructions does not require scheduling.
1063/// It is so if all either instructions have operands that do not require
1064/// scheduling or their users do not require scheduling since they are phis or
1065/// in other basic blocks.
1066static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1067 return !VL.empty() &&
1068 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1069}
1070
1071namespace slpvectorizer {
1072
1073/// Bottom Up SLP Vectorizer.
1074class BoUpSLP {
1075 struct TreeEntry;
1076 struct ScheduleData;
1077 class ShuffleCostEstimator;
1078 class ShuffleInstructionBuilder;
1079
1080public:
1081 /// Tracks the state we can represent the loads in the given sequence.
1082 enum class LoadsState {
1083 Gather,
1084 Vectorize,
1085 ScatterVectorize,
1086 StridedVectorize
1087 };
1088
1096
1097 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1098 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1099 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1100 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1101 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1102 AC(AC), DB(DB), DL(DL), ORE(ORE),
1103 Builder(Se->getContext(), TargetFolder(*DL)) {
1104 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1105 // Use the vector register size specified by the target unless overridden
1106 // by a command-line option.
1107 // TODO: It would be better to limit the vectorization factor based on
1108 // data type rather than just register size. For example, x86 AVX has
1109 // 256-bit registers, but it does not support integer operations
1110 // at that width (that requires AVX2).
1111 if (MaxVectorRegSizeOption.getNumOccurrences())
1112 MaxVecRegSize = MaxVectorRegSizeOption;
1113 else
1114 MaxVecRegSize =
1115 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1116 .getFixedValue();
1117
1118 if (MinVectorRegSizeOption.getNumOccurrences())
1119 MinVecRegSize = MinVectorRegSizeOption;
1120 else
1121 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1122 }
1123
1124 /// Vectorize the tree that starts with the elements in \p VL.
1125 /// Returns the vectorized root.
1126 Value *vectorizeTree();
1127
1128 /// Vectorize the tree but with the list of externally used values \p
1129 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1130 /// generated extractelement instructions.
1131 /// \param ReplacedExternals contains the list of replaced external values
1132 /// {scalar, replace} after emitting extractelement for external uses.
1133 Value *
1134 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1135 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1136 Instruction *ReductionRoot = nullptr);
1137
1138 /// \returns the cost incurred by unwanted spills and fills, caused by
1139 /// holding live values over call sites.
1140 InstructionCost getSpillCost() const;
1141
1142 /// \returns the vectorization cost of the subtree that starts at \p VL.
1143 /// A negative number means that this is profitable.
1144 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1145
1146 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1147 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1148 void buildTree(ArrayRef<Value *> Roots,
1149 const SmallDenseSet<Value *> &UserIgnoreLst);
1150
1151 /// Construct a vectorizable tree that starts at \p Roots.
1152 void buildTree(ArrayRef<Value *> Roots);
1153
1154 /// Returns whether the root node has in-tree uses.
1156 return !VectorizableTree.empty() &&
1157 !VectorizableTree.front()->UserTreeIndices.empty();
1158 }
1159
1160 /// Return the scalars of the root node.
1162 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1163 return VectorizableTree.front()->Scalars;
1164 }
1165
1166 /// Builds external uses of the vectorized scalars, i.e. the list of
1167 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1168 /// ExternallyUsedValues contains additional list of external uses to handle
1169 /// vectorization of reductions.
1170 void
1171 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1172
1173 /// Transforms graph nodes to target specific representations, if profitable.
1174 void transformNodes();
1175
1176 /// Clear the internal data structures that are created by 'buildTree'.
1177 void deleteTree() {
1178 VectorizableTree.clear();
1179 ScalarToTreeEntry.clear();
1180 MultiNodeScalars.clear();
1181 MustGather.clear();
1182 NonScheduledFirst.clear();
1183 EntryToLastInstruction.clear();
1184 ExternalUses.clear();
1185 ExternalUsesAsGEPs.clear();
1186 for (auto &Iter : BlocksSchedules) {
1187 BlockScheduling *BS = Iter.second.get();
1188 BS->clear();
1189 }
1190 MinBWs.clear();
1191 ReductionBitWidth = 0;
1192 CastMaxMinBWSizes.reset();
1193 ExtraBitWidthNodes.clear();
1194 InstrElementSize.clear();
1195 UserIgnoreList = nullptr;
1196 PostponedGathers.clear();
1197 ValueToGatherNodes.clear();
1198 }
1199
1200 unsigned getTreeSize() const { return VectorizableTree.size(); }
1201
1202 /// Perform LICM and CSE on the newly generated gather sequences.
1203 void optimizeGatherSequence();
1204
1205 /// Checks if the specified gather tree entry \p TE can be represented as a
1206 /// shuffled vector entry + (possibly) permutation with other gathers. It
1207 /// implements the checks only for possibly ordered scalars (Loads,
1208 /// ExtractElement, ExtractValue), which can be part of the graph.
1209 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1210
1211 /// Sort loads into increasing pointers offsets to allow greater clustering.
1212 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1213
1214 /// Gets reordering data for the given tree entry. If the entry is vectorized
1215 /// - just return ReorderIndices, otherwise check if the scalars can be
1216 /// reordered and return the most optimal order.
1217 /// \return std::nullopt if ordering is not important, empty order, if
1218 /// identity order is important, or the actual order.
1219 /// \param TopToBottom If true, include the order of vectorized stores and
1220 /// insertelement nodes, otherwise skip them.
1221 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1222 bool TopToBottom);
1223
1224 /// Reorders the current graph to the most profitable order starting from the
1225 /// root node to the leaf nodes. The best order is chosen only from the nodes
1226 /// of the same size (vectorization factor). Smaller nodes are considered
1227 /// parts of subgraph with smaller VF and they are reordered independently. We
1228 /// can make it because we still need to extend smaller nodes to the wider VF
1229 /// and we can merge reordering shuffles with the widening shuffles.
1230 void reorderTopToBottom();
1231
1232 /// Reorders the current graph to the most profitable order starting from
1233 /// the leaves to the root. It allows rotating small subgraphs and reduces the
1234 /// number of reshuffles if the leaf nodes use the same order. In this case we
1235 /// can merge the orders and just shuffle the user node instead of shuffling its
1236 /// operands. Plus, even when the leaf nodes have different orders, it allows
1237 /// sinking the reordering in the graph closer to the root node and merging it
1238 /// later during analysis.
1239 void reorderBottomToTop(bool IgnoreReorder = false);
1240
1241 /// \return The vector element size in bits to use when vectorizing the
1242 /// expression tree ending at \p V. If V is a store, the size is the width of
1243 /// the stored value. Otherwise, the size is the width of the largest loaded
1244 /// value reaching V. This method is used by the vectorizer to calculate
1245 /// vectorization factors.
1246 unsigned getVectorElementSize(Value *V);
1247
1248 /// Compute the minimum type sizes required to represent the entries in a
1249 /// vectorizable tree.
1250 void computeMinimumValueSizes();
1251
1252 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1253 unsigned getMaxVecRegSize() const {
1254 return MaxVecRegSize;
1255 }
1256
1257 // \returns minimum vector register size as set by cl::opt.
1258 unsigned getMinVecRegSize() const {
1259 return MinVecRegSize;
1260 }
1261
1262 unsigned getMinVF(unsigned Sz) const {
1263 return std::max(2U, getMinVecRegSize() / Sz);
1264 }
1265
1266 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1267 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1268 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1269 return MaxVF ? MaxVF : UINT_MAX;
1270 }
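  // Illustrative example (editor's addition): with the default
  // -slp-min-reg-size=128, getMinVF(32) returns std::max(2u, 128 / 32) == 4, so
  // at least four 32-bit elements are grouped per attempted bundle; getMaximumVF
  // defers to TTI unless the MaxVFOption command-line flag overrides it.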
1271
1272 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1273 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1274 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1275 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1276 ///
1277 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1278 unsigned canMapToVector(Type *T) const;
1279
1280 /// \returns True if the VectorizableTree is both tiny and not fully
1281 /// vectorizable. We do not vectorize such trees.
1282 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1283
1284 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1285 /// can be load combined in the backend. Load combining may not be allowed in
1286 /// the IR optimizer, so we do not want to alter the pattern. For example,
1287 /// partially transforming a scalar bswap() pattern into vector code is
1288 /// effectively impossible for the backend to undo.
1289 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1290 /// may not be necessary.
1291 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1292
1293 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1294 /// can be load combined in the backend. Load combining may not be allowed in
1295 /// the IR optimizer, so we do not want to alter the pattern. For example,
1296 /// partially transforming a scalar bswap() pattern into vector code is
1297 /// effectively impossible for the backend to undo.
1298 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1299 /// may not be necessary.
1300 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1301
1302 /// Checks if the given array of loads can be represented as a vectorized,
1303 /// scatter or just simple gather.
1304 /// \param VL list of loads.
1305 /// \param VL0 main load value.
1306 /// \param Order returned order of load instructions.
1307 /// \param PointerOps returned list of pointer operands.
1308 /// \param TryRecursiveCheck used to check if a long masked gather can be
1309 /// represented as a series of loads/insert-subvector operations, if profitable.
1310 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1311 SmallVectorImpl<unsigned> &Order,
1312 SmallVectorImpl<Value *> &PointerOps,
1313 bool TryRecursiveCheck = true) const;
1314
1316
1317 /// This structure holds any data we need about the edges being traversed
1318 /// during buildTree_rec(). We keep track of:
1319 /// (i) the user TreeEntry index, and
1320 /// (ii) the index of the edge.
1321 struct EdgeInfo {
1322 EdgeInfo() = default;
1323 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1324 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1325 /// The user TreeEntry.
1326 TreeEntry *UserTE = nullptr;
1327 /// The operand index of the use.
1328 unsigned EdgeIdx = UINT_MAX;
1329#ifndef NDEBUG
1330 friend inline raw_ostream &operator<<(raw_ostream &OS,
1331 const BoUpSLP::EdgeInfo &EI) {
1332 EI.dump(OS);
1333 return OS;
1334 }
1335 /// Debug print.
1336 void dump(raw_ostream &OS) const {
1337 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1338 << " EdgeIdx:" << EdgeIdx << "}";
1339 }
1340 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1341#endif
1342 bool operator == (const EdgeInfo &Other) const {
1343 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1344 }
1345 };
1346
1347 /// A helper class used for scoring candidates for two consecutive lanes.
1348 class LookAheadHeuristics {
1349 const TargetLibraryInfo &TLI;
1350 const DataLayout &DL;
1351 ScalarEvolution &SE;
1352 const BoUpSLP &R;
1353 int NumLanes; // Total number of lanes (aka vectorization factor).
1354 int MaxLevel; // The maximum recursion depth for accumulating score.
1355
1356 public:
1357 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1358 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1359 int MaxLevel)
1360 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1361 MaxLevel(MaxLevel) {}
1362
1363 // The hard-coded scores listed here are not very important, though it shall
1364 // be higher for better matches to improve the resulting cost. When
1365 // computing the scores of matching one sub-tree with another, we are
1366 // basically counting the number of values that are matching. So even if all
1367 // scores are set to 1, we would still get a decent matching result.
1368 // However, sometimes we have to break ties. For example we may have to
1369 // choose between matching loads vs matching opcodes. This is what these
1370 // scores are helping us with: they provide the order of preference. Also,
1371 // this is important if the scalar is externally used or used in another
1372 // tree entry node in the different lane.
1373
1374 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1375 static const int ScoreConsecutiveLoads = 4;
1376 /// The same load multiple times. This should have a better score than
1377 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it
1378 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1379 /// a vector load and 1.0 for a broadcast.
1380 static const int ScoreSplatLoads = 3;
1381 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1382 static const int ScoreReversedLoads = 3;
1383 /// A load candidate for masked gather.
1384 static const int ScoreMaskedGatherCandidate = 1;
1385 /// ExtractElementInst from same vector and consecutive indexes.
1386 static const int ScoreConsecutiveExtracts = 4;
1387 /// ExtractElementInst from same vector and reversed indices.
1388 static const int ScoreReversedExtracts = 3;
1389 /// Constants.
1390 static const int ScoreConstants = 2;
1391 /// Instructions with the same opcode.
1392 static const int ScoreSameOpcode = 2;
1393 /// Instructions with alt opcodes (e.g, add + sub).
1394 static const int ScoreAltOpcodes = 1;
1395 /// Identical instructions (a.k.a. splat or broadcast).
1396 static const int ScoreSplat = 1;
1397 /// Matching with an undef is preferable to failing.
1398 static const int ScoreUndef = 1;
1399 /// Score for failing to find a decent match.
1400 static const int ScoreFail = 0;
1401 /// Score if all users are vectorized.
1402 static const int ScoreAllUserVectorized = 1;
1403
1404 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1405 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1406 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1407 /// MainAltOps.
1408 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                        ArrayRef<Value *> MainAltOps) const {
1410 if (!isValidElementType(V1->getType()) ||
1411 !isValidElementType(V2->getType()))
1412 return LookAheadHeuristics::ScoreFail;
1413
1414 if (V1 == V2) {
1415 if (isa<LoadInst>(V1)) {
1416 // Returns true if the users of V1 and V2 won't need to be extracted.
1417 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1418 // Bail out if we have too many uses to save compilation time.
1419 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1420 return false;
1421
1422 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1423 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1424 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1425 });
1426 };
1427 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1428 };
1429 // A broadcast of a load can be cheaper on some targets.
1430 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1431 ElementCount::getFixed(NumLanes)) &&
1432 ((int)V1->getNumUses() == NumLanes ||
1433 AllUsersAreInternal(V1, V2)))
1434 return LookAheadHeuristics::ScoreSplatLoads;
1435 }
1436 return LookAheadHeuristics::ScoreSplat;
1437 }
1438
1439 auto CheckSameEntryOrFail = [&]() {
1440 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1441 TE1 && TE1 == R.getTreeEntry(V2))
1442 return LookAheadHeuristics::ScoreSplatLoads;
1443 return LookAheadHeuristics::ScoreFail;
1444 };
1445
1446 auto *LI1 = dyn_cast<LoadInst>(V1);
1447 auto *LI2 = dyn_cast<LoadInst>(V2);
1448 if (LI1 && LI2) {
1449 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1450 !LI2->isSimple())
1451 return CheckSameEntryOrFail();
1452
1453 std::optional<int> Dist = getPointersDiff(
1454 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1455 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1456 if (!Dist || *Dist == 0) {
1457 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1458 getUnderlyingObject(LI2->getPointerOperand()) &&
1459 R.TTI->isLegalMaskedGather(
1460 FixedVectorType::get(LI1->getType(), NumLanes),
1461 LI1->getAlign()))
1462 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1463 return CheckSameEntryOrFail();
1464 }
1465 // The distance is too large - still may be profitable to use masked
1466 // loads/gathers.
1467 if (std::abs(*Dist) > NumLanes / 2)
1468 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1469 // This still will detect consecutive loads, but we might have "holes"
1470 // in some cases. It is ok for non-power-2 vectorization and may produce
1471 // better results. It should not affect current vectorization.
1472 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1473 : LookAheadHeuristics::ScoreReversedLoads;
1474 }
1475
1476 auto *C1 = dyn_cast<Constant>(V1);
1477 auto *C2 = dyn_cast<Constant>(V2);
1478 if (C1 && C2)
1479 return LookAheadHeuristics::ScoreConstants;
1480
1481 // Extracts from consecutive indexes of the same vector better score as
1482 // the extracts could be optimized away.
1483 Value *EV1;
1484 ConstantInt *Ex1Idx;
1485 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1486 // Undefs are always profitable for extractelements.
1487 // Compiler can easily combine poison and extractelement <non-poison> or
1488 // undef and extractelement <poison>. But combining undef +
1489 // extractelement <non-poison-but-may-produce-poison> requires some
1490 // extra operations.
1491 if (isa<UndefValue>(V2))
1492 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1493 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1494 : LookAheadHeuristics::ScoreSameOpcode;
1495 Value *EV2 = nullptr;
1496 ConstantInt *Ex2Idx = nullptr;
1497 if (match(V2,
1498 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1499 m_Undef())))) {
1500 // Undefs are always profitable for extractelements.
1501 if (!Ex2Idx)
1502 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1503 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1504 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1505 if (EV2 == EV1) {
1506 int Idx1 = Ex1Idx->getZExtValue();
1507 int Idx2 = Ex2Idx->getZExtValue();
1508 int Dist = Idx2 - Idx1;
1509 // The distance is too large - still may be profitable to use
1510 // shuffles.
1511 if (std::abs(Dist) == 0)
1512 return LookAheadHeuristics::ScoreSplat;
1513 if (std::abs(Dist) > NumLanes / 2)
1514 return LookAheadHeuristics::ScoreSameOpcode;
1515 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1516 : LookAheadHeuristics::ScoreReversedExtracts;
1517 }
1518 return LookAheadHeuristics::ScoreAltOpcodes;
1519 }
1520 return CheckSameEntryOrFail();
1521 }
1522
1523 auto *I1 = dyn_cast<Instruction>(V1);
1524 auto *I2 = dyn_cast<Instruction>(V2);
1525 if (I1 && I2) {
1526 if (I1->getParent() != I2->getParent())
1527 return CheckSameEntryOrFail();
1528 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1529 Ops.push_back(I1);
1530 Ops.push_back(I2);
1531 InstructionsState S = getSameOpcode(Ops, TLI);
1532 // Note: Only consider instructions with <= 2 operands to avoid
1533 // complexity explosion.
1534 if (S.getOpcode() &&
1535 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1536 !S.isAltShuffle()) &&
1537 all_of(Ops, [&S](Value *V) {
1538 return cast<Instruction>(V)->getNumOperands() ==
1539 S.MainOp->getNumOperands();
1540 }))
1541 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1542 : LookAheadHeuristics::ScoreSameOpcode;
1543 }
1544
1545 if (isa<UndefValue>(V2))
1546 return LookAheadHeuristics::ScoreUndef;
1547
1548 return CheckSameEntryOrFail();
1549 }
1550
1551 /// Go through the operands of \p LHS and \p RHS recursively until
1552 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1553 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1554 /// of \p U1 and \p U2), except at the beginning of the recursion where
1555 /// these are set to nullptr.
1556 ///
1557 /// For example:
1558 /// \verbatim
1559 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1560 /// \ / \ / \ / \ /
1561 /// + + + +
1562 /// G1 G2 G3 G4
1563 /// \endverbatim
1564 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1565 /// each level recursively, accumulating the score. It starts from matching
1566 /// the additions at level 0, then moves on to the loads (level 1). The
1567 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1568 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1569 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1570 /// Please note that the order of the operands does not matter, as we
1571 /// evaluate the score of all profitable combinations of operands. In
1572 /// other words the score of G1 and G4 is the same as G1 and G2. This
1573 /// heuristic is based on ideas described in:
1574 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1575 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1576 /// Luís F. W. Góes
1577 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1578 Instruction *U2, int CurrLevel,
1579 ArrayRef<Value *> MainAltOps) const {
1580
1581 // Get the shallow score of LHS and RHS.
1582 int ShallowScoreAtThisLevel =
1583 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1584
1585 // If reached MaxLevel,
1586 // or if LHS and RHS are not instructions,
1587 // or if they are SPLAT,
1588 // or if they are not consecutive,
1589 // or if profitable to vectorize loads or extractelements, early return
1590 // the current cost.
1591 auto *I1 = dyn_cast<Instruction>(LHS);
1592 auto *I2 = dyn_cast<Instruction>(RHS);
1593 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1594 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1595 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1596 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1597 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1598 ShallowScoreAtThisLevel))
1599 return ShallowScoreAtThisLevel;
1600 assert(I1 && I2 && "Should have early exited.");
1601
1602 // Contains the I2 operand indexes that got matched with I1 operands.
1603 SmallSet<unsigned, 4> Op2Used;
1604
1605 // Recursion towards the operands of I1 and I2. We are trying all possible
1606 // operand pairs, and keeping track of the best score.
1607 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1608 OpIdx1 != NumOperands1; ++OpIdx1) {
1609 // Try to pair op1I with the best operand of I2.
1610 int MaxTmpScore = 0;
1611 unsigned MaxOpIdx2 = 0;
1612 bool FoundBest = false;
1613 // If I2 is commutative try all combinations.
1614 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1615 unsigned ToIdx = isCommutative(I2)
1616 ? I2->getNumOperands()
1617 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1618 assert(FromIdx <= ToIdx && "Bad index");
1619 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1620 // Skip operands already paired with OpIdx1.
1621 if (Op2Used.count(OpIdx2))
1622 continue;
1623 // Recursively calculate the cost at each level
1624 int TmpScore =
1625 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1626 I1, I2, CurrLevel + 1, std::nullopt);
1627 // Look for the best score.
1628 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1629 TmpScore > MaxTmpScore) {
1630 MaxTmpScore = TmpScore;
1631 MaxOpIdx2 = OpIdx2;
1632 FoundBest = true;
1633 }
1634 }
1635 if (FoundBest) {
1636 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1637 Op2Used.insert(MaxOpIdx2);
1638 ShallowScoreAtThisLevel += MaxTmpScore;
1639 }
1640 }
1641 return ShallowScoreAtThisLevel;
1642 }
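// Illustrative sketch (not part of the pass; every name below is
// hypothetical): the greedy pairing performed by the loop above, reduced to
// plain integers. Each operand of the first instruction is matched with the
// best-scoring, not-yet-used operand of the second one, and only strictly
// positive pair scores are accumulated.
// \code{.cpp}
//   #include <array>
//   #include <cstdlib>
//
//   // Toy pair score: 2 for identical values ("splat"), 1 for neighbouring
//   // values ("consecutive"), 0 otherwise ("fail").
//   static int toyScore(int A, int B) {
//     if (A == B)
//       return 2;
//     return std::abs(A - B) == 1 ? 1 : 0;
//   }
//
//   static int pairOperandsGreedily(const std::array<int, 2> &Ops1,
//                                   const std::array<int, 2> &Ops2) {
//     bool Used[2] = {false, false};
//     int Total = 0;
//     for (int A : Ops1) {
//       int BestScore = 0, BestIdx = -1;
//       for (int J = 0; J < 2; ++J) {
//         if (Used[J])
//           continue;
//         int S = toyScore(A, Ops2[J]);
//         if (S > BestScore) {
//           BestScore = S;
//           BestIdx = J;
//         }
//       }
//       if (BestIdx != -1) {
//         Used[BestIdx] = true;
//         Total += BestScore;
//       }
//     }
//     return Total; // e.g. pairOperandsGreedily({10, 21}, {20, 11}) == 2
//   }
// \endcode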
1643 };
1644 /// A helper data structure to hold the operands of a vector of instructions.
1645 /// This supports a fixed vector length for all operand vectors.
1646 class VLOperands {
1647 /// For each operand we need (i) the value, and (ii) the opcode that it
1648 /// would be attached to if the expression was in a left-linearized form.
1649 /// This is required to avoid illegal operand reordering.
1650 /// For example:
1651 /// \verbatim
1652 /// 0 Op1
1653 /// |/
1654 /// Op1 Op2 Linearized + Op2
1655 /// \ / ----------> |/
1656 /// - -
1657 ///
1658 /// Op1 - Op2 (0 + Op1) - Op2
1659 /// \endverbatim
1660 ///
1661 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1662 ///
1663 /// Another way to think of this is to track all the operations across the
1664 /// path from the operand all the way to the root of the tree and to
1665 /// calculate the operation that corresponds to this path. For example, the
1666 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1667 /// corresponding operation is a '-' (which matches the one in the
1668 /// linearized tree, as shown above).
1669 ///
1670 /// For lack of a better term, we refer to this operation as Accumulated
1671 /// Path Operation (APO).
1672 struct OperandData {
1673 OperandData() = default;
1674 OperandData(Value *V, bool APO, bool IsUsed)
1675 : V(V), APO(APO), IsUsed(IsUsed) {}
1676 /// The operand value.
1677 Value *V = nullptr;
1678 /// TreeEntries only allow a single opcode, or an alternate sequence of
1679 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
1680 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1681 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1682 /// (e.g., Add/Mul)
1683 bool APO = false;
1684 /// Helper data for the reordering function.
1685 bool IsUsed = false;
1686 };
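// Illustrative sketch (not part of the pass; `ToyOpcode`, `isInverseOp` and
// `getAPO` are hypothetical): how the APO bit of a single binary operation is
// derived in the left-linearized form described above. The LHS always hangs
// off the neutral operation ('+'/'*', APO = false); the RHS hangs off the
// inverse operation ('-'/'/') exactly when the instruction itself is an
// inverse one.
// \code{.cpp}
//   enum class ToyOpcode { Add, Sub, Mul, Div };
//
//   static bool isInverseOp(ToyOpcode Opc) {
//     return Opc == ToyOpcode::Sub || Opc == ToyOpcode::Div;
//   }
//
//   // APO of operand OpIdx (0 = LHS, 1 = RHS). For `Op1 - Op2`, i.e.
//   // `(0 + Op1) - Op2` in linearized form:
//   //   getAPO(ToyOpcode::Sub, 0) == false  // Op1 is attached to '+'
//   //   getAPO(ToyOpcode::Sub, 1) == true   // Op2 is attached to '-'
//   static bool getAPO(ToyOpcode Opc, unsigned OpIdx) {
//     return OpIdx == 0 ? false : isInverseOp(Opc);
//   }
// \endcode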
1687
1688 /// During operand reordering, we are trying to select the operand at lane
1689 /// that matches best with the operand at the neighboring lane. Our
1690 /// selection is based on the type of value we are looking for. For example,
1691 /// if the neighboring lane has a load, we need to look for a load that is
1692 /// accessing a consecutive address. These strategies are summarized in the
1693 /// 'ReorderingMode' enumerator.
1694 enum class ReorderingMode {
1695 Load, ///< Matching loads to consecutive memory addresses
1696 Opcode, ///< Matching instructions based on opcode (same or alternate)
1697 Constant, ///< Matching constants
1698 Splat, ///< Matching the same instruction multiple times (broadcast)
1699 Failed, ///< We failed to create a vectorizable group
1700 };
1701
1702 using OperandDataVec = SmallVector<OperandData, 2>;
1703
1704 /// A vector of operand vectors.
1705 SmallVector<OperandDataVec, 4> OpsVec;
1706
1707 const TargetLibraryInfo &TLI;
1708 const DataLayout &DL;
1709 ScalarEvolution &SE;
1710 const BoUpSLP &R;
1711 const Loop *L = nullptr;
1712
1713 /// \returns the operand data at \p OpIdx and \p Lane.
1714 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1715 return OpsVec[OpIdx][Lane];
1716 }
1717
1718 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1719 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1720 return OpsVec[OpIdx][Lane];
1721 }
1722
1723 /// Clears the used flag for all entries.
1724 void clearUsed() {
1725 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1726 OpIdx != NumOperands; ++OpIdx)
1727 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1728 ++Lane)
1729 OpsVec[OpIdx][Lane].IsUsed = false;
1730 }
1731
1732 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1733 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1734 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1735 }
1736
1737 /// \param Lane lane of the operands under analysis.
1738 /// \param OpIdx operand index in \p Lane for which we're looking for the
1739 /// best candidate.
1740 /// \param Idx operand index of the current candidate value.
1741 /// \returns The additional score due to possible broadcasting of the
1742 /// elements in the lane. It is more profitable to have power-of-2 unique
1743 /// elements in the lane, as it will be vectorized with higher probability
1744 /// after removing duplicates. Currently the SLP vectorizer supports only
1745 /// vectorization of the power-of-2 number of unique scalars.
1746 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1747 Value *IdxLaneV = getData(Idx, Lane).V;
1748 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1749 return 0;
1750 SmallPtrSet<Value *, 4> Uniques;
1751 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1752 if (Ln == Lane)
1753 continue;
1754 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1755 if (!isa<Instruction>(OpIdxLnV))
1756 return 0;
1757 Uniques.insert(OpIdxLnV);
1758 }
1759 int UniquesCount = Uniques.size();
1760 int UniquesCntWithIdxLaneV =
1761 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1762 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1763 int UniquesCntWithOpIdxLaneV =
1764 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1765 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1766 return 0;
1767 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1768 UniquesCntWithOpIdxLaneV) -
1769 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1770 }
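// Illustrative sketch (not part of the pass; names below are hypothetical,
// and std::bit_ceil stands in for PowerOf2Ceil): the "padding to the next
// power of two" quantity the return expression above is built from. The
// score is positive when the candidate value leaves fewer wasted lanes than
// the value currently at OpIdx, and negative when it leaves more.
// \code{.cpp}
//   #include <bit> // std::bit_ceil, C++20
//
//   static unsigned powerOf2Waste(unsigned UniqueCount) {
//     return std::bit_ceil(UniqueCount) - UniqueCount;
//   }
//
//   static int toySplatScore(unsigned UniquesWithCurrentVal,
//                            unsigned UniquesWithCandidateVal) {
//     return static_cast<int>(powerOf2Waste(UniquesWithCurrentVal)) -
//            static_cast<int>(powerOf2Waste(UniquesWithCandidateVal));
//   }
//   // toySplatScore(4, 3) == 0 - 1 == -1: the candidate would leave the lane
//   // with 3 unique values (padded to 4), so it is penalized.
// \endcode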
1771
1772 /// \param Lane lane of the operands under analysis.
1773 /// \param OpIdx operand index in \p Lane for which we're looking for the
1774 /// best candidate.
1775 /// \param Idx operand index of the current candidate value.
1776 /// \returns The additional score for the scalar which users are all
1777 /// vectorized.
1778 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1779 Value *IdxLaneV = getData(Idx, Lane).V;
1780 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1781 // Do not care about number of uses for vector-like instructions
1782 // (extractelement/extractvalue with constant indices), they are extracts
1783 // themselves and already externally used. Vectorization of such
1784 // instructions does not add extra extractelement instruction, just may
1785 // remove it.
1786 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1787 isVectorLikeInstWithConstOps(OpIdxLaneV))
1788 return LookAheadHeuristics::ScoreAllUserVectorized;
1789 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1790 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1791 return 0;
1792 return R.areAllUsersVectorized(IdxLaneI)
1793 ? LookAheadHeuristics::ScoreAllUserVectorized
1794 : 0;
1795 }
1796
1797 /// Score scaling factor for fully compatible instructions but with
1798 /// different number of external uses. Allows better selection of the
1799 /// instructions with fewer external uses.
1800 static const int ScoreScaleFactor = 10;
1801
1802 /// \Returns the look-ahead score, which tells us how much the sub-trees
1803 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1804 /// score. This helps break ties in an informed way when we cannot decide on
1805 /// the order of the operands by just considering the immediate
1806 /// predecessors.
1807 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1808 int Lane, unsigned OpIdx, unsigned Idx,
1809 bool &IsUsed) {
1810 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1811 LookAheadMaxDepth);
1812 // Keep track of the instruction stack as we recurse into the operands
1813 // during the look-ahead score exploration.
1814 int Score =
1815 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1816 /*CurrLevel=*/1, MainAltOps);
1817 if (Score) {
1818 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1819 if (Score <= -SplatScore) {
1820 // Set the minimum score for splat-like sequence to avoid setting
1821 // failed state.
1822 Score = 1;
1823 } else {
1824 Score += SplatScore;
1825 // Scale score to see the difference between different operands
1826 // and similar operands but all vectorized/not all vectorized
1827 // uses. It does not affect actual selection of the best
1828 // compatible operand in general, it just allows selecting the
1829 // operand with all vectorized uses.
1830 Score *= ScoreScaleFactor;
1831 Score += getExternalUseScore(Lane, OpIdx, Idx);
1832 IsUsed = true;
1833 }
1834 }
1835 return Score;
1836 }
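// Illustrative sketch (not part of the pass): how the scaling above keeps
// the external-use bonus a pure tie-breaker. With ScoreScaleFactor == 10 and
// an external-use bonus that is a small constant (smaller than the factor),
// candidates with different base scores still compare the same way after
// scaling, while equal base scores are ordered by the bonus.
// \code{.cpp}
//   static int combineScores(int BaseScore, int ExternalUseBonus) {
//     const int ScaleFactor = 10; // mirrors ScoreScaleFactor above
//     return BaseScore * ScaleFactor + ExternalUseBonus;
//   }
//   // combineScores(3, 0) == 30 > combineScores(2, 9) == 29: base dominates.
//   // combineScores(3, 1) == 31 > combineScores(3, 0) == 30: bonus breaks tie.
// \endcode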
1837
1838 /// Best defined scores per lanes between the passes. Used to choose the
1839 /// best operand (with the highest score) between the passes.
1840 /// The key - {Operand Index, Lane}.
1841 /// The value - the best score between the passes for the lane and the
1842 /// operand.
1843 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1844 BestScoresPerLanes;
1845
1846 // Search all operands in Ops[*][Lane] for the one that matches best
1847 // Ops[OpIdx][LastLane] and return its operand index.
1848 // If no good match can be found, return std::nullopt.
1849 std::optional<unsigned>
1850 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1851 ArrayRef<ReorderingMode> ReorderingModes,
1852 ArrayRef<Value *> MainAltOps) {
1853 unsigned NumOperands = getNumOperands();
1854
1855 // The operand of the previous lane at OpIdx.
1856 Value *OpLastLane = getData(OpIdx, LastLane).V;
1857
1858 // Our strategy mode for OpIdx.
1859 ReorderingMode RMode = ReorderingModes[OpIdx];
1860 if (RMode == ReorderingMode::Failed)
1861 return std::nullopt;
1862
1863 // The linearized opcode of the operand at OpIdx, Lane.
1864 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1865
1866 // The best operand index and its score.
1867 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1868 // are using the score to differentiate between the two.
1869 struct BestOpData {
1870 std::optional<unsigned> Idx;
1871 unsigned Score = 0;
1872 } BestOp;
1873 BestOp.Score =
1874 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1875 .first->second;
1876
1877 // Track if the operand must be marked as used. If the operand is set to
1878 // Score 1 explicitly (because of a non-power-of-2 number of unique
1879 // scalars), we may want to reestimate the operands on later iterations.
1880 bool IsUsed = RMode == ReorderingMode::Splat ||
1881 RMode == ReorderingMode::Constant ||
1882 RMode == ReorderingMode::Load;
1883 // Iterate through all unused operands and look for the best.
1884 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1885 // Get the operand at Idx and Lane.
1886 OperandData &OpData = getData(Idx, Lane);
1887 Value *Op = OpData.V;
1888 bool OpAPO = OpData.APO;
1889
1890 // Skip already selected operands.
1891 if (OpData.IsUsed)
1892 continue;
1893
1894 // Skip if we are trying to move the operand to a position with a
1895 // different opcode in the linearized tree form. This would break the
1896 // semantics.
1897 if (OpAPO != OpIdxAPO)
1898 continue;
1899
1900 // Look for an operand that matches the current mode.
1901 switch (RMode) {
1902 case ReorderingMode::Load:
1903 case ReorderingMode::Opcode: {
1904 bool LeftToRight = Lane > LastLane;
1905 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1906 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1907 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1908 OpIdx, Idx, IsUsed);
1909 if (Score > static_cast<int>(BestOp.Score) ||
1910 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
1911 Idx == OpIdx)) {
1912 BestOp.Idx = Idx;
1913 BestOp.Score = Score;
1914 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1915 }
1916 break;
1917 }
1918 case ReorderingMode::Constant:
1919 if (isa<Constant>(Op) ||
1920 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
1921 BestOp.Idx = Idx;
1922 if (isa<Constant>(Op)) {
1923 BestOp.Score = LookAheadHeuristics::ScoreConstants;
1924 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1925 LookAheadHeuristics::ScoreConstants;
1926 }
1927 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
1928 IsUsed = false;
1929 }
1930 break;
1931 case ReorderingMode::Splat:
1932 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
1933 IsUsed = Op == OpLastLane;
1934 if (Op == OpLastLane) {
1935 BestOp.Score = LookAheadHeuristics::ScoreSplat;
1936 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1937 LookAheadHeuristics::ScoreSplat;
1938 }
1939 BestOp.Idx = Idx;
1940 }
1941 break;
1942 case ReorderingMode::Failed:
1943 llvm_unreachable("Not expected Failed reordering mode.");
1944 }
1945 }
1946
1947 if (BestOp.Idx) {
1948 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1949 return BestOp.Idx;
1950 }
1951 // If we could not find a good match return std::nullopt.
1952 return std::nullopt;
1953 }
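// Illustrative sketch (not part of the pass; `orientPair` is hypothetical):
// the orientation handling used in the Load/Opcode case above. Consecutive
// access scores are directional, so the value already chosen for the
// previous lane is placed on the side that matches the direction in which
// the lanes are being visited.
// \code{.cpp}
//   #include <utility>
//
//   template <typename T>
//   static std::pair<T, T> orientPair(T PrevLaneVal, T CandidateVal,
//                                     int Lane, int LastLane) {
//     bool LeftToRight = Lane > LastLane;
//     return LeftToRight ? std::make_pair(PrevLaneVal, CandidateVal)
//                        : std::make_pair(CandidateVal, PrevLaneVal);
//   }
// \endcode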
1954
1955 /// Helper for reorderOperandVecs.
1956 /// \returns the lane that we should start reordering from. This is the one
1957 /// which has the least number of operands that can freely move about, or is
1958 /// least profitable because it already has the most optimal set of operands.
1959 unsigned getBestLaneToStartReordering() const {
1960 unsigned Min = UINT_MAX;
1961 unsigned SameOpNumber = 0;
1962 // std::pair<unsigned, unsigned> is used to implement a simple voting
1963 // algorithm and choose the lane with the least number of operands that
1964 // can freely move about, or is least profitable because it already has the
1965 // most optimal set of operands. The first unsigned is a counter for
1966 // voting, the second unsigned is the counter of lanes with instructions
1967 // with same/alternate opcodes and same parent basic block.
1968 SmallDenseMap<unsigned, std::pair<unsigned, unsigned>, 8> HashMap;
1969 // Try to be closer to the original results, if we have multiple lanes
1970 // with same cost. If 2 lanes have the same cost, use the one with the
1971 // lowest index.
1972 for (int I = getNumLanes(); I > 0; --I) {
1973 unsigned Lane = I - 1;
1974 OperandsOrderData NumFreeOpsHash =
1975 getMaxNumOperandsThatCanBeReordered(Lane);
1976 // Compare the number of operands that can move and choose the one with
1977 // the least number.
1978 if (NumFreeOpsHash.NumOfAPOs < Min) {
1979 Min = NumFreeOpsHash.NumOfAPOs;
1980 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1981 HashMap.clear();
1982 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1983 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1984 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1985 // Select the most optimal lane in terms of number of operands that
1986 // should be moved around.
1987 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1988 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1989 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1990 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1991 auto *It = HashMap.find(NumFreeOpsHash.Hash);
1992 if (It == HashMap.end())
1993 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1994 else
1995 ++It->second.first;
1996 }
1997 }
1998 // Select the lane with the minimum counter.
1999 unsigned BestLane = 0;
2000 unsigned CntMin = UINT_MAX;
2001 for (const auto &Data : reverse(HashMap)) {
2002 if (Data.second.first < CntMin) {
2003 CntMin = Data.second.first;
2004 BestLane = Data.second.second;
2005 }
2006 }
2007 return BestLane;
2008 }
2009
2010 /// Data structure that helps to reorder operands.
2011 struct OperandsOrderData {
2012 /// The best number of operands with the same APOs, which can be
2013 /// reordered.
2014 unsigned NumOfAPOs = UINT_MAX;
2015 /// Number of operands with the same/alternate instruction opcode and
2016 /// parent.
2017 unsigned NumOpsWithSameOpcodeParent = 0;
2018 /// Hash for the actual operands ordering.
2019 /// Used to count operands, actually their position id and opcode
2020 /// value. It is used in the voting mechanism to find the lane with the
2021 /// least number of operands that can freely move about, or is least profitable
2022 /// because it already has the most optimal set of operands. Can be
2023 /// replaced with SmallVector<unsigned> instead but hash code is faster
2024 /// and requires less memory.
2025 unsigned Hash = 0;
2026 };
2027 /// \returns the maximum number of operands that are allowed to be reordered
2028 /// for \p Lane and the number of compatible instructions(with the same
2029 /// parent/opcode). This is used as a heuristic for selecting the first lane
2030 /// to start operand reordering.
2031 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2032 unsigned CntTrue = 0;
2033 unsigned NumOperands = getNumOperands();
2034 // Operands with the same APO can be reordered. We therefore need to count
2035 // how many of them we have for each APO, like this: Cnt[APO] = x.
2036 // Since we only have two APOs, namely true and false, we can avoid using
2037 // a map. Instead we can simply count the number of operands that
2038 // correspond to one of them (in this case the 'true' APO), and calculate
2039 // the other by subtracting it from the total number of operands.
2040 // Operands with the same instruction opcode and parent are more
2041 // profitable since we don't need to move them in many cases, with a high
2042 // probability such lane already can be vectorized effectively.
2043 bool AllUndefs = true;
2044 unsigned NumOpsWithSameOpcodeParent = 0;
2045 Instruction *OpcodeI = nullptr;
2046 BasicBlock *Parent = nullptr;
2047 unsigned Hash = 0;
2048 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2049 const OperandData &OpData = getData(OpIdx, Lane);
2050 if (OpData.APO)
2051 ++CntTrue;
2052 // Use Boyer-Moore majority voting for finding the majority opcode and
2053 // the number of times it occurs.
2054 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2055 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2056 I->getParent() != Parent) {
2057 if (NumOpsWithSameOpcodeParent == 0) {
2058 NumOpsWithSameOpcodeParent = 1;
2059 OpcodeI = I;
2060 Parent = I->getParent();
2061 } else {
2062 --NumOpsWithSameOpcodeParent;
2063 }
2064 } else {
2065 ++NumOpsWithSameOpcodeParent;
2066 }
2067 }
2068 Hash = hash_combine(
2069 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2070 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2071 }
2072 if (AllUndefs)
2073 return {};
2074 OperandsOrderData Data;
2075 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2076 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2077 Data.Hash = Hash;
2078 return Data;
2079 }
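// Illustrative sketch (not part of the pass; `majorityVoteCount` is
// hypothetical): the Boyer-Moore majority vote used above, reduced to plain
// integers. A single candidate and a single counter are enough to obtain a
// heuristic count for the most frequent element, without keeping a map.
// \code{.cpp}
//   #include <vector>
//
//   static unsigned majorityVoteCount(const std::vector<int> &Vals) {
//     unsigned Count = 0;
//     int Candidate = 0;
//     for (int V : Vals) {
//       if (Count == 0) {
//         Candidate = V;
//         Count = 1;
//       } else if (V == Candidate) {
//         ++Count;
//       } else {
//         --Count;
//       }
//     }
//     return Count; // 0 when no element clearly dominates.
//   }
// \endcode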
2080
2081 /// Go through the instructions in VL and append their operands.
2082 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2083 assert(!VL.empty() && "Bad VL");
2084 assert((empty() || VL.size() == getNumLanes()) &&
2085 "Expected same number of lanes");
2086 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2087 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2088 constexpr unsigned IntrinsicNumOperands = 2;
2089 if (isa<IntrinsicInst>(VL[0]))
2090 NumOperands = IntrinsicNumOperands;
2091 OpsVec.resize(NumOperands);
2092 unsigned NumLanes = VL.size();
2093 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2094 OpsVec[OpIdx].resize(NumLanes);
2095 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2096 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2097 // Our tree has just 3 nodes: the root and two operands.
2098 // It is therefore trivial to get the APO. We only need to check the
2099 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2100 // RHS operand. The LHS operand of both add and sub is never attached
2101 // to an inverse operation in the linearized form, therefore its APO
2102 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2103
2104 // Since operand reordering is performed on groups of commutative
2105 // operations or alternating sequences (e.g., +, -), we can safely
2106 // tell the inverse operations by checking commutativity.
2107 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2108 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2109 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2110 APO, false};
2111 }
2112 }
2113 }
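// Illustrative sketch (not part of the pass; the toy types are made up): the
// operand-major layout produced above. The lane-major input, where each
// scalar instruction carries its own operand list, is transposed into
// OpsVec[OpIdx][Lane] so that all candidates for one vector operand end up
// in a single row.
// \code{.cpp}
//   #include <cstddef>
//   #include <vector>
//
//   using ToyOperandList = std::vector<int>; // operands of one scalar instr
//
//   static std::vector<std::vector<int>>
//   transposeOperands(const std::vector<ToyOperandList> &VL) {
//     std::vector<std::vector<int>> OpsVec;
//     if (VL.empty())
//       return OpsVec;
//     std::size_t NumOperands = VL.front().size();
//     OpsVec.assign(NumOperands, std::vector<int>(VL.size()));
//     for (std::size_t OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
//       for (std::size_t Lane = 0; Lane != VL.size(); ++Lane)
//         OpsVec[OpIdx][Lane] = VL[Lane][OpIdx];
//     return OpsVec;
//   }
// \endcode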
2114
2115 /// \returns the number of operands.
2116 unsigned getNumOperands() const { return OpsVec.size(); }
2117
2118 /// \returns the number of lanes.
2119 unsigned getNumLanes() const { return OpsVec[0].size(); }
2120
2121 /// \returns the operand value at \p OpIdx and \p Lane.
2122 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2123 return getData(OpIdx, Lane).V;
2124 }
2125
2126 /// \returns true if the data structure is empty.
2127 bool empty() const { return OpsVec.empty(); }
2128
2129 /// Clears the data.
2130 void clear() { OpsVec.clear(); }
2131
2132 /// \Returns true if there are enough operands identical to \p Op to fill
2133 /// the whole vector (it is mixed with constants or loop invariant values).
2134 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2135 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2136 bool OpAPO = getData(OpIdx, Lane).APO;
2137 bool IsInvariant = L && L->isLoopInvariant(Op);
2138 unsigned Cnt = 0;
2139 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2140 if (Ln == Lane)
2141 continue;
2142 // This is set to true if we found a candidate for broadcast at Lane.
2143 bool FoundCandidate = false;
2144 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2145 OperandData &Data = getData(OpI, Ln);
2146 if (Data.APO != OpAPO || Data.IsUsed)
2147 continue;
2148 Value *OpILane = getValue(OpI, Lane);
2149 bool IsConstantOp = isa<Constant>(OpILane);
2150 // Consider the broadcast candidate if:
2151 // 1. Same value is found in one of the operands.
2152 if (Data.V == Op ||
2153 // 2. The operand in the given lane is not constant but there is a
2154 // constant operand in another lane (which can be moved to the
2155 // given lane). In this case we can represent it as a simple
2156 // permutation of constant and broadcast.
2157 (!IsConstantOp &&
2158 ((Lns > 2 && isa<Constant>(Data.V)) ||
2159 // 2.1. If we have only 2 lanes, need to check that value in the
2160 // next lane does not build same opcode sequence.
2161 (Lns == 2 &&
2162 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2163 .getOpcode() &&
2164 isa<Constant>(Data.V)))) ||
2165 // 3. The operand in the current lane is loop invariant (can be
2166 // hoisted out) and another operand is also a loop invariant
2167 // (though not a constant). In this case the whole vector can be
2168 // hoisted out.
2169 // FIXME: need to teach the cost model about this case for better
2170 // estimation.
2171 (IsInvariant && !isa<Constant>(Data.V) &&
2172 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2173 L->isLoopInvariant(Data.V))) {
2174 FoundCandidate = true;
2175 Data.IsUsed = Data.V == Op;
2176 if (Data.V == Op)
2177 ++Cnt;
2178 break;
2179 }
2180 }
2181 if (!FoundCandidate)
2182 return false;
2183 }
2184 return getNumLanes() == 2 || Cnt > 1;
2185 }
2186
2187 public:
2188 /// Initialize with all the operands of the instruction vector \p RootVL.
2189 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2190 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2191 L(R.LI->getLoopFor(
2192 (cast<Instruction>(RootVL.front())->getParent()))) {
2193 // Append all the operands of RootVL.
2194 appendOperandsOfVL(RootVL);
2195 }
2196
2197 /// \Returns a value vector with the operands across all lanes for the
2198 /// operand at \p OpIdx.
2199 ValueList getVL(unsigned OpIdx) const {
2200 ValueList OpVL(OpsVec[OpIdx].size());
2201 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2202 "Expected same num of lanes across all operands");
2203 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2204 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2205 return OpVL;
2206 }
2207
2208 // Performs operand reordering for 2 or more operands.
2209 // The original operands are in OrigOps[OpIdx][Lane].
2210 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2211 void reorder() {
2212 unsigned NumOperands = getNumOperands();
2213 unsigned NumLanes = getNumLanes();
2214 // Each operand has its own mode. We are using this mode to help us select
2215 // the instructions for each lane, so that they match best with the ones
2216 // we have selected so far.
2217 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2218
2219 // This is a greedy single-pass algorithm. We are going over each lane
2220 // once and deciding on the best order right away with no back-tracking.
2221 // However, in order to increase its effectiveness, we start with the lane
2222 // that has operands that can move the least. For example, given the
2223 // following lanes:
2224 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2225 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2226 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2227 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2228 // we will start at Lane 1, since the operands of the subtraction cannot
2229 // be reordered. Then we will visit the rest of the lanes in a circular
2230 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2231
2232 // Find the first lane that we will start our search from.
2233 unsigned FirstLane = getBestLaneToStartReordering();
2234
2235 // Initialize the modes.
2236 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2237 Value *OpLane0 = getValue(OpIdx, FirstLane);
2238 // Keep track if we have instructions with all the same opcode on one
2239 // side.
2240 if (isa<LoadInst>(OpLane0))
2241 ReorderingModes[OpIdx] = ReorderingMode::Load;
2242 else if (isa<Instruction>(OpLane0)) {
2243 // Check if OpLane0 should be broadcast.
2244 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2245 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2246 else
2247 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2248 }
2249 else if (isa<Constant>(OpLane0))
2250 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2251 else if (isa<Argument>(OpLane0))
2252 // Our best hope is a Splat. It may save some cost in some cases.
2253 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2254 else
2255 // NOTE: This should be unreachable.
2256 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2257 }
2258
2259 // Check that we don't have same operands. No need to reorder if operands
2260 // are just perfect diamond or shuffled diamond match. Do not do it only
2261 // for possible broadcasts or non-power of 2 number of scalars (just for
2262 // now).
2263 auto &&SkipReordering = [this]() {
2264 SmallPtrSet<Value *, 4> UniqueValues;
2265 ArrayRef<OperandData> Op0 = OpsVec.front();
2266 for (const OperandData &Data : Op0)
2267 UniqueValues.insert(Data.V);
2268 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2269 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2270 return !UniqueValues.contains(Data.V);
2271 }))
2272 return false;
2273 }
2274 // TODO: Check if we can remove a check for non-power-2 number of
2275 // scalars after full support of non-power-2 vectorization.
2276 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2277 };
2278
2279 // If the initial strategy fails for any of the operand indexes, then we
2280 // perform reordering again in a second pass. This helps avoid assigning
2281 // high priority to the failed strategy, and should improve reordering for
2282 // the non-failed operand indexes.
2283 for (int Pass = 0; Pass != 2; ++Pass) {
2284 // Check if there is no need to reorder operands since they are a perfect or
2285 // shuffled diamond match.
2286 // Need to do it to avoid extra external use cost counting for
2287 // shuffled matches, which may cause regressions.
2288 if (SkipReordering())
2289 break;
2290 // Skip the second pass if the first pass did not fail.
2291 bool StrategyFailed = false;
2292 // Mark all operand data as free to use.
2293 clearUsed();
2294 // We keep the original operand order for the FirstLane, so reorder the
2295 // rest of the lanes. We are visiting the nodes in a circular fashion,
2296 // using FirstLane as the center point and increasing the radius
2297 // distance.
2298 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2299 for (unsigned I = 0; I < NumOperands; ++I)
2300 MainAltOps[I].push_back(getData(I, FirstLane).V);
2301
2302 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2303 // Visit the lane on the right and then the lane on the left.
2304 for (int Direction : {+1, -1}) {
2305 int Lane = FirstLane + Direction * Distance;
2306 if (Lane < 0 || Lane >= (int)NumLanes)
2307 continue;
2308 int LastLane = Lane - Direction;
2309 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2310 "Out of bounds");
2311 // Look for a good match for each operand.
2312 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2313 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2314 std::optional<unsigned> BestIdx = getBestOperand(
2315 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2316 // By not selecting a value, we allow the operands that follow to
2317 // select a better matching value. We will get a non-null value in
2318 // the next run of getBestOperand().
2319 if (BestIdx) {
2320 // Swap the current operand with the one returned by
2321 // getBestOperand().
2322 swap(OpIdx, *BestIdx, Lane);
2323 } else {
2324 // Enable the second pass.
2325 StrategyFailed = true;
2326 }
2327 // Try to get the alternate opcode and follow it during analysis.
2328 if (MainAltOps[OpIdx].size() != 2) {
2329 OperandData &AltOp = getData(OpIdx, Lane);
2330 InstructionsState OpS =
2331 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2332 if (OpS.getOpcode() && OpS.isAltShuffle())
2333 MainAltOps[OpIdx].push_back(AltOp.V);
2334 }
2335 }
2336 }
2337 }
2338 // Skip second pass if the strategy did not fail.
2339 if (!StrategyFailed)
2340 break;
2341 }
2342 }
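// Illustrative sketch (not part of the pass; `laneVisitOrder` is
// hypothetical): the order in which the loop above visits the lanes,
// expanding outwards from the starting lane and alternating between the
// right and the left neighbour at each distance. For NumLanes == 4 and
// FirstLane == 1 this yields {2, 0, 3}, matching the example in the comment
// at the top of reorder().
// \code{.cpp}
//   #include <vector>
//
//   static std::vector<int> laneVisitOrder(int NumLanes, int FirstLane) {
//     std::vector<int> Order;
//     for (int Distance = 1; Distance != NumLanes; ++Distance) {
//       for (int Direction : {+1, -1}) {
//         int Lane = FirstLane + Direction * Distance;
//         if (Lane < 0 || Lane >= NumLanes)
//           continue;
//         Order.push_back(Lane);
//       }
//     }
//     return Order;
//   }
// \endcode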
2343
2344#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2345 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2346 switch (RMode) {
2347 case ReorderingMode::Load:
2348 return "Load";
2349 case ReorderingMode::Opcode:
2350 return "Opcode";
2351 case ReorderingMode::Constant:
2352 return "Constant";
2353 case ReorderingMode::Splat:
2354 return "Splat";
2355 case ReorderingMode::Failed:
2356 return "Failed";
2357 }
2358 llvm_unreachable("Unimplemented Reordering Type");
2359 }
2360
2361 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2362 raw_ostream &OS) {
2363 return OS << getModeStr(RMode);
2364 }
2365
2366 /// Debug print.
2367 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2368 printMode(RMode, dbgs());
2369 }
2370
2371 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2372 return printMode(RMode, OS);
2373 }
2374
2375 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2376 const unsigned Indent = 2;
2377 unsigned Cnt = 0;
2378 for (const OperandDataVec &OpDataVec : OpsVec) {
2379 OS << "Operand " << Cnt++ << "\n";
2380 for (const OperandData &OpData : OpDataVec) {
2381 OS.indent(Indent) << "{";
2382 if (Value *V = OpData.V)
2383 OS << *V;
2384 else
2385 OS << "null";
2386 OS << ", APO:" << OpData.APO << "}\n";
2387 }
2388 OS << "\n";
2389 }
2390 return OS;
2391 }
2392
2393 /// Debug print.
2394 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2395#endif
2396 };
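 // Illustrative sketch (an assumption about how the pass drives the class
 // above, not a public API): build VLOperands over a scalar bundle, let it
 // reorder the lanes, then read back one value row per vector operand.
 // \code{.cpp}
 //   VLOperands Ops(VL, R);          // VL: scalar bundle, R: the BoUpSLP
 //   Ops.reorder();                  // greedy per-lane operand reordering
 //   ValueList Left = Ops.getVL(0);  // operand 0 across all lanes
 //   ValueList Right = Ops.getVL(1); // operand 1 across all lanes
 // \endcode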
2397
2398 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2399 /// for a pair which have highest score deemed to have best chance to form
2400 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
2401 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
2402 /// of the cost, considered to be good enough score.
2403 std::optional<int>
2404 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2405 int Limit = LookAheadHeuristics::ScoreFail) const {
2406 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2407 RootLookAheadMaxDepth);
2408 int BestScore = Limit;
2409 std::optional<int> Index;
2410 for (int I : seq<int>(0, Candidates.size())) {
2411 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2412 Candidates[I].second,
2413 /*U1=*/nullptr, /*U2=*/nullptr,
2414 /*Level=*/1, std::nullopt);
2415 if (Score > BestScore) {
2416 BestScore = Score;
2417 Index = I;
2418 }
2419 }
2420 return Index;
2421 }
2422
2423 /// Checks if the instruction is marked for deletion.
2424 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2425
2426 /// Removes an instruction from its block and eventually deletes it.
2427 /// It's like Instruction::eraseFromParent() except that the actual deletion
2428 /// is delayed until BoUpSLP is destructed.
2429 void eraseInstruction(Instruction *I) {
2430 DeletedInstructions.insert(I);
2431 }
2432
2433 /// Checks if the instruction was already analyzed for being possible
2434 /// reduction root.
2435 bool isAnalyzedReductionRoot(Instruction *I) const {
2436 return AnalyzedReductionsRoots.count(I);
2437 }
2438 /// Register given instruction as already analyzed for being possible
2439 /// reduction root.
2440 void analyzedReductionRoot(Instruction *I) {
2441 AnalyzedReductionsRoots.insert(I);
2442 }
2443 /// Checks if the provided list of reduced values was checked already for
2444 /// vectorization.
2445 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2446 return AnalyzedReductionVals.contains(hash_value(VL));
2447 }
2448 /// Adds the list of reduced values to list of already checked values for the
2449 /// vectorization.
2450 void analyzedReductionVals(ArrayRef<Value *> VL) {
2451 AnalyzedReductionVals.insert(hash_value(VL));
2452 }
2453 /// Clear the list of the analyzed reduction root instructions.
2454 void clearReductionData() {
2455 AnalyzedReductionsRoots.clear();
2456 AnalyzedReductionVals.clear();
2457 AnalyzedMinBWVals.clear();
2458 }
2459 /// Checks if the given value is gathered in one of the nodes.
2460 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2461 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2462 }
2463 /// Checks if the given value is gathered in one of the nodes.
2464 bool isGathered(const Value *V) const {
2465 return MustGather.contains(V);
2466 }
2467 /// Checks if the specified value was not scheduled.
2468 bool isNotScheduled(const Value *V) const {
2469 return NonScheduledFirst.contains(V);
2470 }
2471
2472 /// Check if the value is vectorized in the tree.
2473 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2474
2475 ~BoUpSLP();
2476
2477private:
2478 /// Determine if a node \p E can be demoted to a smaller type with a
2479 /// truncation. We collect the entries that will be demoted in ToDemote.
2480 /// \param E Node for analysis
2481 /// \param ToDemote indices of the nodes to be demoted.
2482 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2483 unsigned &BitWidth,
2484 SmallVectorImpl<unsigned> &ToDemote,
2486 unsigned &MaxDepthLevel,
2487 bool &IsProfitableToDemote,
2488 bool IsTruncRoot) const;
2489
2490 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2491 /// reordering (i.e. the operands can be reordered because they have only one
2492 /// user and are reorderable).
2493 /// \param ReorderableGathers List of all gather nodes that require reordering
2494 /// (e.g., gather of extractelements or partially vectorizable loads).
2495 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2496 /// reordering, subset of \p NonVectorized.
2497 bool
2498 canReorderOperands(TreeEntry *UserTE,
2499 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2500 ArrayRef<TreeEntry *> ReorderableGathers,
2501 SmallVectorImpl<TreeEntry *> &GatherOps);
2502
2503 /// Checks if the given \p TE is a gather node with clustered reused scalars
2504 /// and reorders it per given \p Mask.
2505 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2506
2507 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2508 /// if any. If it is not vectorized (gather node), returns nullptr.
2509 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2510 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2511 TreeEntry *TE = nullptr;
2512 const auto *It = find_if(VL, [&](Value *V) {
2513 TE = getTreeEntry(V);
2514 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2515 return true;
2516 auto It = MultiNodeScalars.find(V);
2517 if (It != MultiNodeScalars.end()) {
2518 for (TreeEntry *E : It->second) {
2519 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2520 TE = E;
2521 return true;
2522 }
2523 }
2524 }
2525 return false;
2526 });
2527 if (It != VL.end()) {
2528 assert(TE->isSame(VL) && "Expected same scalars.");
2529 return TE;
2530 }
2531 return nullptr;
2532 }
2533
2534 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2535 /// if any. If it is not vectorized (gather node), returns nullptr.
2536 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2537 unsigned OpIdx) const {
2538 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2539 const_cast<TreeEntry *>(UserTE), OpIdx);
2540 }
2541
2542 /// Checks if all users of \p I are the part of the vectorization tree.
2543 bool areAllUsersVectorized(
2544 Instruction *I,
2545 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2546
2547 /// Return information about the vector formed for the specified index
2548 /// of a vector of (the same) instruction.
2549 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2550
2551 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2552 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2553
2554 /// \returns Cast context for the given graph node.
2555 TargetTransformInfo::CastContextHint
2556 getCastContextHint(const TreeEntry &TE) const;
2557
2558 /// \returns the cost of the vectorizable entry.
2559 InstructionCost getEntryCost(const TreeEntry *E,
2560 ArrayRef<Value *> VectorizedVals,
2561 SmallPtrSetImpl<Value *> &CheckedExtracts);
2562
2563 /// This is the recursive part of buildTree.
2564 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2565 const EdgeInfo &EI);
2566
2567 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2568 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2569 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2570 /// returns false, setting \p CurrentOrder to either an empty vector or a
2571 /// non-identity permutation that allows reusing extract instructions.
2572 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2573 /// extract order.
2574 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2575 SmallVectorImpl<unsigned> &CurrentOrder,
2576 bool ResizeAllowed = false) const;
2577
2578 /// Vectorize a single entry in the tree.
2579 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2580 /// avoid issues with def-use order.
2581 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2582
2583 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2584 /// \p E.
2585 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2586 /// avoid issues with def-use order.
2587 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2588
2589 /// Create a new vector from a list of scalar values. Produces a sequence
2590 /// which exploits values reused across lanes, and arranges the inserts
2591 /// for ease of later optimization.
2592 template <typename BVTy, typename ResTy, typename... Args>
2593 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2594
2595 /// Create a new vector from a list of scalar values. Produces a sequence
2596 /// which exploits values reused across lanes, and arranges the inserts
2597 /// for ease of later optimization.
2598 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2599
2600 /// Returns the instruction in the bundle, which can be used as a base point
2601 /// for scheduling. Usually it is the last instruction in the bundle, except
2602 /// for the case when all operands are external (in this case, it is the first
2603 /// instruction in the list).
2604 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2605
2606 /// Tries to find extractelement instructions with constant indices from fixed
2607 /// vector type and gather such instructions into a bunch, which highly likely
2608 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2609 /// was successful, the matched scalars are replaced by poison values in \p VL
2610 /// for future analysis.
2611 std::optional<TargetTransformInfo::ShuffleKind>
2612 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2613 SmallVectorImpl<int> &Mask) const;
2614
2615 /// Tries to find extractelement instructions with constant indices from fixed
2616 /// vector type and gather such instructions into a bunch, which highly likely
2617 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2618 /// was successful, the matched scalars are replaced by poison values in \p VL
2619 /// for future analysis.
2620 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2621 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2622 SmallVectorImpl<SmallVector<int>> &Mask,
2623 unsigned NumParts) const;
2624
2625 /// Checks if the gathered \p VL can be represented as a single register
2626 /// shuffle(s) of previous tree entries.
2627 /// \param TE Tree entry checked for permutation.
2628 /// \param VL List of scalars (a subset of the TE scalar), checked for
2629 /// permutations. Must form single-register vector.
2630 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2631 /// commands to build the mask using the original vector value, without
2632 /// relying on the potential reordering.
2633 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2634 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2635 std::optional<TargetTransformInfo::ShuffleKind>
2636 isGatherShuffledSingleRegisterEntry(
2637 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2638 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2639 bool ForOrder);
2640
2641 /// Checks if the gathered \p VL can be represented as multi-register
2642 /// shuffle(s) of previous tree entries.
2643 /// \param TE Tree entry checked for permutation.
2644 /// \param VL List of scalars (a subset of the TE scalar), checked for
2645 /// permutations.
2646 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2647 /// commands to build the mask using the original vector value, without
2648 /// relying on the potential reordering.
2649 /// \returns per-register series of ShuffleKind, if gathered values can be
2650 /// represented as shuffles of previous tree entries. \p Mask is filled with
2651 /// the shuffle mask (also on per-register base).
2652 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2653 isGatherShuffledEntry(
2654 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2655 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2656 unsigned NumParts, bool ForOrder = false);
2657
2658 /// \returns the scalarization cost for this list of values. Assuming that
2659 /// this subtree gets vectorized, we may need to extract the values from the
2660 /// roots. This method calculates the cost of extracting the values.
2661 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2662 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2663 Type *ScalarTy) const;
2664
2665 /// Set the Builder insert point to one after the last instruction in
2666 /// the bundle
2667 void setInsertPointAfterBundle(const TreeEntry *E);
2668
2669 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2670 /// specified, the starting vector value is poison.
2671 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2672
2673 /// \returns whether the VectorizableTree is fully vectorizable and will
2674 /// be beneficial even the tree height is tiny.
2675 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2676
2677 /// Reorder commutative or alt operands to get better probability of
2678 /// generating vectorized code.
2679 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2680 SmallVectorImpl<Value *> &Left,
2681 SmallVectorImpl<Value *> &Right,
2682 const BoUpSLP &R);
2683
2684 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2685 /// users of \p TE and collects the stores. It returns the map from the store
2686 /// pointers to the collected stores.
2687 DenseMap<Value *, SmallVector<StoreInst *>>
2688 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2689
2690 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2691 /// stores in \p StoresVec can form a vector instruction. If so it returns
2692 /// true and populates \p ReorderIndices with the shuffle indices of the
2693 /// stores when compared to the sorted vector.
2694 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2695 OrdersType &ReorderIndices) const;
2696
2697 /// Iterates through the users of \p TE, looking for scalar stores that can be
2698 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2699 /// their order and builds an order index vector for each store bundle. It
2700 /// returns all these order vectors found.
2701 /// We run this after the tree has been formed; otherwise we may come across user
2702 /// instructions that are not yet in the tree.
2703 SmallVector<OrdersType, 1>
2704 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2705
2706 struct TreeEntry {
2707 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2708 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2709
2710 /// \returns Common mask for reorder indices and reused scalars.
2711 SmallVector<int> getCommonMask() const {
2712 SmallVector<int> Mask;
2713 inversePermutation(ReorderIndices, Mask);
2714 ::addMask(Mask, ReuseShuffleIndices);
2715 return Mask;
2716 }
2717
2718 /// \returns true if the scalars in VL are equal to this entry.
2719 bool isSame(ArrayRef<Value *> VL) const {
2720 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2721 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2722 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2723 return VL.size() == Mask.size() &&
2724 std::equal(VL.begin(), VL.end(), Mask.begin(),
2725 [Scalars](Value *V, int Idx) {
2726 return (isa<UndefValue>(V) &&
2727 Idx == PoisonMaskElem) ||
2728 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2729 });
2730 };
2731 if (!ReorderIndices.empty()) {
2732 // TODO: implement matching if the nodes are just reordered, still can
2733 // treat the vector as the same if the list of scalars matches VL
2734 // directly, without reordering.
2735 SmallVector<int> Mask;
2736 inversePermutation(ReorderIndices, Mask);
2737 if (VL.size() == Scalars.size())
2738 return IsSame(Scalars, Mask);
2739 if (VL.size() == ReuseShuffleIndices.size()) {
2740 ::addMask(Mask, ReuseShuffleIndices);
2741 return IsSame(Scalars, Mask);
2742 }
2743 return false;
2744 }
2745 return IsSame(Scalars, ReuseShuffleIndices);
2746 }
2747
2748 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2749 return State == TreeEntry::NeedToGather &&
2750 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2751 UserTreeIndices.front().UserTE == UserEI.UserTE;
2752 }
2753
2754 /// \returns true if current entry has same operands as \p TE.
2755 bool hasEqualOperands(const TreeEntry &TE) const {
2756 if (TE.getNumOperands() != getNumOperands())
2757 return false;
2758 SmallBitVector Used(getNumOperands());
2759 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2760 unsigned PrevCount = Used.count();
2761 for (unsigned K = 0; K < E; ++K) {
2762 if (Used.test(K))
2763 continue;
2764 if (getOperand(K) == TE.getOperand(I)) {
2765 Used.set(K);
2766 break;
2767 }
2768 }
2769 // Check if we actually found the matching operand.
2770 if (PrevCount == Used.count())
2771 return false;
2772 }
2773 return true;
2774 }
2775
2776 /// \return Final vectorization factor for the node. Defined by the total
2777 /// number of vectorized scalars, including those used several times in the
2778 /// entry and counted in the \a ReuseShuffleIndices, if any.
2779 unsigned getVectorFactor() const {
2780 if (!ReuseShuffleIndices.empty())
2781 return ReuseShuffleIndices.size();
2782 return Scalars.size();
2783 };
2784
2785 /// A vector of scalars.
2786 ValueList Scalars;
2787
2788 /// The Scalars are vectorized into this value. It is initialized to Null.
2789 WeakTrackingVH VectorizedValue = nullptr;
2790
2791 /// New vector phi instructions emitted for the vectorized phi nodes.
2792 PHINode *PHI = nullptr;
2793
2794 /// Do we need to gather this sequence or vectorize it
2795 /// (either with vector instruction or with scatter/gather
2796 /// intrinsics for store/load)?
2797 enum EntryState {
2798 Vectorize,
2799 ScatterVectorize,
2800 StridedVectorize,
2801 NeedToGather
2802 };
2803 EntryState State;
2804
2805 /// Does this sequence require some shuffling?
2806 SmallVector<int, 4> ReuseShuffleIndices;
2807
2808 /// Does this entry require reordering?
2809 SmallVector<unsigned, 4> ReorderIndices;
2810
2811 /// Points back to the VectorizableTree.
2812 ///
2813 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2814 /// to be a pointer and needs to be able to initialize the child iterator.
2815 /// Thus we need a reference back to the container to translate the indices
2816 /// to entries.
2817 VecTreeTy &Container;
2818
2819 /// The TreeEntry index containing the user of this entry. We can actually
2820 /// have multiple users so the data structure is not truly a tree.
2821 SmallVector<EdgeInfo, 1> UserTreeIndices;
2822
2823 /// The index of this treeEntry in VectorizableTree.
2824 int Idx = -1;
2825
2826 private:
2827 /// The operands of each instruction in each lane Operands[op_index][lane].
2828 /// Note: This helps avoid the replication of the code that performs the
2829 /// reordering of operands during buildTree_rec() and vectorizeTree().
2830 SmallVector<ValueList, 2> Operands;
2831
2832 /// The main/alternate instruction.
2833 Instruction *MainOp = nullptr;
2834 Instruction *AltOp = nullptr;
2835
2836 public:
2837 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2838 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2839 if (Operands.size() < OpIdx + 1)
2840 Operands.resize(OpIdx + 1);
2841 assert(Operands[OpIdx].empty() && "Already resized?");
2842 assert(OpVL.size() <= Scalars.size() &&
2843 "Number of operands is greater than the number of scalars.");
2844 Operands[OpIdx].resize(OpVL.size());
2845 copy(OpVL, Operands[OpIdx].begin());
2846 }
2847
2848 /// Set the operands of this bundle in their original order.
2849 void setOperandsInOrder() {
2850 assert(Operands.empty() && "Already initialized?");
2851 auto *I0 = cast<Instruction>(Scalars[0]);
2852 Operands.resize(I0->getNumOperands());
2853 unsigned NumLanes = Scalars.size();
2854 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2855 OpIdx != NumOperands; ++OpIdx) {
2856 Operands[OpIdx].resize(NumLanes);
2857 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2858 auto *I = cast<Instruction>(Scalars[Lane]);
2859 assert(I->getNumOperands() == NumOperands &&
2860 "Expected same number of operands");
2861 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2862 }
2863 }
2864 }
2865
2866 /// Reorders operands of the node to the given mask \p Mask.
2867 void reorderOperands(ArrayRef<int> Mask) {
2868 for (ValueList &Operand : Operands)
2869 reorderScalars(Operand, Mask);
2870 }
2871
2872 /// \returns the \p OpIdx operand of this TreeEntry.
2873 ValueList &getOperand(unsigned OpIdx) {
2874 assert(OpIdx < Operands.size() && "Off bounds");
2875 return Operands[OpIdx];
2876 }
2877
2878 /// \returns the \p OpIdx operand of this TreeEntry.
2879 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2880 assert(OpIdx < Operands.size() && "Off bounds");
2881 return Operands[OpIdx];
2882 }
2883
2884 /// \returns the number of operands.
2885 unsigned getNumOperands() const { return Operands.size(); }
2886
2887 /// \return the single \p OpIdx operand.
2888 Value *getSingleOperand(unsigned OpIdx) const {
2889 assert(OpIdx < Operands.size() && "Off bounds");
2890 assert(!Operands[OpIdx].empty() && "No operand available");
2891 return Operands[OpIdx][0];
2892 }
2893
2894 /// Some of the instructions in the list have alternate opcodes.
2895 bool isAltShuffle() const { return MainOp != AltOp; }
2896
2897 bool isOpcodeOrAlt(Instruction *I) const {
2898 unsigned CheckedOpcode = I->getOpcode();
2899 return (getOpcode() == CheckedOpcode ||
2900 getAltOpcode() == CheckedOpcode);
2901 }
2902
2903 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2904 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2905 /// \p OpValue.
2906 Value *isOneOf(Value *Op) const {
2907 auto *I = dyn_cast<Instruction>(Op);
2908 if (I && isOpcodeOrAlt(I))
2909 return Op;
2910 return MainOp;
2911 }
2912
2913 void setOperations(const InstructionsState &S) {
2914 MainOp = S.MainOp;
2915 AltOp = S.AltOp;
2916 }
2917
2918 Instruction *getMainOp() const {
2919 return MainOp;
2920 }
2921
2922 Instruction *getAltOp() const {
2923 return AltOp;
2924 }
2925
2926 /// The main/alternate opcodes for the list of instructions.
2927 unsigned getOpcode() const {
2928 return MainOp ? MainOp->getOpcode() : 0;
2929 }
2930
2931 unsigned getAltOpcode() const {
2932 return AltOp ? AltOp->getOpcode() : 0;
2933 }
2934
2935 /// When ReuseReorderShuffleIndices is empty it just returns position of \p
2936 /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
2937 int findLaneForValue(Value *V) const {
2938 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2939 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2940 if (!ReorderIndices.empty())
2941 FoundLane = ReorderIndices[FoundLane];
2942 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2943 if (!ReuseShuffleIndices.empty()) {
2944 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2945 find(ReuseShuffleIndices, FoundLane));
2946 }
2947 return FoundLane;
2948 }
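 // Illustrative sketch (not part of the pass; `toyFindLane` is hypothetical):
 // the two remapping steps above on plain vectors. The position of the
 // scalar is first mapped through the reorder indices, then located inside
 // the reuse mask.
 // \code{.cpp}
 //   #include <algorithm>
 //   #include <vector>
 //
 //   static int toyFindLane(const std::vector<int> &Scalars, int V,
 //                          const std::vector<int> &ReorderIndices,
 //                          const std::vector<int> &ReuseShuffleIndices) {
 //     int FoundLane = static_cast<int>(
 //         std::find(Scalars.begin(), Scalars.end(), V) - Scalars.begin());
 //     if (!ReorderIndices.empty())
 //       FoundLane = ReorderIndices[FoundLane];
 //     if (!ReuseShuffleIndices.empty())
 //       FoundLane = static_cast<int>(
 //           std::find(ReuseShuffleIndices.begin(),
 //                     ReuseShuffleIndices.end(), FoundLane) -
 //           ReuseShuffleIndices.begin());
 //     return FoundLane;
 //   }
 // \endcode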
2949
2950 /// Build a shuffle mask for graph entry which represents a merge of main
2951 /// and alternate operations.
2952 void
2953 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2954 SmallVectorImpl<int> &Mask,
2955 SmallVectorImpl<Value *> *OpScalars = nullptr,
2956 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2957
2958 /// Return true if this is a non-power-of-2 node.
2959 bool isNonPowOf2Vec() const {
2960 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
2961 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
2962 "Reshuffling not supported with non-power-of-2 vectors yet.");
2963 return IsNonPowerOf2;
2964 }
2965
2966#ifndef NDEBUG
2967 /// Debug printer.
2968 LLVM_DUMP_METHOD void dump() const {
2969 dbgs() << Idx << ".\n";
2970 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2971 dbgs() << "Operand " << OpI << ":\n";
2972 for (const Value *V : Operands[OpI])
2973 dbgs().indent(2) << *V << "\n";
2974 }
2975 dbgs() << "Scalars: \n";
2976 for (Value *V : Scalars)
2977 dbgs().indent(2) << *V << "\n";
2978 dbgs() << "State: ";
2979 switch (State) {
2980 case Vectorize:
2981 dbgs() << "Vectorize\n";
2982 break;
2983 case ScatterVectorize:
2984 dbgs() << "ScatterVectorize\n";
2985 break;
2986 case StridedVectorize:
2987 dbgs() << "StridedVectorize\n";
2988 break;
2989 case NeedToGather:
2990 dbgs() << "NeedToGather\n";
2991 break;
2992 }
2993 dbgs() << "MainOp: ";
2994 if (MainOp)
2995 dbgs() << *MainOp << "\n";
2996 else
2997 dbgs() << "NULL\n";
2998 dbgs() << "AltOp: ";
2999 if (AltOp)
3000 dbgs() << *AltOp << "\n";
3001 else
3002 dbgs() << "NULL\n";
3003 dbgs() << "VectorizedValue: ";
3004 if (VectorizedValue)
3005 dbgs() << *VectorizedValue << "\n";
3006 else
3007 dbgs() << "NULL\n";
3008 dbgs() << "ReuseShuffleIndices: ";
3009 if (ReuseShuffleIndices.empty())
3010 dbgs() << "Empty";
3011 else
3012 for (int ReuseIdx : ReuseShuffleIndices)
3013 dbgs() << ReuseIdx << ", ";
3014 dbgs() << "\n";
3015 dbgs() << "ReorderIndices: ";
3016 for (unsigned ReorderIdx : ReorderIndices)
3017 dbgs() << ReorderIdx << ", ";
3018 dbgs() << "\n";
3019 dbgs() << "UserTreeIndices: ";
3020 for (const auto &EInfo : UserTreeIndices)
3021 dbgs() << EInfo << ", ";
3022 dbgs() << "\n";
3023 }
3024#endif
3025 };
3026
3027#ifndef NDEBUG
3028 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3029 InstructionCost VecCost, InstructionCost ScalarCost,
3030 StringRef Banner) const {
3031 dbgs() << "SLP: " << Banner << ":\n";
3032 E->dump();
3033 dbgs() << "SLP: Costs:\n";
3034 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3035 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3036 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3037 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3038 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3039 }
3040#endif
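 // Illustrative note (not from the original comments): the printed sum is
 // roughly the profitability metric for the entry. For example, with
 // ReuseShuffleCost = 1, VecCost = 2 and ScalarCost = 6, the combined value
 // 1 + 2 - 6 = -3 is negative, i.e. the vectorized form is expected to be
 // cheaper than keeping the scalars.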
3041
3042 /// Create a new VectorizableTree entry.
3043 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3044 std::optional<ScheduleData *> Bundle,
3045 const InstructionsState &S,
3046 const EdgeInfo &UserTreeIdx,
3047 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3048 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3049 TreeEntry::EntryState EntryState =
3050 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3051 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3052 ReuseShuffleIndices, ReorderIndices);
3053 }
3054
3055 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3056 TreeEntry::EntryState EntryState,
3057 std::optional<ScheduleData *> Bundle,
3058 const InstructionsState &S,
3059 const EdgeInfo &UserTreeIdx,
3060 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3061 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3062 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3063 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3064 "Need to vectorize gather entry?");
3065 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3066 TreeEntry *Last = VectorizableTree.back().get();
3067 Last->Idx = VectorizableTree.size() - 1;
3068 Last->State = EntryState;
3069 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3070 ReuseShuffleIndices.end());
3071 if (ReorderIndices.empty()) {
3072 Last->Scalars.assign(VL.begin(), VL.end());
3073 Last->setOperations(S);
3074 } else {
3075 // Reorder scalars and build final mask.
3076 Last->Scalars.assign(VL.size(), nullptr);
3077 transform(ReorderIndices, Last->Scalars.begin(),
3078 [VL](unsigned Idx) -> Value * {
3079 if (Idx >= VL.size())
3080 return UndefValue::get(VL.front()->getType());
3081 return VL[Idx];
3082 });
3083 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3084 Last->setOperations(S);
3085 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3086 }
3087 if (Last->State != TreeEntry::NeedToGather) {
3088 for (Value *V : VL) {
3089 const TreeEntry *TE = getTreeEntry(V);
3090 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3091 "Scalar already in tree!");
3092 if (TE) {
3093 if (TE != Last)
3094 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3095 continue;
3096 }
3097 ScalarToTreeEntry[V] = Last;
3098 }
3099 // Update the scheduler bundle to point to this TreeEntry.
3100 ScheduleData *BundleMember = *Bundle;
3101 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3102 isVectorLikeInstWithConstOps(S.MainOp) ||
3103 doesNotNeedToSchedule(VL)) &&
3104 "Bundle and VL out of sync");
3105 if (BundleMember) {
3106 for (Value *V : VL) {
3107 if (doesNotNeedToBeScheduled(V))
3108 continue;
3109 if (!BundleMember)
3110 continue;
3111 BundleMember->TE = Last;
3112 BundleMember = BundleMember->NextInBundle;
3113 }
3114 }
3115 assert(!BundleMember && "Bundle and VL out of sync");
3116 } else {
3117 // Build a map for gathered scalars to the nodes where they are used.
3118 bool AllConstsOrCasts = true;
3119 for (Value *V : VL)
3120 if (!isConstant(V)) {
3121 auto *I = dyn_cast<CastInst>(V);
3122 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3123 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3124 }
3125 if (AllConstsOrCasts)
3126 CastMaxMinBWSizes =
3127 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3128 MustGather.insert(VL.begin(), VL.end());
3129 }
3130
3131 if (UserTreeIdx.UserTE) {
3132 Last->UserTreeIndices.push_back(UserTreeIdx);
3133 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3134 "Reordering isn't implemented for non-power-of-2 nodes yet");
3135 }
3136 return Last;
3137 }
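 // Rough usage sketch (illustrative only, names taken from the surrounding
 // code): a caller that failed to schedule a bundle typically creates a
 // gather node, e.g.
 // newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
 // while a successfully scheduled bundle becomes a vectorized node:
 // newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
 // The first overload derives the entry state from whether Bundle is engaged.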
3138
3139 /// -- Vectorization State --
3140 /// Holds all of the tree entries.
3141 TreeEntry::VecTreeTy VectorizableTree;
3142
3143#ifndef NDEBUG
3144 /// Debug printer.
3145 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3146 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3147 VectorizableTree[Id]->dump();
3148 dbgs() << "\n";
3149 }
3150 }
3151#endif
3152
3153 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3154
3155 const TreeEntry *getTreeEntry(Value *V) const {
3156 return ScalarToTreeEntry.lookup(V);
3157 }
3158
3159 /// Check that the operand node of alternate node does not generate
3160 /// buildvector sequence. If it is, then probably not worth it to build
3161 /// alternate shuffle, if number of buildvector operands + alternate
3162 /// instruction > than the number of buildvector instructions.
3163 /// \param S the instructions state of the analyzed values.
3164 /// \param VL list of the instructions with alternate opcodes.
3165 bool areAltOperandsProfitable(const InstructionsState &S,
3166 ArrayRef<Value *> VL) const;
3167
3168 /// Checks if the specified list of the instructions/values can be vectorized
3169 /// and fills required data before actual scheduling of the instructions.
3170 TreeEntry::EntryState getScalarsVectorizationState(
3171 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3172 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3173
3174 /// Maps a specific scalar to its tree entry.
3175 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3176
3177 /// Maps scalars that are used in several vectorized nodes to the list of
3178 /// those nodes.
3179 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3180
3181 /// Maps a value to the proposed vectorizable size.
3182 SmallDenseMap<Value *, unsigned> InstrElementSize;
3183
3184 /// A list of scalars that we found that we need to keep as scalars.
3185 ValueSet MustGather;
3186
3187 /// A set of first non-schedulable values.
3188 ValueSet NonScheduledFirst;
3189
3190 /// A map between the vectorized entries and the last instructions in the
3191 /// bundles. The bundles are built in use order, not in the def order of the
3192 /// instructions, so we cannot rely on the last instruction in the bundle
3193 /// also being the last one in program order during the vectorization
3194 /// process (the basic blocks are modified by it); these instructions need
3195 /// to be pre-gathered beforehand.
3196 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3197
3198 /// List of gather nodes that depend on other gather/vector nodes and should
3199 /// be emitted after the vector instruction emission process to correctly
3200 /// handle the order of the vector instructions and shuffles.
3201 SetVector<const TreeEntry *> PostponedGathers;
3202
3203 using ValueToGatherNodesMap =
3204 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3205 ValueToGatherNodesMap ValueToGatherNodes;
3206
3207 /// This POD struct describes one external user in the vectorized tree.
3208 struct ExternalUser {
3209 ExternalUser(Value *S, llvm::User *U, int L)
3210 : Scalar(S), User(U), Lane(L) {}
3211
3212 // Which scalar in our function.
3213 Value *Scalar;
3214
3215 // The user that uses the scalar.
3216 llvm::User *User;
3217
3218 // Which lane does the scalar belong to.
3219 int Lane;
3220 };
3221 using UserList = SmallVector<ExternalUser, 16>;
3222
3223 /// Checks if two instructions may access the same memory.
3224 ///
3225 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3226 /// is invariant in the calling loop.
3227 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3228 Instruction *Inst2) {
3229 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3230 return true;
3231 // First check if the result is already in the cache.
3232 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3233 auto It = AliasCache.find(Key);
3234 if (It != AliasCache.end())
3235 return It->second;
3236 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3237 // Store the result in the cache.
3238 AliasCache.try_emplace(Key, Aliased);
3239 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3240 return Aliased;
3241 }
3242
3243 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3244
3245 /// Cache for alias results.
3246 /// TODO: consider moving this to the AliasAnalysis itself.
3247 DenseMap<AliasCacheKey, bool> AliasCache;
3248
3249 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3250 // globally through SLP because we don't perform any action which
3251 // invalidates capture results.
3252 BatchAAResults BatchAA;
3253
3254 /// Temporary store for deleted instructions. Instructions will be deleted
3255 /// eventually when the BoUpSLP is destructed. The deferral is required to
3256 /// ensure that there are no incorrect collisions in the AliasCache, which
3257 /// can happen if a new instruction is allocated at the same address as a
3258 /// previously deleted instruction.
3259 DenseSet<Instruction *> DeletedInstructions;
3260
3261 /// Set of the instruction, being analyzed already for reductions.
3262 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3263
3264 /// Set of hashes for the list of reduction values already being analyzed.
3265 DenseSet<size_t> AnalyzedReductionVals;
3266
3267 /// Values that have already been analyzed for minimal bitwidth and found
3268 /// to be non-profitable.
3269 DenseSet<Value *> AnalyzedMinBWVals;
3270
3271 /// A list of values that need to be extracted out of the tree.
3272 /// This list holds pairs of (Internal Scalar : External User). External User
3273 /// can be nullptr, it means that this Internal Scalar will be used later,
3274 /// after vectorization.
3275 UserList ExternalUses;
3276
3277 /// A list of GEPs which can be replaced by scalar GEPs instead of
3278 /// extractelement instructions.
3279 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3280
3281 /// Values used only by @llvm.assume calls.
3283
3284 /// Holds all of the instructions that we gathered, shuffle instructions and
3285 /// extractelements.
3286 SetVector<Instruction *> GatherShuffleExtractSeq;
3287
3288 /// A list of blocks that we are going to CSE.
3289 DenseSet<BasicBlock *> CSEBlocks;
3290
3291 /// Contains all scheduling relevant data for an instruction.
3292 /// A ScheduleData either represents a single instruction or a member of an
3293 /// instruction bundle (= a group of instructions which is combined into a
3294 /// vector instruction).
3295 struct ScheduleData {
3296 // The initial value for the dependency counters. It means that the
3297 // dependencies are not calculated yet.
3298 enum { InvalidDeps = -1 };
3299
3300 ScheduleData() = default;
3301
3302 void init(int BlockSchedulingRegionID, Value *OpVal) {
3303 FirstInBundle = this;
3304 NextInBundle = nullptr;
3305 NextLoadStore = nullptr;
3306 IsScheduled = false;
3307 SchedulingRegionID = BlockSchedulingRegionID;
3308 clearDependencies();
3309 OpValue = OpVal;
3310 TE = nullptr;
3311 }
3312
3313 /// Verify basic self consistency properties
3314 void verify() {
3315 if (hasValidDependencies()) {
3316 assert(UnscheduledDeps <= Dependencies && "invariant");
3317 } else {
3318 assert(UnscheduledDeps == Dependencies && "invariant");
3319 }
3320
3321 if (IsScheduled) {
3322 assert(isSchedulingEntity() &&
3323 "unexpected scheduled state");
3324 for (const ScheduleData *BundleMember = this; BundleMember;
3325 BundleMember = BundleMember->NextInBundle) {
3326 assert(BundleMember->hasValidDependencies() &&
3327 BundleMember->UnscheduledDeps == 0 &&
3328 "unexpected scheduled state");
3329 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3330 "only bundle is marked scheduled");
3331 }
3332 }
3333
3334 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3335 "all bundle members must be in same basic block");
3336 }
3337
3338 /// Returns true if the dependency information has been calculated.
3339 /// Note that dependency validity can vary between instructions within
3340 /// a single bundle.
3341 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3342
3343 /// Returns true for single instructions and for bundle representatives
3344 /// (= the head of a bundle).
3345 bool isSchedulingEntity() const { return FirstInBundle == this; }
3346
3347 /// Returns true if it represents an instruction bundle and not only a
3348 /// single instruction.
3349 bool isPartOfBundle() const {
3350 return NextInBundle != nullptr || FirstInBundle != this || TE;
3351 }
3352
3353 /// Returns true if it is ready for scheduling, i.e. it has no more
3354 /// unscheduled depending instructions/bundles.
3355 bool isReady() const {
3356 assert(isSchedulingEntity() &&
3357 "can't consider non-scheduling entity for ready list");
3358 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3359 }
3360
3361 /// Modifies the number of unscheduled dependencies for this instruction,
3362 /// and returns the number of remaining dependencies for the containing
3363 /// bundle.
3364 int incrementUnscheduledDeps(int Incr) {
3365 assert(hasValidDependencies() &&
3366 "increment of unscheduled deps would be meaningless");
3367 UnscheduledDeps += Incr;
3368 return FirstInBundle->unscheduledDepsInBundle();
3369 }
3370
3371 /// Sets the number of unscheduled dependencies to the number of
3372 /// dependencies.
3373 void resetUnscheduledDeps() {
3374 UnscheduledDeps = Dependencies;
3375 }
3376
3377 /// Clears all dependency information.
3378 void clearDependencies() {
3379 Dependencies = InvalidDeps;
3380 resetUnscheduledDeps();
3381 MemoryDependencies.clear();
3382 ControlDependencies.clear();
3383 }
3384
3385 int unscheduledDepsInBundle() const {
3386 assert(isSchedulingEntity() && "only meaningful on the bundle");
3387 int Sum = 0;
3388 for (const ScheduleData *BundleMember = this; BundleMember;
3389 BundleMember = BundleMember->NextInBundle) {
3390 if (BundleMember->UnscheduledDeps == InvalidDeps)
3391 return InvalidDeps;
3392 Sum += BundleMember->UnscheduledDeps;
3393 }
3394 return Sum;
3395 }
3396
3397 void dump(raw_ostream &os) const {
3398 if (!isSchedulingEntity()) {
3399 os << "/ " << *Inst;
3400 } else if (NextInBundle) {
3401 os << '[' << *Inst;
3402 ScheduleData *SD = NextInBundle;
3403 while (SD) {
3404 os << ';' << *SD->Inst;
3405 SD = SD->NextInBundle;
3406 }
3407 os << ']';
3408 } else {
3409 os << *Inst;
3410 }
3411 }
3412
3413 Instruction *Inst = nullptr;
3414
3415 /// Opcode of the current instruction in the schedule data.
3416 Value *OpValue = nullptr;
3417
3418 /// The TreeEntry that this instruction corresponds to.
3419 TreeEntry *TE = nullptr;
3420
3421 /// Points to the head in an instruction bundle (and always to this for
3422 /// single instructions).
3423 ScheduleData *FirstInBundle = nullptr;
3424
3425 /// Single linked list of all instructions in a bundle. Null if it is a
3426 /// single instruction.
3427 ScheduleData *NextInBundle = nullptr;
3428
3429 /// Single linked list of all memory instructions (e.g. load, store, call)
3430 /// in the block - until the end of the scheduling region.
3431 ScheduleData *NextLoadStore = nullptr;
3432
3433 /// The dependent memory instructions.
3434 /// This list is derived on demand in calculateDependencies().
3435 SmallVector<ScheduleData *, 4> MemoryDependencies;
3436
3437 /// List of instructions which this instruction could be control dependent
3438 /// on. Allowing such nodes to be scheduled below this one could introduce
3439 /// a runtime fault which didn't exist in the original program.
3440 /// ex: this is a load or udiv following a readonly call which inf loops
3441 SmallVector<ScheduleData *, 4> ControlDependencies;
3442
3443 /// This ScheduleData is in the current scheduling region if this matches
3444 /// the current SchedulingRegionID of BlockScheduling.
3445 int SchedulingRegionID = 0;
3446
3447 /// Used for getting a "good" final ordering of instructions.
3448 int SchedulingPriority = 0;
3449
3450 /// The number of dependencies. This is the number of users of the
3451 /// instruction plus the number of dependent memory instructions (if any).
3452 /// This value is calculated on demand.
3453 /// If InvalidDeps, the number of dependencies is not calculated yet.
3454 int Dependencies = InvalidDeps;
3455
3456 /// The number of dependencies minus the number of dependencies of scheduled
3457 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3458 /// for scheduling.
3459 /// Note that this is negative as long as Dependencies is not calculated.
3460 int UnscheduledDeps = InvalidDeps;
3461
3462 /// True if this instruction is scheduled (or considered as scheduled in the
3463 /// dry-run).
3464 bool IsScheduled = false;
3465 };
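 // Illustrative note (not from the original comments): the counters above
 // implement a simple countdown. If an instruction has, roughly, 3 users
 // inside the region and 1 memory dependency, Dependencies is computed as 4
 // and UnscheduledDeps starts at 4; each time one of those users or dependent
 // instructions is scheduled, incrementUnscheduledDeps(-1) is applied, and
 // once the bundle-wide sum reaches 0 the bundle becomes ready (see
 // isReady()).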
3466
3467#ifndef NDEBUG
3468 friend inline raw_ostream &operator<<(raw_ostream &os,
3469 const BoUpSLP::ScheduleData &SD) {
3470 SD.dump(os);
3471 return os;
3472 }
3473#endif
3474
3475 friend struct GraphTraits<BoUpSLP *>;
3476 friend struct DOTGraphTraits<BoUpSLP *>;
3477
3478 /// Contains all scheduling data for a basic block.
3479 /// It does not schedule instructions that are not memory read/write
3480 /// instructions and whose operands are either constants, arguments, phis,
3481 /// or instructions from other blocks, or whose users are phis or belong to
3482 /// other blocks. The resulting vector instructions can be placed at the
3483 /// beginning of the basic block without scheduling (if their operands do
3484 /// not need to be scheduled) or at the end of the block (if their users are
3485 /// outside of the block). This saves some compile time and memory used by
3486 /// the compiler.
3487 /// ScheduleData is assigned to each instruction between the boundaries of
3488 /// the tree entry, even to those that are not part of the graph. It is
3489 /// required to correctly follow the dependencies between the instructions
3490 /// and to schedule them correctly. ScheduleData is not allocated for
3491 /// instructions that do not require scheduling, such as phis, nodes with
3492 /// only extractelements/insertelements, or nodes whose instructions have
3493 /// uses/operands outside of the block.
3494 struct BlockScheduling {
3495 BlockScheduling(BasicBlock *BB)
3496 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3497
3498 void clear() {
3499 ReadyInsts.clear();
3500 ScheduleStart = nullptr;
3501 ScheduleEnd = nullptr;
3502 FirstLoadStoreInRegion = nullptr;
3503 LastLoadStoreInRegion = nullptr;
3504 RegionHasStackSave = false;
3505
3506 // Reduce the maximum schedule region size by the size of the
3507 // previous scheduling run.
3508 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3509 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3510 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3511 ScheduleRegionSize = 0;
3512
3513 // Make a new scheduling region, i.e. all existing ScheduleData is not
3514 // in the new region yet.
3515 ++SchedulingRegionID;
3516 }
3517
3518 ScheduleData *getScheduleData(Instruction *I) {
3519 if (BB != I->getParent())
3520 // Avoid lookup if can't possibly be in map.
3521 return nullptr;
3522 ScheduleData *SD = ScheduleDataMap.lookup(I);
3523 if (SD && isInSchedulingRegion(SD))
3524 return SD;
3525 return nullptr;
3526 }
3527
3528 ScheduleData *getScheduleData(Value *V) {
3529 if (auto *I = dyn_cast<Instruction>(V))
3530 return getScheduleData(I);
3531 return nullptr;
3532 }
3533
3534 ScheduleData *getScheduleData(Value *V, Value *Key) {
3535 if (V == Key)
3536 return getScheduleData(V);
3537 auto I = ExtraScheduleDataMap.find(V);
3538 if (I != ExtraScheduleDataMap.end()) {
3539 ScheduleData *SD = I->second.lookup(Key);
3540 if (SD && isInSchedulingRegion(SD))
3541 return SD;
3542 }
3543 return nullptr;
3544 }
3545
3546 bool isInSchedulingRegion(ScheduleData *SD) const {
3547 return SD->SchedulingRegionID == SchedulingRegionID;
3548 }
3549
3550 /// Marks an instruction as scheduled and puts all dependent ready
3551 /// instructions into the ready-list.
3552 template <typename ReadyListType>
3553 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3554 SD->IsScheduled = true;
3555 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3556
3557 for (ScheduleData *BundleMember = SD; BundleMember;
3558 BundleMember = BundleMember->NextInBundle) {
3559 if (BundleMember->Inst != BundleMember->OpValue)
3560 continue;
3561
3562 // Handle the def-use chain dependencies.
3563
3564 // Decrement the unscheduled counter and insert to ready list if ready.
3565 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3566 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3567 if (OpDef && OpDef->hasValidDependencies() &&
3568 OpDef->incrementUnscheduledDeps(-1) == 0) {
3569 // There are no more unscheduled dependencies after
3570 // decrementing, so we can put the dependent instruction
3571 // into the ready list.
3572 ScheduleData *DepBundle = OpDef->FirstInBundle;
3573 assert(!DepBundle->IsScheduled &&
3574 "already scheduled bundle gets ready");
3575 ReadyList.insert(DepBundle);
3576 LLVM_DEBUG(dbgs()
3577 << "SLP: gets ready (def): " << *DepBundle << "\n");
3578 }
3579 });
3580 };
3581
3582 // If BundleMember is a vector bundle, its operands may have been
3583 // reordered during buildTree(). We therefore need to get its operands
3584 // through the TreeEntry.
3585 if (TreeEntry *TE = BundleMember->TE) {
3586 // Need to search for the lane since the tree entry can be reordered.
3587 int Lane = std::distance(TE->Scalars.begin(),
3588 find(TE->Scalars, BundleMember->Inst));
3589 assert(Lane >= 0 && "Lane not set");
3590
3591 // Since vectorization tree is being built recursively this assertion
3592 // ensures that the tree entry has all operands set before reaching
3593 // this code. Couple of exceptions known at the moment are extracts
3594 // where their second (immediate) operand is not added. Since
3595 // immediates do not affect scheduler behavior this is considered
3596 // okay.
3597 auto *In = BundleMember->Inst;
3598 assert(
3599 In &&
3600 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3601 In->getNumOperands() == TE->getNumOperands()) &&
3602 "Missed TreeEntry operands?");
3603 (void)In; // fake use to avoid build failure when assertions disabled
3604
3605 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3606 OpIdx != NumOperands; ++OpIdx)
3607 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3608 DecrUnsched(I);
3609 } else {
3610 // If BundleMember is a stand-alone instruction, no operand reordering
3611 // has taken place, so we directly access its operands.
3612 for (Use &U : BundleMember->Inst->operands())
3613 if (auto *I = dyn_cast<Instruction>(U.get()))
3614 DecrUnsched(I);
3615 }
3616 // Handle the memory dependencies.
3617 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3618 if (MemoryDepSD->hasValidDependencies() &&
3619 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3620 // There are no more unscheduled dependencies after decrementing,
3621 // so we can put the dependent instruction into the ready list.
3622 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3623 assert(!DepBundle->IsScheduled &&
3624 "already scheduled bundle gets ready");
3625 ReadyList.insert(DepBundle);
3627 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3628 }
3629 }
3630 // Handle the control dependencies.
3631 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3632 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3633 // There are no more unscheduled dependencies after decrementing,
3634 // so we can put the dependent instruction into the ready list.
3635 ScheduleData *DepBundle = DepSD->FirstInBundle;
3636 assert(!DepBundle->IsScheduled &&
3637 "already scheduled bundle gets ready");
3638 ReadyList.insert(DepBundle);
3640 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3641 }
3642 }
3643 }
3644 }
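 // Illustrative note (not from the original comments): scheduling a bundle
 // walks every member and decrements the unscheduled-dependency counters of
 // its operand definitions (looked up through the TreeEntry lanes when the
 // operands were reordered by buildTree()), of its memory dependencies and of
 // its control dependencies; any bundle whose counter drops to zero is
 // inserted into \p ReadyList.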
3645
3646 /// Verify basic self consistency properties of the data structure.
3647 void verify() {
3648 if (!ScheduleStart)
3649 return;
3650
3651 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3652 ScheduleStart->comesBefore(ScheduleEnd) &&
3653 "Not a valid scheduling region?");
3654
3655 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3656 auto *SD = getScheduleData(I);
3657 if (!SD)
3658 continue;
3659 assert(isInSchedulingRegion(SD) &&
3660 "primary schedule data not in window?");
3661 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3662 "entire bundle in window!");
3663 (void)SD;
3664 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3665 }
3666
3667 for (auto *SD : ReadyInsts) {
3668 assert(SD->isSchedulingEntity() && SD->isReady() &&
3669 "item in ready list not ready?");
3670 (void)SD;
3671 }
3672 }
3673
3674 void doForAllOpcodes(Value *V,
3675 function_ref<void(ScheduleData *SD)> Action) {
3676 if (ScheduleData *SD = getScheduleData(V))
3677 Action(SD);
3678 auto I = ExtraScheduleDataMap.find(V);
3679 if (I != ExtraScheduleDataMap.end())
3680 for (auto &P : I->second)
3681 if (isInSchedulingRegion(P.second))
3682 Action(P.second);
3683 }
3684
3685 /// Put all instructions into the ReadyList which are ready for scheduling.
3686 template <typename ReadyListType>
3687 void initialFillReadyList(ReadyListType &ReadyList) {
3688 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3689 doForAllOpcodes(I, [&](ScheduleData *SD) {
3690 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3691 SD->isReady()) {
3692 ReadyList.insert(SD);
3693 LLVM_DEBUG(dbgs()
3694 << "SLP: initially in ready list: " << *SD << "\n");
3695 }
3696 });
3697 }
3698 }
3699
3700 /// Build a bundle from the ScheduleData nodes corresponding to the
3701 /// scalar instruction for each lane.
3702 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3703
3704 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3705 /// cyclic dependencies. This is only a dry-run, no instructions are
3706 /// actually moved at this stage.
3707 /// \returns the scheduling bundle. The returned Optional value is not
3708 /// std::nullopt if \p VL is allowed to be scheduled.
3709 std::optional<ScheduleData *>
3710 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3711 const InstructionsState &S);
3712
3713 /// Un-bundles a group of instructions.
3714 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3715
3716 /// Allocates schedule data chunk.
3717 ScheduleData *allocateScheduleDataChunks();
3718
3719 /// Extends the scheduling region so that V is inside the region.
3720 /// \returns true if the region size is within the limit.
3721 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3722
3723 /// Initialize the ScheduleData structures for new instructions in the
3724 /// scheduling region.
3725 void initScheduleData(Instruction *FromI, Instruction *ToI,
3726 ScheduleData *PrevLoadStore,
3727 ScheduleData *NextLoadStore);
3728
3729 /// Updates the dependency information of a bundle and of all instructions/
3730 /// bundles which depend on the original bundle.
3731 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3732 BoUpSLP *SLP);
3733
3734 /// Sets all instructions in the scheduling region to un-scheduled.
3735 void resetSchedule();
3736
3737 BasicBlock *BB;
3738
3739 /// Simple memory allocation for ScheduleData.
3740 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3741
3742 /// The size of a ScheduleData array in ScheduleDataChunks.
3743 int ChunkSize;
3744
3745 /// The allocator position in the current chunk, which is the last entry
3746 /// of ScheduleDataChunks.
3747 int ChunkPos;
3748
3749 /// Attaches ScheduleData to Instruction.
3750 /// Note that the mapping survives during all vectorization iterations, i.e.
3751 /// ScheduleData structures are recycled.
3752 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3753
3754 /// Attaches ScheduleData to Instruction with the leading key.
3755 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3756 ExtraScheduleDataMap;
3757
3758 /// The ready-list for scheduling (only used for the dry-run).
3759 SetVector<ScheduleData *> ReadyInsts;
3760
3761 /// The first instruction of the scheduling region.
3762 Instruction *ScheduleStart = nullptr;
3763
3764 /// The first instruction _after_ the scheduling region.
3765 Instruction *ScheduleEnd = nullptr;
3766
3767 /// The first memory accessing instruction in the scheduling region
3768 /// (can be null).
3769 ScheduleData *FirstLoadStoreInRegion = nullptr;
3770
3771 /// The last memory accessing instruction in the scheduling region
3772 /// (can be null).
3773 ScheduleData *LastLoadStoreInRegion = nullptr;
3774
3775 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3776 /// region? Used to optimize the dependence calculation for the
3777 /// common case where there isn't.
3778 bool RegionHasStackSave = false;
3779
3780 /// The current size of the scheduling region.
3781 int ScheduleRegionSize = 0;
3782
3783 /// The maximum size allowed for the scheduling region.
3784 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3785
3786 /// The ID of the scheduling region. For a new vectorization iteration this
3787 /// is incremented which "removes" all ScheduleData from the region.
3788 /// Make sure that the initial SchedulingRegionID is greater than the
3789 /// initial SchedulingRegionID in ScheduleData (which is 0).
3790 int SchedulingRegionID = 1;
3791 };
3792
3793 /// Attaches the BlockScheduling structures to basic blocks.
3794 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3795
3796 /// Performs the "real" scheduling. Done before vectorization is actually
3797 /// performed in a basic block.
3798 void scheduleBlock(BlockScheduling *BS);
3799
3800 /// List of users to ignore during scheduling and that don't need extracting.
3801 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3802
3803 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3804 /// sorted SmallVectors of unsigned.
3805 struct OrdersTypeDenseMapInfo {
3806 static OrdersType getEmptyKey() {
3807 OrdersType V;
3808 V.push_back(~1U);
3809 return V;
3810 }
3811
3812 static OrdersType getTombstoneKey() {
3813 OrdersType V;
3814 V.push_back(~2U);
3815 return V;
3816 }
3817
3818 static unsigned getHashValue(const OrdersType &V) {
3819 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3820 }
3821
3822 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3823 return LHS == RHS;
3824 }
3825 };
3826
3827 // Analysis and block reference.
3828 Function *F;
3829 ScalarEvolution *SE;
3830 TargetTransformInfo *TTI;
3831 TargetLibraryInfo *TLI;
3832 LoopInfo *LI;
3833 DominatorTree *DT;
3834 AssumptionCache *AC;
3835 DemandedBits *DB;
3836 const DataLayout *DL;
3838
3839 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3840 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3841
3842 /// Instruction builder to construct the vectorized tree.
3844
3845 /// A map of scalar integer values to the smallest bit width with which they
3846 /// can legally be represented. The values map to (width, signed) pairs,
3847 /// where "width" indicates the minimum bit width and "signed" is True if the
3848 /// value must be signed-extended, rather than zero-extended, back to its
3849 /// original width.
3851
3852 /// Final size of the reduced vector, if the current graph represents the
3853 /// input for the reduction and it was possible to narrow the size of the
3854 /// reduction.
3855 unsigned ReductionBitWidth = 0;
3856
3857 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
3858 /// type sizes, used in the tree.
3859 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3860
3861 /// Indices of the vectorized nodes, which are supposed to be the roots of
3862 /// the new bitwidth analysis attempt, like trunc, IToFP or ICmp.
3863 DenseSet<unsigned> ExtraBitWidthNodes;
3864};
3865
3866} // end namespace slpvectorizer
3867
3868template <> struct GraphTraits<BoUpSLP *> {
3869 using TreeEntry = BoUpSLP::TreeEntry;
3870
3871 /// NodeRef has to be a pointer per the GraphWriter.
3872 using NodeRef = TreeEntry *;
3873
3874 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3875
3876 /// Add the VectorizableTree to the index iterator to be able to return
3877 /// TreeEntry pointers.
3878 struct ChildIteratorType
3879 : public iterator_adaptor_base<
3880 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3881 ContainerTy &VectorizableTree;
3882
3883 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3884 ContainerTy &VT)
3885 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3886
3887 NodeRef operator*() { return I->UserTE; }
3888 };
3889
3890 static NodeRef getEntryNode(BoUpSLP &R) {
3891 return R.VectorizableTree[0].get();
3892 }
3893
3894 static ChildIteratorType child_begin(NodeRef N) {
3895 return {N->UserTreeIndices.begin(), N->Container};
3896 }
3897
3898 static ChildIteratorType child_end(NodeRef N) {
3899 return {N->UserTreeIndices.end(), N->Container};
3900 }
3901
3902 /// For the node iterator we just need to turn the TreeEntry iterator into a
3903 /// TreeEntry* iterator so that it dereferences to NodeRef.
3904 class nodes_iterator {
3905 using ItTy = ContainerTy::iterator;
3906 ItTy It;
3907
3908 public:
3909 nodes_iterator(const ItTy &It2) : It(It2) {}
3910 NodeRef operator*() { return It->get(); }
3911 nodes_iterator operator++() {
3912 ++It;
3913 return *this;
3914 }
3915 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3916 };
3917
3918 static nodes_iterator nodes_begin(BoUpSLP *R) {
3919 return nodes_iterator(R->VectorizableTree.begin());
3920 }
3921
3922 static nodes_iterator nodes_end(BoUpSLP *R) {
3923 return nodes_iterator(R->VectorizableTree.end());
3924 }
3925
3926 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3927};
3928
3929template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3930 using TreeEntry = BoUpSLP::TreeEntry;
3931
3932 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3933
3934 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3935 std::string Str;
3936 raw_string_ostream OS(Str);
3937 OS << Entry->Idx << ".\n";
3938 if (isSplat(Entry->Scalars))
3939 OS << "<splat> ";
3940 for (auto *V : Entry->Scalars) {
3941 OS << *V;
3942 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3943 return EU.Scalar == V;
3944 }))
3945 OS << " <extract>";
3946 OS << "\n";
3947 }
3948 return Str;
3949 }
3950
3951 static std::string getNodeAttributes(const TreeEntry *Entry,
3952 const BoUpSLP *) {
3953 if (Entry->State == TreeEntry::NeedToGather)
3954 return "color=red";
3955 if (Entry->State == TreeEntry::ScatterVectorize ||
3956 Entry->State == TreeEntry::StridedVectorize)
3957 return "color=blue";
3958 return "";
3959 }
3960};
3961
3962} // end namespace llvm
3963
3964BoUpSLP::~BoUpSLP() {
3965 SmallVector<WeakTrackingVH> DeadInsts;
3966 for (auto *I : DeletedInstructions) {
3967 for (Use &U : I->operands()) {
3968 auto *Op = dyn_cast<Instruction>(U.get());
3969 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3970 wouldInstructionBeTriviallyDead(Op, TLI))
3971 DeadInsts.emplace_back(Op);
3972 }
3973 I->dropAllReferences();
3974 }
3975 for (auto *I : DeletedInstructions) {
3976 assert(I->use_empty() &&
3977 "trying to erase instruction with users.");
3978 I->eraseFromParent();
3979 }
3980
3981 // Cleanup any dead scalar code feeding the vectorized instructions
3982 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
3983
3984#ifdef EXPENSIVE_CHECKS
3985 // If we could guarantee that this call is not extremely slow, we could
3986 // remove the ifdef limitation (see PR47712).
3987 assert(!verifyFunction(*F, &dbgs()));
3988#endif
3989}
3990
3991/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3992 /// contains the original mask for the scalars reused in the node. The
3993 /// procedure transforms this mask in accordance with the given \p Mask.
3994static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3995 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3996 "Expected non-empty mask.");
3997 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3998 Prev.swap(Reuses);
3999 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4000 if (Mask[I] != PoisonMaskElem)
4001 Reuses[Mask[I]] = Prev[I];
4002}
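// Worked example (not from the original comments): with Reuses = {0, 1, 1, 0}
// and Mask = {2, 3, 0, 1}, each original element Prev[I] is moved to slot
// Mask[I], giving Reuses = {1, 0, 0, 1}; PoisonMaskElem entries in the mask
// are skipped, and slots that no mask element maps to keep their previous
// values.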
4003
4004 /// Reorders the given \p Order according to the given \p Mask. \p Order is
4005 /// the original order of the scalars. The procedure transforms the provided
4006 /// order in accordance with the given \p Mask. If the resulting \p Order is
4007 /// just an identity order, \p Order is cleared.
4008static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4009 bool BottomOrder = false) {
4010 assert(!Mask.empty() && "Expected non-empty mask.");
4011 unsigned Sz = Mask.size();
4012 if (BottomOrder) {
4013 SmallVector<unsigned> PrevOrder;
4014 if (Order.empty()) {
4015 PrevOrder.resize(Sz);
4016 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4017 } else {
4018 PrevOrder.swap(Order);
4019 }
4020 Order.assign(Sz, Sz);
4021 for (unsigned I = 0; I < Sz; ++I)
4022 if (Mask[I] != PoisonMaskElem)
4023 Order[I] = PrevOrder[Mask[I]];
4024 if (all_of(enumerate(Order), [&](const auto &Data) {
4025 return Data.value() == Sz || Data.index() == Data.value();
4026 })) {
4027 Order.clear();
4028 return;
4029 }
4030 fixupOrderingIndices(Order);
4031 return;
4032 }
4033 SmallVector<int> MaskOrder;
4034 if (Order.empty()) {
4035 MaskOrder.resize(Sz);
4036 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4037 } else {
4038 inversePermutation(Order, MaskOrder);
4039 }
4040 reorderReuses(MaskOrder, Mask);
4041 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4042 Order.clear();
4043 return;
4044 }
4045 Order.assign(Sz, Sz);
4046 for (unsigned I = 0; I < Sz; ++I)
4047 if (MaskOrder[I] != PoisonMaskElem)
4048 Order[MaskOrder[I]] = I;
4049 fixupOrderingIndices(Order);
4050}
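// Worked example (not from the original comments): with an empty \p Order,
// Mask = {3, 2, 1, 0} and BottomOrder == false, the computed order is
// {3, 2, 1, 0}; an identity mask would instead clear \p Order entirely.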
4051
4052std::optional<BoUpSLP::OrdersType>
4053BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4054 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4055 // Try to find subvector extract/insert patterns and reorder only such
4056 // patterns.
4057 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4058 Type *ScalarTy = GatheredScalars.front()->getType();
4059 int NumScalars = GatheredScalars.size();
4060 if (!isValidElementType(ScalarTy))
4061 return std::nullopt;
4062 auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
4063 int NumParts = TTI->getNumberOfParts(VecTy);
4064 if (NumParts == 0 || NumParts >= NumScalars)
4065 NumParts = 1;
4066 SmallVector<int> ExtractMask;
4067 SmallVector<int> Mask;
4068 SmallVector<SmallVector<const TreeEntry *>> Entries;
4069 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
4070 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4071 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
4072 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4073 /*ForOrder=*/true);
4074 // No shuffled operands - ignore.
4075 if (GatherShuffles.empty() && ExtractShuffles.empty())
4076 return std::nullopt;
4077 OrdersType CurrentOrder(NumScalars, NumScalars);
4078 if (GatherShuffles.size() == 1 &&
4079 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4080 Entries.front().front()->isSame(TE.Scalars)) {
4081 // Perfect match in the graph, will reuse the previously vectorized
4082 // node. Cost is 0.
4083 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4084 return CurrentOrder;
4085 }
4086 auto IsSplatMask = [](ArrayRef<int> Mask) {
4087 int SingleElt = PoisonMaskElem;
4088 return all_of(Mask, [&](int I) {
4089 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4090 SingleElt = I;
4091 return I == PoisonMaskElem || I == SingleElt;
4092 });
4093 };
4094 // Exclusive broadcast mask - ignore.
4095 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4096 (Entries.size() != 1 ||
4097 Entries.front().front()->ReorderIndices.empty())) ||
4098 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4099 return std::nullopt;
4100 SmallBitVector ShuffledSubMasks(NumParts);
4101 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4102 ArrayRef<int> Mask, int PartSz, int NumParts,
4103 function_ref<unsigned(unsigned)> GetVF) {
4104 for (int I : seq<int>(0, NumParts)) {
4105 if (ShuffledSubMasks.test(I))
4106 continue;
4107 const int VF = GetVF(I);
4108 if (VF == 0)
4109 continue;
4110 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4111 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4112 // Shuffle of at least 2 vectors - ignore.
4113 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4114 std::fill(Slice.begin(), Slice.end(), NumScalars);
4115 ShuffledSubMasks.set(I);
4116 continue;
4117 }
4118 // Try to include as many elements from the mask as possible.
4119 int FirstMin = INT_MAX;
4120 bool SecondVecFound = false;
4121 for (int K : seq<int>(Limit)) {
4122 int Idx = Mask[I * PartSz + K];
4123 if (Idx == PoisonMaskElem) {
4124 Value *V = GatheredScalars[I * PartSz + K];
4125 if (isConstant(V) && !isa<PoisonValue>(V)) {
4126 SecondVecFound = true;
4127 break;
4128 }
4129 continue;
4130 }
4131 if (Idx < VF) {
4132 if (FirstMin > Idx)
4133 FirstMin = Idx;
4134 } else {
4135 SecondVecFound = true;
4136 break;
4137 }
4138 }
4139 FirstMin = (FirstMin / PartSz) * PartSz;
4140 // Shuffle of at least 2 vectors - ignore.
4141 if (SecondVecFound) {
4142 std::fill(Slice.begin(), Slice.end(), NumScalars);
4143 ShuffledSubMasks.set(I);
4144 continue;
4145 }
4146 for (int K : seq<int>(Limit)) {
4147 int Idx = Mask[I * PartSz + K];
4148 if (Idx == PoisonMaskElem)
4149 continue;
4150 Idx -= FirstMin;
4151 if (Idx >= PartSz) {
4152 SecondVecFound = true;
4153 break;
4154 }
4155 if (CurrentOrder[I * PartSz + Idx] >
4156 static_cast<unsigned>(I * PartSz + K) &&
4157 CurrentOrder[I * PartSz + Idx] !=
4158 static_cast<unsigned>(I * PartSz + Idx))
4159 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4160 }
4161 // Shuffle of at least 2 vectors - ignore.
4162 if (SecondVecFound) {
4163 std::fill(Slice.begin(), Slice.end(), NumScalars);
4164 ShuffledSubMasks.set(I);
4165 continue;
4166 }
4167 }
4168 };
4169 int PartSz = getPartNumElems(NumScalars, NumParts);
4170 if (!ExtractShuffles.empty())
4171 TransformMaskToOrder(
4172 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4173 if (!ExtractShuffles[I])
4174 return 0U;
4175 unsigned VF = 0;
4176 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4177 for (unsigned Idx : seq<unsigned>(Sz)) {
4178 int K = I * PartSz + Idx;
4179 if (ExtractMask[K] == PoisonMaskElem)
4180 continue;
4181 if (!TE.ReuseShuffleIndices.empty())
4182 K = TE.ReuseShuffleIndices[K];
4183 if (!TE.ReorderIndices.empty())
4184 K = std::distance(TE.ReorderIndices.begin(),
4185 find(TE.ReorderIndices, K));
4186 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4187 if (!EI)
4188 continue;
4189 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4190 ->getElementCount()
4191 .getKnownMinValue());
4192 }
4193 return VF;
4194 });
4195 // Check special corner case - single shuffle of the same entry.
4196 if (GatherShuffles.size() == 1 && NumParts != 1) {
4197 if (ShuffledSubMasks.any())
4198 return std::nullopt;
4199 PartSz = NumScalars;
4200 NumParts = 1;
4201 }
4202 if (!Entries.empty())
4203 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4204 if (!GatherShuffles[I])
4205 return 0U;
4206 return std::max(Entries[I].front()->getVectorFactor(),
4207 Entries[I].back()->getVectorFactor());
4208 });
4209 int NumUndefs =
4210 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4211 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4212 return std::nullopt;
4213 return std::move(CurrentOrder);
4214}
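// Illustrative note (not from the original comments): the common profitable
// case here is a gather node whose scalars exactly match an already
// vectorized entry; a single SK_PermuteSingleSrc shuffle whose entry passes
// the isSame() check yields the identity order {0, 1, ..., N-1}, signalling
// that the existing vector can be reused directly.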
4215
4216static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4217 const TargetLibraryInfo &TLI,
4218 bool CompareOpcodes = true) {
4219 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4220 return false;
4221 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4222 if (!GEP1)
4223 return false;
4224 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4225 if (!GEP2)
4226 return false;
4227 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4228 ((isConstant(GEP1->getOperand(1)) &&
4229 isConstant(GEP2->getOperand(1))) ||
4230 !CompareOpcodes ||
4231 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4232 .getOpcode());
4233}
4234
4235/// Calculates minimal alignment as a common alignment.
4236template <typename T>
4238 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4239 for (Value *V : VL.drop_front())
4240 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4241 return CommonAlignment;
4242}
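// Illustrative note (not from the original comments): for a bundle of loads
// with alignments {16, 8, 4, 8}, the common alignment computed above is 4,
// i.e. the weakest guarantee that holds for every element of the bundle.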
4243
4244/// Check if \p Order represents reverse order.
4246 unsigned Sz = Order.size();
4247 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4248 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4249 });
4250}
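// Illustrative note (not from the original comments): for Sz == 4 the order
// {3, 2, 1, 0} is reversed; entries equal to Sz (unordered/undefined slots)
// are accepted at any position, so {3, 4, 1, 0} is also treated as reversed.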
4251
4252 /// Checks if the provided list of pointers \p PointerOps represents strided
4253 /// pointers for type \p ElemTy. If they do not, std::nullopt is returned.
4254 /// Otherwise, if \p Inst is not specified, an engaged (but null) optional
4255 /// value is returned to show that the pointers are strided. If \p Inst is
4256 /// specified, the runtime stride is materialized before the given \p Inst.
4257 /// \returns std::nullopt if the pointers do not have a runtime stride;
4258 /// otherwise nullptr or the actual stride value.
4259static std::optional<Value *>
4261 const DataLayout &DL, ScalarEvolution &SE,
4262 SmallVectorImpl<unsigned> &SortedIndices,
4263 Instruction *Inst = nullptr) {
4264 SmallVector<const SCEV *> SCEVs;
4265 const SCEV *PtrSCEVLowest = nullptr;
4266 const SCEV *PtrSCEVHighest = nullptr;
4267 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4268 // addresses).
4269 for (Value *Ptr : PointerOps) {
4270 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4271 if (!PtrSCEV)
4272 return std::nullopt;
4273 SCEVs.push_back(PtrSCEV);
4274 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4275 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4276 continue;
4277 }
4278 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4279 if (isa<SCEVCouldNotCompute>(Diff))
4280 return std::nullopt;
4281 if (Diff->isNonConstantNegative()) {
4282 PtrSCEVLowest = PtrSCEV;
4283 continue;
4284 }
4285 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4286 if (isa<SCEVCouldNotCompute>(Diff1))
4287 return std::nullopt;
4288 if (Diff1->isNonConstantNegative()) {
4289 PtrSCEVHighest = PtrSCEV;
4290 continue;
4291 }
4292 }
4293 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4294 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4295 if (isa<SCEVCouldNotCompute>(Dist))
4296 return std::nullopt;
4297 int Size = DL.getTypeStoreSize(ElemTy);
4298 auto TryGetStride = [&](const SCEV *Dist,
4299 const SCEV *Multiplier) -> const SCEV * {
4300 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4301 if (M->getOperand(0) == Multiplier)
4302 return M->getOperand(1);
4303 if (M->getOperand(1) == Multiplier)
4304 return M->getOperand(0);
4305 return nullptr;
4306 }
4307 if (Multiplier == Dist)
4308 return SE.getConstant(Dist->getType(), 1);
4309 return SE.getUDivExactExpr(Dist, Multiplier);
4310 };
4311 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4312 const SCEV *Stride = nullptr;
4313 if (Size != 1 || SCEVs.size() > 2) {
4314 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4315 Stride = TryGetStride(Dist, Sz);
4316 if (!Stride)
4317 return std::nullopt;
4318 }
4319 if (!Stride || isa<SCEVConstant>(Stride))
4320 return std::nullopt;
4321 // Iterate through all pointers and check if all distances are
4322 // unique multiple of Stride.
4323 using DistOrdPair = std::pair<int64_t, int>;
4324 auto Compare = llvm::less_first();
4325 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4326 int Cnt = 0;
4327 bool IsConsecutive = true;
4328 for (const SCEV *PtrSCEV : SCEVs) {
4329 unsigned Dist = 0;
4330 if (PtrSCEV != PtrSCEVLowest) {
4331 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4332 const SCEV *Coeff = TryGetStride(Diff, Stride);
4333 if (!Coeff)
4334 return std::nullopt;
4335 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4336 if (!SC || isa<SCEVCouldNotCompute>(SC))
4337 return std::nullopt;
4338 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4339 SE.getMulExpr(Stride, SC)))
4340 ->isZero())
4341 return std::nullopt;
4342 Dist = SC->getAPInt().getZExtValue();
4343 }
4344 // If the strides are not the same or repeated, we can't vectorize.
4345 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4346 return std::nullopt;
4347 auto Res = Offsets.emplace(Dist, Cnt);
4348 if (!Res.second)
4349 return std::nullopt;
4350 // Consecutive order if the inserted element is the last one.
4351 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4352 ++Cnt;
4353 }
4354 if (Offsets.size() != SCEVs.size())
4355 return std::nullopt;
4356 SortedIndices.clear();
4357 if (!IsConsecutive) {
4358 // Fill SortedIndices array only if it is non-consecutive.
4359 SortedIndices.resize(PointerOps.size());
4360 Cnt = 0;
4361 for (const std::pair<int64_t, int> &Pair : Offsets) {
4362 SortedIndices[Cnt] = Pair.second;
4363 ++Cnt;
4364 }
4365 }
4366 if (!Inst)
4367 return nullptr;
4368 SCEVExpander Expander(SE, DL, "strided-load-vec");
4369 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4370}
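// Illustrative note (not from the original comments): for four i32 loads
// whose pointers are p, p + 4 * S, p + 8 * S and p + 12 * S bytes, with S a
// loop-invariant runtime value, Dist is 12 * S and element_size *
// (num_elems - 1) is 12, so the recovered stride is S (the stride must not
// be a compile-time constant on this path); if \p Inst is provided, S is
// additionally materialized via SCEVExpander right before that instruction.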
4371
4372static std::pair<InstructionCost, InstructionCost>
4373getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4374 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4375 Type *ScalarTy, VectorType *VecTy);
4376
4377BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4378 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4379 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4380 // Check that a vectorized load would load the same memory as a scalar
4381 // load. For example, we don't want to vectorize loads that are smaller
4382 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4383 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4384 // from such a struct, we read/write packed bits disagreeing with the
4385 // unvectorized version.
4386 Type *ScalarTy = VL0->getType();
4387
4388 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4389 return LoadsState::Gather;
4390
4391 // Make sure all loads in the bundle are simple - we can't vectorize
4392 // atomic or volatile loads.
4393 PointerOps.clear();
4394 const unsigned Sz = VL.size();
4395 PointerOps.resize(Sz);
4396 auto *POIter = PointerOps.begin();
4397 for (Value *V : VL) {
4398 auto *L = cast<LoadInst>(V);
4399 if (!L->isSimple())
4400 return LoadsState::Gather;
4401 *POIter = L->getPointerOperand();
4402 ++POIter;
4403 }
4404
4405 Order.clear();
4406 auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
4407 // Check the order of pointer operands or that all pointers are the same.
4408 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4409 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4410 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4411 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4412 "supported with VectorizeNonPowerOf2");
4413 return LoadsState::Gather;
4414 }
4415
4416 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4417 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4418 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4419 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4420 return LoadsState::StridedVectorize;
4421 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4422 return arePointersCompatible(P, PointerOps.front(), *TLI);
4423 })) {
4424 if (IsSorted) {
4425 Value *Ptr0;
4426 Value *PtrN;
4427 if (Order.empty()) {
4428 Ptr0 = PointerOps.front();
4429 PtrN = PointerOps.back();
4430 } else {
4431 Ptr0 = PointerOps[Order.front()];
4432 PtrN = PointerOps[Order.back()];
4433 }
4434 std::optional<int> Diff =
4435 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4436 // Check that the sorted loads are consecutive.
4437 if (static_cast<unsigned>(*Diff) == Sz - 1)
4438 return LoadsState::Vectorize;
4439 // Simple check if not a strided access - clear order.
4440 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4441 // Try to generate strided load node if:
4442 // 1. Target with strided load support is detected.
4443 // 2. The number of loads is greater than MinProfitableStridedLoads,
4444 // or the potential stride <= MaxProfitableLoadStride and the
4445 // potential stride is power-of-2 (to avoid perf regressions for the very
4446 // small number of loads) and max distance > number of loads, or potential
4447 // stride is -1.
4448 // 3. The loads are ordered, or number of unordered loads <=
4449 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4450 // (this check is to avoid extra costs for very expensive shuffles).
4451 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4452 (static_cast<unsigned>(std::abs(*Diff)) <=
4453 MaxProfitableLoadStride * Sz &&
4454 isPowerOf2_32(std::abs(*Diff)))) &&
4455 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4456 *Diff == -(static_cast<int>(Sz) - 1))) {
4457 int Stride = *Diff / static_cast<int>(Sz - 1);
4458 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4459 Align Alignment =
4460 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4461 ->getAlign();
4462 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4463 // Iterate through all pointers and check if all distances are
4464 // unique multiple of Dist.
4465 SmallSet<int, 4> Dists;
4466 for (Value *Ptr : PointerOps) {
4467 int Dist = 0;
4468 if (Ptr == PtrN)
4469 Dist = *Diff;
4470 else if (Ptr != Ptr0)
4471 Dist =
4472 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4473 // If the strides are not the same or repeated, we can't
4474 // vectorize.
4475 if (((Dist / Stride) * Stride) != Dist ||
4476 !Dists.insert(Dist).second)
4477 break;
4478 }
4479 if (Dists.size() == Sz)
4480 return LoadsState::StridedVectorize;
4481 }
4482 }
4483 }
4484 }
4485 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4486 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4487 unsigned MinVF = getMinVF(Sz);
4488 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4489 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4490 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4491 unsigned VectorizedCnt = 0;
4492 SmallVector<LoadsState> States;
4493 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4494 Cnt += VF, ++VectorizedCnt) {
4495 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4496 SmallVector<unsigned> Order;
4497 SmallVector<Value *> PointerOps;
4498 LoadsState LS =
4499 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4500 /*TryRecursiveCheck=*/false);
4501 // Check that the sorted loads are consecutive.
4502 if (LS == LoadsState::Gather)
4503 break;
4504 // If reordering is needed, consider it a high-cost masked gather for now.
4505 if ((LS == LoadsState::Vectorize ||
4506 LS == LoadsState::StridedVectorize) &&
4507 !Order.empty() && !isReverseOrder(Order))
4508 LS = LoadsState::ScatterVectorize;
4509 States.push_back(LS);
4510 }
4511 // Can be vectorized later as a series of loads/insertelements.
4512 if (VectorizedCnt == VL.size() / VF) {
4513 // Compare masked gather cost and loads + insert-subvector costs.
4514 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4515 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4516 TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
4517 CostKind, ScalarTy, VecTy);
4518 InstructionCost MaskedGatherCost =
4519 TTI.getGatherScatterOpCost(
4520 Instruction::Load, VecTy,
4521 cast<LoadInst>(VL0)->getPointerOperand(),
4522 /*VariableMask=*/false, CommonAlignment, CostKind) +
4523 VectorGEPCost - ScalarGEPCost;
4524 InstructionCost VecLdCost = 0;
4525 auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
4526 for (auto [I, LS] : enumerate(States)) {
4527 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4528 switch (LS) {
4529 case LoadsState::Vectorize: {
4530 auto [ScalarGEPCost, VectorGEPCost] =
4531 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4532 LI0->getPointerOperand(), Instruction::Load,
4533 CostKind, ScalarTy, SubVecTy);
4534 VecLdCost += TTI.getMemoryOpCost(
4535 Instruction::Load, SubVecTy, LI0->getAlign(),
4536 LI0->getPointerAddressSpace(), CostKind,
4537                           TTI::OperandValueInfo()) +
4538                       VectorGEPCost - ScalarGEPCost;
4539 break;
4540 }
4541          case LoadsState::StridedVectorize: {
4542            auto [ScalarGEPCost, VectorGEPCost] =
4543 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4544 LI0->getPointerOperand(), Instruction::Load,
4545 CostKind, ScalarTy, SubVecTy);
4546 VecLdCost +=
4547                TTI.getStridedMemoryOpCost(
4548                    Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4549 /*VariableMask=*/false, CommonAlignment, CostKind) +
4550 VectorGEPCost - ScalarGEPCost;
4551 break;
4552 }
4553          case LoadsState::ScatterVectorize: {
4554            auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4555 TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4556 LI0->getPointerOperand(), Instruction::GetElementPtr,
4557 CostKind, ScalarTy, SubVecTy);
4558 VecLdCost +=
4559                TTI.getGatherScatterOpCost(
4560                    Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4561 /*VariableMask=*/false, CommonAlignment, CostKind) +
4562 VectorGEPCost - ScalarGEPCost;
4563 break;
4564 }
4565 case LoadsState::Gather:
4567 "Expected only consecutive, strided or masked gather loads.");
4568 }
4569 SmallVector<int> ShuffleMask(VL.size());
4570 for (int Idx : seq<int>(0, VL.size()))
4571 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4572 VecLdCost +=
4573 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
4574 CostKind, I * VF, SubVecTy);
4575 }
4576 // If masked gather cost is higher - better to vectorize, so
4577 // consider it as a gather node. It will be better estimated
4578 // later.
4579 if (MaskedGatherCost >= VecLdCost)
4580 return true;
4581 }
4582 }
4583 return false;
4584 };
4585 // TODO: need to improve analysis of the pointers, if not all of them are
4586 // GEPs or have > 2 operands, we end up with a gather node, which just
4587 // increases the cost.
4588 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4589 bool ProfitableGatherPointers =
4590 L && Sz > 2 &&
4591 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4592 return L->isLoopInvariant(V);
4593 })) <= Sz / 2;
4594 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4595 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4596 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4597 (GEP && GEP->getNumOperands() == 2 &&
4598 isa<Constant, Instruction>(GEP->getOperand(1)));
4599 })) {
4600 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4601 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4602 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4603 // Check if potential masked gather can be represented as series
4604 // of loads + insertsubvectors.
4605 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4606 // If masked gather cost is higher - better to vectorize, so
4607 // consider it as a gather node. It will be better estimated
4608 // later.
4609 return LoadsState::Gather;
4610 }
4611        return LoadsState::ScatterVectorize;
4612      }
4613 }
4614 }
4615
4616 return LoadsState::Gather;
4617}
4618
4619static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4620                                   const DataLayout &DL, ScalarEvolution &SE,
4621 SmallVectorImpl<unsigned> &SortedIndices) {
4622  assert(llvm::all_of(
4623             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4624 "Expected list of pointer operands.");
4625 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4626 // Ptr into, sort and return the sorted indices with values next to one
4627 // another.
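  // Illustrative example (p and q are placeholder pointers into different
  // underlying objects): for VL = {p+1, p, q, q+1} the map ends up with
  //   p+1 -> {(p+1, 0, 0), (p, -1, 1)} and q -> {(q, 0, 2), (q+1, 1, 3)},
  // both bases become consecutive after sorting by offset, and the sorted
  // indices keep the p-cluster {1, 0} and the q-cluster {2, 3} next to each
  // other.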
4628  MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4629  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4630
4631 unsigned Cnt = 1;
4632 for (Value *Ptr : VL.drop_front()) {
4633 bool Found = any_of(Bases, [&](auto &Base) {
4634 std::optional<int> Diff =
4635 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4636 /*StrictCheck=*/true);
4637 if (!Diff)
4638 return false;
4639
4640 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4641 return true;
4642 });
4643
4644 if (!Found) {
4645 // If we haven't found enough to usefully cluster, return early.
4646 if (Bases.size() > VL.size() / 2 - 1)
4647 return false;
4648
4649 // Not found already - add a new Base
4650 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4651 }
4652 }
4653
4654  // For each of the bases sort the pointers by Offset and check if any of
4655  // the bases become consecutive.
4656 bool AnyConsecutive = false;
4657 for (auto &Base : Bases) {
4658 auto &Vec = Base.second;
4659 if (Vec.size() > 1) {
4660 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4661 const std::tuple<Value *, int, unsigned> &Y) {
4662 return std::get<1>(X) < std::get<1>(Y);
4663 });
4664 int InitialOffset = std::get<1>(Vec[0]);
4665 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4666 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4667 });
4668 }
4669 }
4670
4671 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4672 SortedIndices.clear();
4673 if (!AnyConsecutive)
4674 return false;
4675
4676 for (auto &Base : Bases) {
4677 for (auto &T : Base.second)
4678 SortedIndices.push_back(std::get<2>(T));
4679 }
4680
4681 assert(SortedIndices.size() == VL.size() &&
4682 "Expected SortedIndices to be the size of VL");
4683 return true;
4684}
4685
4686std::optional<BoUpSLP::OrdersType>
4687BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4688 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4689 Type *ScalarTy = TE.Scalars[0]->getType();
4690
4691  SmallVector<Value *> Ptrs;
4692  Ptrs.reserve(TE.Scalars.size());
4693 for (Value *V : TE.Scalars) {
4694 auto *L = dyn_cast<LoadInst>(V);
4695 if (!L || !L->isSimple())
4696 return std::nullopt;
4697 Ptrs.push_back(L->getPointerOperand());
4698 }
4699
4700 BoUpSLP::OrdersType Order;
4701 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4702 return std::move(Order);
4703 return std::nullopt;
4704}
4705
4706/// Check if two insertelement instructions are from the same buildvector.
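/// For example, %v1 = insertelement <4 x i32> poison, i32 %a, i32 0 and
/// %v2 = insertelement <4 x i32> %v1, i32 %b, i32 1 belong to the same
/// buildvector sequence, while inserts into unrelated vectors or repeated
/// inserts into the same index do not.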
4707static bool areTwoInsertFromSameBuildVector(
4708    InsertElementInst *VU, InsertElementInst *V,
4709    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4710 // Instructions must be from the same basic blocks.
4711 if (VU->getParent() != V->getParent())
4712 return false;
4713 // Checks if 2 insertelements are from the same buildvector.
4714 if (VU->getType() != V->getType())
4715 return false;
4716 // Multiple used inserts are separate nodes.
4717 if (!VU->hasOneUse() && !V->hasOneUse())
4718 return false;
4719 auto *IE1 = VU;
4720 auto *IE2 = V;
4721 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4722 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4723 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4724 return false;
4725 // Go through the vector operand of insertelement instructions trying to find
4726 // either VU as the original vector for IE2 or V as the original vector for
4727 // IE1.
4728 SmallBitVector ReusedIdx(
4729 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4730 bool IsReusedIdx = false;
4731 do {
4732 if (IE2 == VU && !IE1)
4733 return VU->hasOneUse();
4734 if (IE1 == V && !IE2)
4735 return V->hasOneUse();
4736 if (IE1 && IE1 != V) {
4737 unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4738 IsReusedIdx |= ReusedIdx.test(Idx1);
4739 ReusedIdx.set(Idx1);
4740 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4741 IE1 = nullptr;
4742 else
4743 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4744 }
4745 if (IE2 && IE2 != VU) {
4746 unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4747 IsReusedIdx |= ReusedIdx.test(Idx2);
4748 ReusedIdx.set(Idx2);
4749 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4750 IE2 = nullptr;
4751 else
4752 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4753 }
4754 } while (!IsReusedIdx && (IE1 || IE2));
4755 return false;
4756}
4757
4758std::optional<BoUpSLP::OrdersType>
4759BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4760 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4761 if (TE.isNonPowOf2Vec())
4762 return std::nullopt;
4763
4764  // No need to reorder if we need to shuffle reuses; the node still needs to
4765  // be shuffled.
4766 if (!TE.ReuseShuffleIndices.empty()) {
4767 if (isSplat(TE.Scalars))
4768 return std::nullopt;
4769 // Check if reuse shuffle indices can be improved by reordering.
4770    // For this, check that the reuse mask is "clustered", i.e. each scalar
4771    // value is used once in each submask of size <number_of_scalars>.
4772 // Example: 4 scalar values.
4773 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4774 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4775 // element 3 is used twice in the second submask.
4776 unsigned Sz = TE.Scalars.size();
4777 if (TE.State == TreeEntry::NeedToGather) {
4778 if (std::optional<OrdersType> CurrentOrder =
4779              findReusedOrderedScalars(TE)) {
4780        SmallVector<int> Mask;
4781 fixupOrderingIndices(*CurrentOrder);
4782 inversePermutation(*CurrentOrder, Mask);
4783 ::addMask(Mask, TE.ReuseShuffleIndices);
4784 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4785 unsigned Sz = TE.Scalars.size();
4786 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4787 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4788 if (Idx != PoisonMaskElem)
4789 Res[Idx + K * Sz] = I + K * Sz;
4790 }
4791 return std::move(Res);
4792 }
4793 }
4794 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4795        TTI->getNumberOfParts(FixedVectorType::get(
4796            TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4797 return std::nullopt;
4798 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4799 Sz)) {
4800 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4801 if (TE.ReorderIndices.empty())
4802 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4803 else
4804 inversePermutation(TE.ReorderIndices, ReorderMask);
4805 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4806 unsigned VF = ReorderMask.size();
4807 OrdersType ResOrder(VF, VF);
4808 unsigned NumParts = divideCeil(VF, Sz);
4809 SmallBitVector UsedVals(NumParts);
4810 for (unsigned I = 0; I < VF; I += Sz) {
4811 int Val = PoisonMaskElem;
4812 unsigned UndefCnt = 0;
4813 unsigned Limit = std::min(Sz, VF - I);
4814 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
4815 [&](int Idx) {
4816 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4817 Val = Idx;
4818 if (Idx == PoisonMaskElem)
4819 ++UndefCnt;
4820 return Idx != PoisonMaskElem && Idx != Val;
4821 }) ||
4822 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4823 UndefCnt > Sz / 2)
4824 return std::nullopt;
4825 UsedVals.set(Val);
4826 for (unsigned K = 0; K < NumParts; ++K)
4827 ResOrder[Val + Sz * K] = I + K;
4828 }
4829 return std::move(ResOrder);
4830 }
4831 unsigned VF = TE.getVectorFactor();
4832 // Try build correct order for extractelement instructions.
4833 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4834 TE.ReuseShuffleIndices.end());
4835 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4836 all_of(TE.Scalars, [Sz](Value *V) {
4837 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4838 return Idx && *Idx < Sz;
4839 })) {
4840 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4841 if (TE.ReorderIndices.empty())
4842 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4843 else
4844 inversePermutation(TE.ReorderIndices, ReorderMask);
4845 for (unsigned I = 0; I < VF; ++I) {
4846 int &Idx = ReusedMask[I];
4847 if (Idx == PoisonMaskElem)
4848 continue;
4849 Value *V = TE.Scalars[ReorderMask[Idx]];
4850 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4851 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4852 }
4853 }
4854    // Build the order of VF size; the reuses shuffles need to be reordered, as
4855    // they are always of VF size.
4856 OrdersType ResOrder(VF);
4857 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4858 auto *It = ResOrder.begin();
4859 for (unsigned K = 0; K < VF; K += Sz) {
4860 OrdersType CurrentOrder(TE.ReorderIndices);
4861 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4862 if (SubMask.front() == PoisonMaskElem)
4863 std::iota(SubMask.begin(), SubMask.end(), 0);
4864 reorderOrder(CurrentOrder, SubMask);
4865 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4866 std::advance(It, Sz);
4867 }
4868 if (TE.State == TreeEntry::NeedToGather &&
4869 all_of(enumerate(ResOrder),
4870 [](const auto &Data) { return Data.index() == Data.value(); }))
4871 return std::nullopt; // No need to reorder.
4872 return std::move(ResOrder);
4873 }
4874 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4875 any_of(TE.UserTreeIndices,
4876 [](const EdgeInfo &EI) {
4877 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4878 }) &&
4879 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
4880 return std::nullopt;
4881 if ((TE.State == TreeEntry::Vectorize ||
4882 TE.State == TreeEntry::StridedVectorize) &&
4883 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4884 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4885 !TE.isAltShuffle())
4886 return TE.ReorderIndices;
4887 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4888 auto PHICompare = [&](unsigned I1, unsigned I2) {
4889 Value *V1 = TE.Scalars[I1];
4890 Value *V2 = TE.Scalars[I2];
4891 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4892 return false;
4893 if (V1->getNumUses() < V2->getNumUses())
4894 return true;
4895 if (V1->getNumUses() > V2->getNumUses())
4896 return false;
4897 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4898 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4899 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4900 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4901          if (!areTwoInsertFromSameBuildVector(
4902                  IE1, IE2,
4903 [](InsertElementInst *II) { return II->getOperand(0); }))
4904 return I1 < I2;
4905 return getInsertIndex(IE1) < getInsertIndex(IE2);
4906 }
4907 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4908 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4909 if (EE1->getOperand(0) != EE2->getOperand(0))
4910 return I1 < I2;
4911 return getInsertIndex(EE1) < getInsertIndex(EE2);
4912 }
4913 return I1 < I2;
4914 };
4915 auto IsIdentityOrder = [](const OrdersType &Order) {
4916 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4917 if (Idx != Order[Idx])
4918 return false;
4919 return true;
4920 };
4921 if (!TE.ReorderIndices.empty())
4922 return TE.ReorderIndices;
4923    DenseMap<unsigned, unsigned> PhiToId;
4924    SmallVector<unsigned> Phis(TE.Scalars.size());
4925 std::iota(Phis.begin(), Phis.end(), 0);
4926 OrdersType ResOrder(TE.Scalars.size());
4927 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4928 PhiToId[Id] = Id;
4929 stable_sort(Phis, PHICompare);
4930 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4931 ResOrder[Id] = PhiToId[Phis[Id]];
4932 if (IsIdentityOrder(ResOrder))
4933 return std::nullopt; // No need to reorder.
4934 return std::move(ResOrder);
4935 }
4936 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4937 allSameType(TE.Scalars)) {
4938 // TODO: add analysis of other gather nodes with extractelement
4939 // instructions and other values/instructions, not only undefs.
4940 if ((TE.getOpcode() == Instruction::ExtractElement ||
4941 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4942 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4943 all_of(TE.Scalars, [](Value *V) {
4944 auto *EE = dyn_cast<ExtractElementInst>(V);
4945 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4946 })) {
4947 // Check that gather of extractelements can be represented as
4948 // just a shuffle of a single vector.
4949 OrdersType CurrentOrder;
4950 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4951 /*ResizeAllowed=*/true);
4952 if (Reuse || !CurrentOrder.empty())
4953 return std::move(CurrentOrder);
4954 }
4955 // If the gather node is <undef, v, .., poison> and
4956 // insertelement poison, v, 0 [+ permute]
4957 // is cheaper than
4958 // insertelement poison, v, n - try to reorder.
4959 // If rotating the whole graph, exclude the permute cost, the whole graph
4960 // might be transformed.
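    // Illustrative example: for TE.Scalars = {undef, v, undef, undef} the
    // single non-constant value v sits at Idx == 1, so the cost of
    // "insertelement poison, v, 0" (plus a permute unless the whole graph is
    // being rotated) is compared against "insertelement poison, v, 1", and the
    // node is reordered only if the former is cheaper.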
4961 int Sz = TE.Scalars.size();
4962 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4963 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4964 const auto *It =
4965 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4966 if (It == TE.Scalars.begin())
4967 return OrdersType();
4968 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4969 if (It != TE.Scalars.end()) {
4970 OrdersType Order(Sz, Sz);
4971 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4972 Order[Idx] = 0;
4973 fixupOrderingIndices(Order);
4974 SmallVector<int> Mask;
4975 inversePermutation(Order, Mask);
4976 InstructionCost PermuteCost =
4977 TopToBottom
4978 ? 0
4979                : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
4980        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4981 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4982 PoisonValue::get(Ty), *It);
4983 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4984 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4985 PoisonValue::get(Ty), *It);
4986 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4987 OrdersType Order(Sz, Sz);
4988 Order[Idx] = 0;
4989 return std::move(Order);
4990 }
4991 }
4992 }
4993 if (isSplat(TE.Scalars))
4994 return std::nullopt;
4995 if (TE.Scalars.size() >= 4)
4996 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4997 return Order;
4998 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4999 return CurrentOrder;
5000 }
5001 return std::nullopt;
5002}
5003
5004/// Checks if the given mask is a "clustered" mask with the same clusters of
5005/// size \p Sz, which are not identity submasks.
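/// For example, with \p Sz == 4 the mask {1, 0, 3, 2, 1, 0, 3, 2} qualifies:
/// every cluster repeats the same non-identity submask {1, 0, 3, 2}. The mask
/// {0, 1, 2, 3, 0, 1, 2, 3} does not, because its clusters are identity
/// submasks.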
5006static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5007                                               unsigned Sz) {
5008 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5009 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5010 return false;
5011 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5012 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5013 if (Cluster != FirstCluster)
5014 return false;
5015 }
5016 return true;
5017}
5018
5019void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5020 // Reorder reuses mask.
5021 reorderReuses(TE.ReuseShuffleIndices, Mask);
5022 const unsigned Sz = TE.Scalars.size();
5023 // For vectorized and non-clustered reused no need to do anything else.
5024 if (TE.State != TreeEntry::NeedToGather ||
5025      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5026                                                   Sz) ||
5027 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5028 return;
5029 SmallVector<int> NewMask;
5030 inversePermutation(TE.ReorderIndices, NewMask);
5031 addMask(NewMask, TE.ReuseShuffleIndices);
5032 // Clear reorder since it is going to be applied to the new mask.
5033 TE.ReorderIndices.clear();
5034 // Try to improve gathered nodes with clustered reuses, if possible.
5035 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5036 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
5037 inversePermutation(NewOrder, NewMask);
5038 reorderScalars(TE.Scalars, NewMask);
5039 // Fill the reuses mask with the identity submasks.
5040 for (auto *It = TE.ReuseShuffleIndices.begin(),
5041 *End = TE.ReuseShuffleIndices.end();
5042 It != End; std::advance(It, Sz))
5043 std::iota(It, std::next(It, Sz), 0);
5044}
5045
5046static void combineOrders(MutableArrayRef<unsigned> Order,
5047                          ArrayRef<unsigned> SecondaryOrder) {
5048 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5049 "Expected same size of orders");
5050 unsigned Sz = Order.size();
5051 SmallBitVector UsedIndices(Sz);
5052 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5053 if (Order[Idx] != Sz)
5054 UsedIndices.set(Order[Idx]);
5055 }
5056 if (SecondaryOrder.empty()) {
5057 for (unsigned Idx : seq<unsigned>(0, Sz))
5058 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5059 Order[Idx] = Idx;
5060 } else {
5061 for (unsigned Idx : seq<unsigned>(0, Sz))
5062 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5063 !UsedIndices.test(SecondaryOrder[Idx]))
5064 Order[Idx] = SecondaryOrder[Idx];
5065 }
5066}
5067
5068void BoUpSLP::reorderTopToBottom() {
5069  // Maps VF to the graph nodes.
5070  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5071 // ExtractElement gather nodes which can be vectorized and need to handle
5072  // their ordering.
5073  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5074
5075  // Phi nodes can have preferred ordering based on their result users.
5076  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5077
5078 // AltShuffles can also have a preferred ordering that leads to fewer
5079 // instructions, e.g., the addsub instruction in x86.
5080 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5081
5082 // Maps a TreeEntry to the reorder indices of external users.
5083  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5084      ExternalUserReorderMap;
5085 // Find all reorderable nodes with the given VF.
5086  // Currently these are vectorized stores, loads, extracts + some gathering
5087  // of extracts.
5088 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5089 const std::unique_ptr<TreeEntry> &TE) {
5090 // Look for external users that will probably be vectorized.
5091 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5092 findExternalStoreUsersReorderIndices(TE.get());
5093 if (!ExternalUserReorderIndices.empty()) {
5094 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5095 ExternalUserReorderMap.try_emplace(TE.get(),
5096 std::move(ExternalUserReorderIndices));
5097 }
5098
5099 // Patterns like [fadd,fsub] can be combined into a single instruction in
5100 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5101 // to take into account their order when looking for the most used order.
5102 if (TE->isAltShuffle()) {
5103 VectorType *VecTy =
5104 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
5105 unsigned Opcode0 = TE->getOpcode();
5106 unsigned Opcode1 = TE->getAltOpcode();
5107 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5108 // If this pattern is supported by the target then we consider the order.
5109 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5110 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5111 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5112 }
5113 // TODO: Check the reverse order too.
5114 }
5115
5116 if (std::optional<OrdersType> CurrentOrder =
5117 getReorderingData(*TE, /*TopToBottom=*/true)) {
5118 // Do not include ordering for nodes used in the alt opcode vectorization,
5119      // better to reorder them during the bottom-to-top stage. If we follow the
5120      // order here, it causes reordering of the whole graph, though actually it
5121      // is profitable just to reorder the subgraph that starts from the alternate
5122      // opcode vectorization node. Such nodes already end up with the shuffle
5123 // instruction and it is just enough to change this shuffle rather than
5124 // rotate the scalars for the whole graph.
5125 unsigned Cnt = 0;
5126 const TreeEntry *UserTE = TE.get();
5127 while (UserTE && Cnt < RecursionMaxDepth) {
5128 if (UserTE->UserTreeIndices.size() != 1)
5129 break;
5130 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5131 return EI.UserTE->State == TreeEntry::Vectorize &&
5132 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5133 }))
5134 return;
5135 UserTE = UserTE->UserTreeIndices.back().UserTE;
5136 ++Cnt;
5137 }
5138 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5139 if (!(TE->State == TreeEntry::Vectorize ||
5140 TE->State == TreeEntry::StridedVectorize) ||
5141 !TE->ReuseShuffleIndices.empty())
5142 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5143 if (TE->State == TreeEntry::Vectorize &&
5144 TE->getOpcode() == Instruction::PHI)
5145 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5146 }
5147 });
5148
5149 // Reorder the graph nodes according to their vectorization factor.
5150 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5151 VF /= 2) {
5152 auto It = VFToOrderedEntries.find(VF);
5153 if (It == VFToOrderedEntries.end())
5154 continue;
5155    // Try to find the most profitable order. We are just looking for the most
5156    // used order and reorder scalar elements in the nodes according to this
5157    // most used order.
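    // Illustrative example: if three nodes of this VF prefer the order
    // {1, 0, 3, 2} and only one prefers the identity order, {1, 0, 3, 2} wins
    // and every node of this VF is rewritten to it below.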
5158 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5159 // All operands are reordered and used only in this node - propagate the
5160 // most used order to the user node.
5161    MapVector<OrdersType, unsigned,
5162              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5163        OrdersUses;
5165 for (const TreeEntry *OpTE : OrderedEntries) {
5166      // No need to reorder these nodes; still need to extend and to use a
5167      // shuffle, just need to merge the reordering shuffle and the reuse shuffle.
5168 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5169 continue;
5170 // Count number of orders uses.
5171 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5172 &PhisToOrders]() -> const OrdersType & {
5173 if (OpTE->State == TreeEntry::NeedToGather ||
5174 !OpTE->ReuseShuffleIndices.empty()) {
5175 auto It = GathersToOrders.find(OpTE);
5176 if (It != GathersToOrders.end())
5177 return It->second;
5178 }
5179 if (OpTE->isAltShuffle()) {
5180 auto It = AltShufflesToOrders.find(OpTE);
5181 if (It != AltShufflesToOrders.end())
5182 return It->second;
5183 }
5184 if (OpTE->State == TreeEntry::Vectorize &&
5185 OpTE->getOpcode() == Instruction::PHI) {
5186 auto It = PhisToOrders.find(OpTE);
5187 if (It != PhisToOrders.end())
5188 return It->second;
5189 }
5190 return OpTE->ReorderIndices;
5191 }();
5192 // First consider the order of the external scalar users.
5193 auto It = ExternalUserReorderMap.find(OpTE);
5194 if (It != ExternalUserReorderMap.end()) {
5195 const auto &ExternalUserReorderIndices = It->second;
5196 // If the OpTE vector factor != number of scalars - use natural order,
5197 // it is an attempt to reorder node with reused scalars but with
5198 // external uses.
5199 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5200 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5201 ExternalUserReorderIndices.size();
5202 } else {
5203 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5204 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5205 }
5206 // No other useful reorder data in this entry.
5207 if (Order.empty())
5208 continue;
5209 }
5210 // Stores actually store the mask, not the order, need to invert.
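      // Illustrative example: a stored mask of {2, 0, 1} inverts to the order
      // {1, 2, 0}.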
5211 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5212 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5213 SmallVector<int> Mask;
5214 inversePermutation(Order, Mask);
5215 unsigned E = Order.size();
5216 OrdersType CurrentOrder(E, E);
5217 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5218 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5219 });
5220 fixupOrderingIndices(CurrentOrder);
5221 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5222 } else {
5223 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5224 }
5225 }
5226 if (OrdersUses.empty())
5227 continue;
5228 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5229 const unsigned Sz = Order.size();
5230 for (unsigned Idx : seq<unsigned>(0, Sz))
5231 if (Idx != Order[Idx] && Order[Idx] != Sz)
5232 return false;
5233 return true;
5234 };
5235 // Choose the most used order.
5236 unsigned IdentityCnt = 0;
5237 unsigned FilledIdentityCnt = 0;
5238 OrdersType IdentityOrder(VF, VF);
5239 for (auto &Pair : OrdersUses) {
5240 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5241 if (!Pair.first.empty())
5242 FilledIdentityCnt += Pair.second;
5243 IdentityCnt += Pair.second;
5244 combineOrders(IdentityOrder, Pair.first);
5245 }
5246 }
5247 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5248 unsigned Cnt = IdentityCnt;
5249 for (auto &Pair : OrdersUses) {
5250      // Prefer the identity order. But if a filled identity (non-empty order) is
5251      // found with the same number of uses as the new candidate order, we can
5252      // choose this candidate order.
5253 if (Cnt < Pair.second ||
5254 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5255 Cnt == Pair.second && !BestOrder.empty() &&
5256 IsIdentityOrder(BestOrder))) {
5257 combineOrders(Pair.first, BestOrder);
5258 BestOrder = Pair.first;
5259 Cnt = Pair.second;
5260 } else {
5261 combineOrders(BestOrder, Pair.first);
5262 }
5263 }
5264 // Set order of the user node.
5265 if (IsIdentityOrder(BestOrder))
5266 continue;
5267 fixupOrderingIndices(BestOrder);
5268 SmallVector<int> Mask;
5269 inversePermutation(BestOrder, Mask);
5270 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5271 unsigned E = BestOrder.size();
5272 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5273 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5274 });
5275 // Do an actual reordering, if profitable.
5276 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5277 // Just do the reordering for the nodes with the given VF.
5278 if (TE->Scalars.size() != VF) {
5279 if (TE->ReuseShuffleIndices.size() == VF) {
5280 // Need to reorder the reuses masks of the operands with smaller VF to
5281 // be able to find the match between the graph nodes and scalar
5282 // operands of the given node during vectorization/cost estimation.
5283 assert(all_of(TE->UserTreeIndices,
5284 [VF, &TE](const EdgeInfo &EI) {
5285 return EI.UserTE->Scalars.size() == VF ||
5286 EI.UserTE->Scalars.size() ==
5287 TE->Scalars.size();
5288 }) &&
5289 "All users must be of VF size.");
5290 // Update ordering of the operands with the smaller VF than the given
5291 // one.
5292 reorderNodeWithReuses(*TE, Mask);
5293 }
5294 continue;
5295 }
5296 if ((TE->State == TreeEntry::Vectorize ||
5297 TE->State == TreeEntry::StridedVectorize) &&
5298          isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
5299              InsertElementInst>(TE->getMainOp()) &&
5300 !TE->isAltShuffle()) {
5301 // Build correct orders for extract{element,value}, loads and
5302 // stores.
5303 reorderOrder(TE->ReorderIndices, Mask);
5304 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5305 TE->reorderOperands(Mask);
5306 } else {
5307 // Reorder the node and its operands.
5308 TE->reorderOperands(Mask);
5309 assert(TE->ReorderIndices.empty() &&
5310 "Expected empty reorder sequence.");
5311 reorderScalars(TE->Scalars, Mask);
5312 }
5313 if (!TE->ReuseShuffleIndices.empty()) {
5314 // Apply reversed order to keep the original ordering of the reused
5315 // elements to avoid extra reorder indices shuffling.
5316 OrdersType CurrentOrder;
5317 reorderOrder(CurrentOrder, MaskOrder);
5318 SmallVector<int> NewReuses;
5319 inversePermutation(CurrentOrder, NewReuses);
5320 addMask(NewReuses, TE->ReuseShuffleIndices);
5321 TE->ReuseShuffleIndices.swap(NewReuses);
5322 }
5323 }
5324 }
5325}
5326
5327bool BoUpSLP::canReorderOperands(
5328 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5329 ArrayRef<TreeEntry *> ReorderableGathers,
5330 SmallVectorImpl<TreeEntry *> &GatherOps) {
5331 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5332 if (UserTE->isNonPowOf2Vec())
5333 return false;
5334
5335 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5336 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5337 return OpData.first == I &&
5338 (OpData.second->State == TreeEntry::Vectorize ||
5339 OpData.second->State == TreeEntry::StridedVectorize);
5340 }))
5341 continue;
5342 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5343 // Do not reorder if operand node is used by many user nodes.
5344 if (any_of(TE->UserTreeIndices,
5345 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5346 return false;
5347 // Add the node to the list of the ordered nodes with the identity
5348 // order.
5349 Edges.emplace_back(I, TE);
5350 // Add ScatterVectorize nodes to the list of operands, where just
5351 // reordering of the scalars is required. Similar to the gathers, so
5352 // simply add to the list of gathered ops.
5353 // If there are reused scalars, process this node as a regular vectorize
5354 // node, just reorder reuses mask.
5355 if (TE->State != TreeEntry::Vectorize &&
5356 TE->State != TreeEntry::StridedVectorize &&
5357 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5358 GatherOps.push_back(TE);
5359 continue;
5360 }
5361 TreeEntry *Gather = nullptr;
5362 if (count_if(ReorderableGathers,
5363 [&Gather, UserTE, I](TreeEntry *TE) {
5364 assert(TE->State != TreeEntry::Vectorize &&
5365 TE->State != TreeEntry::StridedVectorize &&
5366 "Only non-vectorized nodes are expected.");
5367 if (any_of(TE->UserTreeIndices,
5368 [UserTE, I](const EdgeInfo &EI) {
5369 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5370 })) {
5371 assert(TE->isSame(UserTE->getOperand(I)) &&
5372 "Operand entry does not match operands.");
5373 Gather = TE;
5374 return true;
5375 }
5376 return false;
5377 }) > 1 &&
5378 !allConstant(UserTE->getOperand(I)))
5379 return false;
5380 if (Gather)
5381 GatherOps.push_back(Gather);
5382 }
5383 return true;
5384}
5385
5386void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5387 SetVector<TreeEntry *> OrderedEntries;
5388 DenseSet<const TreeEntry *> GathersToOrders;
5389 // Find all reorderable leaf nodes with the given VF.
5390  // Currently these are vectorized loads, extracts without alternate operands
5391  // + some gathering of extracts.
5392 SmallVector<TreeEntry *> NonVectorized;
5393 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5394 if (TE->State != TreeEntry::Vectorize &&
5395 TE->State != TreeEntry::StridedVectorize)
5396 NonVectorized.push_back(TE.get());
5397 if (std::optional<OrdersType> CurrentOrder =
5398 getReorderingData(*TE, /*TopToBottom=*/false)) {
5399 OrderedEntries.insert(TE.get());
5400 if (!(TE->State == TreeEntry::Vectorize ||
5401 TE->State == TreeEntry::StridedVectorize) ||
5402 !TE->ReuseShuffleIndices.empty())
5403 GathersToOrders.insert(TE.get());
5404 }
5405 }
5406
5407 // 1. Propagate order to the graph nodes, which use only reordered nodes.
5408 // I.e., if the node has operands, that are reordered, try to make at least
5409 // one operand order in the natural order and reorder others + reorder the
5410 // user node itself.
5411  SmallPtrSet<const TreeEntry *, 4> Visited;
5412  while (!OrderedEntries.empty()) {
5413 // 1. Filter out only reordered nodes.
5414 // 2. If the entry has multiple uses - skip it and jump to the next node.
5415    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5416    SmallVector<TreeEntry *> Filtered;
5417 for (TreeEntry *TE : OrderedEntries) {
5418 if (!(TE->State == TreeEntry::Vectorize ||
5419 TE->State == TreeEntry::StridedVectorize ||
5420 (TE->State == TreeEntry::NeedToGather &&
5421 GathersToOrders.contains(TE))) ||
5422 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5423 !all_of(drop_begin(TE->UserTreeIndices),
5424 [TE](const EdgeInfo &EI) {
5425 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5426 }) ||
5427 !Visited.insert(TE).second) {
5428 Filtered.push_back(TE);
5429 continue;
5430 }
5431 // Build a map between user nodes and their operands order to speedup
5432 // search. The graph currently does not provide this dependency directly.
5433 for (EdgeInfo &EI : TE->UserTreeIndices) {
5434 TreeEntry *UserTE = EI.UserTE;
5435 auto It = Users.find(UserTE);
5436 if (It == Users.end())
5437 It = Users.insert({UserTE, {}}).first;
5438 It->second.emplace_back(EI.EdgeIdx, TE);
5439 }
5440 }
5441 // Erase filtered entries.
5442 for (TreeEntry *TE : Filtered)
5443 OrderedEntries.remove(TE);
5444    SmallVector<
5445        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5446 UsersVec(Users.begin(), Users.end());
5447 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5448 return Data1.first->Idx > Data2.first->Idx;
5449 });
5450 for (auto &Data : UsersVec) {
5451 // Check that operands are used only in the User node.
5452 SmallVector<TreeEntry *> GatherOps;
5453 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5454 GatherOps)) {
5455 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5456 OrderedEntries.remove(Op.second);
5457 continue;
5458 }
5459 // All operands are reordered and used only in this node - propagate the
5460 // most used order to the user node.
5463 OrdersUses;
5464 // Do the analysis for each tree entry only once, otherwise the order of
5465 // the same node my be considered several times, though might be not
5466 // profitable.
5469 for (const auto &Op : Data.second) {
5470 TreeEntry *OpTE = Op.second;
5471 if (!VisitedOps.insert(OpTE).second)
5472 continue;
5473 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5474 continue;
5475 const auto Order = [&]() -> const OrdersType {
5476 if (OpTE->State == TreeEntry::NeedToGather ||
5477 !OpTE->ReuseShuffleIndices.empty())
5478 return getReorderingData(*OpTE, /*TopToBottom=*/false)
5479 .value_or(OrdersType(1));
5480 return OpTE->ReorderIndices;
5481 }();
5482 // The order is partially ordered, skip it in favor of fully non-ordered
5483 // orders.
5484 if (Order.size() == 1)
5485 continue;
5486 unsigned NumOps = count_if(
5487 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5488 return P.second == OpTE;
5489 });
5490 // Stores actually store the mask, not the order, need to invert.
5491 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5492 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5493 SmallVector<int> Mask;
5494 inversePermutation(Order, Mask);
5495 unsigned E = Order.size();
5496 OrdersType CurrentOrder(E, E);
5497 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5498 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5499 });
5500 fixupOrderingIndices(CurrentOrder);
5501 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5502 NumOps;
5503 } else {
5504 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5505 }
5506 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5507 const auto AllowsReordering = [&](const TreeEntry *TE) {
5508 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5509 if (TE->isNonPowOf2Vec())
5510 return false;
5511 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5512 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5513 (IgnoreReorder && TE->Idx == 0))
5514 return true;
5515 if (TE->State == TreeEntry::NeedToGather) {
5516 if (GathersToOrders.contains(TE))
5517 return !getReorderingData(*TE, /*TopToBottom=*/false)
5518 .value_or(OrdersType(1))
5519 .empty();
5520 return true;
5521 }
5522 return false;
5523 };
5524 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5525 TreeEntry *UserTE = EI.UserTE;
5526 if (!VisitedUsers.insert(UserTE).second)
5527 continue;
5528 // May reorder user node if it requires reordering, has reused
5529 // scalars, is an alternate op vectorize node or its op nodes require
5530 // reordering.
5531 if (AllowsReordering(UserTE))
5532 continue;
5533 // Check if users allow reordering.
5534 // Currently look up just 1 level of operands to avoid increase of
5535 // the compile time.
5536 // Profitable to reorder if definitely more operands allow
5537 // reordering rather than those with natural order.
5538          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
5539          if (static_cast<unsigned>(count_if(
5540 Ops, [UserTE, &AllowsReordering](
5541 const std::pair<unsigned, TreeEntry *> &Op) {
5542 return AllowsReordering(Op.second) &&
5543 all_of(Op.second->UserTreeIndices,
5544 [UserTE](const EdgeInfo &EI) {
5545 return EI.UserTE == UserTE;
5546 });
5547 })) <= Ops.size() / 2)
5548 ++Res.first->second;
5549 }
5550 }
5551 if (OrdersUses.empty()) {
5552 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5553 OrderedEntries.remove(Op.second);
5554 continue;
5555 }
5556 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5557 const unsigned Sz = Order.size();
5558 for (unsigned Idx : seq<unsigned>(0, Sz))
5559 if (Idx != Order[Idx] && Order[Idx] != Sz)
5560 return false;
5561 return true;
5562 };
5563 // Choose the most used order.
5564 unsigned IdentityCnt = 0;
5565 unsigned VF = Data.second.front().second->getVectorFactor();
5566 OrdersType IdentityOrder(VF, VF);
5567 for (auto &Pair : OrdersUses) {
5568 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5569 IdentityCnt += Pair.second;
5570 combineOrders(IdentityOrder, Pair.first);
5571 }
5572 }
5573 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5574 unsigned Cnt = IdentityCnt;
5575 for (auto &Pair : OrdersUses) {
5576        // Prefer the identity order. But if a filled identity (non-empty
5577        // order) is found with the same number of uses as the new candidate
5578        // order, we can choose this candidate order.
5579 if (Cnt < Pair.second) {
5580 combineOrders(Pair.first, BestOrder);
5581 BestOrder = Pair.first;
5582 Cnt = Pair.second;
5583 } else {
5584 combineOrders(BestOrder, Pair.first);
5585 }
5586 }
5587 // Set order of the user node.
5588 if (IsIdentityOrder(BestOrder)) {
5589 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5590 OrderedEntries.remove(Op.second);
5591 continue;
5592 }
5593 fixupOrderingIndices(BestOrder);
5594 // Erase operands from OrderedEntries list and adjust their orders.
5595 VisitedOps.clear();
5596 SmallVector<int> Mask;
5597 inversePermutation(BestOrder, Mask);
5598 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5599 unsigned E = BestOrder.size();
5600 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5601 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5602 });
5603 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5604 TreeEntry *TE = Op.second;
5605 OrderedEntries.remove(TE);
5606 if (!VisitedOps.insert(TE).second)
5607 continue;
5608 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5609 reorderNodeWithReuses(*TE, Mask);
5610 continue;
5611 }
5612 // Gathers are processed separately.
5613 if (TE->State != TreeEntry::Vectorize &&
5614 TE->State != TreeEntry::StridedVectorize &&
5615 (TE->State != TreeEntry::ScatterVectorize ||
5616 TE->ReorderIndices.empty()))
5617 continue;
5618 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5619 TE->ReorderIndices.empty()) &&
5620 "Non-matching sizes of user/operand entries.");
5621 reorderOrder(TE->ReorderIndices, Mask);
5622 if (IgnoreReorder && TE == VectorizableTree.front().get())
5623 IgnoreReorder = false;
5624 }
5625 // For gathers just need to reorder its scalars.
5626 for (TreeEntry *Gather : GatherOps) {
5627 assert(Gather->ReorderIndices.empty() &&
5628 "Unexpected reordering of gathers.");
5629 if (!Gather->ReuseShuffleIndices.empty()) {
5630 // Just reorder reuses indices.
5631 reorderReuses(Gather->ReuseShuffleIndices, Mask);
5632 continue;
5633 }
5634 reorderScalars(Gather->Scalars, Mask);
5635 OrderedEntries.remove(Gather);
5636 }
5637 // Reorder operands of the user node and set the ordering for the user
5638 // node itself.
5639 if (Data.first->State != TreeEntry::Vectorize ||
5640 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5641 Data.first->getMainOp()) ||
5642 Data.first->isAltShuffle())
5643 Data.first->reorderOperands(Mask);
5644 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5645 Data.first->isAltShuffle() ||
5646 Data.first->State == TreeEntry::StridedVectorize) {
5647 reorderScalars(Data.first->Scalars, Mask);
5648 reorderOrder(Data.first->ReorderIndices, MaskOrder,
5649 /*BottomOrder=*/true);
5650 if (Data.first->ReuseShuffleIndices.empty() &&
5651 !Data.first->ReorderIndices.empty() &&
5652 !Data.first->isAltShuffle()) {
5653 // Insert user node to the list to try to sink reordering deeper in
5654 // the graph.
5655 OrderedEntries.insert(Data.first);
5656 }
5657 } else {
5658 reorderOrder(Data.first->ReorderIndices, Mask);
5659 }
5660 }
5661 }
5662 // If the reordering is unnecessary, just remove the reorder.
5663 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5664 VectorizableTree.front()->ReuseShuffleIndices.empty())
5665 VectorizableTree.front()->ReorderIndices.clear();
5666}
5667
5668void BoUpSLP::buildExternalUses(
5669    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5670 DenseMap<Value *, unsigned> ScalarToExtUses;
5671 // Collect the values that we need to extract from the tree.
5672 for (auto &TEPtr : VectorizableTree) {
5673 TreeEntry *Entry = TEPtr.get();
5674
5675 // No need to handle users of gathered values.
5676 if (Entry->State == TreeEntry::NeedToGather)
5677 continue;
5678
5679 // For each lane:
5680 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5681 Value *Scalar = Entry->Scalars[Lane];
5682 if (!isa<Instruction>(Scalar))
5683 continue;
5684 // All uses must be replaced already? No need to do it again.
5685 auto It = ScalarToExtUses.find(Scalar);
5686 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5687 continue;
5688
5689 // Check if the scalar is externally used as an extra arg.
5690 const auto *ExtI = ExternallyUsedValues.find(Scalar);
5691 if (ExtI != ExternallyUsedValues.end()) {
5692 int FoundLane = Entry->findLaneForValue(Scalar);
5693 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5694 << FoundLane << " from " << *Scalar << ".\n");
5695 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
5696 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5697 continue;
5698 }
5699 for (User *U : Scalar->users()) {
5700 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5701
5702 Instruction *UserInst = dyn_cast<Instruction>(U);
5703 if (!UserInst || isDeleted(UserInst))
5704 continue;
5705
5706 // Ignore users in the user ignore list.
5707 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5708 continue;
5709
5710 // Skip in-tree scalars that become vectors
5711 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5712 // Some in-tree scalars will remain as scalar in vectorized
5713 // instructions. If that is the case, the one in FoundLane will
5714 // be used.
5715 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5716            !doesInTreeUserNeedToExtract(
5717                Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5718 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5719 << ".\n");
5720 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
5721 continue;
5722 }
5723 U = nullptr;
5724 if (It != ScalarToExtUses.end()) {
5725 ExternalUses[It->second].User = nullptr;
5726 break;
5727 }
5728 }
5729
5730 int FoundLane = Entry->findLaneForValue(Scalar);
5731 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5732 << " from lane " << FoundLane << " from " << *Scalar
5733 << ".\n");
5734 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
5735 ExternalUses.emplace_back(Scalar, U, FoundLane);
5736 if (!U)
5737 break;
5738 }
5739 }
5740 }
5741}
5742
5743DenseMap<Value *, SmallVector<StoreInst *>>
5744BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5745  DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5746  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5747 Value *V = TE->Scalars[Lane];
5748 // To save compilation time we don't visit if we have too many users.
5749 if (V->hasNUsesOrMore(UsesLimit))
5750 break;
5751
5752 // Collect stores per pointer object.
5753 for (User *U : V->users()) {
5754 auto *SI = dyn_cast<StoreInst>(U);
5755 if (SI == nullptr || !SI->isSimple() ||
5756 !isValidElementType(SI->getValueOperand()->getType()))
5757 continue;
5758      // Skip the store if it is already part of the vectorizable tree.
5759 if (getTreeEntry(U))
5760 continue;
5761
5762 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5763 auto &StoresVec = PtrToStoresMap[Ptr];
5764 // For now just keep one store per pointer object per lane.
5765 // TODO: Extend this to support multiple stores per pointer per lane
5766 if (StoresVec.size() > Lane)
5767 continue;
5768 // Skip if in different BBs.
5769 if (!StoresVec.empty() &&
5770 SI->getParent() != StoresVec.back()->getParent())
5771 continue;
5772 // Make sure that the stores are of the same type.
5773 if (!StoresVec.empty() &&
5774 SI->getValueOperand()->getType() !=
5775 StoresVec.back()->getValueOperand()->getType())
5776 continue;
5777 StoresVec.push_back(SI);
5778 }
5779 }
5780 return PtrToStoresMap;
5781}
5782
5783bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5784 OrdersType &ReorderIndices) const {
5785 // We check whether the stores in StoreVec can form a vector by sorting them
5786 // and checking whether they are consecutive.
5787
5788 // To avoid calling getPointersDiff() while sorting we create a vector of
5789 // pairs {store, offset from first} and sort this instead.
5790 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5791 StoreInst *S0 = StoresVec[0];
5792 StoreOffsetVec[0] = {S0, 0};
5793 Type *S0Ty = S0->getValueOperand()->getType();
5794 Value *S0Ptr = S0->getPointerOperand();
5795 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5796 StoreInst *SI = StoresVec[Idx];
5797 std::optional<int> Diff =
5798 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5799 SI->getPointerOperand(), *DL, *SE,
5800 /*StrictCheck=*/true);
5801 // We failed to compare the pointers so just abandon this StoresVec.
5802 if (!Diff)
5803 return false;
5804 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5805 }
5806
5807 // Sort the vector based on the pointers. We create a copy because we may
5808 // need the original later for calculating the reorder (shuffle) indices.
5809 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5810 const std::pair<StoreInst *, int> &Pair2) {
5811 int Offset1 = Pair1.second;
5812 int Offset2 = Pair2.second;
5813 return Offset1 < Offset2;
5814 });
5815
5816 // Check if the stores are consecutive by checking if their difference is 1.
5817 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5818 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5819 return false;
5820
5821 // Calculate the shuffle indices according to their offset against the sorted
5822 // StoreOffsetVec.
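  // Illustrative example: for stores whose offsets in StoresVec order are
  // {0, 2, 1}, the sorted offsets are {0, 1, 2} and ReorderIndices becomes
  // {0, 2, 1} - the store that came second in StoresVec is placed third in the
  // vectorized order.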
5823 ReorderIndices.reserve(StoresVec.size());
5824 for (StoreInst *SI : StoresVec) {
5825 unsigned Idx = find_if(StoreOffsetVec,
5826 [SI](const std::pair<StoreInst *, int> &Pair) {
5827 return Pair.first == SI;
5828 }) -
5829 StoreOffsetVec.begin();
5830 ReorderIndices.push_back(Idx);
5831 }
5832 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
5833 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
5834 // same convention here.
5835 auto IsIdentityOrder = [](const OrdersType &Order) {
5836 for (unsigned Idx : seq<unsigned>(0, Order.size()))
5837 if (Idx != Order[Idx])
5838 return false;
5839 return true;
5840 };
5841 if (IsIdentityOrder(ReorderIndices))
5842 ReorderIndices.clear();
5843
5844 return true;
5845}
5846
5847#ifndef NDEBUG
5849 for (unsigned Idx : Order)
5850 dbgs() << Idx << ", ";
5851 dbgs() << "\n";
5852}
5853#endif
5854
5855SmallVector<BoUpSLP::OrdersType, 1>
5856BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
5857 unsigned NumLanes = TE->Scalars.size();
5858
5859  DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
5860      collectUserStores(TE);
5861
5862 // Holds the reorder indices for each candidate store vector that is a user of
5863 // the current TreeEntry.
5864 SmallVector<OrdersType, 1> ExternalReorderIndices;
5865
5866 // Now inspect the stores collected per pointer and look for vectorization
5867 // candidates. For each candidate calculate the reorder index vector and push
5868 // it into `ExternalReorderIndices`
5869 for (const auto &Pair : PtrToStoresMap) {
5870 auto &StoresVec = Pair.second;
5871 // If we have fewer than NumLanes stores, then we can't form a vector.
5872 if (StoresVec.size() != NumLanes)
5873 continue;
5874
5875 // If the stores are not consecutive then abandon this StoresVec.
5876 OrdersType ReorderIndices;
5877 if (!canFormVector(StoresVec, ReorderIndices))
5878 continue;
5879
5880 // We now know that the scalars in StoresVec can form a vector instruction,
5881 // so set the reorder indices.
5882 ExternalReorderIndices.push_back(ReorderIndices);
5883 }
5884 return ExternalReorderIndices;
5885}
5886
5887void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
5888                        const SmallDenseSet<Value *> &UserIgnoreLst) {
5889 deleteTree();
5890 UserIgnoreList = &UserIgnoreLst;
5891 if (!allSameType(Roots))
5892 return;
5893 buildTree_rec(Roots, 0, EdgeInfo());
5894}
5895
5896void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
5897  deleteTree();
5898 if (!allSameType(Roots))
5899 return;
5900 buildTree_rec(Roots, 0, EdgeInfo());
5901}
5902
5903/// \return true if the specified list of values has only one instruction that
5904/// requires scheduling, false otherwise.
5905#ifndef NDEBUG
5906static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
5907  Value *NeedsScheduling = nullptr;
5908 for (Value *V : VL) {
5909    if (doesNotNeedToBeScheduled(V))
5910      continue;
5911 if (!NeedsScheduling) {
5912 NeedsScheduling = V;
5913 continue;
5914 }
5915 return false;
5916 }
5917 return NeedsScheduling;
5918}
5919#endif
5920
5921/// Generates key/subkey pair for the given value to provide effective sorting
5922/// of the values and better detection of the vectorizable values sequences. The
5923/// keys/subkeys can be used for better sorting of the values themselves (keys)
5924/// and in values subgroups (subkeys).
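/// For example, two simple loads of the same type map to the same key, and the
/// LoadsSubkeyGenerator callback decides whether they also share a subkey
/// (e.g. based on the distance between their pointers), so that nearby loads
/// end up in the same subgroup.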
5925static std::pair<size_t, size_t> generateKeySubkey(
5926 Value *V, const TargetLibraryInfo *TLI,
5927 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
5928 bool AllowAlternate) {
5929 hash_code Key = hash_value(V->getValueID() + 2);
5930 hash_code SubKey = hash_value(0);
5931 // Sort the loads by the distance between the pointers.
5932 if (auto *LI = dyn_cast<LoadInst>(V)) {
5933 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
5934 if (LI->isSimple())
5935 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
5936 else
5937 Key = SubKey = hash_value(LI);
5938 } else if (isVectorLikeInstWithConstOps(V)) {
5939 // Sort extracts by the vector operands.
5940 if (isa<ExtractElementInst, UndefValue>(V))
5941 Key = hash_value(Value::UndefValueVal + 1);
5942 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
5943 if (!isUndefVector(EI->getVectorOperand()).all() &&
5944 !isa<UndefValue>(EI->getIndexOperand()))
5945 SubKey = hash_value(EI->getVectorOperand());
5946 }
5947 } else if (auto *I = dyn_cast<Instruction>(V)) {
5948 // Sort other instructions just by the opcodes except for CMPInst.
5949 // For CMP also sort by the predicate kind.
5950 if ((isa<BinaryOperator, CastInst>(I)) &&
5951 isValidForAlternation(I->getOpcode())) {
5952 if (AllowAlternate)
5953 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
5954 else
5955 Key = hash_combine(hash_value(I->getOpcode()), Key);
5956 SubKey = hash_combine(
5957 hash_value(I->getOpcode()), hash_value(I->getType()),
5958 hash_value(isa<BinaryOperator>(I)
5959 ? I->getType()
5960 : cast<CastInst>(I)->getOperand(0)->getType()));
5961 // For casts, look through the only operand to improve compile time.
5962 if (isa<CastInst>(I)) {
5963 std::pair<size_t, size_t> OpVals =
5964 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
5965 /*AllowAlternate=*/true);
5966 Key = hash_combine(OpVals.first, Key);
5967 SubKey = hash_combine(OpVals.first, SubKey);
5968 }
5969 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
5970 CmpInst::Predicate Pred = CI->getPredicate();
5971 if (CI->isCommutative())
5972 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
5974 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
5975 hash_value(SwapPred),
5976 hash_value(CI->getOperand(0)->getType()));
5977 } else if (auto *Call = dyn_cast<CallInst>(I)) {
5978      Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
5979      if (isTriviallyVectorizable(ID)) {
5980        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
5981 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
5982 SubKey = hash_combine(hash_value(I->getOpcode()),
5983 hash_value(Call->getCalledFunction()));
5984 } else {
5985 Key = hash_combine(hash_value(Call), Key);
5986 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
5987 }
5988 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
5989 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
5990 hash_value(Op.Tag), SubKey);
5991 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
5992 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5993 SubKey = hash_value(Gep->getPointerOperand());
5994 else
5995 SubKey = hash_value(Gep);
5996 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
5997 !isa<ConstantInt>(I->getOperand(1))) {
5998 // Do not try to vectorize instructions with potentially high cost.
5999 SubKey = hash_value(I);
6000 } else {
6001 SubKey = hash_value(I->getOpcode());
6002 }
6003 Key = hash_combine(hash_value(I->getParent()), Key);
6004 }
6005 return std::make_pair(Key, SubKey);
6006}
6007
6008/// Checks if the specified instruction \p I is an alternate operation for
6009/// the given \p MainOp and \p AltOp instructions.
6010static bool isAlternateInstruction(const Instruction *I,
6011 const Instruction *MainOp,
6012 const Instruction *AltOp,
6013 const TargetLibraryInfo &TLI);
6014
6015bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
6016 ArrayRef<Value *> VL) const {
6017 unsigned Opcode0 = S.getOpcode();
6018 unsigned Opcode1 = S.getAltOpcode();
6019 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
6020 // If this pattern is supported by the target then consider it profitable.
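// (E.g. an alternating fadd/fsub bundle can map to a native addsub-style
// instruction on targets that provide one.)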
6021 if (TTI->isLegalAltInstr(FixedVectorType::get(S.MainOp->getType(), VL.size()),
6022 Opcode0, Opcode1, OpcodeMask))
6023 return true;
6024 SmallVector<ValueList> Operands;
6025 for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
6026 Operands.emplace_back();
6027 // Prepare the operand vector.
6028 for (Value *V : VL)
6029 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
6030 }
6031 if (Operands.size() == 2) {
6032 // Try to find the best operand candidates.
6033 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
6034 SmallVector<std::pair<Value *, Value *>> Candidates(3);
6035 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
6036 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
6037 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
6038 std::optional<int> Res = findBestRootPair(Candidates);
6039 switch (Res.value_or(0)) {
6040 case 0:
6041 break;
6042 case 1:
6043 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
6044 break;
6045 case 2:
6046 std::swap(Operands[0][I], Operands[1][I]);
6047 break;
6048 default:
6049 llvm_unreachable("Unexpected index.");
6050 }
6051 }
6052 }
6053 DenseSet<unsigned> UniqueOpcodes;
6054 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
6055 unsigned NonInstCnt = 0;
6056 // Estimate number of instructions, required for the vectorized node and for
6057 // the buildvector node.
6058 unsigned UndefCnt = 0;
6059 // Count the number of extra shuffles, required for vector nodes.
6060 unsigned ExtraShuffleInsts = 0;
6061 // Check that the operands do not contain the same values, and form either a
6062 // perfect diamond match or a shuffled match.
6063 if (Operands.size() == 2) {
6064 // Do not count same operands twice.
6065 if (Operands.front() == Operands.back()) {
6066 Operands.erase(Operands.begin());
6067 } else if (!allConstant(Operands.front()) &&
6068 all_of(Operands.front(), [&](Value *V) {
6069 return is_contained(Operands.back(), V);
6070 })) {
6071 Operands.erase(Operands.begin());
6072 ++ExtraShuffleInsts;
6073 }
6074 }
6075 const Loop *L = LI->getLoopFor(S.MainOp->getParent());
6076 // Vectorize the node if:
6077 // 1. At least one operand is constant or a splat.
6078 // 2. Operands have many loop invariants (the instructions are not loop
6079 // invariants).
6080 // 3. At least one unique operand is expected to be vectorized.
6081 return none_of(Operands,
6082 [&](ArrayRef<Value *> Op) {
6083 if (allConstant(Op) ||
6084 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
6085 getSameOpcode(Op, *TLI).MainOp))
6086 return false;
6087 DenseMap<Value *, unsigned> Uniques;
6088 for (Value *V : Op) {
6089 if (isa<Constant, ExtractElementInst>(V) ||
6090 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
6091 if (isa<UndefValue>(V))
6092 ++UndefCnt;
6093 continue;
6094 }
6095 auto Res = Uniques.try_emplace(V, 0);
6096 // Found first duplicate - need to add shuffle.
6097 if (!Res.second && Res.first->second == 1)
6098 ++ExtraShuffleInsts;
6099 ++Res.first->getSecond();
6100 if (auto *I = dyn_cast<Instruction>(V))
6101 UniqueOpcodes.insert(I->getOpcode());
6102 else if (Res.second)
6103 ++NonInstCnt;
6104 }
6105 return none_of(Uniques, [&](const auto &P) {
6106 return P.first->hasNUsesOrMore(P.second + 1) &&
6107 none_of(P.first->users(), [&](User *U) {
6108 return getTreeEntry(U) || Uniques.contains(U);
6109 });
6110 });
6111 }) ||
6112 // Do not vectorize node, if estimated number of vector instructions is
6113 // more than estimated number of buildvector instructions. Number of
6114 // vector operands is number of vector instructions + number of vector
6115 // instructions for operands (buildvectors). Number of buildvector
6116 // instructions is just number_of_operands * number_of_scalars.
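// E.g. for a bundle of 4 two-operand scalars the buildvector estimate is
// 2 * 4 = 8 instructions; the alternate node is also kept when the vector-side
// estimate (unique operand opcodes + non-instruction operands + extra shuffles
// + 3 for main/alt/shuffle) stays below that bound and undefs do not dominate.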
6117 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6118 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
6119 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6120}
6121
6122BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6123 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
6124 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
6125 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
6126
6127 unsigned ShuffleOrOp =
6128 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
6129 auto *VL0 = cast<Instruction>(S.OpValue);
6130 switch (ShuffleOrOp) {
6131 case Instruction::PHI: {
6132 // Too many operands - gather, most probably won't be vectorized.
6133 if (VL0->getNumOperands() > MaxPHINumOperands)
6134 return TreeEntry::NeedToGather;
6135 // Check for terminator values (e.g. invoke).
6136 for (Value *V : VL)
6137 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
6138 Instruction *Term = dyn_cast<Instruction>(Incoming);
6139 if (Term && Term->isTerminator()) {
6141 << "SLP: Need to swizzle PHINodes (terminator use).\n");
6142 return TreeEntry::NeedToGather;
6143 }
6144 }
6145
6146 return TreeEntry::Vectorize;
6147 }
6148 case Instruction::ExtractValue:
6149 case Instruction::ExtractElement: {
6150 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6151 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6152 if (!isPowerOf2_32(VL.size()))
6153 return TreeEntry::NeedToGather;
6154 if (Reuse || !CurrentOrder.empty())
6155 return TreeEntry::Vectorize;
6156 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6157 return TreeEntry::NeedToGather;
6158 }
6159 case Instruction::InsertElement: {
6160 // Check that we have a buildvector and not a shuffle of 2 or more
6161 // different vectors.
6162 ValueSet SourceVectors;
6163 for (Value *V : VL) {
6164 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
6165 assert(getInsertIndex(V) != std::nullopt &&
6166 "Non-constant or undef index?");
6167 }
6168
6169 if (count_if(VL, [&SourceVectors](Value *V) {
6170 return !SourceVectors.contains(V);
6171 }) >= 2) {
6172 // Found 2nd source vector - cancel.
6173 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6174 "different source vectors.\n");
6175 return TreeEntry::NeedToGather;
6176 }
6177
6178 return TreeEntry::Vectorize;
6179 }
6180 case Instruction::Load: {
6181 // Check that a vectorized load would load the same memory as a scalar
6182 // load. For example, we don't want to vectorize loads that are smaller
6183 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6184 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6185 // from such a struct, we read/write packed bits disagreeing with the
6186 // unvectorized version.
6187 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
6188 case LoadsState::Vectorize:
6189 return TreeEntry::Vectorize;
6190 case LoadsState::ScatterVectorize:
6191 return TreeEntry::ScatterVectorize;
6192 case LoadsState::StridedVectorize:
6193 return TreeEntry::StridedVectorize;
6194 case LoadsState::Gather:
6195#ifndef NDEBUG
6196 Type *ScalarTy = VL0->getType();
6197 if (DL->getTypeSizeInBits(ScalarTy) !=
6198 DL->getTypeAllocSizeInBits(ScalarTy))
6199 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6200 else if (any_of(VL,
6201 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6202 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6203 else
6204 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6205#endif // NDEBUG
6206 return TreeEntry::NeedToGather;
6207 }
6208 llvm_unreachable("Unexpected state of loads");
6209 }
6210 case Instruction::ZExt:
6211 case Instruction::SExt:
6212 case Instruction::FPToUI:
6213 case Instruction::FPToSI:
6214 case Instruction::FPExt:
6215 case Instruction::PtrToInt:
6216 case Instruction::IntToPtr:
6217 case Instruction::SIToFP:
6218 case Instruction::UIToFP:
6219 case Instruction::Trunc:
6220 case Instruction::FPTrunc:
6221 case Instruction::BitCast: {
6222 Type *SrcTy = VL0->getOperand(0)->getType();
6223 for (Value *V : VL) {
6224 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6225 if (Ty != SrcTy || !isValidElementType(Ty)) {
6226 LLVM_DEBUG(
6227 dbgs() << "SLP: Gathering casts with different src types.\n");
6228 return TreeEntry::NeedToGather;
6229 }
6230 }
6231 return TreeEntry::Vectorize;
6232 }
6233 case Instruction::ICmp:
6234 case Instruction::FCmp: {
6235 // Check that all of the compares have the same predicate.
6236 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6238 Type *ComparedTy = VL0->getOperand(0)->getType();
6239 for (Value *V : VL) {
6240 CmpInst *Cmp = cast<CmpInst>(V);
6241 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6242 Cmp->getOperand(0)->getType() != ComparedTy) {
6243 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6244 return TreeEntry::NeedToGather;
6245 }
6246 }
6247 return TreeEntry::Vectorize;
6248 }
6249 case Instruction::Select:
6250 case Instruction::FNeg:
6251 case Instruction::Add:
6252 case Instruction::FAdd:
6253 case Instruction::Sub:
6254 case Instruction::FSub:
6255 case Instruction::Mul:
6256 case Instruction::FMul:
6257 case Instruction::UDiv:
6258 case Instruction::SDiv:
6259 case Instruction::FDiv:
6260 case Instruction::URem:
6261 case Instruction::SRem:
6262 case Instruction::FRem:
6263 case Instruction::Shl:
6264 case Instruction::LShr:
6265 case Instruction::AShr:
6266 case Instruction::And:
6267 case Instruction::Or:
6268 case Instruction::Xor:
6269 return TreeEntry::Vectorize;
6270 case Instruction::GetElementPtr: {
6271 // We don't combine GEPs with complicated (nested) indexing.
6272 for (Value *V : VL) {
6273 auto *I = dyn_cast<GetElementPtrInst>(V);
6274 if (!I)
6275 continue;
6276 if (I->getNumOperands() != 2) {
6277 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6278 return TreeEntry::NeedToGather;
6279 }
6280 }
6281
6282 // We can't combine several GEPs into one vector if they operate on
6283 // different types.
6284 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6285 for (Value *V : VL) {
6286 auto *GEP = dyn_cast<GEPOperator>(V);
6287 if (!GEP)
6288 continue;
6289 Type *CurTy = GEP->getSourceElementType();
6290 if (Ty0 != CurTy) {
6291 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6292 return TreeEntry::NeedToGather;
6293 }
6294 }
6295
6296 // We don't combine GEPs with non-constant indexes.
6297 Type *Ty1 = VL0->getOperand(1)->getType();
6298 for (Value *V : VL) {
6299 auto *I = dyn_cast<GetElementPtrInst>(V);
6300 if (!I)
6301 continue;
6302 auto *Op = I->getOperand(1);
6303 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6304 (Op->getType() != Ty1 &&
6305 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6306 Op->getType()->getScalarSizeInBits() >
6307 DL->getIndexSizeInBits(
6308 V->getType()->getPointerAddressSpace())))) {
6309 LLVM_DEBUG(
6310 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6311 return TreeEntry::NeedToGather;
6312 }
6313 }
6314
6315 return TreeEntry::Vectorize;
6316 }
6317 case Instruction::Store: {
6318 // Check if the stores are consecutive or if we need to swizzle them.
6319 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6320 // Avoid types that are padded when being allocated as scalars, while
6321 // being packed together in a vector (such as i1).
6322 if (DL->getTypeSizeInBits(ScalarTy) !=
6323 DL->getTypeAllocSizeInBits(ScalarTy)) {
6324 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6325 return TreeEntry::NeedToGather;
6326 }
6327 // Make sure all stores in the bundle are simple - we can't vectorize
6328 // atomic or volatile stores.
6329 for (Value *V : VL) {
6330 auto *SI = cast<StoreInst>(V);
6331 if (!SI->isSimple()) {
6332 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6333 return TreeEntry::NeedToGather;
6334 }
6335 PointerOps.push_back(SI->getPointerOperand());
6336 }
6337
6338 // Check the order of pointer operands.
6339 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
6340 Value *Ptr0;
6341 Value *PtrN;
6342 if (CurrentOrder.empty()) {
6343 Ptr0 = PointerOps.front();
6344 PtrN = PointerOps.back();
6345 } else {
6346 Ptr0 = PointerOps[CurrentOrder.front()];
6347 PtrN = PointerOps[CurrentOrder.back()];
6348 }
6349 std::optional<int> Dist =
6350 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6351 // Check that the sorted pointer operands are consecutive.
6352 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6353 return TreeEntry::Vectorize;
6354 }
6355
6356 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6357 return TreeEntry::NeedToGather;
6358 }
6359 case Instruction::Call: {
6360 // Check if the calls are all to the same vectorizable intrinsic or
6361 // library function.
6362 CallInst *CI = cast<CallInst>(VL0);
6363 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6364
6365 VFShape Shape = VFShape::get(
6366 CI->getFunctionType(),
6367 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
6368 false /*HasGlobalPred*/);
6369 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6370
6371 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6372 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6373 return TreeEntry::NeedToGather;
6374 }
6375 Function *F = CI->getCalledFunction();
6376 unsigned NumArgs = CI->arg_size();
6377 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6378 for (unsigned J = 0; J != NumArgs; ++J)
6379 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
6380 ScalarArgs[J] = CI->getArgOperand(J);
6381 for (Value *V : VL) {
6382 CallInst *CI2 = dyn_cast<CallInst>(V);
6383 if (!CI2 || CI2->getCalledFunction() != F ||
6384 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
6385 (VecFunc &&
6386 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6388 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6389 << "\n");
6390 return TreeEntry::NeedToGather;
6391 }
6392 // Some intrinsics have scalar arguments, and these must be the same for
6393 // the calls to be vectorized.
6394 for (unsigned J = 0; J != NumArgs; ++J) {
6395 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
6396 Value *A1J = CI2->getArgOperand(J);
6397 if (ScalarArgs[J] != A1J) {
6399 << "SLP: mismatched arguments in call:" << *CI
6400 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6401 return TreeEntry::NeedToGather;
6402 }
6403 }
6404 }
6405 // Verify that the bundle operands are identical between the two calls.
6406 if (CI->hasOperandBundles() &&
6407 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6408 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6409 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6410 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6411 << "!=" << *V << '\n');
6412 return TreeEntry::NeedToGather;
6413 }
6414 }
6415
6416 return TreeEntry::Vectorize;
6417 }
6418 case Instruction::ShuffleVector: {
6419 // If this is not an alternate sequence of opcode like add-sub
6420 // then do not vectorize this instruction.
6421 if (!S.isAltShuffle()) {
6422 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6423 return TreeEntry::NeedToGather;
6424 }
6425 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6426 LLVM_DEBUG(
6427 dbgs()
6428 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6429 "the whole alt sequence is not profitable.\n");
6430 return TreeEntry::NeedToGather;
6431 }
6432
6433 return TreeEntry::Vectorize;
6434 }
6435 default:
6436 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6437 return TreeEntry::NeedToGather;
6438 }
6439}
6440
6441namespace {
6442 /// Correctly handles operands of phi nodes, based on the \p Main PHINode's
6443 /// order of incoming basic blocks/values.
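/// For a small number of incoming blocks (up to FastLimit) the operands are
/// collected directly; above that threshold the incoming blocks are bucketed
/// into a map first, to avoid repeated per-value incoming-block lookups.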
6444class PHIHandler {
6445 DominatorTree &DT;
6446 PHINode *Main = nullptr;
6447 SmallVector<Value *> Phis;
6448 SmallVector<SmallVector<Value *>> Operands;
6449
6450public:
6451 PHIHandler() = delete;
6452 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
6453 : DT(DT), Main(Main), Phis(Phis),
6454 Operands(Main->getNumIncomingValues(),
6455 SmallVector<Value *>(Phis.size(), nullptr)) {}
6456 void buildOperands() {
6457 constexpr unsigned FastLimit = 4;
6458 if (Main->getNumIncomingValues() <= FastLimit) {
6459 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6460 BasicBlock *InBB = Main->getIncomingBlock(I);
6461 if (!DT.isReachableFromEntry(InBB)) {
6462 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6463 continue;
6464 }
6465 // Prepare the operand vector.
6466 for (auto [Idx, V] : enumerate(Phis)) {
6467 auto *P = cast<PHINode>(V);
6468 if (P->getIncomingBlock(I) == InBB)
6469 Operands[I][Idx] = P->getIncomingValue(I);
6470 else
6471 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
6472 }
6473 }
6474 return;
6475 }
6476 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
6477 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6478 BasicBlock *InBB = Main->getIncomingBlock(I);
6479 if (!DT.isReachableFromEntry(InBB)) {
6480 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6481 continue;
6482 }
6483 Blocks.try_emplace(InBB).first->second.push_back(I);
6484 }
6485 for (auto [Idx, V] : enumerate(Phis)) {
6486 auto *P = cast<PHINode>(V);
6487 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
6488 BasicBlock *InBB = P->getIncomingBlock(I);
6489 if (InBB == Main->getIncomingBlock(I)) {
6490 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
6491 continue;
6492 Operands[I][Idx] = P->getIncomingValue(I);
6493 continue;
6494 }
6495 auto It = Blocks.find(InBB);
6496 if (It == Blocks.end())
6497 continue;
6498 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
6499 }
6500 }
6501 for (const auto &P : Blocks) {
6502 if (P.getSecond().size() <= 1)
6503 continue;
6504 unsigned BasicI = P.getSecond().front();
6505 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
6506 assert(all_of(enumerate(Operands[I]),
6507 [&](const auto &Data) {
6508 return !Data.value() ||
6509 Data.value() == Operands[BasicI][Data.index()];
6510 }) &&
6511 "Expected empty operands list.");
6512 Operands[I] = Operands[BasicI];
6513 }
6514 }
6515 }
6516 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
6517};
6518} // namespace
6519
6520void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6521 const EdgeInfo &UserTreeIdx) {
6522 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6523
6524 SmallVector<int> ReuseShuffleIndices;
6525 SmallVector<Value *> UniqueValues;
6526 SmallVector<Value *> NonUniqueValueVL;
6527 auto TryToFindDuplicates = [&](const InstructionsState &S,
6528 bool DoNotFail = false) {
6529 // Check that every instruction appears once in this bundle.
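// E.g. a bundle of repeated instructions {a, b, a, b} is reduced to the
// unique scalars {a, b} with ReuseShuffleIndices = {0, 1, 0, 1}.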
6530 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6531 for (Value *V : VL) {
6532 if (isConstant(V)) {
6533 ReuseShuffleIndices.emplace_back(
6534 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6535 UniqueValues.emplace_back(V);
6536 continue;
6537 }
6538 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6539 ReuseShuffleIndices.emplace_back(Res.first->second);
6540 if (Res.second)
6541 UniqueValues.emplace_back(V);
6542 }
6543 size_t NumUniqueScalarValues = UniqueValues.size();
6544 if (NumUniqueScalarValues == VL.size()) {
6545 ReuseShuffleIndices.clear();
6546 } else {
6547 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6548 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6549 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6550 "for nodes with padding.\n");
6551 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6552 return false;
6553 }
6554 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6555 if (NumUniqueScalarValues <= 1 ||
6556 (UniquePositions.size() == 1 && all_of(UniqueValues,
6557 [](Value *V) {
6558 return isa<UndefValue>(V) ||
6559 !isConstant(V);
6560 })) ||
6561 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6562 if (DoNotFail && UniquePositions.size() > 1 &&
6563 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6564 all_of(UniqueValues, [=](Value *V) {
6565 return isa<ExtractElementInst>(V) ||
6566 areAllUsersVectorized(cast<Instruction>(V),
6567 UserIgnoreList);
6568 })) {
6569 unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6570 if (PWSz == VL.size()) {
6571 ReuseShuffleIndices.clear();
6572 } else {
6573 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6574 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6575 UniqueValues.back());
6576 VL = NonUniqueValueVL;
6577 }
6578 return true;
6579 }
6580 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6581 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6582 return false;
6583 }
6584 VL = UniqueValues;
6585 }
6586 return true;
6587 };
6588
6589 InstructionsState S = getSameOpcode(VL, *TLI);
6590
6591 // Don't vectorize ephemeral values.
6592 if (!EphValues.empty()) {
6593 for (Value *V : VL) {
6594 if (EphValues.count(V)) {
6595 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6596 << ") is ephemeral.\n");
6597 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6598 return;
6599 }
6600 }
6601 }
6602
6603 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6604 // a load), in which case peek through to include it in the tree, without
6605 // ballooning over-budget.
6606 if (Depth >= RecursionMaxDepth &&
6607 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6608 VL.size() >= 4 &&
6609 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6610 return match(I,
6611 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
6612 cast<Instruction>(I)->getOpcode() ==
6613 cast<Instruction>(S.MainOp)->getOpcode();
6614 })))) {
6615 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6616 if (TryToFindDuplicates(S))
6617 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6618 ReuseShuffleIndices);
6619 return;
6620 }
6621
6622 // Don't handle scalable vectors
6623 if (S.getOpcode() == Instruction::ExtractElement &&
6624 isa<ScalableVectorType>(
6625 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6626 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6627 if (TryToFindDuplicates(S))
6628 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6629 ReuseShuffleIndices);
6630 return;
6631 }
6632
6633 // Don't handle vectors.
6634 if (S.OpValue->getType()->isVectorTy() &&
6635 !isa<InsertElementInst>(S.OpValue)) {
6636 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6637 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6638 return;
6639 }
6640
6641 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6642 if (SI->getValueOperand()->getType()->isVectorTy()) {
6643 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6644 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6645 return;
6646 }
6647
6648 // If all of the operands are identical or constant we have a simple solution.
6649 // If we deal with insert/extract instructions, they all must have constant
6650 // indices, otherwise we should gather them, not try to vectorize.
6651 // If this is an alternate-opcode node with 2 elements and gathered operands,
6652 // do not vectorize.
6653 auto &&NotProfitableForVectorization = [&S, this,
6654 Depth](ArrayRef<Value *> VL) {
6655 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6656 return false;
6657 if (VectorizableTree.size() < MinTreeSize)
6658 return false;
6659 if (Depth >= RecursionMaxDepth - 1)
6660 return true;
6661 // Check if all operands are extracts, part of a vector node, or can build a
6662 // regular vectorizable node.
6663 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6664 for (Value *V : VL) {
6665 auto *I = cast<Instruction>(V);
6666 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6667 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6668 }));
6669 }
6670 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6671 if ((IsCommutative &&
6672 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6673 (!IsCommutative &&
6674 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6675 return true;
6676 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6677 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6678 auto *I1 = cast<Instruction>(VL.front());
6679 auto *I2 = cast<Instruction>(VL.back());
6680 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6681 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6682 I2->getOperand(Op));
6683 if (static_cast<unsigned>(count_if(
6684 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6685 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
6686 })) >= S.MainOp->getNumOperands() / 2)
6687 return false;
6688 if (S.MainOp->getNumOperands() > 2)
6689 return true;
6690 if (IsCommutative) {
6691 // Check permuted operands.
6692 Candidates.clear();
6693 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6694 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6695 I2->getOperand((Op + 1) % E));
6696 if (any_of(
6697 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6698 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
6699 }))
6700 return false;
6701 }
6702 return true;
6703 };
6704 SmallVector<unsigned> SortedIndices;
6705 BasicBlock *BB = nullptr;
6706 bool IsScatterVectorizeUserTE =
6707 UserTreeIdx.UserTE &&
6708 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6709 bool AreAllSameInsts =
6710 (S.getOpcode() && allSameBlock(VL)) ||
6711 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6712 VL.size() > 2 &&
6713 all_of(VL,
6714 [&BB](Value *V) {
6715 auto *I = dyn_cast<GetElementPtrInst>(V);
6716 if (!I)
6717 return doesNotNeedToBeScheduled(V);
6718 if (!BB)
6719 BB = I->getParent();
6720 return BB == I->getParent() && I->getNumOperands() == 2;
6721 }) &&
6722 BB &&
6723 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6724 SortedIndices));
6725 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6726 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6727 S.OpValue) &&
6728 !all_of(VL, isVectorLikeInstWithConstOps)) ||
6729 NotProfitableForVectorization(VL)) {
6730 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6731 if (TryToFindDuplicates(S))
6732 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6733 ReuseShuffleIndices);
6734 return;
6735 }
6736
6737 // We now know that this is a vector of instructions of the same type from
6738 // the same block.
6739
6740 // Check if this is a duplicate of another entry.
6741 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6742 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6743 if (!E->isSame(VL)) {
6744 auto It = MultiNodeScalars.find(S.OpValue);
6745 if (It != MultiNodeScalars.end()) {
6746 auto *TEIt = find_if(It->getSecond(),
6747 [&](TreeEntry *ME) { return ME->isSame(VL); });
6748 if (TEIt != It->getSecond().end())
6749 E = *TEIt;
6750 else
6751 E = nullptr;
6752 } else {
6753 E = nullptr;
6754 }
6755 }
6756 if (!E) {
6757 if (!doesNotNeedToBeScheduled(S.OpValue)) {
6758 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6759 if (TryToFindDuplicates(S))
6760 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6761 ReuseShuffleIndices);
6762 return;
6763 }
6764 } else {
6765 // Record the reuse of the tree node. FIXME, currently this is only used
6766 // to properly draw the graph rather than for the actual vectorization.
6767 E->UserTreeIndices.push_back(UserTreeIdx);
6768 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6769 << ".\n");
6770 return;
6771 }
6772 }
6773
6774 // Check that none of the instructions in the bundle are already in the tree.
6775 for (Value *V : VL) {
6776 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6777 doesNotNeedToBeScheduled(V))
6778 continue;
6779 if (getTreeEntry(V)) {
6780 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6781 << ") is already in tree.\n");
6782 if (TryToFindDuplicates(S))
6783 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6784 ReuseShuffleIndices);
6785 return;
6786 }
6787 }
6788
6789 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6790 if (UserIgnoreList && !UserIgnoreList->empty()) {
6791 for (Value *V : VL) {
6792 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6793 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6794 if (TryToFindDuplicates(S))
6795 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6796 ReuseShuffleIndices);
6797 return;
6798 }
6799 }
6800 }
6801
6802 // Special processing for sorted pointers for ScatterVectorize node with
6803 // constant indices only.
6804 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6805 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6806 !(S.getOpcode() && allSameBlock(VL))) {
6807 assert(S.OpValue->getType()->isPointerTy() &&
6808 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6809 "Expected pointers only.");
6810 // Reset S to make it GetElementPtr kind of node.
6811 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
6812 assert(It != VL.end() && "Expected at least one GEP.");
6813 S = getSameOpcode(*It, *TLI);
6814 }
6815
6816 // Check that all of the users of the scalars that we want to vectorize are
6817 // schedulable.
6818 auto *VL0 = cast<Instruction>(S.OpValue);
6819 BB = VL0->getParent();
6820
6821 if (!DT->isReachableFromEntry(BB)) {
6822 // Don't go into unreachable blocks. They may contain instructions with
6823 // dependency cycles which confuse the final scheduling.
6824 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6825 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6826 return;
6827 }
6828
6829 // Don't go into catchswitch blocks, which can happen with PHIs.
6830 // Such blocks can only have PHIs and the catchswitch. There is no
6831 // place to insert a shuffle if we need to, so just avoid that issue.
6832 if (isa<CatchSwitchInst>(BB->getTerminator())) {
6833 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
6834 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6835 return;
6836 }
6837
6838 // Check that every instruction appears once in this bundle.
6839 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
6840 return;
6841
6842 // Perform specific checks for each particular instruction kind.
6843 OrdersType CurrentOrder;
6844 SmallVector<Value *> PointerOps;
6845 TreeEntry::EntryState State = getScalarsVectorizationState(
6846 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6847 if (State == TreeEntry::NeedToGather) {
6848 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6849 ReuseShuffleIndices);
6850 return;
6851 }
6852
6853 auto &BSRef = BlocksSchedules[BB];
6854 if (!BSRef)
6855 BSRef = std::make_unique<BlockScheduling>(BB);
6856
6857 BlockScheduling &BS = *BSRef;
6858
6859 std::optional<ScheduleData *> Bundle =
6860 BS.tryScheduleBundle(UniqueValues, this, S);
6861#ifdef EXPENSIVE_CHECKS
6862 // Make sure we didn't break any internal invariants
6863 BS.verify();
6864#endif
6865 if (!Bundle) {
6866 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
6867 assert((!BS.getScheduleData(VL0) ||
6868 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6869 "tryScheduleBundle should cancelScheduling on failure");
6870 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6871 ReuseShuffleIndices);
6872 NonScheduledFirst.insert(VL.front());
6873 return;
6874 }
6875 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
6876
6877 unsigned ShuffleOrOp = S.isAltShuffle() ?
6878 (unsigned) Instruction::ShuffleVector : S.getOpcode();
6879 switch (ShuffleOrOp) {
6880 case Instruction::PHI: {
6881 auto *PH = cast<PHINode>(VL0);
6882
6883 TreeEntry *TE =
6884 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
6885 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
6886
6887 // Keeps the reordered operands to avoid code duplication.
6888 PHIHandler Handler(*DT, PH, VL);
6889 Handler.buildOperands();
6890 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
6891 TE->setOperand(I, Handler.getOperands(I));
6892 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
6893 buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I});
6894 return;
6895 }
6896 case Instruction::ExtractValue:
6897 case Instruction::ExtractElement: {
6898 if (CurrentOrder.empty()) {
6899 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
6900 } else {
6901 LLVM_DEBUG({
6902 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
6903 "with order";
6904 for (unsigned Idx : CurrentOrder)
6905 dbgs() << " " << Idx;
6906 dbgs() << "\n";
6907 });
6908 fixupOrderingIndices(CurrentOrder);
6909 }
6910 // Insert new order with initial value 0, if it does not exist,
6911 // otherwise return the iterator to the existing one.
6912 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6913 ReuseShuffleIndices, CurrentOrder);
6914 // This is a special case, as it does not gather, but at the same time
6915 // we are not extending buildTree_rec() towards the operands.
6916 ValueList Op0;
6917 Op0.assign(VL.size(), VL0->getOperand(0));
6918 VectorizableTree.back()->setOperand(0, Op0);
6919 return;
6920 }
6921 case Instruction::InsertElement: {
6922 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
6923
6924 auto OrdCompare = [](const std::pair<int, int> &P1,
6925 const std::pair<int, int> &P2) {
6926 return P1.first > P2.first;
6927 };
6928 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
6929 decltype(OrdCompare)>
6930 Indices(OrdCompare);
6931 for (int I = 0, E = VL.size(); I < E; ++I) {
6932 unsigned Idx = *getInsertIndex(VL[I]);
6933 Indices.emplace(Idx, I);
6934 }
6935 OrdersType CurrentOrder(VL.size(), VL.size());
6936 bool IsIdentity = true;
6937 for (int I = 0, E = VL.size(); I < E; ++I) {
6938 CurrentOrder[Indices.top().second] = I;
6939 IsIdentity &= Indices.top().second == I;
6940 Indices.pop();
6941 }
6942 if (IsIdentity)
6943 CurrentOrder.clear();
6944 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6945 std::nullopt, CurrentOrder);
6946 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
6947
6948 TE->setOperandsInOrder();
6949 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
6950 return;
6951 }
6952 case Instruction::Load: {
6953 // Check that a vectorized load would load the same memory as a scalar
6954 // load. For example, we don't want to vectorize loads that are smaller
6955 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6956 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6957 // from such a struct, we read/write packed bits disagreeing with the
6958 // unvectorized version.
6959 TreeEntry *TE = nullptr;
6960 fixupOrderingIndices(CurrentOrder);
6961 switch (State) {
6962 case TreeEntry::Vectorize:
6963 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6964 ReuseShuffleIndices, CurrentOrder);
6965 if (CurrentOrder.empty())
6966 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
6967 else
6968 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
6969 TE->setOperandsInOrder();
6970 break;
6971 case TreeEntry::StridedVectorize:
6972 // Vectorizing loads with a constant stride.
6973 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6974 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
6975 TE->setOperandsInOrder();
6976 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
6977 break;
6978 case TreeEntry::ScatterVectorize:
6979 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
6980 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6981 UserTreeIdx, ReuseShuffleIndices);
6982 TE->setOperandsInOrder();
6983 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
6984 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
6985 break;
6986 case TreeEntry::NeedToGather:
6987 llvm_unreachable("Unexpected loads state.");
6988 }
6989 return;
6990 }
6991 case Instruction::ZExt:
6992 case Instruction::SExt:
6993 case Instruction::FPToUI:
6994 case Instruction::FPToSI:
6995 case Instruction::FPExt:
6996 case Instruction::PtrToInt:
6997 case Instruction::IntToPtr:
6998 case Instruction::SIToFP:
6999 case Instruction::UIToFP:
7000 case Instruction::Trunc:
7001 case Instruction::FPTrunc:
7002 case Instruction::BitCast: {
7003 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
7004 std::make_pair(std::numeric_limits<unsigned>::min(),
7005 std::numeric_limits<unsigned>::max()));
7006 if (ShuffleOrOp == Instruction::ZExt ||
7007 ShuffleOrOp == Instruction::SExt) {
7008 CastMaxMinBWSizes = std::make_pair(
7009 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7010 PrevMaxBW),
7011 std::min<unsigned>(
7012 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7013 PrevMinBW));
7014 } else if (ShuffleOrOp == Instruction::Trunc) {
7015 CastMaxMinBWSizes = std::make_pair(
7016 std::max<unsigned>(
7017 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7018 PrevMaxBW),
7019 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7020 PrevMinBW));
7021 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7022 } else if (ShuffleOrOp == Instruction::SIToFP ||
7023 ShuffleOrOp == Instruction::UIToFP) {
7024 unsigned NumSignBits =
7025 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7026 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7027 APInt Mask = DB->getDemandedBits(OpI);
7028 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
7029 }
7030 if (NumSignBits * 2 >=
7031 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7032 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7033 }
7034 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7035 ReuseShuffleIndices);
7036 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
7037
7038 TE->setOperandsInOrder();
7039 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7040 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7041 return;
7042 }
7043 case Instruction::ICmp:
7044 case Instruction::FCmp: {
7045 // Check that all of the compares have the same predicate.
7046 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7047 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7048 ReuseShuffleIndices);
7049 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
7050
7051 ValueList Left, Right;
7052 if (cast<CmpInst>(VL0)->isCommutative()) {
7053 // Commutative predicate - collect + sort operands of the instructions
7054 // so that each side is more likely to have the same opcode.
7056 "Commutative Predicate mismatch");
7057 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7058 } else {
7059 // Collect operands - commute if it uses the swapped predicate.
7060 for (Value *V : VL) {
7061 auto *Cmp = cast<CmpInst>(V);
7062 Value *LHS = Cmp->getOperand(0);
7063 Value *RHS = Cmp->getOperand(1);
7064 if (Cmp->getPredicate() != P0)
7065 std::swap(LHS, RHS);
7066 Left.push_back(LHS);
7067 Right.push_back(RHS);
7068 }
7069 }
7070 TE->setOperand(0, Left);
7071 TE->setOperand(1, Right);
7072 buildTree_rec(Left, Depth + 1, {TE, 0});
7073 buildTree_rec(Right, Depth + 1, {TE, 1});
7074 if (ShuffleOrOp == Instruction::ICmp) {
7075 unsigned NumSignBits0 =
7076 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7077 if (NumSignBits0 * 2 >=
7078 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7079 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
7080 unsigned NumSignBits1 =
7081 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
7082 if (NumSignBits1 * 2 >=
7083 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
7084 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
7085 }
7086 return;
7087 }
7088 case Instruction::Select:
7089 case Instruction::FNeg:
7090 case Instruction::Add:
7091 case Instruction::FAdd:
7092 case Instruction::Sub:
7093 case Instruction::FSub:
7094 case Instruction::Mul:
7095 case Instruction::FMul:
7096 case Instruction::UDiv:
7097 case Instruction::SDiv:
7098 case Instruction::FDiv:
7099 case Instruction::URem:
7100 case Instruction::SRem:
7101 case Instruction::FRem:
7102 case Instruction::Shl:
7103 case Instruction::LShr:
7104 case Instruction::AShr:
7105 case Instruction::And:
7106 case Instruction::Or:
7107 case Instruction::Xor: {
7108 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7109 ReuseShuffleIndices);
7110 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
7111
7112 // Sort operands of the instructions so that each side is more likely to
7113 // have the same opcode.
7114 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
7115 ValueList Left, Right;
7116 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7117 TE->setOperand(0, Left);
7118 TE->setOperand(1, Right);
7119 buildTree_rec(Left, Depth + 1, {TE, 0});
7120 buildTree_rec(Right, Depth + 1, {TE, 1});
7121 return;
7122 }
7123
7124 TE->setOperandsInOrder();
7125 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7126 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7127 return;
7128 }
7129 case Instruction::GetElementPtr: {
7130 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7131 ReuseShuffleIndices);
7132 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
7133 SmallVector<ValueList, 2> Operands(2);
7134 // Prepare the operand vector for pointer operands.
7135 for (Value *V : VL) {
7136 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7137 if (!GEP) {
7138 Operands.front().push_back(V);
7139 continue;
7140 }
7141 Operands.front().push_back(GEP->getPointerOperand());
7142 }
7143 TE->setOperand(0, Operands.front());
7144 // Need to cast all indices to the same type before vectorization to
7145 // avoid a crash.
7146 // Required to be able to find correct matches between different gather
7147 // nodes and reuse the vectorized values rather than trying to gather them
7148 // again.
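// E.g. if one GEP uses an i32 constant index and another uses an i64 one,
// the constants below are folded to the common index type chosen here.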
7149 int IndexIdx = 1;
7150 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7151 Type *Ty = all_of(VL,
7152 [VL0Ty, IndexIdx](Value *V) {
7153 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7154 if (!GEP)
7155 return true;
7156 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
7157 })
7158 ? VL0Ty
7159 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
7160 ->getPointerOperandType()
7161 ->getScalarType());
7162 // Prepare the operand vector.
7163 for (Value *V : VL) {
7164 auto *I = dyn_cast<GetElementPtrInst>(V);
7165 if (!I) {
7166 Operands.back().push_back(
7167 ConstantInt::get(Ty, 0, /*isSigned=*/false));
7168 continue;
7169 }
7170 auto *Op = I->getOperand(IndexIdx);
7171 auto *CI = dyn_cast<ConstantInt>(Op);
7172 if (!CI)
7173 Operands.back().push_back(Op);
7174 else
7175 Operands.back().push_back(ConstantFoldIntegerCast(
7176 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7177 }
7178 TE->setOperand(IndexIdx, Operands.back());
7179
7180 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7181 buildTree_rec(Operands[I], Depth + 1, {TE, I});
7182 return;
7183 }
7184 case Instruction::Store: {
7185 bool Consecutive = CurrentOrder.empty();
7186 if (!Consecutive)
7187 fixupOrderingIndices(CurrentOrder);
7188 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7189 ReuseShuffleIndices, CurrentOrder);
7190 TE->setOperandsInOrder();
7191 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
7192 if (Consecutive)
7193 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7194 else
7195 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7196 return;
7197 }
7198 case Instruction::Call: {
7199 // Check if the calls are all to the same vectorizable intrinsic or
7200 // library function.
7201 CallInst *CI = cast<CallInst>(VL0);
7202 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7203
7204 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7205 ReuseShuffleIndices);
7206 // Sort operands of the instructions so that each side is more likely to
7207 // have the same opcode.
7208 if (isCommutative(VL0)) {
7209 ValueList Left, Right;
7210 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7211 TE->setOperand(0, Left);
7212 TE->setOperand(1, Right);
7213 SmallVector<ValueList> Operands;
7214 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7215 Operands.emplace_back();
7216 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7217 continue;
7218 for (Value *V : VL) {
7219 auto *CI2 = cast<CallInst>(V);
7220 Operands.back().push_back(CI2->getArgOperand(I));
7221 }
7222 TE->setOperand(I, Operands.back());
7223 }
7224 buildTree_rec(Left, Depth + 1, {TE, 0});
7225 buildTree_rec(Right, Depth + 1, {TE, 1});
7226 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7227 if (Operands[I - 2].empty())
7228 continue;
7229 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7230 }
7231 return;
7232 }
7233 TE->setOperandsInOrder();
7234 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7235 // For scalar operands there is no need to create an entry, since there is
7236 // nothing to vectorize.
7237 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7238 continue;
7239 ValueList Operands;
7240 // Prepare the operand vector.
7241 for (Value *V : VL) {
7242 auto *CI2 = cast<CallInst>(V);
7243 Operands.push_back(CI2->getArgOperand(I));
7244 }
7245 buildTree_rec(Operands, Depth + 1, {TE, I});
7246 }
7247 return;
7248 }
7249 case Instruction::ShuffleVector: {
7250 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7251 ReuseShuffleIndices);
7252 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7253
7254 // Reorder operands if reordering would enable vectorization.
7255 auto *CI = dyn_cast<CmpInst>(VL0);
7256 if (isa<BinaryOperator>(VL0) || CI) {
7257 ValueList Left, Right;
7258 if (!CI || all_of(VL, [](Value *V) {
7259 return cast<CmpInst>(V)->isCommutative();
7260 })) {
7261 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7262 } else {
7263 auto *MainCI = cast<CmpInst>(S.MainOp);
7264 auto *AltCI = cast<CmpInst>(S.AltOp);
7265 CmpInst::Predicate MainP = MainCI->getPredicate();
7266 CmpInst::Predicate AltP = AltCI->getPredicate();
7267 assert(MainP != AltP &&
7268 "Expected different main/alternate predicates.");
7269 // Collect operands - commute if it uses the swapped predicate or
7270 // alternate operation.
7271 for (Value *V : VL) {
7272 auto *Cmp = cast<CmpInst>(V);
7273 Value *LHS = Cmp->getOperand(0);
7274 Value *RHS = Cmp->getOperand(1);
7275
7276 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7277 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7278 std::swap(LHS, RHS);
7279 } else {
7280 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7281 std::swap(LHS, RHS);
7282 }
7283 Left.push_back(LHS);
7284 Right.push_back(RHS);
7285 }
7286 }
7287 TE->setOperand(0, Left);
7288 TE->setOperand(1, Right);
7289 buildTree_rec(Left, Depth + 1, {TE, 0});
7290 buildTree_rec(Right, Depth + 1, {TE, 1});
7291 return;
7292 }
7293
7294 TE->setOperandsInOrder();
7295 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7296 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7297 return;
7298 }
7299 default:
7300 break;
7301 }
7302 llvm_unreachable("Unexpected vectorization of the instructions.");
7303}
7304
7305 unsigned BoUpSLP::canMapToVector(Type *T) const {
7306 unsigned N = 1;
7307 Type *EltTy = T;
7308
7309 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7310 if (auto *ST = dyn_cast<StructType>(EltTy)) {
7311 // Check that struct is homogeneous.
7312 for (const auto *Ty : ST->elements())
7313 if (Ty != *ST->element_begin())
7314 return 0;
7315 N *= ST->getNumElements();
7316 EltTy = *ST->element_begin();
7317 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7318 N *= AT->getNumElements();
7319 EltTy = AT->getElementType();
7320 } else {
7321 auto *VT = cast<FixedVectorType>(EltTy);
7322 N *= VT->getNumElements();
7323 EltTy = VT->getElementType();
7324 }
7325 }
7326
7327 if (!isValidElementType(EltTy))
7328 return 0;
7329 uint64_t VTSize = DL->getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
7330 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7331 VTSize != DL->getTypeStoreSizeInBits(T))
7332 return 0;
7333 return N;
7334}
7335
7336bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7337 SmallVectorImpl<unsigned> &CurrentOrder,
7338 bool ResizeAllowed) const {
7339 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7340 assert(It != VL.end() && "Expected at least one extract instruction.");
7341 auto *E0 = cast<Instruction>(*It);
7342 assert(
7343 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7344 "Invalid opcode");
7345 // Check if all of the extracts come from the same vector and from the
7346 // correct offset.
7347 Value *Vec = E0->getOperand(0);
7348
7349 CurrentOrder.clear();
7350
7351 // We have to extract from a vector/aggregate with the same number of elements.
7352 unsigned NElts;
7353 if (E0->getOpcode() == Instruction::ExtractValue) {
7354 NElts = canMapToVector(Vec->getType());
7355 if (!NElts)
7356 return false;
7357 // Check if load can be rewritten as load of vector.
7358 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7359 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7360 return false;
7361 } else {
7362 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7363 }
7364
7365 unsigned E = VL.size();
7366 if (!ResizeAllowed && NElts != E)
7367 return false;
7368 SmallVector<int> Indices(E, PoisonMaskElem);
7369 unsigned MinIdx = NElts, MaxIdx = 0;
7370 for (auto [I, V] : enumerate(VL)) {
7371 auto *Inst = dyn_cast<Instruction>(V);
7372 if (!Inst)
7373 continue;
7374 if (Inst->getOperand(0) != Vec)
7375 return false;
7376 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7377 if (isa<UndefValue>(EE->getIndexOperand()))
7378 continue;
7379 std::optional<unsigned> Idx = getExtractIndex(Inst);
7380 if (!Idx)
7381 return false;
7382 const unsigned ExtIdx = *Idx;
7383 if (ExtIdx >= NElts)
7384 continue;
7385 Indices[I] = ExtIdx;
7386 if (MinIdx > ExtIdx)
7387 MinIdx = ExtIdx;
7388 if (MaxIdx < ExtIdx)
7389 MaxIdx = ExtIdx;
7390 }
7391 if (MaxIdx - MinIdx + 1 > E)
7392 return false;
7393 if (MaxIdx + 1 <= E)
7394 MinIdx = 0;
7395
7396 // Check that all of the indices extract from the correct offset.
7397 bool ShouldKeepOrder = true;
7398 // Assign to all items the initial value E so we can check if the extract
7399 // instruction index was used already.
7400 // Also, later we can check that all the indices are used and we have a
7401 // consecutive access in the extract instructions, by checking that no
7402 // element of CurrentOrder still has value E.
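// E.g. extracts reading source elements <1, 0, 3, 2> produce
// CurrentOrder = {1, 0, 3, 2}; for identity extracts <0, 1, 2, 3> the order is
// kept and CurrentOrder is cleared below.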
7403 CurrentOrder.assign(E, E);
7404 for (unsigned I = 0; I < E; ++I) {
7405 if (Indices[I] == PoisonMaskElem)
7406 continue;
7407 const unsigned ExtIdx = Indices[I] - MinIdx;
7408 if (CurrentOrder[ExtIdx] != E) {
7409 CurrentOrder.clear();
7410 return false;
7411 }
7412 ShouldKeepOrder &= ExtIdx == I;
7413 CurrentOrder[ExtIdx] = I;
7414 }
7415 if (ShouldKeepOrder)
7416 CurrentOrder.clear();
7417
7418 return ShouldKeepOrder;
7419}
7420
7421bool BoUpSLP::areAllUsersVectorized(
7422 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7423 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7424 all_of(I->users(), [this](User *U) {
7425 return ScalarToTreeEntry.contains(U) ||
7426 isVectorLikeInstWithConstOps(U) ||
7427 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7428 });
7429}
7430
7431static std::pair<InstructionCost, InstructionCost>
7432 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7433 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7434 ArrayRef<Type *> ArgTys) {
7435 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7436
7437 // Calculate the cost of the scalar and vector calls.
7438 FastMathFlags FMF;
7439 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7440 FMF = FPCI->getFastMathFlags();
7441 SmallVector<const Value *> Arguments(CI->args());
7442 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7443 dyn_cast<IntrinsicInst>(CI));
7444 auto IntrinsicCost =
7445 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
7446
7447 auto Shape = VFShape::get(CI->getFunctionType(),
7448 ElementCount::getFixed(VecTy->getNumElements()),
7449 false /*HasGlobalPred*/);
7450 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7451 auto LibCost = IntrinsicCost;
7452 if (!CI->isNoBuiltin() && VecFunc) {
7453 // Calculate the cost of the vector library call.
7454 // If the corresponding vector call is cheaper, return its cost.
7455 LibCost =
7456 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7457 }
7458 return {IntrinsicCost, LibCost};
7459}
7460
7461void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7462 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7463 SmallVectorImpl<Value *> *OpScalars,
7464 SmallVectorImpl<Value *> *AltScalars) const {
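// Lane I selects either the element of the main operation (index Idx) or of
// the alternate operation (index Sz + Idx), honoring any reordering of the
// scalars.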
7465 unsigned Sz = Scalars.size();
7466 Mask.assign(Sz, PoisonMaskElem);
7467 SmallVector<int> OrderMask;
7468 if (!ReorderIndices.empty())
7469 inversePermutation(ReorderIndices, OrderMask);
7470 for (unsigned I = 0; I < Sz; ++I) {
7471 unsigned Idx = I;
7472 if (!ReorderIndices.empty())
7473 Idx = OrderMask[I];
7474 auto *OpInst = cast<Instruction>(Scalars[Idx]);
7475 if (IsAltOp(OpInst)) {
7476 Mask[I] = Sz + Idx;
7477 if (AltScalars)
7478 AltScalars->push_back(OpInst);
7479 } else {
7480 Mask[I] = Idx;
7481 if (OpScalars)
7482 OpScalars->push_back(OpInst);
7483 }
7484 }
7485 if (!ReuseShuffleIndices.empty()) {
7486 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7487 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7488 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7489 });
7490 Mask.swap(NewMask);
7491 }
7492}
7493
7494 static bool isAlternateInstruction(const Instruction *I,
7495 const Instruction *MainOp,
7496 const Instruction *AltOp,
7497 const TargetLibraryInfo &TLI) {
7498 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7499 auto *AltCI = cast<CmpInst>(AltOp);
7500 CmpInst::Predicate MainP = MainCI->getPredicate();
7501 CmpInst::Predicate AltP = AltCI->getPredicate();
7502 assert(MainP != AltP && "Expected different main/alternate predicates.");
7503 auto *CI = cast<CmpInst>(I);
7504 if (isCmpSameOrSwapped(MainCI, CI, TLI))
7505 return false;
7506 if (isCmpSameOrSwapped(AltCI, CI, TLI))
7507 return true;
7508 CmpInst::Predicate P = CI->getPredicate();
7509 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7510
7511 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7512 "CmpInst expected to match either main or alternate predicate or "
7513 "their swap.");
7514 (void)AltP;
7515 return MainP != P && MainP != SwappedP;
7516 }
7517 return I->getOpcode() == AltOp->getOpcode();
7518}
7519
7520TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7521 assert(!Ops.empty());
7522 const auto *Op0 = Ops.front();
7523
7524 const bool IsConstant = all_of(Ops, [](Value *V) {
7525 // TODO: We should allow undef elements here
7526 return isConstant(V) && !isa<UndefValue>(V);
7527 });
7528 const bool IsUniform = all_of(Ops, [=](Value *V) {
7529 // TODO: We should allow undef elements here
7530 return V == Op0;
7531 });
7532 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7533 // TODO: We should allow undef elements here
7534 if (auto *CI = dyn_cast<ConstantInt>(V))
7535 return CI->getValue().isPowerOf2();
7536 return false;
7537 });
7538 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7539 // TODO: We should allow undef elements here
7540 if (auto *CI = dyn_cast<ConstantInt>(V))
7541 return CI->getValue().isNegatedPowerOf2();
7542 return false;
7543 });
7544
7545 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7546 if (IsConstant && IsUniform)
7547 VK = TTI::OK_UniformConstantValue;
7548 else if (IsConstant)
7549 VK = TTI::OK_NonUniformConstantValue;
7550 else if (IsUniform)
7551 VK = TTI::OK_UniformValue;
7552
7553 TTI::OperandValueProperties VP = TTI::OP_None;
7554 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7555 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7556
7557 return {VK, VP};
7558}
7559
7560namespace {
7561/// The base class for shuffle instruction emission and shuffle cost estimation.
7562class BaseShuffleAnalysis {
7563protected:
7564 /// Checks if the mask is an identity mask.
7565 /// \param IsStrict if true, the function returns false if the mask size does
7566 /// not match the vector size.
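/// E.g. the mask <0, 1> applied to a 4-element vector (an extract of the low
/// half) is treated as an identity only when IsStrict is false.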
7567 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7568 bool IsStrict) {
7569 int Limit = Mask.size();
7570 int VF = VecTy->getNumElements();
7571 int Index = -1;
7572 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7573 return true;
7574 if (!IsStrict) {
7575 // Consider extract subvector starting from index 0.
7576 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7577 Index == 0)
7578 return true;
7579 // All VF-size submasks are identity (e.g.
7580 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7581 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7582 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7583 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7584 ShuffleVectorInst::isIdentityMask(Slice, VF);
7585 }))
7586 return true;
7587 }
7588 return false;
7589 }
7590
7591 /// Tries to combine 2 different masks into a single one.
7592 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7593 /// change the size of the vector, \p LocalVF is the original size of the
7594 /// shuffled vector.
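/// E.g. a vector already permuted with Mask <1, 0> that is then shuffled with
/// ExtMask <1, 0> yields the combined mask <0, 1>.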
7595 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7596 ArrayRef<int> ExtMask) {
7597 unsigned VF = Mask.size();
7598 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7599 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7600 if (ExtMask[I] == PoisonMaskElem)
7601 continue;
7602 int MaskedIdx = Mask[ExtMask[I] % VF];
7603 NewMask[I] =
7604 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7605 }
7606 Mask.swap(NewMask);
7607 }
7608
7609 /// Looks through shuffles, trying to reduce the final number of shuffles in
7610 /// the code. The function looks through the previously emitted shuffle
7611 /// instructions and properly marks indices in the mask as undef.
7612 /// For example, given the code
7613 /// \code
7614 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7615 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7616 /// \endcode
7617 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7618 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7619 /// <0, 1, 2, 3> for the shuffle.
7620 /// If the 2 operands are of different sizes, the smaller one will be resized
7621 /// and the mask recalculated properly.
7622 /// For example, given the code
7623 /// \code
7624 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7625 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7626 /// \endcode
7627 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
7628 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7629 /// <0, 1, 2, 3> for the shuffle.
7630 /// So, it tries to transform permutations into a simple vector merge, if
7631 /// possible.
7632 /// \param V The input vector which must be shuffled using the given \p Mask.
7633 /// If the better candidate is found, \p V is set to this best candidate
7634 /// vector.
7635 /// \param Mask The input mask for the shuffle. If the best candidate is found
7636 /// during looking-through-shuffles attempt, it is updated accordingly.
7637 /// \param SinglePermute true if the shuffle operation is originally a
7638 /// single-value-permutation. In this case the look-through-shuffles procedure
7639 /// may look for resizing shuffles as the best candidates.
7640 /// \return true if the shuffle results in the non-resizing identity shuffle
7641 /// (and thus can be ignored), false - otherwise.
7642 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7643 bool SinglePermute) {
7644 Value *Op = V;
7645 ShuffleVectorInst *IdentityOp = nullptr;
7646 SmallVector<int> IdentityMask;
7647 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7648 // Exit if not a fixed vector type or changing size shuffle.
7649 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7650 if (!SVTy)
7651 break;
7652 // Remember the identity or broadcast mask, if it is not a resizing
7653 // shuffle. If no better candidates are found, this Op and Mask will be
7654 // used in the final shuffle.
7655 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7656 if (!IdentityOp || !SinglePermute ||
7657 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7658 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7659 IdentityMask.size()))) {
7660 IdentityOp = SV;
7661 // Store the current mask in IdentityMask so that we do not lose
7662 // this info later, if IdentityOp is selected as the best candidate for
7663 // the permutation.
7664 IdentityMask.assign(Mask);
7665 }
7666 }
7667 // Remember the broadcast mask. If no better candidates are found, this Op
7668 // and Mask will be used in the final shuffle.
7669 // Zero splat can be used as identity too, since it might be used with
7670 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7671 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
7672 // expensive, and the analysis finds out that the source vector is just a
7673 // broadcast, the original mask can be transformed to the identity mask
7674 // <0, 1, 2, 3>.
7675 // \code
7676 // %0 = shuffle %v, poison, zeroinitalizer
7677 // %res = shuffle %0, poison, <3, 1, 2, 0>
7678 // \endcode
7679 // may be transformed to
7680 // \code
7681 // %0 = shuffle %v, poison, zeroinitalizer
7682 // %res = shuffle %0, poison, <0, 1, 2, 3>
7683 // \endcode
7684 if (SV->isZeroEltSplat()) {
7685 IdentityOp = SV;
7686 IdentityMask.assign(Mask);
7687 }
7688 int LocalVF = Mask.size();
7689 if (auto *SVOpTy =
7690 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7691 LocalVF = SVOpTy->getNumElements();
7692 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7693 for (auto [Idx, I] : enumerate(Mask)) {
7694 if (I == PoisonMaskElem ||
7695 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7696 continue;
7697 ExtMask[Idx] = SV->getMaskValue(I);
7698 }
7699 bool IsOp1Undef =
7700 isUndefVector(SV->getOperand(0),
7701 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7702 .all();
7703 bool IsOp2Undef =
7704 isUndefVector(SV->getOperand(1),
7705 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7706 .all();
7707 if (!IsOp1Undef && !IsOp2Undef) {
7708 // Update mask and mark undef elems.
7709 for (int &I : Mask) {
7710 if (I == PoisonMaskElem)
7711 continue;
7712 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
7713 PoisonMaskElem)
7714 I = PoisonMaskElem;
7715 }
7716 break;
7717 }
7718 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7719 SV->getShuffleMask().end());
7720 combineMasks(LocalVF, ShuffleMask, Mask);
7721 Mask.swap(ShuffleMask);
7722 if (IsOp2Undef)
7723 Op = SV->getOperand(0);
7724 else
7725 Op = SV->getOperand(1);
7726 }
7727 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
7728 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7729 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
7730 if (IdentityOp) {
7731 V = IdentityOp;
7732 assert(Mask.size() == IdentityMask.size() &&
7733 "Expected masks of same sizes.");
7734 // Clear known poison elements.
7735 for (auto [I, Idx] : enumerate(Mask))
7736 if (Idx == PoisonMaskElem)
7737 IdentityMask[I] = PoisonMaskElem;
7738 Mask.swap(IdentityMask);
7739 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7740 return SinglePermute &&
7741 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
7742 /*IsStrict=*/true) ||
7743 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7744 Shuffle->isZeroEltSplat() &&
7745 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
7746 }
7747 V = Op;
7748 return false;
7749 }
7750 V = Op;
7751 return true;
7752 }
7753
7754 /// Smart shuffle instruction emission, walks through shuffles trees and
7755 /// tries to find the best matching vector for the actual shuffle
7756 /// instruction.
7757 template <typename T, typename ShuffleBuilderTy>
7758 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7759 ShuffleBuilderTy &Builder) {
7760 assert(V1 && "Expected at least one vector value.");
7761 if (V2)
7762 Builder.resizeToMatch(V1, V2);
7763 int VF = Mask.size();
7764 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
7765 VF = FTy->getNumElements();
7766 if (V2 &&
7767 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
7768 // Peek through shuffles.
7769 Value *Op1 = V1;
7770 Value *Op2 = V2;
7771 int VF =
7772 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7773 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7774 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7775 for (int I = 0, E = Mask.size(); I < E; ++I) {
7776 if (Mask[I] < VF)
7777 CombinedMask1[I] = Mask[I];
7778 else
7779 CombinedMask2[I] = Mask[I] - VF;
7780 }
7781 Value *PrevOp1;
7782 Value *PrevOp2;
7783 do {
7784 PrevOp1 = Op1;
7785 PrevOp2 = Op2;
7786 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
7787 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
7788 // Check if we have 2 resizing shuffles - need to peek through operands
7789 // again.
7790 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7791 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7792 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7793 for (auto [Idx, I] : enumerate(CombinedMask1)) {
7794 if (I == PoisonMaskElem)
7795 continue;
7796 ExtMask1[Idx] = SV1->getMaskValue(I);
7797 }
7798 SmallBitVector UseMask1 = buildUseMask(
7799 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7800 ->getNumElements(),
7801 ExtMask1, UseMask::SecondArg);
7802 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7803 for (auto [Idx, I] : enumerate(CombinedMask2)) {
7804 if (I == PoisonMaskElem)
7805 continue;
7806 ExtMask2[Idx] = SV2->getMaskValue(I);
7807 }
7808 SmallBitVector UseMask2 = buildUseMask(
7809 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7810 ->getNumElements(),
7811 ExtMask2, UseMask::SecondArg);
7812 if (SV1->getOperand(0)->getType() ==
7813 SV2->getOperand(0)->getType() &&
7814 SV1->getOperand(0)->getType() != SV1->getType() &&
7815 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
7816 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
7817 Op1 = SV1->getOperand(0);
7818 Op2 = SV2->getOperand(0);
7819 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7820 SV1->getShuffleMask().end());
7821 int LocalVF = ShuffleMask1.size();
7822 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
7823 LocalVF = FTy->getNumElements();
7824 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7825 CombinedMask1.swap(ShuffleMask1);
7826 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7827 SV2->getShuffleMask().end());
7828 LocalVF = ShuffleMask2.size();
7829 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
7830 LocalVF = FTy->getNumElements();
7831 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7832 CombinedMask2.swap(ShuffleMask2);
7833 }
7834 }
7835 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
7836 Builder.resizeToMatch(Op1, Op2);
7837 VF = std::max(cast<VectorType>(Op1->getType())
7838 ->getElementCount()
7839 .getKnownMinValue(),
7840 cast<VectorType>(Op2->getType())
7841 ->getElementCount()
7842 .getKnownMinValue());
7843 for (int I = 0, E = Mask.size(); I < E; ++I) {
7844 if (CombinedMask2[I] != PoisonMaskElem) {
7845 assert(CombinedMask1[I] == PoisonMaskElem &&
7846 "Expected undefined mask element");
7847 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
7848 }
7849 }
7850 if (Op1 == Op2 &&
7851 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
7852 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
7853 isa<ShuffleVectorInst>(Op1) &&
7854 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7855 ArrayRef(CombinedMask1))))
7856 return Builder.createIdentity(Op1);
7857 return Builder.createShuffleVector(
7858 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
7859 CombinedMask1);
7860 }
7861 if (isa<PoisonValue>(V1))
7862 return Builder.createPoison(
7863 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
7864 SmallVector<int> NewMask(Mask.begin(), Mask.end());
7865 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
7866 assert(V1 && "Expected non-null value after looking through shuffles.");
7867
7868 if (!IsIdentity)
7869 return Builder.createShuffleVector(V1, NewMask);
7870 return Builder.createIdentity(V1);
7871 }
7872};
7873} // namespace
7874
7875/// Returns the cost of the shuffle instructions with the given \p Kind, vector
7876/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
7877/// subvector pattern.
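/// When the two-source mask is really an insertion of a subvector that runs
/// past the end of the source vector, the shuffle is costed as an
/// SK_InsertSubvector into a wider vector of Mask.size() elements instead of
/// a generic SK_PermuteTwoSrc, which usually gives a more accurate estimate.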
7878static InstructionCost
7879getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
7880 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
7881 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
7882 int Index = 0, VectorType *SubTp = nullptr,
7883 ArrayRef<const Value *> Args = std::nullopt) {
7884 if (Kind != TTI::SK_PermuteTwoSrc)
7885 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7886 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7887 int NumSubElts;
7888 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
7889 Mask, NumSrcElts, NumSubElts, Index)) {
7890 if (Index + NumSubElts > NumSrcElts &&
7891 Index + NumSrcElts <= static_cast<int>(Mask.size()))
7892 return TTI.getShuffleCost(
7893 TTI::SK_InsertSubvector,
7894 FixedVectorType::get(Tp->getElementType(), Mask.size()), Mask,
7895 CostKind, Index, SubTp);
7896 }
7897 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7898}
7899
7900/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
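/// \returns a pair of {scalar GEP cost, vector GEP cost} for the pointers in
/// \p Ptrs with the common base pointer \p BasePtr.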
7901static std::pair<InstructionCost, InstructionCost>
7902getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
7903 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
7904 Type *ScalarTy, VectorType *VecTy) {
7905 InstructionCost ScalarCost = 0;
7906 InstructionCost VecCost = 0;
7907 // Here we differentiate two cases: (1) when Ptrs represent a regular
7908 // vectorization tree node (as they are pointer arguments of scattered
7909 // loads) or (2) when Ptrs are the arguments of loads or stores being
7910 // vectorized as a plain wide unit-stride load/store since all the
7911 // loads/stores are known to be from/to adjacent locations.
7912 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7913 // Case 2: estimate costs for pointer related costs when vectorizing to
7914 // a wide load/store.
7915 // Scalar cost is estimated as a set of pointers with known relationship
7916 // between them.
7917 // For vector code we will use BasePtr as argument for the wide load/store
7918 // but we also need to account all the instructions which are going to
7919 // stay in vectorized code due to uses outside of these scalar
7920 // loads/stores.
7921 ScalarCost = TTI.getPointersChainCost(
7922 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7923 CostKind);
7924
7925 SmallVector<const Value *> PtrsRetainedInVecCode;
7926 for (Value *V : Ptrs) {
7927 if (V == BasePtr) {
7928 PtrsRetainedInVecCode.push_back(V);
7929 continue;
7930 }
7931 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7932 // For simplicity, assume Ptr stays in vectorized code if it's not a
7933 // GEP instruction. We don't care, since its cost is considered free.
7934 // TODO: We should check for any uses outside of the vectorizable tree
7935 // rather than just a single use.
7936 if (!Ptr || !Ptr->hasOneUse())
7937 PtrsRetainedInVecCode.push_back(V);
7938 }
7939
7940 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
7941 // If all pointers stay in vectorized code then we don't have
7942 // any savings on that.
7943 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
7944 }
7945 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
7946 TTI::PointersChainInfo::getKnownStride(),
7947 VecTy, CostKind);
7948 } else {
7949 // Case 1: Ptrs are the arguments of loads that we are going to transform
7950 // into a masked gather load intrinsic.
7951 // All the scalar GEPs will be removed as a result of vectorization.
7952 // For any external uses of some lanes, extractelement instructions will
7953 // be generated (whose cost is estimated separately).
7954 TTI::PointersChainInfo PtrsInfo =
7955 all_of(Ptrs,
7956 [](const Value *V) {
7957 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7958 return Ptr && !Ptr->hasAllConstantIndices();
7959 })
7960 ? TTI::PointersChainInfo::getUnknownStride()
7961 : TTI::PointersChainInfo::getKnownStride();
7962
7963 ScalarCost =
7964 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
7965 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
7966 if (!BaseGEP) {
7967 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
7968 if (It != Ptrs.end())
7969 BaseGEP = cast<GEPOperator>(*It);
7970 }
7971 if (BaseGEP) {
7972 SmallVector<const Value *> Indices(BaseGEP->indices());
7973 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
7974 BaseGEP->getPointerOperand(), Indices, VecTy,
7975 CostKind);
7976 }
7977 }
7978
7979 return std::make_pair(ScalarCost, VecCost);
7980}
7981
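// Post-processes the vectorizable tree: for consecutive load/store nodes whose
// reorder indices describe a reversal, re-cost the node as a strided access
// (stride -1) and switch it to TreeEntry::StridedVectorize when the target
// reports that as cheaper than a wide access plus a reverse shuffle.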
7982void BoUpSLP::transformNodes() {
7983 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7984 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7985 TreeEntry &E = *TE.get();
7986 switch (E.getOpcode()) {
7987 case Instruction::Load: {
7988 // No need to reorder masked gather loads, just reorder the scalar
7989 // operands.
7990 if (E.State != TreeEntry::Vectorize)
7991 break;
7992 Type *ScalarTy = E.getMainOp()->getType();
7993 auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
7994 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
7995 // Check if profitable to represent consecutive load + reverse as strided
7996 // load with stride -1.
7997 if (isReverseOrder(E.ReorderIndices) &&
7998 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
7999 SmallVector<int> Mask;
8000 inversePermutation(E.ReorderIndices, Mask);
8001 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
8002 InstructionCost OriginalVecCost =
8003 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
8004 BaseLI->getPointerAddressSpace(), CostKind,
8005 TTI::OperandValueInfo()) +
8006 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8007 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8008 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
8009 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
8010 if (StridedCost < OriginalVecCost)
8011 // Strided load is more profitable than consecutive load + reverse -
8012 // transform the node to strided load.
8013 E.State = TreeEntry::StridedVectorize;
8014 }
8015 break;
8016 }
8017 case Instruction::Store: {
8018 Type *ScalarTy =
8019 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
8020 auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
8021 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
8022 // Check if profitable to represent consecutive store + reverse as strided
8023 // store with stride -1.
8024 if (isReverseOrder(E.ReorderIndices) &&
8025 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8026 SmallVector<int> Mask;
8027 inversePermutation(E.ReorderIndices, Mask);
8028 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
8029 InstructionCost OriginalVecCost =
8030 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
8031 BaseSI->getPointerAddressSpace(), CostKind,
8032 TTI::OperandValueInfo()) +
8033 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8034 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8035 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
8036 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
8037 if (StridedCost < OriginalVecCost)
8038 // Strided store is more profitable than a reverse shuffle + consecutive
8039 // store - transform the node to strided store.
8040 E.State = TreeEntry::StridedVectorize;
8041 }
8042 break;
8043 }
8044 default:
8045 break;
8046 }
8047 }
8048}
8049
8050/// Merges shuffle masks and emits final shuffle instruction, if required. It
8051/// supports shuffling of 2 input vectors. It implements lazy shuffle emission,
8052/// where the actual shuffle instruction is generated only if this is actually
8053/// required. Otherwise, the shuffle instruction emission is delayed till the
8054/// end of the process, to reduce the number of emitted instructions and further
8055/// analysis/transformations.
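/// Used with processBuildVector to accumulate the TTI cost of the shuffles
/// that would be emitted for a gather/buildvector node, without generating
/// any IR.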
8056class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
8057 bool IsFinalized = false;
8058 SmallVector<int> CommonMask;
8059 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
8060 Type *ScalarTy = nullptr;
8061 const TargetTransformInfo &TTI;
8062 InstructionCost Cost = 0;
8063 SmallDenseSet<Value *> VectorizedVals;
8064 BoUpSLP &R;
8065 SmallPtrSetImpl<Value *> &CheckedExtracts;
8066 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8067 /// While set, we are still trying to estimate the cost for the same nodes
8068 /// and can delay actual cost estimation (virtual shuffle instruction emission).
8069 /// May help better estimate the cost if the same nodes must be permuted, and
8070 /// allows moving most of the long shuffle cost estimation to TTI.
8071 bool SameNodesEstimated = true;
8072
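 /// Returns an all-ones constant of type \p Ty. For pointer (or vector of
 /// pointer) element types the value is modelled as an inttoptr of an
 /// all-ones integer of the pointer's store size, since
 /// Constant::getAllOnesValue cannot be used directly on pointer types.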
8073 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
8074 if (Ty->getScalarType()->isPointerTy()) {
8075 Constant *Res = ConstantExpr::getIntToPtr(
8076 Constant::getAllOnesValue(IntegerType::get(
8077 Ty->getContext(),
8078 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
8079 Ty->getScalarType());
8080 if (auto *VTy = dyn_cast<VectorType>(Ty))
8081 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
8082 return Res;
8083 }
8084 return Constant::getAllOnesValue(Ty);
8085 }
8086
8087 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
8088 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
8089 return TTI::TCC_Free;
8090 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
8091 InstructionCost GatherCost = 0;
8092 SmallVector<Value *> Gathers(VL.begin(), VL.end());
8093 // Improve gather cost for gather of loads, if we can group some of the
8094 // loads into vector loads.
8095 InstructionsState S = getSameOpcode(VL, *R.TLI);
8096 const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
8097 unsigned MinVF = R.getMinVF(2 * Sz);
8098 if (VL.size() > 2 &&
8099 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8100 (InVectors.empty() &&
8101 any_of(seq<unsigned>(0, VL.size() / MinVF),
8102 [&](unsigned Idx) {
8103 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
8104 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
8105 return S.getOpcode() == Instruction::Load &&
8106 !S.isAltShuffle();
8107 }))) &&
8108 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
8109 !isSplat(Gathers)) {
8110 InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
8111 SetVector<Value *> VectorizedLoads;
8112 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
8113 SmallVector<unsigned> ScatterVectorized;
8114 unsigned StartIdx = 0;
8115 unsigned VF = VL.size() / 2;
8116 for (; VF >= MinVF; VF /= 2) {
8117 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
8118 Cnt += VF) {
8119 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
8120 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8121 InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
8122 if (SliceS.getOpcode() != Instruction::Load ||
8123 SliceS.isAltShuffle())
8124 continue;
8125 }
8126 if (!VectorizedLoads.count(Slice.front()) &&
8127 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
8128 SmallVector<Value *> PointerOps;
8129 OrdersType CurrentOrder;
8130 LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
8131 CurrentOrder, PointerOps);
8132 switch (LS) {
8133 case LoadsState::Vectorize:
8134 case LoadsState::ScatterVectorize:
8135 case LoadsState::StridedVectorize:
8136 // Mark the vectorized loads so that we don't vectorize them
8137 // again.
8138 // TODO: better handling of loads with reorders.
8139 if (((LS == LoadsState::Vectorize ||
8140 LS == LoadsState::StridedVectorize) &&
8141 CurrentOrder.empty()) ||
8142 (LS == LoadsState::StridedVectorize &&
8143 isReverseOrder(CurrentOrder)))
8144 VectorizedStarts.emplace_back(Cnt, LS);
8145 else
8146 ScatterVectorized.push_back(Cnt);
8147 VectorizedLoads.insert(Slice.begin(), Slice.end());
8148 // If we vectorized initial block, no need to try to vectorize
8149 // it again.
8150 if (Cnt == StartIdx)
8151 StartIdx += VF;
8152 break;
8153 case LoadsState::Gather:
8154 break;
8155 }
8156 }
8157 }
8158 // Check if the whole array was vectorized already - exit.
8159 if (StartIdx >= VL.size())
8160 break;
8161 // Found vectorizable parts - exit.
8162 if (!VectorizedLoads.empty())
8163 break;
8164 }
8165 if (!VectorizedLoads.empty()) {
8166 unsigned NumParts = TTI.getNumberOfParts(VecTy);
8167 bool NeedInsertSubvectorAnalysis =
8168 !NumParts || (VL.size() / VF) > NumParts;
8169 // Get the cost for gathered loads.
8170 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
8171 if (VectorizedLoads.contains(VL[I]))
8172 continue;
8173 GatherCost +=
8174 getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
8175 }
8176 // Exclude potentially vectorized loads from list of gathered
8177 // scalars.
8178 Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
8179 // The cost for vectorized loads.
8180 InstructionCost ScalarsCost = 0;
8181 for (Value *V : VectorizedLoads) {
8182 auto *LI = cast<LoadInst>(V);
8183 ScalarsCost +=
8184 TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
8185 LI->getAlign(), LI->getPointerAddressSpace(),
8186 CostKind, TTI::OperandValueInfo(), LI);
8187 }
8188 auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
8189 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
8190 auto *LI = cast<LoadInst>(VL[P.first]);
8191 Align Alignment = LI->getAlign();
8192 GatherCost +=
8193 P.second == LoadsState::Vectorize
8194 ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
8195 LI->getPointerAddressSpace(), CostKind,
8196 TTI::OperandValueInfo(), LI)
8197 : TTI.getStridedMemoryOpCost(
8198 Instruction::Load, LoadTy, LI->getPointerOperand(),
8199 /*VariableMask=*/false, Alignment, CostKind, LI);
8200 // Estimate GEP cost.
8201 SmallVector<Value *> PointerOps(VF);
8202 for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
8203 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8204 auto [ScalarGEPCost, VectorGEPCost] =
8205 getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
8206 Instruction::Load, CostKind, LI->getType(), LoadTy);
8207 GatherCost += VectorGEPCost - ScalarGEPCost;
8208 }
8209 for (unsigned P : ScatterVectorized) {
8210 auto *LI0 = cast<LoadInst>(VL[P]);
8211 ArrayRef<Value *> Slice = VL.slice(P, VF);
8212 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8213 GatherCost += TTI.getGatherScatterOpCost(
8214 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8215 /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
8216 // Estimate GEP cost.
8217 SmallVector<Value *> PointerOps(VF);
8218 for (auto [I, V] : enumerate(Slice))
8219 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8220 OrdersType Order;
8221 if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
8222 Order)) {
8223 // TODO: improve checks if GEPs can be vectorized.
8224 Value *Ptr0 = PointerOps.front();
8225 Type *ScalarTy = Ptr0->getType();
8226 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
8227 auto [ScalarGEPCost, VectorGEPCost] =
8228 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
8229 CostKind, ScalarTy, VecTy);
8230 GatherCost += VectorGEPCost - ScalarGEPCost;
8231 if (!Order.empty()) {
8232 SmallVector<int> Mask;
8233 inversePermutation(Order, Mask);
8234 GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8235 VecTy, Mask, CostKind);
8236 }
8237 } else {
8238 GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
8239 PointerOps.front()->getType());
8240 }
8241 }
8242 if (NeedInsertSubvectorAnalysis) {
8243 // Add the cost for the subvectors insert.
8244 SmallVector<int> ShuffleMask(VL.size());
8245 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8246 for (unsigned Idx : seq<unsigned>(0, E))
8247 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8248 GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
8249 ShuffleMask, CostKind, I, LoadTy);
8250 }
8251 }
8252 GatherCost -= ScalarsCost;
8253 }
8254 GatherCost = std::min(BaseCost, GatherCost);
8255 } else if (!Root && isSplat(VL)) {
8256 // Found the broadcasting of the single scalar, calculate the cost as
8257 // the broadcast.
8258 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8259 assert(It != VL.end() && "Expected at least one non-undef value.");
8260 // Add broadcast for non-identity shuffle only.
8261 bool NeedShuffle =
8262 count(VL, *It) > 1 &&
8263 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
8264 if (!NeedShuffle)
8265 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
8266 CostKind, std::distance(VL.begin(), It),
8267 PoisonValue::get(VecTy), *It);
8268
8269 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8270 transform(VL, ShuffleMask.begin(), [](Value *V) {
8271 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8272 });
8273 InstructionCost InsertCost =
8274 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
8275 PoisonValue::get(VecTy), *It);
8276 return InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast,
8277 VecTy, ShuffleMask, CostKind,
8278 /*Index=*/0, /*SubTp=*/nullptr,
8279 /*Args=*/*It);
8280 }
8281 return GatherCost +
8282 (all_of(Gathers, IsaPred<UndefValue>)
8283 ? TTI::TCC_Free
8284 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
8285 ScalarTy));
8286 };
8287
8288 /// Compute the cost of creating a vector containing the extracted values from
8289 /// \p VL.
8290 InstructionCost
8291 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8292 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8293 unsigned NumParts) {
8294 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8295 unsigned NumElts =
8296 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8297 auto *EE = dyn_cast<ExtractElementInst>(V);
8298 if (!EE)
8299 return Sz;
8300 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8301 if (!VecTy)
8302 return Sz;
8303 return std::max(Sz, VecTy->getNumElements());
8304 });
8305 // FIXME: this must be moved to TTI for better estimation.
8306 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
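 // Given that the source vectors are split across NumParts registers of
 // EltsPerVector elements each, the lambda below determines which registers
 // the (sub-)mask reads from. It returns the per-register shuffle kind
 // (single- or two-source), or std::nullopt if the whole source fits into one
 // register or more than two registers are referenced; on success it rewrites
 // the mask to be register-relative and records the register base offsets in
 // Indices.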
8307 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
8308 SmallVectorImpl<unsigned> &Indices)
8309 -> std::optional<TTI::ShuffleKind> {
8310 if (NumElts <= EltsPerVector)
8311 return std::nullopt;
8312 int OffsetReg0 =
8313 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
8314 [](int S, int I) {
8315 if (I == PoisonMaskElem)
8316 return S;
8317 return std::min(S, I);
8318 }),
8319 EltsPerVector);
8320 int OffsetReg1 = OffsetReg0;
8321 DenseSet<int> RegIndices;
8322 // Check if we are trying to permute the same single or two input vectors.
8323 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8324 int FirstRegId = -1;
8325 Indices.assign(1, OffsetReg0);
8326 for (auto [Pos, I] : enumerate(Mask)) {
8327 if (I == PoisonMaskElem)
8328 continue;
8329 int Idx = I - OffsetReg0;
8330 int RegId =
8331 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
8332 if (FirstRegId < 0)
8333 FirstRegId = RegId;
8334 RegIndices.insert(RegId);
8335 if (RegIndices.size() > 2)
8336 return std::nullopt;
8337 if (RegIndices.size() == 2) {
8338 ShuffleKind = TTI::SK_PermuteTwoSrc;
8339 if (Indices.size() == 1) {
8340 OffsetReg1 = alignDown(
8341 std::accumulate(
8342 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
8343 [&](int S, int I) {
8344 if (I == PoisonMaskElem)
8345 return S;
8346 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
8347 ((I - OffsetReg0) % NumElts) / EltsPerVector;
8348 if (RegId == FirstRegId)
8349 return S;
8350 return std::min(S, I);
8351 }),
8352 EltsPerVector);
8353 Indices.push_back(OffsetReg1 % NumElts);
8354 }
8355 Idx = I - OffsetReg1;
8356 }
8357 I = (Idx % NumElts) % EltsPerVector +
8358 (RegId == FirstRegId ? 0 : EltsPerVector);
8359 }
8360 return ShuffleKind;
8361 };
8362 InstructionCost Cost = 0;
8363
8364 // Process extracts in blocks of EltsPerVector to check if the source vector
8365 // operand can be re-used directly. If not, add the cost of creating a
8366 // shuffle to extract the values into a vector register.
8367 for (unsigned Part : seq<unsigned>(NumParts)) {
8368 if (!ShuffleKinds[Part])
8369 continue;
8370 ArrayRef<int> MaskSlice = Mask.slice(
8371 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
8372 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8373 copy(MaskSlice, SubMask.begin());
8374 SmallVector<unsigned, 2> Indices;
8375 std::optional<TTI::ShuffleKind> RegShuffleKind =
8376 CheckPerRegistersShuffle(SubMask, Indices);
8377 if (!RegShuffleKind) {
8378 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
8379 !ShuffleVectorInst::isIdentityMask(
8380 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
8381 Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
8382 FixedVectorType::get(ScalarTy, NumElts),
8383 MaskSlice);
8384 continue;
8385 }
8386 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8387 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
8388 Cost += ::getShuffleCost(TTI, *RegShuffleKind,
8389 FixedVectorType::get(ScalarTy, EltsPerVector),
8390 SubMask);
8391 }
8392 for (unsigned Idx : Indices) {
8393 assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
8394 "SK_ExtractSubvector index out of range");
8395 Cost += ::getShuffleCost(
8396 TTI, TTI::SK_ExtractSubvector,
8397 FixedVectorType::get(ScalarTy, alignTo(NumElts, EltsPerVector)),
8398 std::nullopt, CostKind, Idx,
8399 FixedVectorType::get(ScalarTy, EltsPerVector));
8400 }
8401 // Second attempt: check if just a permute is estimated to be cheaper than
8402 // the subvector extracts.
8403 SubMask.assign(NumElts, PoisonMaskElem);
8404 copy(MaskSlice, SubMask.begin());
8405 InstructionCost OriginalCost =
8406 ::getShuffleCost(TTI, *ShuffleKinds[Part],
8407 FixedVectorType::get(ScalarTy, NumElts), SubMask);
8408 if (OriginalCost < Cost)
8409 Cost = OriginalCost;
8410 }
8411 return Cost;
8412 }
8413 /// Transforms mask \p CommonMask per given \p Mask to produce a proper mask
8414 /// after the shuffle emission.
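 /// E.g. transformMaskAfterShuffle(M, M) with M = <3, poison, 1, poison>
 /// rewrites M to <0, poison, 2, poison>: lanes produced by the just-emitted
 /// shuffle are referenced by their own position in its result, while poison
 /// lanes are left untouched.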
8415 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8416 ArrayRef<int> Mask) {
8417 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8418 if (Mask[Idx] != PoisonMaskElem)
8419 CommonMask[Idx] = Idx;
8420 }
8421 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
8422 /// mask \p Mask, register number \p Part, that includes \p SliceSize
8423 /// elements.
8424 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8425 ArrayRef<int> Mask, unsigned Part,
8426 unsigned SliceSize) {
8427 if (SameNodesEstimated) {
8428 // Delay the cost estimation if the same nodes are being reshuffled.
8429 // If we already requested the cost of reshuffling of E1 and E2 before, no
8430 // need to estimate another cost with the sub-Mask, instead include this
8431 // sub-Mask into the CommonMask to estimate it later and avoid double cost
8432 // estimation.
8433 if ((InVectors.size() == 2 &&
8434 InVectors.front().get<const TreeEntry *>() == &E1 &&
8435 InVectors.back().get<const TreeEntry *>() == E2) ||
8436 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8437 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
8438 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
8439 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8440 "Expected all poisoned elements.");
8441 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
8442 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8443 return;
8444 }
8445 // Found non-matching nodes - need to estimate the cost for the matched
8446 // and transform mask.
8447 Cost += createShuffle(InVectors.front(),
8448 InVectors.size() == 1 ? nullptr : InVectors.back(),
8449 CommonMask);
8450 transformMaskAfterShuffle(CommonMask, CommonMask);
8451 }
8452 SameNodesEstimated = false;
8453 if (!E2 && InVectors.size() == 1) {
8454 unsigned VF = E1.getVectorFactor();
8455 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8456 VF = std::max(VF,
8457 cast<FixedVectorType>(V1->getType())->getNumElements());
8458 } else {
8459 const auto *E = InVectors.front().get<const TreeEntry *>();
8460 VF = std::max(VF, E->getVectorFactor());
8461 }
8462 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8463 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8464 CommonMask[Idx] = Mask[Idx] + VF;
8465 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8466 transformMaskAfterShuffle(CommonMask, CommonMask);
8467 } else {
8468 Cost += createShuffle(&E1, E2, Mask);
8469 transformMaskAfterShuffle(CommonMask, Mask);
8470 }
8471 }
8472
8473 class ShuffleCostBuilder {
8474 const TargetTransformInfo &TTI;
8475
8476 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8477 int Index = -1;
8478 return Mask.empty() ||
8479 (VF == Mask.size() &&
8482 Index == 0);
8483 }
8484
8485 public:
8486 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8487 ~ShuffleCostBuilder() = default;
8488 InstructionCost createShuffleVector(Value *V1, Value *,
8489 ArrayRef<int> Mask) const {
8490 // Empty mask or identity mask are free.
8491 unsigned VF =
8492 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8493 if (isEmptyOrIdentity(Mask, VF))
8494 return TTI::TCC_Free;
8495 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8496 cast<VectorType>(V1->getType()), Mask);
8497 }
8498 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8499 // Empty mask or identity mask are free.
8500 unsigned VF =
8501 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8502 if (isEmptyOrIdentity(Mask, VF))
8503 return TTI::TCC_Free;
8504 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8505 cast<VectorType>(V1->getType()), Mask);
8506 }
8507 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8508 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8509 return TTI::TCC_Free;
8510 }
8511 void resizeToMatch(Value *&, Value *&) const {}
8512 };
8513
8514 /// Smart shuffle instruction emission, walks through shuffles trees and
8515 /// tries to find the best matching vector for the actual shuffle
8516 /// instruction.
8517 InstructionCost
8518 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8519 const PointerUnion<Value *, const TreeEntry *> &P2,
8520 ArrayRef<int> Mask) {
8521 ShuffleCostBuilder Builder(TTI);
8522 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8523 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8524 unsigned CommonVF = Mask.size();
8525 InstructionCost ExtraCost = 0;
8526 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
8527 unsigned VF) -> InstructionCost {
8528 if (E.State == TreeEntry::NeedToGather && allConstant(E.Scalars))
8529 return TTI::TCC_Free;
8530 Type *EScalarTy = E.Scalars.front()->getType();
8531 bool IsSigned = true;
8532 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8533 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
8534 IsSigned = It->second.second;
8535 }
8536 if (EScalarTy != ScalarTy) {
8537 unsigned CastOpcode = Instruction::Trunc;
8538 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8539 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8540 if (DstSz > SrcSz)
8541 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8542 return TTI.getCastInstrCost(CastOpcode,
8543 FixedVectorType::get(ScalarTy, VF),
8544 FixedVectorType::get(EScalarTy, VF),
8545 TTI::CastContextHint::None, CostKind);
8546 }
8547 return TTI::TCC_Free;
8548 };
8549 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
8550 if (isa<Constant>(V))
8551 return TTI::TCC_Free;
8552 auto *VecTy = cast<VectorType>(V->getType());
8553 Type *EScalarTy = VecTy->getElementType();
8554 if (EScalarTy != ScalarTy) {
8555 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
8556 unsigned CastOpcode = Instruction::Trunc;
8557 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8558 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8559 if (DstSz > SrcSz)
8560 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8561 return TTI.getCastInstrCost(
8562 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
8563 VecTy, TTI::CastContextHint::None, CostKind);
8564 }
8565 return TTI::TCC_Free;
8566 };
8567 if (!V1 && !V2 && !P2.isNull()) {
8568 // Shuffle 2 entry nodes.
8569 const TreeEntry *E = P1.get<const TreeEntry *>();
8570 unsigned VF = E->getVectorFactor();
8571 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8572 CommonVF = std::max(VF, E2->getVectorFactor());
8573 assert(all_of(Mask,
8574 [=](int Idx) {
8575 return Idx < 2 * static_cast<int>(CommonVF);
8576 }) &&
8577 "All elements in mask must be less than 2 * CommonVF.");
8578 if (E->Scalars.size() == E2->Scalars.size()) {
8579 SmallVector<int> EMask = E->getCommonMask();
8580 SmallVector<int> E2Mask = E2->getCommonMask();
8581 if (!EMask.empty() || !E2Mask.empty()) {
8582 for (int &Idx : CommonMask) {
8583 if (Idx == PoisonMaskElem)
8584 continue;
8585 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8586 Idx = EMask[Idx];
8587 else if (Idx >= static_cast<int>(CommonVF))
8588 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8589 E->Scalars.size();
8590 }
8591 }
8592 CommonVF = E->Scalars.size();
8593 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8594 GetNodeMinBWAffectedCost(*E2, CommonVF);
8595 } else {
8596 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8597 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8598 }
8599 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8600 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8601 } else if (!V1 && P2.isNull()) {
8602 // Shuffle single entry node.
8603 const TreeEntry *E = P1.get<const TreeEntry *>();
8604 unsigned VF = E->getVectorFactor();
8605 CommonVF = VF;
8606 assert(
8607 all_of(Mask,
8608 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8609 "All elements in mask must be less than CommonVF.");
8610 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8611 SmallVector<int> EMask = E->getCommonMask();
8612 assert(!EMask.empty() && "Expected non-empty common mask.");
8613 for (int &Idx : CommonMask) {
8614 if (Idx != PoisonMaskElem)
8615 Idx = EMask[Idx];
8616 }
8617 CommonVF = E->Scalars.size();
8618 }
8619 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8620 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8621 // Not identity/broadcast? Try to see if the original vector is better.
8622 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8623 CommonVF == CommonMask.size() &&
8624 any_of(enumerate(CommonMask),
8625 [](const auto &&P) {
8626 return P.value() != PoisonMaskElem &&
8627 static_cast<unsigned>(P.value()) != P.index();
8628 }) &&
8629 any_of(CommonMask,
8630 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8631 SmallVector<int> ReorderMask;
8632 inversePermutation(E->ReorderIndices, ReorderMask);
8633 ::addMask(CommonMask, ReorderMask);
8634 }
8635 } else if (V1 && P2.isNull()) {
8636 // Shuffle single vector.
8637 ExtraCost += GetValueMinBWAffectedCost(V1);
8638 CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
8639 assert(
8640 all_of(Mask,
8641 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8642 "All elements in mask must be less than CommonVF.");
8643 } else if (V1 && !V2) {
8644 // Shuffle vector and tree node.
8645 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8646 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8647 CommonVF = std::max(VF, E2->getVectorFactor());
8648 assert(all_of(Mask,
8649 [=](int Idx) {
8650 return Idx < 2 * static_cast<int>(CommonVF);
8651 }) &&
8652 "All elements in mask must be less than 2 * CommonVF.");
8653 if (E2->Scalars.size() == VF && VF != CommonVF) {
8654 SmallVector<int> E2Mask = E2->getCommonMask();
8655 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8656 for (int &Idx : CommonMask) {
8657 if (Idx == PoisonMaskElem)
8658 continue;
8659 if (Idx >= static_cast<int>(CommonVF))
8660 Idx = E2Mask[Idx - CommonVF] + VF;
8661 }
8662 CommonVF = VF;
8663 }
8664 ExtraCost += GetValueMinBWAffectedCost(V1);
8665 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8666 ExtraCost += GetNodeMinBWAffectedCost(
8667 *E2, std::min(CommonVF, E2->getVectorFactor()));
8668 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8669 } else if (!V1 && V2) {
8670 // Shuffle vector and tree node.
8671 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8672 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8673 CommonVF = std::max(VF, E1->getVectorFactor());
8674 assert(all_of(Mask,
8675 [=](int Idx) {
8676 return Idx < 2 * static_cast<int>(CommonVF);
8677 }) &&
8678 "All elements in mask must be less than 2 * CommonVF.");
8679 if (E1->Scalars.size() == VF && VF != CommonVF) {
8680 SmallVector<int> E1Mask = E1->getCommonMask();
8681 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8682 for (int &Idx : CommonMask) {
8683 if (Idx == PoisonMaskElem)
8684 continue;
8685 if (Idx >= static_cast<int>(CommonVF))
8686 Idx = E1Mask[Idx - CommonVF] + VF;
8687 else
8688 Idx = E1Mask[Idx];
8689 }
8690 CommonVF = VF;
8691 }
8692 ExtraCost += GetNodeMinBWAffectedCost(
8693 *E1, std::min(CommonVF, E1->getVectorFactor()));
8694 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8695 ExtraCost += GetValueMinBWAffectedCost(V2);
8696 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8697 } else {
8698 assert(V1 && V2 && "Expected both vectors.");
8699 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8700 CommonVF =
8701 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8702 assert(all_of(Mask,
8703 [=](int Idx) {
8704 return Idx < 2 * static_cast<int>(CommonVF);
8705 }) &&
8706 "All elements in mask must be less than 2 * CommonVF.");
8707 ExtraCost +=
8708 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8709 if (V1->getType() != V2->getType()) {
8710 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8711 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8712 } else {
8713 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
8714 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8715 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
8716 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8717 }
8718 }
8719 InVectors.front() = Constant::getNullValue(
8720 FixedVectorType::get(ScalarTy, CommonMask.size()));
8721 if (InVectors.size() == 2)
8722 InVectors.pop_back();
8723 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8724 V1, V2, CommonMask, Builder);
8725 }
8726
8727public:
8728 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
8729 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8730 SmallPtrSetImpl<Value *> &CheckedExtracts)
8731 : ScalarTy(ScalarTy), TTI(TTI),
8732 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8733 CheckedExtracts(CheckedExtracts) {}
8734 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8735 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8736 unsigned NumParts, bool &UseVecBaseAsInput) {
8737 UseVecBaseAsInput = false;
8738 if (Mask.empty())
8739 return nullptr;
8740 Value *VecBase = nullptr;
8741 ArrayRef<Value *> VL = E->Scalars;
8742 // If the resulting type is scalarized, do not adjust the cost.
8743 if (NumParts == VL.size())
8744 return nullptr;
8745 // Check if it can be considered reused if the same extractelements were
8746 // already vectorized.
8747 bool PrevNodeFound = any_of(
8748 ArrayRef(R.VectorizableTree).take_front(E->Idx),
8749 [&](const std::unique_ptr<TreeEntry> &TE) {
8750 return ((!TE->isAltShuffle() &&
8751 TE->getOpcode() == Instruction::ExtractElement) ||
8752 TE->State == TreeEntry::NeedToGather) &&
8753 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8754 return VL.size() > Data.index() &&
8755 (Mask[Data.index()] == PoisonMaskElem ||
8756 isa<UndefValue>(VL[Data.index()]) ||
8757 Data.value() == VL[Data.index()]);
8758 });
8759 });
8760 SmallPtrSet<Value *, 4> UniqueBases;
8761 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
8762 for (unsigned Part : seq<unsigned>(NumParts)) {
8763 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
8764 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
8765 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
8766 // Ignore non-extractelement scalars.
8767 if (isa<UndefValue>(V) ||
8768 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8769 continue;
8770 // If all users of instruction are going to be vectorized and this
8771 // instruction itself is not going to be vectorized, consider this
8772 // instruction as dead and remove its cost from the final cost of the
8773 // vectorized tree.
8774 // Also, avoid adjusting the cost for extractelements with multiple uses
8775 // in different graph entries.
8776 auto *EE = cast<ExtractElementInst>(V);
8777 VecBase = EE->getVectorOperand();
8778 UniqueBases.insert(VecBase);
8779 const TreeEntry *VE = R.getTreeEntry(V);
8780 if (!CheckedExtracts.insert(V).second ||
8781 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8782 any_of(EE->users(),
8783 [&](User *U) {
8784 return isa<GetElementPtrInst>(U) &&
8785 !R.areAllUsersVectorized(cast<Instruction>(U),
8786 &VectorizedVals);
8787 }) ||
8788 (VE && VE != E))
8789 continue;
8790 std::optional<unsigned> EEIdx = getExtractIndex(EE);
8791 if (!EEIdx)
8792 continue;
8793 unsigned Idx = *EEIdx;
8794 // Take credit for instruction that will become dead.
8795 if (EE->hasOneUse() || !PrevNodeFound) {
8796 Instruction *Ext = EE->user_back();
8797 if (isa<SExtInst, ZExtInst>(Ext) &&
8798 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8799 // Use getExtractWithExtendCost() to calculate the cost of
8800 // extractelement/ext pair.
8801 Cost -=
8802 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
8803 EE->getVectorOperandType(), Idx);
8804 // Add back the cost of s|zext which is subtracted separately.
8805 Cost += TTI.getCastInstrCost(
8806 Ext->getOpcode(), Ext->getType(), EE->getType(),
8807 TTI::getCastContextHint(Ext), CostKind, Ext);
8808 continue;
8809 }
8810 }
8811 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
8812 CostKind, Idx);
8813 }
8814 }
8815 // Check that the gather of extractelements can be represented as just a
8816 // shuffle of a single vector or of two vectors the scalars are extracted
8817 // from. We found a bunch of extractelement instructions that must be
8818 // gathered into a vector and can be represented as a permutation of
8819 // elements from a single input vector or from 2 input vectors.
8820 // Also done for the reused case, if the same extractelements were vectorized already.
8821 if (!PrevNodeFound)
8822 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8823 InVectors.assign(1, E);
8824 CommonMask.assign(Mask.begin(), Mask.end());
8825 transformMaskAfterShuffle(CommonMask, CommonMask);
8826 SameNodesEstimated = false;
8827 if (NumParts != 1 && UniqueBases.size() != 1) {
8828 UseVecBaseAsInput = true;
8829 VecBase = Constant::getNullValue(
8830 FixedVectorType::get(ScalarTy, CommonMask.size()));
8831 }
8832 return VecBase;
8833 }
8834 /// Checks if the specified entry \p E needs to be delayed because of its
8835 /// dependency nodes.
8836 std::optional<InstructionCost>
8837 needToDelay(const TreeEntry *,
8838 ArrayRef<SmallVector<const TreeEntry *>>) const {
8839 // No need to delay the cost estimation during analysis.
8840 return std::nullopt;
8841 }
8842 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
8843 if (&E1 == &E2) {
8844 assert(all_of(Mask,
8845 [&](int Idx) {
8846 return Idx < static_cast<int>(E1.getVectorFactor());
8847 }) &&
8848 "Expected single vector shuffle mask.");
8849 add(E1, Mask);
8850 return;
8851 }
8852 if (InVectors.empty()) {
8853 CommonMask.assign(Mask.begin(), Mask.end());
8854 InVectors.assign({&E1, &E2});
8855 return;
8856 }
8857 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8858 auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
8859 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8860 if (NumParts == 0 || NumParts >= Mask.size())
8861 NumParts = 1;
8862 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
8863 const auto *It =
8864 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8865 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8866 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8867 }
8868 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
8869 if (InVectors.empty()) {
8870 CommonMask.assign(Mask.begin(), Mask.end());
8871 InVectors.assign(1, &E1);
8872 return;
8873 }
8874 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8875 auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
8876 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8877 if (NumParts == 0 || NumParts >= Mask.size())
8878 NumParts = 1;
8879 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
8880 const auto *It =
8881 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8882 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8883 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
8884 if (!SameNodesEstimated && InVectors.size() == 1)
8885 InVectors.emplace_back(&E1);
8886 }
8887 /// Adds 2 input vectors and the mask for their shuffling.
8888 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
8889 // May come only for shuffling of 2 vectors with extractelements, already
8890 // handled in adjustExtracts.
8891 assert(InVectors.size() == 1 &&
8892 all_of(enumerate(CommonMask),
8893 [&](auto P) {
8894 if (P.value() == PoisonMaskElem)
8895 return Mask[P.index()] == PoisonMaskElem;
8896 auto *EI =
8897 cast<ExtractElementInst>(InVectors.front()
8898 .get<const TreeEntry *>()
8899 ->Scalars[P.index()]);
8900 return EI->getVectorOperand() == V1 ||
8901 EI->getVectorOperand() == V2;
8902 }) &&
8903 "Expected extractelement vectors.");
8904 }
8905 /// Adds one more input vector and the mask for the shuffling.
8906 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
8907 if (InVectors.empty()) {
8908 assert(CommonMask.empty() && !ForExtracts &&
8909 "Expected empty input mask/vectors.");
8910 CommonMask.assign(Mask.begin(), Mask.end());
8911 InVectors.assign(1, V1);
8912 return;
8913 }
8914 if (ForExtracts) {
8915 // No need to add vectors here, already handled them in adjustExtracts.
8916 assert(InVectors.size() == 1 &&
8917 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
8918 all_of(enumerate(CommonMask),
8919 [&](auto P) {
8920 Value *Scalar = InVectors.front()
8921 .get<const TreeEntry *>()
8922 ->Scalars[P.index()];
8923 if (P.value() == PoisonMaskElem)
8924 return P.value() == Mask[P.index()] ||
8925 isa<UndefValue>(Scalar);
8926 if (isa<Constant>(V1))
8927 return true;
8928 auto *EI = cast<ExtractElementInst>(Scalar);
8929 return EI->getVectorOperand() == V1;
8930 }) &&
8931 "Expected only tree entry for extractelement vectors.");
8932 return;
8933 }
8934 assert(!InVectors.empty() && !CommonMask.empty() &&
8935 "Expected only tree entries from extracts/reused buildvectors.");
8936 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8937 if (InVectors.size() == 2) {
8938 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
8939 transformMaskAfterShuffle(CommonMask, CommonMask);
8940 VF = std::max<unsigned>(VF, CommonMask.size());
8941 } else if (const auto *InTE =
8942 InVectors.front().dyn_cast<const TreeEntry *>()) {
8943 VF = std::max(VF, InTE->getVectorFactor());
8944 } else {
8945 VF = std::max(
8946 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
8947 ->getNumElements());
8948 }
8949 InVectors.push_back(V1);
8950 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8951 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8952 CommonMask[Idx] = Mask[Idx] + VF;
8953 }
8954 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
8955 Value *Root = nullptr) {
8956 Cost += getBuildVectorCost(VL, Root);
8957 if (!Root) {
8958 // FIXME: Need to find a way to avoid use of getNullValue here.
8959 SmallVector<Constant *> Vals;
8960 unsigned VF = VL.size();
8961 if (MaskVF != 0)
8962 VF = std::min(VF, MaskVF);
8963 for (Value *V : VL.take_front(VF)) {
8964 if (isa<UndefValue>(V)) {
8965 Vals.push_back(cast<Constant>(V));
8966 continue;
8967 }
8968 Vals.push_back(Constant::getNullValue(V->getType()));
8969 }
8970 return ConstantVector::get(Vals);
8971 }
8972 return ConstantVector::getSplat(
8973 ElementCount::getFixed(
8974 cast<FixedVectorType>(Root->getType())->getNumElements()),
8975 getAllOnesValue(*R.DL, ScalarTy));
8976 }
8978 /// Finalize emission of the shuffles.
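 /// \param ExtMask an extra mask applied on top of the accumulated common mask.
 /// \param VF the expected final vector length (required when \p Action is set).
 /// \param Action optional callback invoked on the front input vector and the
 /// common mask before the final shuffle cost is added.
 /// \returns the total estimated cost of all (virtually) emitted shuffles.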
8979 InstructionCost
8980 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
8981 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
8982 IsFinalized = true;
8983 if (Action) {
8984 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
8985 if (InVectors.size() == 2)
8986 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
8987 else
8988 Cost += createShuffle(Vec, nullptr, CommonMask);
8989 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8990 if (CommonMask[Idx] != PoisonMaskElem)
8991 CommonMask[Idx] = Idx;
8992 assert(VF > 0 &&
8993 "Expected vector length for the final value before action.");
8994 Value *V = Vec.get<Value *>();
8995 Action(V, CommonMask);
8996 InVectors.front() = V;
8997 }
8998 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
8999 if (CommonMask.empty()) {
9000 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
9001 return Cost;
9002 }
9003 return Cost +
9004 createShuffle(InVectors.front(),
9005 InVectors.size() == 2 ? InVectors.back() : nullptr,
9006 CommonMask);
9007 }
9008
9009 ~ShuffleCostEstimator() {
9010 assert((IsFinalized || CommonMask.empty()) &&
9011 "Shuffle construction must be finalized.");
9012 }
9013};
9014
9015const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
9016 unsigned Idx) const {
9017 Value *Op = E->getOperand(Idx).front();
9018 if (const TreeEntry *TE = getTreeEntry(Op)) {
9019 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9020 return EI.EdgeIdx == Idx && EI.UserTE == E;
9021 }) != TE->UserTreeIndices.end())
9022 return TE;
9023 auto MIt = MultiNodeScalars.find(Op);
9024 if (MIt != MultiNodeScalars.end()) {
9025 for (const TreeEntry *TE : MIt->second) {
9026 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9027 return EI.EdgeIdx == Idx && EI.UserTE == E;
9028 }) != TE->UserTreeIndices.end())
9029 return TE;
9030 }
9031 }
9032 }
9033 const auto *It =
9034 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9035 return TE->State == TreeEntry::NeedToGather &&
9036 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9037 return EI.EdgeIdx == Idx && EI.UserTE == E;
9038 }) != TE->UserTreeIndices.end();
9039 });
9040 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
9041 return It->get();
9042}
9043
9044TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
9045 if (TE.State == TreeEntry::ScatterVectorize ||
9046 TE.State == TreeEntry::StridedVectorize)
9047 return TTI::CastContextHint::GatherScatter;
9048 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
9049 !TE.isAltShuffle()) {
9050 if (TE.ReorderIndices.empty())
9051 return TTI::CastContextHint::Normal;
9052 SmallVector<int> Mask;
9053 inversePermutation(TE.ReorderIndices, Mask);
9054 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
9055 return TTI::CastContextHint::Reversed;
9056 }
9057 return TTI::CastContextHint::None;
9058}
9059
9060/// Builds the argument types vector for the given call instruction with the
9061/// given \p ID for the specified vector factor.
9062static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
9063                                                  const Intrinsic::ID ID,
9064 const unsigned VF,
9065 unsigned MinBW) {
9066 SmallVector<Type *> ArgTys;
9067 for (auto [Idx, Arg] : enumerate(CI->args())) {
9070 ArgTys.push_back(Arg->getType());
9071 continue;
9072 }
9073 if (MinBW > 0) {
9075 IntegerType::get(CI->getContext(), MinBW), VF));
9076 continue;
9077 }
9078 }
9079 ArgTys.push_back(FixedVectorType::get(Arg->getType(), VF));
9080 }
9081 return ArgTys;
9082}
9083
9084InstructionCost
9085BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9086 SmallPtrSetImpl<Value *> &CheckedExtracts) {
9087 ArrayRef<Value *> VL = E->Scalars;
9088
9089 Type *ScalarTy = VL[0]->getType();
9090 if (E->State != TreeEntry::NeedToGather) {
9091 if (auto *SI = dyn_cast<StoreInst>(VL[0]))
9092 ScalarTy = SI->getValueOperand()->getType();
9093 else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
9094 ScalarTy = CI->getOperand(0)->getType();
9095 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
9096 ScalarTy = IE->getOperand(1)->getType();
9097 }
9098  if (!isValidElementType(ScalarTy))
9099    return InstructionCost::getInvalid();
9100  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9101
9102 // If we have computed a smaller type for the expression, update VecTy so
9103 // that the costs will be accurate.
9104 auto It = MinBWs.find(E);
9105 Type *OrigScalarTy = ScalarTy;
9106 if (It != MinBWs.end())
9107 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
9108 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
9109 unsigned EntryVF = E->getVectorFactor();
9110 auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
9111
9112 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9113 if (E->State == TreeEntry::NeedToGather) {
9114 if (allConstant(VL))
9115 return 0;
9116    if (isa<InsertElementInst>(VL[0]))
9117      return InstructionCost::getInvalid();
9118 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9119 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
9120 }
9121  InstructionCost CommonCost = 0;
9122  SmallVector<int> Mask;
9123 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
9124 if (!E->ReorderIndices.empty() &&
9125 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9126 SmallVector<int> NewMask;
9127 if (E->getOpcode() == Instruction::Store) {
9128 // For stores the order is actually a mask.
9129 NewMask.resize(E->ReorderIndices.size());
9130 copy(E->ReorderIndices, NewMask.begin());
9131 } else {
9132 inversePermutation(E->ReorderIndices, NewMask);
9133 }
9134 ::addMask(Mask, NewMask);
9135 }
9136 if (NeedToShuffleReuses)
9137 ::addMask(Mask, E->ReuseShuffleIndices);
9138 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
9139 CommonCost =
9140 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
9141 assert((E->State == TreeEntry::Vectorize ||
9142 E->State == TreeEntry::ScatterVectorize ||
9143 E->State == TreeEntry::StridedVectorize) &&
9144 "Unhandled state");
9145 assert(E->getOpcode() &&
9146 ((allSameType(VL) && allSameBlock(VL)) ||
9147 (E->getOpcode() == Instruction::GetElementPtr &&
9148 E->getMainOp()->getType()->isPointerTy())) &&
9149 "Invalid VL");
9150 Instruction *VL0 = E->getMainOp();
9151 unsigned ShuffleOrOp =
9152 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
9153 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
9154 const unsigned Sz = UniqueValues.size();
9155 SmallBitVector UsedScalars(Sz, false);
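  // Mark scalars that are owned by a different tree entry; their scalar cost
  // must not be charged to this node.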
9156 for (unsigned I = 0; I < Sz; ++I) {
9157 if (getTreeEntry(UniqueValues[I]) == E)
9158 continue;
9159 UsedScalars.set(I);
9160 }
9161 auto GetCastContextHint = [&](Value *V) {
9162 if (const TreeEntry *OpTE = getTreeEntry(V))
9163 return getCastContextHint(*OpTE);
9164 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
9165    if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9166      return TTI::CastContextHint::GatherScatter;
9167    return TTI::CastContextHint::None;
9168  };
9169 auto GetCostDiff =
9170      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
9171          function_ref<InstructionCost(InstructionCost)> VectorCost) {
9172 // Calculate the cost of this instruction.
9173 InstructionCost ScalarCost = 0;
9174 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
9175        // For some instructions there is no need to calculate the cost for
9176        // each particular one; we can use the cost of a single instruction
9177        // multiplied by the total number of scalar instructions.
9178 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9179 } else {
9180 for (unsigned I = 0; I < Sz; ++I) {
9181 if (UsedScalars.test(I))
9182 continue;
9183 ScalarCost += ScalarEltCost(I);
9184 }
9185 }
9186
9187 InstructionCost VecCost = VectorCost(CommonCost);
9188 // Check if the current node must be resized, if the parent node is not
9189 // resized.
9190 if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
9191 const EdgeInfo &EI = E->UserTreeIndices.front();
9192 if ((EI.UserTE->getOpcode() != Instruction::Select ||
9193 EI.EdgeIdx != 0) &&
9194 It != MinBWs.end()) {
9195 auto UserBWIt = MinBWs.find(EI.UserTE);
9196 Type *UserScalarTy =
9197 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
9198 if (UserBWIt != MinBWs.end())
9199 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
9200 UserBWIt->second.first);
9201 if (ScalarTy != UserScalarTy) {
9202 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9203 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
9204 unsigned VecOpcode;
9205 auto *UserVecTy =
9206 FixedVectorType::get(UserScalarTy, E->getVectorFactor());
9207 if (BWSz > SrcBWSz)
9208 VecOpcode = Instruction::Trunc;
9209 else
9210 VecOpcode =
9211 It->second.second ? Instruction::SExt : Instruction::ZExt;
9212 TTI::CastContextHint CCH = GetCastContextHint(VL0);
9213 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
9214 CostKind);
9215 }
9216 }
9217 }
9218 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9219 ScalarCost, "Calculated costs for Tree"));
9220 return VecCost - ScalarCost;
9221 };
9222 // Calculate cost difference from vectorizing set of GEPs.
9223 // Negative value means vectorizing is profitable.
9224 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
9225 assert((E->State == TreeEntry::Vectorize ||
9226 E->State == TreeEntry::StridedVectorize) &&
9227 "Entry state expected to be Vectorize or StridedVectorize here.");
9228 InstructionCost ScalarCost = 0;
9229 InstructionCost VecCost = 0;
9230 std::tie(ScalarCost, VecCost) = getGEPCosts(
9231 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
9232 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9233 "Calculated GEPs cost for Tree"));
9234
9235 return VecCost - ScalarCost;
9236 };
9237
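  // Per-opcode cost computation. Each case computes the difference between
  // the vector cost and the summed scalar costs (plus any common shuffle
  // cost), so a negative result means vectorizing this bundle should pay off.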
9238 switch (ShuffleOrOp) {
9239 case Instruction::PHI: {
9240 // Count reused scalars.
9241    InstructionCost ScalarCost = 0;
9242    SmallPtrSet<const TreeEntry *, 4> CountedOps;
9243 for (Value *V : UniqueValues) {
9244 auto *PHI = dyn_cast<PHINode>(V);
9245 if (!PHI)
9246 continue;
9247
9248 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
9249 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
9250 Value *Op = PHI->getIncomingValue(I);
9251 Operands[I] = Op;
9252 }
9253 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
9254 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
9255 if (!OpTE->ReuseShuffleIndices.empty())
9256 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9257 OpTE->Scalars.size());
9258 }
9259
9260 return CommonCost - ScalarCost;
9261 }
9262 case Instruction::ExtractValue:
9263 case Instruction::ExtractElement: {
9264 auto GetScalarCost = [&](unsigned Idx) {
9265 auto *I = cast<Instruction>(UniqueValues[Idx]);
9266 VectorType *SrcVecTy;
9267 if (ShuffleOrOp == Instruction::ExtractElement) {
9268 auto *EE = cast<ExtractElementInst>(I);
9269 SrcVecTy = EE->getVectorOperandType();
9270 } else {
9271 auto *EV = cast<ExtractValueInst>(I);
9272 Type *AggregateTy = EV->getAggregateOperand()->getType();
9273 unsigned NumElts;
9274 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9275 NumElts = ATy->getNumElements();
9276 else
9277 NumElts = AggregateTy->getStructNumElements();
9278 SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts);
9279 }
9280 if (I->hasOneUse()) {
9281 Instruction *Ext = I->user_back();
9282 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9283 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9284          // Use getExtractWithExtendCost() to calculate the cost of the
9285          // extractelement/ext pair.
9286          InstructionCost Cost = TTI->getExtractWithExtendCost(
9287              Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
9288          // Subtract the cost of s|zext which is subtracted separately.
9289          Cost -= TTI->getCastInstrCost(
9290              Ext->getOpcode(), Ext->getType(), I->getType(),
9291              TTI::CastContextHint::None, CostKind, Ext);
9292          return Cost;
9293 }
9294 }
9295      return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
9296                                     CostKind, *getExtractIndex(I));
9297    };
9298 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9299 return GetCostDiff(GetScalarCost, GetVectorCost);
9300 }
9301 case Instruction::InsertElement: {
9302 assert(E->ReuseShuffleIndices.empty() &&
9303 "Unique insertelements only are expected.");
9304 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
9305 unsigned const NumElts = SrcVecTy->getNumElements();
9306 unsigned const NumScalars = VL.size();
9307
9308 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
9309
9310 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9311 unsigned OffsetBeg = *getInsertIndex(VL.front());
9312 unsigned OffsetEnd = OffsetBeg;
9313 InsertMask[OffsetBeg] = 0;
9314 for (auto [I, V] : enumerate(VL.drop_front())) {
9315 unsigned Idx = *getInsertIndex(V);
9316 if (OffsetBeg > Idx)
9317 OffsetBeg = Idx;
9318 else if (OffsetEnd < Idx)
9319 OffsetEnd = Idx;
9320 InsertMask[Idx] = I + 1;
9321 }
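      // InsertMask now maps each destination lane to the index of the scalar
      // inserted there, and [OffsetBeg, OffsetEnd] bounds the lanes in use.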
9322 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
9323 if (NumOfParts > 0)
9324 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9325 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9326 VecScalarsSz;
9327 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9328 unsigned InsertVecSz = std::min<unsigned>(
9329 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
9330 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9331 bool IsWholeSubvector =
9332 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9333 // Check if we can safely insert a subvector. If it is not possible, just
9334 // generate a whole-sized vector and shuffle the source vector and the new
9335 // subvector.
9336 if (OffsetBeg + InsertVecSz > VecSz) {
9337 // Align OffsetBeg to generate correct mask.
9338 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
9339 InsertVecSz = VecSz;
9340 }
9341
9342 APInt DemandedElts = APInt::getZero(NumElts);
9343    // TODO: Add support for Instruction::InsertValue.
9344    SmallVector<int> Mask;
9345 if (!E->ReorderIndices.empty()) {
9346 inversePermutation(E->ReorderIndices, Mask);
9347 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
9348 } else {
9349 Mask.assign(VecSz, PoisonMaskElem);
9350 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9351 }
9352 bool IsIdentity = true;
9353 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9354 Mask.swap(PrevMask);
9355 for (unsigned I = 0; I < NumScalars; ++I) {
9356 unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
9357 DemandedElts.setBit(InsertIdx);
9358 IsIdentity &= InsertIdx - OffsetBeg == I;
9359 Mask[InsertIdx - OffsetBeg] = I;
9360 }
9361 assert(Offset < NumElts && "Failed to find vector index offset");
9362
9363    InstructionCost Cost = 0;
9364 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
9365 /*Insert*/ true, /*Extract*/ false,
9366 CostKind);
9367
9368    // First cost - resize to the actual vector size, if this is not an
9369    // identity shuffle or the vector needs to be shifted.
9370 // Do not calculate the cost if the actual size is the register size and
9371 // we can merge this shuffle with the following SK_Select.
9372 auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz);
9373    if (!IsIdentity)
9374      Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
9375 InsertVecTy, Mask);
9376 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9377 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9378 }));
9379    // Second cost - permutation with the subvector, if some elements come
9380    // from the initial vector, or the cost of inserting a subvector.
9381 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9382 // subvector of ActualVecTy.
9383 SmallBitVector InMask =
9384 isUndefVector(FirstInsert->getOperand(0),
9385 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9386 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9387 if (InsertVecSz != VecSz) {
9388        auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz);
9389        Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
9390 std::nullopt, CostKind, OffsetBeg - Offset,
9391 InsertVecTy);
9392 } else {
9393 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9394 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
9395 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9396 I <= End; ++I)
9397 if (Mask[I] != PoisonMaskElem)
9398 Mask[I] = I + VecSz;
9399 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9400 Mask[I] =
9401 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
9402 Cost +=
9403 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
9404 }
9405 }
9406 return Cost;
9407 }
9408 case Instruction::ZExt:
9409 case Instruction::SExt:
9410 case Instruction::FPToUI:
9411 case Instruction::FPToSI:
9412 case Instruction::FPExt:
9413 case Instruction::PtrToInt:
9414 case Instruction::IntToPtr:
9415 case Instruction::SIToFP:
9416 case Instruction::UIToFP:
9417 case Instruction::Trunc:
9418 case Instruction::FPTrunc:
9419 case Instruction::BitCast: {
9420 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9421 Type *SrcScalarTy = VL0->getOperand(0)->getType();
9422 auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9423 unsigned Opcode = ShuffleOrOp;
9424 unsigned VecOpcode = Opcode;
9425 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9426 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9427 // Check if the values are candidates to demote.
9428 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
9429 if (SrcIt != MinBWs.end()) {
9430 SrcBWSz = SrcIt->second.first;
9431 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
9432 SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9433 }
9434 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9435 if (BWSz == SrcBWSz) {
9436 VecOpcode = Instruction::BitCast;
9437 } else if (BWSz < SrcBWSz) {
9438 VecOpcode = Instruction::Trunc;
9439 } else if (It != MinBWs.end()) {
9440 assert(BWSz > SrcBWSz && "Invalid cast!");
9441 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9442 } else if (SrcIt != MinBWs.end()) {
9443 assert(BWSz > SrcBWSz && "Invalid cast!");
9444 VecOpcode =
9445 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9446 }
9447 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9448 !SrcIt->second.second) {
9449 VecOpcode = Instruction::UIToFP;
9450 }
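    // At this point VecOpcode is the cast that is actually costed for the
    // (possibly demoted) vector types: a no-op bitcast, a trunc, or a
    // sign/zero re-extension.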
9451 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9452 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9453 return TTI->getCastInstrCost(Opcode, VL0->getType(),
9454 VL0->getOperand(0)->getType(),
9456 };
9457 auto GetVectorCost = [=](InstructionCost CommonCost) {
9458      // Do not count the cost here if minimum bitwidth is in effect and the
9459      // cast is just a bitcast (i.e. a no-op).
9460 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9461 return CommonCost;
9462 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9463 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
9464 return CommonCost +
9465 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
9466 VecOpcode == Opcode ? VI : nullptr);
9467 };
9468 return GetCostDiff(GetScalarCost, GetVectorCost);
9469 }
9470 case Instruction::FCmp:
9471 case Instruction::ICmp:
9472 case Instruction::Select: {
9473 CmpInst::Predicate VecPred, SwappedVecPred;
9474 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
9475 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
9476 match(VL0, MatchCmp))
9477 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
9478 else
9479      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9480                                     ? CmpInst::BAD_FCMP_PREDICATE
9481                                     : CmpInst::BAD_ICMP_PREDICATE;
9482 auto GetScalarCost = [&](unsigned Idx) {
9483 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9484      CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9485                                           ? CmpInst::BAD_FCMP_PREDICATE
9486                                           : CmpInst::BAD_ICMP_PREDICATE;
9487 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
9488 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
9489 !match(VI, MatchCmp)) ||
9490 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9491        VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9492                                       ? CmpInst::BAD_FCMP_PREDICATE
9493                                       : CmpInst::BAD_ICMP_PREDICATE;
9494
9495 return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy,
9496 Builder.getInt1Ty(), CurrentPred, CostKind,
9497 VI);
9498 };
9499 auto GetVectorCost = [&](InstructionCost CommonCost) {
9500 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9501
9502      InstructionCost VecCost = TTI->getCmpSelInstrCost(
9503 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9504 // Check if it is possible and profitable to use min/max for selects
9505 // in VL.
9506 //
9507 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
9508 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
9509 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
9510 {VecTy, VecTy});
9511 InstructionCost IntrinsicCost =
9512 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9513        // If the selects are the only users of the compares, the compares
9514        // will be dead and we can subtract their cost.
9515 if (IntrinsicAndUse.second)
9516 IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
9517 MaskTy, VecPred, CostKind);
9518 VecCost = std::min(VecCost, IntrinsicCost);
9519 }
9520 return VecCost + CommonCost;
9521 };
9522 return GetCostDiff(GetScalarCost, GetVectorCost);
9523 }
9524 case Instruction::FNeg:
9525 case Instruction::Add:
9526 case Instruction::FAdd:
9527 case Instruction::Sub:
9528 case Instruction::FSub:
9529 case Instruction::Mul:
9530 case Instruction::FMul:
9531 case Instruction::UDiv:
9532 case Instruction::SDiv:
9533 case Instruction::FDiv:
9534 case Instruction::URem:
9535 case Instruction::SRem:
9536 case Instruction::FRem:
9537 case Instruction::Shl:
9538 case Instruction::LShr:
9539 case Instruction::AShr:
9540 case Instruction::And:
9541 case Instruction::Or:
9542 case Instruction::Xor: {
9543 auto GetScalarCost = [&](unsigned Idx) {
9544 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9545 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9546 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
9547 TTI::OperandValueInfo Op2Info =
9548 TTI::getOperandInfo(VI->getOperand(OpIdx));
9549 SmallVector<const Value *> Operands(VI->operand_values());
9550 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
9551 Op1Info, Op2Info, Operands, VI);
9552 };
9553 auto GetVectorCost = [=](InstructionCost CommonCost) {
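      // An 'and' whose constant operands already cover the demoted bitwidth
      // becomes a no-op after bitwidth minimization, so only the common
      // shuffle cost remains.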
9554 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
9555 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
9556 ArrayRef<Value *> Ops = E->getOperand(I);
9557 if (all_of(Ops, [&](Value *Op) {
9558 auto *CI = dyn_cast<ConstantInt>(Op);
9559 return CI && CI->getValue().countr_one() >= It->second.first;
9560 }))
9561 return CommonCost;
9562 }
9563 }
9564 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9565 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
9566 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
9567 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
9568 Op2Info, std::nullopt, nullptr, TLI) +
9569 CommonCost;
9570 };
9571 return GetCostDiff(GetScalarCost, GetVectorCost);
9572 }
9573 case Instruction::GetElementPtr: {
9574 return CommonCost + GetGEPCostDiff(VL, VL0);
9575 }
9576 case Instruction::Load: {
9577 auto GetScalarCost = [&](unsigned Idx) {
9578 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
9579 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
9580 VI->getAlign(), VI->getPointerAddressSpace(),
9582 };
9583 auto *LI0 = cast<LoadInst>(VL0);
9584 auto GetVectorCost = [&](InstructionCost CommonCost) {
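      // Three vector load shapes are costed here: a consecutive wide load, a
      // strided load, and a masked gather for non-consecutive pointers.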
9585 InstructionCost VecLdCost;
9586 if (E->State == TreeEntry::Vectorize) {
9587 VecLdCost = TTI->getMemoryOpCost(
9588 Instruction::Load, VecTy, LI0->getAlign(),
9589 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
9590 } else if (E->State == TreeEntry::StridedVectorize) {
9591 Align CommonAlignment =
9592 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9593 VecLdCost = TTI->getStridedMemoryOpCost(
9594 Instruction::Load, VecTy, LI0->getPointerOperand(),
9595 /*VariableMask=*/false, CommonAlignment, CostKind);
9596 } else {
9597 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9598 Align CommonAlignment =
9599 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9600 VecLdCost = TTI->getGatherScatterOpCost(
9601 Instruction::Load, VecTy, LI0->getPointerOperand(),
9602 /*VariableMask=*/false, CommonAlignment, CostKind);
9603 }
9604 return VecLdCost + CommonCost;
9605 };
9606
9607 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9608    // If this node generates a masked gather load, it is not a terminal node.
9609    // Hence the address operand cost is estimated separately.
9610 if (E->State == TreeEntry::ScatterVectorize)
9611 return Cost;
9612
9613 // Estimate cost of GEPs since this tree node is a terminator.
9614 SmallVector<Value *> PointerOps(VL.size());
9615 for (auto [I, V] : enumerate(VL))
9616 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
9617 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9618 }
9619 case Instruction::Store: {
9620 bool IsReorder = !E->ReorderIndices.empty();
9621 auto GetScalarCost = [=](unsigned Idx) {
9622 auto *VI = cast<StoreInst>(VL[Idx]);
9623 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
9624 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
9625 VI->getAlign(), VI->getPointerAddressSpace(),
9626 CostKind, OpInfo, VI);
9627 };
9628 auto *BaseSI =
9629 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9630 auto GetVectorCost = [=](InstructionCost CommonCost) {
9631 // We know that we can merge the stores. Calculate the cost.
9632 InstructionCost VecStCost;
9633 if (E->State == TreeEntry::StridedVectorize) {
9634 Align CommonAlignment =
9635 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
9636 VecStCost = TTI->getStridedMemoryOpCost(
9637 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9638 /*VariableMask=*/false, CommonAlignment, CostKind);
9639 } else {
9640 assert(E->State == TreeEntry::Vectorize &&
9641 "Expected either strided or consecutive stores.");
9642 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
9643 VecStCost = TTI->getMemoryOpCost(
9644 Instruction::Store, VecTy, BaseSI->getAlign(),
9645 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
9646 }
9647 return VecStCost + CommonCost;
9648 };
9649 SmallVector<Value *> PointerOps(VL.size());
9650 for (auto [I, V] : enumerate(VL)) {
9651 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9652 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
9653 }
9654
9655 return GetCostDiff(GetScalarCost, GetVectorCost) +
9656 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9657 }
9658 case Instruction::Call: {
9659 auto GetScalarCost = [&](unsigned Idx) {
9660 auto *CI = cast<CallInst>(UniqueValues[Idx]);
9663 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9664 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9665 }
9668 CI->getFunctionType()->params(), CostKind);
9669 };
9670 auto GetVectorCost = [=](InstructionCost CommonCost) {
9671 auto *CI = cast<CallInst>(VL0);
9673 SmallVector<Type *> ArgTys =
9675 It != MinBWs.end() ? It->second.first : 0);
9676 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9677 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9678 };
9679 return GetCostDiff(GetScalarCost, GetVectorCost);
9680 }
9681 case Instruction::ShuffleVector: {
9682 assert(E->isAltShuffle() &&
9683 ((Instruction::isBinaryOp(E->getOpcode()) &&
9684 Instruction::isBinaryOp(E->getAltOpcode())) ||
9685 (Instruction::isCast(E->getOpcode()) &&
9686 Instruction::isCast(E->getAltOpcode())) ||
9687 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9688 "Invalid Shuffle Vector Operand");
9689 // Try to find the previous shuffle node with the same operands and same
9690 // main/alternate ops.
9691 auto TryFindNodeWithEqualOperands = [=]() {
9692 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9693 if (TE.get() == E)
9694 break;
9695 if (TE->isAltShuffle() &&
9696 ((TE->getOpcode() == E->getOpcode() &&
9697 TE->getAltOpcode() == E->getAltOpcode()) ||
9698 (TE->getOpcode() == E->getAltOpcode() &&
9699 TE->getAltOpcode() == E->getOpcode())) &&
9700 TE->hasEqualOperands(*E))
9701 return true;
9702 }
9703 return false;
9704 };
9705 auto GetScalarCost = [&](unsigned Idx) {
9706 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9707 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9708 (void)E;
9709 return TTI->getInstructionCost(VI, CostKind);
9710 };
9711    // Need to clear CommonCost since the final shuffle cost is included in
9712    // the vector cost.
9713 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9714 // VecCost is equal to sum of the cost of creating 2 vectors
9715 // and the cost of creating shuffle.
9716 InstructionCost VecCost = 0;
9717 if (TryFindNodeWithEqualOperands()) {
9718 LLVM_DEBUG({
9719 dbgs() << "SLP: diamond match for alternate node found.\n";
9720 E->dump();
9721 });
9722 // No need to add new vector costs here since we're going to reuse
9723 // same main/alternate vector ops, just do different shuffling.
9724 } else if (Instruction::isBinaryOp(E->getOpcode())) {
9725 VecCost =
9726 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
9727 VecCost +=
9728 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
9729 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9730 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9731 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9732 CI0->getPredicate(), CostKind, VL0);
9733 VecCost += TTIRef.getCmpSelInstrCost(
9734 E->getOpcode(), VecTy, MaskTy,
9735 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
9736 E->getAltOp());
9737 } else {
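        // Alternate casts from a common source type: if the (possibly demoted)
        // result is no wider than the source, at most a single trunc is
        // needed; otherwise both casts are costed.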
9738 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9739 auto *SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9740 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9741 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9742 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9743 unsigned SrcBWSz =
9744 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9745 if (SrcIt != MinBWs.end()) {
9746 SrcBWSz = SrcIt->second.first;
9747 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
9748 SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9749 }
9750 if (BWSz <= SrcBWSz) {
9751 if (BWSz < SrcBWSz)
9752 VecCost =
9753 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9755 LLVM_DEBUG({
9756 dbgs()
9757 << "SLP: alternate extension, which should be truncated.\n";
9758 E->dump();
9759 });
9760 return VecCost;
9761 }
9762 }
9763 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9765 VecCost +=
9766 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9768      }
9769      SmallVector<int> Mask;
9770 E->buildAltOpShuffleMask(
9771 [E](Instruction *I) {
9772 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9773 return I->getOpcode() == E->getAltOpcode();
9774 },
9775 Mask);
9777 FinalVecTy, Mask);
9778      // Patterns like [fadd,fsub] can be combined into a single instruction
9779      // on x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9780      // need to take their order into account when looking for the most used
9781      // order.
9782 unsigned Opcode0 = E->getOpcode();
9783 unsigned Opcode1 = E->getAltOpcode();
9784 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
9785 // If this pattern is supported by the target then we consider the
9786 // order.
9787 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9788 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9789 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9790 return AltVecCost < VecCost ? AltVecCost : VecCost;
9791 }
9792 // TODO: Check the reverse order too.
9793 return VecCost;
9794 };
9795 return GetCostDiff(GetScalarCost, GetVectorCost);
9796 }
9797 default:
9798 llvm_unreachable("Unknown instruction");
9799 }
9800}
9801
9802bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9803 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
9804                    << VectorizableTree.size() << " is fully vectorizable.\n");
9805
9806  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
9807    SmallVector<int> Mask;
9808 return TE->State == TreeEntry::NeedToGather &&
9809 !any_of(TE->Scalars,
9810 [this](Value *V) { return EphValues.contains(V); }) &&
9811 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
9812 TE->Scalars.size() < Limit ||
9813 ((TE->getOpcode() == Instruction::ExtractElement ||
9814 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
9815 isFixedVectorShuffle(TE->Scalars, Mask)) ||
9816 (TE->State == TreeEntry::NeedToGather &&
9817 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
9818 };
9819
9820 // We only handle trees of heights 1 and 2.
9821 if (VectorizableTree.size() == 1 &&
9822 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9823 (ForReduction &&
9824 AreVectorizableGathers(VectorizableTree[0].get(),
9825 VectorizableTree[0]->Scalars.size()) &&
9826 VectorizableTree[0]->getVectorFactor() > 2)))
9827 return true;
9828
9829 if (VectorizableTree.size() != 2)
9830 return false;
9831
9832  // Handle splat and all-constant stores. Also try to vectorize tiny trees
9833  // whose second node is a gather with fewer scalar operands than the initial
9834  // tree element (it may be profitable to shuffle the second gather), or
9835  // whose scalars are extractelements that form a shuffle.
9837 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9838 AreVectorizableGathers(VectorizableTree[1].get(),
9839 VectorizableTree[0]->Scalars.size()))
9840 return true;
9841
9842 // Gathering cost would be too much for tiny trees.
9843 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9844 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9845 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9846 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9847 return false;
9848
9849 return true;
9850}
9851
9852 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
9853                                        TargetTransformInfo *TTI,
9854 bool MustMatchOrInst) {
9855 // Look past the root to find a source value. Arbitrarily follow the
9856 // path through operand 0 of any 'or'. Also, peek through optional
9857 // shift-left-by-multiple-of-8-bits.
9858 Value *ZextLoad = Root;
9859 const APInt *ShAmtC;
9860 bool FoundOr = false;
9861 while (!isa<ConstantExpr>(ZextLoad) &&
9862 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
9863 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
9864 ShAmtC->urem(8) == 0))) {
9865 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9866 ZextLoad = BinOp->getOperand(0);
9867 if (BinOp->getOpcode() == Instruction::Or)
9868 FoundOr = true;
9869 }
9870 // Check if the input is an extended load of the required or/shift expression.
9871 Value *Load;
9872 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9873 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
9874 return false;
9875
9876 // Require that the total load bit width is a legal integer type.
9877 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
9878 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
9879 Type *SrcTy = Load->getType();
9880 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
9881 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
9882 return false;
9883
9884 // Everything matched - assume that we can fold the whole sequence using
9885 // load combining.
9886 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
9887 << *(cast<Instruction>(Root)) << "\n");
9888
9889 return true;
9890}
9891
9892bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
9893 if (RdxKind != RecurKind::Or)
9894 return false;
9895
9896 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9897 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9898 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
9899 /* MatchOr */ false);
9900}
9901
9902bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
9903 // Peek through a final sequence of stores and check if all operations are
9904 // likely to be load-combined.
9905 unsigned NumElts = Stores.size();
9906 for (Value *Scalar : Stores) {
9907 Value *X;
9908 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
9909 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
9910 return false;
9911 }
9912 return true;
9913}
9914
9915bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
9916 // No need to vectorize inserts of gathered values.
9917 if (VectorizableTree.size() == 2 &&
9918 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9919 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9920 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9921 !(isSplat(VectorizableTree[1]->Scalars) ||
9922 allConstant(VectorizableTree[1]->Scalars))))
9923 return true;
9924
9925  // If the graph includes only PHI nodes and gathers, it is definitely not
9926  // profitable to vectorize, so we can skip it if the cost threshold is the
9927  // default. The cost of vectorized PHI nodes is almost always 0 plus the
9928  // cost of the gathers/buildvectors.
9929 constexpr int Limit = 4;
9930 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
9931 !VectorizableTree.empty() &&
9932 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9933 return (TE->State == TreeEntry::NeedToGather &&
9934 TE->getOpcode() != Instruction::ExtractElement &&
9935 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9936 TE->getOpcode() == Instruction::PHI;
9937 }))
9938 return true;
9939
9940 // We can vectorize the tree if its size is greater than or equal to the
9941 // minimum size specified by the MinTreeSize command line option.
9942 if (VectorizableTree.size() >= MinTreeSize)
9943 return false;
9944
9945 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
9946 // can vectorize it if we can prove it fully vectorizable.
9947 if (isFullyVectorizableTinyTree(ForReduction))
9948 return false;
9949
9950  // Check if any of the gather nodes forms an insertelement buildvector
9951  // somewhere.
9952 bool IsAllowedSingleBVNode =
9953 VectorizableTree.size() > 1 ||
9954 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9955 !VectorizableTree.front()->isAltShuffle() &&
9956 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9957 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9958 allSameBlock(VectorizableTree.front()->Scalars));
9959 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9960 return TE->State == TreeEntry::NeedToGather &&
9961 all_of(TE->Scalars, [&](Value *V) {
9962 return isa<ExtractElementInst, UndefValue>(V) ||
9963 (IsAllowedSingleBVNode &&
9964 !V->hasNUsesOrMore(UsesLimit) &&
9965 any_of(V->users(), IsaPred<InsertElementInst>));
9966 });
9967 }))
9968 return false;
9969
9970 assert(VectorizableTree.empty()
9971 ? ExternalUses.empty()
9972 : true && "We shouldn't have any external users");
9973
9974 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
9975 // vectorizable.
9976 return true;
9977}
9978
9979InstructionCost BoUpSLP::getSpillCost() const {
9980 // Walk from the bottom of the tree to the top, tracking which values are
9981 // live. When we see a call instruction that is not part of our tree,
9982 // query TTI to see if there is a cost to keeping values live over it
9983 // (for example, if spills and fills are required).
9984  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9985  InstructionCost Cost = 0;
9986
9987  SmallPtrSet<Instruction *, 4> LiveValues;
9988 Instruction *PrevInst = nullptr;
9989
9990 // The entries in VectorizableTree are not necessarily ordered by their
9991 // position in basic blocks. Collect them and order them by dominance so later
9992 // instructions are guaranteed to be visited first. For instructions in
9993 // different basic blocks, we only scan to the beginning of the block, so
9994 // their order does not matter, as long as all instructions in a basic block
9995 // are grouped together. Using dominance ensures a deterministic order.
9996 SmallVector<Instruction *, 16> OrderedScalars;
9997 for (const auto &TEPtr : VectorizableTree) {
9998 if (TEPtr->State != TreeEntry::Vectorize)
9999 continue;
10000 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
10001 if (!Inst)
10002 continue;
10003 OrderedScalars.push_back(Inst);
10004 }
10005 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
10006 auto *NodeA = DT->getNode(A->getParent());
10007 auto *NodeB = DT->getNode(B->getParent());
10008 assert(NodeA && "Should only process reachable instructions");
10009 assert(NodeB && "Should only process reachable instructions");
10010 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10011 "Different nodes should have different DFS numbers");
10012 if (NodeA != NodeB)
10013 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
10014 return B->comesBefore(A);
10015 });
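  // Walk the ordered scalars; between each pair of adjacent vectorized
  // instructions, count the calls that would force the live vector values to
  // be spilled and reloaded, and charge the corresponding cost.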
10016
10017 for (Instruction *Inst : OrderedScalars) {
10018 if (!PrevInst) {
10019 PrevInst = Inst;
10020 continue;
10021 }
10022
10023 // Update LiveValues.
10024 LiveValues.erase(PrevInst);
10025 for (auto &J : PrevInst->operands()) {
10026 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
10027 LiveValues.insert(cast<Instruction>(&*J));
10028 }
10029
10030 LLVM_DEBUG({
10031 dbgs() << "SLP: #LV: " << LiveValues.size();
10032 for (auto *X : LiveValues)
10033 dbgs() << " " << X->getName();
10034 dbgs() << ", Looking at ";
10035 Inst->dump();
10036 });
10037
10038 // Now find the sequence of instructions between PrevInst and Inst.
10039 unsigned NumCalls = 0;
10040 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
10041 PrevInstIt =
10042 PrevInst->getIterator().getReverse();
10043 while (InstIt != PrevInstIt) {
10044 if (PrevInstIt == PrevInst->getParent()->rend()) {
10045 PrevInstIt = Inst->getParent()->rbegin();
10046 continue;
10047 }
10048
10049 auto NoCallIntrinsic = [this](Instruction *I) {
10050 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
10051 if (II->isAssumeLikeIntrinsic())
10052 return true;
10053        FastMathFlags FMF;
10054        SmallVector<Type *, 4> Tys;
10055 for (auto &ArgOp : II->args())
10056 Tys.push_back(ArgOp->getType());
10057 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
10058 FMF = FPMO->getFastMathFlags();
10059 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
10060 FMF);
10061        InstructionCost IntrCost =
10062            TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
10063        InstructionCost CallCost = TTI->getCallInstrCost(
10064 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
10065 if (IntrCost < CallCost)
10066 return true;
10067 }
10068 return false;
10069 };
10070
10071 // Debug information does not impact spill cost.
10072 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10073 &*PrevInstIt != PrevInst)
10074 NumCalls++;
10075
10076 ++PrevInstIt;
10077 }
10078
10079      if (NumCalls) {
10080        SmallVector<Type *, 4> V;
10081 for (auto *II : LiveValues) {
10082 auto *ScalarTy = II->getType();
10083 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
10084 ScalarTy = VectorTy->getElementType();
10085 V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
10086 }
10087 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
10088 }
10089
10090 PrevInst = Inst;
10091 }
10092
10093 return Cost;
10094}
10095
10096/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
10097/// the buildvector sequence.
10098static bool isFirstInsertElement(const InsertElementInst *IE1,
10099 const InsertElementInst *IE2) {
10100 if (IE1 == IE2)
10101 return false;
10102 const auto *I1 = IE1;
10103 const auto *I2 = IE2;
10104 const InsertElementInst *PrevI1;
10105 const InsertElementInst *PrevI2;
10106 unsigned Idx1 = *getInsertIndex(IE1);
10107 unsigned Idx2 = *getInsertIndex(IE2);
10108 do {
10109 if (I2 == IE1)
10110 return true;
10111 if (I1 == IE2)
10112 return false;
10113 PrevI1 = I1;
10114 PrevI2 = I2;
10115 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10116 getInsertIndex(I1).value_or(Idx2) != Idx2)
10117 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
10118 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
10119 getInsertIndex(I2).value_or(Idx1) != Idx1)
10120 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
10121 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10122 llvm_unreachable("Two different buildvectors not expected.");
10123}
10124
10125namespace {
10126/// Returns the incoming Value * if the requested type is Value * too, or a
10127/// default-constructed value otherwise.
10128struct ValueSelect {
10129 template <typename U>
10130 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
10131 return V;
10132 }
10133 template <typename U>
10134 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
10135 return U();
10136 }
10137};
10138} // namespace
10139
10140/// Does the analysis of the provided shuffle masks and performs the requested
10141/// actions on the vectors with the given shuffle masks. It tries to do it in
10142/// several steps:
10143/// 1. If the Base vector is not an undef vector, resize the very first mask to
10144/// have a common VF and perform the action for 2 input vectors (including the
10145/// non-undef Base). Other shuffle masks are combined with the result of the
10146/// first stage and processed as a shuffle of 2 vectors.
10147/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
10148/// the action only for 1 vector with the given mask, if it is not the identity
10149/// mask.
10150/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
10151/// vectors, combining the masks properly between the steps.
10152template <typename T>
10153static T *performExtractsShuffleAction(
10154 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
10155 function_ref<unsigned(T *)> GetVF,
10156    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
10157    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
10158 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
10159 SmallVector<int> Mask(ShuffleMask.begin()->second);
10160 auto VMIt = std::next(ShuffleMask.begin());
10161 T *Prev = nullptr;
10162 SmallBitVector UseMask =
10163 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
10164 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
10165 if (!IsBaseUndef.all()) {
10166 // Base is not undef, need to combine it with the next subvectors.
10167 std::pair<T *, bool> Res =
10168 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
10169 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
10170 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
10171 if (Mask[Idx] == PoisonMaskElem)
10172 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
10173 else
10174 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
10175 }
10176 auto *V = ValueSelect::get<T *>(Base);
10177 (void)V;
10178 assert((!V || GetVF(V) == Mask.size()) &&
10179 "Expected base vector of VF number of elements.");
10180 Prev = Action(Mask, {nullptr, Res.first});
10181 } else if (ShuffleMask.size() == 1) {
10182 // Base is undef and only 1 vector is shuffled - perform the action only for
10183 // single vector, if the mask is not the identity mask.
10184 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10185 /*ForSingleMask=*/true);
10186 if (Res.second)
10187 // Identity mask is found.
10188 Prev = Res.first;
10189 else
10190 Prev = Action(Mask, {ShuffleMask.begin()->first});
10191 } else {
10192 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
10193 // shuffles step by step, combining shuffle between the steps.
10194 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10195 unsigned Vec2VF = GetVF(VMIt->first);
10196 if (Vec1VF == Vec2VF) {
10197 // No need to resize the input vectors since they are of the same size, we
10198 // can shuffle them directly.
10199 ArrayRef<int> SecMask = VMIt->second;
10200 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10201 if (SecMask[I] != PoisonMaskElem) {
10202 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10203 Mask[I] = SecMask[I] + Vec1VF;
10204 }
10205 }
10206 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10207 } else {
10208 // Vectors of different sizes - resize and reshuffle.
10209 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10210 /*ForSingleMask=*/false);
10211 std::pair<T *, bool> Res2 =
10212 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10213 ArrayRef<int> SecMask = VMIt->second;
10214 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10215 if (Mask[I] != PoisonMaskElem) {
10216 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10217 if (Res1.second)
10218 Mask[I] = I;
10219 } else if (SecMask[I] != PoisonMaskElem) {
10220 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10221 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
10222 }
10223 }
10224 Prev = Action(Mask, {Res1.first, Res2.first});
10225 }
10226 VMIt = std::next(VMIt);
10227 }
10228 bool IsBaseNotUndef = !IsBaseUndef.all();
10229 (void)IsBaseNotUndef;
10230 // Perform requested actions for the remaining masks/vectors.
10231 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10232 // Shuffle other input vectors, if any.
10233 std::pair<T *, bool> Res =
10234 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10235 ArrayRef<int> SecMask = VMIt->second;
10236 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10237 if (SecMask[I] != PoisonMaskElem) {
10238 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
10239 "Multiple uses of scalars.");
10240 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
10241 } else if (Mask[I] != PoisonMaskElem) {
10242 Mask[I] = I;
10243 }
10244 }
10245 Prev = Action(Mask, {Prev, Res.first});
10246 }
10247 return Prev;
10248}
10249
10252 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
10253 << VectorizableTree.size() << ".\n");
10254
10255 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10256
10257 SmallPtrSet<Value *, 4> CheckedExtracts;
10258 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10259 TreeEntry &TE = *VectorizableTree[I];
10260 if (TE.State == TreeEntry::NeedToGather) {
10261 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
10262 E && E->getVectorFactor() == TE.getVectorFactor() &&
10263 E->isSame(TE.Scalars)) {
10264        // Some gather nodes might be exactly the same as some vectorizable
10265        // nodes after reordering; this case needs to be handled here.
10266 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
10267 << shortBundleName(TE.Scalars) << ".\n"
10268 << "SLP: Current total cost = " << Cost << "\n");
10269 continue;
10270 }
10271 }
10272
10273 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
10274 Cost += C;
10275 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
10276 << shortBundleName(TE.Scalars) << ".\n"
10277 << "SLP: Current total cost = " << Cost << "\n");
10278 }
10279
10280 SmallPtrSet<Value *, 16> ExtractCostCalculated;
10281  InstructionCost ExtractCost = 0;
10282  SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
10283  SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
10284 SmallVector<APInt> DemandedElts;
10285 SmallDenseSet<Value *, 4> UsedInserts;
10287 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10288 for (ExternalUser &EU : ExternalUses) {
10289 // We only add extract cost once for the same scalar.
10290 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10291 !ExtractCostCalculated.insert(EU.Scalar).second)
10292 continue;
10293
10294 // Uses by ephemeral values are free (because the ephemeral value will be
10295 // removed prior to code generation, and so the extraction will be
10296 // removed as well).
10297 if (EphValues.count(EU.User))
10298 continue;
10299
10300 // No extract cost for vector "scalar"
10301 if (isa<FixedVectorType>(EU.Scalar->getType()))
10302 continue;
10303
10304    // If the found user is an insertelement, do not calculate the extract
10305    // cost but try to detect it as a final shuffled/identity match.
10306 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
10307 VU && VU->getOperand(1) == EU.Scalar) {
10308 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
10309 if (!UsedInserts.insert(VU).second)
10310 continue;
10311 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
10312 if (InsertIdx) {
10313 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10314 auto *It = find_if(
10315 FirstUsers,
10316 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10318 VU, cast<InsertElementInst>(Pair.first),
10319 [this](InsertElementInst *II) -> Value * {
10320 Value *Op0 = II->getOperand(0);
10321 if (getTreeEntry(II) && !getTreeEntry(Op0))
10322 return nullptr;
10323 return Op0;
10324 });
10325 });
10326 int VecId = -1;
10327 if (It == FirstUsers.end()) {
10328 (void)ShuffleMasks.emplace_back();
10329 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10330 if (Mask.empty())
10331 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10332 // Find the insertvector, vectorized in tree, if any.
10333 Value *Base = VU;
10334 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10335 if (IEBase != EU.User &&
10336 (!IEBase->hasOneUse() ||
10337 getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10338 break;
10339 // Build the mask for the vectorized insertelement instructions.
10340 if (const TreeEntry *E = getTreeEntry(IEBase)) {
10341 VU = IEBase;
10342 do {
10343 IEBase = cast<InsertElementInst>(Base);
10344 int Idx = *getInsertIndex(IEBase);
10345 assert(Mask[Idx] == PoisonMaskElem &&
10346 "InsertElementInstruction used already.");
10347 Mask[Idx] = Idx;
10348 Base = IEBase->getOperand(0);
10349 } while (E == getTreeEntry(Base));
10350 break;
10351 }
10352 Base = cast<InsertElementInst>(Base)->getOperand(0);
10353 }
10354 FirstUsers.emplace_back(VU, ScalarTE);
10355 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10356 VecId = FirstUsers.size() - 1;
10357 auto It = MinBWs.find(ScalarTE);
10358 if (It != MinBWs.end() &&
10359 VectorCasts
10360 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10361 .second) {
10362 unsigned BWSz = It->second.first;
10363 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10364 unsigned VecOpcode;
10365 if (DstBWSz < BWSz)
10366 VecOpcode = Instruction::Trunc;
10367 else
10368 VecOpcode =
10369 It->second.second ? Instruction::SExt : Instruction::ZExt;
10372 VecOpcode, FTy,
10374 IntegerType::get(FTy->getContext(), BWSz),
10375 FTy->getNumElements()),
10377 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10378 << " for extending externally used vector with "
10379 "non-equal minimum bitwidth.\n");
10380 Cost += C;
10381 }
10382 } else {
10383 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10384 It->first = VU;
10385 VecId = std::distance(FirstUsers.begin(), It);
10386 }
10387 int InIdx = *InsertIdx;
10388 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10389 if (Mask.empty())
10390 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10391 Mask[InIdx] = EU.Lane;
10392 DemandedElts[VecId].setBit(InIdx);
10393 continue;
10394 }
10395 }
10396 }
10397    // Leave the GEPs as-is; they are free in most cases and it is better to
10398    // keep them as GEPs.
10400 if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10401 if (!ValueToExtUses) {
10402 ValueToExtUses.emplace();
10403 for_each(enumerate(ExternalUses), [&](const auto &P) {
10404 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10405 });
10406 }
10407      // We can use the original GEP if none of its operands are vectorized or
10408      // they are already marked as externally used.
10409 bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10410 if (!getTreeEntry(V))
10411 return true;
10412 auto It = ValueToExtUses->find(V);
10413 if (It != ValueToExtUses->end()) {
10414 // Replace all uses to avoid compiler crash.
10415 ExternalUses[It->second].User = nullptr;
10416 return true;
10417 }
10418 return false;
10419 });
10420 if (CanBeUsedAsGEP) {
10421 ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10422 ExternalUsesAsGEPs.insert(EU.Scalar);
10423 continue;
10424 }
10425 }
10426
10427 // If we plan to rewrite the tree in a smaller type, we will need to sign
10428 // extend the extracted value back to the original type. Here, we account
10429 // for the extract and the added cost of the sign extend if needed.
10430 auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
10431 auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10432 if (It != MinBWs.end()) {
10433 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10434 unsigned Extend =
10435 It->second.second ? Instruction::SExt : Instruction::ZExt;
10436 VecTy = FixedVectorType::get(MinTy, BundleWidth);
10437 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10438 VecTy, EU.Lane);
10439 } else {
10440 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10441 CostKind, EU.Lane);
10442 }
10443 }
10444 // Add reduced value cost, if resized.
10445 if (!VectorizedVals.empty()) {
10446 const TreeEntry &Root = *VectorizableTree.front().get();
10447 auto BWIt = MinBWs.find(&Root);
10448 if (BWIt != MinBWs.end()) {
10449 Type *DstTy = Root.Scalars.front()->getType();
10450 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10451 unsigned SrcSz =
10452 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10453 if (OriginalSz != SrcSz) {
10454 unsigned Opcode = Instruction::Trunc;
10455 if (OriginalSz > SrcSz)
10456 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10457 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10458 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10461 }
10462 }
10463 }
10464
10465 InstructionCost SpillCost = getSpillCost();
10466 Cost += SpillCost + ExtractCost;
10467 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10468 bool) {
10469 InstructionCost C = 0;
10470 unsigned VF = Mask.size();
10471 unsigned VecVF = TE->getVectorFactor();
10472 if (VF != VecVF &&
10473 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10475 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10476 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10477 OrigMask.begin());
10478      C = TTI->getShuffleCost(
10479          TTI::SK_PermuteSingleSrc,
10480 FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
10481 LLVM_DEBUG(
10482 dbgs() << "SLP: Adding cost " << C
10483 << " for final shuffle of insertelement external users.\n";
10484 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10485 Cost += C;
10486 return std::make_pair(TE, true);
10487 }
10488 return std::make_pair(TE, false);
10489 };
10490 // Calculate the cost of the reshuffled vectors, if any.
10491 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10492 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10493 auto Vector = ShuffleMasks[I].takeVector();
10494 unsigned VF = 0;
10495 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10497 assert((TEs.size() == 1 || TEs.size() == 2) &&
10498 "Expected exactly 1 or 2 tree entries.");
10499 if (TEs.size() == 1) {
10500 if (VF == 0)
10501 VF = TEs.front()->getVectorFactor();
10502 auto *FTy =
10503 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10504 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
10505 !all_of(enumerate(Mask), [=](const auto &Data) {
10506 return Data.value() == PoisonMaskElem ||
10507 (Data.index() < VF &&
10508 static_cast<int>(Data.index()) == Data.value());
10509          })) {
10510        InstructionCost C =
10511            ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
10512 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10513 << " for final shuffle of insertelement "
10514 "external users.\n";
10515 TEs.front()->dump();
10516 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10517 Cost += C;
10518 }
10519 } else {
10520 if (VF == 0) {
10521 if (TEs.front() &&
10522 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10523 VF = TEs.front()->getVectorFactor();
10524 else
10525 VF = Mask.size();
10526 }
10527 auto *FTy =
10528            FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10529        InstructionCost C =
10530            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
10531 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10532 << " for final shuffle of vector node and external "
10533 "insertelement users.\n";
10534 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10535 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10536 Cost += C;
10537 }
10538 VF = Mask.size();
10539 return TEs.back();
10540 };
10541 (void)performExtractsShuffleAction<const TreeEntry>(
10542 MutableArrayRef(Vector.data(), Vector.size()), Base,
10543 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10544 EstimateShufflesCost);
10545 InstructionCost InsertCost = TTI->getScalarizationOverhead(
10546 cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10547 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
10548 Cost -= InsertCost;
10549 }
10550
10551 // Add the cost for reduced value resize (if required).
10552 if (ReductionBitWidth != 0) {
10553 assert(UserIgnoreList && "Expected reduction tree.");
10554 const TreeEntry &E = *VectorizableTree.front().get();
10555 auto It = MinBWs.find(&E);
10556 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10557 unsigned SrcSize = It->second.first;
10558 unsigned DstSize = ReductionBitWidth;
10559 unsigned Opcode = Instruction::Trunc;
10560 if (SrcSize < DstSize)
10561 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10562 auto *SrcVecTy =
10563 FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor());
10564 auto *DstVecTy =
10565 FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor());
10566 TTI::CastContextHint CCH = getCastContextHint(E);
10567 InstructionCost CastCost;
10568 switch (E.getOpcode()) {
10569 case Instruction::SExt:
10570 case Instruction::ZExt:
10571 case Instruction::Trunc: {
10572 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10573 CCH = getCastContextHint(*OpTE);
10574 break;
10575 }
10576 default:
10577 break;
10578 }
10579 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
10580 TTI::TCK_RecipThroughput);
10581 Cost += CastCost;
10582 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10583 << " for final resize for reduction from " << SrcVecTy
10584 << " to " << DstVecTy << "\n";
10585 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10586 }
10587 }
10588
10589#ifndef NDEBUG
10590 SmallString<256> Str;
10591 {
10592 raw_svector_ostream OS(Str);
10593 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10594 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10595 << "SLP: Total Cost = " << Cost << ".\n";
10596 }
10597 LLVM_DEBUG(dbgs() << Str);
10598 if (ViewSLPTree)
10599 ViewGraph(this, "SLP" + F->getName(), false, Str);
10600#endif
10601
10602 return Cost;
10603}
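// Illustrative note (not part of the original source): the value returned
// above is roughly
//   Cost = sum(per-node costs) + SpillCost + ExtractCost
//          - InsertCost(insertelement users) + reduction-resize cast cost.
// For example, with assumed values SpillCost = 2, ExtractCost = 3, a node
// cost sum of -6 and InsertCost = 1, the function returns -6 + 2 + 3 - 1 = -2,
// i.e. the tree is considered profitable to vectorize.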
10604
10605/// Tries to find extractelement instructions with constant indices from a
10606/// fixed vector type and gathers such instructions into a group that can most
10607/// likely be matched as a shuffle of 1 or 2 input vectors. If the attempt is
10608/// successful, the matched scalars are replaced by poison values in \p VL for
10609/// future analysis.
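///
/// Illustrative example (an assumed sketch, not taken from a test): for
/// \code
///   VL = { e0, e1, %a, %b }, where
///   e0 = extractelement <4 x i32> %v, i32 0
///   e1 = extractelement <4 x i32> %v, i32 1
/// \endcode
/// a successful match leaves VL = { poison, poison, %a, %b }, fills the first
/// two mask lanes with the extract indices 0 and 1 (the rest stay poison) and
/// returns roughly TTI::SK_PermuteSingleSrc, since all matched extracts read
/// from the same source vector %v.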
10610std::optional<TTI::ShuffleKind>
10611BoUpSLP::tryToGatherSingleRegisterExtractElements(
10612 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10613 // Scan list of gathered scalars for extractelements that can be represented
10614 // as shuffles.
10615 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10616 SmallVector<int> UndefVectorExtracts;
10617 for (int I = 0, E = VL.size(); I < E; ++I) {
10618 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10619 if (!EI) {
10620 if (isa<UndefValue>(VL[I]))
10621 UndefVectorExtracts.push_back(I);
10622 continue;
10623 }
10624 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10625 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10626 continue;
10627 std::optional<unsigned> Idx = getExtractIndex(EI);
10628 // Undefined index.
10629 if (!Idx) {
10630 UndefVectorExtracts.push_back(I);
10631 continue;
10632 }
10633 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10634 ExtractMask.reset(*Idx);
10635 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
10636 UndefVectorExtracts.push_back(I);
10637 continue;
10638 }
10639 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10640 }
10641 // Sort the vector operands by the maximum number of uses in extractelements.
10642 DenseMap<unsigned, SmallVector<Value *>> VFToVector;
10643 for (const auto &Data : VectorOpToIdx)
10644 VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
10645 .push_back(Data.first);
10646 for (auto &Data : VFToVector) {
10647 stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
10648 return VectorOpToIdx.find(V1)->second.size() >
10649 VectorOpToIdx.find(V2)->second.size();
10650 });
10651 }
10652 // Find the best pair of the vectors with the same number of elements or a
10653 // single vector.
10654 const int UndefSz = UndefVectorExtracts.size();
10655 unsigned SingleMax = 0;
10656 Value *SingleVec = nullptr;
10657 unsigned PairMax = 0;
10658 std::pair<Value *, Value *> PairVec(nullptr, nullptr);
10659 for (auto &Data : VFToVector) {
10660 Value *V1 = Data.second.front();
10661 if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
10662 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10663 SingleVec = V1;
10664 }
10665 Value *V2 = nullptr;
10666 if (Data.second.size() > 1)
10667 V2 = *std::next(Data.second.begin());
10668 if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
10669 UndefSz) {
10670 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
10671 PairVec = std::make_pair(V1, V2);
10672 }
10673 }
10674 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10675 return std::nullopt;
10676 // Check whether it is better to perform a shuffle of 2 vectors or just of a
10677 // single vector.
10678 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10679 SmallVector<Value *> GatheredExtracts(
10680 VL.size(), PoisonValue::get(VL.front()->getType()));
10681 if (SingleMax >= PairMax && SingleMax) {
10682 for (int Idx : VectorOpToIdx[SingleVec])
10683 std::swap(GatheredExtracts[Idx], VL[Idx]);
10684 } else {
10685 for (Value *V : {PairVec.first, PairVec.second})
10686 for (int Idx : VectorOpToIdx[V])
10687 std::swap(GatheredExtracts[Idx], VL[Idx]);
10688 }
10689 // Add extracts from undefs too.
10690 for (int Idx : UndefVectorExtracts)
10691 std::swap(GatheredExtracts[Idx], VL[Idx]);
10692 // Check that the gather of extractelements can be represented as just a
10693 // shuffle of the single/two vectors the scalars are extracted from.
10694 std::optional<TTI::ShuffleKind> Res =
10695 isFixedVectorShuffle(GatheredExtracts, Mask);
10696 if (!Res) {
10697 // TODO: try to check other subsets if possible.
10698 // Restore the original VL if attempt was not successful.
10699 copy(SavedVL, VL.begin());
10700 return std::nullopt;
10701 }
10702 // Restore unused scalars from mask, if some of the extractelements were not
10703 // selected for shuffle.
10704 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10705 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
10706 isa<UndefValue>(GatheredExtracts[I])) {
10707 std::swap(VL[I], GatheredExtracts[I]);
10708 continue;
10709 }
10710 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10711 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10712 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10713 is_contained(UndefVectorExtracts, I))
10714 continue;
10715 }
10716 return Res;
10717}
10718
10719/// Tries to find extractelement instructions with constant indices from a
10720/// fixed vector type and gathers such instructions into a group that can most
10721/// likely be matched as a shuffle of 1 or 2 input vectors. If the attempt is
10722/// successful, the matched scalars are replaced by poison values in \p VL for
10723/// future analysis.
10724 SmallVector<std::optional<TTI::ShuffleKind>>
10725 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10726 SmallVectorImpl<int> &Mask,
10727 unsigned NumParts) const {
10728 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
10729 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10730 Mask.assign(VL.size(), PoisonMaskElem);
10731 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10732 for (unsigned Part : seq<unsigned>(NumParts)) {
10733 // Scan list of gathered scalars for extractelements that can be represented
10734 // as shuffles.
10735 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
10736 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
10737 SmallVector<int> SubMask;
10738 std::optional<TTI::ShuffleKind> Res =
10739 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10740 ShufflesRes[Part] = Res;
10741 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
10742 }
10743 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
10744 return Res.has_value();
10745 }))
10746 ShufflesRes.clear();
10747 return ShufflesRes;
10748}
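// Worked example (illustrative, assumed values): with VL.size() == 8 and
// NumParts == 2, SliceSize is 4, so Part 0 scans VL[0..3] and writes its
// SubMask into Mask[0..3] while Part 1 scans VL[4..7] and writes into
// Mask[4..7]; if no part matches any extractelements, the whole result is
// cleared and an empty vector is returned.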
10749
10750std::optional<TargetTransformInfo::ShuffleKind>
10751BoUpSLP::isGatherShuffledSingleRegisterEntry(
10752 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10753 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10754 Entries.clear();
10755 // TODO: currently checking only for Scalars in the tree entry, need to count
10756 // reused elements too for better cost estimation.
10757 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10758 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10759 const BasicBlock *TEInsertBlock = nullptr;
10760 // Main node of PHI entries keeps the correct order of operands/incoming
10761 // blocks.
10762 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10763 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10764 TEInsertPt = TEInsertBlock->getTerminator();
10765 } else {
10766 TEInsertBlock = TEInsertPt->getParent();
10767 }
10768 if (!DT->isReachableFromEntry(TEInsertBlock))
10769 return std::nullopt;
10770 auto *NodeUI = DT->getNode(TEInsertBlock);
10771 assert(NodeUI && "Should only process reachable instructions");
10772 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10773 auto CheckOrdering = [&](const Instruction *InsertPt) {
10774 // Argument InsertPt is an instruction where vector code for some other
10775 // tree entry (one that shares one or more scalars with TE) is going to be
10776 // generated. This lambda returns true if insertion point of vector code
10777 // for the TE dominates that point (otherwise dependency is the other way
10778 // around). The other node is not limited to be of a gather kind. Gather
10779 // nodes are not scheduled and their vector code is inserted before their
10780 // first user. If the user is a PHI, that is supposed to be at the end of a
10781 // predecessor block. Otherwise it is the last instruction among the scalars of
10782 // the user node. So, instead of checking dependency between instructions
10783 // themselves, we check dependency between their insertion points for vector
10784 // code (since each scalar instruction ends up as a lane of a vector
10785 // instruction).
10786 const BasicBlock *InsertBlock = InsertPt->getParent();
10787 auto *NodeEUI = DT->getNode(InsertBlock);
10788 if (!NodeEUI)
10789 return false;
10790 assert((NodeUI == NodeEUI) ==
10791 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10792 "Different nodes should have different DFS numbers");
10793 // Check the order of the gather nodes users.
10794 if (TEInsertPt->getParent() != InsertBlock &&
10795 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
10796 return false;
10797 if (TEInsertPt->getParent() == InsertBlock &&
10798 TEInsertPt->comesBefore(InsertPt))
10799 return false;
10800 return true;
10801 };
10802 // Find all tree entries used by the gathered values. If no common entries
10803 // are found, this is not a shuffle.
10804 // Here we build a set of tree nodes for each gathered value and try to
10805 // find the intersection between these sets. If we have at least one common
10806 // tree node for each gathered value, we have just a permutation of a
10807 // single vector. If we have 2 different sets, we are in a situation where we
10808 // have a permutation of 2 input vectors.
10809 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10810 DenseMap<Value *, int> UsedValuesEntry;
10811 for (Value *V : VL) {
10812 if (isConstant(V))
10813 continue;
10814 // Build a list of tree entries where V is used.
10815 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10816 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10817 if (TEPtr == TE)
10818 continue;
10819 assert(any_of(TEPtr->Scalars,
10820 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10821 "Must contain at least single gathered value.");
10822 assert(TEPtr->UserTreeIndices.size() == 1 &&
10823 "Expected only single user of a gather node.");
10824 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10825
10826 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10827 const Instruction *InsertPt =
10828 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
10829 : &getLastInstructionInBundle(UseEI.UserTE);
10830 if (TEInsertPt == InsertPt) {
10831 // If 2 gathers are operands of the same entry (regardless of whether
10832 // the user is a PHI or anything else), compare operand indices and use
10833 // the earlier one as the base.
10834 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10835 continue;
10836 // If the user instruction is used for some reason in different
10837 // vectorized nodes, make the decision depend on the node index.
10838 if (TEUseEI.UserTE != UseEI.UserTE &&
10839 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10840 continue;
10841 }
10842
10843 // Check if the user node of the TE comes after user node of TEPtr,
10844 // otherwise TEPtr depends on TE.
10845 if ((TEInsertBlock != InsertPt->getParent() ||
10846 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10847 !CheckOrdering(InsertPt))
10848 continue;
10849 VToTEs.insert(TEPtr);
10850 }
10851 if (const TreeEntry *VTE = getTreeEntry(V)) {
10852 if (ForOrder) {
10853 if (VTE->State != TreeEntry::Vectorize) {
10854 auto It = MultiNodeScalars.find(V);
10855 if (It == MultiNodeScalars.end())
10856 continue;
10857 VTE = *It->getSecond().begin();
10858 // Iterate through all vectorized nodes.
10859 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
10860 return MTE->State == TreeEntry::Vectorize;
10861 });
10862 if (MIt == It->getSecond().end())
10863 continue;
10864 VTE = *MIt;
10865 }
10866 }
10867 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10868 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10869 continue;
10870 VToTEs.insert(VTE);
10871 }
10872 if (VToTEs.empty())
10873 continue;
10874 if (UsedTEs.empty()) {
10875 // The first iteration, just insert the list of nodes to vector.
10876 UsedTEs.push_back(VToTEs);
10877 UsedValuesEntry.try_emplace(V, 0);
10878 } else {
10879 // Need to check if there are any previously used tree nodes which use V.
10880 // If there are no such nodes, consider that we have one more input
10881 // vector.
10882 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
10883 unsigned Idx = 0;
10884 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
10885 // Do we have a non-empty intersection of previously listed tree entries
10886 // and tree entries using current V?
10887 set_intersect(VToTEs, Set);
10888 if (!VToTEs.empty()) {
10889 // Yes, write the new subset and continue analysis for the next
10890 // scalar.
10891 Set.swap(VToTEs);
10892 break;
10893 }
10894 VToTEs = SavedVToTEs;
10895 ++Idx;
10896 }
10897 // No non-empty intersection found - need to add a second set of possible
10898 // source vectors.
10899 if (Idx == UsedTEs.size()) {
10900 // If the number of input vectors is greater than 2, this is not a
10901 // permutation; fall back to the regular gather.
10902 // TODO: support multiple reshuffled nodes.
10903 if (UsedTEs.size() == 2)
10904 continue;
10905 UsedTEs.push_back(SavedVToTEs);
10906 Idx = UsedTEs.size() - 1;
10907 }
10908 UsedValuesEntry.try_emplace(V, Idx);
10909 }
10910 }
10911
10912 if (UsedTEs.empty()) {
10913 Entries.clear();
10914 return std::nullopt;
10915 }
10916
10917 unsigned VF = 0;
10918 if (UsedTEs.size() == 1) {
10919 // Keep the order to avoid non-determinism.
10920 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
10921 UsedTEs.front().end());
10922 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10923 return TE1->Idx < TE2->Idx;
10924 });
10925 // Try to find the perfect match in another gather node at first.
10926 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
10927 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
10928 });
10929 if (It != FirstEntries.end() &&
10930 ((*It)->getVectorFactor() == VL.size() ||
10931 ((*It)->getVectorFactor() == TE->Scalars.size() &&
10932 TE->ReuseShuffleIndices.size() == VL.size() &&
10933 (*It)->isSame(TE->Scalars)))) {
10934 Entries.push_back(*It);
10935 if ((*It)->getVectorFactor() == VL.size()) {
10936 std::iota(std::next(Mask.begin(), Part * VL.size()),
10937 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
10938 } else {
10939 SmallVector<int> CommonMask = TE->getCommonMask();
10940 copy(CommonMask, Mask.begin());
10941 }
10942 // Clear undef scalars.
10943 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10944 if (isa<PoisonValue>(VL[I]))
10945 Mask[I + Part * VL.size()] = PoisonMaskElem;
10946 return TargetTransformInfo::SK_PermuteSingleSrc;
10947 }
10948 // No perfect match, just shuffle, so choose the first tree node from the
10949 // tree.
10950 Entries.push_back(FirstEntries.front());
10951 } else {
10952 // Try to find nodes with the same vector factor.
10953 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
10954 // Keep the order of tree nodes to avoid non-determinism.
10955 DenseMap<unsigned, const TreeEntry *> VFToTE;
10956 for (const TreeEntry *TE : UsedTEs.front()) {
10957 unsigned VF = TE->getVectorFactor();
10958 auto It = VFToTE.find(VF);
10959 if (It != VFToTE.end()) {
10960 if (It->second->Idx > TE->Idx)
10961 It->getSecond() = TE;
10962 continue;
10963 }
10964 VFToTE.try_emplace(VF, TE);
10965 }
10966 // Same, keep the order to avoid non-determinism.
10967 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
10968 UsedTEs.back().end());
10969 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10970 return TE1->Idx < TE2->Idx;
10971 });
10972 for (const TreeEntry *TE : SecondEntries) {
10973 auto It = VFToTE.find(TE->getVectorFactor());
10974 if (It != VFToTE.end()) {
10975 VF = It->first;
10976 Entries.push_back(It->second);
10977 Entries.push_back(TE);
10978 break;
10979 }
10980 }
10981 // No 2 source vectors with the same vector factor - just choose 2 with max
10982 // index.
10983 if (Entries.empty()) {
10984 Entries.push_back(*llvm::max_element(
10985 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
10986 return TE1->Idx < TE2->Idx;
10987 }));
10988 Entries.push_back(SecondEntries.front());
10989 VF = std::max(Entries.front()->getVectorFactor(),
10990 Entries.back()->getVectorFactor());
10991 }
10992 }
10993
10994 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
10995 // Checks if the 2 PHIs are compatible, i.e. have a high probability of
10996 // being vectorized together.
10997 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
10998 auto *PHI = cast<PHINode>(V);
10999 auto *PHI1 = cast<PHINode>(V1);
11000 // Check that all incoming values are compatible/from the same parent (if
11001 // they are instructions).
11002 // The incoming values are compatible if they are all constants, or
11003 // instructions with the same/alternate opcodes from the same basic block.
11004 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
11005 Value *In = PHI->getIncomingValue(I);
11006 Value *In1 = PHI1->getIncomingValue(I);
11007 if (isConstant(In) && isConstant(In1))
11008 continue;
11009 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
11010 return false;
11011 if (cast<Instruction>(In)->getParent() !=
11012 cast<Instruction>(In1)->getParent())
11013 return false;
11014 }
11015 return true;
11016 };
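// Illustrative example (assumed IR, not from a test): the PHIs
//   %p  = phi i32 [ %add1, %bb1 ], [ 5, %bb2 ]
//   %p1 = phi i32 [ %add2, %bb1 ], [ 7, %bb2 ]
// are treated as compatible here: the second incoming values are both
// constants, and %add1/%add2 are assumed to share the same (or alternate)
// opcode and parent block, so the two PHIs have a good chance of being
// vectorized together.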
11017 // Check if the value can be ignored during analysis for shuffled gathers.
11018 // We suppose it is better to ignore instructions which do not form splats,
11019 // are not vectorized, are not extractelements (these instructions will be
11020 // handled by extractelement processing) or may form a vector node in future.
11021 auto MightBeIgnored = [=](Value *V) {
11022 auto *I = dyn_cast<Instruction>(V);
11023 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
11024 !isVectorLikeInstWithConstOps(I) &&
11025 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
11026 };
11027 // Check that the neighbor instruction may form a full vector node with the
11028 // current instruction V. It is possible, if they have same/alternate opcode
11029 // and same parent basic block.
11030 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
11031 Value *V1 = VL[Idx];
11032 bool UsedInSameVTE = false;
11033 auto It = UsedValuesEntry.find(V1);
11034 if (It != UsedValuesEntry.end())
11035 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
11036 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11037 getSameOpcode({V, V1}, *TLI).getOpcode() &&
11038 cast<Instruction>(V)->getParent() ==
11039 cast<Instruction>(V1)->getParent() &&
11040 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
11041 };
11042 // Build a shuffle mask for better cost estimation and vector emission.
11043 SmallBitVector UsedIdxs(Entries.size());
11044 SmallVector<std::pair<unsigned, int>> EntryLanes;
11045 for (int I = 0, E = VL.size(); I < E; ++I) {
11046 Value *V = VL[I];
11047 auto It = UsedValuesEntry.find(V);
11048 if (It == UsedValuesEntry.end())
11049 continue;
11050 // Do not try to shuffle scalars if they are constants, or instructions
11051 // that may be vectorized later as a result of the subsequent buildvector
11052 // vectorization.
11053 if (isConstant(V) || (MightBeIgnored(V) &&
11054 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
11055 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
11056 continue;
11057 unsigned Idx = It->second;
11058 EntryLanes.emplace_back(Idx, I);
11059 UsedIdxs.set(Idx);
11060 }
11061 // Iterate through all shuffled scalars and select entries, which can be used
11062 // for final shuffle.
11063 SmallVector<const TreeEntry *> TempEntries;
11064 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
11065 if (!UsedIdxs.test(I))
11066 continue;
11067 // Fix the entry number for the given scalar. If it is the first entry, set
11068 // Pair.first to 0, otherwise to 1 (we currently select at most 2 nodes).
11069 // These indices are used as the vector offset when calculating the final
11070 // shuffle mask.
11071 for (std::pair<unsigned, int> &Pair : EntryLanes)
11072 if (Pair.first == I)
11073 Pair.first = TempEntries.size();
11074 TempEntries.push_back(Entries[I]);
11075 }
11076 Entries.swap(TempEntries);
11077 if (EntryLanes.size() == Entries.size() &&
11078 !VL.equals(ArrayRef(TE->Scalars)
11079 .slice(Part * VL.size(),
11080 std::min<int>(VL.size(), TE->Scalars.size())))) {
11081 // We may have only 1 or 2 entries here. If the number of scalars is equal
11082 // to the number of entries, there is no need to do the analysis, it is not
11083 // very profitable. Since VL is not the same as TE->Scalars, we already
11084 // have some shuffles before it. Cut off this unprofitable case.
11085 Entries.clear();
11086 return std::nullopt;
11087 }
11088 // Build the final mask, check for the identity shuffle, if possible.
11089 bool IsIdentity = Entries.size() == 1;
11090 // Pair.first is the offset to the vector, while Pair.second is the index of
11091 // scalar in the list.
11092 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
11093 unsigned Idx = Part * VL.size() + Pair.second;
11094 Mask[Idx] =
11095 Pair.first * VF +
11096 (ForOrder ? std::distance(
11097 Entries[Pair.first]->Scalars.begin(),
11098 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
11099 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
11100 IsIdentity &= Mask[Idx] == Pair.second;
11101 }
11102 switch (Entries.size()) {
11103 case 1:
11104 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11105 return TargetTransformInfo::SK_PermuteSingleSrc;
11106 break;
11107 case 2:
11108 if (EntryLanes.size() > 2 || VL.size() <= 2)
11109 return TargetTransformInfo::SK_PermuteTwoSrc;
11110 break;
11111 default:
11112 break;
11113 }
11114 Entries.clear();
11115 // Clear the corresponding mask elements.
11116 std::fill(std::next(Mask.begin(), Part * VL.size()),
11117 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
11118 return std::nullopt;
11119}
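// Worked example (illustrative, assumed values): with two selected entries,
// both of vector factor VF == 4, a scalar found in lane 2 of the second entry
// (Pair.first == 1) at position Pair.second == 5 of the current part yields
// Mask[Part * VL.size() + 5] = 1 * 4 + 2 = 6, and the routine reports
// TTI::SK_PermuteTwoSrc for that part (subject to the checks above).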
11120
11121 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
11122 BoUpSLP::isGatherShuffledEntry(
11123 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
11124 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
11125 bool ForOrder) {
11126 assert(NumParts > 0 && NumParts < VL.size() &&
11127 "Expected positive number of registers.");
11128 Entries.clear();
11129 // No need to check for the topmost gather node.
11130 if (TE == VectorizableTree.front().get())
11131 return {};
11132 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11133 if (TE->isNonPowOf2Vec())
11134 return {};
11135 Mask.assign(VL.size(), PoisonMaskElem);
11136 assert(TE->UserTreeIndices.size() == 1 &&
11137 "Expected only single user of the gather node.");
11138 assert(VL.size() % NumParts == 0 &&
11139 "Number of scalars must be divisible by NumParts.");
11140 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
11141 SmallVector<std::optional<TTI::ShuffleKind>> Res;
11142 for (unsigned Part : seq<unsigned>(NumParts)) {
11143 ArrayRef<Value *> SubVL =
11144 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
11145 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
11146 std::optional<TTI::ShuffleKind> SubRes =
11147 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
11148 ForOrder);
11149 if (!SubRes)
11150 SubEntries.clear();
11151 Res.push_back(SubRes);
11152 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
11153 SubEntries.front()->getVectorFactor() == VL.size() &&
11154 (SubEntries.front()->isSame(TE->Scalars) ||
11155 SubEntries.front()->isSame(VL))) {
11156 SmallVector<const TreeEntry *> LocalSubEntries;
11157 LocalSubEntries.swap(SubEntries);
11158 Entries.clear();
11159 Res.clear();
11160 std::iota(Mask.begin(), Mask.end(), 0);
11161 // Clear undef scalars.
11162 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11163 if (isa<PoisonValue>(VL[I]))
11164 Mask[I] = PoisonMaskElem;
11165 Entries.emplace_back(1, LocalSubEntries.front());
11166 Res.push_back(TTI::SK_PermuteSingleSrc);
11167 return Res;
11168 }
11169 }
11170 if (all_of(Res,
11171 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
11172 Entries.clear();
11173 return {};
11174 }
11175 return Res;
11176}
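// Illustrative note (assumed values): for a gather of 8 scalars split into
// NumParts == 2 registers, the returned vector holds one entry per 4-wide
// part, e.g. { SK_PermuteSingleSrc, std::nullopt } when only the first half
// can be modelled as a shuffle of already-built tree entries; an all-nullopt
// result is collapsed to an empty vector.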
11177
11178InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
11179 Type *ScalarTy) const {
11180 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
11181 bool DuplicateNonConst = false;
11182 // Find the cost of inserting/extracting values from the vector.
11183 // Check if the same elements are inserted several times and count them as
11184 // shuffle candidates.
11185 APInt ShuffledElements = APInt::getZero(VL.size());
11186 DenseMap<Value *, unsigned> UniqueElements;
11187 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11188 InstructionCost Cost;
11189 auto EstimateInsertCost = [&](unsigned I, Value *V) {
11190 if (V->getType() != ScalarTy) {
11191 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
11192 TTI::CastContextHint::None, CostKind);
11193 V = nullptr;
11194 }
11195 if (!ForPoisonSrc)
11196 Cost +=
11197 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
11198 I, Constant::getNullValue(VecTy), V);
11199 };
11200 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
11201 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
11202 Value *V = VL[I];
11203 // No need to shuffle duplicates for constants.
11204 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
11205 ShuffledElements.setBit(I);
11206 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
11207 continue;
11208 }
11209
11210 auto Res = UniqueElements.try_emplace(V, I);
11211 if (Res.second) {
11212 EstimateInsertCost(I, V);
11213 ShuffleMask[I] = I;
11214 continue;
11215 }
11216
11217 DuplicateNonConst = true;
11218 ShuffledElements.setBit(I);
11219 ShuffleMask[I] = Res.first->second;
11220 }
11221 if (ForPoisonSrc)
11222 Cost =
11223 TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
11224 /*Extract*/ false, CostKind);
11225 if (DuplicateNonConst)
11226 Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
11227 VecTy, ShuffleMask);
11228 return Cost;
11229}
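// Worked example (illustrative, assumed values): for VL = { %a, %b, %a, %c }
// the repeated %a at index 2 sets bit 2 of ShuffledElements and records
// ShuffleMask = {0, 1, 0, 3}; the unique scalars are costed as vector inserts
// (or via the scalarization overhead for a poison source), and because
// DuplicateNonConst is set, one extra permute-style shuffle cost is added for
// the duplicated lane.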
11230
11231// Perform operand reordering on the instructions in VL and return the reordered
11232// operands in Left and Right.
11233void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
11234 SmallVectorImpl<Value *> &Left,
11235 SmallVectorImpl<Value *> &Right,
11236 const BoUpSLP &R) {
11237 if (VL.empty())
11238 return;
11239 VLOperands Ops(VL, R);
11240 // Reorder the operands in place.
11241 Ops.reorder();
11242 Left = Ops.getVL(0);
11243 Right = Ops.getVL(1);
11244}
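// Illustrative example (assumed scalars, not from a test): for the bundle
// VL = { %x0 + %y0, %y1 + %x1 } the VLOperands reordering may swap the
// operands of the second lane so that Left = { %x0, %x1 } and
// Right = { %y0, %y1 }, grouping look-alike operands per lane before the
// operands themselves are vectorized.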
11245
11246Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
11247 auto &Res = EntryToLastInstruction.FindAndConstruct(E);
11248 if (Res.second)
11249 return *Res.second;
11250 // Get the basic block this bundle is in. All instructions in the bundle
11251 // should be in this block (except for extractelement-like instructions with
11252 // constant indices).
11253 auto *Front = E->getMainOp();
11254 auto *BB = Front->getParent();
11255 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
11256 if (E->getOpcode() == Instruction::GetElementPtr &&
11257 !isa<GetElementPtrInst>(V))
11258 return true;
11259 auto *I = cast<Instruction>(V);
11260 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11261 isVectorLikeInstWithConstOps(I);
11262 }));
11263
11264 auto FindLastInst = [&]() {
11265 Instruction *LastInst = Front;
11266 for (Value *V : E->Scalars) {
11267 auto *I = dyn_cast<Instruction>(V);
11268 if (!I)
11269 continue;
11270 if (LastInst->getParent() == I->getParent()) {
11271 if (LastInst->comesBefore(I))
11272 LastInst = I;
11273 continue;
11274 }
11275 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11276 !isa<GetElementPtrInst>(I)) ||
11277 (isVectorLikeInstWithConstOps(LastInst) &&
11279 "Expected vector-like or non-GEP in GEP node insts only.");
11280 if (!DT->isReachableFromEntry(LastInst->getParent())) {
11281 LastInst = I;
11282 continue;
11283 }
11284 if (!DT->isReachableFromEntry(I->getParent()))
11285 continue;
11286 auto *NodeA = DT->getNode(LastInst->getParent());
11287 auto *NodeB = DT->getNode(I->getParent());
11288 assert(NodeA && "Should only process reachable instructions");
11289 assert(NodeB && "Should only process reachable instructions");
11290 assert((NodeA == NodeB) ==
11291 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11292 "Different nodes should have different DFS numbers");
11293 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11294 LastInst = I;
11295 }
11296 BB = LastInst->getParent();
11297 return LastInst;
11298 };
11299
11300 auto FindFirstInst = [&]() {
11301 Instruction *FirstInst = Front;
11302 for (Value *V : E->Scalars) {
11303 auto *I = dyn_cast<Instruction>(V);
11304 if (!I)
11305 continue;
11306 if (FirstInst->getParent() == I->getParent()) {
11307 if (I->comesBefore(FirstInst))
11308 FirstInst = I;
11309 continue;
11310 }
11311 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11312 !isa<GetElementPtrInst>(I)) ||
11313 (isVectorLikeInstWithConstOps(FirstInst) &&
11315 "Expected vector-like or non-GEP in GEP node insts only.");
11316 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
11317 FirstInst = I;
11318 continue;
11319 }
11320 if (!DT->isReachableFromEntry(I->getParent()))
11321 continue;
11322 auto *NodeA = DT->getNode(FirstInst->getParent());
11323 auto *NodeB = DT->getNode(I->getParent());
11324 assert(NodeA && "Should only process reachable instructions");
11325 assert(NodeB && "Should only process reachable instructions");
11326 assert((NodeA == NodeB) ==
11327 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11328 "Different nodes should have different DFS numbers");
11329 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11330 FirstInst = I;
11331 }
11332 return FirstInst;
11333 };
11334
11335 // Set the insert point to the beginning of the basic block if the entry
11336 // should not be scheduled.
11337 if (doesNotNeedToSchedule(E->Scalars) ||
11338 (E->State != TreeEntry::NeedToGather &&
11339 all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
11340 if ((E->getOpcode() == Instruction::GetElementPtr &&
11341 any_of(E->Scalars,
11342 [](Value *V) {
11343 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11344 })) ||
11345 all_of(E->Scalars,
11346 [](Value *V) {
11347 return !isVectorLikeInstWithConstOps(V) &&
11348 isUsedOutsideBlock(V);
11349 }) ||
11350 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
11351 all_of(E->Scalars, [](Value *V) {
11352 return isa<ExtractElementInst, UndefValue>(V) ||
11353 areAllOperandsNonInsts(V);
11354 })))
11355 Res.second = FindLastInst();
11356 else
11357 Res.second = FindFirstInst();
11358 return *Res.second;
11359 }
11360
11361 // Find the last instruction. The common case should be that BB has been
11362 // scheduled, and the last instruction is VL.back(). So we start with
11363 // VL.back() and iterate over schedule data until we reach the end of the
11364 // bundle. The end of the bundle is marked by null ScheduleData.
11365 if (BlocksSchedules.count(BB)) {
11366 Value *V = E->isOneOf(E->Scalars.back());
11367 if (doesNotNeedToBeScheduled(V))
11368 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11369 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11370 if (Bundle && Bundle->isPartOfBundle())
11371 for (; Bundle; Bundle = Bundle->NextInBundle)
11372 if (Bundle->OpValue == Bundle->Inst)
11373 Res.second = Bundle->Inst;
11374 }
11375
11376 // LastInst can still be null at this point if there's either not an entry
11377 // for BB in BlocksSchedules or there's no ScheduleData available for
11378 // VL.back(). This can be the case if buildTree_rec aborts for various
11379 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11380 // size is reached, etc.). ScheduleData is initialized in the scheduling
11381 // "dry-run".
11382 //
11383 // If this happens, we can still find the last instruction by brute force. We
11384 // iterate forwards from Front (inclusive) until we either see all
11385 // instructions in the bundle or reach the end of the block. If Front is the
11386 // last instruction in program order, LastInst will be set to Front, and we
11387 // will visit all the remaining instructions in the block.
11388 //
11389 // One of the reasons we exit early from buildTree_rec is to place an upper
11390 // bound on compile-time. Thus, taking an additional compile-time hit here is
11391 // not ideal. However, this should be exceedingly rare since it requires that
11392 // we both exit early from buildTree_rec and that the bundle be out-of-order
11393 // (causing us to iterate all the way to the end of the block).
11394 if (!Res.second)
11395 Res.second = FindLastInst();
11396 assert(Res.second && "Failed to find last instruction in bundle");
11397 return *Res.second;
11398}
11399
11400void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11401 auto *Front = E->getMainOp();
11402 Instruction *LastInst = &getLastInstructionInBundle(E);
11403 assert(LastInst && "Failed to find last instruction in bundle");
11404 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11405 // If the instruction is PHI, set the insert point after all the PHIs.
11406 bool IsPHI = isa<PHINode>(LastInst);
11407 if (IsPHI)
11408 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11409 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
11410 doesNotNeedToSchedule(E->Scalars))) {
11411 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11412 } else {
11413 // Set the insertion point after the last instruction in the bundle. Set the
11414 // debug location to Front.
11415 Builder.SetInsertPoint(
11416 LastInst->getParent(),
11417 std::next(LastInst->getIterator()));
11418 }
11419 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11420}
11421
11422Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
11423 // List of instructions/lanes from current block and/or the blocks which are
11424 // part of the current loop. These instructions will be inserted at the end to
11425 // make it possible to optimize loops and hoist invariant instructions out of
11426 // the loop's body with better chances for success.
11427 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
11428 SmallSet<int, 4> PostponedIndices;
11429 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11430 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11431 SmallPtrSet<BasicBlock *, 4> Visited;
11432 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11433 InsertBB = InsertBB->getSinglePredecessor();
11434 return InsertBB && InsertBB == InstBB;
11435 };
11436 for (int I = 0, E = VL.size(); I < E; ++I) {
11437 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11438 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11439 getTreeEntry(Inst) ||
11440 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11441 PostponedIndices.insert(I).second)
11442 PostponedInsts.emplace_back(Inst, I);
11443 }
11444
11445 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11446 Type *Ty) {
11447 Value *Scalar = V;
11448 if (Scalar->getType() != Ty) {
11449 assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11450 "Expected integer types only.");
11451 Value *V = Scalar;
11452 if (auto *CI = dyn_cast<CastInst>(Scalar);
11453 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
11454 Value *Op = CI->getOperand(0);
11455 if (auto *IOp = dyn_cast<Instruction>(Op);
11456 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
11457 V = Op;
11458 }
11459 Scalar = Builder.CreateIntCast(
11460 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
11461 }
11462
11463 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11464 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11465 if (!InsElt)
11466 return Vec;
11467 GatherShuffleExtractSeq.insert(InsElt);
11468 CSEBlocks.insert(InsElt->getParent());
11469 // Add to our 'need-to-extract' list.
11470 if (isa<Instruction>(V)) {
11471 if (TreeEntry *Entry = getTreeEntry(V)) {
11472 // Find which lane we need to extract.
11473 User *UserOp = nullptr;
11474 if (Scalar != V) {
11475 if (auto *SI = dyn_cast<Instruction>(Scalar))
11476 UserOp = SI;
11477 } else {
11478 UserOp = InsElt;
11479 }
11480 if (UserOp) {
11481 unsigned FoundLane = Entry->findLaneForValue(V);
11482 ExternalUses.emplace_back(V, UserOp, FoundLane);
11483 }
11484 }
11485 }
11486 return Vec;
11487 };
11488 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
11489 Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11490 SmallVector<int> NonConsts;
11491 // Insert constant values at first.
11492 for (int I = 0, E = VL.size(); I < E; ++I) {
11493 if (PostponedIndices.contains(I))
11494 continue;
11495 if (!isConstant(VL[I])) {
11496 NonConsts.push_back(I);
11497 continue;
11498 }
11499 if (Root) {
11500 if (!isa<UndefValue>(VL[I])) {
11501 NonConsts.push_back(I);
11502 continue;
11503 }
11504 if (isa<PoisonValue>(VL[I]))
11505 continue;
11506 if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11507 if (SV->getMaskValue(I) == PoisonMaskElem)
11508 continue;
11509 }
11510 }
11511 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11512 }
11513 // Insert non-constant values.
11514 for (int I : NonConsts)
11515 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11516 // Append instructions, which are/may be part of the loop, in the end to make
11517 // it possible to hoist non-loop-based instructions.
11518 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11519 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11520
11521 return Vec;
11522}
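// Illustrative example of the emitted gather sequence (assumed IR, not from a
// test): for VL = { %a, %b } with ScalarTy == i32 and no Root, the builder
// produces roughly
//   %v0 = insertelement <2 x i32> poison, i32 %a, i32 0
//   %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
// with constants inserted first and loop-carried scalars postponed to the end
// of the chain, as described in the comments above.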
11523
11524/// Merges shuffle masks and emits the final shuffle instruction, if required.
11525/// It supports shuffling of 2 input vectors. It implements lazy shuffle
11526/// emission: the actual shuffle instruction is generated only if it is
11527/// actually required. Otherwise, the shuffle instruction emission is delayed
11528/// till the end of the process, to reduce the number of emitted instructions
11529/// and ease further analysis/transformations.
11530/// The class will also look through the previously emitted shuffle
11531/// instructions and properly mark indices in the mask as undef.
11532/// For example, given the code
11533/// \code
11534/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11535/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11536/// \endcode
11537/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
11538/// look through %s1 and %s2 and emit
11539/// \code
11540/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11541/// \endcode
11542/// instead.
11543/// If 2 operands are of different size, the smallest one will be resized and
11544/// the mask recalculated properly.
11545/// For example, given the code
11546/// \code
11547/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11548/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11549/// \endcode
11550/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
11551/// look through %s1 and %s2 and emit
11552/// \code
11553/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11554/// \endcode
11555/// instead.
11556class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11557 bool IsFinalized = false;
11558 /// Combined mask for all applied operands and masks. It is built during
11559 /// analysis and actual emission of shuffle vector instructions.
11560 SmallVector<int> CommonMask;
11561 /// List of operands for the shuffle vector instruction. It holds at most 2
11562 /// operands; if a 3rd one is going to be added, the first 2 are combined into
11563 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
11564 /// resulting shuffle and the second operand is set to be the newly added
11565 /// operand. The \p CommonMask is transformed accordingly after that.
11566 SmallVector<Value *, 2> InVectors;
11567 Type *ScalarTy = nullptr;
11568 IRBuilderBase &Builder;
11569 BoUpSLP &R;
11570
11571 class ShuffleIRBuilder {
11572 IRBuilderBase &Builder;
11573 /// Holds all of the instructions that we gathered.
11574 SetVector<Instruction *> &GatherShuffleExtractSeq;
11575 /// A list of blocks that we are going to CSE.
11576 DenseSet<BasicBlock *> &CSEBlocks;
11577 /// Data layout.
11578 const DataLayout &DL;
11579
11580 public:
11581 ShuffleIRBuilder(IRBuilderBase &Builder,
11582 SetVector<Instruction *> &GatherShuffleExtractSeq,
11583 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11584 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11585 CSEBlocks(CSEBlocks), DL(DL) {}
11586 ~ShuffleIRBuilder() = default;
11587 /// Creates shufflevector for the 2 operands with the given mask.
11588 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11589 if (V1->getType() != V2->getType()) {
11590 assert(V2->getType()->isIntOrIntVectorTy() &&
11591 V1->getType()->isIntOrIntVectorTy() &&
11592 "Expected integer vector types only.");
11593 if (V1->getType() != V2->getType()) {
11594 if (cast<VectorType>(V2->getType())
11595 ->getElementType()
11596 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
11597 ->getElementType()
11598 ->getIntegerBitWidth())
11599 V2 = Builder.CreateIntCast(
11600 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
11601 else
11602 V1 = Builder.CreateIntCast(
11603 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
11604 }
11605 }
11606 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11607 if (auto *I = dyn_cast<Instruction>(Vec)) {
11608 GatherShuffleExtractSeq.insert(I);
11609 CSEBlocks.insert(I->getParent());
11610 }
11611 return Vec;
11612 }
11613 /// Creates permutation of the single vector operand with the given mask, if
11614 /// it is not identity mask.
11615 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11616 if (Mask.empty())
11617 return V1;
11618 unsigned VF = Mask.size();
11619 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
11620 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
11621 return V1;
11622 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
11623 if (auto *I = dyn_cast<Instruction>(Vec)) {
11624 GatherShuffleExtractSeq.insert(I);
11625 CSEBlocks.insert(I->getParent());
11626 }
11627 return Vec;
11628 }
11629 Value *createIdentity(Value *V) { return V; }
11630 Value *createPoison(Type *Ty, unsigned VF) {
11631 return PoisonValue::get(FixedVectorType::get(Ty, VF));
11632 }
11633 /// Resizes 2 input vectors to match their sizes, if they are not equal
11634 /// yet. The smallest vector is resized to the size of the larger vector.
11635 void resizeToMatch(Value *&V1, Value *&V2) {
11636 if (V1->getType() == V2->getType())
11637 return;
11638 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
11639 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11640 int VF = std::max(V1VF, V2VF);
11641 int MinVF = std::min(V1VF, V2VF);
11642 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11643 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
11644 0);
11645 Value *&Op = MinVF == V1VF ? V1 : V2;
11646 Op = Builder.CreateShuffleVector(Op, IdentityMask);
11647 if (auto *I = dyn_cast<Instruction>(Op)) {
11648 GatherShuffleExtractSeq.insert(I);
11649 CSEBlocks.insert(I->getParent());
11650 }
11651 if (MinVF == V1VF)
11652 V1 = Op;
11653 else
11654 V2 = Op;
11655 }
11656 };
11657
11658 /// Smart shuffle instruction emission, walks through shuffle trees and
11659 /// tries to find the best matching vector for the actual shuffle
11660 /// instruction.
11661 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11662 assert(V1 && "Expected at least one vector value.");
11663 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11664 R.CSEBlocks, *R.DL);
11665 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11666 ShuffleBuilder);
11667 }
11668
11669 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
11670 /// shuffle emission.
11671 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11672 ArrayRef<int> Mask) {
11673 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11674 if (Mask[Idx] != PoisonMaskElem)
11675 CommonMask[Idx] = Idx;
11676 }
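// Worked example (illustrative): after emitting a shuffle with
// CommonMask = {3, 1, -1, 0}, calling
// transformMaskAfterShuffle(CommonMask, CommonMask) rewrites it to
// {0, 1, -1, 3}: every lane that was taken from the just-emitted shuffle now
// refers to itself, while poison lanes stay poison.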
11677
11678 /// Cast value \p V to the vector type with the same number of elements, but
11679 /// the base type \p ScalarTy.
11680 Value *castToScalarTyElem(Value *V,
11681 std::optional<bool> IsSigned = std::nullopt) {
11682 auto *VecTy = cast<VectorType>(V->getType());
11683 if (VecTy->getElementType() == ScalarTy)
11684 return V;
11685 return Builder.CreateIntCast(
11686 V, VectorType::get(ScalarTy, VecTy->getElementCount()),
11687 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
11688 }
11689
11690public:
11691 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
11692 : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11693
11694 /// Adjusts extractelements after reusing them.
11695 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11696 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11697 unsigned NumParts, bool &UseVecBaseAsInput) {
11698 UseVecBaseAsInput = false;
11699 SmallPtrSet<Value *, 4> UniqueBases;
11700 Value *VecBase = nullptr;
11701 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11702 int Idx = Mask[I];
11703 if (Idx == PoisonMaskElem)
11704 continue;
11705 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
11706 VecBase = EI->getVectorOperand();
11707 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
11708 VecBase = TE->VectorizedValue;
11709 assert(VecBase && "Expected vectorized value.");
11710 UniqueBases.insert(VecBase);
11711 // If the only use is vectorized, the extractelement itself can be
11712 // deleted.
11713 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
11714 any_of(EI->users(), [&](User *U) {
11715 const TreeEntry *UTE = R.getTreeEntry(U);
11716 return !UTE || R.MultiNodeScalars.contains(U) ||
11717 (isa<GetElementPtrInst>(U) &&
11718 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
11719 count_if(R.VectorizableTree,
11720 [&](const std::unique_ptr<TreeEntry> &TE) {
11721 return any_of(TE->UserTreeIndices,
11722 [&](const EdgeInfo &Edge) {
11723 return Edge.UserTE == UTE;
11724 }) &&
11725 is_contained(TE->Scalars, EI);
11726 }) != 1;
11727 }))
11728 continue;
11729 R.eraseInstruction(EI);
11730 }
11731 if (NumParts == 1 || UniqueBases.size() == 1) {
11732 assert(VecBase && "Expected vectorized value.");
11733 return castToScalarTyElem(VecBase);
11734 }
11735 UseVecBaseAsInput = true;
11736 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11737 for (auto [I, Idx] : enumerate(Mask))
11738 if (Idx != PoisonMaskElem)
11739 Idx = I;
11740 };
11741 // Perform a multi-register vector shuffle, joining the parts into a single
11742 // virtual long vector.
11743 // Need to shuffle each part independently and then insert all these parts
11744 // into a long virtual vector register, forming the original vector.
11745 Value *Vec = nullptr;
11746 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11747 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
11748 for (unsigned Part : seq<unsigned>(NumParts)) {
11749 unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
11750 ArrayRef<Value *> VL =
11751 ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
11752 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
11753 constexpr int MaxBases = 2;
11754 SmallVector<Value *, MaxBases> Bases(MaxBases);
11755#ifndef NDEBUG
11756 int PrevSize = 0;
11757#endif // NDEBUG
11758 for (const auto [I, V]: enumerate(VL)) {
11759 if (SubMask[I] == PoisonMaskElem)
11760 continue;
11761 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11762 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11763 VecOp = TE->VectorizedValue;
11764 assert(VecOp && "Expected vectorized value.");
11765 const int Size =
11766 cast<FixedVectorType>(VecOp->getType())->getNumElements();
11767#ifndef NDEBUG
11768 assert((PrevSize == Size || PrevSize == 0) &&
11769 "Expected vectors of the same size.");
11770 PrevSize = Size;
11771#endif // NDEBUG
11772 VecOp = castToScalarTyElem(VecOp);
11773 Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
11774 }
11775 if (!Bases.front())
11776 continue;
11777 Value *SubVec;
11778 if (Bases.back()) {
11779 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11780 TransformToIdentity(SubMask);
11781 } else {
11782 SubVec = Bases.front();
11783 }
11784 if (!Vec) {
11785 Vec = SubVec;
11786 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11787 [&](unsigned P) {
11788 ArrayRef<int> SubMask =
11789 Mask.slice(P * SliceSize,
11790 getNumElems(Mask.size(),
11791 SliceSize, P));
11792 return all_of(SubMask, [](int Idx) {
11793 return Idx == PoisonMaskElem;
11794 });
11795 })) &&
11796 "Expected first part or all previous parts masked.");
11797 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11798 } else {
11799 unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11800 if (Vec->getType() != SubVec->getType()) {
11801 unsigned SubVecVF =
11802 cast<FixedVectorType>(SubVec->getType())->getNumElements();
11803 VF = std::max(VF, SubVecVF);
11804 }
11805 // Adjust SubMask.
11806 for (int &Idx : SubMask)
11807 if (Idx != PoisonMaskElem)
11808 Idx += VF;
11809 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11810 Vec = createShuffle(Vec, SubVec, VecMask);
11811 TransformToIdentity(VecMask);
11812 }
11813 }
11814 copy(VecMask, Mask.begin());
11815 return Vec;
11816 }
11817 /// Checks if the specified entry \p E needs to be delayed because of its
11818 /// dependency nodes.
11819 std::optional<Value *>
11820 needToDelay(const TreeEntry *E,
11821 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
11822 // No need to delay emission if all deps are ready.
11823 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
11824 return all_of(
11825 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
11826 }))
11827 return std::nullopt;
11828 // Postpone gather emission, will be emitted after the end of the
11829 // process to keep correct order.
11830 auto *ResVecTy = FixedVectorType::get(ScalarTy, E->getVectorFactor());
11831 return Builder.CreateAlignedLoad(
11832 ResVecTy,
11833 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
11834 MaybeAlign());
11835 }
11836 /// Adds 2 input vectors (in form of tree entries) and the mask for their
11837 /// shuffling.
11838 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
11839 Value *V1 = E1.VectorizedValue;
11840 if (V1->getType()->isIntOrIntVectorTy())
11841 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
11842 return !isKnownNonNegative(
11843 V, SimplifyQuery(*R.DL));
11844 }));
11845 Value *V2 = E2.VectorizedValue;
11846 if (V2->getType()->isIntOrIntVectorTy())
11847 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
11848 return !isKnownNonNegative(
11849 V, SimplifyQuery(*R.DL));
11850 }));
11851 add(V1, V2, Mask);
11852 }
11853 /// Adds single input vector (in form of tree entry) and the mask for its
11854 /// shuffling.
11855 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
11856 Value *V1 = E1.VectorizedValue;
11857 if (V1->getType()->isIntOrIntVectorTy())
11858 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
11859 return !isKnownNonNegative(
11860 V, SimplifyQuery(*R.DL));
11861 }));
11862 add(V1, Mask);
11863 }
11864 /// Adds 2 input vectors and the mask for their shuffling.
11865 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
11866 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
11867 V1 = castToScalarTyElem(V1);
11868 V2 = castToScalarTyElem(V2);
11869 if (InVectors.empty()) {
11870 InVectors.push_back(V1);
11871 InVectors.push_back(V2);
11872 CommonMask.assign(Mask.begin(), Mask.end());
11873 return;
11874 }
11875 Value *Vec = InVectors.front();
11876 if (InVectors.size() == 2) {
11877 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11878 transformMaskAfterShuffle(CommonMask, CommonMask);
11879 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
11880 Mask.size()) {
11881 Vec = createShuffle(Vec, nullptr, CommonMask);
11882 transformMaskAfterShuffle(CommonMask, CommonMask);
11883 }
11884 V1 = createShuffle(V1, V2, Mask);
11885 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11886 if (Mask[Idx] != PoisonMaskElem)
11887 CommonMask[Idx] = Idx + Sz;
11888 InVectors.front() = Vec;
11889 if (InVectors.size() == 2)
11890 InVectors.back() = V1;
11891 else
11892 InVectors.push_back(V1);
11893 }
11894 /// Adds one more input vector and the mask for the shuffling.
11895 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
11896 V1 = castToScalarTyElem(V1);
11897 if (InVectors.empty()) {
11898 if (!isa<FixedVectorType>(V1->getType())) {
11899 V1 = createShuffle(V1, nullptr, CommonMask);
11900 CommonMask.assign(Mask.size(), PoisonMaskElem);
11901 transformMaskAfterShuffle(CommonMask, Mask);
11902 }
11903 InVectors.push_back(V1);
11904 CommonMask.assign(Mask.begin(), Mask.end());
11905 return;
11906 }
11907 const auto *It = find(InVectors, V1);
11908 if (It == InVectors.end()) {
11909 if (InVectors.size() == 2 ||
11910 InVectors.front()->getType() != V1->getType() ||
11911 !isa<FixedVectorType>(V1->getType())) {
11912 Value *V = InVectors.front();
11913 if (InVectors.size() == 2) {
11914 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11915 transformMaskAfterShuffle(CommonMask, CommonMask);
11916 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11917 CommonMask.size()) {
11918 V = createShuffle(InVectors.front(), nullptr, CommonMask);
11919 transformMaskAfterShuffle(CommonMask, CommonMask);
11920 }
11921 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11922 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
11923 CommonMask[Idx] =
11924 V->getType() != V1->getType()
11925 ? Idx + Sz
11926 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
11927 ->getNumElements();
11928 if (V->getType() != V1->getType())
11929 V1 = createShuffle(V1, nullptr, Mask);
11930 InVectors.front() = V;
11931 if (InVectors.size() == 2)
11932 InVectors.back() = V1;
11933 else
11934 InVectors.push_back(V1);
11935 return;
11936 }
11937 // Check if second vector is required if the used elements are already
11938 // used from the first one.
11939 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11940 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
11941 InVectors.push_back(V1);
11942 break;
11943 }
11944 }
11945 int VF = CommonMask.size();
11946 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
11947 VF = FTy->getNumElements();
11948 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11949 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
11950 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
11951 }
11952 /// Adds one more input vector and the mask for the shuffling.
11953 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
11954 SmallVector<int> NewMask;
11955 inversePermutation(Order, NewMask);
11956 add(V1, NewMask);
11957 }
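// For example (illustrative values): an order {2, 0, 1} is inverted to the
// shuffle mask {1, 2, 0}, i.e. NewMask[Order[I]] == I, so the ordered input
// is then handled by the generic add() above.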
11958 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
11959 Value *Root = nullptr) {
11960 return R.gather(VL, Root, ScalarTy);
11961 }
11962 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
11963 /// Finalize emission of the shuffles.
11964 /// \param Action the action (if any) to be performed before the final
11965 /// application of the \p ExtMask mask.
11966 Value *
11967 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
11968 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
11969 IsFinalized = true;
11970 if (Action) {
11971 Value *Vec = InVectors.front();
11972 if (InVectors.size() == 2) {
11973 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11974 InVectors.pop_back();
11975 } else {
11976 Vec = createShuffle(Vec, nullptr, CommonMask);
11977 }
11978 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11979 if (CommonMask[Idx] != PoisonMaskElem)
11980 CommonMask[Idx] = Idx;
11981 assert(VF > 0 &&
11982 "Expected vector length for the final value before action.");
11983 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11984 if (VecVF < VF) {
11985 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
11986 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11987 Vec = createShuffle(Vec, nullptr, ResizeMask);
11988 }
11989 Action(Vec, CommonMask);
11990 InVectors.front() = Vec;
11991 }
11992 if (!ExtMask.empty()) {
11993 if (CommonMask.empty()) {
11994 CommonMask.assign(ExtMask.begin(), ExtMask.end());
11995 } else {
11996 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11997 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11998 if (ExtMask[I] == PoisonMaskElem)
11999 continue;
12000 NewMask[I] = CommonMask[ExtMask[I]];
12001 }
12002 CommonMask.swap(NewMask);
12003 }
12004 }
12005 if (CommonMask.empty()) {
12006 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
12007 return InVectors.front();
12008 }
12009 if (InVectors.size() == 2)
12010 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12011 return createShuffle(InVectors.front(), nullptr, CommonMask);
12012 }
12013
12014 ~ShuffleInstructionBuilder() {
12015 assert((IsFinalized || CommonMask.empty()) &&
12016 "Shuffle construction must be finalized.");
12017 }
12018};
12019
12020Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12021 bool PostponedPHIs) {
12022 ValueList &VL = E->getOperand(NodeIdx);
12023 const unsigned VF = VL.size();
12024 InstructionsState S = getSameOpcode(VL, *TLI);
12025 // Special processing for GEPs bundle, which may include non-gep values.
12026 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
12027 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
12028 if (It != VL.end())
12029 S = getSameOpcode(*It, *TLI);
12030 }
12031 if (S.getOpcode()) {
12032 auto CheckSameVE = [&](const TreeEntry *VE) {
12033 return VE->isSame(VL) &&
12034 (any_of(VE->UserTreeIndices,
12035 [E, NodeIdx](const EdgeInfo &EI) {
12036 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12037 }) ||
12038 any_of(VectorizableTree,
12039 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12040 return TE->isOperandGatherNode({E, NodeIdx}) &&
12041 VE->isSame(TE->Scalars);
12042 }));
12043 };
12044 TreeEntry *VE = getTreeEntry(S.OpValue);
12045 bool IsSameVE = VE && CheckSameVE(VE);
12046 if (!IsSameVE) {
12047 auto It = MultiNodeScalars.find(S.OpValue);
12048 if (It != MultiNodeScalars.end()) {
12049 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12050 return TE != VE && CheckSameVE(TE);
12051 });
12052 if (I != It->getSecond().end()) {
12053 VE = *I;
12054 IsSameVE = true;
12055 }
12056 }
12057 }
12058 if (IsSameVE) {
12059 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12060 ShuffleInstructionBuilder ShuffleBuilder(
12061 cast<VectorType>(V->getType())->getElementType(), Builder, *this);
12062 ShuffleBuilder.add(V, Mask);
12063 return ShuffleBuilder.finalize(std::nullopt);
12064 };
12065 Value *V = vectorizeTree(VE, PostponedPHIs);
12066 if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
12067 if (!VE->ReuseShuffleIndices.empty()) {
12068 // Reshuffle to get only unique values.
12069 // If some of the scalars are duplicated in the vectorization
12070 // tree entry, we do not vectorize them but instead generate a
12071 // mask for the reuses. But if there are several users of the
12072 // same entry, they may have different vectorization factors.
12073 // This is especially important for PHI nodes. In this case, we
12074 // need to adapt the resulting instruction for the user
12075 // vectorization factor and have to reshuffle it again to take
12076 // only unique elements of the vector. Without this code the
12077 // function would incorrectly return a reduced vector instruction
12078 // with repeated elements instead of the unique ones.
12079
12080 // block:
12081 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12082 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12083 // ... (use %2)
12084 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12085 // br %block
12086 SmallVector<int> Mask(VF, PoisonMaskElem);
12087 for (auto [I, V] : enumerate(VL)) {
12088 if (isa<PoisonValue>(V))
12089 continue;
12090 Mask[I] = VE->findLaneForValue(V);
12091 }
12092 V = FinalShuffle(V, Mask);
12093 } else {
12094 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12095 "Expected vectorization factor less "
12096 "than original vector size.");
12097 SmallVector<int> UniformMask(VF, 0);
12098 std::iota(UniformMask.begin(), UniformMask.end(), 0);
12099 V = FinalShuffle(V, UniformMask);
12100 }
12101 }
12102 // Need to update the operand gather node if the operand is not actually a
12103 // vectorized node but a buildvector/gather node that matches one of the
12104 // vectorized nodes.
12105 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12106 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12107 }) == VE->UserTreeIndices.end()) {
12108 auto *It = find_if(
12109 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12110 return TE->State == TreeEntry::NeedToGather &&
12111 TE->UserTreeIndices.front().UserTE == E &&
12112 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12113 });
12114 assert(It != VectorizableTree.end() && "Expected gather node operand.");
12115 (*It)->VectorizedValue = V;
12116 }
12117 return V;
12118 }
12119 }
12120
12121 // Find the corresponding gather entry and vectorize it.
12122 // This allows us to be more accurate with tree/graph transformations and
12123 // checks the correctness of the transformations in many cases.
12124 auto *I = find_if(VectorizableTree,
12125 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
12126 return TE->isOperandGatherNode({E, NodeIdx});
12127 });
12128 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
12129 assert(I->get()->UserTreeIndices.size() == 1 &&
12130 "Expected only single user for the gather node.");
12131 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
12132 return vectorizeTree(I->get(), PostponedPHIs);
12133}
12134
12135template <typename BVTy, typename ResTy, typename... Args>
12136ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
12137 Args &...Params) {
12138 assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
12139 unsigned VF = E->getVectorFactor();
12140
12141 bool NeedFreeze = false;
12142 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
12143 E->ReuseShuffleIndices.end());
12144 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
12145 // Build a mask out of the reorder indices and reorder scalars per this
12146 // mask.
12147 SmallVector<int> ReorderMask;
12148 inversePermutation(E->ReorderIndices, ReorderMask);
12149 if (!ReorderMask.empty())
12150 reorderScalars(GatheredScalars, ReorderMask);
12151 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
12152 unsigned I, unsigned SliceSize) {
12153 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
12154 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12155 }))
12156 return false;
12157 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12158 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12159 if (UserTE->getNumOperands() != 2)
12160 return false;
12161 auto *It =
12162 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
12163 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
12164 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12165 }) != TE->UserTreeIndices.end();
12166 });
12167 if (It == VectorizableTree.end())
12168 return false;
12169 int Idx;
12170 if ((Mask.size() < InputVF &&
12171 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
12172 Idx == 0) ||
12173 (Mask.size() == InputVF &&
12174 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
12175 std::iota(
12176 std::next(Mask.begin(), I * SliceSize),
12177 std::next(Mask.begin(),
12178 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12179 0);
12180 } else {
12181 unsigned IVal =
12182 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
12183 std::fill(
12184 std::next(Mask.begin(), I * SliceSize),
12185 std::next(Mask.begin(),
12186 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12187 IVal);
12188 }
12189 return true;
12190 };
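// Rough example of the splat-reuse fixup above (assumed values): for a slice
// mask {1, P, 1, P} (P = poison) that is neither an identity nor a leading
// extract-subvector, IVal = 1 and the whole slice is filled with 1, turning
// the slice into an explicit broadcast of lane 1 of the reused vector.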
12191 BVTy ShuffleBuilder(ScalarTy, Params...);
12192 ResTy Res = ResTy();
12193 SmallVector<int> Mask;
12194 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
12195 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
12196 Value *ExtractVecBase = nullptr;
12197 bool UseVecBaseAsInput = false;
12198 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
12199 SmallVector<SmallVector<const TreeEntry *>> Entries;
12200 Type *OrigScalarTy = GatheredScalars.front()->getType();
12201 auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
12202 unsigned NumParts = TTI->getNumberOfParts(VecTy);
12203 if (NumParts == 0 || NumParts >= GatheredScalars.size())
12204 NumParts = 1;
12205 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
12206 // Check for gathered extracts.
12207 bool Resized = false;
12208 ExtractShuffles =
12209 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
12210 if (!ExtractShuffles.empty()) {
12211 SmallVector<const TreeEntry *> ExtractEntries;
12212 for (auto [Idx, I] : enumerate(ExtractMask)) {
12213 if (I == PoisonMaskElem)
12214 continue;
12215 if (const auto *TE = getTreeEntry(
12216 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
12217 ExtractEntries.push_back(TE);
12218 }
12219 if (std::optional<ResTy> Delayed =
12220 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12221 // Delay emission of gathers which are not ready yet.
12222 PostponedGathers.insert(E);
12223 // Postpone gather emission; it will be emitted after the end of the
12224 // process to keep the correct order.
12225 return *Delayed;
12226 }
12227 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
12228 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12229 ExtractVecBase = VecBase;
12230 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
12231 if (VF == VecBaseTy->getNumElements() &&
12232 GatheredScalars.size() != VF) {
12233 Resized = true;
12234 GatheredScalars.append(VF - GatheredScalars.size(),
12235 PoisonValue::get(OrigScalarTy));
12236 }
12237 }
12238 }
12239 // Gather extracts only after checking for fully matched gathers.
12240 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
12241 E->isAltShuffle() ||
12242 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
12243 isSplat(E->Scalars) ||
12244 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12245 GatherShuffles =
12246 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
12247 }
12248 if (!GatherShuffles.empty()) {
12249 if (std::optional<ResTy> Delayed =
12250 ShuffleBuilder.needToDelay(E, Entries)) {
12251 // Delay emission of gathers which are not ready yet.
12252 PostponedGathers.insert(E);
12253 // Postpone gather emission; it will be emitted after the end of the
12254 // process to keep the correct order.
12255 return *Delayed;
12256 }
12257 if (GatherShuffles.size() == 1 &&
12258 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
12259 Entries.front().front()->isSame(E->Scalars)) {
12260 // Perfect match in the graph, will reuse the previously vectorized
12261 // node. Cost is 0.
12262 LLVM_DEBUG(
12263 dbgs()
12264 << "SLP: perfect diamond match for gather bundle "
12265 << shortBundleName(E->Scalars) << ".\n");
12266 // Restore the mask for previous partially matched values.
12267 Mask.resize(E->Scalars.size());
12268 const TreeEntry *FrontTE = Entries.front().front();
12269 if (FrontTE->ReorderIndices.empty() &&
12270 ((FrontTE->ReuseShuffleIndices.empty() &&
12271 E->Scalars.size() == FrontTE->Scalars.size()) ||
12272 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12273 std::iota(Mask.begin(), Mask.end(), 0);
12274 } else {
12275 for (auto [I, V] : enumerate(E->Scalars)) {
12276 if (isa<PoisonValue>(V)) {
12277 Mask[I] = PoisonMaskElem;
12278 continue;
12279 }
12280 Mask[I] = FrontTE->findLaneForValue(V);
12281 }
12282 }
12283 ShuffleBuilder.add(*FrontTE, Mask);
12284 Res = ShuffleBuilder.finalize(E->getCommonMask());
12285 return Res;
12286 }
12287 if (!Resized) {
12288 if (GatheredScalars.size() != VF &&
12289 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
12290 return any_of(TEs, [&](const TreeEntry *TE) {
12291 return TE->getVectorFactor() == VF;
12292 });
12293 }))
12294 GatheredScalars.append(VF - GatheredScalars.size(),
12295 PoisonValue::get(OrigScalarTy));
12296 }
12297 // Remove shuffled elements from list of gathers.
12298 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12299 if (Mask[I] != PoisonMaskElem)
12300 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12301 }
12302 }
12303 }
12304 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
12305 SmallVectorImpl<int> &ReuseMask,
12306 bool IsRootPoison) {
12307 // For splats we can emit broadcasts instead of gathers, so try to find
12308 // such sequences.
12309 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
12310 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
12311 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
12312 SmallVector<int> UndefPos;
12313 DenseMap<Value *, unsigned> UniquePositions;
12314 // Gather unique non-const values and all constant values.
12315 // For repeated values, just shuffle them.
12316 int NumNonConsts = 0;
12317 int SinglePos = 0;
12318 for (auto [I, V] : enumerate(Scalars)) {
12319 if (isa<UndefValue>(V)) {
12320 if (!isa<PoisonValue>(V)) {
12321 ReuseMask[I] = I;
12322 UndefPos.push_back(I);
12323 }
12324 continue;
12325 }
12326 if (isConstant(V)) {
12327 ReuseMask[I] = I;
12328 continue;
12329 }
12330 ++NumNonConsts;
12331 SinglePos = I;
12332 Value *OrigV = V;
12333 Scalars[I] = PoisonValue::get(OrigScalarTy);
12334 if (IsSplat) {
12335 Scalars.front() = OrigV;
12336 ReuseMask[I] = 0;
12337 } else {
12338 const auto Res = UniquePositions.try_emplace(OrigV, I);
12339 Scalars[Res.first->second] = OrigV;
12340 ReuseMask[I] = Res.first->second;
12341 }
12342 }
12343 if (NumNonConsts == 1) {
12344 // Restore single insert element.
12345 if (IsSplat) {
12346 ReuseMask.assign(VF, PoisonMaskElem);
12347 std::swap(Scalars.front(), Scalars[SinglePos]);
12348 if (!UndefPos.empty() && UndefPos.front() == 0)
12349 Scalars.front() = UndefValue::get(OrigScalarTy);
12350 }
12351 ReuseMask[SinglePos] = SinglePos;
12352 } else if (!UndefPos.empty() && IsSplat) {
12353 // For undef values, try to replace them with the simple broadcast.
12354 // We can do it if the broadcasted value is guaranteed to be
12355 // non-poisonous, or by freezing the incoming scalar value first.
12356 auto *It = find_if(Scalars, [this, E](Value *V) {
12357 return !isa<UndefValue>(V) &&
12358 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12359 (E->UserTreeIndices.size() == 1 &&
12360 any_of(V->uses(), [E](const Use &U) {
12361 // Check if the value is already used in the same operation in
12362 // one of the nodes.
12363 return E->UserTreeIndices.front().EdgeIdx !=
12364 U.getOperandNo() &&
12365 is_contained(
12366 E->UserTreeIndices.front().UserTE->Scalars,
12367 U.getUser());
12368 })));
12369 });
12370 if (It != Scalars.end()) {
12371 // Replace undefs by the non-poisoned scalars and emit broadcast.
12372 int Pos = std::distance(Scalars.begin(), It);
12373 for (int I : UndefPos) {
12374 // Set the undef position to the non-poisoned scalar.
12375 ReuseMask[I] = Pos;
12377 // Replace the undef with poison; in the mask it has already been
12378 // replaced by the non-poisoned scalar.
12378 if (I != Pos)
12379 Scalars[I] = PoisonValue::get(OrigScalarTy);
12380 }
12381 } else {
12382 // Replace undefs by the poisons, emit broadcast and then emit
12383 // freeze.
12384 for (int I : UndefPos) {
12385 ReuseMask[I] = PoisonMaskElem;
12386 if (isa<UndefValue>(Scalars[I]))
12387 Scalars[I] = PoisonValue::get(OrigScalarTy);
12388 }
12389 NeedFreeze = true;
12390 }
12391 }
12392 };
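// Illustrative example of the packing above (assumed scalars): a splat
// {%a, %a, %a, %a} is packed into Scalars = {%a, poison, poison, poison}
// with ReuseMask = {0, 0, 0, 0}, i.e. one insertelement plus a broadcast
// shuffle instead of four separate inserts.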
12393 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12394 bool IsNonPoisoned = true;
12395 bool IsUsedInExpr = true;
12396 Value *Vec1 = nullptr;
12397 if (!ExtractShuffles.empty()) {
12398 // Gather of extractelements can be represented as just a shuffle of
12399 // a single/two vectors the scalars are extracted from.
12400 // Find input vectors.
12401 Value *Vec2 = nullptr;
12402 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12403 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12404 ExtractMask[I] = PoisonMaskElem;
12405 }
12406 if (UseVecBaseAsInput) {
12407 Vec1 = ExtractVecBase;
12408 } else {
12409 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12410 if (ExtractMask[I] == PoisonMaskElem)
12411 continue;
12412 if (isa<UndefValue>(E->Scalars[I]))
12413 continue;
12414 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12415 Value *VecOp = EI->getVectorOperand();
12416 if (const auto *TE = getTreeEntry(VecOp))
12417 if (TE->VectorizedValue)
12418 VecOp = TE->VectorizedValue;
12419 if (!Vec1) {
12420 Vec1 = VecOp;
12421 } else if (Vec1 != VecOp) {
12422 assert((!Vec2 || Vec2 == VecOp) &&
12423 "Expected only 1 or 2 vectors shuffle.");
12424 Vec2 = VecOp;
12425 }
12426 }
12427 }
12428 if (Vec2) {
12429 IsUsedInExpr = false;
12430 IsNonPoisoned &=
12431 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12432 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12433 } else if (Vec1) {
12434 IsUsedInExpr &= FindReusedSplat(
12435 ExtractMask,
12436 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12437 ExtractMask.size());
12438 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12439 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12440 } else {
12441 IsUsedInExpr = false;
12442 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
12443 /*ForExtracts=*/true);
12444 }
12445 }
12446 if (!GatherShuffles.empty()) {
12447 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
12448 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12449 for (const auto [I, TEs] : enumerate(Entries)) {
12450 if (TEs.empty()) {
12451 assert(!GatherShuffles[I] &&
12452 "No shuffles with empty entries list expected.");
12453 continue;
12454 }
12455 assert((TEs.size() == 1 || TEs.size() == 2) &&
12456 "Expected shuffle of 1 or 2 entries.");
12457 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
12458 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
12459 VecMask.assign(VecMask.size(), PoisonMaskElem);
12460 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12461 if (TEs.size() == 1) {
12462 IsUsedInExpr &= FindReusedSplat(
12463 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12464 ShuffleBuilder.add(*TEs.front(), VecMask);
12465 if (TEs.front()->VectorizedValue)
12466 IsNonPoisoned &=
12467 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12468 } else {
12469 IsUsedInExpr = false;
12470 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12471 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12472 IsNonPoisoned &=
12473 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12474 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12475 }
12476 }
12477 }
12478 // Try to figure out best way to combine values: build a shuffle and insert
12479 // elements or just build several shuffles.
12480 // Insert non-constant scalars.
12481 SmallVector<Value *> NonConstants(GatheredScalars);
12482 int EMSz = ExtractMask.size();
12483 int MSz = Mask.size();
12484 // Try to build constant vector and shuffle with it only if currently we
12485 // have a single permutation and more than 1 scalar constants.
12486 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12487 bool IsIdentityShuffle =
12488 ((UseVecBaseAsInput ||
12489 all_of(ExtractShuffles,
12490 [](const std::optional<TTI::ShuffleKind> &SK) {
12491 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12492 TTI::SK_PermuteSingleSrc;
12493 })) &&
12494 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12495 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
12496 (!GatherShuffles.empty() &&
12497 all_of(GatherShuffles,
12498 [](const std::optional<TTI::ShuffleKind> &SK) {
12499 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12500 TTI::SK_PermuteSingleSrc;
12501 }) &&
12502 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12503 ShuffleVectorInst::isIdentityMask(Mask, MSz));
12504 bool EnoughConstsForShuffle =
12505 IsSingleShuffle &&
12506 (none_of(GatheredScalars,
12507 [](Value *V) {
12508 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12509 }) ||
12510 any_of(GatheredScalars,
12511 [](Value *V) {
12512 return isa<Constant>(V) && !isa<UndefValue>(V);
12513 })) &&
12514 (!IsIdentityShuffle ||
12515 (GatheredScalars.size() == 2 &&
12516 any_of(GatheredScalars,
12517 [](Value *V) { return !isa<UndefValue>(V); })) ||
12518 count_if(GatheredScalars, [](Value *V) {
12519 return isa<Constant>(V) && !isa<PoisonValue>(V);
12520 }) > 1);
12521 // NonConstants array contains just non-constant values, GatheredScalars
12522 // contains only constants to build the final vector and then shuffle.
12523 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12524 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
12525 NonConstants[I] = PoisonValue::get(OrigScalarTy);
12526 else
12527 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12528 }
12529 // Generate constants for final shuffle and build a mask for them.
12530 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12531 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12532 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12533 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12534 ShuffleBuilder.add(BV, BVMask);
12535 }
12536 if (all_of(NonConstants, [=](Value *V) {
12537 return isa<PoisonValue>(V) ||
12538 (IsSingleShuffle && ((IsIdentityShuffle &&
12539 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12540 }))
12541 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12542 else
12543 Res = ShuffleBuilder.finalize(
12544 E->ReuseShuffleIndices, E->Scalars.size(),
12545 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12546 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12547 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12548 });
12549 } else if (!allConstant(GatheredScalars)) {
12550 // Gather unique scalars and all constants.
12551 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12552 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12553 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12554 ShuffleBuilder.add(BV, ReuseMask);
12555 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12556 } else {
12557 // Gather all constants.
12558 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12559 for (auto [I, V] : enumerate(E->Scalars)) {
12560 if (!isa<PoisonValue>(V))
12561 Mask[I] = I;
12562 }
12563 Value *BV = ShuffleBuilder.gather(E->Scalars);
12564 ShuffleBuilder.add(BV, Mask);
12565 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12566 }
12567
12568 if (NeedFreeze)
12569 Res = ShuffleBuilder.createFreeze(Res);
12570 return Res;
12571}
12572
12573Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
12574 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12575 Builder, *this);
12576}
12577
12578Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12579 IRBuilderBase::InsertPointGuard Guard(Builder);
12580
12581 if (E->VectorizedValue &&
12582 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12583 E->isAltShuffle())) {
12584 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12585 return E->VectorizedValue;
12586 }
12587
12588 Value *V = E->Scalars.front();
12589 Type *ScalarTy = V->getType();
12590 if (auto *Store = dyn_cast<StoreInst>(V))
12591 ScalarTy = Store->getValueOperand()->getType();
12592 else if (auto *IE = dyn_cast<InsertElementInst>(V))
12593 ScalarTy = IE->getOperand(1)->getType();
12594 auto It = MinBWs.find(E);
12595 if (It != MinBWs.end())
12596 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
12597 auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
12598 if (E->State == TreeEntry::NeedToGather) {
12599 // Set insert point for non-reduction initial nodes.
12600 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12601 setInsertPointAfterBundle(E);
12602 Value *Vec = createBuildVector(E, ScalarTy);
12603 E->VectorizedValue = Vec;
12604 return Vec;
12605 }
12606
12607 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
12608 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12609 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
12610 if (E->getOpcode() == Instruction::Store &&
12611 E->State == TreeEntry::Vectorize) {
12612 ArrayRef<int> Mask =
12613 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12614 E->ReorderIndices.size());
12615 ShuffleBuilder.add(V, Mask);
12616 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12617 ShuffleBuilder.addOrdered(V, std::nullopt);
12618 } else {
12619 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12620 }
12621 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12622 };
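// For example (illustrative), a vectorized store node with
// ReorderIndices = {3, 2, 1, 0} reuses those indices directly as the shuffle
// mask, so the vector is reversed before being stored; other nodes go through
// addOrdered(), which inverts the order first.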
12623
12624 assert((E->State == TreeEntry::Vectorize ||
12625 E->State == TreeEntry::ScatterVectorize ||
12626 E->State == TreeEntry::StridedVectorize) &&
12627 "Unhandled state");
12628 unsigned ShuffleOrOp =
12629 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12630 Instruction *VL0 = E->getMainOp();
12631 auto GetOperandSignedness = [&](unsigned Idx) {
12632 const TreeEntry *OpE = getOperandEntry(E, Idx);
12633 bool IsSigned = false;
12634 auto It = MinBWs.find(OpE);
12635 if (It != MinBWs.end())
12636 IsSigned = It->second.second;
12637 else
12638 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
12639 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12640 });
12641 return IsSigned;
12642 };
12643 switch (ShuffleOrOp) {
12644 case Instruction::PHI: {
12645 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12646 E != VectorizableTree.front().get() ||
12647 !E->UserTreeIndices.empty()) &&
12648 "PHI reordering is free.");
12649 if (PostponedPHIs && E->VectorizedValue)
12650 return E->VectorizedValue;
12651 auto *PH = cast<PHINode>(VL0);
12652 Builder.SetInsertPoint(PH->getParent(),
12653 PH->getParent()->getFirstNonPHIIt());
12654 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12655 if (PostponedPHIs || !E->VectorizedValue) {
12656 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
12657 E->PHI = NewPhi;
12658 Value *V = NewPhi;
12659
12660 // Adjust the insertion point once all PHIs have been generated.
12661 Builder.SetInsertPoint(PH->getParent(),
12662 PH->getParent()->getFirstInsertionPt());
12663 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12664
12665 V = FinalShuffle(V, E, VecTy);
12666
12667 E->VectorizedValue = V;
12668 if (PostponedPHIs)
12669 return V;
12670 }
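// Conceptually (illustrative IR, not produced verbatim): the vector PHI is
// created empty first and its incoming values are filled in later, so
// loop-carried cycles such as
//   loop:
//     %vphi = phi <2 x i32> [ %init, %entry ], [ %vnext, %loop ]
//     %vnext = add <2 x i32> %vphi, <i32 1, i32 1>
// can be vectorized: %vnext is only emitted once %vphi already exists.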
12671 PHINode *NewPhi = cast<PHINode>(E->PHI);
12672 // If phi node is fully emitted - exit.
12673 if (NewPhi->getNumIncomingValues() != 0)
12674 return NewPhi;
12675
12676 // PHINodes may have multiple entries from the same block. We want to
12677 // visit every block once.
12678 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12679
12680 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12681 ValueList Operands;
12682 BasicBlock *IBB = PH->getIncomingBlock(I);
12683
12684 // Stop emission if all incoming values are generated.
12685 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12686 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12687 return NewPhi;
12688 }
12689
12690 if (!VisitedBBs.insert(IBB).second) {
12691 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
12692 continue;
12693 }
12694
12695 Builder.SetInsertPoint(IBB->getTerminator());
12696 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12697 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
12698 if (VecTy != Vec->getType()) {
12699 assert((It != MinBWs.end() ||
12700 getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
12701 MinBWs.contains(getOperandEntry(E, I))) &&
12702 "Expected item in MinBWs.");
12703 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
12704 }
12705 NewPhi->addIncoming(Vec, IBB);
12706 }
12707
12708 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12709 "Invalid number of incoming values");
12710 return NewPhi;
12711 }
12712
12713 case Instruction::ExtractElement: {
12714 Value *V = E->getSingleOperand(0);
12715 if (const TreeEntry *TE = getTreeEntry(V))
12716 V = TE->VectorizedValue;
12717 setInsertPointAfterBundle(E);
12718 V = FinalShuffle(V, E, VecTy);
12719 E->VectorizedValue = V;
12720 return V;
12721 }
12722 case Instruction::ExtractValue: {
12723 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12724 Builder.SetInsertPoint(LI);
12725 Value *Ptr = LI->getPointerOperand();
12726 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
12727 Value *NewV = propagateMetadata(V, E->Scalars);
12728 NewV = FinalShuffle(NewV, E, VecTy);
12729 E->VectorizedValue = NewV;
12730 return NewV;
12731 }
12732 case Instruction::InsertElement: {
12733 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12734 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
12735 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
12736 ArrayRef<Value *> Op = E->getOperand(1);
12737 Type *ScalarTy = Op.front()->getType();
12738 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
12739 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12740 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
12741 assert(Res.first > 0 && "Expected item in MinBWs.");
12742 V = Builder.CreateIntCast(
12743 V,
12744 FixedVectorType::get(
12745 ScalarTy,
12746 cast<FixedVectorType>(V->getType())->getNumElements()),
12747 Res.second);
12748 }
12749
12750 // Create InsertVector shuffle if necessary
12751 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
12752 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12753 }));
12754 const unsigned NumElts =
12755 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12756 const unsigned NumScalars = E->Scalars.size();
12757
12758 unsigned Offset = *getInsertIndex(VL0);
12759 assert(Offset < NumElts && "Failed to find vector index offset");
12760
12761 // Create shuffle to resize vector
12762 SmallVector<int> Mask;
12763 if (!E->ReorderIndices.empty()) {
12764 inversePermutation(E->ReorderIndices, Mask);
12765 Mask.append(NumElts - NumScalars, PoisonMaskElem);
12766 } else {
12767 Mask.assign(NumElts, PoisonMaskElem);
12768 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
12769 }
12770 // Create InsertVector shuffle if necessary
12771 bool IsIdentity = true;
12772 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12773 Mask.swap(PrevMask);
12774 for (unsigned I = 0; I < NumScalars; ++I) {
12775 Value *Scalar = E->Scalars[PrevMask[I]];
12776 unsigned InsertIdx = *getInsertIndex(Scalar);
12777 IsIdentity &= InsertIdx - Offset == I;
12778 Mask[InsertIdx - Offset] = I;
12779 }
12780 if (!IsIdentity || NumElts != NumScalars) {
12781 Value *V2 = nullptr;
12782 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12783 SmallVector<int> InsertMask(Mask);
12784 if (NumElts != NumScalars && Offset == 0) {
12785 // Follow all insert element instructions from the current buildvector
12786 // sequence.
12787 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
12788 do {
12789 std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
12790 if (!InsertIdx)
12791 break;
12792 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12793 InsertMask[*InsertIdx] = *InsertIdx;
12794 if (!Ins->hasOneUse())
12795 break;
12796 Ins = dyn_cast_or_null<InsertElementInst>(
12797 Ins->getUniqueUndroppableUser());
12798 } while (Ins);
12799 SmallBitVector UseMask =
12800 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12801 SmallBitVector IsFirstPoison =
12802 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12803 SmallBitVector IsFirstUndef =
12804 isUndefVector(FirstInsert->getOperand(0), UseMask);
12805 if (!IsFirstPoison.all()) {
12806 unsigned Idx = 0;
12807 for (unsigned I = 0; I < NumElts; I++) {
12808 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
12809 IsFirstUndef.test(I)) {
12810 if (IsVNonPoisonous) {
12811 InsertMask[I] = I < NumScalars ? I : 0;
12812 continue;
12813 }
12814 if (!V2)
12815 V2 = UndefValue::get(V->getType());
12816 if (Idx >= NumScalars)
12817 Idx = NumScalars - 1;
12818 InsertMask[I] = NumScalars + Idx;
12819 ++Idx;
12820 } else if (InsertMask[I] != PoisonMaskElem &&
12821 Mask[I] == PoisonMaskElem) {
12822 InsertMask[I] = PoisonMaskElem;
12823 }
12824 }
12825 } else {
12826 InsertMask = Mask;
12827 }
12828 }
12829 if (!V2)
12830 V2 = PoisonValue::get(V->getType());
12831 V = Builder.CreateShuffleVector(V, V2, InsertMask);
12832 if (auto *I = dyn_cast<Instruction>(V)) {
12833 GatherShuffleExtractSeq.insert(I);
12834 CSEBlocks.insert(I->getParent());
12835 }
12836 }
12837
12838 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
12839 for (unsigned I = 0; I < NumElts; I++) {
12840 if (Mask[I] != PoisonMaskElem)
12841 InsertMask[Offset + I] = I;
12842 }
12843 SmallBitVector UseMask =
12844 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12845 SmallBitVector IsFirstUndef =
12846 isUndefVector(FirstInsert->getOperand(0), UseMask);
12847 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
12848 NumElts != NumScalars) {
12849 if (IsFirstUndef.all()) {
12850 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
12851 SmallBitVector IsFirstPoison =
12852 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12853 if (!IsFirstPoison.all()) {
12854 for (unsigned I = 0; I < NumElts; I++) {
12855 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
12856 InsertMask[I] = I + NumElts;
12857 }
12858 }
12859 V = Builder.CreateShuffleVector(
12860 V,
12861 IsFirstPoison.all() ? PoisonValue::get(V->getType())
12862 : FirstInsert->getOperand(0),
12863 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
12864 if (auto *I = dyn_cast<Instruction>(V)) {
12865 GatherShuffleExtractSeq.insert(I);
12866 CSEBlocks.insert(I->getParent());
12867 }
12868 }
12869 } else {
12870 SmallBitVector IsFirstPoison =
12871 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12872 for (unsigned I = 0; I < NumElts; I++) {
12873 if (InsertMask[I] == PoisonMaskElem)
12874 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
12875 else
12876 InsertMask[I] += NumElts;
12877 }
12878 V = Builder.CreateShuffleVector(
12879 FirstInsert->getOperand(0), V, InsertMask,
12880 cast<Instruction>(E->Scalars.back())->getName());
12881 if (auto *I = dyn_cast<Instruction>(V)) {
12882 GatherShuffleExtractSeq.insert(I);
12883 CSEBlocks.insert(I->getParent());
12884 }
12885 }
12886 }
12887
12888 ++NumVectorInstructions;
12889 E->VectorizedValue = V;
12890 return V;
12891 }
12892 case Instruction::ZExt:
12893 case Instruction::SExt:
12894 case Instruction::FPToUI:
12895 case Instruction::FPToSI:
12896 case Instruction::FPExt:
12897 case Instruction::PtrToInt:
12898 case Instruction::IntToPtr:
12899 case Instruction::SIToFP:
12900 case Instruction::UIToFP:
12901 case Instruction::Trunc:
12902 case Instruction::FPTrunc:
12903 case Instruction::BitCast: {
12904 setInsertPointAfterBundle(E);
12905
12906 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
12907 if (E->VectorizedValue) {
12908 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12909 return E->VectorizedValue;
12910 }
12911
12912 auto *CI = cast<CastInst>(VL0);
12913 Instruction::CastOps VecOpcode = CI->getOpcode();
12914 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
12915 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
12916 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
12917 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
12918 SrcScalarTy != CI->getOperand(0)->getType())) {
12919 // Check if the values are candidates to demote.
12920 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
12921 if (SrcIt != MinBWs.end())
12922 SrcBWSz = SrcIt->second.first;
12923 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
12924 if (BWSz == SrcBWSz) {
12925 VecOpcode = Instruction::BitCast;
12926 } else if (BWSz < SrcBWSz) {
12927 VecOpcode = Instruction::Trunc;
12928 } else if (It != MinBWs.end()) {
12929 assert(BWSz > SrcBWSz && "Invalid cast!");
12930 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12931 } else if (SrcIt != MinBWs.end()) {
12932 assert(BWSz > SrcBWSz && "Invalid cast!");
12933 VecOpcode =
12934 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12935 }
12936 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
12937 !SrcIt->second.second) {
12938 VecOpcode = Instruction::UIToFP;
12939 }
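// For example (assumed widths): if minimum-bitwidth analysis demoted both the
// source and the destination of a zext to i16, then BWSz == SrcBWSz, the cast
// degenerates into a bitcast and the input vector is simply reused below
// without emitting a new instruction.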
12940 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12941 ? InVec
12942 : Builder.CreateCast(VecOpcode, InVec, VecTy);
12943 V = FinalShuffle(V, E, VecTy);
12944
12945 E->VectorizedValue = V;
12946 ++NumVectorInstructions;
12947 return V;
12948 }
12949 case Instruction::FCmp:
12950 case Instruction::ICmp: {
12951 setInsertPointAfterBundle(E);
12952
12953 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
12954 if (E->VectorizedValue) {
12955 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12956 return E->VectorizedValue;
12957 }
12958 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
12959 if (E->VectorizedValue) {
12960 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12961 return E->VectorizedValue;
12962 }
12963 if (L->getType() != R->getType()) {
12964 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12965 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12966 MinBWs.contains(getOperandEntry(E, 0)) ||
12967 MinBWs.contains(getOperandEntry(E, 1))) &&
12968 "Expected item in MinBWs.");
12969 if (cast<VectorType>(L->getType())
12970 ->getElementType()
12971 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
12972 ->getElementType()
12973 ->getIntegerBitWidth()) {
12974 Type *CastTy = R->getType();
12975 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
12976 } else {
12977 Type *CastTy = L->getType();
12978 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
12979 }
12980 }
12981
12982 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
12983 Value *V = Builder.CreateCmp(P0, L, R);
12984 propagateIRFlags(V, E->Scalars, VL0);
12985 // Do not cast for cmps.
12986 VecTy = cast<FixedVectorType>(V->getType());
12987 V = FinalShuffle(V, E, VecTy);
12988
12989 E->VectorizedValue = V;
12990 ++NumVectorInstructions;
12991 return V;
12992 }
12993 case Instruction::Select: {
12994 setInsertPointAfterBundle(E);
12995
12996 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
12997 if (E->VectorizedValue) {
12998 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12999 return E->VectorizedValue;
13000 }
13001 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
13002 if (E->VectorizedValue) {
13003 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13004 return E->VectorizedValue;
13005 }
13006 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
13007 if (E->VectorizedValue) {
13008 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13009 return E->VectorizedValue;
13010 }
13011 if (True->getType() != VecTy || False->getType() != VecTy) {
13012 assert((It != MinBWs.end() ||
13013 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13014 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
13015 MinBWs.contains(getOperandEntry(E, 1)) ||
13016 MinBWs.contains(getOperandEntry(E, 2))) &&
13017 "Expected item in MinBWs.");
13018 if (True->getType() != VecTy)
13019 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
13020 if (False->getType() != VecTy)
13021 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
13022 }
13023
13024 Value *V = Builder.CreateSelect(Cond, True, False);
13025 V = FinalShuffle(V, E, VecTy);
13026
13027 E->VectorizedValue = V;
13028 ++NumVectorInstructions;
13029 return V;
13030 }
13031 case Instruction::FNeg: {
13032 setInsertPointAfterBundle(E);
13033
13034 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
13035
13036 if (E->VectorizedValue) {
13037 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13038 return E->VectorizedValue;
13039 }
13040
13041 Value *V = Builder.CreateUnOp(
13042 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
13043 propagateIRFlags(V, E->Scalars, VL0);
13044 if (auto *I = dyn_cast<Instruction>(V))
13045 V = propagateMetadata(I, E->Scalars);
13046
13047 V = FinalShuffle(V, E, VecTy);
13048
13049 E->VectorizedValue = V;
13050 ++NumVectorInstructions;
13051
13052 return V;
13053 }
13054 case Instruction::Add:
13055 case Instruction::FAdd:
13056 case Instruction::Sub:
13057 case Instruction::FSub:
13058 case Instruction::Mul:
13059 case Instruction::FMul:
13060 case Instruction::UDiv:
13061 case Instruction::SDiv:
13062 case Instruction::FDiv:
13063 case Instruction::URem:
13064 case Instruction::SRem:
13065 case Instruction::FRem:
13066 case Instruction::Shl:
13067 case Instruction::LShr:
13068 case Instruction::AShr:
13069 case Instruction::And:
13070 case Instruction::Or:
13071 case Instruction::Xor: {
13072 setInsertPointAfterBundle(E);
13073
13074 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
13075 if (E->VectorizedValue) {
13076 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13077 return E->VectorizedValue;
13078 }
13079 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
13080 if (E->VectorizedValue) {
13081 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13082 return E->VectorizedValue;
13083 }
13084 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13085 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
13086 ArrayRef<Value *> Ops = E->getOperand(I);
13087 if (all_of(Ops, [&](Value *Op) {
13088 auto *CI = dyn_cast<ConstantInt>(Op);
13089 return CI && CI->getValue().countr_one() >= It->second.first;
13090 })) {
13091 V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
13092 E->VectorizedValue = V;
13093 ++NumVectorInstructions;
13094 return V;
13095 }
13096 }
13097 }
13098 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
13099 assert((It != MinBWs.end() ||
13100 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13101 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13102 MinBWs.contains(getOperandEntry(E, 0)) ||
13103 MinBWs.contains(getOperandEntry(E, 1))) &&
13104 "Expected item in MinBWs.");
13105 if (LHS->getType() != VecTy)
13106 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
13107 if (RHS->getType() != VecTy)
13108 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
13109 }
13110
13111 Value *V = Builder.CreateBinOp(
13112 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
13113 RHS);
13114 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
13115 if (auto *I = dyn_cast<Instruction>(V)) {
13116 V = propagateMetadata(I, E->Scalars);
13117 // Drop nuw flags for abs(sub(commutative), true).
13118 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
13119 any_of(E->Scalars, [](Value *V) {
13120 return isCommutative(cast<Instruction>(V));
13121 }))
13122 I->setHasNoUnsignedWrap(/*b=*/false);
13123 }
13124
13125 V = FinalShuffle(V, E, VecTy);
13126
13127 E->VectorizedValue = V;
13128 ++NumVectorInstructions;
13129
13130 return V;
13131 }
13132 case Instruction::Load: {
13133 // Loads are inserted at the head of the tree because we don't want to
13134 // sink them all the way down past store instructions.
13135 setInsertPointAfterBundle(E);
13136
13137 LoadInst *LI = cast<LoadInst>(VL0);
13138 Instruction *NewLI;
13139 Value *PO = LI->getPointerOperand();
13140 if (E->State == TreeEntry::Vectorize) {
13141 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
13142 } else if (E->State == TreeEntry::StridedVectorize) {
13143 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
13144 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
13145 PO = IsReverseOrder ? PtrN : Ptr0;
13146 std::optional<int> Diff = getPointersDiff(
13147 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
13148 Type *StrideTy = DL->getIndexType(PO->getType());
13149 Value *StrideVal;
13150 if (Diff) {
13151 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
13152 StrideVal =
13153 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
13154 DL->getTypeAllocSize(ScalarTy));
13155 } else {
13156 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
13157 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
13158 return cast<LoadInst>(V)->getPointerOperand();
13159 });
13160 OrdersType Order;
13161 std::optional<Value *> Stride =
13162 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
13163 &*Builder.GetInsertPoint());
13164 Value *NewStride =
13165 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
13166 StrideVal = Builder.CreateMul(
13167 NewStride,
13168 ConstantInt::get(
13169 StrideTy,
13170 (IsReverseOrder ? -1 : 1) *
13171 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
13172 }
13173 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13174 auto *Inst = Builder.CreateIntrinsic(
13175 Intrinsic::experimental_vp_strided_load,
13176 {VecTy, PO->getType(), StrideTy},
13177 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
13178 Builder.getInt32(E->Scalars.size())});
13179 Inst->addParamAttr(
13180 /*ArgNo=*/0,
13181 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13182 NewLI = Inst;
13183 } else {
13184 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
13185 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
13186 if (E->VectorizedValue) {
13187 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13188 return E->VectorizedValue;
13189 }
13190 // Use the minimum alignment of the gathered loads.
13191 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13192 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
13193 }
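// Roughly, for 4 x i32 loads in reverse order the strided path above emits
// something like (illustrative types and names):
//   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
//            ptr %last, i64 -4, <all-true mask>, i32 4)
// where the stride is measured in bytes and the alignment is attached as a
// parameter attribute.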
13194 Value *V = propagateMetadata(NewLI, E->Scalars);
13195
13196 V = FinalShuffle(V, E, VecTy);
13197 E->VectorizedValue = V;
13198 ++NumVectorInstructions;
13199 return V;
13200 }
13201 case Instruction::Store: {
13202 auto *SI = cast<StoreInst>(VL0);
13203
13204 setInsertPointAfterBundle(E);
13205
13206 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
13207 if (VecValue->getType() != VecTy)
13208 VecValue =
13209 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
13210 VecValue = FinalShuffle(VecValue, E, VecTy);
13211
13212 Value *Ptr = SI->getPointerOperand();
13213 Instruction *ST;
13214 if (E->State == TreeEntry::Vectorize) {
13215 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
13216 } else {
13217 assert(E->State == TreeEntry::StridedVectorize &&
13218 "Expected either strided or conseutive stores.");
13219 if (!E->ReorderIndices.empty()) {
13220 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
13221 Ptr = SI->getPointerOperand();
13222 }
13223 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
13224 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
13225 auto *Inst = Builder.CreateIntrinsic(
13226 Intrinsic::experimental_vp_strided_store,
13227 {VecTy, Ptr->getType(), StrideTy},
13228 {VecValue, Ptr,
13229 ConstantInt::get(
13230 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
13231 Builder.getAllOnesMask(VecTy->getElementCount()),
13232 Builder.getInt32(E->Scalars.size())});
13233 Inst->addParamAttr(
13234 /*ArgNo=*/1,
13235 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13236 ST = Inst;
13237 }
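// Roughly, the strided-store path above emits something like (illustrative
// types and names) for 4 x i32 scalars stored in reverse order:
//   call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
//            <4 x i32> %val, ptr %base, i64 -4, <all-true mask>, i32 4)
// with the common alignment attached to the pointer argument.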
13238
13239 Value *V = propagateMetadata(ST, E->Scalars);
13240
13241 E->VectorizedValue = V;
13242 ++NumVectorInstructions;
13243 return V;
13244 }
13245 case Instruction::GetElementPtr: {
13246 auto *GEP0 = cast<GetElementPtrInst>(VL0);
13247 setInsertPointAfterBundle(E);
13248
13249 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
13250 if (E->VectorizedValue) {
13251 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13252 return E->VectorizedValue;
13253 }
13254
13255 SmallVector<Value *> OpVecs;
13256 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
13257 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
13258 if (E->VectorizedValue) {
13259 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13260 return E->VectorizedValue;
13261 }
13262 OpVecs.push_back(OpVec);
13263 }
13264
13265 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
13266 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
13267 SmallVector<Value *> GEPs;
13268 for (Value *V : E->Scalars) {
13269 if (isa<GetElementPtrInst>(V))
13270 GEPs.push_back(V);
13271 }
13272 V = propagateMetadata(I, GEPs);
13273 }
13274
13275 V = FinalShuffle(V, E, VecTy);
13276
13277 E->VectorizedValue = V;
13278 ++NumVectorInstructions;
13279
13280 return V;
13281 }
13282 case Instruction::Call: {
13283 CallInst *CI = cast<CallInst>(VL0);
13284 setInsertPointAfterBundle(E);
13285
13286 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13287
13288 SmallVector<Type *> ArgTys =
13289 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
13290 It != MinBWs.end() ? It->second.first : 0);
13291 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13292 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
13293 VecCallCosts.first <= VecCallCosts.second;
13294
13295 Value *ScalarArg = nullptr;
13296 SmallVector<Value *> OpVecs;
13297 SmallVector<Type *, 2> TysForDecl;
13298 // Add return type if intrinsic is overloaded on it.
13299 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
13300 TysForDecl.push_back(VecTy);
13301 auto *CEI = cast<CallInst>(VL0);
13302 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
13303 ValueList OpVL;
13304 // Some intrinsics have scalar arguments. Such arguments should not be
13305 // vectorized.
13306 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
13307 ScalarArg = CEI->getArgOperand(I);
13308 // If we decided to reduce the bitwidth of the abs intrinsic, its second
13309 // argument must be set to false (do not return poison if the value is signed min).
13310 if (ID == Intrinsic::abs && It != MinBWs.end() &&
13311 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
13312 ScalarArg = Builder.getFalse();
13313 OpVecs.push_back(ScalarArg);
13314 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13315 TysForDecl.push_back(ScalarArg->getType());
13316 continue;
13317 }
13318
13319 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
13320 if (E->VectorizedValue) {
13321 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13322 return E->VectorizedValue;
13323 }
13324 ScalarArg = CEI->getArgOperand(I);
13325 if (cast<VectorType>(OpVec->getType())->getElementType() !=
13326 ScalarArg->getType() &&
13327 It == MinBWs.end()) {
13328 auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
13329 VecTy->getNumElements());
13330 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
13331 } else if (It != MinBWs.end()) {
13332 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
13333 }
13334 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
13335 OpVecs.push_back(OpVec);
13336 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13337 TysForDecl.push_back(OpVec->getType());
13338 }
13339
13340 Function *CF;
13341 if (!UseIntrinsic) {
13342 VFShape Shape =
13343 VFShape::get(CI->getFunctionType(),
13344 ElementCount::getFixed(
13345 static_cast<unsigned>(VecTy->getNumElements())),
13346 false /*HasGlobalPred*/);
13347 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
13348 } else {
13349 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
13350 }
13351
13352 SmallVector<OperandBundleDef, 1> OpBundles;
13353 CI->getOperandBundlesAsDefs(OpBundles);
13354 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
13355
13356 propagateIRFlags(V, E->Scalars, VL0);
13357 V = FinalShuffle(V, E, VecTy);
13358
13359 E->VectorizedValue = V;
13360 ++NumVectorInstructions;
13361 return V;
13362 }
13363 case Instruction::ShuffleVector: {
13364 assert(E->isAltShuffle() &&
13365 ((Instruction::isBinaryOp(E->getOpcode()) &&
13366 Instruction::isBinaryOp(E->getAltOpcode())) ||
13367 (Instruction::isCast(E->getOpcode()) &&
13368 Instruction::isCast(E->getAltOpcode())) ||
13369 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13370 "Invalid Shuffle Vector Operand");
13371
13372 Value *LHS = nullptr, *RHS = nullptr;
13373 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
13374 setInsertPointAfterBundle(E);
13375 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13376 if (E->VectorizedValue) {
13377 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13378 return E->VectorizedValue;
13379 }
13380 RHS = vectorizeOperand(E, 1, PostponedPHIs);
13381 } else {
13382 setInsertPointAfterBundle(E);
13383 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13384 }
13385 if (E->VectorizedValue) {
13386 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13387 return E->VectorizedValue;
13388 }
13389 if (LHS && RHS &&
13390 ((Instruction::isBinaryOp(E->getOpcode()) &&
13391 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13392 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
13393 assert((It != MinBWs.end() ||
13394 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13395 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13396 MinBWs.contains(getOperandEntry(E, 0)) ||
13397 MinBWs.contains(getOperandEntry(E, 1))) &&
13398 "Expected item in MinBWs.");
13399 Type *CastTy = VecTy;
13400 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
13401 if (cast<VectorType>(LHS->getType())
13402 ->getElementType()
13403 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
13404 ->getElementType()
13405 ->getIntegerBitWidth())
13406 CastTy = RHS->getType();
13407 else
13408 CastTy = LHS->getType();
13409 }
13410 if (LHS->getType() != CastTy)
13411 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
13412 if (RHS->getType() != CastTy)
13413 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
13414 }
13415
13416 Value *V0, *V1;
13417 if (Instruction::isBinaryOp(E->getOpcode())) {
13418 V0 = Builder.CreateBinOp(
13419 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13420 V1 = Builder.CreateBinOp(
13421 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13422 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13423 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
13424 auto *AltCI = cast<CmpInst>(E->getAltOp());
13425 CmpInst::Predicate AltPred = AltCI->getPredicate();
13426 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
13427 } else {
13428 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13429 unsigned SrcBWSz = DL->getTypeSizeInBits(
13430 cast<VectorType>(LHS->getType())->getElementType());
13431 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13432 if (BWSz <= SrcBWSz) {
13433 if (BWSz < SrcBWSz)
13434 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
13435 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13436 if (auto *I = dyn_cast<Instruction>(LHS))
13437 LHS = propagateMetadata(I, E->Scalars);
13438 E->VectorizedValue = LHS;
13439 ++NumVectorInstructions;
13440 return LHS;
13441 }
13442 }
13443 V0 = Builder.CreateCast(
13444 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
13445 V1 = Builder.CreateCast(
13446 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
13447 }
13448 // Add V0 and V1 to later analysis to try to find and remove matching
13449 // instruction, if any.
13450 for (Value *V : {V0, V1}) {
13451 if (auto *I = dyn_cast<Instruction>(V)) {
13452 GatherShuffleExtractSeq.insert(I);
13453 CSEBlocks.insert(I->getParent());
13454 }
13455 }
13456
13457 // Create shuffle to take alternate operations from the vector.
13458 // Also, gather up main and alt scalar ops to propagate IR flags to
13459 // each vector operation.
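// For example, for the scalars {a0+b0, a1-b1, a2+b2, a3-b3} this emits
//   V0 = add <4 x i32> A, B and V1 = sub <4 x i32> A, B
// and then selects the lanes with the mask <0, 5, 2, 7>, where alternate
// lanes index into V1 at an offset of the vector width.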
13460 ValueList OpScalars, AltScalars;
13461 SmallVector<int> Mask;
13462 E->buildAltOpShuffleMask(
13463 [E, this](Instruction *I) {
13464 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13465 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
13466 *TLI);
13467 },
13468 Mask, &OpScalars, &AltScalars);
13469
13470 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
13471 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
13472 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13473 // Drop nuw flags for abs(sub(commutative), true).
13474 if (auto *I = dyn_cast<Instruction>(Vec);
13475 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
13476 any_of(E->Scalars, [](Value *V) {
13477 auto *IV = cast<Instruction>(V);
13478 return IV->getOpcode() == Instruction::Sub &&
13479 isCommutative(cast<Instruction>(IV));
13480 }))
13481 I->setHasNoUnsignedWrap(/*b=*/false);
13482 };
13483 DropNuwFlag(V0, E->getOpcode());
13484 DropNuwFlag(V1, E->getAltOpcode());
13485
13486 Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
13487 if (auto *I = dyn_cast<Instruction>(V)) {
13488 V = propagateMetadata(I, E->Scalars);
13489 GatherShuffleExtractSeq.insert(I);
13490 CSEBlocks.insert(I->getParent());
13491 }
13492
13493 E->VectorizedValue = V;
13494 ++NumVectorInstructions;
13495
13496 return V;
13497 }
13498 default:
13499 llvm_unreachable("unknown inst");
13500 }
13501 return nullptr;
13502}
13503
13504Value *BoUpSLP::vectorizeTree() {
13505 ExtraValueToDebugLocsMap ExternallyUsedValues;
13506 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13507 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13508}
13509
13510namespace {
13511/// Data type for handling buildvector sequences with the reused scalars from
13512/// other tree entries.
13513struct ShuffledInsertData {
13514 /// List of insertelements to be replaced by shuffles.
13515 SmallVector<InsertElementInst *> InsertElements;
13516 /// The parent vectors and shuffle mask for the given list of inserts.
13517 MapVector<Value *, SmallVector<int>> ValueMasks;
13518};
13519} // namespace
13520
13521Value *BoUpSLP::vectorizeTree(
13522 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13523 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13524 Instruction *ReductionRoot) {
13525 // All blocks must be scheduled before any instructions are inserted.
13526 for (auto &BSIter : BlocksSchedules) {
13527 scheduleBlock(BSIter.second.get());
13528 }
13529 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
13530 // need to rebuild it.
13531 EntryToLastInstruction.clear();
13532
13533 if (ReductionRoot)
13534 Builder.SetInsertPoint(ReductionRoot->getParent(),
13535 ReductionRoot->getIterator());
13536 else
13537 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13538
13539 // Postpone emission of PHI operands to avoid cyclic dependency issues.
13540 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13541 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13542 if (TE->State == TreeEntry::Vectorize &&
13543 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13544 TE->VectorizedValue)
13545 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
13546 // Run through the list of postponed gathers and emit them, replacing the temp
13547 // emitted allocas with actual vector instructions.
13548 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13549 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13550 for (const TreeEntry *E : PostponedNodes) {
13551 auto *TE = const_cast<TreeEntry *>(E);
13552 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
13553 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13554 TE->UserTreeIndices.front().EdgeIdx)) &&
13555 VecTE->isSame(TE->Scalars))
13556 // Found gather node which is absolutely the same as one of the
13557 // vectorized nodes. It may happen after reordering.
13558 continue;
13559 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13560 TE->VectorizedValue = nullptr;
13561 auto *UserI =
13562 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13563 // If the user is a PHI node, its vector code has to be inserted right before
13564 // the block terminator. Since the node was delayed, there were some unresolved
13565 // dependencies at the moment its stub instruction was emitted. If any of
13566 // these dependencies turns out to be an operand of another PHI coming from
13567 // this same block, the position of the stub instruction becomes invalid,
13568 // because the source vector that is supposed to feed this gather node was
13569 // inserted at the end of the block [after the stub instruction]. So we need
13570 // to adjust the insertion point again to the end of the block.
13571 if (isa<PHINode>(UserI)) {
13572 // Insert before all users.
13573 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13574 for (User *U : PrevVec->users()) {
13575 if (U == UserI)
13576 continue;
13577 auto *UI = dyn_cast<Instruction>(U);
13578 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
13579 continue;
13580 if (UI->comesBefore(InsertPt))
13581 InsertPt = UI;
13582 }
13583 Builder.SetInsertPoint(InsertPt);
13584 } else {
13585 Builder.SetInsertPoint(PrevVec);
13586 }
13587 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13588 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
13589 if (Vec->getType() != PrevVec->getType()) {
13590 assert(Vec->getType()->isIntOrIntVectorTy() &&
13591 PrevVec->getType()->isIntOrIntVectorTy() &&
13592 "Expected integer vector types only.");
13593 std::optional<bool> IsSigned;
13594 for (Value *V : TE->Scalars) {
13595 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13596 auto It = MinBWs.find(BaseTE);
13597 if (It != MinBWs.end()) {
13598 IsSigned = IsSigned.value_or(false) || It->second.second;
13599 if (*IsSigned)
13600 break;
13601 }
13602 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
13603 auto It = MinBWs.find(MNTE);
13604 if (It != MinBWs.end()) {
13605 IsSigned = IsSigned.value_or(false) || It->second.second;
13606 if (*IsSigned)
13607 break;
13608 }
13609 }
13610 if (IsSigned.value_or(false))
13611 break;
13612 // Scan through gather nodes.
13613 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13614 auto It = MinBWs.find(BVE);
13615 if (It != MinBWs.end()) {
13616 IsSigned = IsSigned.value_or(false) || It->second.second;
13617 if (*IsSigned)
13618 break;
13619 }
13620 }
13621 if (IsSigned.value_or(false))
13622 break;
13623 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
13624 IsSigned =
13625 IsSigned.value_or(false) ||
13626 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
13627 continue;
13628 }
13629 if (IsSigned.value_or(false))
13630 break;
13631 }
13632 }
13633 if (IsSigned.value_or(false)) {
13634 // Final attempt - check user node.
13635 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
13636 if (It != MinBWs.end())
13637 IsSigned = It->second.second;
13638 }
13639 assert(IsSigned &&
13640 "Expected user node or perfect diamond match in MinBWs.");
13641 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
13642 }
13643 PrevVec->replaceAllUsesWith(Vec);
13644 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
13645 // Replace the stub vector node, if it was used before for one of the
13646 // buildvector nodes already.
13647 auto It = PostponedValues.find(PrevVec);
13648 if (It != PostponedValues.end()) {
13649 for (TreeEntry *VTE : It->getSecond())
13650 VTE->VectorizedValue = Vec;
13651 }
13652 eraseInstruction(PrevVec);
13653 }
13654
13655 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13656 << " values .\n");
13657
13658 SmallVector<ShuffledInsertData> ShuffledInserts;
13659 // Maps vector instruction to original insertelement instruction
13660 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13661 // Maps extract Scalar to the corresponding extractelement instruction in the
13662 // basic block. Only one extractelement per block should be emitted.
13663 DenseMap<Value *,
13664 SmallDenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13665 ScalarToEEs;
13666 SmallDenseSet<Value *, 4> UsedInserts;
13667 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13668 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13669 // Extract all of the elements with the external uses.
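// For example, if scalar %a was vectorized into lane 2 of %vec but is still
// used by an instruction outside the tree, an extract such as
//   %a.ext = extractelement <4 x i32> %vec, i32 2
// is emitted near the vector code and the external use is rewritten to it.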
13670 for (const auto &ExternalUse : ExternalUses) {
13671 Value *Scalar = ExternalUse.Scalar;
13672 llvm::User *User = ExternalUse.User;
13673
13674 // Skip users that we already RAUW. This happens when one instruction
13675 // has multiple uses of the same value.
13676 if (User && !is_contained(Scalar->users(), User))
13677 continue;
13678 TreeEntry *E = getTreeEntry(Scalar);
13679 assert(E && "Invalid scalar");
13680 assert(E->State != TreeEntry::NeedToGather &&
13681 "Extracting from a gather list");
13682 // Non-instruction pointers are not deleted, just skip them.
13683 if (E->getOpcode() == Instruction::GetElementPtr &&
13684 !isa<GetElementPtrInst>(Scalar))
13685 continue;
13686
13687 Value *Vec = E->VectorizedValue;
13688 assert(Vec && "Can't find vectorizable value");
13689
13690 Value *Lane = Builder.getInt32(ExternalUse.Lane);
13691 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13692 if (Scalar->getType() != Vec->getType()) {
13693 Value *Ex = nullptr;
13694 Value *ExV = nullptr;
13695 auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
13696 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
13697 auto It = ScalarToEEs.find(Scalar);
13698 if (It != ScalarToEEs.end()) {
13699 // No need to emit many extracts, just move the only one in the
13700 // current block.
13701 auto EEIt = It->second.find(Builder.GetInsertBlock());
13702 if (EEIt != It->second.end()) {
13703 Instruction *I = EEIt->second.first;
13704 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13705 Builder.GetInsertPoint()->comesBefore(I)) {
13706 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
13707 Builder.GetInsertPoint());
13708 if (auto *CI = EEIt->second.second)
13709 CI->moveAfter(I);
13710 }
13711 Ex = I;
13712 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13713 }
13714 }
13715 if (!Ex) {
13716 // "Reuse" the existing extract to improve final codegen.
13717 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13718 Value *V = ES->getVectorOperand();
13719 if (const TreeEntry *ETE = getTreeEntry(V))
13720 V = ETE->VectorizedValue;
13721 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
13722 } else if (ReplaceGEP) {
13723 // Leave the GEPs as is, they are free in most cases and better to
13724 // keep them as GEPs.
13725 auto *CloneGEP = GEP->clone();
13726 if (isa<Instruction>(Vec))
13727 CloneGEP->insertBefore(*Builder.GetInsertBlock(),
13728 Builder.GetInsertPoint());
13729 else
13730 CloneGEP->insertBefore(GEP);
13731 if (GEP->hasName())
13732 CloneGEP->takeName(GEP);
13733 Ex = CloneGEP;
13734 } else {
13735 Ex = Builder.CreateExtractElement(Vec, Lane);
13736 }
13737 // If necessary, sign-extend or zero-extend ScalarRoot
13738 // to the larger type.
13739 ExV = Ex;
13740 if (Scalar->getType() != Ex->getType())
13741 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
13742 MinBWs.find(E)->second.second);
13743 if (auto *I = dyn_cast<Instruction>(Ex))
13744 ScalarToEEs[Scalar].try_emplace(
13745 Builder.GetInsertBlock(),
13746 std::make_pair(I, cast<Instruction>(ExV)));
13747 }
13748 // The "then" branch of the previous "if" may produce constants, since
13749 // operand 0 might be a constant.
13750 if (auto *ExI = dyn_cast<Instruction>(Ex)) {
13751 GatherShuffleExtractSeq.insert(ExI);
13752 CSEBlocks.insert(ExI->getParent());
13753 }
13754 return ExV;
13755 }
13756 assert(isa<FixedVectorType>(Scalar->getType()) &&
13757 isa<InsertElementInst>(Scalar) &&
13758 "In-tree scalar of vector type is not insertelement?");
13759 auto *IE = cast<InsertElementInst>(Scalar);
13760 VectorToInsertElement.try_emplace(Vec, IE);
13761 return Vec;
13762 };
13763 // If User == nullptr, the Scalar remains as a scalar in the vectorized
13764 // instructions or is used as an extra argument. Generate an ExtractElement
13765 // instruction and update the record for this scalar in ExternallyUsedValues.
13766 if (!User) {
13767 if (!ScalarsWithNullptrUser.insert(Scalar).second)
13768 continue;
13769 assert((ExternallyUsedValues.count(Scalar) ||
13770 any_of(Scalar->users(),
13771 [&](llvm::User *U) {
13772 if (ExternalUsesAsGEPs.contains(U))
13773 return true;
13774 TreeEntry *UseEntry = getTreeEntry(U);
13775 return UseEntry &&
13776 (UseEntry->State == TreeEntry::Vectorize ||
13777 UseEntry->State ==
13778 TreeEntry::StridedVectorize) &&
13779 (E->State == TreeEntry::Vectorize ||
13780 E->State == TreeEntry::StridedVectorize) &&
13781 doesInTreeUserNeedToExtract(
13782 Scalar,
13783 cast<Instruction>(UseEntry->Scalars.front()),
13784 TLI);
13785 })) &&
13786 "Scalar with nullptr User must be registered in "
13787 "ExternallyUsedValues map or remain as scalar in vectorized "
13788 "instructions");
13789 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13790 if (auto *PHI = dyn_cast<PHINode>(VecI))
13791 Builder.SetInsertPoint(PHI->getParent(),
13792 PHI->getParent()->getFirstNonPHIIt());
13793 else
13794 Builder.SetInsertPoint(VecI->getParent(),
13795 std::next(VecI->getIterator()));
13796 } else {
13797 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13798 }
13799 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13800 // Required to update internally referenced instructions.
13801 Scalar->replaceAllUsesWith(NewInst);
13802 ReplacedExternals.emplace_back(Scalar, NewInst);
13803 continue;
13804 }
13805
13806 if (auto *VU = dyn_cast<InsertElementInst>(User);
13807 VU && VU->getOperand(1) == Scalar) {
13808 // Skip if the scalar is another vector op or Vec is not an instruction.
13809 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13810 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
13811 if (!UsedInserts.insert(VU).second)
13812 continue;
13813 // Need to use original vector, if the root is truncated.
13814 auto BWIt = MinBWs.find(E);
13815 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13816 auto *ScalarTy = FTy->getElementType();
13817 auto Key = std::make_pair(Vec, ScalarTy);
13818 auto VecIt = VectorCasts.find(Key);
13819 if (VecIt == VectorCasts.end()) {
13820 IRBuilderBase::InsertPointGuard Guard(Builder);
13821 if (auto *IVec = dyn_cast<PHINode>(Vec))
13822 Builder.SetInsertPoint(
13823 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
13824 else if (auto *IVec = dyn_cast<Instruction>(Vec))
13825 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
13826 Vec = Builder.CreateIntCast(
13827 Vec,
13828 FixedVectorType::get(
13829 ScalarTy,
13830 cast<FixedVectorType>(Vec->getType())->getNumElements()),
13831 BWIt->second.second);
13832 VectorCasts.try_emplace(Key, Vec);
13833 } else {
13834 Vec = VecIt->second;
13835 }
13836 }
13837
13838 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
13839 if (InsertIdx) {
13840 auto *It =
13841 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
13842 // Checks if 2 insertelements are from the same buildvector.
13843 InsertElementInst *VecInsert = Data.InsertElements.front();
13844 return areTwoInsertFromSameBuildVector(
13845 VU, VecInsert,
13846 [](InsertElementInst *II) { return II->getOperand(0); });
13847 });
13848 unsigned Idx = *InsertIdx;
13849 if (It == ShuffledInserts.end()) {
13850 (void)ShuffledInserts.emplace_back();
13851 It = std::next(ShuffledInserts.begin(),
13852 ShuffledInserts.size() - 1);
13853 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13854 if (Mask.empty())
13855 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13856 // Find the insertvector, vectorized in tree, if any.
13857 Value *Base = VU;
13858 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
13859 if (IEBase != User &&
13860 (!IEBase->hasOneUse() ||
13861 getInsertIndex(IEBase).value_or(Idx) == Idx))
13862 break;
13863 // Build the mask for the vectorized insertelement instructions.
13864 if (const TreeEntry *E = getTreeEntry(IEBase)) {
13865 do {
13866 IEBase = cast<InsertElementInst>(Base);
13867 int IEIdx = *getInsertIndex(IEBase);
13868 assert(Mask[IEIdx] == PoisonMaskElem &&
13869 "InsertElementInstruction used already.");
13870 Mask[IEIdx] = IEIdx;
13871 Base = IEBase->getOperand(0);
13872 } while (E == getTreeEntry(Base));
13873 break;
13874 }
13875 Base = cast<InsertElementInst>(Base)->getOperand(0);
13876 // After the vectorization the def-use chain has changed, need
13877 // to look through original insertelement instructions, if they
13878 // get replaced by vector instructions.
13879 auto It = VectorToInsertElement.find(Base);
13880 if (It != VectorToInsertElement.end())
13881 Base = It->second;
13882 }
13883 }
13884 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13885 if (Mask.empty())
13886 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13887 Mask[Idx] = ExternalUse.Lane;
13888 It->InsertElements.push_back(cast<InsertElementInst>(User));
13889 continue;
13890 }
13891 }
13892 }
13893 }
13894
13895 // Generate extracts for out-of-tree users.
13896 // Find the insertion point for the extractelement lane.
13897 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13898 if (PHINode *PH = dyn_cast<PHINode>(User)) {
13899 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13900 if (PH->getIncomingValue(I) == Scalar) {
13901 Instruction *IncomingTerminator =
13902 PH->getIncomingBlock(I)->getTerminator();
13903 if (isa<CatchSwitchInst>(IncomingTerminator)) {
13904 Builder.SetInsertPoint(VecI->getParent(),
13905 std::next(VecI->getIterator()));
13906 } else {
13907 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
13908 }
13909 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13910 PH->setOperand(I, NewInst);
13911 }
13912 }
13913 } else {
13914 Builder.SetInsertPoint(cast<Instruction>(User));
13915 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13916 User->replaceUsesOfWith(Scalar, NewInst);
13917 }
13918 } else {
13919 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13920 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13921 User->replaceUsesOfWith(Scalar, NewInst);
13922 }
13923
13924 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
13925 }
13926
13927 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
13928 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
13929 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
13930 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
13931 for (int I = 0, E = Mask.size(); I < E; ++I) {
13932 if (Mask[I] < VF)
13933 CombinedMask1[I] = Mask[I];
13934 else
13935 CombinedMask2[I] = Mask[I] - VF;
13936 }
13937 ShuffleInstructionBuilder ShuffleBuilder(
13938 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
13939 ShuffleBuilder.add(V1, CombinedMask1);
13940 if (V2)
13941 ShuffleBuilder.add(V2, CombinedMask2);
13942 return ShuffleBuilder.finalize(std::nullopt);
13943 };
13944
13945 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
13946 bool ForSingleMask) {
13947 unsigned VF = Mask.size();
13948 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
13949 if (VF != VecVF) {
13950 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
13951 Vec = CreateShuffle(Vec, nullptr, Mask);
13952 return std::make_pair(Vec, true);
13953 }
13954 if (!ForSingleMask) {
13955 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
13956 for (unsigned I = 0; I < VF; ++I) {
13957 if (Mask[I] != PoisonMaskElem)
13958 ResizeMask[Mask[I]] = Mask[I];
13959 }
13960 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
13961 }
13962 }
13963
13964 return std::make_pair(Vec, false);
13965 };
13966 // Perform shuffling of the vectorized tree entries for better handling of
13967 // external extracts.
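// For example, a buildvector chain such as
//   %i0 = insertelement <4 x float> poison, float %x0, i32 0
//   %i1 = insertelement <4 x float> %i0, float %x1, i32 1
// whose scalars live in vectorized tree entries is rewritten into a single
// shufflevector of the producing vectors, and the original inserts are erased.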
13968 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
13969 // Find the first and the last instruction in the list of insertelements.
13970 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
13971 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
13972 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
13973 Builder.SetInsertPoint(LastInsert);
13974 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
13975 Value *NewInst = performExtractsShuffleAction<Value>(
13976 MutableArrayRef(Vector.data(), Vector.size()),
13977 FirstInsert->getOperand(0),
13978 [](Value *Vec) {
13979 return cast<VectorType>(Vec->getType())
13980 ->getElementCount()
13981 .getKnownMinValue();
13982 },
13983 ResizeToVF,
13984 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
13985 ArrayRef<Value *> Vals) {
13986 assert((Vals.size() == 1 || Vals.size() == 2) &&
13987 "Expected exactly 1 or 2 input values.");
13988 if (Vals.size() == 1) {
13989 // Do not create shuffle if the mask is a simple identity
13990 // non-resizing mask.
13991 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
13992 ->getNumElements() ||
13993 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
13994 return CreateShuffle(Vals.front(), nullptr, Mask);
13995 return Vals.front();
13996 }
13997 return CreateShuffle(Vals.front() ? Vals.front()
13998 : FirstInsert->getOperand(0),
13999 Vals.back(), Mask);
14000 });
14001 auto It = ShuffledInserts[I].InsertElements.rbegin();
14002 // Rebuild buildvector chain.
14003 InsertElementInst *II = nullptr;
14004 if (It != ShuffledInserts[I].InsertElements.rend())
14005 II = *It;
14006 SmallVector<Instruction *> Inserts;
14007 while (It != ShuffledInserts[I].InsertElements.rend()) {
14008 assert(II && "Must be an insertelement instruction.");
14009 if (*It == II)
14010 ++It;
14011 else
14012 Inserts.push_back(cast<Instruction>(II));
14013 II = dyn_cast<InsertElementInst>(II->getOperand(0));
14014 }
14015 for (Instruction *II : reverse(Inserts)) {
14016 II->replaceUsesOfWith(II->getOperand(0), NewInst);
14017 if (auto *NewI = dyn_cast<Instruction>(NewInst))
14018 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
14019 II->moveAfter(NewI);
14020 NewInst = II;
14021 }
14022 LastInsert->replaceAllUsesWith(NewInst);
14023 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
14024 IE->replaceUsesOfWith(IE->getOperand(0),
14025 PoisonValue::get(IE->getOperand(0)->getType()));
14026 IE->replaceUsesOfWith(IE->getOperand(1),
14027 PoisonValue::get(IE->getOperand(1)->getType()));
14028 eraseInstruction(IE);
14029 }
14030 CSEBlocks.insert(LastInsert->getParent());
14031 }
14032
14033 SmallVector<Instruction *> RemovedInsts;
14034 // For each vectorized value:
14035 for (auto &TEPtr : VectorizableTree) {
14036 TreeEntry *Entry = TEPtr.get();
14037
14038 // No need to handle users of gathered values.
14039 if (Entry->State == TreeEntry::NeedToGather)
14040 continue;
14041
14042 assert(Entry->VectorizedValue && "Can't find vectorizable value");
14043
14044 // For each lane:
14045 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14046 Value *Scalar = Entry->Scalars[Lane];
14047
14048 if (Entry->getOpcode() == Instruction::GetElementPtr &&
14049 !isa<GetElementPtrInst>(Scalar))
14050 continue;
14051#ifndef NDEBUG
14052 Type *Ty = Scalar->getType();
14053 if (!Ty->isVoidTy()) {
14054 for (User *U : Scalar->users()) {
14055 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
14056
14057 // It is legal to delete users in the ignorelist.
14058 assert((getTreeEntry(U) ||
14059 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14060 (isa_and_nonnull<Instruction>(U) &&
14061 isDeleted(cast<Instruction>(U)))) &&
14062 "Deleting out-of-tree value");
14063 }
14064 }
14065#endif
14066 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
14067 eraseInstruction(cast<Instruction>(Scalar));
14068 // Retain to-be-deleted instructions for some debug-info
14069 // bookkeeping. NOTE: eraseInstruction only marks the instruction for
14070 // deletion - instructions are not deleted until later.
14071 RemovedInsts.push_back(cast<Instruction>(Scalar));
14072 }
14073 }
14074
14075 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
14076 // new vector instruction.
14077 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
14078 V->mergeDIAssignID(RemovedInsts);
14079
14080 Builder.ClearInsertionPoint();
14081 InstrElementSize.clear();
14082
14083 const TreeEntry &RootTE = *VectorizableTree.front().get();
14084 Value *Vec = RootTE.VectorizedValue;
14085 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
14086 It != MinBWs.end() &&
14087 ReductionBitWidth != It->second.first) {
14088 IRBuilder<>::InsertPointGuard Guard(Builder);
14089 Builder.SetInsertPoint(ReductionRoot->getParent(),
14090 ReductionRoot->getIterator());
14091 Vec = Builder.CreateIntCast(
14092 Vec,
14093 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
14094 cast<VectorType>(Vec->getType())->getElementCount()),
14095 It->second.second);
14096 }
14097 return Vec;
14098}
14099
14100void BoUpSLP::optimizeGatherSequence() {
14101 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
14102 << " gather sequences instructions.\n");
14103 // LICM InsertElementInst sequences.
14104 for (Instruction *I : GatherShuffleExtractSeq) {
14105 if (isDeleted(I))
14106 continue;
14107
14108 // Check if this block is inside a loop.
14109 Loop *L = LI->getLoopFor(I->getParent());
14110 if (!L)
14111 continue;
14112
14113 // Check if it has a preheader.
14114 BasicBlock *PreHeader = L->getLoopPreheader();
14115 if (!PreHeader)
14116 continue;
14117
14118 // If the vector or the element that we insert into it is an
14119 // instruction defined inside this loop, then we can't hoist this
14120 // instruction out of the loop.
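// For example, an insertelement of a loop-invariant value into a
// loop-invariant vector can be moved to the preheader, while one that
// inserts a value computed inside the loop must stay in place.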
14121 if (any_of(I->operands(), [L](Value *V) {
14122 auto *OpI = dyn_cast<Instruction>(V);
14123 return OpI && L->contains(OpI);
14124 }))
14125 continue;
14126
14127 // We can hoist this instruction. Move it to the pre-header.
14128 I->moveBefore(PreHeader->getTerminator());
14129 CSEBlocks.insert(PreHeader);
14130 }
14131
14132 // Make a list of all reachable blocks in our CSE queue.
14133 SmallVector<const DomTreeNode *, 8> CSEWorkList;
14134 CSEWorkList.reserve(CSEBlocks.size());
14135 for (BasicBlock *BB : CSEBlocks)
14136 if (DomTreeNode *N = DT->getNode(BB)) {
14137 assert(DT->isReachableFromEntry(N->getBlock()));
14138 CSEWorkList.push_back(N);
14139 }
14140
14141 // Sort blocks by domination. This ensures we visit a block after all blocks
14142 // dominating it are visited.
14143 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
14144 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
14145 "Different nodes should have different DFS numbers");
14146 return A->getDFSNumIn() < B->getDFSNumIn();
14147 });
14148
14149 // Less defined shuffles can be replaced by the more defined copies.
14150 // Between two shuffles one is less defined if it has the same vector operands
14151 // and its mask indices are the same as in the first one or undefs. E.g.
14152 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
14153 // poison, <0, 0, 0, 0>.
14154 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
14155 SmallVectorImpl<int> &NewMask) {
14156 if (I1->getType() != I2->getType())
14157 return false;
14158 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
14159 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
14160 if (!SI1 || !SI2)
14161 return I1->isIdenticalTo(I2);
14162 if (SI1->isIdenticalTo(SI2))
14163 return true;
14164 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
14165 if (SI1->getOperand(I) != SI2->getOperand(I))
14166 return false;
14167 // Check if the second instruction is more defined than the first one.
14168 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
14169 ArrayRef<int> SM1 = SI1->getShuffleMask();
14170 // Count trailing undefs in the mask to check the final number of used
14171 // registers.
14172 unsigned LastUndefsCnt = 0;
14173 for (int I = 0, E = NewMask.size(); I < E; ++I) {
14174 if (SM1[I] == PoisonMaskElem)
14175 ++LastUndefsCnt;
14176 else
14177 LastUndefsCnt = 0;
14178 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
14179 NewMask[I] != SM1[I])
14180 return false;
14181 if (NewMask[I] == PoisonMaskElem)
14182 NewMask[I] = SM1[I];
14183 }
14184 // Check if the last undefs actually change the final number of used vector
14185 // registers.
14186 return SM1.size() - LastUndefsCnt > 1 &&
14187 TTI->getNumberOfParts(SI1->getType()) ==
14188 TTI->getNumberOfParts(
14189 FixedVectorType::get(SI1->getType()->getElementType(),
14190 SM1.size() - LastUndefsCnt));
14191 };
14192 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
14193 // instructions. TODO: We can further optimize this scan if we split the
14194 // instructions into different buckets based on the insert lane.
14195 SmallVector<Instruction *, 16> Visited;
14196 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
14197 assert(*I &&
14198 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
14199 "Worklist not sorted properly!");
14200 BasicBlock *BB = (*I)->getBlock();
14201 // For all instructions in blocks containing gather sequences:
14202 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
14203 if (isDeleted(&In))
14204 continue;
14205 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
14206 !GatherShuffleExtractSeq.contains(&In))
14207 continue;
14208
14209 // Check if we can replace this instruction with any of the
14210 // visited instructions.
14211 bool Replaced = false;
14212 for (Instruction *&V : Visited) {
14213 SmallVector<int> NewMask;
14214 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14215 DT->dominates(V->getParent(), In.getParent())) {
14216 In.replaceAllUsesWith(V);
14217 eraseInstruction(&In);
14218 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
14219 if (!NewMask.empty())
14220 SI->setShuffleMask(NewMask);
14221 Replaced = true;
14222 break;
14223 }
14224 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
14225 GatherShuffleExtractSeq.contains(V) &&
14226 IsIdenticalOrLessDefined(V, &In, NewMask) &&
14227 DT->dominates(In.getParent(), V->getParent())) {
14228 In.moveAfter(V);
14229 V->replaceAllUsesWith(&In);
14230 eraseInstruction(V);
14231 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
14232 if (!NewMask.empty())
14233 SI->setShuffleMask(NewMask);
14234 V = &In;
14235 Replaced = true;
14236 break;
14237 }
14238 }
14239 if (!Replaced) {
14240 assert(!is_contained(Visited, &In));
14241 Visited.push_back(&In);
14242 }
14243 }
14244 }
14245 CSEBlocks.clear();
14246 GatherShuffleExtractSeq.clear();
14247}
14248
14249BoUpSLP::ScheduleData *
14250BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
14251 ScheduleData *Bundle = nullptr;
14252 ScheduleData *PrevInBundle = nullptr;
14253 for (Value *V : VL) {
14254 if (doesNotNeedToBeScheduled(V))
14255 continue;
14256 ScheduleData *BundleMember = getScheduleData(V);
14257 assert(BundleMember &&
14258 "no ScheduleData for bundle member "
14259 "(maybe not in same basic block)");
14260 assert(BundleMember->isSchedulingEntity() &&
14261 "bundle member already part of other bundle");
14262 if (PrevInBundle) {
14263 PrevInBundle->NextInBundle = BundleMember;
14264 } else {
14265 Bundle = BundleMember;
14266 }
14267
14268 // Group the instructions to a bundle.
14269 BundleMember->FirstInBundle = Bundle;
14270 PrevInBundle = BundleMember;
14271 }
14272 assert(Bundle && "Failed to find schedule bundle");
14273 return Bundle;
14274}
14275
14276// Groups the instructions to a bundle (which is then a single scheduling entity)
14277// and schedules instructions until the bundle gets ready.
14278std::optional<BoUpSLP::ScheduleData *>
14279BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
14280 const InstructionsState &S) {
14281 // No need to schedule PHIs, insertelement, extractelement and extractvalue
14282 // instructions.
14283 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
14284 doesNotNeedToSchedule(VL))
14285 return nullptr;
14286
14287 // Initialize the instruction bundle.
14288 Instruction *OldScheduleEnd = ScheduleEnd;
14289 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
14290
14291 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
14292 ScheduleData *Bundle) {
14293 // The scheduling region got new instructions at the lower end (or it is a
14294 // new region for the first bundle). This makes it necessary to
14295 // recalculate all dependencies.
14296 // It is seldom that this needs to be done a second time after adding the
14297 // initial bundle to the region.
14298 if (ScheduleEnd != OldScheduleEnd) {
14299 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
14300 doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
14301 ReSchedule = true;
14302 }
14303 if (Bundle) {
14304 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
14305 << " in block " << BB->getName() << "\n");
14306 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
14307 }
14308
14309 if (ReSchedule) {
14310 resetSchedule();
14311 initialFillReadyList(ReadyInsts);
14312 }
14313
14314 // Now try to schedule the new bundle or (if no bundle) just calculate
14315 // dependencies. As soon as the bundle is "ready" it means that there are no
14316 // cyclic dependencies and we can schedule it. Note that it's important that we
14317 // don't "schedule" the bundle yet (see cancelScheduling).
14318 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14319 !ReadyInsts.empty()) {
14320 ScheduleData *Picked = ReadyInsts.pop_back_val();
14321 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14322 "must be ready to schedule");
14323 schedule(Picked, ReadyInsts);
14324 }
14325 };
14326
14327 // Make sure that the scheduling region contains all
14328 // instructions of the bundle.
14329 for (Value *V : VL) {
14330 if (doesNotNeedToBeScheduled(V))
14331 continue;
14332 if (!extendSchedulingRegion(V, S)) {
14333 // The scheduling region got new instructions at the lower end (or it
14334 // is a new region for the first bundle), which makes it necessary to
14335 // recalculate all dependencies.
14336 // Otherwise the compiler may crash after calculating dependencies
14337 // incorrectly and emitting instructions in the wrong order at the actual
14338 // scheduling.
14339 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
14340 return std::nullopt;
14341 }
14342 }
14343
14344 bool ReSchedule = false;
14345 for (Value *V : VL) {
14346 if (doesNotNeedToBeScheduled(V))
14347 continue;
14348 ScheduleData *BundleMember = getScheduleData(V);
14349 assert(BundleMember &&
14350 "no ScheduleData for bundle member (maybe not in same basic block)");
14351
14352 // Make sure we don't leave the pieces of the bundle in the ready list when
14353 // the whole bundle might not be ready.
14354 ReadyInsts.remove(BundleMember);
14355
14356 if (!BundleMember->IsScheduled)
14357 continue;
14358 // A bundle member was scheduled as a single instruction before and now
14359 // needs to be scheduled as part of the bundle. We just get rid of the
14360 // existing schedule.
14361 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
14362 << " was already scheduled\n");
14363 ReSchedule = true;
14364 }
14365
14366 auto *Bundle = buildBundle(VL);
14367 TryScheduleBundleImpl(ReSchedule, Bundle);
14368 if (!Bundle->isReady()) {
14369 cancelScheduling(VL, S.OpValue);
14370 return std::nullopt;
14371 }
14372 return Bundle;
14373}
14374
14375void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
14376 Value *OpValue) {
14377 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
14378 doesNotNeedToSchedule(VL))
14379 return;
14380
14381 if (doesNotNeedToBeScheduled(OpValue))
14382 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
14383 ScheduleData *Bundle = getScheduleData(OpValue);
14384 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
14385 assert(!Bundle->IsScheduled &&
14386 "Can't cancel bundle which is already scheduled");
14387 assert(Bundle->isSchedulingEntity() &&
14388 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
14389 "tried to unbundle something which is not a bundle");
14390
14391 // Remove the bundle from the ready list.
14392 if (Bundle->isReady())
14393 ReadyInsts.remove(Bundle);
14394
14395 // Un-bundle: make single instructions out of the bundle.
14396 ScheduleData *BundleMember = Bundle;
14397 while (BundleMember) {
14398 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
14399 BundleMember->FirstInBundle = BundleMember;
14400 ScheduleData *Next = BundleMember->NextInBundle;
14401 BundleMember->NextInBundle = nullptr;
14402 BundleMember->TE = nullptr;
14403 if (BundleMember->unscheduledDepsInBundle() == 0) {
14404 ReadyInsts.insert(BundleMember);
14405 }
14406 BundleMember = Next;
14407 }
14408}
14409
14410BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14411 // Allocate a new ScheduleData for the instruction.
14412 if (ChunkPos >= ChunkSize) {
14413 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14414 ChunkPos = 0;
14415 }
14416 return &(ScheduleDataChunks.back()[ChunkPos++]);
14417}
14418
14419bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14420 const InstructionsState &S) {
14421 if (getScheduleData(V, isOneOf(S, V)))
14422 return true;
14423 Instruction *I = dyn_cast<Instruction>(V);
14424 assert(I && "bundle member must be an instruction");
14425 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14427 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14428 "be scheduled");
14429 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14430 ScheduleData *ISD = getScheduleData(I);
14431 if (!ISD)
14432 return false;
14433 assert(isInSchedulingRegion(ISD) &&
14434 "ScheduleData not in scheduling region");
14435 ScheduleData *SD = allocateScheduleDataChunks();
14436 SD->Inst = I;
14437 SD->init(SchedulingRegionID, S.OpValue);
14438 ExtraScheduleDataMap[I][S.OpValue] = SD;
14439 return true;
14440 };
14441 if (CheckScheduleForI(I))
14442 return true;
14443 if (!ScheduleStart) {
14444 // It's the first instruction in the new region.
14445 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
14446 ScheduleStart = I;
14447 ScheduleEnd = I->getNextNode();
14448 if (isOneOf(S, I) != I)
14449 CheckScheduleForI(I);
14450 assert(ScheduleEnd && "tried to vectorize a terminator?");
14451 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
14452 return true;
14453 }
14454 // Search up and down at the same time, because we don't know if the new
14455 // instruction is above or below the existing scheduling region.
14456 // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not counted
14457 // against the budget. Otherwise debug info could affect codegen.
14458 BasicBlock::reverse_iterator UpIter =
14459 ++ScheduleStart->getIterator().getReverse();
14460 BasicBlock::reverse_iterator UpperEnd = BB->rend();
14461 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14462 BasicBlock::iterator LowerEnd = BB->end();
14463 auto IsAssumeLikeIntr = [](const Instruction &I) {
14464 if (auto *II = dyn_cast<IntrinsicInst>(&I))
14465 return II->isAssumeLikeIntrinsic();
14466 return false;
14467 };
14468 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14469 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14470 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14471 &*DownIter != I) {
14472 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14473 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
14474 return false;
14475 }
14476
14477 ++UpIter;
14478 ++DownIter;
14479
14480 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14481 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14482 }
14483 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14484 assert(I->getParent() == ScheduleStart->getParent() &&
14485 "Instruction is in wrong basic block.");
14486 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
14487 ScheduleStart = I;
14488 if (isOneOf(S, I) != I)
14489 CheckScheduleForI(I);
14490 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
14491 << "\n");
14492 return true;
14493 }
14494 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14495 "Expected to reach top of the basic block or instruction down the "
14496 "lower end.");
14497 assert(I->getParent() == ScheduleEnd->getParent() &&
14498 "Instruction is in wrong basic block.");
14499 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
14500 nullptr);
14501 ScheduleEnd = I->getNextNode();
14502 if (isOneOf(S, I) != I)
14503 CheckScheduleForI(I);
14504 assert(ScheduleEnd && "tried to vectorize a terminator?");
14505 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14506 return true;
14507}
14508
14509void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14510 Instruction *ToI,
14511 ScheduleData *PrevLoadStore,
14512 ScheduleData *NextLoadStore) {
14513 ScheduleData *CurrentLoadStore = PrevLoadStore;
14514 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14515 // No need to allocate data for non-schedulable instructions.
14516 if (doesNotNeedToBeScheduled(I))
14517 continue;
14518 ScheduleData *SD = ScheduleDataMap.lookup(I);
14519 if (!SD) {
14520 SD = allocateScheduleDataChunks();
14521 ScheduleDataMap[I] = SD;
14522 SD->Inst = I;
14523 }
14524 assert(!isInSchedulingRegion(SD) &&
14525 "new ScheduleData already in scheduling region");
14526 SD->init(SchedulingRegionID, I);
14527
14528 if (I->mayReadOrWriteMemory() &&
14529 (!isa<IntrinsicInst>(I) ||
14530 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14531 cast<IntrinsicInst>(I)->getIntrinsicID() !=
14532 Intrinsic::pseudoprobe))) {
14533 // Update the linked list of memory accessing instructions.
14534 if (CurrentLoadStore) {
14535 CurrentLoadStore->NextLoadStore = SD;
14536 } else {
14537 FirstLoadStoreInRegion = SD;
14538 }
14539 CurrentLoadStore = SD;
14540 }
14541
14542 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14543 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14544 RegionHasStackSave = true;
14545 }
14546 if (NextLoadStore) {
14547 if (CurrentLoadStore)
14548 CurrentLoadStore->NextLoadStore = NextLoadStore;
14549 } else {
14550 LastLoadStoreInRegion = CurrentLoadStore;
14551 }
14552}
14553
14554void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14555 bool InsertInReadyList,
14556 BoUpSLP *SLP) {
14557 assert(SD->isSchedulingEntity());
14558
14559 SmallVector<ScheduleData *, 10> WorkList;
14560 WorkList.push_back(SD);
14561
14562 while (!WorkList.empty()) {
14563 ScheduleData *SD = WorkList.pop_back_val();
14564 for (ScheduleData *BundleMember = SD; BundleMember;
14565 BundleMember = BundleMember->NextInBundle) {
14566 assert(isInSchedulingRegion(BundleMember));
14567 if (BundleMember->hasValidDependencies())
14568 continue;
14569
14570 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14571 << "\n");
14572 BundleMember->Dependencies = 0;
14573 BundleMember->resetUnscheduledDeps();
14574
14575 // Handle def-use chain dependencies.
14576 if (BundleMember->OpValue != BundleMember->Inst) {
14577 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14578 BundleMember->Dependencies++;
14579 ScheduleData *DestBundle = UseSD->FirstInBundle;
14580 if (!DestBundle->IsScheduled)
14581 BundleMember->incrementUnscheduledDeps(1);
14582 if (!DestBundle->hasValidDependencies())
14583 WorkList.push_back(DestBundle);
14584 }
14585 } else {
14586 for (User *U : BundleMember->Inst->users()) {
14587 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14588 BundleMember->Dependencies++;
14589 ScheduleData *DestBundle = UseSD->FirstInBundle;
14590 if (!DestBundle->IsScheduled)
14591 BundleMember->incrementUnscheduledDeps(1);
14592 if (!DestBundle->hasValidDependencies())
14593 WorkList.push_back(DestBundle);
14594 }
14595 }
14596 }
14597
14598 auto MakeControlDependent = [&](Instruction *I) {
14599 auto *DepDest = getScheduleData(I);
14600 assert(DepDest && "must be in schedule window");
14601 DepDest->ControlDependencies.push_back(BundleMember);
14602 BundleMember->Dependencies++;
14603 ScheduleData *DestBundle = DepDest->FirstInBundle;
14604 if (!DestBundle->IsScheduled)
14605 BundleMember->incrementUnscheduledDeps(1);
14606 if (!DestBundle->hasValidDependencies())
14607 WorkList.push_back(DestBundle);
14608 };
14609
14610 // Any instruction which isn't safe to speculate at the beginning of the
14611 // block is control dependent on any early exit or non-willreturn call
14612 // which precedes it.
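// For example, a load that may fault or a division that may trap must not
// be reordered above a preceding call that may throw or never return; the
// extra control dependence keeps their relative order fixed.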
14613 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
14614 for (Instruction *I = BundleMember->Inst->getNextNode();
14615 I != ScheduleEnd; I = I->getNextNode()) {
14616 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
14617 continue;
14618
14619 // Add the dependency
14620 MakeControlDependent(I);
14621
14622 if (!isGuaranteedToTransferExecutionToSuccessor(I))
14623 // Everything past here must be control dependent on I.
14624 break;
14625 }
14626 }
14627
14628 if (RegionHasStackSave) {
14629 // If we have an inalloca alloca instruction, it needs to be scheduled
14630 // after any preceding stacksave. We also need to prevent any alloca
14631 // from reordering above a preceding stackrestore.
14632 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14633 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14634 for (Instruction *I = BundleMember->Inst->getNextNode();
14635 I != ScheduleEnd; I = I->getNextNode()) {
14636 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14637 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14638 // Any allocas past here must be control dependent on I, and I
14639 // must be memory dependent on BundleMember->Inst.
14640 break;
14641
14642 if (!isa<AllocaInst>(I))
14643 continue;
14644
14645 // Add the dependency
14646 MakeControlDependent(I);
14647 }
14648 }
14649
14650 // In addition to the cases handled just above, we need to prevent
14651 // allocas and loads/stores from moving below a stacksave or a
14652 // stackrestore. Avoiding moving allocas below a stackrestore is
14653 // currently believed to be merely conservative. Moving loads/stores
14654 // below a stackrestore can lead to incorrect code.
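// For example, a load or store of memory created by a dynamic alloca must
// not be scheduled past the stackrestore that releases that memory, so any
// following stacksave/stackrestore acts as a barrier for such accesses.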
14655 if (isa<AllocaInst>(BundleMember->Inst) ||
14656 BundleMember->Inst->mayReadOrWriteMemory()) {
14657 for (Instruction *I = BundleMember->Inst->getNextNode();
14658 I != ScheduleEnd; I = I->getNextNode()) {
14659 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14660 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14661 continue;
14662
14663 // Add the dependency
14664 MakeControlDependent(I);
14665 break;
14666 }
14667 }
14668 }
14669
14670 // Handle the memory dependencies (if any).
14671 ScheduleData *DepDest = BundleMember->NextLoadStore;
14672 if (!DepDest)
14673 continue;
14674 Instruction *SrcInst = BundleMember->Inst;
14675 assert(SrcInst->mayReadOrWriteMemory() &&
14676 "NextLoadStore list for non memory effecting bundle?");
14677 MemoryLocation SrcLoc = getLocation(SrcInst);
14678 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14679 unsigned NumAliased = 0;
14680 unsigned DistToSrc = 1;
14681
14682 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14683 assert(isInSchedulingRegion(DepDest));
14684
14685 // We have two limits to reduce the complexity:
14686 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14687 // SLP->isAliased (which is the expensive part in this loop).
14688 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14689 // the whole loop (even if the loop is fast, it's quadratic).
14690 // It's important for the loop break condition (see below) to
14691 // check this limit even between two read-only instructions.
14692 if (DistToSrc >= MaxMemDepDistance ||
14693 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14694 (NumAliased >= AliasedCheckLimit ||
14695 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14696
14697 // We increment the counter only if the locations are aliased
14698 // (instead of counting all alias checks). This gives a better
14699 // balance between reduced runtime and accurate dependencies.
14700 NumAliased++;
14701
14702 DepDest->MemoryDependencies.push_back(BundleMember);
14703 BundleMember->Dependencies++;
14704 ScheduleData *DestBundle = DepDest->FirstInBundle;
14705 if (!DestBundle->IsScheduled) {
14706 BundleMember->incrementUnscheduledDeps(1);
14707 }
14708 if (!DestBundle->hasValidDependencies()) {
14709 WorkList.push_back(DestBundle);
14710 }
14711 }
14712
14713 // Example, explaining the loop break condition: Let's assume our
14714 // starting instruction is i0 and MaxMemDepDistance = 3.
14715 //
14716 // +--------v--v--v
14717 // i0,i1,i2,i3,i4,i5,i6,i7,i8
14718 // +--------^--^--^
14719 //
14720 // MaxMemDepDistance let us stop alias-checking at i3 and we add
14721 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14722 // Previously we already added dependencies from i3 to i6,i7,i8
14723 // (because of MaxMemDepDistance). As we added a dependency from
14724 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14725 // and we can abort this loop at i6.
14726 if (DistToSrc >= 2 * MaxMemDepDistance)
14727 break;
14728 DistToSrc++;
14729 }
14730 }
14731 if (InsertInReadyList && SD->isReady()) {
14732 ReadyInsts.insert(SD);
14733 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
14734 << "\n");
14735 }
14736 }
14737}
14738
14739void BoUpSLP::BlockScheduling::resetSchedule() {
14740 assert(ScheduleStart &&
14741 "tried to reset schedule on block which has not been scheduled");
14742 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14743 doForAllOpcodes(I, [&](ScheduleData *SD) {
14744 assert(isInSchedulingRegion(SD) &&
14745 "ScheduleData not in scheduling region");
14746 SD->IsScheduled = false;
14747 SD->resetUnscheduledDeps();
14748 });
14749 }
14750 ReadyInsts.clear();
14751}
14752
14753void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14754 if (!BS->ScheduleStart)
14755 return;
14756
14757 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14758
14759 // A key point - if we got here, pre-scheduling was able to find a valid
14760 // scheduling of the sub-graph of the scheduling window which consists
14761 // of all vector bundles and their transitive users. As such, we do not
14762 // need to reschedule anything *outside of* that subgraph.
14763
14764 BS->resetSchedule();
14765
14766 // For the real scheduling we use a more sophisticated ready-list: it is
14767 // sorted by the original instruction location. This lets the final schedule
14768 // be as close as possible to the original instruction order.
14769 // WARNING: If changing this order causes a correctness issue, that means
14770 // there is some missing dependence edge in the schedule data graph.
14771 struct ScheduleDataCompare {
14772 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14773 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14774 }
14775 };
14776 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14777
14778 // Ensure that all dependency data is updated (for nodes in the sub-graph)
14779 // and fill the ready-list with initial instructions.
14780 int Idx = 0;
14781 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14782 I = I->getNextNode()) {
14783 BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
14784 TreeEntry *SDTE = getTreeEntry(SD->Inst);
14785 (void)SDTE;
14786 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14787 SD->isPartOfBundle() ==
14788 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14789 "scheduler and vectorizer bundle mismatch");
14790 SD->FirstInBundle->SchedulingPriority = Idx++;
14791
14792 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14793 BS->calculateDependencies(SD, false, this);
14794 });
14795 }
14796 BS->initialFillReadyList(ReadyInsts);
14797
14798 Instruction *LastScheduledInst = BS->ScheduleEnd;
14799
14800 // Do the "real" scheduling.
14801 while (!ReadyInsts.empty()) {
14802 ScheduleData *Picked = *ReadyInsts.begin();
14803 ReadyInsts.erase(ReadyInsts.begin());
14804
14805 // Move the scheduled instruction(s) to their dedicated places, if not
14806 // there yet.
14807 for (ScheduleData *BundleMember = Picked; BundleMember;
14808 BundleMember = BundleMember->NextInBundle) {
14809 Instruction *PickedInst = BundleMember->Inst;
14810 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
14811 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
14812 LastScheduledInst = PickedInst;
14813 }
14814
14815 BS->schedule(Picked, ReadyInsts);
14816 }
14817
14818 // Check that we didn't break any of our invariants.
14819#ifdef EXPENSIVE_CHECKS
14820 BS->verify();
14821#endif
14822
14823#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
14824 // Check that all schedulable entities got scheduled
14825 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
14826 BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
14827 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
14828 assert(SD->IsScheduled && "must be scheduled at this point");
14829 }
14830 });
14831 }
14832#endif
14833
14834 // Avoid duplicate scheduling of the block.
14835 BS->ScheduleStart = nullptr;
14836}
14837
14839 // If V is a store, just return the width of the stored value (or value
14840 // truncated just before storing) without traversing the expression tree.
14841 // This is the common case.
14842 if (auto *Store = dyn_cast<StoreInst>(V))
14843 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
14844
14845 if (auto *IEI = dyn_cast<InsertElementInst>(V))
14846 return getVectorElementSize(IEI->getOperand(1));
14847
14848 auto E = InstrElementSize.find(V);
14849 if (E != InstrElementSize.end())
14850 return E->second;
14851
14852 // If V is not a store, we can traverse the expression tree to find loads
14853 // that feed it. The type of the loaded value may indicate a more suitable
14854 // width than V's type. We want to base the vector element size on the width
14855 // of memory operations where possible.
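// For example, if V is an i32 add whose operands are zero-extended from i8
// loads, the computed width is 8 rather than 32, since the loads determine
// the natural element size for vectorization.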
14856 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
14857 SmallPtrSet<Instruction *, 16> Visited;
14858 if (auto *I = dyn_cast<Instruction>(V)) {
14859 Worklist.emplace_back(I, I->getParent(), 0);
14860 Visited.insert(I);
14861 }
14862
14863 // Traverse the expression tree in bottom-up order looking for loads. If we
14864 // encounter an instruction we don't yet handle, we give up.
14865 auto Width = 0u;
14866 Value *FirstNonBool = nullptr;
14867 while (!Worklist.empty()) {
14868 auto [I, Parent, Level] = Worklist.pop_back_val();
14869
14870 // We should only be looking at scalar instructions here. If the current
14871 // instruction has a vector type, skip.
14872 auto *Ty = I->getType();
14873 if (isa<VectorType>(Ty))
14874 continue;
14875 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
14876 FirstNonBool = I;
14877 if (Level > RecursionMaxDepth)
14878 continue;
14879
14880 // If the current instruction is a load, update Width to reflect the
14881 // width of the loaded value.
14882 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
14883 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
14884
14885 // Otherwise, we need to visit the operands of the instruction. We only
14886 // handle the interesting cases from buildTree here. If an operand is an
14887 // instruction we haven't yet visited and from the same basic block as the
14888 // user or the use is a PHI node, we add it to the worklist.
14889 if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
14890 BinaryOperator, UnaryOperator>(I)) {
14891 for (Use &U : I->operands()) {
14892 if (auto *J = dyn_cast<Instruction>(U.get()))
14893 if (Visited.insert(J).second &&
14894 (isa<PHINode>(I) || J->getParent() == Parent)) {
14895 Worklist.emplace_back(J, J->getParent(), Level + 1);
14896 continue;
14897 }
14898 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
14899 FirstNonBool = U.get();
14900 }
14901 } else {
14902 break;
14903 }
14904 }
14905
14906 // If we didn't encounter a memory access in the expression tree, or if we
14907 // gave up for some reason, just return the width of V. Otherwise, return the
14908 // maximum width we found.
14909 if (!Width) {
14910 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
14911 V = FirstNonBool;
14912 Width = DL->getTypeSizeInBits(V->getType());
14913 }
14914
14915 for (Instruction *I : Visited)
14916 InstrElementSize[I] = Width;
14917
14918 return Width;
14919}
14920
14921bool BoUpSLP::collectValuesToDemote(
14922 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
14923 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
14924 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
14925 bool IsTruncRoot) const {
14926 // We can always demote constants.
14927 if (all_of(E.Scalars, IsaPred<Constant>))
14928 return true;
14929
14930 unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
14931 if (OrigBitWidth == BitWidth) {
14932 MaxDepthLevel = 1;
14933 return true;
14934 }
14935
14936 // If the value is not a vectorized instruction in the expression and not used
14937 // by the insertelement instruction and not used in multiple vector nodes, it
14938 // cannot be demoted.
14939 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
14940 return !isKnownNonNegative(R, SimplifyQuery(*DL));
14941 });
14942 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
14943 if (MultiNodeScalars.contains(V))
14944 return false;
14945 // For a last shuffle of sext/zext with many uses we need to check the extra
14946 // bit for unsigned values, otherwise we may produce incorrect casts for the
14947 // reused scalars.
14948 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
14949 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
14950 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14951 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14952 return true;
14953 }
14954 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
14955 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14956 if (IsSignedNode)
14957 ++BitWidth1;
14958 if (auto *I = dyn_cast<Instruction>(V)) {
14959 APInt Mask = DB->getDemandedBits(I);
14960 unsigned BitWidth2 =
14961 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
14962 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
14963 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
14964 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14965 break;
14966 BitWidth2 *= 2;
14967 }
14968 BitWidth1 = std::min(BitWidth1, BitWidth2);
14969 }
14970 BitWidth = std::max(BitWidth, BitWidth1);
14971 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
14972 };
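  // Illustrative example (not from the source): with OrigBitWidth == 32 and a
  // target BitWidth of 8, a scalar such as
  //   %v = zext i8 %x to i32
  // has all bits above bit 7 known to be zero, so MaskedValueIsZero succeeds
  // and IsPotentiallyTruncated returns true without widening BitWidth.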
14973 using namespace std::placeholders;
14974 auto FinalAnalysis = [&]() {
14975 if (!IsProfitableToDemote)
14976 return false;
14977 bool Res = all_of(
14978 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
14979 // Demote gathers.
14980 if (Res && E.State == TreeEntry::NeedToGather) {
14981 // Check the bases of possible extractelement instructions and the final
14982 // vector length.
14983 SmallPtrSet<Value *, 4> UniqueBases;
14984 for (Value *V : E.Scalars) {
14985 auto *EE = dyn_cast<ExtractElementInst>(V);
14986 if (!EE)
14987 continue;
14988 UniqueBases.insert(EE->getVectorOperand());
14989 }
14990 const unsigned VF = E.Scalars.size();
14991 Type *OrigScalarTy = E.Scalars.front()->getType();
14992 if (UniqueBases.size() <= 2 ||
14993 TTI->getNumberOfParts(FixedVectorType::get(OrigScalarTy, VF)) ==
14994 TTI->getNumberOfParts(FixedVectorType::get(
14995 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
14996 ToDemote.push_back(E.Idx);
14997 }
14998 return Res;
14999 };
15000 if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
15001 any_of(E.Scalars, [&](Value *V) {
15002 return all_of(V->users(), [&](User *U) {
15003 return isa<InsertElementInst>(U) && !getTreeEntry(U);
15004 });
15005 }))
15006 return FinalAnalysis();
15007
15008 if (any_of(E.Scalars, [&](Value *V) {
15009 return !all_of(V->users(), [=](User *U) {
15010 return getTreeEntry(U) ||
15011 (UserIgnoreList && UserIgnoreList->contains(U)) ||
15012 (!isa<CmpInst>(U) && U->getType()->isSized() &&
15013 !U->getType()->isScalableTy() &&
15014 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
15015 }) && !IsPotentiallyTruncated(V, BitWidth);
15016 }))
15017 return false;
15018
15019 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
15020 bool &NeedToExit) {
15021 NeedToExit = false;
15022 unsigned InitLevel = MaxDepthLevel;
15023 for (const TreeEntry *Op : Operands) {
15024 unsigned Level = InitLevel;
15025 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
15026 ToDemote, Visited, Level, IsProfitableToDemote,
15027 IsTruncRoot)) {
15028 if (!IsProfitableToDemote)
15029 return false;
15030 NeedToExit = true;
15031 if (!FinalAnalysis())
15032 return false;
15033 continue;
15034 }
15035 MaxDepthLevel = std::max(MaxDepthLevel, Level);
15036 }
15037 return true;
15038 };
15039 auto AttemptCheckBitwidth =
15040 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
15041 // Try all bitwidth < OrigBitWidth.
15042 NeedToExit = false;
15043 unsigned BestFailBitwidth = 0;
15044 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
15045 if (Checker(BitWidth, OrigBitWidth))
15046 return true;
15047 if (BestFailBitwidth == 0 && FinalAnalysis())
15048 BestFailBitwidth = BitWidth;
15049 }
15050 if (BitWidth >= OrigBitWidth) {
15051 if (BestFailBitwidth == 0) {
15052 BitWidth = OrigBitWidth;
15053 return false;
15054 }
15055 MaxDepthLevel = 1;
15056 BitWidth = BestFailBitwidth;
15057 NeedToExit = true;
15058 return true;
15059 }
15060 return false;
15061 };
15062 auto TryProcessInstruction =
15063 [&](unsigned &BitWidth,
15064 ArrayRef<const TreeEntry *> Operands = std::nullopt,
15065 function_ref<bool(unsigned, unsigned)> Checker = {}) {
15066 if (Operands.empty()) {
15067 if (!IsTruncRoot)
15068 MaxDepthLevel = 1;
15069 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15070 std::ref(BitWidth)));
15071 } else {
15072 // Several vectorized uses? Check if we can truncate it, otherwise -
15073 // exit.
15074 if (E.UserTreeIndices.size() > 1 &&
15075 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15076 std::ref(BitWidth))))
15077 return false;
15078 bool NeedToExit = false;
15079 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15080 return false;
15081 if (NeedToExit)
15082 return true;
15083 if (!ProcessOperands(Operands, NeedToExit))
15084 return false;
15085 if (NeedToExit)
15086 return true;
15087 }
15088
15089 ++MaxDepthLevel;
15090 // Record the entry that we can demote.
15091 ToDemote.push_back(E.Idx);
15092 return IsProfitableToDemote;
15093 };
15094 switch (E.getOpcode()) {
15095
15096 // We can always demote truncations and extensions. Since truncations can
15097 // seed additional demotion, we save the truncated value.
15098 case Instruction::Trunc:
15099 if (IsProfitableToDemoteRoot)
15100 IsProfitableToDemote = true;
15101 return TryProcessInstruction(BitWidth);
15102 case Instruction::ZExt:
15103 case Instruction::SExt:
15104 IsProfitableToDemote = true;
15105 return TryProcessInstruction(BitWidth);
15106
15107 // We can demote certain binary operations if we can demote both of their
15108 // operands.
15109 case Instruction::Add:
15110 case Instruction::Sub:
15111 case Instruction::Mul:
15112 case Instruction::And:
15113 case Instruction::Or:
15114 case Instruction::Xor: {
15115 return TryProcessInstruction(
15116 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
15117 }
15118 case Instruction::Shl: {
15119 // If we are truncating the result of this SHL, and if it's a shift of an
15120 // in-range amount, we can always perform a SHL in a smaller type.
15121 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
15122 return all_of(E.Scalars, [&](Value *V) {
15123 auto *I = cast<Instruction>(V);
15124 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15125 return AmtKnownBits.getMaxValue().ult(BitWidth);
15126 });
15127 };
15128 return TryProcessInstruction(
15129 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
15130 }
15131 case Instruction::LShr: {
15132 // If this is a truncate of a logical shr, we can truncate it to a smaller
15133 // lshr iff we know that the bits we would otherwise be shifting in are
15134 // already zeros.
15135 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15136 return all_of(E.Scalars, [&](Value *V) {
15137 auto *I = cast<Instruction>(V);
15138 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15139 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15140 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15141 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
15142 SimplifyQuery(*DL));
15143 });
15144 };
15145 return TryProcessInstruction(
15146 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15147 LShrChecker);
15148 }
15149 case Instruction::AShr: {
15150 // If this is a truncate of an arithmetic shr, we can truncate it to a
15151 // smaller ashr iff we know that all the bits from the sign bit of the
15152 // original type and the sign bit of the truncate type are similar.
15153 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15154 return all_of(E.Scalars, [&](Value *V) {
15155 auto *I = cast<Instruction>(V);
15156 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15157 unsigned ShiftedBits = OrigBitWidth - BitWidth;
15158 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15159 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15160 nullptr, DT);
15161 });
15162 };
15163 return TryProcessInstruction(
15164 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15165 AShrChecker);
15166 }
15167 case Instruction::UDiv:
15168 case Instruction::URem: {
15169 // UDiv and URem can be truncated if all the truncated bits are zero.
15170 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15171 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15172 return all_of(E.Scalars, [&](Value *V) {
15173 auto *I = cast<Instruction>(V);
15174 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15175 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
15176 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15177 });
15178 };
15179 return TryProcessInstruction(
15180 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
15181 }
15182
15183 // We can demote selects if we can demote their true and false values.
15184 case Instruction::Select: {
15185 return TryProcessInstruction(
15186 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
15187 }
15188
15189 // We can demote phis if we can demote all their incoming operands. Note that
15190 // we don't need to worry about cycles since we ensure single use above.
15191 case Instruction::PHI: {
15192 const unsigned NumOps = E.getNumOperands();
15193 SmallVector<const TreeEntry *> Ops(NumOps, nullptr);
15194 transform(seq<unsigned>(0, NumOps), Ops.begin(),
15195 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
15196
15197 return TryProcessInstruction(BitWidth, Ops);
15198 }
15199
15200 case Instruction::Call: {
15201 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
15202 if (!IC)
15203 break;
15204 Intrinsic::ID ID = IC->getIntrinsicID();
15205 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
15206 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
15207 break;
15208 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
15209 function_ref<bool(unsigned, unsigned)> CallChecker;
15210 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15211 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15212 return all_of(E.Scalars, [&](Value *V) {
15213 auto *I = cast<Instruction>(V);
15214 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15215 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15216 return MaskedValueIsZero(I->getOperand(0), Mask,
15217 SimplifyQuery(*DL)) &&
15218 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15219 }
15220 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
15221 "Expected min/max intrinsics only.");
15222 unsigned SignBits = OrigBitWidth - BitWidth;
15223 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
15224 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15225 nullptr, DT);
15226 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
15227 nullptr, DT);
15228 return SignBits <= Op0SignBits &&
15229 ((SignBits != Op0SignBits &&
15230 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
15231 MaskedValueIsZero(I->getOperand(0), Mask,
15232 SimplifyQuery(*DL))) &&
15233 SignBits <= Op1SignBits &&
15234 ((SignBits != Op1SignBits &&
15235 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
15236 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
15237 });
15238 };
15239 if (ID != Intrinsic::abs) {
15240 Operands.push_back(getOperandEntry(&E, 1));
15241 CallChecker = CompChecker;
15242 }
15243 InstructionCost BestCost =
15244 std::numeric_limits<InstructionCost::CostType>::max();
15245 unsigned BestBitWidth = BitWidth;
15246 unsigned VF = E.Scalars.size();
15247 // Choose the best bitwidth based on cost estimations.
15248 auto Checker = [&](unsigned BitWidth, unsigned) {
15249 unsigned MinBW = PowerOf2Ceil(BitWidth);
15250 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
15251 auto VecCallCosts = getVectorCallCosts(
15252 IC,
15253 FixedVectorType::get(IntegerType::get(IC->getContext(), MinBW), VF),
15254 TTI, TLI, ArgTys);
15255 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
15256 if (Cost < BestCost) {
15257 BestCost = Cost;
15258 BestBitWidth = BitWidth;
15259 }
15260 return false;
15261 };
15262 [[maybe_unused]] bool NeedToExit;
15263 (void)AttemptCheckBitwidth(Checker, NeedToExit);
15264 BitWidth = BestBitWidth;
15265 return TryProcessInstruction(BitWidth, Operands, CallChecker);
15266 }
15267
15268 // Otherwise, conservatively give up.
15269 default:
15270 break;
15271 }
15272 MaxDepthLevel = 1;
15273 return FinalAnalysis();
15274}
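// Illustrative example (not from the source): for the scalar chain
//   %za = zext i8 %a to i32
//   %zb = zext i8 %b to i32
//   %s  = add i32 %za, %zb
//   %t  = trunc i32 %s to i8
// the Add entry can be demoted to 8 bits: only the low 8 bits of %s are used,
// so the Trunc/ZExt nodes seed the demotion and the Add's index is recorded
// in ToDemote.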
15275
15276static RecurKind getRdxKind(Value *V);
15277
15278void BoUpSLP::computeMinimumValueSizes() {
15279 // We only attempt to truncate integer expressions.
15280 bool IsStoreOrInsertElt =
15281 VectorizableTree.front()->getOpcode() == Instruction::Store ||
15282 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15283 if ((IsStoreOrInsertElt || UserIgnoreList) &&
15284 ExtraBitWidthNodes.size() <= 1 &&
15285 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15286 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15287 return;
15288
15289 unsigned NodeIdx = 0;
15290 if (IsStoreOrInsertElt &&
15291 VectorizableTree.front()->State != TreeEntry::NeedToGather)
15292 NodeIdx = 1;
15293
15294 // Ensure the roots of the vectorizable tree don't form a cycle.
15295 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
15296 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
15297 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15298 [NodeIdx](const EdgeInfo &EI) {
15299 return EI.UserTE->Idx >
15300 static_cast<int>(NodeIdx);
15301 })))
15302 return;
15303
15304 // If the first value node for a store/insertelement is a sext/zext/trunc,
15305 // skip it and resize to the final type.
15306 bool IsTruncRoot = false;
15307 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15308 SmallVector<unsigned> RootDemotes;
15309 if (NodeIdx != 0 &&
15310 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15311 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15312 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
15313 IsTruncRoot = true;
15314 RootDemotes.push_back(NodeIdx);
15315 IsProfitableToDemoteRoot = true;
15316 ++NodeIdx;
15317 }
15318
15319 // The reduction has already been analyzed and found not profitable - exit.
15320 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
15321 return;
15322
15323 SmallVector<unsigned> ToDemote;
15324 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
15325 bool IsProfitableToDemoteRoot, unsigned Opcode,
15326 unsigned Limit, bool IsTruncRoot,
15327 bool IsSignedCmp) {
15328 ToDemote.clear();
15329 unsigned VF = E.getVectorFactor();
15330 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
15331 if (!TreeRootIT || !Opcode)
15332 return 0u;
15333
15334 if (any_of(E.Scalars,
15335 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
15336 return 0u;
15337
15338 unsigned NumParts =
15339 TTI->getNumberOfParts(FixedVectorType::get(TreeRootIT, VF));
15340
15341 // The maximum bit width required to represent all the values that can be
15342 // demoted without loss of precision. It would be safe to truncate the roots
15343 // of the expression to this width.
15344 unsigned MaxBitWidth = 1u;
15345
15346 // True if the roots can be zero-extended back to their original type,
15347 // rather than sign-extended. We know that if the leading bits are not
15348 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
15349 // True.
15350 // Determine if the sign bit of all the roots is known to be zero. If not,
15351 // IsKnownPositive is set to False.
15352 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
15353 KnownBits Known = computeKnownBits(R, *DL);
15354 return Known.isNonNegative();
15355 });
15356
15357 // We first check if all the bits of the roots are demanded. If they're not,
15358 // we can truncate the roots to this narrower type.
15359 for (Value *Root : E.Scalars) {
15360 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
15361 TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
15362 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15363 // If we can't prove that the sign bit is zero, we must add one to the
15364 // maximum bit width to account for the unknown sign bit. This preserves
15365 // the existing sign bit so we can safely sign-extend the root back to the
15366 // original type. Otherwise, if we know the sign bit is zero, we will
15367 // zero-extend the root instead.
15368 //
15369 // FIXME: This is somewhat suboptimal, as there will be cases where adding
15370 // one to the maximum bit width will yield a larger-than-necessary
15371 // type. In general, we need to add an extra bit only if we can't
15372 // prove that the upper bit of the original type is equal to the
15373 // upper bit of the proposed smaller type. If these two bits are
15374 // the same (either zero or one) we know that sign-extending from
15375 // the smaller type will result in the same value. Here, since we
15376 // can't yet prove this, we are just making the proposed smaller
15377 // type larger to ensure correctness.
15378 if (!IsKnownPositive)
15379 ++BitWidth1;
15380
15381 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
15382 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15383 MaxBitWidth =
15384 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
15385 }
15386
15387 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15388 MaxBitWidth = 8;
15389
15390 // If the original type is large but the reduced type does not improve the
15391 // register usage - ignore it.
15392 if (NumParts > 1 &&
15393 NumParts ==
15394 TTI->getNumberOfParts(FixedVectorType::get(
15395 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
15396 return 0u;
15397
15398 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15399 Opcode == Instruction::SExt ||
15400 Opcode == Instruction::ZExt || NumParts > 1;
15401 // Conservatively determine if we can actually truncate the roots of the
15402 // expression. Collect the values that can be demoted in ToDemote and
15403 // additional roots that require investigating in Roots.
15404 DenseSet<const TreeEntry *> Visited;
15405 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15406 bool NeedToDemote = IsProfitableToDemote;
15407
15408 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
15409 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
15410 IsTruncRoot) ||
15411 (MaxDepthLevel <= Limit &&
15412 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15413 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15414 DL->getTypeSizeInBits(TreeRootIT) /
15415 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
15416 ->getOperand(0)
15417 ->getType()) >
15418 2)))))
15419 return 0u;
15420 // Round MaxBitWidth up to the next power-of-two.
15421 MaxBitWidth = bit_ceil(MaxBitWidth);
15422
15423 return MaxBitWidth;
15424 };
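  // Illustrative example (not from the source): if every root is an i32 value
  // known to be non-negative and to fit in 8 bits, ComputeNumSignBits reports
  // at least 24 sign bits, so BitWidth1 = 32 - 24 = 8 and no extra bit is
  // added (IsKnownPositive); assuming all result bits are demanded, MaxBitWidth
  // becomes 8 and bit_ceil leaves it at 8.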
15425
15426 // If we can truncate the root, we must collect additional values that might
15427 // be demoted as a result. That is, those seeded by truncations we will
15428 // modify.
15429 // Add reduction ops sizes, if any.
15430 if (UserIgnoreList &&
15431 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
15432 for (Value *V : *UserIgnoreList) {
15433 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15434 auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
15435 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15436 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
15437 ++BitWidth1;
15438 unsigned BitWidth2 = BitWidth1;
15439 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
15440 auto Mask = DB->getDemandedBits(cast<Instruction>(V));
15441 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15442 }
15443 ReductionBitWidth =
15444 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15445 }
15446 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15447 ReductionBitWidth = 8;
15448
15449 ReductionBitWidth = bit_ceil(ReductionBitWidth);
15450 }
15451 bool IsTopRoot = NodeIdx == 0;
15452 while (NodeIdx < VectorizableTree.size() &&
15453 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15454 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15455 RootDemotes.push_back(NodeIdx);
15456 ++NodeIdx;
15457 IsTruncRoot = true;
15458 }
15459 bool IsSignedCmp = false;
15460 while (NodeIdx < VectorizableTree.size()) {
15461 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15462 unsigned Limit = 2;
15463 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15464 if (IsTopRoot &&
15465 ReductionBitWidth ==
15466 DL->getTypeSizeInBits(
15467 VectorizableTree.front()->Scalars.front()->getType()))
15468 Limit = 3;
15469 unsigned MaxBitWidth = ComputeMaxBitWidth(
15470 *VectorizableTree[NodeIdx].get(), IsTopRoot, IsProfitableToDemoteRoot,
15471 Opcode, Limit, IsTruncRoot, IsSignedCmp);
15472 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15473 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15474 ReductionBitWidth = bit_ceil(MaxBitWidth);
15475 else if (MaxBitWidth == 0)
15476 ReductionBitWidth = 0;
15477 }
15478
15479 for (unsigned Idx : RootDemotes) {
15480 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
15481 uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
15482 if (OrigBitWidth > MaxBitWidth) {
15483 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
15484 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
15485 }
15486 return false;
15487 }))
15488 ToDemote.push_back(Idx);
15489 }
15490 RootDemotes.clear();
15491 IsTopRoot = false;
15492 IsProfitableToDemoteRoot = true;
15493
15494 if (ExtraBitWidthNodes.empty()) {
15495 NodeIdx = VectorizableTree.size();
15496 } else {
15497 unsigned NewIdx = 0;
15498 do {
15499 NewIdx = *ExtraBitWidthNodes.begin();
15500 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
15501 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15502 NodeIdx = NewIdx;
15503 IsTruncRoot =
15504 NodeIdx < VectorizableTree.size() &&
15505 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15506 [](const EdgeInfo &EI) {
15507 return EI.EdgeIdx == 0 &&
15508 EI.UserTE->getOpcode() == Instruction::Trunc &&
15509 !EI.UserTE->isAltShuffle();
15510 });
15511 IsSignedCmp =
15512 NodeIdx < VectorizableTree.size() &&
15513 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15514 [&](const EdgeInfo &EI) {
15515 return EI.UserTE->getOpcode() == Instruction::ICmp &&
15516 any_of(EI.UserTE->Scalars, [&](Value *V) {
15517 auto *IC = dyn_cast<ICmpInst>(V);
15518 return IC &&
15519 (IC->isSigned() ||
15520 !isKnownNonNegative(IC->getOperand(0),
15521 SimplifyQuery(*DL)) ||
15522 !isKnownNonNegative(IC->getOperand(1),
15523 SimplifyQuery(*DL)));
15524 });
15525 });
15526 }
15527
15528 // If the maximum bit width we compute is less than the width of the roots'
15529 // type, we can proceed with the narrowing. Otherwise, do nothing.
15530 if (MaxBitWidth == 0 ||
15531 MaxBitWidth >=
15532 cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
15533 if (UserIgnoreList)
15534 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
15535 continue;
15536 }
15537
15538 // Finally, map the values we can demote to the maximum bit width we
15539 // computed.
15540 for (unsigned Idx : ToDemote) {
15541 TreeEntry *TE = VectorizableTree[Idx].get();
15542 if (MinBWs.contains(TE))
15543 continue;
15544 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
15545 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15546 });
15547 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
15548 }
15549 }
15550}
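// Illustrative summary (assumption about downstream use, not from the source):
// if the analysis above decides that an i32 expression tree only needs 8 bits,
// each demotable tree entry gets a MinBWs record of (8, IsSigned); during
// vectorizeTree() the corresponding vector operations can then be emitted on
// narrower integer vectors and extended back to i32 only where the tree is
// used externally.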
15551
15552PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15553 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
15554 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
15555 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
15556 auto *AA = &AM.getResult<AAManager>(F);
15557 auto *LI = &AM.getResult<LoopAnalysis>(F);
15558 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
15559 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
15560 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
15561 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
15562
15563 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
15564 if (!Changed)
15565 return PreservedAnalyses::all();
15566
15567 PreservedAnalyses PA;
15568 PA.preserveSet<CFGAnalyses>();
15569 return PA;
15570}
15571
15572bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15573 TargetTransformInfo *TTI_,
15574 TargetLibraryInfo *TLI_, AAResults *AA_,
15575 LoopInfo *LI_, DominatorTree *DT_,
15576 AssumptionCache *AC_, DemandedBits *DB_,
15577 OptimizationRemarkEmitter *ORE_) {
15578 if (!RunSLPVectorization)
15579 return false;
15580 SE = SE_;
15581 TTI = TTI_;
15582 TLI = TLI_;
15583 AA = AA_;
15584 LI = LI_;
15585 DT = DT_;
15586 AC = AC_;
15587 DB = DB_;
15588 DL = &F.getDataLayout();
15589
15590 Stores.clear();
15591 GEPs.clear();
15592 bool Changed = false;
15593
15594 // If the target claims to have no vector registers don't attempt
15595 // vectorization.
15596 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
15597 LLVM_DEBUG(
15598 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15599 return false;
15600 }
15601
15602 // Don't vectorize when the attribute NoImplicitFloat is used.
15603 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
15604 return false;
15605
15606 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15607
15608 // Use the bottom up slp vectorizer to construct chains that start with
15609 // store instructions.
15610 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15611
15612 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15613 // delete instructions.
15614
15615 // Update DFS numbers now so that we can use them for ordering.
15616 DT->updateDFSNumbers();
15617
15618 // Scan the blocks in the function in post order.
15619 for (auto *BB : post_order(&F.getEntryBlock())) {
15620 // Start new block - clear the list of reduction roots.
15621 R.clearReductionData();
15622 collectSeedInstructions(BB);
15623
15624 // Vectorize trees that end at stores.
15625 if (!Stores.empty()) {
15626 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15627 << " underlying objects.\n");
15628 Changed |= vectorizeStoreChains(R);
15629 }
15630
15631 // Vectorize trees that end at reductions.
15632 Changed |= vectorizeChainsInBlock(BB, R);
15633
15634 // Vectorize the index computations of getelementptr instructions. This
15635 // is primarily intended to catch gather-like idioms ending at
15636 // non-consecutive loads.
15637 if (!GEPs.empty()) {
15638 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15639 << " underlying objects.\n");
15640 Changed |= vectorizeGEPIndices(BB, R);
15641 }
15642 }
15643
15644 if (Changed) {
15645 R.optimizeGatherSequence();
15646 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15647 }
15648 return Changed;
15649}
15650
15651std::optional<bool>
15652SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15653 unsigned Idx, unsigned MinVF,
15654 unsigned &Size) {
15655 Size = 0;
15656 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15657 << "\n");
15658 const unsigned Sz = R.getVectorElementSize(Chain[0]);
15659 unsigned VF = Chain.size();
15660
15661 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
15662 // Check if vectorizing with a non-power-of-2 VF should be considered. At
15663 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15664 // all vector lanes are used.
15665 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15666 return false;
15667 }
15668
15669 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15670 << "\n");
15671
15672 SetVector<Value *> ValOps;
15673 for (Value *V : Chain)
15674 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
15675 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
15676 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
15677 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
15678 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
15679 bool IsPowerOf2 =
15680 isPowerOf2_32(ValOps.size()) ||
15681 (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
15682 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15683 (!S.MainOp->isSafeToRemove() ||
15684 any_of(ValOps.getArrayRef(),
15685 [&](Value *V) {
15686 return !isa<ExtractElementInst>(V) &&
15687 (V->getNumUses() > Chain.size() ||
15688 any_of(V->users(), [&](User *U) {
15689 return !Stores.contains(U);
15690 }));
15691 }))) ||
15692 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
15693 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15694 return false;
15695 }
15696 }
15697 if (R.isLoadCombineCandidate(Chain))
15698 return true;
15699 R.buildTree(Chain);
15700 // Check if the tree is tiny and the store itself or its value is not vectorized.
15701 if (R.isTreeTinyAndNotFullyVectorizable()) {
15702 if (R.isGathered(Chain.front()) ||
15703 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
15704 return std::nullopt;
15705 Size = R.getTreeSize();
15706 return false;
15707 }
15708 R.reorderTopToBottom();
15709 R.reorderBottomToTop();
15710 R.buildExternalUses();
15711
15712 R.computeMinimumValueSizes();
15713 R.transformNodes();
15714
15715 Size = R.getTreeSize();
15716 if (S.getOpcode() == Instruction::Load)
15717 Size = 2; // cut off masked gather small trees
15718 InstructionCost Cost = R.getTreeCost();
15719
15720 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15721 if (Cost < -SLPCostThreshold) {
15722 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15723
15724 using namespace ore;
15725
15726 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
15727 cast<StoreInst>(Chain[0]))
15728 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15729 << " and with tree size "
15730 << NV("TreeSize", R.getTreeSize()));
15731
15732 R.vectorizeTree();
15733 return true;
15734 }
15735
15736 return false;
15737}
15738
15739/// Checks if the quadratic mean deviation of the tree sizes is less than about one ninth of the mean size (i.e. the sizes are roughly uniform).
15740static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
15741 bool First) {
15742 unsigned Num = 0;
15743 uint64_t Sum = std::accumulate(
15744 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
15745 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15746 unsigned Size = First ? Val.first : Val.second;
15747 if (Size == 1)
15748 return V;
15749 ++Num;
15750 return V + Size;
15751 });
15752 if (Num == 0)
15753 return true;
15754 uint64_t Mean = Sum / Num;
15755 if (Mean == 0)
15756 return true;
15757 uint64_t Dev = std::accumulate(
15758 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
15759 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15760 unsigned P = First ? Val.first : Val.second;
15761 if (P == 1)
15762 return V;
15763 return V + (P - Mean) * (P - Mean);
15764 }) /
15765 Num;
15766 return Dev * 81 / (Mean * Mean) == 0;
15767}
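// Worked example (illustrative, not from the source): for selected sizes
// {4, 4, 4, 4} the mean is 4 and the accumulated deviation is 0, so
// 0 * 81 / 16 == 0 and the check passes; for sizes {2, 8} the mean is 5 and
// the deviation is 9, so 9 * 81 / 25 != 0 and the check fails. Entries equal
// to 1 are skipped entirely.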
15768
15769bool SLPVectorizerPass::vectorizeStores(
15770 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
15771 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
15772 &Visited) {
15773 // We may run into multiple chains that merge into a single chain. We mark the
15774 // stores that we vectorized so that we don't visit the same store twice.
15775 BoUpSLP::ValueSet VectorizedStores;
15776 bool Changed = false;
15777
15778 struct StoreDistCompare {
15779 bool operator()(const std::pair<unsigned, int> &Op1,
15780 const std::pair<unsigned, int> &Op2) const {
15781 return Op1.second < Op2.second;
15782 }
15783 };
15784 // A set of pairs (index of store in Stores array ref, Distance of the store
15785 // address relative to base store address in units).
15786 using StoreIndexToDistSet =
15787 std::set<std::pair<unsigned, int>, StoreDistCompare>;
15788 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
15789 int PrevDist = -1;
15790 BoUpSLP::ValueList Operands;
15791 // Collect the chain into a list.
15792 for (auto [Idx, Data] : enumerate(Set)) {
15793 if (Operands.empty() || Data.second - PrevDist == 1) {
15794 Operands.push_back(Stores[Data.first]);
15795 PrevDist = Data.second;
15796 if (Idx != Set.size() - 1)
15797 continue;
15798 }
15799 auto E = make_scope_exit([&, &DataVar = Data]() {
15800 Operands.clear();
15801 Operands.push_back(Stores[DataVar.first]);
15802 PrevDist = DataVar.second;
15803 });
15804
15805 if (Operands.size() <= 1 ||
15806 !Visited
15807 .insert({Operands.front(),
15808 cast<StoreInst>(Operands.front())->getValueOperand(),
15809 Operands.back(),
15810 cast<StoreInst>(Operands.back())->getValueOperand(),
15811 Operands.size()})
15812 .second)
15813 continue;
15814
15815 unsigned MaxVecRegSize = R.getMaxVecRegSize();
15816 unsigned EltSize = R.getVectorElementSize(Operands[0]);
15817 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
15818
15819 unsigned MaxVF =
15820 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
15821 unsigned MaxRegVF = MaxVF;
15822 auto *Store = cast<StoreInst>(Operands[0]);
15823 Type *StoreTy = Store->getValueOperand()->getType();
15824 Type *ValueTy = StoreTy;
15825 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
15826 ValueTy = Trunc->getSrcTy();
15827 if (ValueTy == StoreTy &&
15828 R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
15829 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
15830 unsigned MinVF = std::max<unsigned>(
15831 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
15832 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
15833 ValueTy)));
15834
15835 if (MaxVF < MinVF) {
15836 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
15837 << ") < "
15838 << "MinVF (" << MinVF << ")\n");
15839 continue;
15840 }
15841
15842 unsigned NonPowerOf2VF = 0;
15843 if (VectorizeNonPowerOf2) {
15844 // First try vectorizing with a non-power-of-2 VF. At the moment, only
15845 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
15846 // lanes are used.
15847 unsigned CandVF = Operands.size();
15848 if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
15849 NonPowerOf2VF = CandVF;
15850 }
15851
15852 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
15853 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
15854 unsigned Size = MinVF;
15855 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
15856 VF = Size > MaxVF ? NonPowerOf2VF : Size;
15857 Size *= 2;
15858 });
15859 unsigned End = Operands.size();
15860 unsigned Repeat = 0;
15861 constexpr unsigned MaxAttempts = 4;
15862 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
15863 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
15864 P.first = P.second = 1;
15865 });
15866 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
15867 auto IsNotVectorized = [](bool First,
15868 const std::pair<unsigned, unsigned> &P) {
15869 return First ? P.first > 0 : P.second > 0;
15870 };
15871 auto IsVectorized = [](bool First,
15872 const std::pair<unsigned, unsigned> &P) {
15873 return First ? P.first == 0 : P.second == 0;
15874 };
15875 auto VFIsProfitable = [](bool First, unsigned Size,
15876 const std::pair<unsigned, unsigned> &P) {
15877 return First ? Size >= P.first : Size >= P.second;
15878 };
15879 auto FirstSizeSame = [](unsigned Size,
15880 const std::pair<unsigned, unsigned> &P) {
15881 return Size == P.first;
15882 };
15883 while (true) {
15884 ++Repeat;
15885 bool RepeatChanged = false;
15886 bool AnyProfitableGraph = false;
15887 for (unsigned Size : CandidateVFs) {
15888 AnyProfitableGraph = false;
15889 unsigned StartIdx = std::distance(
15890 RangeSizes.begin(),
15891 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
15892 std::placeholders::_1)));
15893 while (StartIdx < End) {
15894 unsigned EndIdx =
15895 std::distance(RangeSizes.begin(),
15896 find_if(RangeSizes.drop_front(StartIdx),
15897 std::bind(IsVectorized, Size >= MaxRegVF,
15898 std::placeholders::_1)));
15899 unsigned Sz = EndIdx >= End ? End : EndIdx;
15900 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
15901 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
15902 Size >= MaxRegVF)) {
15903 ++Cnt;
15904 continue;
15905 }
15906 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
15907 assert(all_of(Slice,
15908 [&](Value *V) {
15909 return cast<StoreInst>(V)
15910 ->getValueOperand()
15911 ->getType() ==
15912 cast<StoreInst>(Slice.front())
15913 ->getValueOperand()
15914 ->getType();
15915 }) &&
15916 "Expected all operands of same type.");
15917 if (!NonSchedulable.empty()) {
15918 auto [NonSchedSizeMax, NonSchedSizeMin] =
15919 NonSchedulable.lookup(Slice.front());
15920 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
15921 Cnt += NonSchedSizeMax;
15922 continue;
15923 }
15924 }
15925 unsigned TreeSize;
15926 std::optional<bool> Res =
15927 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
15928 if (!Res) {
15929 NonSchedulable
15930 .try_emplace(Slice.front(), std::make_pair(Size, Size))
15931 .first->getSecond()
15932 .second = Size;
15933 } else if (*Res) {
15934 // Mark the vectorized stores so that we don't vectorize them
15935 // again.
15936 VectorizedStores.insert(Slice.begin(), Slice.end());
15939 AnyProfitableGraph = RepeatChanged = Changed = true;
15940 // If we vectorized initial block, no need to try to vectorize
15941 // it again.
15942 for_each(RangeSizes.slice(Cnt, Size),
15943 [](std::pair<unsigned, unsigned> &P) {
15944 P.first = P.second = 0;
15945 });
15946 if (Cnt < StartIdx + MinVF) {
15947 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
15948 [](std::pair<unsigned, unsigned> &P) {
15949 P.first = P.second = 0;
15950 });
15951 StartIdx = Cnt + Size;
15952 }
15953 if (Cnt > Sz - Size - MinVF) {
15954 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
15955 [](std::pair<unsigned, unsigned> &P) {
15956 P.first = P.second = 0;
15957 });
15958 if (Sz == End)
15959 End = Cnt;
15960 Sz = Cnt;
15961 }
15962 Cnt += Size;
15963 continue;
15964 }
15965 if (Size > 2 && Res &&
15966 !all_of(RangeSizes.slice(Cnt, Size),
15967 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
15968 std::placeholders::_1))) {
15969 Cnt += Size;
15970 continue;
15971 }
15972 // For very big VFs, check that we are not rebuilding the same trees,
15973 // just with a larger number of elements.
15974 if (Size > MaxRegVF && TreeSize > 1 &&
15975 all_of(RangeSizes.slice(Cnt, Size),
15976 std::bind(FirstSizeSame, TreeSize,
15977 std::placeholders::_1))) {
15978 Cnt += Size;
15979 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
15980 ++Cnt;
15981 continue;
15982 }
15983 if (TreeSize > 1)
15984 for_each(RangeSizes.slice(Cnt, Size),
15985 [&](std::pair<unsigned, unsigned> &P) {
15986 if (Size >= MaxRegVF)
15987 P.second = std::max(P.second, TreeSize);
15988 else
15989 P.first = std::max(P.first, TreeSize);
15990 });
15991 ++Cnt;
15992 AnyProfitableGraph = true;
15993 }
15994 if (StartIdx >= End)
15995 break;
15996 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
15997 AnyProfitableGraph = true;
15998 StartIdx = std::distance(
15999 RangeSizes.begin(),
16000 find_if(RangeSizes.drop_front(Sz),
16001 std::bind(IsNotVectorized, Size >= MaxRegVF,
16002 std::placeholders::_1)));
16003 }
16004 if (!AnyProfitableGraph && Size >= MaxRegVF)
16005 break;
16006 }
16007 // All values vectorized - exit.
16008 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
16009 return P.first == 0 && P.second == 0;
16010 }))
16011 break;
16012 // Check if we tried all attempts or there is no need for further attempts.
16013 if (Repeat >= MaxAttempts ||
16014 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
16015 break;
16016 constexpr unsigned StoresLimit = 64;
16017 const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
16018 Operands.size(),
16019 static_cast<unsigned>(
16020 End -
16021 std::distance(
16022 RangeSizes.begin(),
16023 find_if(RangeSizes, std::bind(IsNotVectorized, true,
16024 std::placeholders::_1))) +
16025 1)));
16026 unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
16027 if (VF > MaxTotalNum || VF >= StoresLimit)
16028 break;
16029 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
16030 if (P.first != 0)
16031 P.first = std::max(P.second, P.first);
16032 });
16033 // Make a last attempt to vectorize the maximum number of elements, if all
16034 // previous attempts were unsuccessful because of cost issues.
16035 CandidateVFs.clear();
16036 CandidateVFs.push_back(VF);
16037 }
16038 }
16039 };
16040
16041 // Stores pair (first: index of the store into Stores array ref, address of
16042 // which taken as base, second: sorted set of pairs {index, dist}, which are
16043 // indices of stores in the set and their store location distances relative to
16044 // the base address).
16045
16046 // Need to store the index of the very first store separately, since the set
16047 // may be reordered after the insertion and the first store may be moved. This
16048 // container allows to reduce number of calls of getPointersDiff() function.
16050 // Inserts the specified store SI with the given index Idx to the set of the
16051 // stores. If the store with the same distance is found already - stop
16052 // insertion, try to vectorize already found stores. If some stores from this
16053 // sequence were not vectorized - try to vectorize them with the new store
16054 // later. But this logic is applied only to the stores, that come before the
16055 // previous store with the same distance.
16056 // Example:
16057 // 1. store x, %p
16058 // 2. store y, %p+1
16059 // 3. store z, %p+2
16060 // 4. store a, %p
16061 // 5. store b, %p+3
16062 // - Scan this from the last to first store. The very first bunch of stores is
16063 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
16064 // vector).
16065 // - The next store in the list - #1 - has the same distance from store #5 as
16066 // the store #4.
16067 // - Try to vectorize sequence of stores 4,2,3,5.
16068 // - If all these stores are vectorized - just drop them.
16069 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
16070 // - Start new stores sequence.
16071 // The new bunch of stores is {1, {1, 0}}.
16072 // - Add the stores from previous sequence, that were not vectorized.
16073 // Here we consider the stores in reversed order, rather than in the order they
16074 // appear in the IR (Stores is reversed already, see vectorizeStoreChains()).
16075 // Store #3 can be added -> comes after store #4 with the same distance as
16076 // store #1.
16077 // Store #5 cannot be added - comes before store #4.
16078 // This logic helps to improve compile time: we assume that stores after a
16079 // previous store with the same distance most likely have memory dependencies,
16080 // so there is no need to waste compile time trying to vectorize them.
16081 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
16082 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
16083 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16084 std::optional<int> Diff = getPointersDiff(
16085 Stores[Set.first]->getValueOperand()->getType(),
16086 Stores[Set.first]->getPointerOperand(),
16087 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
16088 /*StrictCheck=*/true);
16089 if (!Diff)
16090 continue;
16091 auto It = Set.second.find(std::make_pair(Idx, *Diff));
16092 if (It == Set.second.end()) {
16093 Set.second.emplace(Idx, *Diff);
16094 return;
16095 }
16096 // Try to vectorize the first found set to avoid duplicate analysis.
16097 TryToVectorize(Set.second);
16098 StoreIndexToDistSet PrevSet;
16099 PrevSet.swap(Set.second);
16100 Set.first = Idx;
16101 Set.second.emplace(Idx, 0);
16102 // Insert stores that followed previous match to try to vectorize them
16103 // with this store.
16104 unsigned StartIdx = It->first + 1;
16105 SmallBitVector UsedStores(Idx - StartIdx);
16106 // Distances to previously found dup store (or this store, since they
16107 // store to the same addresses).
16108 SmallVector<int> Dists(Idx - StartIdx, 0);
16109 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
16110 // Do not try to vectorize sequences, we already tried.
16111 if (Pair.first <= It->first ||
16112 VectorizedStores.contains(Stores[Pair.first]))
16113 break;
16114 unsigned BI = Pair.first - StartIdx;
16115 UsedStores.set(BI);
16116 Dists[BI] = Pair.second - It->second;
16117 }
16118 for (unsigned I = StartIdx; I < Idx; ++I) {
16119 unsigned BI = I - StartIdx;
16120 if (UsedStores.test(BI))
16121 Set.second.emplace(I, Dists[BI]);
16122 }
16123 return;
16124 }
16125 auto &Res = SortedStores.emplace_back();
16126 Res.first = Idx;
16127 Res.second.emplace(Idx, 0);
16128 };
16129 StoreInst *PrevStore = Stores.front();
16130 for (auto [I, SI] : enumerate(Stores)) {
16131 // Check that we do not try to vectorize stores of different types.
16132 if (PrevStore->getValueOperand()->getType() !=
16133 SI->getValueOperand()->getType()) {
16134 for (auto &Set : SortedStores)
16135 TryToVectorize(Set.second);
16136 SortedStores.clear();
16137 PrevStore = SI;
16138 }
16139 FillStoresSet(I, SI);
16140 }
16141
16142 // Final vectorization attempt.
16143 for (auto &Set : SortedStores)
16144 TryToVectorize(Set.second);
16145
16146 return Changed;
16147}
16148
16149void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
16150 // Initialize the collections. We will make a single pass over the block.
16151 Stores.clear();
16152 GEPs.clear();
16153
16154 // Visit the store and getelementptr instructions in BB and organize them in
16155 // Stores and GEPs according to the underlying objects of their pointer
16156 // operands.
16157 for (Instruction &I : *BB) {
16158 // Ignore store instructions that are volatile or have a pointer operand
16159 // that doesn't point to a scalar type.
16160 if (auto *SI = dyn_cast<StoreInst>(&I)) {
16161 if (!SI->isSimple())
16162 continue;
16163 if (!isValidElementType(SI->getValueOperand()->getType()))
16164 continue;
16165 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
16166 }
16167
16168 // Ignore getelementptr instructions that have more than one index, a
16169 // constant index, or a pointer operand that doesn't point to a scalar
16170 // type.
16171 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
16172 if (GEP->getNumIndices() != 1)
16173 continue;
16174 Value *Idx = GEP->idx_begin()->get();
16175 if (isa<Constant>(Idx))
16176 continue;
16177 if (!isValidElementType(Idx->getType()))
16178 continue;
16179 if (GEP->getType()->isVectorTy())
16180 continue;
16181 GEPs[GEP->getPointerOperand()].push_back(GEP);
16182 }
16183 }
16184}
16185
16186bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
16187 bool MaxVFOnly) {
16188 if (VL.size() < 2)
16189 return false;
16190
16191 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
16192 << VL.size() << ".\n");
16193
16194 // Check that all of the parts are instructions of the same type,
16195 // we permit an alternate opcode via InstructionsState.
16196 InstructionsState S = getSameOpcode(VL, *TLI);
16197 if (!S.getOpcode())
16198 return false;
16199
16200 Instruction *I0 = cast<Instruction>(S.OpValue);
16201 // Make sure invalid types (including vector type) are rejected before
16202 // determining vectorization factor for scalar instructions.
16203 for (Value *V : VL) {
16204 Type *Ty = V->getType();
16205 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
16206 // NOTE: the following will give the user an internal LLVM type name, which
16207 // may not be useful.
16208 R.getORE()->emit([&]() {
16209 std::string TypeStr;
16210 llvm::raw_string_ostream rso(TypeStr);
16211 Ty->print(rso);
16212 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
16213 << "Cannot SLP vectorize list: type "
16214 << rso.str() + " is unsupported by vectorizer";
16215 });
16216 return false;
16217 }
16218 }
16219
16220 unsigned Sz = R.getVectorElementSize(I0);
16221 unsigned MinVF = R.getMinVF(Sz);
16222 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
16223 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
16224 if (MaxVF < 2) {
16225 R.getORE()->emit([&]() {
16226 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
16227 << "Cannot SLP vectorize list: vectorization factor "
16228 << "less than 2 is not supported";
16229 });
16230 return false;
16231 }
16232
16233 bool Changed = false;
16234 bool CandidateFound = false;
16235 InstructionCost MinCost = SLPCostThreshold.getValue();
16236 Type *ScalarTy = VL[0]->getType();
16237 if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
16238 ScalarTy = IE->getOperand(1)->getType();
16239
16240 unsigned NextInst = 0, MaxInst = VL.size();
16241 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
16242 // No actual vectorization should happen if the number of parts is the same
16243 // as the provided vectorization factor (i.e. the scalar type is used for the
16244 // vector code during codegen).
16245 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
16246 if (TTI->getNumberOfParts(VecTy) == VF)
16247 continue;
16248 for (unsigned I = NextInst; I < MaxInst; ++I) {
16249 unsigned ActualVF = std::min(MaxInst - I, VF);
16250
16251 if (!isPowerOf2_32(ActualVF))
16252 continue;
16253
16254 if (MaxVFOnly && ActualVF < MaxVF)
16255 break;
16256 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16257 break;
16258
16259 ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
16260 // Check that a previous iteration of this loop did not delete the Value.
16261 if (llvm::any_of(Ops, [&R](Value *V) {
16262 auto *I = dyn_cast<Instruction>(V);
16263 return I && R.isDeleted(I);
16264 }))
16265 continue;
16266
16267 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
16268 << "\n");
16269
16270 R.buildTree(Ops);
16271 if (R.isTreeTinyAndNotFullyVectorizable())
16272 continue;
16273 R.reorderTopToBottom();
16274 R.reorderBottomToTop(
16275 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
16276 !R.doesRootHaveInTreeUses());
16277 R.buildExternalUses();
16278
16279 R.computeMinimumValueSizes();
16280 R.transformNodes();
16281 InstructionCost Cost = R.getTreeCost();
16282 CandidateFound = true;
16283 MinCost = std::min(MinCost, Cost);
16284
16285 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16286 << " for VF=" << ActualVF << "\n");
16287 if (Cost < -SLPCostThreshold) {
16288 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
16289 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
16290 cast<Instruction>(Ops[0]))
16291 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
16292 << " and with tree size "
16293 << ore::NV("TreeSize", R.getTreeSize()));
16294
16295 R.vectorizeTree();
16296 // Move to the next bundle.
16297 I += VF - 1;
16298 NextInst = I + 1;
16299 Changed = true;
16300 }
16301 }
16302 }
16303
16304 if (!Changed && CandidateFound) {
16305 R.getORE()->emit([&]() {
16306 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
16307 << "List vectorization was possible but not beneficial with cost "
16308 << ore::NV("Cost", MinCost) << " >= "
16309 << ore::NV("Treshold", -SLPCostThreshold);
16310 });
16311 } else if (!Changed) {
16312 R.getORE()->emit([&]() {
16313 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
16314 << "Cannot SLP vectorize list: vectorization was impossible"
16315 << " with available vectorization factors";
16316 });
16317 }
16318 return Changed;
16319}
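// Illustrative note (typical values assumed, not from the source): with
// MinVF = 2 and MaxVF = 8, the loop above first tries to build trees for
// slices of the list with VF = 8, then VF = 4, then VF = 2, advancing
// NextInst past every slice that was actually vectorized so the same
// instructions are not revisited.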
16320
16321bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
16322 if (!I)
16323 return false;
16324
16325 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
16326 return false;
16327
16328 Value *P = I->getParent();
16329
16330 // Vectorize in current basic block only.
16331 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
16332 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
16333 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
16334 return false;
16335
16336 // First collect all possible candidates
16337 SmallVector<std::pair<Value *, Value *>> Candidates;
16338 Candidates.emplace_back(Op0, Op1);
16339
16340 auto *A = dyn_cast<BinaryOperator>(Op0);
16341 auto *B = dyn_cast<BinaryOperator>(Op1);
16342 // Try to skip B.
16343 if (A && B && B->hasOneUse()) {
16344 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
16345 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
16346 if (B0 && B0->getParent() == P)
16347 Candidates.emplace_back(A, B0);
16348 if (B1 && B1->getParent() == P)
16349 Candidates.emplace_back(A, B1);
16350 }
16351 // Try to skip A.
16352 if (B && A && A->hasOneUse()) {
16353 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
16354 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
16355 if (A0 && A0->getParent() == P)
16356 Candidates.emplace_back(A0, B);
16357 if (A1 && A1->getParent() == P)
16358 Candidates.emplace_back(A1, B);
16359 }
16360
16361 if (Candidates.size() == 1)
16362 return tryToVectorizeList({Op0, Op1}, R);
16363
16364 // We have multiple options. Try to pick the single best.
16365 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
16366 if (!BestCandidate)
16367 return false;
16368 return tryToVectorizeList(
16369 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
16370}
16371
16372namespace {
16373
16374/// Model horizontal reductions.
16375///
16376/// A horizontal reduction is a tree of reduction instructions that has values
16377/// that can be put into a vector as its leaves. For example:
16378///
16379/// mul mul mul mul
16380/// \ / \ /
16381/// + +
16382/// \ /
16383/// +
16384/// This tree has "mul" as its leaf values and "+" as its reduction
16385/// instructions. A reduction can feed into a store or a binary operation
16386/// feeding a phi.
16387/// ...
16388/// \ /
16389/// +
16390/// |
16391/// phi +=
16392///
16393/// Or:
16394/// ...
16395/// \ /
16396/// +
16397/// |
16398/// *p =
16399///
16400class HorizontalReduction {
16401 using ReductionOpsType = SmallVector<Value *, 16>;
16402 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
16403 ReductionOpsListType ReductionOps;
16404 /// List of possibly reduced values.
16406 /// Maps reduced value to the corresponding reduction operation.
16408 // Use map vector to make stable output.
16410 WeakTrackingVH ReductionRoot;
16411 /// The type of reduction operation.
16412 RecurKind RdxKind;
16413 /// Checks if the optimization of original scalar identity operations on
16414 /// matched horizontal reductions is enabled and allowed.
16415 bool IsSupportedHorRdxIdentityOp = false;
16416
16417 static bool isCmpSelMinMax(Instruction *I) {
16418 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
16419 RecurrenceDescriptor::isIntMinMaxRecurrenceKind(getRdxKind(I));
16420 }
16421
16422 // And/or are potentially poison-safe logical patterns like:
16423 // select x, y, false
16424 // select x, true, y
16425 static bool isBoolLogicOp(Instruction *I) {
16426 return isa<SelectInst>(I) &&
16427 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
16428 }
16429
16430 /// Checks if instruction is associative and can be vectorized.
16431 static bool isVectorizable(RecurKind Kind, Instruction *I) {
16432 if (Kind == RecurKind::None)
16433 return false;
16434
16435 // Integer ops that map to select instructions or intrinsics are fine.
16436 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
16437 isBoolLogicOp(I))
16438 return true;
16439
16440 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16441 // FP min/max are associative except for NaN and -0.0. We do not
16442 // have to rule out -0.0 here because the intrinsic semantics do not
16443 // specify a fixed result for it.
16444 return I->getFastMathFlags().noNaNs();
16445 }
16446
16447 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16448 return true;
16449
16450 return I->isAssociative();
16451 }
16452
16453 static Value *getRdxOperand(Instruction *I, unsigned Index) {
16454 // Poison-safe 'or' takes the form: select X, true, Y
16455 // To make that work with the normal operand processing, we skip the
16456 // true value operand.
16457 // TODO: Change the code and data structures to handle this without a hack.
16458 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
16459 return I->getOperand(2);
16460 return I->getOperand(Index);
16461 }
16462
16463 /// Creates reduction operation with the current opcode.
16464 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
16465 Value *RHS, const Twine &Name, bool UseSelect) {
16466 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
16467 switch (Kind) {
16468 case RecurKind::Or:
16469 if (UseSelect &&
16470 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16471 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
16472 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16473 Name);
16474 case RecurKind::And:
16475 if (UseSelect &&
16476 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16477 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
16478 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16479 Name);
16480 case RecurKind::Add:
16481 case RecurKind::Mul:
16482 case RecurKind::Xor:
16483 case RecurKind::FAdd:
16484 case RecurKind::FMul:
16485 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16486 Name);
16487 case RecurKind::FMax:
16488 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
16489 case RecurKind::FMin:
16490 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
16491 case RecurKind::FMaximum:
16492 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
16493 case RecurKind::FMinimum:
16494 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
16495 case RecurKind::SMax:
16496 if (UseSelect) {
16497 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
16498 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16499 }
16500 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
16501 case RecurKind::SMin:
16502 if (UseSelect) {
16503 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
16504 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16505 }
16506 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
16507 case RecurKind::UMax:
16508 if (UseSelect) {
16509 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
16510 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16511 }
16512 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
16513 case RecurKind::UMin:
16514 if (UseSelect) {
16515 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
16516 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16517 }
16518 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
16519 default:
16520 llvm_unreachable("Unknown reduction operation.");
16521 }
16522 }
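// Illustrative note (not part of the original source): for RecurKind::SMax,
// createOp above emits either the pair
//   %cmp = icmp sgt i32 %lhs, %rhs
//   %rdx = select i1 %cmp, i32 %lhs, i32 %rhs
// when UseSelect is set, or a single call to the llvm.smax intrinsic
// otherwise; the other integer min/max kinds follow the same pattern with
// their own predicate/intrinsic.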
16523
16524 /// Creates reduction operation with the current opcode with the IR flags
16525 /// from \p ReductionOps, dropping nuw/nsw flags.
16526 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
16527 Value *RHS, const Twine &Name,
16528 const ReductionOpsListType &ReductionOps) {
16529 bool UseSelect = ReductionOps.size() == 2 ||
16530 // Logical or/and.
16531 (ReductionOps.size() == 1 &&
16532 any_of(ReductionOps.front(), IsaPred<SelectInst>));
16533 assert((!UseSelect || ReductionOps.size() != 2 ||
16534 isa<SelectInst>(ReductionOps[1][0])) &&
16535 "Expected cmp + select pairs for reduction");
16536 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
16537 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
16538 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
16539 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
16540 /*IncludeWrapFlags=*/false);
16541 propagateIRFlags(Op, ReductionOps[1], nullptr,
16542 /*IncludeWrapFlags=*/false);
16543 return Op;
16544 }
16545 }
16546 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
16547 return Op;
16548 }
16549
16550public:
16551 static RecurKind getRdxKind(Value *V) {
16552 auto *I = dyn_cast<Instruction>(V);
16553 if (!I)
16554 return RecurKind::None;
16555 if (match(I, m_Add(m_Value(), m_Value())))
16556 return RecurKind::Add;
16557 if (match(I, m_Mul(m_Value(), m_Value())))
16558 return RecurKind::Mul;
16559 if (match(I, m_And(m_Value(), m_Value())) ||
16560 match(I, m_LogicalAnd(m_Value(), m_Value())))
16561 return RecurKind::And;
16562 if (match(I, m_Or(m_Value(), m_Value())) ||
16563 match(I, m_LogicalOr(m_Value(), m_Value())))
16564 return RecurKind::Or;
16565 if (match(I, m_Xor(m_Value(), m_Value())))
16566 return RecurKind::Xor;
16567 if (match(I, m_FAdd(m_Value(), m_Value())))
16568 return RecurKind::FAdd;
16569 if (match(I, m_FMul(m_Value(), m_Value())))
16570 return RecurKind::FMul;
16571
16572 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
16573 return RecurKind::FMax;
16574 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
16575 return RecurKind::FMin;
16576
16577 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
16578 return RecurKind::FMaximum;
16579 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
16580 return RecurKind::FMinimum;
16581 // This matches either cmp+select or intrinsics. SLP is expected to handle
16582 // either form.
16583 // TODO: If we are canonicalizing to intrinsics, we can remove several
16584 // special-case paths that deal with selects.
16585 if (match(I, m_SMax(m_Value(), m_Value())))
16586 return RecurKind::SMax;
16587 if (match(I, m_SMin(m_Value(), m_Value())))
16588 return RecurKind::SMin;
16589 if (match(I, m_UMax(m_Value(), m_Value())))
16590 return RecurKind::UMax;
16591 if (match(I, m_UMin(m_Value(), m_Value())))
16592 return RecurKind::UMin;
16593
16594 if (auto *Select = dyn_cast<SelectInst>(I)) {
16595 // Try harder: look for min/max pattern based on instructions producing
16596 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
16597 // During the intermediate stages of SLP, it's very common to have
16598 // pattern like this (since optimizeGatherSequence is run only once
16599 // at the end):
16600 // %1 = extractelement <2 x i32> %a, i32 0
16601 // %2 = extractelement <2 x i32> %a, i32 1
16602 // %cond = icmp sgt i32 %1, %2
16603 // %3 = extractelement <2 x i32> %a, i32 0
16604 // %4 = extractelement <2 x i32> %a, i32 1
16605 // %select = select i1 %cond, i32 %3, i32 %4
16606 CmpInst::Predicate Pred;
16607 Instruction *L1;
16608 Instruction *L2;
16609
16610 Value *LHS = Select->getTrueValue();
16611 Value *RHS = Select->getFalseValue();
16612 Value *Cond = Select->getCondition();
16613
16614 // TODO: Support inverse predicates.
16615 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
16616 if (!isa<ExtractElementInst>(RHS) ||
16617 !L2->isIdenticalTo(cast<Instruction>(RHS)))
16618 return RecurKind::None;
16619 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
16620 if (!isa<ExtractElementInst>(LHS) ||
16621 !L1->isIdenticalTo(cast<Instruction>(LHS)))
16622 return RecurKind::None;
16623 } else {
16624 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
16625 return RecurKind::None;
16626 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
16627 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
16628 !L2->isIdenticalTo(cast<Instruction>(RHS)))
16629 return RecurKind::None;
16630 }
16631
16632 switch (Pred) {
16633 default:
16634 return RecurKind::None;
16635 case CmpInst::ICMP_SGT:
16636 case CmpInst::ICMP_SGE:
16637 return RecurKind::SMax;
16638 case CmpInst::ICMP_SLT:
16639 case CmpInst::ICMP_SLE:
16640 return RecurKind::SMin;
16641 case CmpInst::ICMP_UGT:
16642 case CmpInst::ICMP_UGE:
16643 return RecurKind::UMax;
16644 case CmpInst::ICMP_ULT:
16645 case CmpInst::ICMP_ULE:
16646 return RecurKind::UMin;
16647 }
16648 }
16649 return RecurKind::None;
16650 }
16651
16652 /// Get the index of the first operand.
16653 static unsigned getFirstOperandIndex(Instruction *I) {
16654 return isCmpSelMinMax(I) ? 1 : 0;
16655 }
16656
16657private:
16658 /// Total number of operands in the reduction operation.
16659 static unsigned getNumberOfOperands(Instruction *I) {
16660 return isCmpSelMinMax(I) ? 3 : 2;
16661 }
16662
16663 /// Checks if the instruction is in basic block \p BB.
16664 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16665 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16666 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16667 auto *Sel = cast<SelectInst>(I);
16668 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
16669 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16670 }
16671 return I->getParent() == BB;
16672 }
16673
16674 /// Checks if the instruction has the expected number of uses for a reduction operation/reduced value.
16675 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16676 if (IsCmpSelMinMax) {
16677 // SelectInst must be used twice while the condition op must have a
16678 // single use only.
16679 if (auto *Sel = dyn_cast<SelectInst>(I))
16680 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
16681 return I->hasNUses(2);
16682 }
16683
16684 // Arithmetic reduction operation must be used once only.
16685 return I->hasOneUse();
16686 }
16687
16688 /// Initializes the list of reduction operations.
16689 void initReductionOps(Instruction *I) {
16690 if (isCmpSelMinMax(I))
16691 ReductionOps.assign(2, ReductionOpsType());
16692 else
16693 ReductionOps.assign(1, ReductionOpsType());
16694 }
16695
16696 /// Add all reduction operations for the reduction instruction \p I.
16697 void addReductionOps(Instruction *I) {
16698 if (isCmpSelMinMax(I)) {
16699 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
16700 ReductionOps[1].emplace_back(I);
16701 } else {
16702 ReductionOps[0].emplace_back(I);
16703 }
16704 }
16705
16706 static bool isGoodForReduction(ArrayRef<Value *> Data) {
16707 int Sz = Data.size();
16708 auto *I = dyn_cast<Instruction>(Data.front());
16709 return Sz > 1 || isConstant(Data.front()) ||
16710 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
16711 }
16712
16713public:
16714 HorizontalReduction() = default;
16715
16716 /// Try to find a reduction tree.
16717 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16718 ScalarEvolution &SE, const DataLayout &DL,
16719 const TargetLibraryInfo &TLI) {
16720 RdxKind = HorizontalReduction::getRdxKind(Root);
16721 if (!isVectorizable(RdxKind, Root))
16722 return false;
16723
16724 // Analyze "regular" integer/FP types for reductions - no target-specific
16725 // types or pointers.
16726 Type *Ty = Root->getType();
16727 if (!isValidElementType(Ty) || Ty->isPointerTy())
16728 return false;
16729
16730 // Though the ultimate reduction may have multiple uses, its condition must
16731 // have only a single use.
16732 if (auto *Sel = dyn_cast<SelectInst>(Root))
16733 if (!Sel->getCondition()->hasOneUse())
16734 return false;
16735
16736 ReductionRoot = Root;
16737
16738 // Iterate through all the operands of the possible reduction tree and
16739 // gather all the reduced values, sorting them by their value id.
16740 BasicBlock *BB = Root->getParent();
16741 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
16742 SmallVector<Instruction *> Worklist(1, Root);
16743 // Checks if the operands of the \p TreeN instruction are also reduction
16744 // operations or should be treated as reduced values or an extra argument,
16745 // which is not part of the reduction.
16746 auto CheckOperands = [&](Instruction *TreeN,
16747 SmallVectorImpl<Value *> &ExtraArgs,
16748 SmallVectorImpl<Value *> &PossibleReducedVals,
16749 SmallVectorImpl<Instruction *> &ReductionOps) {
16750 for (int I = getFirstOperandIndex(TreeN),
16751 End = getNumberOfOperands(TreeN);
16752 I < End; ++I) {
16753 Value *EdgeVal = getRdxOperand(TreeN, I);
16754 ReducedValsToOps[EdgeVal].push_back(TreeN);
16755 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
16756 // Edge has wrong parent - mark as an extra argument.
16757 if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
16758 !hasSameParent(EdgeInst, BB)) {
16759 ExtraArgs.push_back(EdgeVal);
16760 continue;
16761 }
16762 // If the edge is not an instruction, differs from the main reduction
16763 // opcode, or has too many uses, treat it as a possible reduced value.
16764 // Also, do not try to reduce constant values if the operation is not
16765 // foldable.
16766 if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
16767 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
16768 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
16769 !isVectorizable(RdxKind, EdgeInst) ||
16770 (R.isAnalyzedReductionRoot(EdgeInst) &&
16771 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
16772 PossibleReducedVals.push_back(EdgeVal);
16773 continue;
16774 }
16775 ReductionOps.push_back(EdgeInst);
16776 }
16777 };
16778 // Try to regroup the reduced values so that it becomes more profitable to
16779 // reduce them. Values are grouped by their value ids, instructions by their
16780 // opcode and/or alternate opcode; extra analysis is done for loads (grouping
16781 // them by the distance between pointers) and for cmp instructions (grouping
16782 // them by the predicate).
16783 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
16784 PossibleReducedVals;
16785 initReductionOps(Root);
16786 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
16787 SmallSet<size_t, 2> LoadKeyUsed;
16788 SmallPtrSet<Value *, 4> DoNotReverseVals;
16789
16790 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
16791 Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
16792 if (LoadKeyUsed.contains(Key)) {
16793 auto LIt = LoadsMap.find(Ptr);
16794 if (LIt != LoadsMap.end()) {
16795 for (LoadInst *RLI : LIt->second) {
16796 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
16797 LI->getType(), LI->getPointerOperand(), DL, SE,
16798 /*StrictCheck=*/true))
16799 return hash_value(RLI->getPointerOperand());
16800 }
16801 for (LoadInst *RLI : LIt->second) {
16802 if (arePointersCompatible(RLI->getPointerOperand(),
16803 LI->getPointerOperand(), TLI)) {
16804 hash_code SubKey = hash_value(RLI->getPointerOperand());
16805 DoNotReverseVals.insert(RLI);
16806 return SubKey;
16807 }
16808 }
16809 if (LIt->second.size() > 2) {
16810 hash_code SubKey =
16811 hash_value(LIt->second.back()->getPointerOperand());
16812 DoNotReverseVals.insert(LIt->second.back());
16813 return SubKey;
16814 }
16815 }
16816 }
16817 LoadKeyUsed.insert(Key);
16818 LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
16819 return hash_value(LI->getPointerOperand());
16820 };
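// Illustrative note (not part of the original source): the lambda above keys
// loads by their underlying pointer, so loads of p[0], p[1] and p[2] whose
// pointer difference is computable all receive the subkey of the first
// compatible load and therefore land in the same group of reduction
// candidates.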
16821
16822 while (!Worklist.empty()) {
16823 Instruction *TreeN = Worklist.pop_back_val();
16824 SmallVector<Value *> Args;
16825 SmallVector<Value *> PossibleRedVals;
16826 SmallVector<Instruction *> PossibleReductionOps;
16827 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16828 // If there are too many extra args, mark the instruction itself as a
16829 // reduction value, not a reduction operation.
16830 if (Args.size() < 2) {
16831 addReductionOps(TreeN);
16832 // Add extra args.
16833 if (!Args.empty()) {
16834 assert(Args.size() == 1 && "Expected only single argument.");
16835 ExtraArgs[TreeN] = Args.front();
16836 }
16837 // Add reduction values. The values are sorted for better vectorization
16838 // results.
16839 for (Value *V : PossibleRedVals) {
16840 size_t Key, Idx;
16841 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
16842 /*AllowAlternate=*/false);
16843 ++PossibleReducedVals[Key][Idx]
16844 .insert(std::make_pair(V, 0))
16845 .first->second;
16846 }
16847 Worklist.append(PossibleReductionOps.rbegin(),
16848 PossibleReductionOps.rend());
16849 } else {
16850 size_t Key, Idx;
16851 std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
16852 /*AllowAlternate=*/false);
16853 ++PossibleReducedVals[Key][Idx]
16854 .insert(std::make_pair(TreeN, 0))
16855 .first->second;
16856 }
16857 }
16858 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
16859 // Sort values by the total number of value kinds to start the reduction
16860 // from the longest possible sequences of reduced values.
16861 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
16862 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
16863 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
16864 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
16865 It != E; ++It) {
16866 PossibleRedValsVect.emplace_back();
16867 auto RedValsVect = It->second.takeVector();
16868 stable_sort(RedValsVect, llvm::less_second());
16869 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
16870 PossibleRedValsVect.back().append(Data.second, Data.first);
16871 }
16872 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
16873 return P1.size() > P2.size();
16874 });
16875 int NewIdx = -1;
16876 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
16877 if (isGoodForReduction(Data) ||
16878 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
16879 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
16880 getUnderlyingObject(
16881 cast<LoadInst>(Data.front())->getPointerOperand()) ==
16882 getUnderlyingObject(cast<LoadInst>(ReducedVals[NewIdx].front())
16883 ->getPointerOperand()))) {
16884 if (NewIdx < 0) {
16885 NewIdx = ReducedVals.size();
16886 ReducedVals.emplace_back();
16887 }
16888 if (DoNotReverseVals.contains(Data.front()))
16889 ReducedVals[NewIdx].append(Data.begin(), Data.end());
16890 else
16891 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
16892 } else {
16893 ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
16894 }
16895 }
16896 }
16897 // Sort the reduced values by number of same/alternate opcode and/or pointer
16898 // operand.
16899 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
16900 return P1.size() > P2.size();
16901 });
16902 return true;
16903 }
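// Illustrative note (not part of the original source): for a root such as
// ((a + b) + (c + d)) + e, the matcher above records the '+' instructions as
// ReductionOps and collects {a, b, c, d, e} into ReducedVals, grouping values
// with the same opcode (or, for loads, the same underlying pointer) next to
// each other so that the longest groups are tried for vectorization first.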
16904
16905 /// Attempt to vectorize the tree found by matchAssociativeReduction.
16906 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
16907 const TargetLibraryInfo &TLI) {
16908 constexpr int ReductionLimit = 4;
16909 constexpr unsigned RegMaxNumber = 4;
16910 constexpr unsigned RedValsMaxNumber = 128;
16911 // If there are a sufficient number of reduction values, reduce
16912 // to a nearby power-of-2. We can safely generate oversized
16913 // vectors and rely on the backend to split them to legal sizes.
16914 unsigned NumReducedVals =
16915 std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
16916 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
16917 if (!isGoodForReduction(Vals))
16918 return Num;
16919 return Num + Vals.size();
16920 });
16921 if (NumReducedVals < ReductionLimit &&
16922 (!AllowHorRdxIdenityOptimization ||
16923 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
16924 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
16925 }))) {
16926 for (ReductionOpsType &RdxOps : ReductionOps)
16927 for (Value *RdxOp : RdxOps)
16928 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16929 return nullptr;
16930 }
16931
16932 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
16933 TargetFolder(DL));
16934 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
16935
16936 // Track the reduced values in case they are replaced by extractelement
16937 // instructions because of the vectorization.
16938 DenseMap<Value *, Value *> TrackedVals(
16939 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
16940 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
16941 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
16942 ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
16943 // The same extra argument may be used several times, so log each attempt
16944 // to use it.
16945 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16946 assert(Pair.first && "DebugLoc must be set.");
16947 ExternallyUsedValues[Pair.second].push_back(Pair.first);
16948 TrackedVals.try_emplace(Pair.second, Pair.second);
16949 }
16950
16951 // The compare instruction of a min/max is the insertion point for new
16952 // instructions and may be replaced with a new compare instruction.
16953 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
16954 assert(isa<SelectInst>(RdxRootInst) &&
16955 "Expected min/max reduction to have select root instruction");
16956 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
16957 assert(isa<Instruction>(ScalarCond) &&
16958 "Expected min/max reduction to have compare condition");
16959 return cast<Instruction>(ScalarCond);
16960 };
16961
16962 // Return new VectorizedTree, based on previous value.
16963 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
16964 if (VectorizedTree) {
16965 // Update the final value in the reduction.
16966 Builder.SetCurrentDebugLocation(
16967 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
16968 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
16969 (isGuaranteedNotToBePoison(Res) &&
16970 !isGuaranteedNotToBePoison(VectorizedTree))) {
16971 auto It = ReducedValsToOps.find(Res);
16972 if (It != ReducedValsToOps.end() &&
16973 any_of(It->getSecond(),
16974 [](Instruction *I) { return isBoolLogicOp(I); }))
16975 std::swap(VectorizedTree, Res);
16976 }
16977
16978 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
16979 ReductionOps);
16980 }
16981 // Initialize the final value in the reduction.
16982 return Res;
16983 };
16984 bool AnyBoolLogicOp =
16985 any_of(ReductionOps.back(), [](Value *V) {
16986 return isBoolLogicOp(cast<Instruction>(V));
16987 });
16988 // The reduction root is used as the insertion point for new instructions,
16989 // so set it as externally used to prevent it from being deleted.
16990 ExternallyUsedValues[ReductionRoot];
16991 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
16992 ReductionOps.front().size());
16993 for (ReductionOpsType &RdxOps : ReductionOps)
16994 for (Value *RdxOp : RdxOps) {
16995 if (!RdxOp)
16996 continue;
16997 IgnoreList.insert(RdxOp);
16998 }
16999 // Intersect the fast-math-flags from all reduction operations.
17000 FastMathFlags RdxFMF;
17001 RdxFMF.set();
17002 for (Value *U : IgnoreList)
17003 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
17004 RdxFMF &= FPMO->getFastMathFlags();
17005 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
17006
17007 // Need to track the reduced values, as they may be changed during the
17008 // vectorization of subvectors.
17009 for (ArrayRef<Value *> Candidates : ReducedVals)
17010 for (Value *V : Candidates)
17011 TrackedVals.try_emplace(V, V);
17012
17013 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
17014 // List of the values that were reduced in other trees as part of gather
17015 // nodes and thus require an extract if fully vectorized in other trees.
17016 SmallPtrSet<Value *, 4> RequiredExtract;
17017 Value *VectorizedTree = nullptr;
17018 bool CheckForReusedReductionOps = false;
17019 // Try to vectorize elements based on their type.
17020 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
17021 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
17022 InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
17023 SmallVector<Value *> Candidates;
17024 Candidates.reserve(2 * OrigReducedVals.size());
17025 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
17026 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
17027 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
17028 // Check if the reduction value was not overridden by an extractelement
17029 // instruction because of the vectorization, and exclude it if it is not
17030 // compatible with other values.
17031 // Also check if the instruction was folded to a constant/other value.
17032 auto *Inst = dyn_cast<Instruction>(RdxVal);
17033 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
17034 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
17035 (S.getOpcode() && !Inst))
17036 continue;
17037 Candidates.push_back(RdxVal);
17038 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
17039 }
17040 bool ShuffledExtracts = false;
17041 // Try to handle shuffled extractelements.
17042 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
17043 I + 1 < E) {
17044 InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
17045 if (NextS.getOpcode() == Instruction::ExtractElement &&
17046 !NextS.isAltShuffle()) {
17047 SmallVector<Value *> CommonCandidates(Candidates);
17048 for (Value *RV : ReducedVals[I + 1]) {
17049 Value *RdxVal = TrackedVals.find(RV)->second;
17050 // Check if the reduction value was not overridden by the
17051 // extractelement instruction because of the vectorization, and
17052 // exclude it if it is not compatible with other values.
17053 if (auto *Inst = dyn_cast<Instruction>(RdxVal))
17054 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
17055 continue;
17056 CommonCandidates.push_back(RdxVal);
17057 TrackedToOrig.try_emplace(RdxVal, RV);
17058 }
17059 SmallVector<int> Mask;
17060 if (isFixedVectorShuffle(CommonCandidates, Mask)) {
17061 ++I;
17062 Candidates.swap(CommonCandidates);
17063 ShuffledExtracts = true;
17064 }
17065 }
17066 }
17067
17068 // Emit code for constant values.
17069 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
17070 allConstant(Candidates)) {
17071 Value *Res = Candidates.front();
17072 ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
17073 for (Value *VC : ArrayRef(Candidates).drop_front()) {
17074 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
17075 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
17076 if (auto *ResI = dyn_cast<Instruction>(Res))
17077 V.analyzedReductionRoot(ResI);
17078 }
17079 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17080 continue;
17081 }
17082
17083 unsigned NumReducedVals = Candidates.size();
17084 if (NumReducedVals < ReductionLimit &&
17085 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
17086 !isSplat(Candidates)))
17087 continue;
17088
17089 // Check if we support repeated scalar values processing (optimization of
17090 // original scalar identity operations on matched horizontal reductions).
17091 IsSupportedHorRdxIdentityOp =
17092 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
17093 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17094 // Gather same values.
17095 MapVector<Value *, unsigned> SameValuesCounter;
17096 if (IsSupportedHorRdxIdentityOp)
17097 for (Value *V : Candidates)
17098 ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
17099 // Used to check if the reduced values are used the same number of times. In
17100 // this case the compiler may produce better code. E.g. if reduced values are
17101 // aabbccdd (8 x values), then the first node of the tree will have a node
17102 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
17103 // Plus, the final reduction will be performed on <8 x aabbccdd>.
17104 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
17105 // x abcd) * 2.
17106 // Currently it only handles add/fadd/xor. and/or/min/max do not require
17107 // this analysis, other operations may require an extra estimation of
17108 // the profitability.
17109 bool SameScaleFactor = false;
17110 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17111 SameValuesCounter.size() != Candidates.size();
17112 if (OptReusedScalars) {
17113 SameScaleFactor =
17114 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17115 RdxKind == RecurKind::Xor) &&
17116 all_of(drop_begin(SameValuesCounter),
17117 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
17118 return P.second == SameValuesCounter.front().second;
17119 });
17120 Candidates.resize(SameValuesCounter.size());
17121 transform(SameValuesCounter, Candidates.begin(),
17122 [](const auto &P) { return P.first; });
17123 NumReducedVals = Candidates.size();
17124 // Have a reduction of the same element.
17125 if (NumReducedVals == 1) {
17126 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
17127 unsigned Cnt = SameValuesCounter.lookup(OrigV);
17128 Value *RedVal =
17129 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
17130 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17131 VectorizedVals.try_emplace(OrigV, Cnt);
17132 continue;
17133 }
17134 }
17135
17136 unsigned MaxVecRegSize = V.getMaxVecRegSize();
17137 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
17138 unsigned MaxElts =
17139 RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
17140
17141 unsigned ReduxWidth = std::min<unsigned>(
17142 llvm::bit_floor(NumReducedVals),
17143 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
17144 RegMaxNumber * RedValsMaxNumber));
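// Illustrative numbers (not from the original source): with a 128-bit
// register and 32-bit elements, MaxElts = 4 * bit_floor(128 / 32) = 16, so
// for 8 reduction values ReduxWidth = min(bit_floor(8), clamp(16, 128, 512))
// = min(8, 128) = 8.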
17145 unsigned Start = 0;
17146 unsigned Pos = Start;
17147 // Restarts vectorization attempt with lower vector factor.
17148 unsigned PrevReduxWidth = ReduxWidth;
17149 bool CheckForReusedReductionOpsLocal = false;
17150 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17151 &CheckForReusedReductionOpsLocal,
17152 &PrevReduxWidth, &V,
17153 &IgnoreList](bool IgnoreVL = false) {
17154 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
17155 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
17156 // Check if any of the reduction ops are gathered. If so, worth
17157 // trying again with a smaller number of reduction ops.
17158 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17159 }
17160 ++Pos;
17161 if (Pos < NumReducedVals - ReduxWidth + 1)
17162 return IsAnyRedOpGathered;
17163 Pos = Start;
17164 ReduxWidth /= 2;
17165 return IsAnyRedOpGathered;
17166 };
17167 bool AnyVectorized = false;
17168 while (Pos < NumReducedVals - ReduxWidth + 1 &&
17169 ReduxWidth >= ReductionLimit) {
17170 // Dependency in tree of the reduction ops - drop this attempt, try
17171 // later.
17172 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17173 Start == 0) {
17174 CheckForReusedReductionOps = true;
17175 break;
17176 }
17177 PrevReduxWidth = ReduxWidth;
17178 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
17179 // Being analyzed already - skip.
17180 if (V.areAnalyzedReductionVals(VL)) {
17181 (void)AdjustReducedVals(/*IgnoreVL=*/true);
17182 continue;
17183 }
17184 // Early exit if any of the reduction values were deleted during
17185 // previous vectorization attempts.
17186 if (any_of(VL, [&V](Value *RedVal) {
17187 auto *RedValI = dyn_cast<Instruction>(RedVal);
17188 if (!RedValI)
17189 return false;
17190 return V.isDeleted(RedValI);
17191 }))
17192 break;
17193 V.buildTree(VL, IgnoreList);
17194 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
17195 if (!AdjustReducedVals())
17196 V.analyzedReductionVals(VL);
17197 continue;
17198 }
17199 if (V.isLoadCombineReductionCandidate(RdxKind)) {
17200 if (!AdjustReducedVals())
17201 V.analyzedReductionVals(VL);
17202 continue;
17203 }
17204 V.reorderTopToBottom();
17205 // No need to reorder the root node at all.
17206 V.reorderBottomToTop(/*IgnoreReorder=*/true);
17207 // Keep extracted other reduction values, if they are used in the
17208 // vectorization trees.
17209 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
17210 ExternallyUsedValues);
17211 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
17212 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
17213 continue;
17214 for (Value *V : ReducedVals[Cnt])
17215 if (isa<Instruction>(V))
17216 LocalExternallyUsedValues[TrackedVals[V]];
17217 }
17218 if (!IsSupportedHorRdxIdentityOp) {
17219 // Number of uses of the candidates in the vector of values.
17220 assert(SameValuesCounter.empty() &&
17221 "Reused values counter map is not empty");
17222 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17223 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17224 continue;
17225 Value *V = Candidates[Cnt];
17226 Value *OrigV = TrackedToOrig.find(V)->second;
17227 ++SameValuesCounter[OrigV];
17228 }
17229 }
17230 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
17231 // Gather externally used values.
17232 SmallPtrSet<Value *, 4> Visited;
17233 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17234 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17235 continue;
17236 Value *RdxVal = Candidates[Cnt];
17237 if (!Visited.insert(RdxVal).second)
17238 continue;
17239 // Check if the scalar was vectorized as part of the vectorization
17240 // tree but not the top node.
17241 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
17242 LocalExternallyUsedValues[RdxVal];
17243 continue;
17244 }
17245 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17246 unsigned NumOps =
17247 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
17248 if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
17249 LocalExternallyUsedValues[RdxVal];
17250 }
17251 // Do not need the list of reused scalars in regular mode anymore.
17252 if (!IsSupportedHorRdxIdentityOp)
17253 SameValuesCounter.clear();
17254 for (Value *RdxVal : VL)
17255 if (RequiredExtract.contains(RdxVal))
17256 LocalExternallyUsedValues[RdxVal];
17257 // Update LocalExternallyUsedValues for the scalar, replaced by
17258 // extractelement instructions.
17259 DenseMap<Value *, Value *> ReplacementToExternal;
17260 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17261 ReplacementToExternal.try_emplace(Pair.second, Pair.first);
17262 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17263 Value *Ext = Pair.first;
17264 auto RIt = ReplacementToExternal.find(Ext);
17265 while (RIt != ReplacementToExternal.end()) {
17266 Ext = RIt->second;
17267 RIt = ReplacementToExternal.find(Ext);
17268 }
17269 auto *It = ExternallyUsedValues.find(Ext);
17270 if (It == ExternallyUsedValues.end())
17271 continue;
17272 LocalExternallyUsedValues[Pair.second].append(It->second);
17273 }
17274 V.buildExternalUses(LocalExternallyUsedValues);
17275
17276 V.computeMinimumValueSizes();
17277 V.transformNodes();
17278
17279 // Estimate cost.
17280 InstructionCost TreeCost = V.getTreeCost(VL);
17281 InstructionCost ReductionCost =
17282 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
17283 InstructionCost Cost = TreeCost + ReductionCost;
17284 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
17285 << " for reduction\n");
17286 if (!Cost.isValid())
17287 break;
17288 if (Cost >= -SLPCostThreshold) {
17289 V.getORE()->emit([&]() {
17291 SV_NAME, "HorSLPNotBeneficial",
17292 ReducedValsToOps.find(VL[0])->second.front())
17293 << "Vectorizing horizontal reduction is possible "
17294 << "but not beneficial with cost " << ore::NV("Cost", Cost)
17295 << " and threshold "
17296 << ore::NV("Threshold", -SLPCostThreshold);
17297 });
17298 if (!AdjustReducedVals())
17299 V.analyzedReductionVals(VL);
17300 continue;
17301 }
17302
17303 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
17304 << Cost << ". (HorRdx)\n");
17305 V.getORE()->emit([&]() {
17306 return OptimizationRemark(
17307 SV_NAME, "VectorizedHorizontalReduction",
17308 ReducedValsToOps.find(VL[0])->second.front())
17309 << "Vectorized horizontal reduction with cost "
17310 << ore::NV("Cost", Cost) << " and with tree size "
17311 << ore::NV("TreeSize", V.getTreeSize());
17312 });
17313
17314 Builder.setFastMathFlags(RdxFMF);
17315
17316 // Emit a reduction. If the root is a select (min/max idiom), the insert
17317 // point is the compare condition of that select.
17318 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
17319 Instruction *InsertPt = RdxRootInst;
17320 if (IsCmpSelMinMax)
17321 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17322
17323 // Vectorize a tree.
17324 Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
17325 ReplacedExternals, InsertPt);
17326
17327 Builder.SetInsertPoint(InsertPt);
17328
17329 // To prevent poison from leaking across what used to be sequential,
17330 // safe, scalar boolean logic operations, the reduction operand must be
17331 // frozen.
17332 if ((isBoolLogicOp(RdxRootInst) ||
17333 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17334 !isGuaranteedNotToBePoison(VectorizedRoot))
17335 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
17336
17337 // Emit code to correctly handle reused reduced values, if required.
17338 if (OptReusedScalars && !SameScaleFactor) {
17339 VectorizedRoot =
17340 emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
17341 SameValuesCounter, TrackedToOrig);
17342 }
17343
17344 Value *ReducedSubTree =
17345 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
17346 if (ReducedSubTree->getType() != VL.front()->getType()) {
17347 ReducedSubTree = Builder.CreateIntCast(
17348 ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
17349 KnownBits Known = computeKnownBits(
17350 R, cast<Instruction>(ReductionOps.front().front())
17351 ->getModule()
17352 ->getDataLayout());
17353 return !Known.isNonNegative();
17354 }));
17355 }
17356
17357 // Improved analysis for add/fadd/xor reductions with same scale factor
17358 // for all operands of reductions. We can emit scalar ops for them
17359 // instead.
17360 if (OptReusedScalars && SameScaleFactor)
17361 ReducedSubTree = emitScaleForReusedOps(
17362 ReducedSubTree, Builder, SameValuesCounter.front().second);
17363
17364 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17365 // Count vectorized reduced values to exclude them from final reduction.
17366 for (Value *RdxVal : VL) {
17367 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17368 if (IsSupportedHorRdxIdentityOp) {
17369 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17370 continue;
17371 }
17372 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17373 if (!V.isVectorized(RdxVal))
17374 RequiredExtract.insert(RdxVal);
17375 }
17376 Pos += ReduxWidth;
17377 Start = Pos;
17378 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
17379 AnyVectorized = true;
17380 }
17381 if (OptReusedScalars && !AnyVectorized) {
17382 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
17383 Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
17384 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17385 Value *OrigV = TrackedToOrig.find(P.first)->second;
17386 VectorizedVals.try_emplace(OrigV, P.second);
17387 }
17388 continue;
17389 }
17390 }
17391 if (VectorizedTree) {
17392 // Reorder the operands of a bool logical op into the natural order to avoid
17393 // possible problems with poison propagation. If it is not possible to reorder
17394 // (both operands are originally RHS), emit an extra freeze instruction
17395 // for the LHS operand.
17396 // I.e., if we have original code like this:
17397 // RedOp1 = select i1 ?, i1 LHS, i1 false
17398 // RedOp2 = select i1 RHS, i1 ?, i1 false
17399
17400 // Then, we swap LHS/RHS to create a new op that matches the poison
17401 // semantics of the original code.
17402
17403 // If we have original code like this and both values could be poison:
17404 // RedOp1 = select i1 ?, i1 LHS, i1 false
17405 // RedOp2 = select i1 ?, i1 RHS, i1 false
17406
17407 // Then, we must freeze LHS in the new op.
17408 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
17409 Instruction *RedOp1,
17410 Instruction *RedOp2,
17411 bool InitStep) {
17412 if (!AnyBoolLogicOp)
17413 return;
17414 if (isBoolLogicOp(RedOp1) &&
17415 ((!InitStep && LHS == VectorizedTree) ||
17416 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
17417 return;
17418 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
17419 getRdxOperand(RedOp2, 0) == RHS ||
17420 isGuaranteedNotToBePoison(RHS))) {
17421 std::swap(LHS, RHS);
17422 return;
17423 }
17424 if (LHS != VectorizedTree)
17425 LHS = Builder.CreateFreeze(LHS);
17426 };
17427 // Finish the reduction.
17428 // Need to add the extra arguments and the possible reduction values that
17429 // were not vectorized.
17430 // Try to avoid dependencies between the scalar remainders after
17431 // reductions.
17432 auto FinalGen =
17433 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
17434 bool InitStep) {
17435 unsigned Sz = InstVals.size();
17436 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
17437 Sz % 2);
17438 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
17439 Instruction *RedOp = InstVals[I + 1].first;
17440 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
17441 Value *RdxVal1 = InstVals[I].second;
17442 Value *StableRdxVal1 = RdxVal1;
17443 auto It1 = TrackedVals.find(RdxVal1);
17444 if (It1 != TrackedVals.end())
17445 StableRdxVal1 = It1->second;
17446 Value *RdxVal2 = InstVals[I + 1].second;
17447 Value *StableRdxVal2 = RdxVal2;
17448 auto It2 = TrackedVals.find(RdxVal2);
17449 if (It2 != TrackedVals.end())
17450 StableRdxVal2 = It2->second;
17451 // To prevent poison from leaking across what used to be
17452 // sequential, safe, scalar boolean logic operations, the
17453 // reduction operand must be frozen.
17454 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
17455 RedOp, InitStep);
17456 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17457 StableRdxVal2, "op.rdx", ReductionOps);
17458 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
17459 }
17460 if (Sz % 2 == 1)
17461 ExtraReds[Sz / 2] = InstVals.back();
17462 return ExtraReds;
17463 };
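// Illustrative note (not part of the original source): FinalGen combines the
// remaining scalars pairwise, so five leftover values v0..v4 become
// (v0 op v1), (v2 op v3), v4 after the first round and are joined the same
// way on later rounds until a single value remains.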
17464 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
17465 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
17466 VectorizedTree);
17467 SmallPtrSet<Value *, 8> Visited;
17468 for (ArrayRef<Value *> Candidates : ReducedVals) {
17469 for (Value *RdxVal : Candidates) {
17470 if (!Visited.insert(RdxVal).second)
17471 continue;
17472 unsigned NumOps = VectorizedVals.lookup(RdxVal);
17473 for (Instruction *RedOp :
17474 ArrayRef(ReducedValsToOps.find(RdxVal)->second)
17475 .drop_back(NumOps))
17476 ExtraReductions.emplace_back(RedOp, RdxVal);
17477 }
17478 }
17479 for (auto &Pair : ExternallyUsedValues) {
17480 // Add each externally used value to the final reduction.
17481 for (auto *I : Pair.second)
17482 ExtraReductions.emplace_back(I, Pair.first);
17483 }
17484 // Iterate through all not-vectorized reduction values/extra arguments.
17485 bool InitStep = true;
17486 while (ExtraReductions.size() > 1) {
17487 SmallVector<std::pair<Instruction *, Value *>> NewReds =
17488 FinalGen(ExtraReductions, InitStep);
17489 ExtraReductions.swap(NewReds);
17490 InitStep = false;
17491 }
17492 VectorizedTree = ExtraReductions.front().second;
17493
17494 ReductionRoot->replaceAllUsesWith(VectorizedTree);
17495
17496 // The original scalar reduction is expected to have no remaining
17497 // uses outside the reduction tree itself. Assert that we got this
17498 // correct, replace internal uses with undef, and mark for eventual
17499 // deletion.
17500#ifndef NDEBUG
17501 SmallSet<Value *, 4> IgnoreSet;
17502 for (ArrayRef<Value *> RdxOps : ReductionOps)
17503 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
17504#endif
17505 for (ArrayRef<Value *> RdxOps : ReductionOps) {
17506 for (Value *Ignore : RdxOps) {
17507 if (!Ignore)
17508 continue;
17509#ifndef NDEBUG
17510 for (auto *U : Ignore->users()) {
17511 assert(IgnoreSet.count(U) &&
17512 "All users must be either in the reduction ops list.");
17513 }
17514#endif
17515 if (!Ignore->use_empty()) {
17516 Value *Undef = UndefValue::get(Ignore->getType());
17517 Ignore->replaceAllUsesWith(Undef);
17518 }
17519 V.eraseInstruction(cast<Instruction>(Ignore));
17520 }
17521 }
17522 } else if (!CheckForReusedReductionOps) {
17523 for (ReductionOpsType &RdxOps : ReductionOps)
17524 for (Value *RdxOp : RdxOps)
17525 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17526 }
17527 return VectorizedTree;
17528 }
17529
17530private:
17531 /// Calculate the cost of a reduction.
17532 InstructionCost getReductionCost(TargetTransformInfo *TTI,
17533 ArrayRef<Value *> ReducedVals,
17534 bool IsCmpSelMinMax, unsigned ReduxWidth,
17535 FastMathFlags FMF) {
17536 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17537 Type *ScalarTy = ReducedVals.front()->getType();
17538 FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
17539 InstructionCost VectorCost = 0, ScalarCost;
17540 // If all of the reduced values are constant, the vector cost is 0, since
17541 // the reduction value can be calculated at compile time.
17542 bool AllConsts = allConstant(ReducedVals);
17543 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
17544 InstructionCost Cost = 0;
17545 // Scalar cost is repeated for N-1 elements.
17546 int Cnt = ReducedVals.size();
17547 for (Value *RdxVal : ReducedVals) {
17548 if (Cnt == 1)
17549 break;
17550 --Cnt;
17551 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
17552 Cost += GenCostFn();
17553 continue;
17554 }
17555 InstructionCost ScalarCost = 0;
17556 for (User *U : RdxVal->users()) {
17557 auto *RdxOp = cast<Instruction>(U);
17558 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17559 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
17560 continue;
17561 }
17562 ScalarCost = InstructionCost::getInvalid();
17563 break;
17564 }
17565 if (ScalarCost.isValid())
17566 Cost += ScalarCost;
17567 else
17568 Cost += GenCostFn();
17569 }
17570 return Cost;
17571 };
17572 switch (RdxKind) {
17573 case RecurKind::Add:
17574 case RecurKind::Mul:
17575 case RecurKind::Or:
17576 case RecurKind::And:
17577 case RecurKind::Xor:
17578 case RecurKind::FAdd:
17579 case RecurKind::FMul: {
17580 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
17581 if (!AllConsts)
17582 VectorCost =
17583 TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
17584 ScalarCost = EvaluateScalarCost([&]() {
17585 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
17586 });
17587 break;
17588 }
17589 case RecurKind::FMax:
17590 case RecurKind::FMin:
17591 case RecurKind::FMaximum:
17592 case RecurKind::FMinimum:
17593 case RecurKind::SMax:
17594 case RecurKind::SMin:
17595 case RecurKind::UMax:
17596 case RecurKind::UMin: {
17597 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
17598 if (!AllConsts)
17599 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
17600 ScalarCost = EvaluateScalarCost([&]() {
17601 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
17602 return TTI->getIntrinsicInstrCost(ICA, CostKind);
17603 });
17604 break;
17605 }
17606 default:
17607 llvm_unreachable("Expected arithmetic or min/max reduction operation");
17608 }
17609
17610 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
17611 << " for reduction of " << shortBundleName(ReducedVals)
17612 << " (It is a splitting reduction)\n");
17613 return VectorCost - ScalarCost;
17614 }
17615
17616 /// Emit a horizontal reduction of the vectorized value.
17617 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
17618 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
17619 assert(VectorizedValue && "Need to have a vectorized tree node");
17620 assert(isPowerOf2_32(ReduxWidth) &&
17621 "We only handle power-of-two reductions for now");
17622 assert(RdxKind != RecurKind::FMulAdd &&
17623 "A call to the llvm.fmuladd intrinsic is not handled yet");
17624
17625 ++NumVectorInstructions;
17626 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
17627 }
17628
17629 /// Emits optimized code for unique scalar value reused \p Cnt times.
17630 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17631 unsigned Cnt) {
17632 assert(IsSupportedHorRdxIdentityOp &&
17633 "The optimization of matched scalar identity horizontal reductions "
17634 "must be supported.");
17635 switch (RdxKind) {
17636 case RecurKind::Add: {
17637 // res = mul vv, n
17638 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
17639 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
17640 << VectorizedValue << ". (HorRdx)\n");
17641 return Builder.CreateMul(VectorizedValue, Scale);
17642 }
17643 case RecurKind::Xor: {
17644 // res = n % 2 ? vv : 0
17645 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
17646 << ". (HorRdx)\n");
17647 if (Cnt % 2 == 0)
17648 return Constant::getNullValue(VectorizedValue->getType());
17649 return VectorizedValue;
17650 }
17651 case RecurKind::FAdd: {
17652 // res = fmul v, n
17653 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
17654 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
17655 << VectorizedValue << ". (HorRdx)\n");
17656 return Builder.CreateFMul(VectorizedValue, Scale);
17657 }
17658 case RecurKind::And:
17659 case RecurKind::Or:
17660 case RecurKind::SMax:
17661 case RecurKind::SMin:
17662 case RecurKind::UMax:
17663 case RecurKind::UMin:
17664 case RecurKind::FMax:
17665 case RecurKind::FMin:
17666 case RecurKind::FMaximum:
17667 case RecurKind::FMinimum:
17668 // res = vv
17669 return VectorizedValue;
17670 case RecurKind::Mul:
17671 case RecurKind::FMul:
17672 case RecurKind::FMulAdd:
17673 case RecurKind::IAnyOf:
17674 case RecurKind::FAnyOf:
17675 case RecurKind::None:
17676 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17677 }
17678 return nullptr;
17679 }
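// Illustrative examples (not from the original source): with Cnt = 4, an Add
// reduction of the same value %v becomes "mul %v, 4" and an FAdd becomes
// "fmul %v, 4.0", while a Xor becomes 0 because the even repeats cancel out;
// with Cnt = 3 the Xor case simply returns %v.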
17680
17681 /// Emits actual operation for the scalar identity values, found during
17682 /// horizontal reduction analysis.
17683 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17684 ArrayRef<Value *> VL,
17685 const MapVector<Value *, unsigned> &SameValuesCounter,
17686 const DenseMap<Value *, Value *> &TrackedToOrig) {
17687 assert(IsSupportedHorRdxIdentityOp &&
17688 "The optimization of matched scalar identity horizontal reductions "
17689 "must be supported.");
17690 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
17691 if (VTy->getElementType() != VL.front()->getType()) {
17692 VectorizedValue = Builder.CreateIntCast(
17693 VectorizedValue,
17694 FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
17695 any_of(VL, [&](Value *R) {
17696 KnownBits Known = computeKnownBits(
17697 R, cast<Instruction>(ReductionOps.front().front())
17698 ->getModule()
17699 ->getDataLayout());
17700 return !Known.isNonNegative();
17701 }));
17702 }
17703 switch (RdxKind) {
17704 case RecurKind::Add: {
17705 // root = mul prev_root, <1, 1, n, 1>
17706 SmallVector<Constant *> Vals;
17707 for (Value *V : VL) {
17708 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17709 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
17710 }
17711 auto *Scale = ConstantVector::get(Vals);
17712 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
17713 << VectorizedValue << ". (HorRdx)\n");
17714 return Builder.CreateMul(VectorizedValue, Scale);
17715 }
17716 case RecurKind::And:
17717 case RecurKind::Or:
17718 // No need for multiple or/and(s).
17719 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17720 << ". (HorRdx)\n");
17721 return VectorizedValue;
17722 case RecurKind::SMax:
17723 case RecurKind::SMin:
17724 case RecurKind::UMax:
17725 case RecurKind::UMin:
17726 case RecurKind::FMax:
17727 case RecurKind::FMin:
17728 case RecurKind::FMaximum:
17729 case RecurKind::FMinimum:
17730 // No need for multiple min/max(s) of the same value.
17731 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17732 << ". (HorRdx)\n");
17733 return VectorizedValue;
17734 case RecurKind::Xor: {
17735 // Replace values with an even number of repeats with 0, since
17736 // x xor x = 0.
17737 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
17738 // 7>, if the 4th and 6th elements have an even number of repeats.
17739 SmallVector<int> Mask(
17740 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
17741 PoisonMaskElem);
17742 std::iota(Mask.begin(), Mask.end(), 0);
17743 bool NeedShuffle = false;
17744 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17745 Value *V = VL[I];
17746 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17747 if (Cnt % 2 == 0) {
17748 Mask[I] = VF;
17749 NeedShuffle = true;
17750 }
17751 }
17752 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
17753 : Mask) dbgs()
17754 << I << " ";
17755 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17756 if (NeedShuffle)
17757 VectorizedValue = Builder.CreateShuffleVector(
17758 VectorizedValue,
17759 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
17760 return VectorizedValue;
17761 }
17762 case RecurKind::FAdd: {
17763 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
17764 SmallVector<Constant *> Vals;
17765 for (Value *V : VL) {
17766 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17767 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
17768 }
17769 auto *Scale = ConstantVector::get(Vals);
17770 return Builder.CreateFMul(VectorizedValue, Scale);
17771 }
17772 case RecurKind::Mul:
17773 case RecurKind::FMul:
17774 case RecurKind::FMulAdd:
17775 case RecurKind::IAnyOf:
17776 case RecurKind::FAnyOf:
17777 case RecurKind::None:
17778 llvm_unreachable("Unexpected reduction kind for reused scalars.");
17779 }
17780 return nullptr;
17781 }
17782};
17783} // end anonymous namespace
17784
17785/// Gets recurrence kind from the specified value.
17786 static RecurKind getRdxKind(Value *V) {
17787 return HorizontalReduction::getRdxKind(V);
17788}
17789static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
17790 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
17791 return cast<FixedVectorType>(IE->getType())->getNumElements();
17792
17793 unsigned AggregateSize = 1;
17794 auto *IV = cast<InsertValueInst>(InsertInst);
17795 Type *CurrentType = IV->getType();
17796 do {
17797 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
17798 for (auto *Elt : ST->elements())
17799 if (Elt != ST->getElementType(0)) // check homogeneity
17800 return std::nullopt;
17801 AggregateSize *= ST->getNumElements();
17802 CurrentType = ST->getElementType(0);
17803 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
17804 AggregateSize *= AT->getNumElements();
17805 CurrentType = AT->getElementType();
17806 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
17807 AggregateSize *= VT->getNumElements();
17808 return AggregateSize;
17809 } else if (CurrentType->isSingleValueType()) {
17810 return AggregateSize;
17811 } else {
17812 return std::nullopt;
17813 }
17814 } while (true);
17815}
17816
17817static void findBuildAggregate_rec(Instruction *LastInsertInst,
17818 TargetTransformInfo *TTI,
17819 SmallVectorImpl<Value *> &BuildVectorOpds,
17820 SmallVectorImpl<Value *> &InsertElts,
17821 unsigned OperandOffset) {
17822 do {
17823 Value *InsertedOperand = LastInsertInst->getOperand(1);
17824 std::optional<unsigned> OperandIndex =
17825 getInsertIndex(LastInsertInst, OperandOffset);
17826 if (!OperandIndex)
17827 return;
17828 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
17829 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
17830 BuildVectorOpds, InsertElts, *OperandIndex);
17831
17832 } else {
17833 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17834 InsertElts[*OperandIndex] = LastInsertInst;
17835 }
17836 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
17837 } while (LastInsertInst != nullptr &&
17838 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
17839 LastInsertInst->hasOneUse());
17840}
17841
17842/// Recognize construction of vectors like
17843/// %ra = insertelement <4 x float> poison, float %s0, i32 0
17844/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
17845/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
17846/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
17847/// starting from the last insertelement or insertvalue instruction.
17848///
17849/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
17850/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
17851/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
17852///
17853/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
17854///
17855/// \return true if it matches.
17856static bool findBuildAggregate(Instruction *LastInsertInst,
17857 TargetTransformInfo *TTI,
17858 SmallVectorImpl<Value *> &BuildVectorOpds,
17859 SmallVectorImpl<Value *> &InsertElts) {
17860
17861 assert((isa<InsertElementInst>(LastInsertInst) ||
17862 isa<InsertValueInst>(LastInsertInst)) &&
17863 "Expected insertelement or insertvalue instruction!");
17864
17865 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
17866 "Expected empty result vectors!");
17867
17868 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
17869 if (!AggregateSize)
17870 return false;
17871 BuildVectorOpds.resize(*AggregateSize);
17872 InsertElts.resize(*AggregateSize);
17873
17874 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
17875 llvm::erase(BuildVectorOpds, nullptr);
17876 llvm::erase(InsertElts, nullptr);
17877 if (BuildVectorOpds.size() >= 2)
17878 return true;
17879
17880 return false;
17881}
17882
17883/// Try and get a reduction instruction from a phi node.
17884///
17885/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
17886/// if they come from either \p ParentBB or a containing loop latch.
17887///
17888/// \returns A candidate reduction value if possible, or \code nullptr \endcode
17889/// if not possible.
17890 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
17891 BasicBlock *ParentBB, LoopInfo *LI) {
17892 // There are situations where the reduction value is not dominated by the
17893 // reduction phi. Vectorizing such cases has been reported to cause
17894 // miscompiles. See PR25787.
17895 auto DominatedReduxValue = [&](Value *R) {
17896 return isa<Instruction>(R) &&
17897 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
17898 };
17899
17900 Instruction *Rdx = nullptr;
17901
17902 // Return the incoming value if it comes from the same BB as the phi node.
17903 if (P->getIncomingBlock(0) == ParentBB) {
17904 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17905 } else if (P->getIncomingBlock(1) == ParentBB) {
17906 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17907 }
17908
17909 if (Rdx && DominatedReduxValue(Rdx))
17910 return Rdx;
17911
17912 // Otherwise, check whether we have a loop latch to look at.
17913 Loop *BBL = LI->getLoopFor(ParentBB);
17914 if (!BBL)
17915 return nullptr;
17916 BasicBlock *BBLatch = BBL->getLoopLatch();
17917 if (!BBLatch)
17918 return nullptr;
17919
17920 // There is a loop latch, return the incoming value if it comes from
17921 // that. This reduction pattern occasionally turns up.
17922 if (P->getIncomingBlock(0) == BBLatch) {
17923 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17924 } else if (P->getIncomingBlock(1) == BBLatch) {
17925 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17926 }
17927
17928 if (Rdx && DominatedReduxValue(Rdx))
17929 return Rdx;
17930
17931 return nullptr;
17932}
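// For illustration, in a hypothetical single-block loop such as
//   loop:
//     %sum = phi float [ 0.0, %entry ], [ %add, %loop ]
//     %add = fadd fast float %sum, %val
// calling this with P == %sum and ParentBB == %loop returns %add: the
// incoming value from the block is an instruction whose parent is dominated
// by the phi's block.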
17933
17934static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
17935 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
17936 return true;
17937 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
17938 return true;
17939 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
17940 return true;
17941 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
17942 return true;
17943 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
17944 return true;
17945 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
17946 return true;
17947 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
17948 return true;
17949 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
17950 return true;
17951 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
17952 return true;
17953 return false;
17954}
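// For illustration, matchRdxBop binds V0/V1 for a plain binary operator such
// as
//   %r = fadd float %x, %y
// and likewise for min/max intrinsic calls such as
//   %m = call i32 @llvm.smax.i32(i32 %x, i32 %y)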
17955
17956/// We could have an initial reduction that is not an add.
17957/// r *= v1 + v2 + v3 + v4
17958/// In such a case start looking for a tree rooted in the first '+'.
17959/// \Returns the new root if found, which may be nullptr if not an instruction.
17960static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
17961                                                 Instruction *Root) {
17962 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17963 isa<IntrinsicInst>(Root)) &&
17964 "Expected binop, select, or intrinsic for reduction matching");
17965 Value *LHS =
17966 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
17967 Value *RHS =
17968 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
17969 if (LHS == Phi)
17970 return dyn_cast<Instruction>(RHS);
17971 if (RHS == Phi)
17972 return dyn_cast<Instruction>(LHS);
17973 return nullptr;
17974}
17975
17976/// \p Returns the first operand of \p I that does not match \p Phi. If the
17977/// operand is not an instruction, it returns nullptr.
17978static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
17979  Value *Op0 = nullptr;
17980 Value *Op1 = nullptr;
17981 if (!matchRdxBop(I, Op0, Op1))
17982 return nullptr;
17983 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
17984}
17985
17986/// \Returns true if \p I is a candidate instruction for reduction vectorization.
17987static bool isReductionCandidate(Instruction *I) {
17988  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
17989 Value *B0 = nullptr, *B1 = nullptr;
17990 bool IsBinop = matchRdxBop(I, B0, B1);
17991 return IsBinop || IsSelect;
17992}
17993
17994bool SLPVectorizerPass::vectorizeHorReduction(
17995    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
17996    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
17997 if (!ShouldVectorizeHor)
17998 return false;
17999 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
18000
18001 if (Root->getParent() != BB || isa<PHINode>(Root))
18002 return false;
18003
18004 // If we can find a secondary reduction root, use that instead.
18005 auto SelectRoot = [&]() {
18006 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
18007 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
18008 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
18009 return NewRoot;
18010 return Root;
18011 };
18012
18013  // Start the analysis from the Root instruction. If a horizontal reduction is
18014  // found, try to vectorize it. If it is not a horizontal reduction, or
18015  // vectorization is not possible or not effective, and the currently analyzed
18016  // instruction is a binary operation, try to vectorize the operands using a
18017  // pre-order DFS traversal. If the operands were not vectorized, repeat the
18018  // same procedure considering each operand as a possible root of the
18019  // horizontal reduction.
18020  // Interrupt the process if the Root instruction itself was vectorized or all
18021  // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
18022  // If a horizontal reduction was not matched or vectorized, we collect
18023  // instructions for possible later attempts at vectorization.
18024 std::queue<std::pair<Instruction *, unsigned>> Stack;
18025 Stack.emplace(SelectRoot(), 0);
18026 SmallPtrSet<Value *, 8> VisitedInstrs;
18027 bool Res = false;
18028 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
18029 if (R.isAnalyzedReductionRoot(Inst))
18030 return nullptr;
18031 if (!isReductionCandidate(Inst))
18032 return nullptr;
18033 HorizontalReduction HorRdx;
18034 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
18035 return nullptr;
18036 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
18037 };
18038 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
18039 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
18040 FutureSeed = getNonPhiOperand(Root, P);
18041 if (!FutureSeed)
18042 return false;
18043 }
18044 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
18045 // analysis is done separately.
18046 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
18047 PostponedInsts.push_back(FutureSeed);
18048 return true;
18049 };
18050
18051 while (!Stack.empty()) {
18052 Instruction *Inst;
18053 unsigned Level;
18054 std::tie(Inst, Level) = Stack.front();
18055 Stack.pop();
18056    // Do not try to analyze an instruction that has already been vectorized.
18057    // This may happen when we vectorize its operands on a previous iteration
18058    // while the stack was populated before that happened.
18059 if (R.isDeleted(Inst))
18060 continue;
18061 if (Value *VectorizedV = TryToReduce(Inst)) {
18062 Res = true;
18063 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
18064 // Try to find another reduction.
18065 Stack.emplace(I, Level);
18066 continue;
18067 }
18068 } else {
18069 // We could not vectorize `Inst` so try to use it as a future seed.
18070 if (!TryAppendToPostponedInsts(Inst)) {
18071 assert(Stack.empty() && "Expected empty stack");
18072 break;
18073 }
18074 }
18075
18076 // Try to vectorize operands.
18077 // Continue analysis for the instruction from the same basic block only to
18078 // save compile time.
18079 if (++Level < RecursionMaxDepth)
18080 for (auto *Op : Inst->operand_values())
18081 if (VisitedInstrs.insert(Op).second)
18082 if (auto *I = dyn_cast<Instruction>(Op))
18083 // Do not try to vectorize CmpInst operands, this is done
18084 // separately.
18085 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
18086 !R.isDeleted(I) && I->getParent() == BB)
18087 Stack.emplace(I, Level);
18088 }
18089 return Res;
18090}
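// For illustration, with a hypothetical root such as
//   %r = fadd float %a, %b
// that does not match a profitable reduction, %r (or, when a phi seed is
// given, its non-phi operand) is appended to PostponedInsts for a later plain
// vectorization attempt, and %a/%b are queued as further candidate roots if
// they are non-phi, non-cmp, non-insert instructions from the same block.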
18091
18092bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
18093 BasicBlock *BB, BoUpSLP &R,
18094                                                 TargetTransformInfo *TTI) {
18095  SmallVector<WeakTrackingVH> PostponedInsts;
18096 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
18097 Res |= tryToVectorize(PostponedInsts, R);
18098 return Res;
18099}
18100
18101bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
18102 BoUpSLP &R) {
18103 bool Res = false;
18104 for (Value *V : Insts)
18105 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
18106 Res |= tryToVectorize(Inst, R);
18107 return Res;
18108}
18109
18110bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18111 BasicBlock *BB, BoUpSLP &R) {
18112 if (!R.canMapToVector(IVI->getType()))
18113 return false;
18114
18115 SmallVector<Value *, 16> BuildVectorOpds;
18116 SmallVector<Value *, 16> BuildVectorInsts;
18117 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
18118 return false;
18119
18120 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
18121  // An aggregate value is unlikely to be processed in a vector register.
18122 return tryToVectorizeList(BuildVectorOpds, R);
18123}
18124
18125bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
18126 BasicBlock *BB, BoUpSLP &R) {
18127 SmallVector<Value *, 16> BuildVectorInsts;
18128 SmallVector<Value *, 16> BuildVectorOpds;
18129  SmallVector<int> Mask;
18130  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
18131 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18132 isFixedVectorShuffle(BuildVectorOpds, Mask)))
18133 return false;
18134
18135 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18136 return tryToVectorizeList(BuildVectorInsts, R);
18137}
18138
18139template <typename T>
18140static bool tryToVectorizeSequence(
18141    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
18142 function_ref<bool(T *, T *)> AreCompatible,
18143 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
18144 bool MaxVFOnly, BoUpSLP &R) {
18145 bool Changed = false;
18146 // Sort by type, parent, operands.
18147 stable_sort(Incoming, Comparator);
18148
18149  // Try to vectorize elements based on their type.
18150 SmallVector<T *> Candidates;
18151 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
18152 // Look for the next elements with the same type, parent and operand
18153 // kinds.
18154 auto *SameTypeIt = IncIt;
18155 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
18156 ++SameTypeIt;
18157
18158 // Try to vectorize them.
18159 unsigned NumElts = (SameTypeIt - IncIt);
18160 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
18161 << NumElts << ")\n");
18162    // The vectorization is a 3-state attempt:
18163    // 1. Try to vectorize instructions with the same/alternate opcodes with the
18164    //    size of the maximal register at first.
18165    // 2. Try to vectorize remaining instructions with the same type, if
18166    //    possible. This may give better results than vectorizing only
18167    //    instructions with the same/alternate opcodes.
18168    // 3. Make a final attempt to vectorize all instructions with the
18169    //    same/alternate ops only; this may result in some extra final
18170    //    vectorization.
18171 if (NumElts > 1 &&
18172 TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
18173      // Success, start over because instructions might have been changed.
18174 Changed = true;
18175 } else {
18176 /// \Returns the minimum number of elements that we will attempt to
18177 /// vectorize.
18178 auto GetMinNumElements = [&R](Value *V) {
18179 unsigned EltSize = R.getVectorElementSize(V);
18180 return std::max(2U, R.getMaxVecRegSize() / EltSize);
18181 };
18182 if (NumElts < GetMinNumElements(*IncIt) &&
18183 (Candidates.empty() ||
18184 Candidates.front()->getType() == (*IncIt)->getType())) {
18185 Candidates.append(IncIt, std::next(IncIt, NumElts));
18186 }
18187 }
18188 // Final attempt to vectorize instructions with the same types.
18189 if (Candidates.size() > 1 &&
18190 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18191 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
18192        // Success, start over because instructions might have been changed.
18193 Changed = true;
18194 } else if (MaxVFOnly) {
18195 // Try to vectorize using small vectors.
18196 for (auto *It = Candidates.begin(), *End = Candidates.end();
18197 It != End;) {
18198 auto *SameTypeIt = It;
18199 while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
18200 ++SameTypeIt;
18201 unsigned NumElts = (SameTypeIt - It);
18202 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
18203 /*MaxVFOnly=*/false))
18204 Changed = true;
18205 It = SameTypeIt;
18206 }
18207 }
18208 Candidates.clear();
18209 }
18210
18211 // Start over at the next instruction of a different type (or the end).
18212 IncIt = SameTypeIt;
18213 }
18214 return Changed;
18215}
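// For illustration, given hypothetical compatible runs {add1, add2} and
// {sub1, sub2} of the same type, each run is first tried on its own with
// MaxVFOnly; runs that fail and are narrower than a full vector register are
// accumulated in Candidates and retried together once a value of a different
// type is reached, and, if that also fails with MaxVFOnly set, retried per
// compatible sub-group with the restriction lifted.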
18216
18217/// Compare two cmp instructions. If IsCompatibility is true, the function
18218/// returns true if the 2 cmps have the same/swapped predicates and compatible
18219/// corresponding operands. If IsCompatibility is false, the function implements
18220/// a strict weak ordering relation between two cmp instructions, returning true
18221/// if the first instruction is "less" than the second, i.e. its predicate is
18222/// less than the predicate of the second or the operand IDs are less than the
18223/// operand IDs of the second cmp instruction.
18224template <bool IsCompatibility>
18225static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
18226 const DominatorTree &DT) {
18227 assert(isValidElementType(V->getType()) &&
18228 isValidElementType(V2->getType()) &&
18229 "Expected valid element types only.");
18230 if (V == V2)
18231 return IsCompatibility;
18232 auto *CI1 = cast<CmpInst>(V);
18233 auto *CI2 = cast<CmpInst>(V2);
18234 if (CI1->getOperand(0)->getType()->getTypeID() <
18235 CI2->getOperand(0)->getType()->getTypeID())
18236 return !IsCompatibility;
18237 if (CI1->getOperand(0)->getType()->getTypeID() >
18238 CI2->getOperand(0)->getType()->getTypeID())
18239 return false;
18240 CmpInst::Predicate Pred1 = CI1->getPredicate();
18241 CmpInst::Predicate Pred2 = CI2->getPredicate();
18242  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
18243  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
18244  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
18245 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
18246 if (BasePred1 < BasePred2)
18247 return !IsCompatibility;
18248 if (BasePred1 > BasePred2)
18249 return false;
18250 // Compare operands.
18251 bool CI1Preds = Pred1 == BasePred1;
18252 bool CI2Preds = Pred2 == BasePred1;
18253 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
18254 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
18255 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
18256 if (Op1 == Op2)
18257 continue;
18258 if (Op1->getValueID() < Op2->getValueID())
18259 return !IsCompatibility;
18260 if (Op1->getValueID() > Op2->getValueID())
18261 return false;
18262 if (auto *I1 = dyn_cast<Instruction>(Op1))
18263 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
18264 if (IsCompatibility) {
18265 if (I1->getParent() != I2->getParent())
18266 return false;
18267 } else {
18268 // Try to compare nodes with same parent.
18269 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
18270 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
18271 if (!NodeI1)
18272 return NodeI2 != nullptr;
18273 if (!NodeI2)
18274 return false;
18275 assert((NodeI1 == NodeI2) ==
18276 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18277 "Different nodes should have different DFS numbers");
18278 if (NodeI1 != NodeI2)
18279 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18280 }
18281 InstructionsState S = getSameOpcode({I1, I2}, TLI);
18282 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18283 continue;
18284 if (IsCompatibility)
18285 return false;
18286 if (I1->getOpcode() != I2->getOpcode())
18287 return I1->getOpcode() < I2->getOpcode();
18288 }
18289 }
18290 return IsCompatibility;
18291}
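// For illustration, for two hypothetical compares
//   %c1 = icmp slt i32 %a, %b
//   %c2 = icmp sgt i32 %b, %a
// compareCmp<true> reports them as compatible (sgt is the swapped form of
// slt, and the operands match once one side is reversed), while
// compareCmp<false> treats them as equivalent for sorting, ordering neither
// before the other.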
18292
18293template <typename ItT>
18294bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
18295 BasicBlock *BB, BoUpSLP &R) {
18296 bool Changed = false;
18297 // Try to find reductions first.
18298 for (CmpInst *I : CmpInsts) {
18299 if (R.isDeleted(I))
18300 continue;
18301 for (Value *Op : I->operands())
18302 if (auto *RootOp = dyn_cast<Instruction>(Op))
18303 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
18304 }
18305 // Try to vectorize operands as vector bundles.
18306 for (CmpInst *I : CmpInsts) {
18307 if (R.isDeleted(I))
18308 continue;
18309 Changed |= tryToVectorize(I, R);
18310 }
18311 // Try to vectorize list of compares.
18312 // Sort by type, compare predicate, etc.
18313 auto CompareSorter = [&](Value *V, Value *V2) {
18314 if (V == V2)
18315 return false;
18316 return compareCmp<false>(V, V2, *TLI, *DT);
18317 };
18318
18319 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
18320 if (V1 == V2)
18321 return true;
18322 return compareCmp<true>(V1, V2, *TLI, *DT);
18323 };
18324
18325  SmallVector<Value *> Vals;
18326  for (Instruction *V : CmpInsts)
18327 if (!R.isDeleted(V) && isValidElementType(V->getType()))
18328 Vals.push_back(V);
18329 if (Vals.size() <= 1)
18330 return Changed;
18331 Changed |= tryToVectorizeSequence<Value>(
18332 Vals, CompareSorter, AreCompatibleCompares,
18333 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18334 // Exclude possible reductions from other blocks.
18335 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
18336 return any_of(V->users(), [V](User *U) {
18337 auto *Select = dyn_cast<SelectInst>(U);
18338 return Select &&
18339 Select->getParent() != cast<Instruction>(V)->getParent();
18340 });
18341 });
18342 if (ArePossiblyReducedInOtherBlock)
18343 return false;
18344 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18345 },
18346 /*MaxVFOnly=*/true, R);
18347 return Changed;
18348}
18349
18350bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18351 BasicBlock *BB, BoUpSLP &R) {
18352 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18353 "This function only accepts Insert instructions");
18354 bool OpsChanged = false;
18355 SmallVector<WeakTrackingVH> PostponedInsts;
18356 // pass1 - try to vectorize reductions only
18357 for (auto *I : reverse(Instructions)) {
18358 if (R.isDeleted(I))
18359 continue;
18360 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
18361 }
18362 // pass2 - try to match and vectorize a buildvector sequence.
18363 for (auto *I : reverse(Instructions)) {
18364 if (R.isDeleted(I) || isa<CmpInst>(I))
18365 continue;
18366 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18367 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
18368 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18369 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
18370 }
18371 }
18372 // Now try to vectorize postponed instructions.
18373 OpsChanged |= tryToVectorize(PostponedInsts, R);
18374
18375 Instructions.clear();
18376 return OpsChanged;
18377}
18378
18379bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
18380 bool Changed = false;
18381  SmallVector<Value *, 4> Incoming;
18382  SmallPtrSet<Value *, 16> VisitedInstrs;
18383 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
18384  // node. This makes it easier to identify chains that can be
18385  // vectorized more effectively.
18386  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
18387  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
18388    assert(isValidElementType(V1->getType()) &&
18389           isValidElementType(V2->getType()) &&
18390 "Expected vectorizable types only.");
18391 // It is fine to compare type IDs here, since we expect only vectorizable
18392 // types, like ints, floats and pointers, we don't care about other type.
18393 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
18394 return true;
18395 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
18396 return false;
18397 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18398 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18399 if (Opcodes1.size() < Opcodes2.size())
18400 return true;
18401 if (Opcodes1.size() > Opcodes2.size())
18402 return false;
18403 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18404 {
18405 // Instructions come first.
18406 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
18407 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
18408 if (I1 && I2) {
18409 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
18410 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
18411 if (!NodeI1)
18412 return NodeI2 != nullptr;
18413 if (!NodeI2)
18414 return false;
18415 assert((NodeI1 == NodeI2) ==
18416 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18417 "Different nodes should have different DFS numbers");
18418 if (NodeI1 != NodeI2)
18419 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18420 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18421 if (S.getOpcode() && !S.isAltShuffle())
18422 continue;
18423 return I1->getOpcode() < I2->getOpcode();
18424 }
18425 if (I1)
18426 return true;
18427 if (I2)
18428 return false;
18429 }
18430 {
18431 // Non-undef constants come next.
18432 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
18433 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
18434 if (C1 && C2)
18435 continue;
18436 if (C1)
18437 return true;
18438 if (C2)
18439 return false;
18440 }
18441 bool U1 = isa<UndefValue>(Opcodes1[I]);
18442 bool U2 = isa<UndefValue>(Opcodes2[I]);
18443 {
18444 // Non-constant non-instructions come next.
18445 if (!U1 && !U2) {
18446 auto ValID1 = Opcodes1[I]->getValueID();
18447 auto ValID2 = Opcodes2[I]->getValueID();
18448 if (ValID1 == ValID2)
18449 continue;
18450 if (ValID1 < ValID2)
18451 return true;
18452 if (ValID1 > ValID2)
18453 return false;
18454 }
18455 if (!U1)
18456 return true;
18457 if (!U2)
18458 return false;
18459 }
18460 // Undefs come last.
18461 assert(U1 && U2 && "The only thing left should be undef & undef.");
18462 continue;
18463 }
18464 return false;
18465 };
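  // For illustration, with hypothetical phis %p1 and %p2 of the same type
  // whose collected feeding values are {%add, i32 1} and {%add, undef}, the
  // comparator above orders %p1 before %p2: the shared %add compares equal,
  // and a non-undef constant sorts before an undef value.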
18466 auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
18467 if (V1 == V2)
18468 return true;
18469 if (V1->getType() != V2->getType())
18470 return false;
18471 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18472 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18473 if (Opcodes1.size() != Opcodes2.size())
18474 return false;
18475 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18476 // Undefs are compatible with any other value.
18477 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
18478 continue;
18479 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
18480 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
18481 if (I1->getParent() != I2->getParent())
18482 return false;
18483 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18484 if (S.getOpcode())
18485 continue;
18486 return false;
18487 }
18488 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
18489 continue;
18490 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
18491 return false;
18492 }
18493 return true;
18494 };
18495
18496 bool HaveVectorizedPhiNodes = false;
18497 do {
18498 // Collect the incoming values from the PHIs.
18499 Incoming.clear();
18500 for (Instruction &I : *BB) {
18501 auto *P = dyn_cast<PHINode>(&I);
18502 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
18503 break;
18504
18505 // No need to analyze deleted, vectorized and non-vectorizable
18506 // instructions.
18507 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
18508 isValidElementType(P->getType()))
18509 Incoming.push_back(P);
18510 }
18511
18512 if (Incoming.size() <= 1)
18513 break;
18514
18515 // Find the corresponding non-phi nodes for better matching when trying to
18516 // build the tree.
18517 for (Value *V : Incoming) {
18518 SmallVectorImpl<Value *> &Opcodes =
18519 PHIToOpcodes.try_emplace(V).first->getSecond();
18520 if (!Opcodes.empty())
18521 continue;
18522 SmallVector<Value *, 4> Nodes(1, V);
18523      SmallPtrSet<PHINode *, 4> Visited;
18524      while (!Nodes.empty()) {
18525 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
18526 if (!Visited.insert(PHI).second)
18527 continue;
18528 for (Value *V : PHI->incoming_values()) {
18529 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
18530 Nodes.push_back(PHI1);
18531 continue;
18532 }
18533 Opcodes.emplace_back(V);
18534 }
18535 }
18536 }
18537
18538 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18539 Incoming, PHICompare, AreCompatiblePHIs,
18540 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18541 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18542 },
18543 /*MaxVFOnly=*/true, R);
18544 Changed |= HaveVectorizedPhiNodes;
18545 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
18546 } while (HaveVectorizedPhiNodes);
18547
18548 VisitedInstrs.clear();
18549
18550 InstSetVector PostProcessInserts;
18551 SmallSetVector<CmpInst *, 8> PostProcessCmps;
18552  // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
18553  // also vectorizes `PostProcessCmps`.
18554 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
18555 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
18556 if (VectorizeCmps) {
18557 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
18558 PostProcessCmps.clear();
18559 }
18560 PostProcessInserts.clear();
18561 return Changed;
18562 };
18563 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
18564 auto IsInPostProcessInstrs = [&](Instruction *I) {
18565 if (auto *Cmp = dyn_cast<CmpInst>(I))
18566 return PostProcessCmps.contains(Cmp);
18567 return isa<InsertElementInst, InsertValueInst>(I) &&
18568 PostProcessInserts.contains(I);
18569 };
18570  // Returns true if `I` is an instruction without users, such as a terminator,
18571  // a store, or a function call with an ignored return value. Detection is based
18572  // on the instruction type, except for CallInst and InvokeInst.
18573 auto HasNoUsers = [](Instruction *I) {
18574 return I->use_empty() &&
18575 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
18576 };
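  // For illustration, a store, a branch, or a hypothetical call such as
  //   call void @foo()
  // all satisfy HasNoUsers, while an unused add does not: its result type is
  // not void and it is not a call or invoke.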
18577 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
18578    // Skip instructions with a scalable type. The number of elements is
18579    // unknown at compile time for scalable types.
18580 if (isa<ScalableVectorType>(It->getType()))
18581 continue;
18582
18583    // Skip instructions marked for deletion.
18584 if (R.isDeleted(&*It))
18585 continue;
18586    // We may go through BB multiple times so skip the ones already checked.
18587 if (!VisitedInstrs.insert(&*It).second) {
18588 if (HasNoUsers(&*It) &&
18589 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
18590 // We would like to start over since some instructions are deleted
18591        // and the iterator may become invalid.
18592 Changed = true;
18593 It = BB->begin();
18594 E = BB->end();
18595 }
18596 continue;
18597 }
18598
18599 if (isa<DbgInfoIntrinsic>(It))
18600 continue;
18601
18602 // Try to vectorize reductions that use PHINodes.
18603 if (PHINode *P = dyn_cast<PHINode>(It)) {
18604 // Check that the PHI is a reduction PHI.
18605 if (P->getNumIncomingValues() == 2) {
18606 // Try to match and vectorize a horizontal reduction.
18607 Instruction *Root = getReductionInstr(DT, P, BB, LI);
18608 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
18609 Changed = true;
18610 It = BB->begin();
18611 E = BB->end();
18612 continue;
18613 }
18614 }
18615 // Try to vectorize the incoming values of the PHI, to catch reductions
18616 // that feed into PHIs.
18617 for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
18618 // Skip if the incoming block is the current BB for now. Also, bypass
18619 // unreachable IR for efficiency and to avoid crashing.
18620 // TODO: Collect the skipped incoming values and try to vectorize them
18621 // after processing BB.
18622 if (BB == P->getIncomingBlock(I) ||
18623 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
18624 continue;
18625
18626 // Postponed instructions should not be vectorized here, delay their
18627 // vectorization.
18628 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
18629 PI && !IsInPostProcessInstrs(PI))
18630 Changed |= vectorizeRootInstruction(nullptr, PI,
18631 P->getIncomingBlock(I), R, TTI);
18632 }
18633 continue;
18634 }
18635
18636 if (HasNoUsers(&*It)) {
18637 bool OpsChanged = false;
18638 auto *SI = dyn_cast<StoreInst>(It);
18639 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
18640 if (SI) {
18641 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
18642 // Try to vectorize chain in store, if this is the only store to the
18643 // address in the block.
18644      // TODO: This is just a temporary solution to save compile time. Need
18645 // to investigate if we can safely turn on slp-vectorize-hor-store
18646 // instead to allow lookup for reduction chains in all non-vectorized
18647 // stores (need to check side effects and compile time).
18648 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
18649 SI->getValueOperand()->hasOneUse();
18650 }
18651 if (TryToVectorizeRoot) {
18652 for (auto *V : It->operand_values()) {
18653 // Postponed instructions should not be vectorized here, delay their
18654 // vectorization.
18655 if (auto *VI = dyn_cast<Instruction>(V);
18656 VI && !IsInPostProcessInstrs(VI))
18657 // Try to match and vectorize a horizontal reduction.
18658 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
18659 }
18660 }
18661 // Start vectorization of post-process list of instructions from the
18662 // top-tree instructions to try to vectorize as many instructions as
18663 // possible.
18664 OpsChanged |=
18665 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
18666 if (OpsChanged) {
18667 // We would like to start over since some instructions are deleted
18668        // and the iterator may become invalid.
18669 Changed = true;
18670 It = BB->begin();
18671 E = BB->end();
18672 continue;
18673 }
18674 }
18675
18676 if (isa<InsertElementInst, InsertValueInst>(It))
18677 PostProcessInserts.insert(&*It);
18678 else if (isa<CmpInst>(It))
18679 PostProcessCmps.insert(cast<CmpInst>(&*It));
18680 }
18681
18682 return Changed;
18683}
18684
18685bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
18686 auto Changed = false;
18687 for (auto &Entry : GEPs) {
18688 // If the getelementptr list has fewer than two elements, there's nothing
18689 // to do.
18690 if (Entry.second.size() < 2)
18691 continue;
18692
18693 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
18694 << Entry.second.size() << ".\n");
18695
18696 // Process the GEP list in chunks suitable for the target's supported
18697 // vector size. If a vector register can't hold 1 element, we are done. We
18698 // are trying to vectorize the index computations, so the maximum number of
18699 // elements is based on the size of the index expression, rather than the
18700 // size of the GEP itself (the target's pointer size).
18701 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18702 unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
18703 if (MaxVecRegSize < EltSize)
18704 continue;
18705
18706 unsigned MaxElts = MaxVecRegSize / EltSize;
18707 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
18708 auto Len = std::min<unsigned>(BE - BI, MaxElts);
18709 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
18710
18711      // Initialize a set of candidate getelementptrs. Note that we use a
18712 // SetVector here to preserve program order. If the index computations
18713 // are vectorizable and begin with loads, we want to minimize the chance
18714 // of having to reorder them later.
18715 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
18716
18717 // Some of the candidates may have already been vectorized after we
18718      // initially collected them, or their index was optimized to a constant.
18719 // If so, they are marked as deleted, so remove them from the set of
18720 // candidates.
18721 Candidates.remove_if([&R](Value *I) {
18722 return R.isDeleted(cast<Instruction>(I)) ||
18723 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
18724 });
18725
18726 // Remove from the set of candidates all pairs of getelementptrs with
18727 // constant differences. Such getelementptrs are likely not good
18728 // candidates for vectorization in a bottom-up phase since one can be
18729 // computed from the other. We also ensure all candidate getelementptr
18730 // indices are unique.
18731 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
18732 auto *GEPI = GEPList[I];
18733 if (!Candidates.count(GEPI))
18734 continue;
18735 auto *SCEVI = SE->getSCEV(GEPList[I]);
18736 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
18737 auto *GEPJ = GEPList[J];
18738 auto *SCEVJ = SE->getSCEV(GEPList[J]);
18739 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
18740 Candidates.remove(GEPI);
18741 Candidates.remove(GEPJ);
18742 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
18743 Candidates.remove(GEPJ);
18744 }
18745 }
18746 }
18747
18748 // We break out of the above computation as soon as we know there are
18749 // fewer than two candidates remaining.
18750 if (Candidates.size() < 2)
18751 continue;
18752
18753 // Add the single, non-constant index of each candidate to the bundle. We
18754 // ensured the indices met these constraints when we originally collected
18755 // the getelementptrs.
18756 SmallVector<Value *, 16> Bundle(Candidates.size());
18757 auto BundleIndex = 0u;
18758 for (auto *V : Candidates) {
18759 auto *GEP = cast<GetElementPtrInst>(V);
18760 auto *GEPIdx = GEP->idx_begin()->get();
18761 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
18762 Bundle[BundleIndex++] = GEPIdx;
18763 }
18764
18765 // Try and vectorize the indices. We are currently only interested in
18766 // gather-like cases of the form:
18767 //
18768 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
18769 //
18770 // where the loads of "a", the loads of "b", and the subtractions can be
18771 // performed in parallel. It's likely that detecting this pattern in a
18772 // bottom-up phase will be simpler and less costly than building a
18773 // full-blown top-down phase beginning at the consecutive loads.
18774 Changed |= tryToVectorizeList(Bundle, R);
18775 }
18776 }
18777 return Changed;
18778}
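// For illustration, two hypothetical getelementptrs
//   %g0 = getelementptr inbounds i32, ptr %base, i64 %i
//   %g1 = getelementptr inbounds i32, ptr %base, i64 %j
// survive the SCEV-based pruning above only if %i - %j is not a compile-time
// constant; otherwise one address is computable from the other and both are
// removed from the candidate set.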
18779
18780bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
18781 bool Changed = false;
18782  // Sort by type, base pointer and value operand. Value operands must be
18783 // compatible (have the same opcode, same parent), otherwise it is
18784 // definitely not profitable to try to vectorize them.
18785 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
18786 if (V->getValueOperand()->getType()->getTypeID() <
18787 V2->getValueOperand()->getType()->getTypeID())
18788 return true;
18789 if (V->getValueOperand()->getType()->getTypeID() >
18790 V2->getValueOperand()->getType()->getTypeID())
18791 return false;
18792 if (V->getPointerOperandType()->getTypeID() <
18793 V2->getPointerOperandType()->getTypeID())
18794 return true;
18795 if (V->getPointerOperandType()->getTypeID() >
18796 V2->getPointerOperandType()->getTypeID())
18797 return false;
18798 // UndefValues are compatible with all other values.
18799 if (isa<UndefValue>(V->getValueOperand()) ||
18800 isa<UndefValue>(V2->getValueOperand()))
18801 return false;
18802 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
18803 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18804        DomTreeNodeBase<BasicBlock> *NodeI1 =
18805            DT->getNode(I1->getParent());
18806        DomTreeNodeBase<BasicBlock> *NodeI2 =
18807            DT->getNode(I2->getParent());
18808 assert(NodeI1 && "Should only process reachable instructions");
18809 assert(NodeI2 && "Should only process reachable instructions");
18810 assert((NodeI1 == NodeI2) ==
18811 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18812 "Different nodes should have different DFS numbers");
18813 if (NodeI1 != NodeI2)
18814 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18815 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18816 if (S.getOpcode())
18817 return false;
18818 return I1->getOpcode() < I2->getOpcode();
18819 }
18820 if (isa<Constant>(V->getValueOperand()) &&
18821 isa<Constant>(V2->getValueOperand()))
18822 return false;
18823 return V->getValueOperand()->getValueID() <
18824 V2->getValueOperand()->getValueID();
18825 };
18826
18827 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
18828 if (V1 == V2)
18829 return true;
18830 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
18831 return false;
18832 if (V1->getPointerOperandType() != V2->getPointerOperandType())
18833 return false;
18834 // Undefs are compatible with any other value.
18835 if (isa<UndefValue>(V1->getValueOperand()) ||
18836 isa<UndefValue>(V2->getValueOperand()))
18837 return true;
18838 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
18839 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18840 if (I1->getParent() != I2->getParent())
18841 return false;
18842 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18843 return S.getOpcode() > 0;
18844 }
18845 if (isa<Constant>(V1->getValueOperand()) &&
18846 isa<Constant>(V2->getValueOperand()))
18847 return true;
18848 return V1->getValueOperand()->getValueID() ==
18849 V2->getValueOperand()->getValueID();
18850 };
18851
18852 // Attempt to sort and vectorize each of the store-groups.
18853  DenseSet<std::pair<int, int>> Attempted;
18854  for (auto &Pair : Stores) {
18855 if (Pair.second.size() < 2)
18856 continue;
18857
18858 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
18859 << Pair.second.size() << ".\n");
18860
18861 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
18862 continue;
18863
18864    // Reverse stores to do bottom-to-top analysis. This is important if the
18865    // values are stored to the same addresses several times; in this case we
18866    // need to follow the store order (reversed to meet the memory dependencies).
18867 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
18868 Pair.second.rend());
18869 Changed |= tryToVectorizeSequence<StoreInst>(
18870 ReversedStores, StoreSorter, AreCompatibleStores,
18871 [&](ArrayRef<StoreInst *> Candidates, bool) {
18872 return vectorizeStores(Candidates, R, Attempted);
18873 },
18874 /*MaxVFOnly=*/false, R);
18875 }
18876 return Changed;
18877}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:537
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1500
Loop::LoopBounds::Direction Direction
Definition: LoopInfo.cpp:231
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(VerifyEach)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:77
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1309
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:179
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:265
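As an illustration of the APInt helpers listed above, a minimal sketch (the function and lane counts are purely hypothetical, not taken from this pass):

  #include "llvm/ADT/APInt.h"

  // Build an 8-bit demanded-lanes mask: lane 2 plus lanes 4..7.
  llvm::APInt demandedLanesDemo() {
    llvm::APInt Demanded = llvm::APInt::getZero(8);        // 0b00000000
    Demanded.setBit(2);                                    // 0b00000100
    llvm::APInt Upper = llvm::APInt::getBitsSetFrom(8, 4); // 0b11110000
    return Demanded | Upper;                               // 0b11110100
  }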
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:424
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:187
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:174
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:228
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
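A small, self-contained sketch of how the ArrayRef slicing helpers above are typically combined (the function and its purpose are illustrative only):

  #include "llvm/ADT/ArrayRef.h"

  // Peel the first scalar off a candidate list and look at the next two.
  int headPlusNextTwo(llvm::ArrayRef<int> VL) {
    if (VL.empty())
      return 0;
    int Sum = VL.front();
    for (int V : VL.drop_front().take_front(2)) // safe even if fewer remain
      Sum += V;
    return Sum;
  }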
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:232
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:451
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:438
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:169
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
reverse_iterator rend()
Definition: BasicBlock.h:456
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:167
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
Definition: BasicBlock.h:229
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2070
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1965
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
Definition: InstrTypes.h:1465
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2207
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2064
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1323
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1401
unsigned arg_size() const
Definition: InstrTypes.h:1408
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2061
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:530
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:747
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1104
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:787
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:781
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:785
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:909
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:871
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:847
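A minimal sketch of the predicate helpers above, as they might be used when commuting or negating a comparison (the demo function is illustrative only):

  #include "llvm/IR/InstrTypes.h"

  void predicateDemo() {
    // Swapping the operands of "a < b" requires the swapped predicate "a > b".
    llvm::CmpInst::Predicate Swapped =
        llvm::CmpInst::getSwappedPredicate(llvm::CmpInst::ICMP_SLT); // ICMP_SGT
    // Negating "a < b" requires the inverse predicate "a >= b".
    llvm::CmpInst::Predicate Inverted =
        llvm::CmpInst::getInversePredicate(llvm::CmpInst::ICMP_SLT); // ICMP_SGE
    (void)Swapped;
    (void)Inverted;
  }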
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2231
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:155
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1450
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1399
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a multiple of 8.
Definition: DataLayout.h:484
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:905
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
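A minimal sketch of the two size queries above; the store size is rounded up to a whole number of bytes, while the bit size is exact (the empty layout string and the demo function are assumptions for illustration):

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"

  void sizeQueryDemo() {
    llvm::LLVMContext Ctx;
    llvm::DataLayout DL("");                        // default data layout
    llvm::Type *I1 = llvm::Type::getInt1Ty(Ctx);
    llvm::TypeSize Bits = DL.getTypeSizeInBits(I1);           // 1 bit
    llvm::TypeSize StoreBits = DL.getTypeStoreSizeInBits(I1); // 8 bits
    (void)Bits;
    (void)StoreBits;
  }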
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
value_type & FindAndConstruct(const KeyT &Key)
Definition: DenseMap.h:364
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
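A small sketch of the typical try_emplace pattern with the DenseMap API above (the cache and the helper are hypothetical):

  #include "llvm/ADT/DenseMap.h"
  #include "llvm/IR/Value.h"

  // Cache a per-value bit width, computing it only on first use.
  unsigned getCachedWidth(llvm::DenseMap<const llvm::Value *, unsigned> &Widths,
                          const llvm::Value *V, unsigned ComputedWidth) {
    auto [It, Inserted] = Widths.try_emplace(V, ComputedWidth);
    return It->second; // cached value if already present, ComputedWidth otherwise
  }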
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Type * getReturnType() const
Definition: DerivedTypes.h:124
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:914
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:92
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2255
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:509
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2458
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:537
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2263
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1805
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:464
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1090
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:173
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2533
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:172
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:309
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:218
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1864
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:846
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1751
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:484
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2364
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2395
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2247
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2492
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:469
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1664
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:167
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2271
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2159
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2194
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:178
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1824
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2410
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1585
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1359
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:630
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2664
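For context, a minimal sketch of pointing an IRBuilder at a block and asking it for a shuffle, roughly the way final shuffles are materialised; the helper itself is hypothetical:

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"

  // Emit a lane-reversal shuffle of a <4 x i32> value at the end of BB.
  llvm::Value *emitReverseShuffle(llvm::BasicBlock *BB, llvm::Value *Vec) {
    llvm::IRBuilder<> Builder(BB);            // insert at the end of BB
    int ReverseMask[] = {3, 2, 1, 0};
    return Builder.CreateShuffleVector(
        Vec, llvm::PoisonValue::get(Vec->getType()), ReverseMask, "rev");
  }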
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:282
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:764
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:476
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
bool isBinaryOp() const
Definition: Instruction.h:279
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this', or nullptr if no such instruction exists.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:280
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:173
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:258
Value * getPointerOperand()
Definition: Instructions.h:252
bool isSimple() const
Definition: Instructions.h:244
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:208
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:571
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
Definition: MapVector.h:64
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory referenced by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:307
T & front() const
front - Get the first element.
Definition: ArrayRef.h:363
iterator end() const
Definition: ArrayRef.h:357
iterator begin() const
Definition: ArrayRef.h:356
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:376
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:449
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
Definition: DerivedTypes.h:662
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:155
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1814
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
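These static classifiers operate on plain mask arrays, so shuffle kinds can be recognised without ever creating an instruction; a minimal sketch (literal masks and the demo function are illustrative only):

  #include "llvm/IR/Instructions.h"
  #include <cassert>

  void maskKindDemo() {
    int Identity[] = {0, 1, 2, 3};
    int Reverse[] = {3, 2, 1, 0};
    assert(llvm::ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4));
    assert(llvm::ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4));
  }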
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
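A tiny sketch of SmallBitVector used as an opcode mask of the kind isLegalAltInstr expects, recording which lanes take the alternate opcode (the demo function is illustrative):

  #include "llvm/ADT/SmallBitVector.h"

  void opcodeMaskDemo() {
    llvm::SmallBitVector OpcodeMask(4, false);    // four lanes, all main opcode
    OpcodeMask.set(1);                            // lane 1 uses the alternate opcode
    OpcodeMask.set(3);                            // lane 3 uses the alternate opcode
    bool HasAlternate = OpcodeMask.any();         // true
    int FirstAlternate = OpcodeMask.find_first(); // 1
    (void)HasAlternate;
    (void)FirstAlternate;
  }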
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
Definition: SmallPtrSet.h:323
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:361
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:418
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string.
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:981
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:289
Type * getPointerOperandType() const
Definition: Instructions.h:379
Value * getValueOperand()
Definition: Instructions.h:373
Value * getPointerOperand()
Definition: Instructions.h:376
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores within the same block).
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
OperandValueKind
Additional information about an operand's possible values.
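A rough sketch of the kind of query a vectorization cost model issues against this interface: compare one vector add with the equivalent number of scalar adds (the helper and its use of a caller-provided TTI reference are assumptions for illustration):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Instruction.h"

  llvm::InstructionCost
  vectorMinusScalarAddCost(const llvm::TargetTransformInfo &TTI,
                           llvm::FixedVectorType *VecTy) {
    auto CostKind = llvm::TargetTransformInfo::TCK_RecipThroughput;
    llvm::InstructionCost VecCost =
        TTI.getArithmeticInstrCost(llvm::Instruction::Add, VecTy, CostKind);
    llvm::InstructionCost ScalarCost =
        TTI.getArithmeticInstrCost(llvm::Instruction::Add,
                                   VecTy->getElementType(), CostKind) *
        VecTy->getNumElements();
    return VecCost - ScalarCost; // negative favours the vector form
  }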
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:160
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:287
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:166
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1795
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Definition: User.h:73
op_iterator op_begin()
Definition: User.h:234
Value * getOperand(unsigned i) const
Definition: User.h:169
iterator_range< value_op_iterator > operand_values()
Definition: User.h:266
The Vector Function Database.
Definition: VectorUtils.h:30
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:71
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads by increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks whether the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in the UserIgnoreLst.
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
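Taken together, these entry points are driven in a fairly fixed sequence by the pass itself. The following is only a rough, simplified sketch of that sequence, assuming it lives inside this translation unit (where BoUpSLP is defined); CostThreshold is a placeholder for the pass's cost cut-off, and many checks, reorderings and debug paths are omitted:

  // Simplified sketch, assuming R is an already-constructed BoUpSLP and VL the
  // candidate scalars; not the pass's exact control flow.
  llvm::InstructionCost tryVectorizeList(llvm::slpvectorizer::BoUpSLP &R,
                                         llvm::ArrayRef<llvm::Value *> VL,
                                         int CostThreshold) {
    R.buildTree(VL, /*UserIgnoreLst=*/{});
    if (R.isTreeTinyAndNotFullyVectorizable())
      return llvm::InstructionCost::getInvalid();
    R.reorderTopToBottom();
    R.reorderBottomToTop();
    R.buildExternalUses();
    R.computeMinimumValueSizes();
    llvm::InstructionCost Cost = R.getTreeCost();
    if (Cost.isValid() && Cost < -CostThreshold)
      R.vectorizeTree(); // emit the vector code for the profitable tree
    return Cost;
  }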
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:105
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:811
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1484
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
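The matchers above compose into declarative patterns; a minimal sketch of recognising a shifted add and capturing its operands (the helper is illustrative, not from this pass):

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"

  // Match "(X + Y) << C" where C is a constant, binding X, Y and C.
  bool matchShiftedAdd(llvm::Value *V, llvm::Value *&X, llvm::Value *&Y,
                       const llvm::APInt *&ShiftAmt) {
    using namespace llvm::PatternMatch;
    return match(V, m_Shl(m_Add(m_Value(X), m_Value(Y)), m_APInt(ShiftAmt)));
  }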
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1715
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:128
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:950
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:540
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:431
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
Definition: LoopUtils.cpp:1154
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
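A short sketch combining the range helpers above (all_of and enumerate), as they are typically used in place of explicit iterator loops (both functions are illustrative):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/raw_ostream.h"

  // True when every candidate scalar is non-null.
  bool allNonNull(llvm::ArrayRef<llvm::Value *> VL) {
    return llvm::all_of(VL, [](const llvm::Value *V) { return V != nullptr; });
  }

  // Print each lane with its index.
  void printLanes(llvm::ArrayRef<int> Lanes) {
    for (const auto &En : llvm::enumerate(Lanes))
      llvm::errs() << En.index() << " -> " << En.value() << "\n";
  }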
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:50
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7095
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition: STLExtras.h:656
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Propagates metadata that is common to all of the scalar values in VL onto the instruction I. Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
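A small worked example of llvm::bit_ceil (a sketch, not from this file):
  #include "llvm/ADT/bit.h"
  #include <cassert>

  void bitCeilExample() {
    // Round a lane count of 5 up to the next power of two.
    assert(llvm::bit_ceil(5u) == 8u);
    // A power of two is returned unchanged.
    assert(llvm::bit_ceil(8u) == 8u);
  }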
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2059
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
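A minimal sketch of llvm::transform (illustrative names, not from this file):
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include <iterator>

  void transformExample() {
    llvm::SmallVector<int, 4> In = {1, 2, 3, 4};
    llvm::SmallVector<int, 4> Out;
    // Squares each element of In and appends the results to Out.
    llvm::transform(In, std::back_inserter(Out), [](int X) { return X * X; });
  }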
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
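A small worked example of llvm::Log2_32 (a sketch, not from this file):
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  void log2Example() {
    assert(llvm::Log2_32(32) == 5);  // Exact power of two.
    assert(llvm::Log2_32(40) == 5);  // Floor of log2(40).
  }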
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1754
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:120
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:419
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1223
constexpr int PoisonMaskElem
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
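A small worked example of llvm::alignTo with an Align value (a sketch, not from this file):
  #include "llvm/Support/Alignment.h"
  #include <cassert>

  void alignToExample() {
    // 10 bytes rounded up to the next 8-byte boundary.
    assert(llvm::alignTo(10, llvm::Align(8)) == 16);
    // An already-aligned size is unchanged.
    assert(llvm::alignTo(16, llvm::Align(8)) == 16);
  }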
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Definition: STLExtras.h:1986
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
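A minimal sketch of llvm::is_contained (illustrative names, not from this file):
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  // True if any element of V equals Needle (simple linear search).
  static bool hasValue(const llvm::SmallVectorImpl<int> &V, int Needle) {
    return llvm::is_contained(V, Needle);
  }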
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
InstructionCost Cost
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:483
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:593
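A minimal sketch of llvm::hash_combine; the struct and helper below are hypothetical:
  #include "llvm/ADT/Hashing.h"

  struct LaneKey {
    unsigned TreeIdx;
    unsigned Lane;
  };

  // Folds both fields into a single hash_code, e.g. for use in a map key hash.
  static llvm::hash_code hashLaneKey(const LaneKey &K) {
    return llvm::hash_combine(K.TreeIdx, K.Lane);
  }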
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2228
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:97
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:220
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1459
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.