SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<bool>
117 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
118 cl::desc("Enable vectorization for wider vector utilization"));
119
120static cl::opt<int>
122 cl::desc("Only vectorize if you gain more than this "
123 "number "));
124
126 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
127 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
128 "heuristics and makes vectorization decision via cost modeling."));
129
130static cl::opt<bool>
131ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
132 cl::desc("Attempt to vectorize horizontal reductions"));
133
135 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
136 cl::desc(
137 "Attempt to vectorize horizontal reductions feeding into a store"));
138
139// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
140// even if we match a reduction but do not vectorize in the end.
142 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
143 cl::desc("Allow optimization of original scalar identity operations on "
144 "matched horizontal reductions."));
145
146static cl::opt<int>
148 cl::desc("Attempt to vectorize for this register size in bits"));
149
152 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
153
154/// Limits the size of scheduling regions in a block.
155/// It avoids long compile times for _very_ large blocks where vector
156/// instructions are spread over a wide range.
157/// This limit is way higher than needed by real-world functions.
158static cl::opt<int>
159ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
160 cl::desc("Limit the size of the SLP scheduling region per block"));
161
163 "slp-min-reg-size", cl::init(128), cl::Hidden,
164 cl::desc("Attempt to vectorize for this register size in bits"));
165
167 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
168 cl::desc("Limit the recursion depth when building a vectorizable tree"));
169
171 "slp-min-tree-size", cl::init(3), cl::Hidden,
172 cl::desc("Only vectorize small trees if they are fully vectorizable"));
173
174// The maximum depth that the look-ahead score heuristic will explore.
175// The higher this value, the higher the compilation time overhead.
177 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
178 cl::desc("The maximum look-ahead depth for operand reordering scores"));
179
180// The maximum depth that the look-ahead score heuristic will explore
181// when probing among candidates for vectorization tree roots.
182// The higher this value, the higher the compilation time overhead; but unlike
183// the similar limit for operand ordering, this one is used less frequently, so
184// the impact of a higher value is less noticeable.
186 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
187 cl::desc("The maximum look-ahead depth for searching best rooting option"));
188
190 "slp-min-strided-loads", cl::init(2), cl::Hidden,
191 cl::desc("The minimum number of loads, which should be considered strided, "
192 "if the stride is > 1 or is runtime value"));
193
195 "slp-max-stride", cl::init(8), cl::Hidden,
196 cl::desc("The maximum stride, considered to be profitable."));
197
198static cl::opt<bool>
199 ViewSLPTree("view-slp-tree", cl::Hidden,
200 cl::desc("Display the SLP trees with Graphviz"));
201
203 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
204 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
205
206// Limit the number of alias checks. The limit is chosen so that
207// it has no negative effect on the llvm benchmarks.
208static const unsigned AliasedCheckLimit = 10;
209
210// Limit of the number of uses for potentially transformed instructions/values,
211// used in checks to avoid a compile-time explosion.
212static constexpr int UsesLimit = 64;
213
214// Another limit for the alias checks: The maximum distance between load/store
215// instructions where alias checks are done.
216// This limit is useful for very large basic blocks.
217static const unsigned MaxMemDepDistance = 160;
218
219/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
220/// regions to be handled.
221static const int MinScheduleRegionSize = 16;
222
223/// Maximum allowed number of operands in the PHI nodes.
224static const unsigned MaxPHINumOperands = 128;
225
226/// Predicate for the element types that the SLP vectorizer supports.
227///
228/// The most important things to filter here are types which are invalid in LLVM
229/// vectors. We also filter target specific types which have absolutely no
230/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
231/// avoids spending time checking the cost model and realizing that they will
232/// be inevitably scalarized.
233static bool isValidElementType(Type *Ty) {
234 // TODO: Support ScalableVectorType.
235 if (SLPReVec && isa<FixedVectorType>(Ty))
236 Ty = Ty->getScalarType();
237 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
238 !Ty->isPPC_FP128Ty();
239}
240
241/// \returns the number of elements for Ty.
242static unsigned getNumElements(Type *Ty) {
243 assert(!isa<ScalableVectorType>(Ty) &&
244 "ScalableVectorType is not supported.");
245 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
246 return VecTy->getNumElements();
247 return 1;
248}
249
250/// \returns the vector type of ScalarTy based on vectorization factor.
251static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
252 return FixedVectorType::get(ScalarTy->getScalarType(),
253 VF * getNumElements(ScalarTy));
254}
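// Illustrative sketch (assumed inputs): getWidenedType() multiplies the
// element count of ScalarTy by VF, so a plain i32 with VF = 4 becomes
// <4 x i32>, while a revectorized <2 x float> scalar with VF = 4 becomes
// <8 x float>.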
255
256/// \returns True if the value is a constant (but not globals/constant
257/// expressions).
258static bool isConstant(Value *V) {
259 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
260}
261
262/// Checks if \p V is one of vector-like instructions, i.e. undef,
263/// insertelement/extractelement with constant indices for fixed vector type or
264/// extractvalue instruction.
265static bool isVectorLikeInstWithConstOps(Value *V) {
266 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
267 !isa<ExtractValueInst, UndefValue>(V))
268 return false;
269 auto *I = dyn_cast<Instruction>(V);
270 if (!I || isa<ExtractValueInst>(I))
271 return true;
272 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
273 return false;
274 if (isa<ExtractElementInst>(I))
275 return isConstant(I->getOperand(1));
276 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
277 return isConstant(I->getOperand(2));
278}
279
280/// Returns power-of-2 number of elements in a single register (part), given the
281/// total number of elements \p Size and number of registers (parts) \p
282/// NumParts.
283static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
284 return PowerOf2Ceil(divideCeil(Size, NumParts));
285}
286
287/// Returns correct remaining number of elements, considering total amount \p
288/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
289/// and current register (part) \p Part.
290static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
291 unsigned Part) {
292 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
293}
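// Worked example (assumed values): for Size = 6 scalars split into
// NumParts = 2 registers, getPartNumElems(6, 2) == PowerOf2Ceil(3) == 4,
// so getNumElems(6, 4, /*Part=*/0) == 4 and getNumElems(6, 4, /*Part=*/1) == 2.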
294
295#if !defined(NDEBUG)
296/// Print a short descriptor of the instruction bundle suitable for debug output.
297static std::string shortBundleName(ArrayRef<Value *> VL) {
298 std::string Result;
299 raw_string_ostream OS(Result);
300 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
301 OS.flush();
302 return Result;
303}
304#endif
305
306/// \returns true if all of the instructions in \p VL are in the same block or
307/// false otherwise.
308static bool allSameBlock(ArrayRef<Value *> VL) {
309 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
310 if (!I0)
311 return false;
312 if (all_of(VL, isVectorLikeInstWithConstOps))
313 return true;
314
315 BasicBlock *BB = I0->getParent();
316 for (int I = 1, E = VL.size(); I < E; I++) {
317 auto *II = dyn_cast<Instruction>(VL[I]);
318 if (!II)
319 return false;
320
321 if (BB != II->getParent())
322 return false;
323 }
324 return true;
325}
326
327/// \returns True if all of the values in \p VL are constants (but not
328/// globals/constant expressions).
329static bool allConstant(ArrayRef<Value *> VL) {
330 // Constant expressions and globals can't be vectorized like normal integer/FP
331 // constants.
332 return all_of(VL, isConstant);
333}
334
335/// \returns True if all of the values in \p VL are identical or some of them
336/// are UndefValue.
337static bool isSplat(ArrayRef<Value *> VL) {
338 Value *FirstNonUndef = nullptr;
339 for (Value *V : VL) {
340 if (isa<UndefValue>(V))
341 continue;
342 if (!FirstNonUndef) {
343 FirstNonUndef = V;
344 continue;
345 }
346 if (V != FirstNonUndef)
347 return false;
348 }
349 return FirstNonUndef != nullptr;
350}
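// Illustrative sketch (assumed values): isSplat({%a, undef, %a}) is true
// because undefs are ignored, while isSplat({undef, undef}) is false since at
// least one non-undef value is required.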
351
352/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
353static bool isCommutative(Instruction *I) {
354 if (auto *Cmp = dyn_cast<CmpInst>(I))
355 return Cmp->isCommutative();
356 if (auto *BO = dyn_cast<BinaryOperator>(I))
357 return BO->isCommutative() ||
358 (BO->getOpcode() == Instruction::Sub &&
359 !BO->hasNUsesOrMore(UsesLimit) &&
360 all_of(
361 BO->uses(),
362 [](const Use &U) {
363 // Commutative, if icmp eq/ne sub, 0
364 ICmpInst::Predicate Pred;
365 if (match(U.getUser(),
366 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
367 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
368 return true;
369 // Commutative, if abs(sub nsw, true) or abs(sub, false).
370 ConstantInt *Flag;
371 return match(U.getUser(),
372 m_Intrinsic<Intrinsic::abs>(
373 m_Specific(U.get()), m_ConstantInt(Flag))) &&
374 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
375 Flag->isOne());
376 })) ||
377 (BO->getOpcode() == Instruction::FSub &&
378 !BO->hasNUsesOrMore(UsesLimit) &&
379 all_of(BO->uses(), [](const Use &U) {
380 return match(U.getUser(),
381 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
382 }));
383 return I->isCommutative();
384}
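// Illustrative sketch (assumed IR): a 'sub' is treated as commutative here
// when every use only compares the result against zero, e.g.
//   %d = sub i32 %x, %y
//   %c = icmp eq i32 %d, 0   ; the only user of %d
// because swapping the operands cannot change the outcome of such users.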
385
386template <typename T>
387static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
388 unsigned Offset) {
389 static_assert(std::is_same_v<T, InsertElementInst> ||
390 std::is_same_v<T, ExtractElementInst>,
391 "unsupported T");
392 int Index = Offset;
393 if (const auto *IE = dyn_cast<T>(Inst)) {
394 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
395 if (!VT)
396 return std::nullopt;
397 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
398 if (!CI)
399 return std::nullopt;
400 if (CI->getValue().uge(VT->getNumElements()))
401 return std::nullopt;
402 Index *= VT->getNumElements();
403 Index += CI->getZExtValue();
404 return Index;
405 }
406 return std::nullopt;
407}
408
409/// \returns inserting or extracting index of InsertElement, ExtractElement or
410/// InsertValue instruction, using Offset as base offset for index.
411/// \returns std::nullopt if the index is not an immediate.
412static std::optional<unsigned> getElementIndex(const Value *Inst,
413 unsigned Offset = 0) {
414 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
415 return Index;
416 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
417 return Index;
418
419 int Index = Offset;
420
421 const auto *IV = dyn_cast<InsertValueInst>(Inst);
422 if (!IV)
423 return std::nullopt;
424
425 Type *CurrentType = IV->getType();
426 for (unsigned I : IV->indices()) {
427 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
428 Index *= ST->getNumElements();
429 CurrentType = ST->getElementType(I);
430 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
431 Index *= AT->getNumElements();
432 CurrentType = AT->getElementType();
433 } else {
434 return std::nullopt;
435 }
436 Index += I;
437 }
438 return Index;
439}
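// Worked example (assumed IR): for an aggregate of type {[2 x i32], [2 x i32]}
//   %iv = insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0
// the indices are flattened as (0 * 2 + 1) * 2 + 0, so getElementIndex(%iv)
// returns 2.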
440
441namespace {
442/// Specifies the way the mask should be analyzed for undefs/poisonous elements
443/// in the shuffle mask.
444enum class UseMask {
445 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
446 ///< check for the mask elements for the first argument (mask
447 ///< indices are in range [0:VF)).
448 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
449 ///< for the mask elements for the second argument (mask indices
450 ///< are in range [VF:2*VF))
451 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
452 ///< future shuffle elements and mark them as used. Non-undef
453 ///< elements are considered unused since they are already marked
454 ///< as used in the mask.
455};
456} // namespace
457
458/// Prepares a use bitset for the given mask either for the first argument or
459/// for the second.
460static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
461 UseMask MaskArg) {
462 SmallBitVector UseMask(VF, true);
463 for (auto [Idx, Value] : enumerate(Mask)) {
464 if (Value == PoisonMaskElem) {
465 if (MaskArg == UseMask::UndefsAsMask)
466 UseMask.reset(Idx);
467 continue;
468 }
469 if (MaskArg == UseMask::FirstArg && Value < VF)
470 UseMask.reset(Value);
471 else if (MaskArg == UseMask::SecondArg && Value >= VF)
472 UseMask.reset(Value - VF);
473 }
474 return UseMask;
475}
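// Worked example (assumed values): for VF = 4 and Mask = {0, 5, poison, 1}
// with UseMask::FirstArg, lanes 0 and 1 of the first vector are cleared,
// giving the bitset {0, 0, 1, 1}; with UseMask::SecondArg only lane 1
// (mask value 5 - VF) is cleared.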
476
477/// Checks if the given value is actually an undefined constant vector.
478/// Also, if the \p UseMask is not empty, tries to check if the non-masked
479/// elements actually mask the insertelement buildvector, if any.
480template <bool IsPoisonOnly = false>
481static SmallBitVector isUndefVector(const Value *V,
482 const SmallBitVector &UseMask = {}) {
483 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
484 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
485 if (isa<T>(V))
486 return Res;
487 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
488 if (!VecTy)
489 return Res.reset();
490 auto *C = dyn_cast<Constant>(V);
491 if (!C) {
492 if (!UseMask.empty()) {
493 const Value *Base = V;
494 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
495 Base = II->getOperand(0);
496 if (isa<T>(II->getOperand(1)))
497 continue;
498 std::optional<unsigned> Idx = getElementIndex(II);
499 if (!Idx) {
500 Res.reset();
501 return Res;
502 }
503 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
504 Res.reset(*Idx);
505 }
506 // TODO: Add analysis for shuffles here too.
507 if (V == Base) {
508 Res.reset();
509 } else {
510 SmallBitVector SubMask(UseMask.size(), false);
511 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
512 }
513 } else {
514 Res.reset();
515 }
516 return Res;
517 }
518 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
519 if (Constant *Elem = C->getAggregateElement(I))
520 if (!isa<T>(Elem) &&
521 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
522 Res.reset(I);
523 }
524 return Res;
525}
526
527/// Checks if the vector of instructions can be represented as a shuffle, like:
528/// %x0 = extractelement <4 x i8> %x, i32 0
529/// %x3 = extractelement <4 x i8> %x, i32 3
530/// %y1 = extractelement <4 x i8> %y, i32 1
531/// %y2 = extractelement <4 x i8> %y, i32 2
532/// %x0x0 = mul i8 %x0, %x0
533/// %x3x3 = mul i8 %x3, %x3
534/// %y1y1 = mul i8 %y1, %y1
535/// %y2y2 = mul i8 %y2, %y2
536/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
537/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
538/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
539/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
540/// ret <4 x i8> %ins4
541/// can be transformed into:
542/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
543/// i32 6>
544/// %2 = mul <4 x i8> %1, %1
545/// ret <4 x i8> %2
546/// Mask will return the Shuffle Mask equivalent to the extracted elements.
547/// TODO: Can we split off and reuse the shuffle mask detection from
548/// ShuffleVectorInst/getShuffleCost?
549static std::optional<TargetTransformInfo::ShuffleKind>
550isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
551 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
552 if (It == VL.end())
553 return std::nullopt;
554 unsigned Size =
555 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
556 auto *EI = dyn_cast<ExtractElementInst>(V);
557 if (!EI)
558 return S;
559 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
560 if (!VTy)
561 return S;
562 return std::max(S, VTy->getNumElements());
563 });
564
565 Value *Vec1 = nullptr;
566 Value *Vec2 = nullptr;
567 bool HasNonUndefVec = any_of(VL, [](Value *V) {
568 auto *EE = dyn_cast<ExtractElementInst>(V);
569 if (!EE)
570 return false;
571 Value *Vec = EE->getVectorOperand();
572 if (isa<UndefValue>(Vec))
573 return false;
574 return isGuaranteedNotToBePoison(Vec);
575 });
576 enum ShuffleMode { Unknown, Select, Permute };
577 ShuffleMode CommonShuffleMode = Unknown;
578 Mask.assign(VL.size(), PoisonMaskElem);
579 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
580 // Undef can be represented as an undef element in a vector.
581 if (isa<UndefValue>(VL[I]))
582 continue;
583 auto *EI = cast<ExtractElementInst>(VL[I]);
584 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
585 return std::nullopt;
586 auto *Vec = EI->getVectorOperand();
587 // We can extractelement from undef or poison vector.
588 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
589 continue;
590 // All vector operands must have the same number of vector elements.
591 if (isa<UndefValue>(Vec)) {
592 Mask[I] = I;
593 } else {
594 if (isa<UndefValue>(EI->getIndexOperand()))
595 continue;
596 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
597 if (!Idx)
598 return std::nullopt;
599 // Undefined behavior if Idx is negative or >= Size.
600 if (Idx->getValue().uge(Size))
601 continue;
602 unsigned IntIdx = Idx->getValue().getZExtValue();
603 Mask[I] = IntIdx;
604 }
605 if (isUndefVector(Vec).all() && HasNonUndefVec)
606 continue;
607 // For correct shuffling we have to have at most 2 different vector operands
608 // in all extractelement instructions.
609 if (!Vec1 || Vec1 == Vec) {
610 Vec1 = Vec;
611 } else if (!Vec2 || Vec2 == Vec) {
612 Vec2 = Vec;
613 Mask[I] += Size;
614 } else {
615 return std::nullopt;
616 }
617 if (CommonShuffleMode == Permute)
618 continue;
619 // If the extract index is not the same as the operation number, it is a
620 // permutation.
621 if (Mask[I] % Size != I) {
622 CommonShuffleMode = Permute;
623 continue;
624 }
625 CommonShuffleMode = Select;
626 }
627 // If we're not crossing lanes in different vectors, consider it as blending.
628 if (CommonShuffleMode == Select && Vec2)
629 return TargetTransformInfo::SK_Select;
630 // If Vec2 was never used, we have a permutation of a single vector, otherwise
631 // we have permutation of 2 vectors.
632 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
633 : TargetTransformInfo::SK_PermuteSingleSrc;
634}
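// Worked example (assumed IR, matching the example above): for the extracts
// VL = {x[0], x[3], y[1], y[2]} the computed Mask is {0, 3, 5, 6} (the two
// y-extracts are offset by Size == 4) and the function returns
// TargetTransformInfo::SK_PermuteTwoSrc.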
635
636/// \returns True if Extract{Value,Element} instruction extracts element Idx.
637static std::optional<unsigned> getExtractIndex(Instruction *E) {
638 unsigned Opcode = E->getOpcode();
639 assert((Opcode == Instruction::ExtractElement ||
640 Opcode == Instruction::ExtractValue) &&
641 "Expected extractelement or extractvalue instruction.");
642 if (Opcode == Instruction::ExtractElement) {
643 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
644 if (!CI)
645 return std::nullopt;
646 return CI->getZExtValue();
647 }
648 auto *EI = cast<ExtractValueInst>(E);
649 if (EI->getNumIndices() != 1)
650 return std::nullopt;
651 return *EI->idx_begin();
652}
653
654namespace {
655
656/// Main data required for vectorization of instructions.
657struct InstructionsState {
658 /// The very first instruction in the list with the main opcode.
659 Value *OpValue = nullptr;
660
661 /// The main/alternate instruction.
662 Instruction *MainOp = nullptr;
663 Instruction *AltOp = nullptr;
664
665 /// The main/alternate opcodes for the list of instructions.
666 unsigned getOpcode() const {
667 return MainOp ? MainOp->getOpcode() : 0;
668 }
669
670 unsigned getAltOpcode() const {
671 return AltOp ? AltOp->getOpcode() : 0;
672 }
673
674 /// Some of the instructions in the list have alternate opcodes.
675 bool isAltShuffle() const { return AltOp != MainOp; }
676
677 bool isOpcodeOrAlt(Instruction *I) const {
678 unsigned CheckedOpcode = I->getOpcode();
679 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
680 }
681
682 InstructionsState() = delete;
683 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
684 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
685};
686
687} // end anonymous namespace
688
689/// Chooses the correct key for scheduling data. If \p Op has the same (or
690/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
691/// OpValue.
692static Value *isOneOf(const InstructionsState &S, Value *Op) {
693 auto *I = dyn_cast<Instruction>(Op);
694 if (I && S.isOpcodeOrAlt(I))
695 return Op;
696 return S.OpValue;
697}
698
699/// \returns true if \p Opcode is allowed as part of the main/alternate
700/// instruction for SLP vectorization.
701///
702/// Example of unsupported opcode is SDIV that can potentially cause UB if the
703/// "shuffled out" lane would result in division by zero.
704static bool isValidForAlternation(unsigned Opcode) {
705 if (Instruction::isIntDivRem(Opcode))
706 return false;
707
708 return true;
709}
710
711static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
712 const TargetLibraryInfo &TLI,
713 unsigned BaseIndex = 0);
714
715/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
716/// compatible instructions or constants, or just some other regular values.
717static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
718 Value *Op1, const TargetLibraryInfo &TLI) {
719 return (isConstant(BaseOp0) && isConstant(Op0)) ||
720 (isConstant(BaseOp1) && isConstant(Op1)) ||
721 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
722 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
723 BaseOp0 == Op0 || BaseOp1 == Op1 ||
724 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
725 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
726}
727
728/// \returns true if a compare instruction \p CI has similar "look" and
729/// same predicate as \p BaseCI, "as is" or with its operands and predicate
730/// swapped, false otherwise.
731static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
732 const TargetLibraryInfo &TLI) {
733 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
734 "Assessing comparisons of different types?");
735 CmpInst::Predicate BasePred = BaseCI->getPredicate();
736 CmpInst::Predicate Pred = CI->getPredicate();
737 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
738
739 Value *BaseOp0 = BaseCI->getOperand(0);
740 Value *BaseOp1 = BaseCI->getOperand(1);
741 Value *Op0 = CI->getOperand(0);
742 Value *Op1 = CI->getOperand(1);
743
744 return (BasePred == Pred &&
745 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
746 (BasePred == SwappedPred &&
747 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
748}
749
750/// \returns analysis of the Instructions in \p VL described in
751/// InstructionsState, the Opcode that we suppose the whole list
752/// could be vectorized even if its structure is diverse.
753static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
754 const TargetLibraryInfo &TLI,
755 unsigned BaseIndex) {
756 // Make sure these are all Instructions.
757 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
758 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
759
760 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
761 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
762 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
763 CmpInst::Predicate BasePred =
764 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
765 : CmpInst::BAD_ICMP_PREDICATE;
766 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
767 unsigned AltOpcode = Opcode;
768 unsigned AltIndex = BaseIndex;
769
770 bool SwappedPredsCompatible = [&]() {
771 if (!IsCmpOp)
772 return false;
773 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
774 UniquePreds.insert(BasePred);
775 UniqueNonSwappedPreds.insert(BasePred);
776 for (Value *V : VL) {
777 auto *I = dyn_cast<CmpInst>(V);
778 if (!I)
779 return false;
780 CmpInst::Predicate CurrentPred = I->getPredicate();
781 CmpInst::Predicate SwappedCurrentPred =
782 CmpInst::getSwappedPredicate(CurrentPred);
783 UniqueNonSwappedPreds.insert(CurrentPred);
784 if (!UniquePreds.contains(CurrentPred) &&
785 !UniquePreds.contains(SwappedCurrentPred))
786 UniquePreds.insert(CurrentPred);
787 }
788 // Total number of predicates > 2, but if consider swapped predicates
789 // compatible only 2, consider swappable predicates as compatible opcodes,
790 // not alternate.
791 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
792 }();
793 // Check for one alternate opcode from another BinaryOperator.
794 // TODO - generalize to support all operators (types, calls etc.).
795 auto *IBase = cast<Instruction>(VL[BaseIndex]);
796 Intrinsic::ID BaseID = 0;
797 SmallVector<VFInfo> BaseMappings;
798 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
799 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
800 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
801 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
802 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
803 }
804 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
805 auto *I = cast<Instruction>(VL[Cnt]);
806 unsigned InstOpcode = I->getOpcode();
807 if (IsBinOp && isa<BinaryOperator>(I)) {
808 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
809 continue;
810 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
811 isValidForAlternation(Opcode)) {
812 AltOpcode = InstOpcode;
813 AltIndex = Cnt;
814 continue;
815 }
816 } else if (IsCastOp && isa<CastInst>(I)) {
817 Value *Op0 = IBase->getOperand(0);
818 Type *Ty0 = Op0->getType();
819 Value *Op1 = I->getOperand(0);
820 Type *Ty1 = Op1->getType();
821 if (Ty0 == Ty1) {
822 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
823 continue;
824 if (Opcode == AltOpcode) {
825 assert(isValidForAlternation(Opcode) &&
826 isValidForAlternation(InstOpcode) &&
827 "Cast isn't safe for alternation, logic needs to be updated!");
828 AltOpcode = InstOpcode;
829 AltIndex = Cnt;
830 continue;
831 }
832 }
833 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
834 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
835 Type *Ty0 = BaseInst->getOperand(0)->getType();
836 Type *Ty1 = Inst->getOperand(0)->getType();
837 if (Ty0 == Ty1) {
838 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
839 // Check for compatible operands. If the corresponding operands are not
840 // compatible - need to perform alternate vectorization.
841 CmpInst::Predicate CurrentPred = Inst->getPredicate();
842 CmpInst::Predicate SwappedCurrentPred =
843 CmpInst::getSwappedPredicate(CurrentPred);
844
845 if ((E == 2 || SwappedPredsCompatible) &&
846 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
847 continue;
848
849 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
850 continue;
851 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
852 if (AltIndex != BaseIndex) {
853 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
854 continue;
855 } else if (BasePred != CurrentPred) {
856 assert(
857 isValidForAlternation(InstOpcode) &&
858 "CmpInst isn't safe for alternation, logic needs to be updated!");
859 AltIndex = Cnt;
860 continue;
861 }
862 CmpInst::Predicate AltPred = AltInst->getPredicate();
863 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
864 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
865 continue;
866 }
867 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
868 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
869 if (Gep->getNumOperands() != 2 ||
870 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
871 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
872 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
873 if (!isVectorLikeInstWithConstOps(EI))
874 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
875 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
876 auto *BaseLI = cast<LoadInst>(IBase);
877 if (!LI->isSimple() || !BaseLI->isSimple())
878 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
879 } else if (auto *Call = dyn_cast<CallInst>(I)) {
880 auto *CallBase = cast<CallInst>(IBase);
881 if (Call->getCalledFunction() != CallBase->getCalledFunction())
882 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
883 if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() ||
884 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
885 Call->op_begin() + Call->getBundleOperandsEndIndex(),
886 CallBase->op_begin() +
887 CallBase->getBundleOperandsStartIndex())))
888 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
889 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
890 if (ID != BaseID)
891 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
892 if (!ID) {
893 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
894 if (Mappings.size() != BaseMappings.size() ||
895 Mappings.front().ISA != BaseMappings.front().ISA ||
896 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
897 Mappings.front().VectorName != BaseMappings.front().VectorName ||
898 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
899 Mappings.front().Shape.Parameters !=
900 BaseMappings.front().Shape.Parameters)
901 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
902 }
903 }
904 continue;
905 }
906 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
907 }
908
909 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
910 cast<Instruction>(VL[AltIndex]));
911}
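// Illustrative sketch (assumed input): for VL = {add, sub, add, sub} of the
// same type, getSameOpcode() returns a state with MainOp set to the first
// 'add', AltOp set to the first 'sub' and isAltShuffle() == true, so the list
// can later be vectorized as an add/sub pair blended by a shufflevector.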
912
913/// \returns true if all of the values in \p VL have the same type or false
914/// otherwise.
915static bool allSameType(ArrayRef<Value *> VL) {
916 Type *Ty = VL.front()->getType();
917 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
918}
919
920/// \returns True if in-tree use also needs extract. This refers to
921/// possible scalar operand in vectorized instruction.
922static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
923 TargetLibraryInfo *TLI) {
924 unsigned Opcode = UserInst->getOpcode();
925 switch (Opcode) {
926 case Instruction::Load: {
927 LoadInst *LI = cast<LoadInst>(UserInst);
928 return (LI->getPointerOperand() == Scalar);
929 }
930 case Instruction::Store: {
931 StoreInst *SI = cast<StoreInst>(UserInst);
932 return (SI->getPointerOperand() == Scalar);
933 }
934 case Instruction::Call: {
935 CallInst *CI = cast<CallInst>(UserInst);
936 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
937 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
938 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
939 Arg.value().get() == Scalar;
940 });
941 }
942 default:
943 return false;
944 }
945}
946
947/// \returns the AA location that is being accessed by the instruction.
948static MemoryLocation getLocation(Instruction *I) {
949 if (StoreInst *SI = dyn_cast<StoreInst>(I))
950 return MemoryLocation::get(SI);
951 if (LoadInst *LI = dyn_cast<LoadInst>(I))
952 return MemoryLocation::get(LI);
953 return MemoryLocation();
954}
955
956/// \returns True if the instruction is not a volatile or atomic load/store.
957static bool isSimple(Instruction *I) {
958 if (LoadInst *LI = dyn_cast<LoadInst>(I))
959 return LI->isSimple();
960 if (StoreInst *SI = dyn_cast<StoreInst>(I))
961 return SI->isSimple();
962 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
963 return !MI->isVolatile();
964 return true;
965}
966
967/// Shuffles \p Mask in accordance with the given \p SubMask.
968/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
969/// one but two input vectors.
970static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
971 bool ExtendingManyInputs = false) {
972 if (SubMask.empty())
973 return;
974 assert(
975 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
976 // Check if input scalars were extended to match the size of other node.
977 (SubMask.size() == Mask.size() &&
978 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
979 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
980 "SubMask with many inputs support must be larger than the mask.");
981 if (Mask.empty()) {
982 Mask.append(SubMask.begin(), SubMask.end());
983 return;
984 }
985 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
986 int TermValue = std::min(Mask.size(), SubMask.size());
987 for (int I = 0, E = SubMask.size(); I < E; ++I) {
988 if (SubMask[I] == PoisonMaskElem ||
989 (!ExtendingManyInputs &&
990 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
991 continue;
992 NewMask[I] = Mask[SubMask[I]];
993 }
994 Mask.swap(NewMask);
995}
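// Worked example (assumed values): for an existing Mask = {2, 3, 0, 1} and
// SubMask = {1, 0, 3, 2}, the result is NewMask[I] = Mask[SubMask[I]], i.e.
// {3, 2, 1, 0}; poison elements of SubMask stay poison.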
996
997/// Order may have elements assigned special value (size) which is out of
998/// bounds. Such indices only appear on places which correspond to undef values
999/// (see canReuseExtract for details) and are used to avoid undef values
1000/// having an effect on operand ordering.
1001/// The first loop below simply finds all unused indices and then the next loop
1002/// nest assigns these indices for undef values positions.
1003/// As an example below Order has two undef positions and they have assigned
1004/// values 3 and 7 respectively:
1005/// before: 6 9 5 4 9 2 1 0
1006/// after: 6 3 5 4 7 2 1 0
1007static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1008 const unsigned Sz = Order.size();
1009 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1010 SmallBitVector MaskedIndices(Sz);
1011 for (unsigned I = 0; I < Sz; ++I) {
1012 if (Order[I] < Sz)
1013 UnusedIndices.reset(Order[I]);
1014 else
1015 MaskedIndices.set(I);
1016 }
1017 if (MaskedIndices.none())
1018 return;
1019 assert(UnusedIndices.count() == MaskedIndices.count() &&
1020 "Non-synced masked/available indices.");
1021 int Idx = UnusedIndices.find_first();
1022 int MIdx = MaskedIndices.find_first();
1023 while (MIdx >= 0) {
1024 assert(Idx >= 0 && "Indices must be synced.");
1025 Order[MIdx] = Idx;
1026 Idx = UnusedIndices.find_next(Idx);
1027 MIdx = MaskedIndices.find_next(MIdx);
1028 }
1029}
1030
1031/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1032/// Opcode1.
1033static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1034 unsigned Opcode1) {
1035 SmallBitVector OpcodeMask(VL.size(), false);
1036 for (unsigned Lane : seq<unsigned>(VL.size()))
1037 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1038 OpcodeMask.set(Lane);
1039 return OpcodeMask;
1040}
1041
1042namespace llvm {
1043
1044void inversePermutation(ArrayRef<unsigned> Indices,
1045 SmallVectorImpl<int> &Mask) {
1046 Mask.clear();
1047 const unsigned E = Indices.size();
1048 Mask.resize(E, PoisonMaskElem);
1049 for (unsigned I = 0; I < E; ++I)
1050 Mask[Indices[I]] = I;
1051}
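// Worked example (assumed values): for Indices = {2, 0, 1} the resulting
// Mask is {1, 2, 0}, since Mask[Indices[I]] = I for every position I.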
1052
1053/// Reorders the list of scalars in accordance with the given \p Mask.
1054static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1055 ArrayRef<int> Mask) {
1056 assert(!Mask.empty() && "Expected non-empty mask.");
1057 SmallVector<Value *> Prev(Scalars.size(),
1058 PoisonValue::get(Scalars.front()->getType()));
1059 Prev.swap(Scalars);
1060 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1061 if (Mask[I] != PoisonMaskElem)
1062 Scalars[Mask[I]] = Prev[I];
1063}
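// Worked example (assumed values): reordering Scalars = {a, b, c} with
// Mask = {2, 0, 1} stores Prev[I] into Scalars[Mask[I]], producing {b, c, a}.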
1064
1065/// Checks if the provided value does not require scheduling. It does not
1066/// require scheduling if this is not an instruction or it is an instruction
1067/// that does not read/write memory and all operands are either not instructions
1068/// or phi nodes or instructions from different blocks.
1069static bool areAllOperandsNonInsts(Value *V) {
1070 auto *I = dyn_cast<Instruction>(V);
1071 if (!I)
1072 return true;
1073 return !mayHaveNonDefUseDependency(*I) &&
1074 all_of(I->operands(), [I](Value *V) {
1075 auto *IO = dyn_cast<Instruction>(V);
1076 if (!IO)
1077 return true;
1078 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1079 });
1080}
1081
1082/// Checks if the provided value does not require scheduling. It does not
1083/// require scheduling if this is not an instruction or it is an instruction
1084/// that does not read/write memory and all users are phi nodes or instructions
1085/// from the different blocks.
1086static bool isUsedOutsideBlock(Value *V) {
1087 auto *I = dyn_cast<Instruction>(V);
1088 if (!I)
1089 return true;
1090 // Limits the number of uses to save compile time.
1091 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1092 all_of(I->users(), [I](User *U) {
1093 auto *IU = dyn_cast<Instruction>(U);
1094 if (!IU)
1095 return true;
1096 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1097 });
1098}
1099
1100/// Checks if the specified value does not require scheduling. It does not
1101/// require scheduling if all operands and all users do not need to be scheduled
1102/// in the current basic block.
1103static bool doesNotNeedToBeScheduled(Value *V) {
1104 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1105}
1106
1107/// Checks if the specified array of instructions does not require scheduling.
1108/// It is so if all either instructions have operands that do not require
1109/// scheduling or their users do not require scheduling since they are phis or
1110/// in other basic blocks.
1111static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1112 return !VL.empty() &&
1113 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1114}
1115
1116namespace slpvectorizer {
1117
1118/// Bottom Up SLP Vectorizer.
1119class BoUpSLP {
1120 struct TreeEntry;
1121 struct ScheduleData;
1124
1125public:
1126 /// Tracks the state we can represent the loads in the given sequence.
1127 enum class LoadsState {
1128 Gather,
1129 Vectorize,
1130 ScatterVectorize,
1131 StridedVectorize
1132 };
1133
1141
1142 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1143 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1144 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1145 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1146 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1147 AC(AC), DB(DB), DL(DL), ORE(ORE),
1148 Builder(Se->getContext(), TargetFolder(*DL)) {
1149 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1150 // Use the vector register size specified by the target unless overridden
1151 // by a command-line option.
1152 // TODO: It would be better to limit the vectorization factor based on
1153 // data type rather than just register size. For example, x86 AVX has
1154 // 256-bit registers, but it does not support integer operations
1155 // at that width (that requires AVX2).
1156 if (MaxVectorRegSizeOption.getNumOccurrences())
1157 MaxVecRegSize = MaxVectorRegSizeOption;
1158 else
1159 MaxVecRegSize =
1160 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1161 .getFixedValue();
1162
1163 if (MinVectorRegSizeOption.getNumOccurrences())
1164 MinVecRegSize = MinVectorRegSizeOption;
1165 else
1166 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1167 }
1168
1169 /// Vectorize the tree that starts with the elements in \p VL.
1170 /// Returns the vectorized root.
1171 Value *vectorizeTree();
1172
1173 /// Vectorize the tree but with the list of externally used values \p
1174 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1175 /// generated extractvalue instructions.
1176 /// \param ReplacedExternals contains the list of replaced external values
1177 /// {scalar, replace} after emitting extractelement for external uses.
1178 Value *
1179 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1180 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1181 Instruction *ReductionRoot = nullptr);
1182
1183 /// \returns the cost incurred by unwanted spills and fills, caused by
1184 /// holding live values over call sites.
1185 InstructionCost getSpillCost() const;
1186
1187 /// \returns the vectorization cost of the subtree that starts at \p VL.
1188 /// A negative number means that this is profitable.
1189 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1190
1191 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1192 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1193 void buildTree(ArrayRef<Value *> Roots,
1194 const SmallDenseSet<Value *> &UserIgnoreLst);
1195
1196 /// Construct a vectorizable tree that starts at \p Roots.
1197 void buildTree(ArrayRef<Value *> Roots);
1198
1199 /// Returns whether the root node has in-tree uses.
1200 bool doesRootHaveInTreeUses() const {
1201 return !VectorizableTree.empty() &&
1202 !VectorizableTree.front()->UserTreeIndices.empty();
1203 }
1204
1205 /// Return the scalars of the root node.
1206 ArrayRef<Value *> getRootNodeScalars() const {
1207 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1208 return VectorizableTree.front()->Scalars;
1209 }
1210
1211 /// Checks if the root graph node can be emitted with narrower bitwidth at
1212 /// codegen and returns its signedness, if so.
1213 bool isSignedMinBitwidthRootNode() const {
1214 return MinBWs.at(VectorizableTree.front().get()).second;
1215 }
1216
1217 /// Builds external uses of the vectorized scalars, i.e. the list of
1218 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1219 /// ExternallyUsedValues contains additional list of external uses to handle
1220 /// vectorization of reductions.
1221 void
1222 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1223
1224 /// Transforms graph nodes to target specific representations, if profitable.
1225 void transformNodes();
1226
1227 /// Clear the internal data structures that are created by 'buildTree'.
1228 void deleteTree() {
1229 VectorizableTree.clear();
1230 ScalarToTreeEntry.clear();
1231 MultiNodeScalars.clear();
1232 MustGather.clear();
1233 NonScheduledFirst.clear();
1234 EntryToLastInstruction.clear();
1235 ExternalUses.clear();
1236 ExternalUsesAsGEPs.clear();
1237 for (auto &Iter : BlocksSchedules) {
1238 BlockScheduling *BS = Iter.second.get();
1239 BS->clear();
1240 }
1241 MinBWs.clear();
1242 ReductionBitWidth = 0;
1243 CastMaxMinBWSizes.reset();
1244 ExtraBitWidthNodes.clear();
1245 InstrElementSize.clear();
1246 UserIgnoreList = nullptr;
1247 PostponedGathers.clear();
1248 ValueToGatherNodes.clear();
1249 }
1250
1251 unsigned getTreeSize() const { return VectorizableTree.size(); }
1252
1253 /// Perform LICM and CSE on the newly generated gather sequences.
1254 void optimizeGatherSequence();
1255
1256 /// Checks if the specified gather tree entry \p TE can be represented as a
1257 /// shuffled vector entry + (possibly) permutation with other gathers. It
1258 /// implements the checks only for possibly ordered scalars (Loads,
1259 /// ExtractElement, ExtractValue), which can be part of the graph.
1260 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1261
1262 /// Sort loads into increasing pointer offsets to allow greater clustering.
1263 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1264
1265 /// Gets reordering data for the given tree entry. If the entry is vectorized
1266 /// - just return ReorderIndices, otherwise check if the scalars can be
1267 /// reordered and return the most optimal order.
1268 /// \return std::nullopt if ordering is not important, empty order, if
1269 /// identity order is important, or the actual order.
1270 /// \param TopToBottom If true, include the order of vectorized stores and
1271 /// insertelement nodes, otherwise skip them.
1272 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1273 bool TopToBottom);
1274
1275 /// Reorders the current graph to the most profitable order starting from the
1276 /// root node to the leaf nodes. The best order is chosen only from the nodes
1277 /// of the same size (vectorization factor). Smaller nodes are considered
1278 /// parts of a subgraph with a smaller VF and they are reordered independently. We
1279 /// can do this because we still need to extend smaller nodes to the wider VF
1280 /// and we can merge reordering shuffles with the widening shuffles.
1281 void reorderTopToBottom();
1282
1283 /// Reorders the current graph to the most profitable order starting from
1284 /// leaves to the root. It allows rotating small subgraphs and reduces the
1285 /// number of reshuffles if the leaf nodes use the same order. In this case we
1286 /// can merge the orders and just shuffle the user node instead of shuffling its
1287 /// operands. Plus, even if the leaf nodes have different orders, it allows
1288 /// sinking the reordering in the graph closer to the root node and merging it later
1289 /// during analysis.
1290 void reorderBottomToTop(bool IgnoreReorder = false);
1291
1292 /// \return The vector element size in bits to use when vectorizing the
1293 /// expression tree ending at \p V. If V is a store, the size is the width of
1294 /// the stored value. Otherwise, the size is the width of the largest loaded
1295 /// value reaching V. This method is used by the vectorizer to calculate
1296 /// vectorization factors.
1297 unsigned getVectorElementSize(Value *V);
1298
1299 /// Compute the minimum type sizes required to represent the entries in a
1300 /// vectorizable tree.
1301 void computeMinimumValueSizes();
1302
1303 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1304 unsigned getMaxVecRegSize() const {
1305 return MaxVecRegSize;
1306 }
1307
1308 // \returns minimum vector register size as set by cl::opt.
1309 unsigned getMinVecRegSize() const {
1310 return MinVecRegSize;
1311 }
1312
1313 unsigned getMinVF(unsigned Sz) const {
1314 return std::max(2U, getMinVecRegSize() / Sz);
1315 }
1316
1317 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1318 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1319 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1320 return MaxVF ? MaxVF : UINT_MAX;
1321 }
1322
1323 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1324 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1325 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1326 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1327 ///
1328 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1329 unsigned canMapToVector(Type *T) const;
1330
1331 /// \returns True if the VectorizableTree is both tiny and not fully
1332 /// vectorizable. We do not vectorize such trees.
1333 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1334
1335 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1336 /// can be load combined in the backend. Load combining may not be allowed in
1337 /// the IR optimizer, so we do not want to alter the pattern. For example,
1338 /// partially transforming a scalar bswap() pattern into vector code is
1339 /// effectively impossible for the backend to undo.
1340 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1341 /// may not be necessary.
1342 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1343
1344 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1345 /// can be load combined in the backend. Load combining may not be allowed in
1346 /// the IR optimizer, so we do not want to alter the pattern. For example,
1347 /// partially transforming a scalar bswap() pattern into vector code is
1348 /// effectively impossible for the backend to undo.
1349 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1350 /// may not be necessary.
1351 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1352
1353 /// Checks if the given array of loads can be represented as a vectorized,
1354 /// scatter or just simple gather.
1355 /// \param VL list of loads.
1356 /// \param VL0 main load value.
1357 /// \param Order returned order of load instructions.
1358 /// \param PointerOps returned list of pointer operands.
1359 /// \param TryRecursiveCheck used to check if long masked gather can be
1360 /// represented as a series of loads/insert subvector, if profitable.
1361 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1362 SmallVectorImpl<unsigned> &Order,
1363 SmallVectorImpl<Value *> &PointerOps,
1364 bool TryRecursiveCheck = true) const;
1365
1366 OptimizationRemarkEmitter *getORE() { return ORE; }
1367
1368 /// This structure holds any data we need about the edges being traversed
1369 /// during buildTree_rec(). We keep track of:
1370 /// (i) the user TreeEntry index, and
1371 /// (ii) the index of the edge.
1372 struct EdgeInfo {
1373 EdgeInfo() = default;
1374 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1376 /// The user TreeEntry.
1377 TreeEntry *UserTE = nullptr;
1378 /// The operand index of the use.
1379 unsigned EdgeIdx = UINT_MAX;
1380#ifndef NDEBUG
1382 const BoUpSLP::EdgeInfo &EI) {
1383 EI.dump(OS);
1384 return OS;
1385 }
1386 /// Debug print.
1387 void dump(raw_ostream &OS) const {
1388 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1389 << " EdgeIdx:" << EdgeIdx << "}";
1390 }
1391 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1392#endif
1393 bool operator == (const EdgeInfo &Other) const {
1394 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1395 }
1396 };
1397
1398 /// A helper class used for scoring candidates for two consecutive lanes.
1399 class LookAheadHeuristics {
1400 const TargetLibraryInfo &TLI;
1401 const DataLayout &DL;
1402 ScalarEvolution &SE;
1403 const BoUpSLP &R;
1404 int NumLanes; // Total number of lanes (aka vectorization factor).
1405 int MaxLevel; // The maximum recursion depth for accumulating score.
1406
1407 public:
1408 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1409 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1410 int MaxLevel)
1411 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1412 MaxLevel(MaxLevel) {}
1413
1414 // The hard-coded scores listed here are not very important, though it shall
1415 // be higher for better matches to improve the resulting cost. When
1416 // computing the scores of matching one sub-tree with another, we are
1417 // basically counting the number of values that are matching. So even if all
1418 // scores are set to 1, we would still get a decent matching result.
1419 // However, sometimes we have to break ties. For example we may have to
1420 // choose between matching loads vs matching opcodes. This is what these
1421 // scores are helping us with: they provide the order of preference. Also,
1422 // this is important if the scalar is externally used or used in another
1423 // tree entry node in the different lane.
1424
1425 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1426 static const int ScoreConsecutiveLoads = 4;
1427 /// The same load multiple times. This should have a better score than
1428 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1429 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1430 /// a vector load and 1.0 for a broadcast.
1431 static const int ScoreSplatLoads = 3;
1432 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1433 static const int ScoreReversedLoads = 3;
1434 /// A load candidate for masked gather.
1435 static const int ScoreMaskedGatherCandidate = 1;
1436 /// ExtractElementInst from same vector and consecutive indexes.
1437 static const int ScoreConsecutiveExtracts = 4;
1438 /// ExtractElementInst from same vector and reversed indices.
1439 static const int ScoreReversedExtracts = 3;
1440 /// Constants.
1441 static const int ScoreConstants = 2;
1442 /// Instructions with the same opcode.
1443 static const int ScoreSameOpcode = 2;
1444 /// Instructions with alt opcodes (e.g, add + sub).
1445 static const int ScoreAltOpcodes = 1;
1446 /// Identical instructions (a.k.a. splat or broadcast).
1447 static const int ScoreSplat = 1;
1448 /// Matching with an undef is preferable to failing.
1449 static const int ScoreUndef = 1;
1450 /// Score for failing to find a decent match.
1451 static const int ScoreFail = 0;
1452 /// Score if all users are vectorized.
1453 static const int ScoreAllUserVectorized = 1;
1454
1455 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1456 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1457 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1458 /// MainAltOps.
1459 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1460 ArrayRef<Value *> MainAltOps) const {
1461 if (!isValidElementType(V1->getType()) ||
1462 !isValidElementType(V2->getType()))
1463 return LookAheadHeuristics::ScoreFail;
1464
1465 if (V1 == V2) {
1466 if (isa<LoadInst>(V1)) {
1467 // Returns true if the users of V1 and V2 won't need to be extracted.
1468 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1469 // Bail out if we have too many uses to save compilation time.
1470 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1471 return false;
1472
1473 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1474 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1475 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1476 });
1477 };
1478 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1479 };
1480 // A broadcast of a load can be cheaper on some targets.
1481 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1482 ElementCount::getFixed(NumLanes)) &&
1483 ((int)V1->getNumUses() == NumLanes ||
1484 AllUsersAreInternal(V1, V2)))
1485 return LookAheadHeuristics::ScoreSplatLoads;
1486 }
1487 return LookAheadHeuristics::ScoreSplat;
1488 }
1489
1490 auto CheckSameEntryOrFail = [&]() {
1491 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1492 TE1 && TE1 == R.getTreeEntry(V2))
1493 return LookAheadHeuristics::ScoreSplatLoads;
1494 return LookAheadHeuristics::ScoreFail;
1495 };
1496
1497 auto *LI1 = dyn_cast<LoadInst>(V1);
1498 auto *LI2 = dyn_cast<LoadInst>(V2);
1499 if (LI1 && LI2) {
1500 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1501 !LI2->isSimple())
1502 return CheckSameEntryOrFail();
1503
1504 std::optional<int> Dist = getPointersDiff(
1505 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1506 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1507 if (!Dist || *Dist == 0) {
1508 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1509 getUnderlyingObject(LI2->getPointerOperand()) &&
1510 R.TTI->isLegalMaskedGather(
1511 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1512 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1513 return CheckSameEntryOrFail();
1514 }
1515 // The distance is too large - still may be profitable to use masked
1516 // loads/gathers.
1517 if (std::abs(*Dist) > NumLanes / 2)
1518 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1519 // This still will detect consecutive loads, but we might have "holes"
1520 // in some cases. It is ok for non-power-2 vectorization and may produce
1521 // better results. It should not affect current vectorization.
1522 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1523 : LookAheadHeuristics::ScoreReversedLoads;
1524 }
1525
1526 auto *C1 = dyn_cast<Constant>(V1);
1527 auto *C2 = dyn_cast<Constant>(V2);
1528 if (C1 && C2)
1529 return LookAheadHeuristics::ScoreConstants;
1530
1531 // Extracts from consecutive indexes of the same vector better score as
1532 // the extracts could be optimized away.
1533 Value *EV1;
1534 ConstantInt *Ex1Idx;
1535 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1536 // Undefs are always profitable for extractelements.
1537 // Compiler can easily combine poison and extractelement <non-poison> or
1538 // undef and extractelement <poison>. But combining undef +
1539 // extractelement <non-poison-but-may-produce-poison> requires some
1540 // extra operations.
1541 if (isa<UndefValue>(V2))
1542 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1543 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1544 : LookAheadHeuristics::ScoreSameOpcode;
1545 Value *EV2 = nullptr;
1546 ConstantInt *Ex2Idx = nullptr;
1547 if (match(V2,
1548 m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1549 m_Undef())))) {
1550 // Undefs are always profitable for extractelements.
1551 if (!Ex2Idx)
1552 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1553 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1554 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1555 if (EV2 == EV1) {
1556 int Idx1 = Ex1Idx->getZExtValue();
1557 int Idx2 = Ex2Idx->getZExtValue();
1558 int Dist = Idx2 - Idx1;
1559 // The distance is too large - still may be profitable to use
1560 // shuffles.
1561 if (std::abs(Dist) == 0)
1562 return LookAheadHeuristics::ScoreSplat;
1563 if (std::abs(Dist) > NumLanes / 2)
1564 return LookAheadHeuristics::ScoreSameOpcode;
1565 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1566 : LookAheadHeuristics::ScoreReversedExtracts;
1567 }
1568 return LookAheadHeuristics::ScoreAltOpcodes;
1569 }
1570 return CheckSameEntryOrFail();
1571 }
1572
1573 auto *I1 = dyn_cast<Instruction>(V1);
1574 auto *I2 = dyn_cast<Instruction>(V2);
1575 if (I1 && I2) {
1576 if (I1->getParent() != I2->getParent())
1577 return CheckSameEntryOrFail();
1578 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1579 Ops.push_back(I1);
1580 Ops.push_back(I2);
1581 InstructionsState S = getSameOpcode(Ops, TLI);
1582 // Note: Only consider instructions with <= 2 operands to avoid
1583 // complexity explosion.
1584 if (S.getOpcode() &&
1585 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1586 !S.isAltShuffle()) &&
1587 all_of(Ops, [&S](Value *V) {
1588 return cast<Instruction>(V)->getNumOperands() ==
1589 S.MainOp->getNumOperands();
1590 }))
1591 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1592 : LookAheadHeuristics::ScoreSameOpcode;
1593 }
1594
1595 if (isa<UndefValue>(V2))
1596 return LookAheadHeuristics::ScoreUndef;
1597
1598 return CheckSameEntryOrFail();
1599 }
1600
1601 /// Go through the operands of \p LHS and \p RHS recursively until
1602 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1603 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1604 /// of \p U1 and \p U2), except at the beginning of the recursion where
1605 /// these are set to nullptr.
1606 ///
1607 /// For example:
1608 /// \verbatim
1609 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1610 /// \ / \ / \ / \ /
1611 /// + + + +
1612 /// G1 G2 G3 G4
1613 /// \endverbatim
1614 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1615 /// each level recursively, accumulating the score. It starts from matching
1616 /// the additions at level 0, then moves on to the loads (level 1). The
1617 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1618 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1619 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1620 /// Please note that the order of the operands does not matter, as we
1621 /// evaluate the score of all profitable combinations of operands. In
1622 /// other words the score of G1 and G4 is the same as G1 and G2. This
1623 /// heuristic is based on ideas described in:
1624 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1625 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1626 /// Luís F. W. Góes
1627 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1628 Instruction *U2, int CurrLevel,
1629 ArrayRef<Value *> MainAltOps) const {
1630
1631 // Get the shallow score of V1 and V2.
1632 int ShallowScoreAtThisLevel =
1633 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1634
1635 // If reached MaxLevel,
1636 // or if LHS and RHS are not instructions,
1637 // or if they are SPLAT,
1638 // or if they are not consecutive,
1639 // or if profitable to vectorize loads or extractelements, early return
1640 // the current cost.
1641 auto *I1 = dyn_cast<Instruction>(LHS);
1642 auto *I2 = dyn_cast<Instruction>(RHS);
1643 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1644 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1645 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1646 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1647 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1648 ShallowScoreAtThisLevel))
1649 return ShallowScoreAtThisLevel;
1650 assert(I1 && I2 && "Should have early exited.");
1651
1652 // Contains the I2 operand indexes that got matched with I1 operands.
1653 SmallSet<unsigned, 4> Op2Used;
1654
1655 // Recursion towards the operands of I1 and I2. We are trying all possible
1656 // operand pairs, and keeping track of the best score.
1657 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1658 OpIdx1 != NumOperands1; ++OpIdx1) {
1659 // Try to pair operand OpIdx1 of I1 with the best operand of I2.
1660 int MaxTmpScore = 0;
1661 unsigned MaxOpIdx2 = 0;
1662 bool FoundBest = false;
1663 // If I2 is commutative try all combinations.
1664 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1665 unsigned ToIdx = isCommutative(I2)
1666 ? I2->getNumOperands()
1667 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1668 assert(FromIdx <= ToIdx && "Bad index");
1669 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1670 // Skip operands already paired with OpIdx1.
1671 if (Op2Used.count(OpIdx2))
1672 continue;
1673 // Recursively calculate the cost at each level
1674 int TmpScore =
1675 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1676 I1, I2, CurrLevel + 1, std::nullopt);
1677 // Look for the best score.
1678 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1679 TmpScore > MaxTmpScore) {
1680 MaxTmpScore = TmpScore;
1681 MaxOpIdx2 = OpIdx2;
1682 FoundBest = true;
1683 }
1684 }
1685 if (FoundBest) {
1686 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1687 Op2Used.insert(MaxOpIdx2);
1688 ShallowScoreAtThisLevel += MaxTmpScore;
1689 }
1690 }
1691 return ShallowScoreAtThisLevel;
1692 }
1693 };
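// A hypothetical IR sketch (names are illustrative, not from this file) of
// what the look-ahead scoring rewards:
//
//   %p1 = getelementptr inbounds i32, ptr %p, i64 1
//   %a0 = load i32, ptr %p
//   %a1 = load i32, ptr %p1
//   %s0 = add i32 %a0, %x
//   %s1 = add i32 %a1, %y
//
// Scoring the pair {%s0, %s1} matches the two adds at level 1
// (ScoreSameOpcode from getShallowScore) and the consecutive loads at
// level 2 (ScoreConsecutiveLoads), so getScoreAtLevelRec accumulates a
// higher score for this pair than for a pair whose operands do not match.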
1694 /// A helper data structure to hold the operands of a vector of instructions.
1695 /// This supports a fixed vector length for all operand vectors.
1696 class VLOperands {
1697 /// For each operand we need (i) the value, and (ii) the opcode that it
1698 /// would be attached to if the expression was in a left-linearized form.
1699 /// This is required to avoid illegal operand reordering.
1700 /// For example:
1701 /// \verbatim
1702 /// 0 Op1
1703 /// |/
1704 /// Op1 Op2 Linearized + Op2
1705 /// \ / ----------> |/
1706 /// - -
1707 ///
1708 /// Op1 - Op2 (0 + Op1) - Op2
1709 /// \endverbatim
1710 ///
1711 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1712 ///
1713 /// Another way to think of this is to track all the operations across the
1714 /// path from the operand all the way to the root of the tree and to
1715 /// calculate the operation that corresponds to this path. For example, the
1716 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1717 /// corresponding operation is a '-' (which matches the one in the
1718 /// linearized tree, as shown above).
1719 ///
1720 /// For lack of a better term, we refer to this operation as Accumulated
1721 /// Path Operation (APO).
1722 struct OperandData {
1723 OperandData() = default;
1724 OperandData(Value *V, bool APO, bool IsUsed)
1725 : V(V), APO(APO), IsUsed(IsUsed) {}
1726 /// The operand value.
1727 Value *V = nullptr;
1728 /// TreeEntries only allow a single opcode, or an alternate sequence of
1729 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1730 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1731 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1732 /// (e.g., Add/Mul)
1733 bool APO = false;
1734 /// Helper data for the reordering function.
1735 bool IsUsed = false;
1736 };
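// A minimal, hypothetical example of how APO values end up assigned (see
// appendOperandsOfVL() below; names are illustrative): for the two-lane
// bundle
//   %r0 = add i32 %a0, %b0   ; lane 0
//   %r1 = sub i32 %a1, %b1   ; lane 1
// operand 0 has APO == false in both lanes, while operand 1 has
// APO == false in the add lane and APO == true in the sub lane, because
// %b1 is attached to an inverse operation in the left-linearized form.
// Operands may only be exchanged within a lane when their APO values match.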
1737
1738 /// During operand reordering, we are trying to select the operand at lane
1739 /// that matches best with the operand at the neighboring lane. Our
1740 /// selection is based on the type of value we are looking for. For example,
1741 /// if the neighboring lane has a load, we need to look for a load that is
1742 /// accessing a consecutive address. These strategies are summarized in the
1743 /// 'ReorderingMode' enumerator.
1744 enum class ReorderingMode {
1745 Load, ///< Matching loads to consecutive memory addresses
1746 Opcode, ///< Matching instructions based on opcode (same or alternate)
1747 Constant, ///< Matching constants
1748 Splat, ///< Matching the same instruction multiple times (broadcast)
1749 Failed, ///< We failed to create a vectorizable group
1750 };
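// For illustration (a sketch of the mode selection performed in reorder()
// below): the mode for each operand index is picked from the operand seen
// in the first lane, roughly:
//   load        -> ReorderingMode::Load
//   instruction -> ReorderingMode::Opcode (or Splat, if it should be
//                  broadcast or cannot be vectorized with another lane)
//   constant    -> ReorderingMode::Constant
//   argument    -> ReorderingMode::Splat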
1751
1752 using OperandDataVec = SmallVector<OperandData, 2>;
1753
1754 /// A vector of operand vectors.
1755 SmallVector<OperandDataVec, 4> OpsVec;
1756
1757 const TargetLibraryInfo &TLI;
1758 const DataLayout &DL;
1759 ScalarEvolution &SE;
1760 const BoUpSLP &R;
1761 const Loop *L = nullptr;
1762
1763 /// \returns the operand data at \p OpIdx and \p Lane.
1764 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1765 return OpsVec[OpIdx][Lane];
1766 }
1767
1768 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1769 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1770 return OpsVec[OpIdx][Lane];
1771 }
1772
1773 /// Clears the used flag for all entries.
1774 void clearUsed() {
1775 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1776 OpIdx != NumOperands; ++OpIdx)
1777 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1778 ++Lane)
1779 OpsVec[OpIdx][Lane].IsUsed = false;
1780 }
1781
1782 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1783 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1784 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1785 }
1786
1787 /// \param Lane lane of the operands under analysis.
1788 /// \param OpIdx operand index in lane \p Lane for which we're looking
1789 /// for the best candidate.
1790 /// \param Idx operand index of the current candidate value.
1791 /// \returns The additional score due to possible broadcasting of the
1792 /// elements in the lane. It is more profitable to have a power-of-2 number
1793 /// of unique elements in the lane, since it will be vectorized with higher
1794 /// probability after removing duplicates. Currently the SLP vectorizer
1795 /// supports only vectorization of a power-of-2 number of unique scalars.
1796 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1797 Value *IdxLaneV = getData(Idx, Lane).V;
1798 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1799 return 0;
1800 SmallPtrSet<Value *, 4> Uniques;
1801 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1802 if (Ln == Lane)
1803 continue;
1804 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1805 if (!isa<Instruction>(OpIdxLnV))
1806 return 0;
1807 Uniques.insert(OpIdxLnV);
1808 }
1809 int UniquesCount = Uniques.size();
1810 int UniquesCntWithIdxLaneV =
1811 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1812 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1813 int UniquesCntWithOpIdxLaneV =
1814 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1815 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1816 return 0;
1817 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1818 UniquesCntWithOpIdxLaneV) -
1819 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1820 }
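// A worked example of the scoring above (hypothetical counts): with 4
// lanes, suppose the other three lanes already hold 3 distinct
// instructions in operand column OpIdx. If the current value OpIdxLaneV
// would be a 4th unique value while the candidate IdxLaneV is already
// among the 3, then UniquesCntWithOpIdxLaneV == 4 (no padding needed to
// reach a power of 2) and UniquesCntWithIdxLaneV == 3 (one lane of
// padding), so the result is (4 - 4) - (4 - 3) == -1 and the candidate is
// slightly penalized.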
1821
1822 /// \param Lane lane of the operands under analysis.
1823 /// \param OpIdx operand index in lane \p Lane for which we're looking
1824 /// for the best candidate.
1825 /// \param Idx operand index of the current candidate value.
1826 /// \returns The additional score for the scalar whose users are all
1827 /// vectorized.
1828 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1829 Value *IdxLaneV = getData(Idx, Lane).V;
1830 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1831 // Do not care about number of uses for vector-like instructions
1832 // (extractelement/extractvalue with constant indices), they are extracts
1833 // themselves and already externally used. Vectorization of such
1834 // instructions does not add extra extractelement instruction, just may
1835 // remove it.
1836 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1837 isVectorLikeInstWithConstOps(OpIdxLaneV))
1838 return LookAheadHeuristics::ScoreAllUserVectorized;
1839 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1840 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1841 return 0;
1842 return R.areAllUsersVectorized(IdxLaneI)
1843 ? LookAheadHeuristics::ScoreAllUserVectorized
1844 : 0;
1845 }
1846
1847 /// Score scaling factor for fully compatible instructions but with
1848 /// different number of external uses. Allows better selection of the
1849 /// instructions with less external uses.
1850 static const int ScoreScaleFactor = 10;
1851
1852 /// \Returns the look-ahead score, which tells us how much the sub-trees
1853 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1854 /// score. This helps break ties in an informed way when we cannot decide on
1855 /// the order of the operands by just considering the immediate
1856 /// predecessors.
1857 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1858 int Lane, unsigned OpIdx, unsigned Idx,
1859 bool &IsUsed) {
1860 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1861 LookAheadMaxDepth);
1862 // Keep track of the instruction stack as we recurse into the operands
1863 // during the look-ahead score exploration.
1864 int Score =
1865 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1866 /*CurrLevel=*/1, MainAltOps);
1867 if (Score) {
1868 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1869 if (Score <= -SplatScore) {
1870 // Set the minimum score for splat-like sequence to avoid setting
1871 // failed state.
1872 Score = 1;
1873 } else {
1874 Score += SplatScore;
1875 // Scale score to see the difference between different operands
1876 // and similar operands but all vectorized/not all vectorized
1877 // uses. It does not affect actual selection of the best
1878 // compatible operand in general, just allows to select the
1879 // operand with all vectorized uses.
1880 Score *= ScoreScaleFactor;
1881 Score += getExternalUseScore(Lane, OpIdx, Idx);
1882 IsUsed = true;
1883 }
1884 }
1885 return Score;
1886 }
1887
1888 /// Best defined scores per lanes between the passes. Used to choose the
1889 /// best operand (with the highest score) between the passes.
1890 /// The key - {Operand Index, Lane}.
1891 /// The value - the best score between the passes for the lane and the
1892 /// operand.
1893 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1894 BestScoresPerLanes;
1895
1896 // Search all operands in Ops[*][Lane] for the one that best matches
1897 // Ops[OpIdx][LastLane] and return its operand index.
1898 // If no good match can be found, return std::nullopt.
1899 std::optional<unsigned>
1900 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1901 ArrayRef<ReorderingMode> ReorderingModes,
1902 ArrayRef<Value *> MainAltOps) {
1903 unsigned NumOperands = getNumOperands();
1904
1905 // The operand of the previous lane at OpIdx.
1906 Value *OpLastLane = getData(OpIdx, LastLane).V;
1907
1908 // Our strategy mode for OpIdx.
1909 ReorderingMode RMode = ReorderingModes[OpIdx];
1910 if (RMode == ReorderingMode::Failed)
1911 return std::nullopt;
1912
1913 // The linearized opcode of the operand at OpIdx, Lane.
1914 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1915
1916 // The best operand index and its score.
1917 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1918 // are using the score to differentiate between the two.
1919 struct BestOpData {
1920 std::optional<unsigned> Idx;
1921 unsigned Score = 0;
1922 } BestOp;
1923 BestOp.Score =
1924 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1925 .first->second;
1926
1927 // Track if the operand must be marked as used. If the operand is set to
1928 // Score 1 explicitly (because of a non-power-of-2 number of unique
1929 // scalars), we may want to re-estimate the operands on later iterations.
1930 bool IsUsed = RMode == ReorderingMode::Splat ||
1931 RMode == ReorderingMode::Constant ||
1932 RMode == ReorderingMode::Load;
1933 // Iterate through all unused operands and look for the best.
1934 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1935 // Get the operand at Idx and Lane.
1936 OperandData &OpData = getData(Idx, Lane);
1937 Value *Op = OpData.V;
1938 bool OpAPO = OpData.APO;
1939
1940 // Skip already selected operands.
1941 if (OpData.IsUsed)
1942 continue;
1943
1944 // Skip if we are trying to move the operand to a position with a
1945 // different opcode in the linearized tree form. This would break the
1946 // semantics.
1947 if (OpAPO != OpIdxAPO)
1948 continue;
1949
1950 // Look for an operand that matches the current mode.
1951 switch (RMode) {
1952 case ReorderingMode::Load:
1953 case ReorderingMode::Opcode: {
1954 bool LeftToRight = Lane > LastLane;
1955 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1956 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1957 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1958 OpIdx, Idx, IsUsed);
1959 if (Score > static_cast<int>(BestOp.Score) ||
1960 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
1961 Idx == OpIdx)) {
1962 BestOp.Idx = Idx;
1963 BestOp.Score = Score;
1964 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1965 }
1966 break;
1967 }
1968 case ReorderingMode::Constant:
1969 if (isa<Constant>(Op) ||
1970 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
1971 BestOp.Idx = Idx;
1972 if (isa<Constant>(Op)) {
1973 BestOp.Score = LookAheadHeuristics::ScoreConstants;
1974 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1975 LookAheadHeuristics::ScoreConstants;
1976 }
1977 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
1978 IsUsed = false;
1979 }
1980 break;
1981 case ReorderingMode::Splat:
1982 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
1983 IsUsed = Op == OpLastLane;
1984 if (Op == OpLastLane) {
1985 BestOp.Score = LookAheadHeuristics::ScoreSplat;
1986 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1987 LookAheadHeuristics::ScoreSplat;
1988 }
1989 BestOp.Idx = Idx;
1990 }
1991 break;
1992 case ReorderingMode::Failed:
1993 llvm_unreachable("Not expected Failed reordering mode.");
1994 }
1995 }
1996
1997 if (BestOp.Idx) {
1998 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1999 return BestOp.Idx;
2000 }
2001 // If we could not find a good match return std::nullopt.
2002 return std::nullopt;
2003 }
2004
2005 /// Helper for reorderOperandVecs.
2006 /// \returns the lane that we should start reordering from. This is the one
2007 /// which has the least number of operands that can freely move about, or
2008 /// which is least profitable to reorder because it already has the most optimal set of operands.
2009 unsigned getBestLaneToStartReordering() const {
2010 unsigned Min = UINT_MAX;
2011 unsigned SameOpNumber = 0;
2012 // std::pair<unsigned, unsigned> is used to implement a simple voting
2013 // algorithm and choose the lane with the least number of operands that
2014 // can freely move about or less profitable because it already has the
2015 // most optimal set of operands. The first unsigned is a counter for
2016 // voting, the second unsigned is the counter of lanes with instructions
2017 // with same/alternate opcodes and same parent basic block.
2018 SmallDenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
2019 // Try to be closer to the original results, if we have multiple lanes
2020 // with same cost. If 2 lanes have the same cost, use the one with the
2021 // lowest index.
2022 for (int I = getNumLanes(); I > 0; --I) {
2023 unsigned Lane = I - 1;
2024 OperandsOrderData NumFreeOpsHash =
2025 getMaxNumOperandsThatCanBeReordered(Lane);
2026 // Compare the number of operands that can move and choose the one with
2027 // the least number.
2028 if (NumFreeOpsHash.NumOfAPOs < Min) {
2029 Min = NumFreeOpsHash.NumOfAPOs;
2030 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2031 HashMap.clear();
2032 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2033 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2034 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2035 // Select the most optimal lane in terms of number of operands that
2036 // should be moved around.
2037 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2038 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2039 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2040 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2041 auto *It = HashMap.find(NumFreeOpsHash.Hash);
2042 if (It == HashMap.end())
2043 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2044 else
2045 ++It->second.first;
2046 }
2047 }
2048 // Select the lane with the minimum counter.
2049 unsigned BestLane = 0;
2050 unsigned CntMin = UINT_MAX;
2051 for (const auto &Data : reverse(HashMap)) {
2052 if (Data.second.first < CntMin) {
2053 CntMin = Data.second.first;
2054 BestLane = Data.second.second;
2055 }
2056 }
2057 return BestLane;
2058 }
2059
2060 /// Data structure that helps to reorder operands.
2061 struct OperandsOrderData {
2062 /// The best number of operands with the same APOs, which can be
2063 /// reordered.
2064 unsigned NumOfAPOs = UINT_MAX;
2065 /// Number of operands with the same/alternate instruction opcode and
2066 /// parent.
2067 unsigned NumOpsWithSameOpcodeParent = 0;
2068 /// Hash for the actual operands ordering.
2069 /// Used to count operands, actually their position id and opcode
2070 /// value. It is used in the voting mechanism to find the lane with the
2071 /// least number of operands that can freely move about or less profitable
2072 /// because it already has the most optimal set of operands. Can be
2073 /// replaced with SmallVector<unsigned> instead but hash code is faster
2074 /// and requires less memory.
2075 unsigned Hash = 0;
2076 };
2077 /// \returns the maximum number of operands that are allowed to be reordered
2078 /// for \p Lane and the number of compatible instructions (with the same
2079 /// parent/opcode). This is used as a heuristic for selecting the first lane
2080 /// to start operand reordering.
2081 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2082 unsigned CntTrue = 0;
2083 unsigned NumOperands = getNumOperands();
2084 // Operands with the same APO can be reordered. We therefore need to count
2085 // how many of them we have for each APO, like this: Cnt[APO] = x.
2086 // Since we only have two APOs, namely true and false, we can avoid using
2087 // a map. Instead we can simply count the number of operands that
2088 // correspond to one of them (in this case the 'true' APO), and calculate
2089 // the other by subtracting it from the total number of operands.
2090 // Operands with the same instruction opcode and parent are more
2091 // profitable since we don't need to move them in many cases, with a high
2092 // probability such lane already can be vectorized effectively.
2093 bool AllUndefs = true;
2094 unsigned NumOpsWithSameOpcodeParent = 0;
2095 Instruction *OpcodeI = nullptr;
2096 BasicBlock *Parent = nullptr;
2097 unsigned Hash = 0;
2098 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2099 const OperandData &OpData = getData(OpIdx, Lane);
2100 if (OpData.APO)
2101 ++CntTrue;
2102 // Use Boyer-Moore majority voting for finding the majority opcode and
2103 // the number of times it occurs.
2104 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2105 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2106 I->getParent() != Parent) {
2107 if (NumOpsWithSameOpcodeParent == 0) {
2108 NumOpsWithSameOpcodeParent = 1;
2109 OpcodeI = I;
2110 Parent = I->getParent();
2111 } else {
2112 --NumOpsWithSameOpcodeParent;
2113 }
2114 } else {
2115 ++NumOpsWithSameOpcodeParent;
2116 }
2117 }
2118 Hash = hash_combine(
2119 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2120 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2121 }
2122 if (AllUndefs)
2123 return {};
2124 OperandsOrderData Data;
2125 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2126 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2127 Data.Hash = Hash;
2128 return Data;
2129 }
2130
2131 /// Go through the instructions in VL and append their operands.
2132 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2133 assert(!VL.empty() && "Bad VL");
2134 assert((empty() || VL.size() == getNumLanes()) &&
2135 "Expected same number of lanes");
2136 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2137 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2138 constexpr unsigned IntrinsicNumOperands = 2;
2139 if (isa<IntrinsicInst>(VL[0]))
2140 NumOperands = IntrinsicNumOperands;
2141 OpsVec.resize(NumOperands);
2142 unsigned NumLanes = VL.size();
2143 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2144 OpsVec[OpIdx].resize(NumLanes);
2145 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2146 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2147 // Our tree has just 3 nodes: the root and two operands.
2148 // It is therefore trivial to get the APO. We only need to check the
2149 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2150 // RHS operand. The LHS operand of both add and sub is never attached
2151 // to an inverse operation in the linearized form, therefore its APO
2152 // is false. The APO of the RHS is true only if VL[Lane] is an inverse operation.
2153
2154 // Since operand reordering is performed on groups of commutative
2155 // operations or alternating sequences (e.g., +, -), we can safely
2156 // tell the inverse operations by checking commutativity.
2157 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2158 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2159 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2160 APO, false};
2161 }
2162 }
2163 }
2164
2165 /// \returns the number of operands.
2166 unsigned getNumOperands() const { return OpsVec.size(); }
2167
2168 /// \returns the number of lanes.
2169 unsigned getNumLanes() const { return OpsVec[0].size(); }
2170
2171 /// \returns the operand value at \p OpIdx and \p Lane.
2172 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2173 return getData(OpIdx, Lane).V;
2174 }
2175
2176 /// \returns true if the data structure is empty.
2177 bool empty() const { return OpsVec.empty(); }
2178
2179 /// Clears the data.
2180 void clear() { OpsVec.clear(); }
2181
2182 /// \Returns true if there are enough operands identical to \p Op to fill
2183 /// the whole vector (possibly mixed with constants or loop-invariant values).
2184 /// Note: This modifies the 'IsUsed' flag, so a call to clearUsed() must follow.
2185 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2186 bool OpAPO = getData(OpIdx, Lane).APO;
2187 bool IsInvariant = L && L->isLoopInvariant(Op);
2188 unsigned Cnt = 0;
2189 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2190 if (Ln == Lane)
2191 continue;
2192 // This is set to true if we found a candidate for broadcast at Lane.
2193 bool FoundCandidate = false;
2194 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2195 OperandData &Data = getData(OpI, Ln);
2196 if (Data.APO != OpAPO || Data.IsUsed)
2197 continue;
2198 Value *OpILane = getValue(OpI, Lane);
2199 bool IsConstantOp = isa<Constant>(OpILane);
2200 // Consider the broadcast candidate if:
2201 // 1. Same value is found in one of the operands.
2202 if (Data.V == Op ||
2203 // 2. The operand in the given lane is not constant but there is a
2204 // constant operand in another lane (which can be moved to the
2205 // given lane). In this case we can represent it as a simple
2206 // permutation of constant and broadcast.
2207 (!IsConstantOp &&
2208 ((Lns > 2 && isa<Constant>(Data.V)) ||
2209 // 2.1. If we have only 2 lanes, need to check that value in the
2210 // next lane does not build same opcode sequence.
2211 (Lns == 2 &&
2212 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2213 .getOpcode() &&
2214 isa<Constant>(Data.V)))) ||
2215 // 3. The operand in the current lane is loop invariant (can be
2216 // hoisted out) and another operand is also a loop invariant
2217 // (though not a constant). In this case the whole vector can be
2218 // hoisted out.
2219 // FIXME: need to teach the cost model about this case for better
2220 // estimation.
2221 (IsInvariant && !isa<Constant>(Data.V) &&
2222 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2223 L->isLoopInvariant(Data.V))) {
2224 FoundCandidate = true;
2225 Data.IsUsed = Data.V == Op;
2226 if (Data.V == Op)
2227 ++Cnt;
2228 break;
2229 }
2230 }
2231 if (!FoundCandidate)
2232 return false;
2233 }
2234 return getNumLanes() == 2 || Cnt > 1;
2235 }
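// A hypothetical example of the broadcast check above (names are
// illustrative): for the 4-lane bundle
//   add(%x, 1), add(%x, %y), add(%x, 2), add(%x, 3)
// the value %x appears in operand 0 of every lane, so
// shouldBroadcast(%x, /*OpIdx=*/0, /*Lane=*/0) is expected to return true
// and reorder() may then choose ReorderingMode::Splat for that operand.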
2236
2237 /// Checks if there is at least a single operand in lanes other than
2238 /// \p Lane that is compatible with the operand \p Op.
2239 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2240 bool OpAPO = getData(OpIdx, Lane).APO;
2241 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2242 if (Ln == Lane)
2243 continue;
2244 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2245 const OperandData &Data = getData(OpI, Ln);
2246 if (Data.APO != OpAPO || Data.IsUsed)
2247 return true;
2248 Value *OpILn = getValue(OpI, Ln);
2249 return (L && L->isLoopInvariant(OpILn)) ||
2250 (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2251 Op->getParent() == cast<Instruction>(OpILn)->getParent());
2252 }))
2253 return true;
2254 }
2255 return false;
2256 }
2257
2258 public:
2259 /// Initialize with all the operands of the instruction vector \p RootVL.
2260 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2261 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2262 L(R.LI->getLoopFor(
2263 (cast<Instruction>(RootVL.front())->getParent()))) {
2264 // Append all the operands of RootVL.
2265 appendOperandsOfVL(RootVL);
2266 }
2267
2268 /// \Returns a value vector with the operands across all lanes for the
2269 /// operand at \p OpIdx.
2270 ValueList getVL(unsigned OpIdx) const {
2271 ValueList OpVL(OpsVec[OpIdx].size());
2272 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2273 "Expected same num of lanes across all operands");
2274 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2275 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2276 return OpVL;
2277 }
2278
2279 // Performs operand reordering for 2 or more operands.
2280 // The original operands are in OrigOps[OpIdx][Lane].
2281 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2282 void reorder() {
2283 unsigned NumOperands = getNumOperands();
2284 unsigned NumLanes = getNumLanes();
2285 // Each operand has its own mode. We are using this mode to help us select
2286 // the instructions for each lane, so that they match best with the ones
2287 // we have selected so far.
2288 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2289
2290 // This is a greedy single-pass algorithm. We are going over each lane
2291 // once and deciding on the best order right away with no back-tracking.
2292 // However, in order to increase its effectiveness, we start with the lane
2293 // that has operands that can move the least. For example, given the
2294 // following lanes:
2295 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2296 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2297 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2298 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2299 // we will start at Lane 1, since the operands of the subtraction cannot
2300 // be reordered. Then we will visit the rest of the lanes in a circular
2301 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
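// Continuing the example above (a hypothetical trace, operand names are
// illustrative): the subtraction lanes keep their operand order because
// their APOs differ, and the commutative '+' lanes are swapped to match
// them, so the final operand vectors become {C[0], C[1], C[2], C[3]} and
// {B[0], B[1], B[2], B[3]}, which are good candidates for vectorization
// as consecutive loads.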
2302
2303 // Find the first lane that we will start our search from.
2304 unsigned FirstLane = getBestLaneToStartReordering();
2305
2306 // Initialize the modes.
2307 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2308 Value *OpLane0 = getValue(OpIdx, FirstLane);
2309 // Keep track if we have instructions with all the same opcode on one
2310 // side.
2311 if (isa<LoadInst>(OpLane0))
2312 ReorderingModes[OpIdx] = ReorderingMode::Load;
2313 else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2314 // Check if OpLane0 should be broadcast.
2315 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2316 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2317 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2318 else
2319 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2320 } else if (isa<Constant>(OpLane0))
2321 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2322 else if (isa<Argument>(OpLane0))
2323 // Our best hope is a Splat. It may save some cost in some cases.
2324 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2325 else
2326 // NOTE: This should be unreachable.
2327 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2328 }
2329
2330 // Check that we don't have the same operands. No need to reorder if the
2331 // operands are just a perfect diamond or a shuffled diamond match; the
2332 // only exceptions are possible broadcasts or a non-power-of-2 number of
2333 // scalars (just for now).
2334 auto &&SkipReordering = [this]() {
2335 SmallPtrSet<Value *, 4> UniqueValues;
2336 ArrayRef<OperandData> Op0 = OpsVec.front();
2337 for (const OperandData &Data : Op0)
2338 UniqueValues.insert(Data.V);
2339 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2340 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2341 return !UniqueValues.contains(Data.V);
2342 }))
2343 return false;
2344 }
2345 // TODO: Check if we can remove a check for non-power-2 number of
2346 // scalars after full support of non-power-2 vectorization.
2347 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2348 };
2349
2350 // If the initial strategy fails for any of the operand indexes, then we
2351 // perform reordering again in a second pass. This helps avoid assigning
2352 // high priority to the failed strategy, and should improve reordering for
2353 // the non-failed operand indexes.
2354 for (int Pass = 0; Pass != 2; ++Pass) {
2355 // Check if there is no need to reorder operands because they are a
2356 // perfect or shuffled diamond match.
2357 // Need to do it to avoid extra external use cost counting for
2358 // shuffled matches, which may cause regressions.
2359 if (SkipReordering())
2360 break;
2361 // Skip the second pass if the first pass did not fail.
2362 bool StrategyFailed = false;
2363 // Mark all operand data as free to use.
2364 clearUsed();
2365 // We keep the original operand order for the FirstLane, so reorder the
2366 // rest of the lanes. We are visiting the nodes in a circular fashion,
2367 // using FirstLane as the center point and increasing the radius
2368 // distance.
2369 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2370 for (unsigned I = 0; I < NumOperands; ++I)
2371 MainAltOps[I].push_back(getData(I, FirstLane).V);
2372
2373 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2374 // Visit the lane on the right and then the lane on the left.
2375 for (int Direction : {+1, -1}) {
2376 int Lane = FirstLane + Direction * Distance;
2377 if (Lane < 0 || Lane >= (int)NumLanes)
2378 continue;
2379 int LastLane = Lane - Direction;
2380 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2381 "Out of bounds");
2382 // Look for a good match for each operand.
2383 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2385 std::optional<unsigned> BestIdx = getBestOperand(
2386 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2387 // By not selecting a value, we allow the operands that follow to
2388 // select a better matching value. We will get a non-null value in
2389 // the next run of getBestOperand().
2390 if (BestIdx) {
2391 // Swap the current operand with the one returned by
2392 // getBestOperand().
2393 swap(OpIdx, *BestIdx, Lane);
2394 } else {
2395 // Enable the second pass.
2396 StrategyFailed = true;
2397 }
2398 // Try to get the alternate opcode and follow it during analysis.
2399 if (MainAltOps[OpIdx].size() != 2) {
2400 OperandData &AltOp = getData(OpIdx, Lane);
2401 InstructionsState OpS =
2402 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2403 if (OpS.getOpcode() && OpS.isAltShuffle())
2404 MainAltOps[OpIdx].push_back(AltOp.V);
2405 }
2406 }
2407 }
2408 }
2409 // Skip second pass if the strategy did not fail.
2410 if (!StrategyFailed)
2411 break;
2412 }
2413 }
2414
2415#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2416 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2417 switch (RMode) {
2418 case ReorderingMode::Load:
2419 return "Load";
2420 case ReorderingMode::Opcode:
2421 return "Opcode";
2422 case ReorderingMode::Constant:
2423 return "Constant";
2424 case ReorderingMode::Splat:
2425 return "Splat";
2426 case ReorderingMode::Failed:
2427 return "Failed";
2428 }
2429 llvm_unreachable("Unimplemented Reordering Type");
2430 }
2431
2432 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2433 raw_ostream &OS) {
2434 return OS << getModeStr(RMode);
2435 }
2436
2437 /// Debug print.
2438 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2439 printMode(RMode, dbgs());
2440 }
2441
2442 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2443 return printMode(RMode, OS);
2444 }
2445
2446 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2447 const unsigned Indent = 2;
2448 unsigned Cnt = 0;
2449 for (const OperandDataVec &OpDataVec : OpsVec) {
2450 OS << "Operand " << Cnt++ << "\n";
2451 for (const OperandData &OpData : OpDataVec) {
2452 OS.indent(Indent) << "{";
2453 if (Value *V = OpData.V)
2454 OS << *V;
2455 else
2456 OS << "null";
2457 OS << ", APO:" << OpData.APO << "}\n";
2458 }
2459 OS << "\n";
2460 }
2461 return OS;
2462 }
2463
2464 /// Debug print.
2465 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2466#endif
2467 };
2468
2469 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2470 /// of the pair with the highest score, i.e. the one deemed to have the best
2471 /// chance to form the root of a profitable tree to vectorize. Return
2472 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2473 /// \param Limit Lower limit of the score considered to be good enough.
2474 std::optional<int>
2475 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2476 int Limit = LookAheadHeuristics::ScoreFail) const {
2477 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2478 RootLookAheadMaxDepth);
2479 int BestScore = Limit;
2480 std::optional<int> Index;
2481 for (int I : seq<int>(0, Candidates.size())) {
2482 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2483 Candidates[I].second,
2484 /*U1=*/nullptr, /*U2=*/nullptr,
2485 /*Level=*/1, std::nullopt);
2486 if (Score > BestScore) {
2487 BestScore = Score;
2488 Index = I;
2489 }
2490 }
2491 return Index;
2492 }
2493
2494 /// Checks if the instruction is marked for deletion.
2495 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2496
2497 /// Removes an instruction from its block and eventually deletes it.
2498 /// It's like Instruction::eraseFromParent() except that the actual deletion
2499 /// is delayed until BoUpSLP is destructed.
2500 void eraseInstruction(Instruction *I) {
2501 DeletedInstructions.insert(I);
2502 }
2503
2504 /// Remove instructions from the parent function and clear the operands of \p
2505 /// DeadVals instructions, marking trivially dead operands for deletion.
2506 template <typename T>
2507 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2508 SmallVector<WeakTrackingVH> DeadInsts;
2509 for (T *V : DeadVals) {
2510 auto *I = cast<Instruction>(V);
2511 DeletedInstructions.insert(I);
2512 }
2513 DenseSet<Value *> Processed;
2514 for (T *V : DeadVals) {
2515 if (!V || !Processed.insert(V).second)
2516 continue;
2517 auto *I = cast<Instruction>(V);
2518 salvageDebugInfo(*I);
2519 SmallVector<const TreeEntry *> Entries;
2520 if (const TreeEntry *Entry = getTreeEntry(I)) {
2521 Entries.push_back(Entry);
2522 auto It = MultiNodeScalars.find(I);
2523 if (It != MultiNodeScalars.end())
2524 Entries.append(It->second.begin(), It->second.end());
2525 }
2526 for (Use &U : I->operands()) {
2527 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2528 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2529 isInstructionTriviallyDead(OpI, TLI) &&
2530 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2531 return Entry->VectorizedValue == OpI;
2532 })))
2533 DeadInsts.push_back(OpI);
2534 }
2535 I->dropAllReferences();
2536 }
2537 for (T *V : DeadVals) {
2538 auto *I = cast<Instruction>(V);
2539 if (!I->getParent())
2540 continue;
2541 assert((I->use_empty() || all_of(I->uses(),
2542 [&](Use &U) {
2543 return isDeleted(
2544 cast<Instruction>(U.getUser()));
2545 })) &&
2546 "trying to erase instruction with users.");
2547 I->removeFromParent();
2548 SE->forgetValue(I);
2549 }
2550 // Process the dead instruction list until empty.
2551 while (!DeadInsts.empty()) {
2552 Value *V = DeadInsts.pop_back_val();
2553 Instruction *VI = cast_or_null<Instruction>(V);
2554 if (!VI || !VI->getParent())
2555 continue;
2556 assert(isInstructionTriviallyDead(VI, TLI) &&
2557 "Live instruction found in dead worklist!");
2558 assert(VI->use_empty() && "Instructions with uses are not dead.");
2559
2560 // Don't lose the debug info while deleting the instructions.
2561 salvageDebugInfo(*VI);
2562
2563 // Null out all of the instruction's operands to see if any operand
2564 // becomes dead as we go.
2565 for (Use &OpU : VI->operands()) {
2566 Value *OpV = OpU.get();
2567 if (!OpV)
2568 continue;
2569 OpU.set(nullptr);
2570
2571 if (!OpV->use_empty())
2572 continue;
2573
2574 // If the operand is an instruction that became dead as we nulled out
2575 // the operand, and if it is 'trivially' dead, delete it in a future
2576 // loop iteration.
2577 if (auto *OpI = dyn_cast<Instruction>(OpV))
2578 if (!DeletedInstructions.contains(OpI) &&
2579 isInstructionTriviallyDead(OpI, TLI))
2580 DeadInsts.push_back(OpI);
2581 }
2582
2583 VI->removeFromParent();
2584 DeletedInstructions.insert(VI);
2585 SE->forgetValue(VI);
2586 }
2587 }
2588
2589 /// Checks if the instruction was already analyzed for being possible
2590 /// reduction root.
2591 bool isAnalyzedReductionRoot(Instruction *I) const {
2592 return AnalyzedReductionsRoots.count(I);
2593 }
2594 /// Register given instruction as already analyzed for being possible
2595 /// reduction root.
2596 void analyzedReductionRoot(Instruction *I) {
2597 AnalyzedReductionsRoots.insert(I);
2598 }
2599 /// Checks if the provided list of reduced values was checked already for
2600 /// vectorization.
2601 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2602 return AnalyzedReductionVals.contains(hash_value(VL));
2603 }
2604 /// Adds the list of reduced values to list of already checked values for the
2605 /// vectorization.
2606 void analyzedReductionVals(ArrayRef<Value *> VL) {
2607 AnalyzedReductionVals.insert(hash_value(VL));
2608 }
2609 /// Clear the list of the analyzed reduction root instructions.
2610 void clearReductionData() {
2611 AnalyzedReductionsRoots.clear();
2612 AnalyzedReductionVals.clear();
2613 AnalyzedMinBWVals.clear();
2614 }
2615 /// Checks if the given value is gathered in one of the nodes.
2616 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2617 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2618 }
2619 /// Checks if the given value is gathered in one of the nodes.
2620 bool isGathered(const Value *V) const {
2621 return MustGather.contains(V);
2622 }
2623 /// Checks if the specified value was not scheduled.
2624 bool isNotScheduled(const Value *V) const {
2625 return NonScheduledFirst.contains(V);
2626 }
2627
2628 /// Check if the value is vectorized in the tree.
2629 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2630
2631 ~BoUpSLP();
2632
2633private:
2634 /// Determine if a node \p E can be demoted to a smaller type with a
2635 /// truncation. We collect the entries that will be demoted in ToDemote.
2636 /// \param E Node for analysis
2637 /// \param ToDemote indices of the nodes to be demoted.
2638 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2639 unsigned &BitWidth,
2640 SmallVectorImpl<unsigned> &ToDemote,
2641 DenseSet<const TreeEntry *> &Visited,
2642 unsigned &MaxDepthLevel,
2643 bool &IsProfitableToDemote,
2644 bool IsTruncRoot) const;
2645
2646 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2647 /// reordering (i.e. the operands can be reordered because they have only one
2648 /// user and are reorderable).
2649 /// \param ReorderableGathers List of all gather nodes that require reordering
2650 /// (e.g., gather of extractelements or partially vectorizable loads).
2651 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2652 /// reordering, subset of \p NonVectorized.
2653 bool
2654 canReorderOperands(TreeEntry *UserTE,
2655 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2656 ArrayRef<TreeEntry *> ReorderableGathers,
2657 SmallVectorImpl<TreeEntry *> &GatherOps);
2658
2659 /// Checks if the given \p TE is a gather node with clustered reused scalars
2660 /// and reorders it per given \p Mask.
2661 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2662
2663 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2664 /// if any. If it is not vectorized (gather node), returns nullptr.
2665 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2666 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2667 TreeEntry *TE = nullptr;
2668 const auto *It = find_if(VL, [&](Value *V) {
2669 TE = getTreeEntry(V);
2670 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2671 return true;
2672 auto It = MultiNodeScalars.find(V);
2673 if (It != MultiNodeScalars.end()) {
2674 for (TreeEntry *E : It->second) {
2675 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2676 TE = E;
2677 return true;
2678 }
2679 }
2680 }
2681 return false;
2682 });
2683 if (It != VL.end()) {
2684 assert(TE->isSame(VL) && "Expected same scalars.");
2685 return TE;
2686 }
2687 return nullptr;
2688 }
2689
2690 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2691 /// if any. If it is not vectorized (gather node), returns nullptr.
2692 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2693 unsigned OpIdx) const {
2694 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2695 const_cast<TreeEntry *>(UserTE), OpIdx);
2696 }
2697
2698 /// Checks if all users of \p I are the part of the vectorization tree.
2699 bool areAllUsersVectorized(
2700 Instruction *I,
2701 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2702
2703 /// Return information about the vector formed for the specified index
2704 /// of a vector of (the same) instruction.
2705 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2706
2707 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2708 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2709
2710 /// \returns Cast context for the given graph node.
2711 TargetTransformInfo::CastContextHint
2712 getCastContextHint(const TreeEntry &TE) const;
2713
2714 /// \returns the cost of the vectorizable entry.
2715 InstructionCost getEntryCost(const TreeEntry *E,
2716 ArrayRef<Value *> VectorizedVals,
2717 SmallPtrSetImpl<Value *> &CheckedExtracts);
2718
2719 /// This is the recursive part of buildTree.
2720 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2721 const EdgeInfo &EI);
2722
2723 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2724 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2725 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2726 /// returns false, setting \p CurrentOrder to either an empty vector or a
2727 /// non-identity permutation that allows to reuse extract instructions.
2728 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2729 /// extract order.
2730 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2731 SmallVectorImpl<unsigned> &CurrentOrder,
2732 bool ResizeAllowed = false) const;
2733
2734 /// Vectorize a single entry in the tree.
2735 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2736 /// avoid issues with def-use order.
2737 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2738
2739 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2740 /// \p E.
2741 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2742 /// avoid issues with def-use order.
2743 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2744
2745 /// Create a new vector from a list of scalar values. Produces a sequence
2746 /// which exploits values reused across lanes, and arranges the inserts
2747 /// for ease of later optimization.
2748 template <typename BVTy, typename ResTy, typename... Args>
2749 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2750
2751 /// Create a new vector from a list of scalar values. Produces a sequence
2752 /// which exploits values reused across lanes, and arranges the inserts
2753 /// for ease of later optimization.
2754 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2755
2756 /// Returns the instruction in the bundle, which can be used as a base point
2757 /// for scheduling. Usually it is the last instruction in the bundle, except
2758 /// for the case when all operands are external (in this case, it is the first
2759 /// instruction in the list).
2760 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2761
2762 /// Tries to find extractelement instructions with constant indices from fixed
2763 /// vector type and gather such instructions into a bunch, which highly likely
2764 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2765 /// was successful, the matched scalars are replaced by poison values in \p VL
2766 /// for future analysis.
2767 std::optional<TargetTransformInfo::ShuffleKind>
2768 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2769 SmallVectorImpl<int> &Mask) const;
2770
2771 /// Tries to find extractelement instructions with constant indices from fixed
2772 /// vector type and gather such instructions into a bunch, which highly likely
2773 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2774 /// was successful, the matched scalars are replaced by poison values in \p VL
2775 /// for future analysis.
2776 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2777 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2778 SmallVectorImpl<SmallVector<int>> &Mask,
2779 unsigned NumParts) const;
2780
2781 /// Checks if the gathered \p VL can be represented as a single register
2782 /// shuffle(s) of previous tree entries.
2783 /// \param TE Tree entry checked for permutation.
2784 /// \param VL List of scalars (a subset of the TE scalar), checked for
2785 /// permutations. Must form single-register vector.
2786 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2787 /// commands to build the mask using the original vector value, without
2788 /// relying on the potential reordering.
2789 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2790 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2791 std::optional<TargetTransformInfo::ShuffleKind>
2792 isGatherShuffledSingleRegisterEntry(
2793 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2794 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2795 bool ForOrder);
2796
2797 /// Checks if the gathered \p VL can be represented as multi-register
2798 /// shuffle(s) of previous tree entries.
2799 /// \param TE Tree entry checked for permutation.
2800 /// \param VL List of scalars (a subset of the TE scalar), checked for
2801 /// permutations.
2802 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2803 /// commands to build the mask using the original vector value, without
2804 /// relying on the potential reordering.
2805 /// \returns per-register series of ShuffleKind, if gathered values can be
2806 /// represented as shuffles of previous tree entries. \p Mask is filled with
2807 /// the shuffle mask (also on a per-register basis).
2808 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2809 isGatherShuffledEntry(
2810 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2811 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2812 unsigned NumParts, bool ForOrder = false);
2813
2814 /// \returns the scalarization cost for this list of values. Assuming that
2815 /// this subtree gets vectorized, we may need to extract the values from the
2816 /// roots. This method calculates the cost of extracting the values.
2817 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2818 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2819 Type *ScalarTy) const;
2820
2821 /// Set the Builder insert point to one after the last instruction in
2822 /// the bundle
2823 void setInsertPointAfterBundle(const TreeEntry *E);
2824
2825 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2826 /// specified, the starting vector value is poison.
2827 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2828
2829 /// \returns whether the VectorizableTree is fully vectorizable and will
2830 /// be beneficial even the tree height is tiny.
2831 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2832
2833 /// Reorder commutative or alt operands to get better probability of
2834 /// generating vectorized code.
2835 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2836 SmallVectorImpl<Value *> &Left,
2837 SmallVectorImpl<Value *> &Right,
2838 const BoUpSLP &R);
2839
2840 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2841 /// users of \p TE and collects the stores. It returns the map from the store
2842 /// pointers to the collected stores.
2843 DenseMap<Value *, SmallVector<StoreInst *>>
2844 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2845
2846 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2847 /// stores in \p StoresVec can form a vector instruction. If so it returns
2848 /// true and populates \p ReorderIndices with the shuffle indices of the
2849 /// stores when compared to the sorted vector.
2850 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2851 OrdersType &ReorderIndices) const;
2852
2853 /// Iterates through the users of \p TE, looking for scalar stores that can be
2854 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2855 /// their order and builds an order index vector for each store bundle. It
2856 /// returns all these order vectors found.
2857 /// We run this after the tree has formed, otherwise we may come across user
2858 /// instructions that are not yet in the tree.
2859 SmallVector<OrdersType, 1>
2860 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2861
2862 struct TreeEntry {
2863 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2864 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2865
2866 /// \returns Common mask for reorder indices and reused scalars.
2867 SmallVector<int> getCommonMask() const {
2868 SmallVector<int> Mask;
2869 inversePermutation(ReorderIndices, Mask);
2870 ::addMask(Mask, ReuseShuffleIndices);
2871 return Mask;
2872 }
2873
2874 /// \returns true if the scalars in VL are equal to this entry.
2875 bool isSame(ArrayRef<Value *> VL) const {
2876 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2877 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2878 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2879 return VL.size() == Mask.size() &&
2880 std::equal(VL.begin(), VL.end(), Mask.begin(),
2881 [Scalars](Value *V, int Idx) {
2882 return (isa<UndefValue>(V) &&
2883 Idx == PoisonMaskElem) ||
2884 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2885 });
2886 };
2887 if (!ReorderIndices.empty()) {
2888 // TODO: implement matching if the nodes are just reordered, still can
2889 // treat the vector as the same if the list of scalars matches VL
2890 // directly, without reordering.
2891 SmallVector<int> Mask;
2892 inversePermutation(ReorderIndices, Mask);
2893 if (VL.size() == Scalars.size())
2894 return IsSame(Scalars, Mask);
2895 if (VL.size() == ReuseShuffleIndices.size()) {
2896 ::addMask(Mask, ReuseShuffleIndices);
2897 return IsSame(Scalars, Mask);
2898 }
2899 return false;
2900 }
2901 return IsSame(Scalars, ReuseShuffleIndices);
2902 }
2903
2904 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2905 return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2906 UserTreeIndices.front().UserTE == UserEI.UserTE;
2907 }
2908
2909 /// \returns true if current entry has same operands as \p TE.
2910 bool hasEqualOperands(const TreeEntry &TE) const {
2911 if (TE.getNumOperands() != getNumOperands())
2912 return false;
2913 SmallBitVector Used(getNumOperands());
2914 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2915 unsigned PrevCount = Used.count();
2916 for (unsigned K = 0; K < E; ++K) {
2917 if (Used.test(K))
2918 continue;
2919 if (getOperand(K) == TE.getOperand(I)) {
2920 Used.set(K);
2921 break;
2922 }
2923 }
2924 // Check if we actually found the matching operand.
2925 if (PrevCount == Used.count())
2926 return false;
2927 }
2928 return true;
2929 }
2930
2931 /// \return Final vectorization factor for the node. Defined by the total
2932 /// number of vectorized scalars, including those, used several times in the
2933 /// entry and counted in the \a ReuseShuffleIndices, if any.
2934 unsigned getVectorFactor() const {
2935 if (!ReuseShuffleIndices.empty())
2936 return ReuseShuffleIndices.size();
2937 return Scalars.size();
2938 };
2939
2940 /// Checks if the current node is a gather node.
2941 bool isGather() const { return State == NeedToGather; }
2942
2943 /// A vector of scalars.
2944 ValueList Scalars;
2945
2946 /// The Scalars are vectorized into this value. It is initialized to Null.
2947 WeakTrackingVH VectorizedValue = nullptr;
2948
2949 /// New vector phi instructions emitted for the vectorized phi nodes.
2950 PHINode *PHI = nullptr;
2951
2952 /// Do we need to gather this sequence or vectorize it
2953 /// (either with vector instruction or with scatter/gather
2954 /// intrinsics for store/load)?
2955 enum EntryState {
2956 Vectorize,
2957 ScatterVectorize,
2958 StridedVectorize,
2959 NeedToGather
2960 };
2961 EntryState State;
2962
2963 /// Does this sequence require some shuffling?
2964 SmallVector<int, 4> ReuseShuffleIndices;
2965
2966 /// Does this entry require reordering?
2967 SmallVector<unsigned, 4> ReorderIndices;
2968
2969 /// Points back to the VectorizableTree.
2970 ///
2971 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2972 /// to be a pointer and needs to be able to initialize the child iterator.
2973 /// Thus we need a reference back to the container to translate the indices
2974 /// to entries.
2975 VecTreeTy &Container;
2976
2977 /// The TreeEntry index containing the user of this entry. We can actually
2978 /// have multiple users so the data structure is not truly a tree.
2979 SmallVector<EdgeInfo, 1> UserTreeIndices;
2980
2981 /// The index of this treeEntry in VectorizableTree.
2982 int Idx = -1;
2983
2984 private:
2985 /// The operands of each instruction in each lane Operands[op_index][lane].
2986 /// Note: This helps avoid the replication of the code that performs the
2987 /// reordering of operands during buildTree_rec() and vectorizeTree().
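  /// E.g., for the scalars {add a0, b0; add a1, b1} the layout is
  /// Operands[0] = {a0, a1} and Operands[1] = {b0, b1}.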
2988    SmallVector<ValueList, 2> Operands;
2989
2990 /// The main/alternate instruction.
2991 Instruction *MainOp = nullptr;
2992 Instruction *AltOp = nullptr;
2993
2994 public:
2995 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2996 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2997 if (Operands.size() < OpIdx + 1)
2998 Operands.resize(OpIdx + 1);
2999 assert(Operands[OpIdx].empty() && "Already resized?");
3000 assert(OpVL.size() <= Scalars.size() &&
3001 "Number of operands is greater than the number of scalars.");
3002 Operands[OpIdx].resize(OpVL.size());
3003 copy(OpVL, Operands[OpIdx].begin());
3004 }
3005
3006 /// Set the operands of this bundle in their original order.
3007 void setOperandsInOrder() {
3008 assert(Operands.empty() && "Already initialized?");
3009 auto *I0 = cast<Instruction>(Scalars[0]);
3010 Operands.resize(I0->getNumOperands());
3011 unsigned NumLanes = Scalars.size();
3012 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3013 OpIdx != NumOperands; ++OpIdx) {
3014 Operands[OpIdx].resize(NumLanes);
3015 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3016 auto *I = cast<Instruction>(Scalars[Lane]);
3017 assert(I->getNumOperands() == NumOperands &&
3018 "Expected same number of operands");
3019 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
3020 }
3021 }
3022 }
3023
3024 /// Reorders operands of the node to the given mask \p Mask.
3025 void reorderOperands(ArrayRef<int> Mask) {
3026 for (ValueList &Operand : Operands)
3027 reorderScalars(Operand, Mask);
3028 }
3029
3030 /// \returns the \p OpIdx operand of this TreeEntry.
3031 ValueList &getOperand(unsigned OpIdx) {
3032 assert(OpIdx < Operands.size() && "Off bounds");
3033 return Operands[OpIdx];
3034 }
3035
3036 /// \returns the \p OpIdx operand of this TreeEntry.
3037 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3038 assert(OpIdx < Operands.size() && "Off bounds");
3039 return Operands[OpIdx];
3040 }
3041
3042 /// \returns the number of operands.
3043 unsigned getNumOperands() const { return Operands.size(); }
3044
3045 /// \return the single \p OpIdx operand.
3046 Value *getSingleOperand(unsigned OpIdx) const {
3047 assert(OpIdx < Operands.size() && "Off bounds");
3048 assert(!Operands[OpIdx].empty() && "No operand available");
3049 return Operands[OpIdx][0];
3050 }
3051
3052 /// Some of the instructions in the list have alternate opcodes.
3053 bool isAltShuffle() const { return MainOp != AltOp; }
3054
3055 bool isOpcodeOrAlt(Instruction *I) const {
3056 unsigned CheckedOpcode = I->getOpcode();
3057 return (getOpcode() == CheckedOpcode ||
3058 getAltOpcode() == CheckedOpcode);
3059 }
3060
3061  /// Chooses the correct key for scheduling data. If \p Op has the same (or
3062  /// alternate) opcode as the main operation of this entry, the key is \p Op.
3063  /// Otherwise the key is the main operation.
3064 Value *isOneOf(Value *Op) const {
3065 auto *I = dyn_cast<Instruction>(Op);
3066 if (I && isOpcodeOrAlt(I))
3067 return Op;
3068 return MainOp;
3069 }
3070
3071 void setOperations(const InstructionsState &S) {
3072 MainOp = S.MainOp;
3073 AltOp = S.AltOp;
3074 }
3075
3076 Instruction *getMainOp() const {
3077 return MainOp;
3078 }
3079
3080 Instruction *getAltOp() const {
3081 return AltOp;
3082 }
3083
3084 /// The main/alternate opcodes for the list of instructions.
3085 unsigned getOpcode() const {
3086 return MainOp ? MainOp->getOpcode() : 0;
3087 }
3088
3089 unsigned getAltOpcode() const {
3090 return AltOp ? AltOp->getOpcode() : 0;
3091 }
3092
3093  /// When ReuseShuffleIndices is empty, returns the position of \p V within
3094  /// the vector of Scalars. Otherwise, remaps it via the reorder/reuse indices.
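  /// E.g., with Scalars = {a, b}, ReorderIndices = {1, 0} and empty
  /// ReuseShuffleIndices, findLaneForValue(a) returns 1.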
3095 int findLaneForValue(Value *V) const {
3096 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
3097 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3098 if (!ReorderIndices.empty())
3099 FoundLane = ReorderIndices[FoundLane];
3100 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3101 if (!ReuseShuffleIndices.empty()) {
3102 FoundLane = std::distance(ReuseShuffleIndices.begin(),
3103 find(ReuseShuffleIndices, FoundLane));
3104 }
3105 return FoundLane;
3106 }
3107
3108 /// Build a shuffle mask for graph entry which represents a merge of main
3109 /// and alternate operations.
3110 void
3111 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3112                         SmallVectorImpl<int> &Mask,
3113                         SmallVectorImpl<Value *> *OpScalars = nullptr,
3114 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3115
3116 /// Return true if this is a non-power-of-2 node.
3117 bool isNonPowOf2Vec() const {
3118 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
3119 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3120 "Reshuffling not supported with non-power-of-2 vectors yet.");
3121 return IsNonPowerOf2;
3122 }
3123
3124#ifndef NDEBUG
3125 /// Debug printer.
3126 LLVM_DUMP_METHOD void dump() const {
3127 dbgs() << Idx << ".\n";
3128 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3129 dbgs() << "Operand " << OpI << ":\n";
3130 for (const Value *V : Operands[OpI])
3131 dbgs().indent(2) << *V << "\n";
3132 }
3133 dbgs() << "Scalars: \n";
3134 for (Value *V : Scalars)
3135 dbgs().indent(2) << *V << "\n";
3136 dbgs() << "State: ";
3137 switch (State) {
3138 case Vectorize:
3139 dbgs() << "Vectorize\n";
3140 break;
3141 case ScatterVectorize:
3142 dbgs() << "ScatterVectorize\n";
3143 break;
3144 case StridedVectorize:
3145 dbgs() << "StridedVectorize\n";
3146 break;
3147 case NeedToGather:
3148 dbgs() << "NeedToGather\n";
3149 break;
3150 }
3151 dbgs() << "MainOp: ";
3152 if (MainOp)
3153 dbgs() << *MainOp << "\n";
3154 else
3155 dbgs() << "NULL\n";
3156 dbgs() << "AltOp: ";
3157 if (AltOp)
3158 dbgs() << *AltOp << "\n";
3159 else
3160 dbgs() << "NULL\n";
3161 dbgs() << "VectorizedValue: ";
3162 if (VectorizedValue)
3163 dbgs() << *VectorizedValue << "\n";
3164 else
3165 dbgs() << "NULL\n";
3166 dbgs() << "ReuseShuffleIndices: ";
3167 if (ReuseShuffleIndices.empty())
3168 dbgs() << "Empty";
3169 else
3170 for (int ReuseIdx : ReuseShuffleIndices)
3171 dbgs() << ReuseIdx << ", ";
3172 dbgs() << "\n";
3173 dbgs() << "ReorderIndices: ";
3174 for (unsigned ReorderIdx : ReorderIndices)
3175 dbgs() << ReorderIdx << ", ";
3176 dbgs() << "\n";
3177 dbgs() << "UserTreeIndices: ";
3178 for (const auto &EInfo : UserTreeIndices)
3179 dbgs() << EInfo << ", ";
3180 dbgs() << "\n";
3181 }
3182#endif
3183 };
3184
3185#ifndef NDEBUG
3186 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3187 InstructionCost VecCost, InstructionCost ScalarCost,
3188 StringRef Banner) const {
3189 dbgs() << "SLP: " << Banner << ":\n";
3190 E->dump();
3191 dbgs() << "SLP: Costs:\n";
3192 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3193 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3194 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3195 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3196 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3197 }
3198#endif
3199
3200 /// Create a new VectorizableTree entry.
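  /// Gather entries are created without a scheduling bundle (std::nullopt);
  /// vectorized entries require a valid one.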
3201 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3202 std::optional<ScheduleData *> Bundle,
3203 const InstructionsState &S,
3204 const EdgeInfo &UserTreeIdx,
3205 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3206 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3207 TreeEntry::EntryState EntryState =
3208 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3209 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3210 ReuseShuffleIndices, ReorderIndices);
3211 }
3212
3213 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3214 TreeEntry::EntryState EntryState,
3215 std::optional<ScheduleData *> Bundle,
3216 const InstructionsState &S,
3217 const EdgeInfo &UserTreeIdx,
3218 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3219 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3220 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3221 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3222 "Need to vectorize gather entry?");
3223 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3224 TreeEntry *Last = VectorizableTree.back().get();
3225 Last->Idx = VectorizableTree.size() - 1;
3226 Last->State = EntryState;
3227 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3228 ReuseShuffleIndices.end());
3229 if (ReorderIndices.empty()) {
3230 Last->Scalars.assign(VL.begin(), VL.end());
3231 Last->setOperations(S);
3232 } else {
3233 // Reorder scalars and build final mask.
3234 Last->Scalars.assign(VL.size(), nullptr);
3235 transform(ReorderIndices, Last->Scalars.begin(),
3236 [VL](unsigned Idx) -> Value * {
3237 if (Idx >= VL.size())
3238 return UndefValue::get(VL.front()->getType());
3239 return VL[Idx];
3240 });
3241 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3242 Last->setOperations(S);
3243 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3244 }
3245 if (!Last->isGather()) {
3246 for (Value *V : VL) {
3247 const TreeEntry *TE = getTreeEntry(V);
3248 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3249 "Scalar already in tree!");
3250 if (TE) {
3251 if (TE != Last)
3252 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3253 continue;
3254 }
3255 ScalarToTreeEntry[V] = Last;
3256 }
3257 // Update the scheduler bundle to point to this TreeEntry.
3258 ScheduleData *BundleMember = *Bundle;
3259 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3260 isVectorLikeInstWithConstOps(S.MainOp) ||
3261 doesNotNeedToSchedule(VL)) &&
3262 "Bundle and VL out of sync");
3263 if (BundleMember) {
3264 for (Value *V : VL) {
3265          if (doesNotNeedToBeScheduled(V))
3266            continue;
3267 if (!BundleMember)
3268 continue;
3269 BundleMember->TE = Last;
3270 BundleMember = BundleMember->NextInBundle;
3271 }
3272 }
3273 assert(!BundleMember && "Bundle and VL out of sync");
3274 } else {
3275 // Build a map for gathered scalars to the nodes where they are used.
3276 bool AllConstsOrCasts = true;
3277 for (Value *V : VL)
3278 if (!isConstant(V)) {
3279 auto *I = dyn_cast<CastInst>(V);
3280 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3281 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3282 }
3283 if (AllConstsOrCasts)
3284 CastMaxMinBWSizes =
3285 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3286 MustGather.insert(VL.begin(), VL.end());
3287 }
3288
3289 if (UserTreeIdx.UserTE) {
3290 Last->UserTreeIndices.push_back(UserTreeIdx);
3291 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3292 "Reordering isn't implemented for non-power-of-2 nodes yet");
3293 }
3294 return Last;
3295 }
3296
3297 /// -- Vectorization State --
3298 /// Holds all of the tree entries.
3299 TreeEntry::VecTreeTy VectorizableTree;
3300
3301#ifndef NDEBUG
3302 /// Debug printer.
3303 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3304 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3305 VectorizableTree[Id]->dump();
3306 dbgs() << "\n";
3307 }
3308 }
3309#endif
3310
3311 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3312
3313 const TreeEntry *getTreeEntry(Value *V) const {
3314 return ScalarToTreeEntry.lookup(V);
3315 }
3316
3317  /// Check that the operand node of the alternate node does not generate a
3318  /// buildvector sequence. If it does, it is probably not worth building an
3319  /// alternate shuffle when the number of buildvector operands plus the
3320  /// alternate instruction exceeds the number of buildvector instructions.
3321 /// \param S the instructions state of the analyzed values.
3322 /// \param VL list of the instructions with alternate opcodes.
3323 bool areAltOperandsProfitable(const InstructionsState &S,
3324 ArrayRef<Value *> VL) const;
3325
3326 /// Checks if the specified list of the instructions/values can be vectorized
3327 /// and fills required data before actual scheduling of the instructions.
3328 TreeEntry::EntryState getScalarsVectorizationState(
3329 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3330 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3331
3332 /// Maps a specific scalar to its tree entry.
3333 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3334
3335  /// Maps scalars that are used in several vectorized nodes to the list of
3336  /// those nodes.
3337  SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3338
3339 /// Maps a value to the proposed vectorizable size.
3340 SmallDenseMap<Value *, unsigned> InstrElementSize;
3341
3342 /// A list of scalars that we found that we need to keep as scalars.
3343 ValueSet MustGather;
3344
3345 /// A set of first non-schedulable values.
3346 ValueSet NonScheduledFirst;
3347
3348 /// A map between the vectorized entries and the last instructions in the
3349 /// bundles. The bundles are built in use order, not in the def order of the
3350 /// instructions. So, we cannot rely directly on the last instruction in the
3351 /// bundle being the last instruction in the program order during
3352  /// the vectorization process, since the basic blocks are modified; the last
3353  /// instructions need to be pre-gathered beforehand.
3354 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3355
3356  /// List of gather nodes that depend on other gather/vector nodes and should
3357  /// be emitted after the vector instruction emission process, to correctly
3358  /// handle the order of the vector instructions and shuffles.
3359 SetVector<const TreeEntry *> PostponedGathers;
3360
3361 using ValueToGatherNodesMap =
3362      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3363  ValueToGatherNodesMap ValueToGatherNodes;
3364
3365 /// This POD struct describes one external user in the vectorized tree.
3366 struct ExternalUser {
3367 ExternalUser(Value *S, llvm::User *U, int L)
3368 : Scalar(S), User(U), Lane(L) {}
3369
3370 // Which scalar in our function.
3371 Value *Scalar;
3372
3373    // The user that uses the scalar.
3374    llvm::User *User;
3375
3376 // Which lane does the scalar belong to.
3377 int Lane;
3378 };
3379 using UserList = SmallVector<ExternalUser, 16>;
3380
3381 /// Checks if two instructions may access the same memory.
3382 ///
3383 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3384 /// is invariant in the calling loop.
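  /// The result is cached symmetrically for both (Inst1, Inst2) orderings.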
3385 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3386 Instruction *Inst2) {
3387 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3388 return true;
3389 // First check if the result is already in the cache.
3390 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3391 auto It = AliasCache.find(Key);
3392 if (It != AliasCache.end())
3393 return It->second;
3394 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3395 // Store the result in the cache.
3396 AliasCache.try_emplace(Key, Aliased);
3397 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3398 return Aliased;
3399 }
3400
3401 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3402
3403 /// Cache for alias results.
3404 /// TODO: consider moving this to the AliasAnalysis itself.
3405  DenseMap<AliasCacheKey, bool> AliasCache;
3406
3407 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3408 // globally through SLP because we don't perform any action which
3409 // invalidates capture results.
3410 BatchAAResults BatchAA;
3411
3412 /// Temporary store for deleted instructions. Instructions will be deleted
3413 /// eventually when the BoUpSLP is destructed. The deferral is required to
3414 /// ensure that there are no incorrect collisions in the AliasCache, which
3415 /// can happen if a new instruction is allocated at the same address as a
3416 /// previously deleted instruction.
3417 DenseSet<Instruction *> DeletedInstructions;
3418
3419  /// Set of instructions already analyzed for reductions.
3420 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3421
3422 /// Set of hashes for the list of reduction values already being analyzed.
3423 DenseSet<size_t> AnalyzedReductionVals;
3424
3425  /// Values that have already been analyzed for minimal bitwidth and found to
3426  /// be non-profitable.
3427 DenseSet<Value *> AnalyzedMinBWVals;
3428
3429  /// A list of values that need to be extracted out of the tree.
3430  /// This list holds pairs of (Internal Scalar : External User). External User
3431  /// can be nullptr, which means that this Internal Scalar will be used later,
3432 /// after vectorization.
3433 UserList ExternalUses;
3434
3435  /// A list of GEPs which can be replaced by scalar GEPs instead of
3436 /// extractelement instructions.
3437 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3438
3439 /// Values used only by @llvm.assume calls.
3440  SmallPtrSet<const Value *, 32> EphValues;
3441
3442 /// Holds all of the instructions that we gathered, shuffle instructions and
3443 /// extractelements.
3444 SetVector<Instruction *> GatherShuffleExtractSeq;
3445
3446 /// A list of blocks that we are going to CSE.
3447 DenseSet<BasicBlock *> CSEBlocks;
3448
3449 /// Contains all scheduling relevant data for an instruction.
3450 /// A ScheduleData either represents a single instruction or a member of an
3451 /// instruction bundle (= a group of instructions which is combined into a
3452 /// vector instruction).
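  /// Members of a bundle are chained through NextInBundle; every member points
  /// to the bundle head via FirstInBundle, and only the head acts as the
  /// scheduling entity.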
3453 struct ScheduleData {
3454 // The initial value for the dependency counters. It means that the
3455 // dependencies are not calculated yet.
3456 enum { InvalidDeps = -1 };
3457
3458 ScheduleData() = default;
3459
3460 void init(int BlockSchedulingRegionID, Value *OpVal) {
3461 FirstInBundle = this;
3462 NextInBundle = nullptr;
3463 NextLoadStore = nullptr;
3464 IsScheduled = false;
3465 SchedulingRegionID = BlockSchedulingRegionID;
3466 clearDependencies();
3467 OpValue = OpVal;
3468 TE = nullptr;
3469 }
3470
3471 /// Verify basic self consistency properties
3472 void verify() {
3473 if (hasValidDependencies()) {
3474 assert(UnscheduledDeps <= Dependencies && "invariant");
3475 } else {
3476 assert(UnscheduledDeps == Dependencies && "invariant");
3477 }
3478
3479 if (IsScheduled) {
3480 assert(isSchedulingEntity() &&
3481 "unexpected scheduled state");
3482 for (const ScheduleData *BundleMember = this; BundleMember;
3483 BundleMember = BundleMember->NextInBundle) {
3484 assert(BundleMember->hasValidDependencies() &&
3485 BundleMember->UnscheduledDeps == 0 &&
3486 "unexpected scheduled state");
3487 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3488 "only bundle is marked scheduled");
3489 }
3490 }
3491
3492 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3493 "all bundle members must be in same basic block");
3494 }
3495
3496 /// Returns true if the dependency information has been calculated.
3497    /// Note that dependency validity can vary between instructions within
3498 /// a single bundle.
3499 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3500
3501 /// Returns true for single instructions and for bundle representatives
3502 /// (= the head of a bundle).
3503 bool isSchedulingEntity() const { return FirstInBundle == this; }
3504
3505 /// Returns true if it represents an instruction bundle and not only a
3506 /// single instruction.
3507 bool isPartOfBundle() const {
3508 return NextInBundle != nullptr || FirstInBundle != this || TE;
3509 }
3510
3511 /// Returns true if it is ready for scheduling, i.e. it has no more
3512 /// unscheduled depending instructions/bundles.
3513 bool isReady() const {
3514 assert(isSchedulingEntity() &&
3515 "can't consider non-scheduling entity for ready list");
3516 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3517 }
3518
3519 /// Modifies the number of unscheduled dependencies for this instruction,
3520 /// and returns the number of remaining dependencies for the containing
3521 /// bundle.
3522 int incrementUnscheduledDeps(int Incr) {
3523 assert(hasValidDependencies() &&
3524 "increment of unscheduled deps would be meaningless");
3525 UnscheduledDeps += Incr;
3526 return FirstInBundle->unscheduledDepsInBundle();
3527 }
3528
3529 /// Sets the number of unscheduled dependencies to the number of
3530 /// dependencies.
3531 void resetUnscheduledDeps() {
3532 UnscheduledDeps = Dependencies;
3533 }
3534
3535 /// Clears all dependency information.
3536 void clearDependencies() {
3537 Dependencies = InvalidDeps;
3538 resetUnscheduledDeps();
3539 MemoryDependencies.clear();
3540 ControlDependencies.clear();
3541 }
3542
3543 int unscheduledDepsInBundle() const {
3544 assert(isSchedulingEntity() && "only meaningful on the bundle");
3545 int Sum = 0;
3546 for (const ScheduleData *BundleMember = this; BundleMember;
3547 BundleMember = BundleMember->NextInBundle) {
3548 if (BundleMember->UnscheduledDeps == InvalidDeps)
3549 return InvalidDeps;
3550 Sum += BundleMember->UnscheduledDeps;
3551 }
3552 return Sum;
3553 }
3554
3555 void dump(raw_ostream &os) const {
3556 if (!isSchedulingEntity()) {
3557 os << "/ " << *Inst;
3558 } else if (NextInBundle) {
3559 os << '[' << *Inst;
3560 ScheduleData *SD = NextInBundle;
3561 while (SD) {
3562 os << ';' << *SD->Inst;
3563 SD = SD->NextInBundle;
3564 }
3565 os << ']';
3566 } else {
3567 os << *Inst;
3568 }
3569 }
3570
3571 Instruction *Inst = nullptr;
3572
3573 /// Opcode of the current instruction in the schedule data.
3574 Value *OpValue = nullptr;
3575
3576 /// The TreeEntry that this instruction corresponds to.
3577 TreeEntry *TE = nullptr;
3578
3579 /// Points to the head in an instruction bundle (and always to this for
3580 /// single instructions).
3581 ScheduleData *FirstInBundle = nullptr;
3582
3583 /// Single linked list of all instructions in a bundle. Null if it is a
3584 /// single instruction.
3585 ScheduleData *NextInBundle = nullptr;
3586
3587 /// Single linked list of all memory instructions (e.g. load, store, call)
3588 /// in the block - until the end of the scheduling region.
3589 ScheduleData *NextLoadStore = nullptr;
3590
3591 /// The dependent memory instructions.
3592 /// This list is derived on demand in calculateDependencies().
3593 SmallVector<ScheduleData *, 4> MemoryDependencies;
3594
3595 /// List of instructions which this instruction could be control dependent
3596 /// on. Allowing such nodes to be scheduled below this one could introduce
3597 /// a runtime fault which didn't exist in the original program.
3598 /// ex: this is a load or udiv following a readonly call which inf loops
3599 SmallVector<ScheduleData *, 4> ControlDependencies;
3600
3601 /// This ScheduleData is in the current scheduling region if this matches
3602 /// the current SchedulingRegionID of BlockScheduling.
3603 int SchedulingRegionID = 0;
3604
3605 /// Used for getting a "good" final ordering of instructions.
3606 int SchedulingPriority = 0;
3607
3608    /// The number of dependencies. Consists of the number of users of the
3609 /// instruction plus the number of dependent memory instructions (if any).
3610 /// This value is calculated on demand.
3611 /// If InvalidDeps, the number of dependencies is not calculated yet.
3612 int Dependencies = InvalidDeps;
3613
3614 /// The number of dependencies minus the number of dependencies of scheduled
3615 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3616 /// for scheduling.
3617 /// Note that this is negative as long as Dependencies is not calculated.
3618 int UnscheduledDeps = InvalidDeps;
3619
3620 /// True if this instruction is scheduled (or considered as scheduled in the
3621 /// dry-run).
3622 bool IsScheduled = false;
3623 };
3624
3625#ifndef NDEBUG
3626  friend inline raw_ostream &operator<<(raw_ostream &os,
3627                                        const BoUpSLP::ScheduleData &SD) {
3628 SD.dump(os);
3629 return os;
3630 }
3631#endif
3632
3633 friend struct GraphTraits<BoUpSLP *>;
3634 friend struct DOTGraphTraits<BoUpSLP *>;
3635
3636 /// Contains all scheduling data for a basic block.
3637  /// It does not schedule instructions that are not memory read/write
3638  /// instructions and whose operands are either constants, or arguments, or
3639  /// phis, or instructions from other blocks, or whose users are phis or from
3640  /// the other blocks. The resulting vector instructions can be placed at the
3641  /// beginning of the basic block without scheduling (if the operands do not
3642  /// need to be scheduled) or at the end of the block (if users are outside of
3643  /// the block). This saves some compile time and memory used by the
3644  /// compiler.
3645  /// ScheduleData is assigned for each instruction in between the boundaries of
3646  /// the tree entry, even for those that are not part of the graph. It is
3647  /// required to correctly follow the dependencies between the instructions and
3648  /// to schedule them correctly. ScheduleData is not allocated for
3649  /// instructions that do not require scheduling, such as phis, nodes with
3650  /// extractelements/insertelements only, or nodes whose instructions have
3651  /// uses/operands outside of the block.
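  /// Typical flow: tryScheduleBundle() performs a dry run while the tree is
  /// built, and scheduleBlock() later performs the real scheduling right before
  /// vectorization.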
3652 struct BlockScheduling {
3653 BlockScheduling(BasicBlock *BB)
3654 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3655
3656 void clear() {
3657 ReadyInsts.clear();
3658 ScheduleStart = nullptr;
3659 ScheduleEnd = nullptr;
3660 FirstLoadStoreInRegion = nullptr;
3661 LastLoadStoreInRegion = nullptr;
3662 RegionHasStackSave = false;
3663
3664 // Reduce the maximum schedule region size by the size of the
3665 // previous scheduling run.
3666 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3667 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3668 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3669 ScheduleRegionSize = 0;
3670
3671 // Make a new scheduling region, i.e. all existing ScheduleData is not
3672 // in the new region yet.
3673 ++SchedulingRegionID;
3674 }
3675
3676 ScheduleData *getScheduleData(Instruction *I) {
3677 if (BB != I->getParent())
3678 // Avoid lookup if can't possibly be in map.
3679 return nullptr;
3680 ScheduleData *SD = ScheduleDataMap.lookup(I);
3681 if (SD && isInSchedulingRegion(SD))
3682 return SD;
3683 return nullptr;
3684 }
3685
3686 ScheduleData *getScheduleData(Value *V) {
3687 if (auto *I = dyn_cast<Instruction>(V))
3688 return getScheduleData(I);
3689 return nullptr;
3690 }
3691
3692 ScheduleData *getScheduleData(Value *V, Value *Key) {
3693 if (V == Key)
3694 return getScheduleData(V);
3695 auto I = ExtraScheduleDataMap.find(V);
3696 if (I != ExtraScheduleDataMap.end()) {
3697 ScheduleData *SD = I->second.lookup(Key);
3698 if (SD && isInSchedulingRegion(SD))
3699 return SD;
3700 }
3701 return nullptr;
3702 }
3703
3704 bool isInSchedulingRegion(ScheduleData *SD) const {
3705 return SD->SchedulingRegionID == SchedulingRegionID;
3706 }
3707
3708 /// Marks an instruction as scheduled and puts all dependent ready
3709 /// instructions into the ready-list.
3710 template <typename ReadyListType>
3711 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3712 SD->IsScheduled = true;
3713 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3714
3715 for (ScheduleData *BundleMember = SD; BundleMember;
3716 BundleMember = BundleMember->NextInBundle) {
3717 if (BundleMember->Inst != BundleMember->OpValue)
3718 continue;
3719
3720 // Handle the def-use chain dependencies.
3721
3722 // Decrement the unscheduled counter and insert to ready list if ready.
3723 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3724 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3725 if (OpDef && OpDef->hasValidDependencies() &&
3726 OpDef->incrementUnscheduledDeps(-1) == 0) {
3727 // There are no more unscheduled dependencies after
3728 // decrementing, so we can put the dependent instruction
3729 // into the ready list.
3730 ScheduleData *DepBundle = OpDef->FirstInBundle;
3731 assert(!DepBundle->IsScheduled &&
3732 "already scheduled bundle gets ready");
3733 ReadyList.insert(DepBundle);
3734 LLVM_DEBUG(dbgs()
3735 << "SLP: gets ready (def): " << *DepBundle << "\n");
3736 }
3737 });
3738 };
3739
3740 // If BundleMember is a vector bundle, its operands may have been
3741 // reordered during buildTree(). We therefore need to get its operands
3742 // through the TreeEntry.
3743 if (TreeEntry *TE = BundleMember->TE) {
3744 // Need to search for the lane since the tree entry can be reordered.
3745 int Lane = std::distance(TE->Scalars.begin(),
3746 find(TE->Scalars, BundleMember->Inst));
3747 assert(Lane >= 0 && "Lane not set");
3748
3749 // Since vectorization tree is being built recursively this assertion
3750 // ensures that the tree entry has all operands set before reaching
3751 // this code. Couple of exceptions known at the moment are extracts
3752 // where their second (immediate) operand is not added. Since
3753 // immediates do not affect scheduler behavior this is considered
3754 // okay.
3755 auto *In = BundleMember->Inst;
3756 assert(
3757 In &&
3758 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3759 In->getNumOperands() == TE->getNumOperands()) &&
3760 "Missed TreeEntry operands?");
3761 (void)In; // fake use to avoid build failure when assertions disabled
3762
3763 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3764 OpIdx != NumOperands; ++OpIdx)
3765 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3766 DecrUnsched(I);
3767 } else {
3768 // If BundleMember is a stand-alone instruction, no operand reordering
3769 // has taken place, so we directly access its operands.
3770 for (Use &U : BundleMember->Inst->operands())
3771 if (auto *I = dyn_cast<Instruction>(U.get()))
3772 DecrUnsched(I);
3773 }
3774 // Handle the memory dependencies.
3775 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3776 if (MemoryDepSD->hasValidDependencies() &&
3777 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3778 // There are no more unscheduled dependencies after decrementing,
3779 // so we can put the dependent instruction into the ready list.
3780 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3781 assert(!DepBundle->IsScheduled &&
3782 "already scheduled bundle gets ready");
3783 ReadyList.insert(DepBundle);
3784            LLVM_DEBUG(dbgs()
3785                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
3786 }
3787 }
3788 // Handle the control dependencies.
3789 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3790 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3791 // There are no more unscheduled dependencies after decrementing,
3792 // so we can put the dependent instruction into the ready list.
3793 ScheduleData *DepBundle = DepSD->FirstInBundle;
3794 assert(!DepBundle->IsScheduled &&
3795 "already scheduled bundle gets ready");
3796 ReadyList.insert(DepBundle);
3797            LLVM_DEBUG(dbgs()
3798                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3799 }
3800 }
3801 }
3802 }
3803
3804 /// Verify basic self consistency properties of the data structure.
3805 void verify() {
3806 if (!ScheduleStart)
3807 return;
3808
3809 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3810 ScheduleStart->comesBefore(ScheduleEnd) &&
3811 "Not a valid scheduling region?");
3812
3813 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3814 auto *SD = getScheduleData(I);
3815 if (!SD)
3816 continue;
3817 assert(isInSchedulingRegion(SD) &&
3818 "primary schedule data not in window?");
3819 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3820 "entire bundle in window!");
3821 (void)SD;
3822 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3823 }
3824
3825 for (auto *SD : ReadyInsts) {
3826 assert(SD->isSchedulingEntity() && SD->isReady() &&
3827 "item in ready list not ready?");
3828 (void)SD;
3829 }
3830 }
3831
3832 void doForAllOpcodes(Value *V,
3833 function_ref<void(ScheduleData *SD)> Action) {
3834 if (ScheduleData *SD = getScheduleData(V))
3835 Action(SD);
3836 auto I = ExtraScheduleDataMap.find(V);
3837 if (I != ExtraScheduleDataMap.end())
3838 for (auto &P : I->second)
3839 if (isInSchedulingRegion(P.second))
3840 Action(P.second);
3841 }
3842
3843 /// Put all instructions into the ReadyList which are ready for scheduling.
3844 template <typename ReadyListType>
3845 void initialFillReadyList(ReadyListType &ReadyList) {
3846 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3847 doForAllOpcodes(I, [&](ScheduleData *SD) {
3848 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3849 SD->isReady()) {
3850 ReadyList.insert(SD);
3851 LLVM_DEBUG(dbgs()
3852 << "SLP: initially in ready list: " << *SD << "\n");
3853 }
3854 });
3855 }
3856 }
3857
3858 /// Build a bundle from the ScheduleData nodes corresponding to the
3859 /// scalar instruction for each lane.
3860 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3861
3862 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3863 /// cyclic dependencies. This is only a dry-run, no instructions are
3864 /// actually moved at this stage.
3865 /// \returns the scheduling bundle. The returned Optional value is not
3866 /// std::nullopt if \p VL is allowed to be scheduled.
3867 std::optional<ScheduleData *>
3868 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3869 const InstructionsState &S);
3870
3871 /// Un-bundles a group of instructions.
3872 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3873
3874 /// Allocates schedule data chunk.
3875 ScheduleData *allocateScheduleDataChunks();
3876
3877 /// Extends the scheduling region so that V is inside the region.
3878 /// \returns true if the region size is within the limit.
3879 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3880
3881 /// Initialize the ScheduleData structures for new instructions in the
3882 /// scheduling region.
3883 void initScheduleData(Instruction *FromI, Instruction *ToI,
3884 ScheduleData *PrevLoadStore,
3885 ScheduleData *NextLoadStore);
3886
3887 /// Updates the dependency information of a bundle and of all instructions/
3888 /// bundles which depend on the original bundle.
3889 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3890 BoUpSLP *SLP);
3891
3892    /// Sets all instructions in the scheduling region to un-scheduled.
3893 void resetSchedule();
3894
3895 BasicBlock *BB;
3896
3897 /// Simple memory allocation for ScheduleData.
3898    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3899
3900 /// The size of a ScheduleData array in ScheduleDataChunks.
3901 int ChunkSize;
3902
3903 /// The allocator position in the current chunk, which is the last entry
3904 /// of ScheduleDataChunks.
3905 int ChunkPos;
3906
3907 /// Attaches ScheduleData to Instruction.
3908 /// Note that the mapping survives during all vectorization iterations, i.e.
3909 /// ScheduleData structures are recycled.
3910    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3911
3912 /// Attaches ScheduleData to Instruction with the leading key.
3913    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3914        ExtraScheduleDataMap;
3915
3916 /// The ready-list for scheduling (only used for the dry-run).
3917 SetVector<ScheduleData *> ReadyInsts;
3918
3919 /// The first instruction of the scheduling region.
3920 Instruction *ScheduleStart = nullptr;
3921
3922 /// The first instruction _after_ the scheduling region.
3923 Instruction *ScheduleEnd = nullptr;
3924
3925 /// The first memory accessing instruction in the scheduling region
3926 /// (can be null).
3927 ScheduleData *FirstLoadStoreInRegion = nullptr;
3928
3929 /// The last memory accessing instruction in the scheduling region
3930 /// (can be null).
3931 ScheduleData *LastLoadStoreInRegion = nullptr;
3932
3933 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3934 /// region? Used to optimize the dependence calculation for the
3935 /// common case where there isn't.
3936 bool RegionHasStackSave = false;
3937
3938 /// The current size of the scheduling region.
3939 int ScheduleRegionSize = 0;
3940
3941 /// The maximum size allowed for the scheduling region.
3942 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3943
3944 /// The ID of the scheduling region. For a new vectorization iteration this
3945 /// is incremented which "removes" all ScheduleData from the region.
3946 /// Make sure that the initial SchedulingRegionID is greater than the
3947 /// initial SchedulingRegionID in ScheduleData (which is 0).
3948 int SchedulingRegionID = 1;
3949 };
3950
3951 /// Attaches the BlockScheduling structures to basic blocks.
3952  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3953
3954 /// Performs the "real" scheduling. Done before vectorization is actually
3955 /// performed in a basic block.
3956 void scheduleBlock(BlockScheduling *BS);
3957
3958 /// List of users to ignore during scheduling and that don't need extracting.
3959 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3960
3961 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3962 /// sorted SmallVectors of unsigned.
3963 struct OrdersTypeDenseMapInfo {
3964 static OrdersType getEmptyKey() {
3965 OrdersType V;
3966 V.push_back(~1U);
3967 return V;
3968 }
3969
3970 static OrdersType getTombstoneKey() {
3971 OrdersType V;
3972 V.push_back(~2U);
3973 return V;
3974 }
3975
3976 static unsigned getHashValue(const OrdersType &V) {
3977 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3978 }
3979
3980 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3981 return LHS == RHS;
3982 }
3983 };
3984
3985 // Analysis and block reference.
3986 Function *F;
3987 ScalarEvolution *SE;
3988  TargetTransformInfo *TTI;
3989  TargetLibraryInfo *TLI;
3990 LoopInfo *LI;
3991 DominatorTree *DT;
3992 AssumptionCache *AC;
3993 DemandedBits *DB;
3994 const DataLayout *DL;
3995  OptimizationRemarkEmitter *ORE;
3996
3997 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3998 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3999
4000 /// Instruction builder to construct the vectorized tree.
4001  IRBuilder<TargetFolder> Builder;
4002
4003 /// A map of scalar integer values to the smallest bit width with which they
4004 /// can legally be represented. The values map to (width, signed) pairs,
4005 /// where "width" indicates the minimum bit width and "signed" is True if the
4006 /// value must be signed-extended, rather than zero-extended, back to its
4007 /// original width.
4008  DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
4009
4010 /// Final size of the reduced vector, if the current graph represents the
4011 /// input for the reduction and it was possible to narrow the size of the
4012 /// reduction.
4013 unsigned ReductionBitWidth = 0;
4014
4015 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4016 /// type sizes, used in the tree.
4017 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4018
4019  /// Indices of the vectorized nodes, which are supposed to be the roots of the
4020  /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
4021 DenseSet<unsigned> ExtraBitWidthNodes;
4022};
4023
4024} // end namespace slpvectorizer
4025
4026template <> struct GraphTraits<BoUpSLP *> {
4027 using TreeEntry = BoUpSLP::TreeEntry;
4028
4029 /// NodeRef has to be a pointer per the GraphWriter.
4030  using NodeRef = TreeEntry *;
4031
4032  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4033
4034 /// Add the VectorizableTree to the index iterator to be able to return
4035 /// TreeEntry pointers.
4036 struct ChildIteratorType
4037 : public iterator_adaptor_base<
4038 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4039    ContainerTy &VectorizableTree;
4040
4041    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4042                      ContainerTy &VT)
4043 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4044
4045 NodeRef operator*() { return I->UserTE; }
4046 };
4047
4048  static NodeRef getEntryNode(BoUpSLP &R) {
4049    return R.VectorizableTree[0].get();
4050 }
4051
4052 static ChildIteratorType child_begin(NodeRef N) {
4053 return {N->UserTreeIndices.begin(), N->Container};
4054 }
4055
4056 static ChildIteratorType child_end(NodeRef N) {
4057 return {N->UserTreeIndices.end(), N->Container};
4058 }
4059
4060 /// For the node iterator we just need to turn the TreeEntry iterator into a
4061 /// TreeEntry* iterator so that it dereferences to NodeRef.
4062 class nodes_iterator {
4063    using ItTy = ContainerTy::iterator;
4064    ItTy It;
4065
4066 public:
4067 nodes_iterator(const ItTy &It2) : It(It2) {}
4068 NodeRef operator*() { return It->get(); }
4069 nodes_iterator operator++() {
4070 ++It;
4071 return *this;
4072 }
4073 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4074 };
4075
4076 static nodes_iterator nodes_begin(BoUpSLP *R) {
4077 return nodes_iterator(R->VectorizableTree.begin());
4078 }
4079
4080 static nodes_iterator nodes_end(BoUpSLP *R) {
4081 return nodes_iterator(R->VectorizableTree.end());
4082 }
4083
4084 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4085};
4086
4087template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4088 using TreeEntry = BoUpSLP::TreeEntry;
4089
4090 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4091
4092 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4093 std::string Str;
4094    raw_string_ostream OS(Str);
4095    OS << Entry->Idx << ".\n";
4096 if (isSplat(Entry->Scalars))
4097 OS << "<splat> ";
4098 for (auto *V : Entry->Scalars) {
4099 OS << *V;
4100 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4101 return EU.Scalar == V;
4102 }))
4103 OS << " <extract>";
4104 OS << "\n";
4105 }
4106 return Str;
4107 }
4108
4109 static std::string getNodeAttributes(const TreeEntry *Entry,
4110 const BoUpSLP *) {
4111 if (Entry->isGather())
4112 return "color=red";
4113 if (Entry->State == TreeEntry::ScatterVectorize ||
4114 Entry->State == TreeEntry::StridedVectorize)
4115 return "color=blue";
4116 return "";
4117 }
4118};
4119
4120} // end namespace llvm
4121
4122BoUpSLP::~BoUpSLP() {
4123  SmallVector<WeakTrackingVH> DeadInsts;
4124 for (auto *I : DeletedInstructions) {
4125 if (!I->getParent()) {
4126 // Temporarily insert instruction back to erase them from parent and
4127 // memory later.
4128 if (isa<PHINode>(I))
4129 // Phi nodes must be the very first instructions in the block.
4130 I->insertBefore(F->getEntryBlock(),
4131 F->getEntryBlock().getFirstNonPHIIt());
4132 else
4133 I->insertBefore(F->getEntryBlock().getTerminator());
4134 continue;
4135 }
4136 for (Use &U : I->operands()) {
4137 auto *Op = dyn_cast<Instruction>(U.get());
4138 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4139          wouldInstructionBeTriviallyDead(Op, TLI))
4140        DeadInsts.emplace_back(Op);
4141 }
4142 I->dropAllReferences();
4143 }
4144 for (auto *I : DeletedInstructions) {
4145 assert(I->use_empty() &&
4146 "trying to erase instruction with users.");
4147 I->eraseFromParent();
4148 }
4149
4150 // Cleanup any dead scalar code feeding the vectorized instructions
4151  RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, TLI);
4152
4153#ifdef EXPENSIVE_CHECKS
4154 // If we could guarantee that this call is not extremely slow, we could
4155 // remove the ifdef limitation (see PR47712).
4156 assert(!verifyFunction(*F, &dbgs()));
4157#endif
4158}
4159
4160/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4161/// contains the original mask for the scalars reused in the node. The
4162/// procedure transforms this mask in accordance with the given \p Mask.
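/// E.g., Reuses = {0, 0, 1, 1} with Mask = {2, 3, 0, 1} becomes {1, 1, 0, 0}.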
4163static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4164  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4165 "Expected non-empty mask.");
4166 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4167 Prev.swap(Reuses);
4168 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4169 if (Mask[I] != PoisonMaskElem)
4170 Reuses[Mask[I]] = Prev[I];
4171}
4172
4173/// Reorders the given \p Order according to the given \p Mask. \p Order is
4174/// the original order of the scalars. The procedure transforms the provided order
4175/// in accordance with the given \p Mask. If the resulting \p Order is just an
4176/// identity order, \p Order is cleared.
4177static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4178                         bool BottomOrder = false) {
4179 assert(!Mask.empty() && "Expected non-empty mask.");
4180 unsigned Sz = Mask.size();
4181 if (BottomOrder) {
4182 SmallVector<unsigned> PrevOrder;
4183 if (Order.empty()) {
4184 PrevOrder.resize(Sz);
4185 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4186 } else {
4187 PrevOrder.swap(Order);
4188 }
4189 Order.assign(Sz, Sz);
4190 for (unsigned I = 0; I < Sz; ++I)
4191 if (Mask[I] != PoisonMaskElem)
4192 Order[I] = PrevOrder[Mask[I]];
4193 if (all_of(enumerate(Order), [&](const auto &Data) {
4194 return Data.value() == Sz || Data.index() == Data.value();
4195 })) {
4196 Order.clear();
4197 return;
4198 }
4199 fixupOrderingIndices(Order);
4200 return;
4201 }
4202 SmallVector<int> MaskOrder;
4203 if (Order.empty()) {
4204 MaskOrder.resize(Sz);
4205 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4206 } else {
4207 inversePermutation(Order, MaskOrder);
4208 }
4209 reorderReuses(MaskOrder, Mask);
4210 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4211 Order.clear();
4212 return;
4213 }
4214 Order.assign(Sz, Sz);
4215 for (unsigned I = 0; I < Sz; ++I)
4216 if (MaskOrder[I] != PoisonMaskElem)
4217 Order[MaskOrder[I]] = I;
4218 fixupOrderingIndices(Order);
4219}
4220
4221std::optional<BoUpSLP::OrdersType>
4222BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4223 assert(TE.isGather() && "Expected gather node only.");
4224 // Try to find subvector extract/insert patterns and reorder only such
4225 // patterns.
4226 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4227 Type *ScalarTy = GatheredScalars.front()->getType();
4228 int NumScalars = GatheredScalars.size();
4229 if (!isValidElementType(ScalarTy))
4230 return std::nullopt;
4231 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4232 int NumParts = TTI->getNumberOfParts(VecTy);
4233 if (NumParts == 0 || NumParts >= NumScalars)
4234 NumParts = 1;
4235 SmallVector<int> ExtractMask;
4236 SmallVector<int> Mask;
4237  SmallVector<SmallVector<const TreeEntry *>> Entries;
4238  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
4239      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4240  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
4241      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4242 /*ForOrder=*/true);
4243 // No shuffled operands - ignore.
4244 if (GatherShuffles.empty() && ExtractShuffles.empty())
4245 return std::nullopt;
4246 OrdersType CurrentOrder(NumScalars, NumScalars);
4247 if (GatherShuffles.size() == 1 &&
4248 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4249 Entries.front().front()->isSame(TE.Scalars)) {
4250 // Perfect match in the graph, will reuse the previously vectorized
4251 // node. Cost is 0.
4252 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4253 return CurrentOrder;
4254 }
4255 auto IsSplatMask = [](ArrayRef<int> Mask) {
4256 int SingleElt = PoisonMaskElem;
4257 return all_of(Mask, [&](int I) {
4258 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4259 SingleElt = I;
4260 return I == PoisonMaskElem || I == SingleElt;
4261 });
4262 };
4263 // Exclusive broadcast mask - ignore.
4264 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4265 (Entries.size() != 1 ||
4266 Entries.front().front()->ReorderIndices.empty())) ||
4267 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4268 return std::nullopt;
4269 SmallBitVector ShuffledSubMasks(NumParts);
4270 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4271 ArrayRef<int> Mask, int PartSz, int NumParts,
4272 function_ref<unsigned(unsigned)> GetVF) {
4273 for (int I : seq<int>(0, NumParts)) {
4274 if (ShuffledSubMasks.test(I))
4275 continue;
4276 const int VF = GetVF(I);
4277 if (VF == 0)
4278 continue;
4279 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4280 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4281 // Shuffle of at least 2 vectors - ignore.
4282 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4283 std::fill(Slice.begin(), Slice.end(), NumScalars);
4284 ShuffledSubMasks.set(I);
4285 continue;
4286 }
4287      // Try to include as many elements from the mask as possible.
4288 int FirstMin = INT_MAX;
4289 int SecondVecFound = false;
4290 for (int K : seq<int>(Limit)) {
4291 int Idx = Mask[I * PartSz + K];
4292 if (Idx == PoisonMaskElem) {
4293 Value *V = GatheredScalars[I * PartSz + K];
4294 if (isConstant(V) && !isa<PoisonValue>(V)) {
4295 SecondVecFound = true;
4296 break;
4297 }
4298 continue;
4299 }
4300 if (Idx < VF) {
4301 if (FirstMin > Idx)
4302 FirstMin = Idx;
4303 } else {
4304 SecondVecFound = true;
4305 break;
4306 }
4307 }
4308 FirstMin = (FirstMin / PartSz) * PartSz;
4309 // Shuffle of at least 2 vectors - ignore.
4310 if (SecondVecFound) {
4311 std::fill(Slice.begin(), Slice.end(), NumScalars);
4312 ShuffledSubMasks.set(I);
4313 continue;
4314 }
4315 for (int K : seq<int>(Limit)) {
4316 int Idx = Mask[I * PartSz + K];
4317 if (Idx == PoisonMaskElem)
4318 continue;
4319 Idx -= FirstMin;
4320 if (Idx >= PartSz) {
4321 SecondVecFound = true;
4322 break;
4323 }
4324 if (CurrentOrder[I * PartSz + Idx] >
4325 static_cast<unsigned>(I * PartSz + K) &&
4326 CurrentOrder[I * PartSz + Idx] !=
4327 static_cast<unsigned>(I * PartSz + Idx))
4328 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4329 }
4330 // Shuffle of at least 2 vectors - ignore.
4331 if (SecondVecFound) {
4332 std::fill(Slice.begin(), Slice.end(), NumScalars);
4333 ShuffledSubMasks.set(I);
4334 continue;
4335 }
4336 }
4337 };
4338 int PartSz = getPartNumElems(NumScalars, NumParts);
4339 if (!ExtractShuffles.empty())
4340 TransformMaskToOrder(
4341 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4342 if (!ExtractShuffles[I])
4343 return 0U;
4344 unsigned VF = 0;
4345 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4346 for (unsigned Idx : seq<unsigned>(Sz)) {
4347 int K = I * PartSz + Idx;
4348 if (ExtractMask[K] == PoisonMaskElem)
4349 continue;
4350 if (!TE.ReuseShuffleIndices.empty())
4351 K = TE.ReuseShuffleIndices[K];
4352 if (!TE.ReorderIndices.empty())
4353 K = std::distance(TE.ReorderIndices.begin(),
4354 find(TE.ReorderIndices, K));
4355 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4356 if (!EI)
4357 continue;
4358 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4359 ->getElementCount()
4360 .getKnownMinValue());
4361 }
4362 return VF;
4363 });
4364 // Check special corner case - single shuffle of the same entry.
4365 if (GatherShuffles.size() == 1 && NumParts != 1) {
4366 if (ShuffledSubMasks.any())
4367 return std::nullopt;
4368 PartSz = NumScalars;
4369 NumParts = 1;
4370 }
4371 if (!Entries.empty())
4372 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4373 if (!GatherShuffles[I])
4374 return 0U;
4375 return std::max(Entries[I].front()->getVectorFactor(),
4376 Entries[I].back()->getVectorFactor());
4377 });
4378 int NumUndefs =
4379 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4380 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4381 return std::nullopt;
4382 return std::move(CurrentOrder);
4383}
4384
4385static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4386 const TargetLibraryInfo &TLI,
4387 bool CompareOpcodes = true) {
4388 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4389 return false;
4390 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4391 if (!GEP1)
4392 return false;
4393 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4394 if (!GEP2)
4395 return false;
4396 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4397 ((isConstant(GEP1->getOperand(1)) &&
4398 isConstant(GEP2->getOperand(1))) ||
4399 !CompareOpcodes ||
4400 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4401 .getOpcode());
4402}
4403
4404/// Calculates minimal alignment as a common alignment.
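/// E.g., loads with alignments 16, 8 and 4 yield a common alignment of 4.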
4405template <typename T>
4407 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4408 for (Value *V : VL.drop_front())
4409 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4410 return CommonAlignment;
4411}
4412
4413/// Check if \p Order represents reverse order.
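/// E.g., {3, 2, 1, 0} is a reverse order; entries equal to the order size are
/// treated as don't-care slots.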
4414static bool isReverseOrder(ArrayRef<unsigned> Order) {
4415  unsigned Sz = Order.size();
4416 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4417 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4418 });
4419}
4420
4421/// Checks if the provided list of pointers \p Pointers represents the strided
4422/// pointers for type ElemTy. If they are not, std::nullopt is returned.
4423/// Otherwise, if \p Inst is not specified, just initialized optional value is
4424/// returned to show that the pointers represent strided pointers. If \p Inst
4425/// specified, the runtime stride is materialized before the given \p Inst.
4426/// \returns std::nullopt if the pointers are not pointers with the runtime
4427/// stride, nullptr or actual stride value, otherwise.
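/// E.g., for i32 elements, the pointers p, p + 4 * %s and p + 8 * %s (with a
/// runtime value %s) form a strided access whose element stride is %s.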
4428static std::optional<Value *>
4429calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4430                  const DataLayout &DL, ScalarEvolution &SE,
4431 SmallVectorImpl<unsigned> &SortedIndices,
4432 Instruction *Inst = nullptr) {
4433  SmallVector<const SCEV *> SCEVs;
4434  const SCEV *PtrSCEVLowest = nullptr;
4435 const SCEV *PtrSCEVHighest = nullptr;
4436 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4437 // addresses).
4438 for (Value *Ptr : PointerOps) {
4439 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4440 if (!PtrSCEV)
4441 return std::nullopt;
4442 SCEVs.push_back(PtrSCEV);
4443 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4444 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4445 continue;
4446 }
4447 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4448 if (isa<SCEVCouldNotCompute>(Diff))
4449 return std::nullopt;
4450 if (Diff->isNonConstantNegative()) {
4451 PtrSCEVLowest = PtrSCEV;
4452 continue;
4453 }
4454 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4455 if (isa<SCEVCouldNotCompute>(Diff1))
4456 return std::nullopt;
4457 if (Diff1->isNonConstantNegative()) {
4458 PtrSCEVHighest = PtrSCEV;
4459 continue;
4460 }
4461 }
4462 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4463 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4464 if (isa<SCEVCouldNotCompute>(Dist))
4465 return std::nullopt;
4466 int Size = DL.getTypeStoreSize(ElemTy);
4467 auto TryGetStride = [&](const SCEV *Dist,
4468 const SCEV *Multiplier) -> const SCEV * {
4469 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4470 if (M->getOperand(0) == Multiplier)
4471 return M->getOperand(1);
4472 if (M->getOperand(1) == Multiplier)
4473 return M->getOperand(0);
4474 return nullptr;
4475 }
4476 if (Multiplier == Dist)
4477 return SE.getConstant(Dist->getType(), 1);
4478 return SE.getUDivExactExpr(Dist, Multiplier);
4479 };
4480  // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4481 const SCEV *Stride = nullptr;
4482 if (Size != 1 || SCEVs.size() > 2) {
4483 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4484 Stride = TryGetStride(Dist, Sz);
4485 if (!Stride)
4486 return std::nullopt;
4487 }
4488 if (!Stride || isa<SCEVConstant>(Stride))
4489 return std::nullopt;
4490 // Iterate through all pointers and check if all distances are
4491 // unique multiple of Stride.
4492 using DistOrdPair = std::pair<int64_t, int>;
4493 auto Compare = llvm::less_first();
4494 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4495 int Cnt = 0;
4496 bool IsConsecutive = true;
4497 for (const SCEV *PtrSCEV : SCEVs) {
4498 unsigned Dist = 0;
4499 if (PtrSCEV != PtrSCEVLowest) {
4500 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4501 const SCEV *Coeff = TryGetStride(Diff, Stride);
4502 if (!Coeff)
4503 return std::nullopt;
4504 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4505 if (!SC || isa<SCEVCouldNotCompute>(SC))
4506 return std::nullopt;
4507 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4508 SE.getMulExpr(Stride, SC)))
4509 ->isZero())
4510 return std::nullopt;
4511 Dist = SC->getAPInt().getZExtValue();
4512 }
4513 // If the strides are not the same or repeated, we can't vectorize.
4514 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4515 return std::nullopt;
4516 auto Res = Offsets.emplace(Dist, Cnt);
4517 if (!Res.second)
4518 return std::nullopt;
4519 // Consecutive order if the inserted element is the last one.
4520 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4521 ++Cnt;
4522 }
4523 if (Offsets.size() != SCEVs.size())
4524 return std::nullopt;
4525 SortedIndices.clear();
4526 if (!IsConsecutive) {
4527 // Fill SortedIndices array only if it is non-consecutive.
4528 SortedIndices.resize(PointerOps.size());
4529 Cnt = 0;
4530 for (const std::pair<int64_t, int> &Pair : Offsets) {
4531 SortedIndices[Cnt] = Pair.second;
4532 ++Cnt;
4533 }
4534 }
4535 if (!Inst)
4536 return nullptr;
4537 SCEVExpander Expander(SE, DL, "strided-load-vec");
4538 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4539}
4540
4541static std::pair<InstructionCost, InstructionCost>
4542getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4543            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4544 Type *ScalarTy, VectorType *VecTy);
4545
4546BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4547    ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4548 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4549 // Check that a vectorized load would load the same memory as a scalar
4550 // load. For example, we don't want to vectorize loads that are smaller
4551 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4552 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4553 // from such a struct, we read/write packed bits disagreeing with the
4554 // unvectorized version.
4555 Type *ScalarTy = VL0->getType();
4556
4557 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4558 return LoadsState::Gather;
4559
4560 // Make sure all loads in the bundle are simple - we can't vectorize
4561 // atomic or volatile loads.
4562 PointerOps.clear();
4563 const unsigned Sz = VL.size();
4564 PointerOps.resize(Sz);
4565 auto *POIter = PointerOps.begin();
4566 for (Value *V : VL) {
4567 auto *L = cast<LoadInst>(V);
4568 if (!L->isSimple())
4569 return LoadsState::Gather;
4570 *POIter = L->getPointerOperand();
4571 ++POIter;
4572 }
4573
4574 Order.clear();
4575 auto *VecTy = getWidenedType(ScalarTy, Sz);
4576 // Check the order of pointer operands or that all pointers are the same.
4577 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4578 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4579 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4580 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4581 "supported with VectorizeNonPowerOf2");
4582 return LoadsState::Gather;
4583 }
4584
4585 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4586 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4587 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4588 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4589 return LoadsState::StridedVectorize;
4590 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4591 return arePointersCompatible(P, PointerOps.front(), *TLI);
4592 })) {
4593 if (IsSorted) {
4594 Value *Ptr0;
4595 Value *PtrN;
4596 if (Order.empty()) {
4597 Ptr0 = PointerOps.front();
4598 PtrN = PointerOps.back();
4599 } else {
4600 Ptr0 = PointerOps[Order.front()];
4601 PtrN = PointerOps[Order.back()];
4602 }
4603 std::optional<int> Diff =
4604 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4605 // Check that the sorted loads are consecutive.
4606 if (static_cast<unsigned>(*Diff) == Sz - 1)
4607 return LoadsState::Vectorize;
4608 // Simple check if not a strided access - clear order.
4609 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4610 // Try to generate strided load node if:
4611 // 1. Target with strided load support is detected.
4612 // 2. The number of loads is greater than MinProfitableStridedLoads,
4613 // or the potential stride <= MaxProfitableLoadStride and the
4614 // potential stride is power-of-2 (to avoid perf regressions for the very
4615 // small number of loads) and max distance > number of loads, or potential
4616 // stride is -1.
4617 // 3. The loads are ordered, or number of unordered loads <=
4618 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4619 // (this check is to avoid extra costs for very expensive shuffles).
4620 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4621 (static_cast<unsigned>(std::abs(*Diff)) <=
4622 MaxProfitableLoadStride &&
4623 isPowerOf2_32(std::abs(*Diff)))) &&
4624 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4625 *Diff == -(static_cast<int>(Sz) - 1))) {
4626 int Stride = *Diff / static_cast<int>(Sz - 1);
4627 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4628 Align Alignment =
4629 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4630 ->getAlign();
4631 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4632 // Iterate through all pointers and check if all distances are
4633 // unique multiples of the stride.
4634 SmallSet<int, 4> Dists;
4635 for (Value *Ptr : PointerOps) {
4636 int Dist = 0;
4637 if (Ptr == PtrN)
4638 Dist = *Diff;
4639 else if (Ptr != Ptr0)
4640 Dist =
4641 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4642 // If a distance is not a multiple of the stride, or repeats, we can't
4643 // vectorize.
4644 if (((Dist / Stride) * Stride) != Dist ||
4645 !Dists.insert(Dist).second)
4646 break;
4647 }
4648 if (Dists.size() == Sz)
4649 return LoadsState::StridedVectorize;
4650 }
4651 }
4652 }
4653 }
4654 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4655 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4656 unsigned MinVF = getMinVF(Sz);
4657 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4658 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4659 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4660 unsigned VectorizedCnt = 0;
4661 SmallVector<LoadsState> States;
4662 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4663 Cnt += VF, ++VectorizedCnt) {
4664 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4665 SmallVector<unsigned> Order;
4666 SmallVector<Value *> PointerOps;
4667 LoadsState LS =
4668 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4669 /*TryRecursiveCheck=*/false);
4670 // Check that the sorted loads are consecutive.
4671 if (LS == LoadsState::Gather)
4672 break;
4673 // If need the reorder - consider as high-cost masked gather for now.
4674 if ((LS == LoadsState::Vectorize ||
4675 LS == LoadsState::StridedVectorize) &&
4676 !Order.empty() && !isReverseOrder(Order))
4677 LS = LoadsState::ScatterVectorize;
4678 States.push_back(LS);
4679 }
4680 // Can be vectorized later as a series of loads/insertelements.
4681 if (VectorizedCnt == VL.size() / VF) {
4682 // Compare masked gather cost and loads + insert subvector costs.
4683 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4684 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4685 TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
4686 CostKind, ScalarTy, VecTy);
4687 InstructionCost MaskedGatherCost =
4688 TTI.getGatherScatterOpCost(
4689 Instruction::Load, VecTy,
4690 cast<LoadInst>(VL0)->getPointerOperand(),
4691 /*VariableMask=*/false, CommonAlignment, CostKind) +
4692 VectorGEPCost - ScalarGEPCost;
4693 InstructionCost VecLdCost = 0;
4694 auto *SubVecTy = getWidenedType(ScalarTy, VF);
4695 for (auto [I, LS] : enumerate(States)) {
4696 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4697 switch (LS) {
4698 case LoadsState::Vectorize: {
4699 auto [ScalarGEPCost, VectorGEPCost] =
4700 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4701 LI0->getPointerOperand(), Instruction::Load,
4702 CostKind, ScalarTy, SubVecTy);
4703 VecLdCost += TTI.getMemoryOpCost(
4704 Instruction::Load, SubVecTy, LI0->getAlign(),
4705 LI0->getPointerAddressSpace(), CostKind,
4706 TTI::OperandValueInfo()) +
4707 VectorGEPCost - ScalarGEPCost;
4708 break;
4709 }
4710 case LoadsState::StridedVectorize: {
4711 auto [ScalarGEPCost, VectorGEPCost] =
4712 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4713 LI0->getPointerOperand(), Instruction::Load,
4714 CostKind, ScalarTy, SubVecTy);
4715 VecLdCost +=
4716 TTI.getStridedMemoryOpCost(
4717 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4718 /*VariableMask=*/false, CommonAlignment, CostKind) +
4719 VectorGEPCost - ScalarGEPCost;
4720 break;
4721 }
4722 case LoadsState::ScatterVectorize: {
4723 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4724 TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4725 LI0->getPointerOperand(), Instruction::GetElementPtr,
4726 CostKind, ScalarTy, SubVecTy);
4727 VecLdCost +=
4728 TTI.getGatherScatterOpCost(
4729 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4730 /*VariableMask=*/false, CommonAlignment, CostKind) +
4731 VectorGEPCost - ScalarGEPCost;
4732 break;
4733 }
4734 case LoadsState::Gather:
4735 llvm_unreachable(
4736 "Expected only consecutive, strided or masked gather loads.");
4737 }
4738 SmallVector<int> ShuffleMask(VL.size());
4739 for (int Idx : seq<int>(0, VL.size()))
4740 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4741 VecLdCost +=
4742 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
4743 CostKind, I * VF, SubVecTy);
4744 }
4745 // If masked gather cost is higher - better to vectorize, so
4746 // consider it as a gather node. It will be better estimated
4747 // later.
4748 if (MaskedGatherCost >= VecLdCost)
4749 return true;
4750 }
4751 }
4752 return false;
4753 };
4754 // TODO: need to improve analysis of the pointers, if not all of them are
4755 // GEPs or have > 2 operands, we end up with a gather node, which just
4756 // increases the cost.
4757 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4758 bool ProfitableGatherPointers =
4759 L && Sz > 2 &&
4760 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4761 return L->isLoopInvariant(V);
4762 })) <= Sz / 2;
4763 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4764 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4765 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4766 (GEP && GEP->getNumOperands() == 2 &&
4767 isa<Constant, Instruction>(GEP->getOperand(1)));
4768 })) {
4769 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4770 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4771 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4772 // Check if potential masked gather can be represented as series
4773 // of loads + insertsubvectors.
4774 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4775 // If masked gather cost is higher - better to vectorize, so
4776 // consider it as a gather node. It will be better estimated
4777 // later.
4778 return LoadsState::Gather;
4779 }
4780 return LoadsState::ScatterVectorize;
4781 }
4782 }
4783 }
4784
4785 return LoadsState::Gather;
4786}
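// Illustrative sketch, not part of the original pass: how the analysis above
// classifies a 4-wide load bundle over a hypothetical float array A.
//
//   A[0], A[1], A[2], A[3]   // consecutive (Diff == Sz - 1): Vectorize.
//   A[0], A[2], A[4], A[6]   // constant stride 2 with legal strided
//                            // load/store support: StridedVectorize.
//   A[0], A[7], A[1], A[42]  // neither consecutive nor strided: becomes
//                            // ScatterVectorize (masked gather) when that is
//                            // legal and not more expensive than per-part
//                            // loads + insertsubvector shuffles, otherwise
//                            // Gather.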
4787
4788 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4789 const DataLayout &DL, ScalarEvolution &SE,
4790 SmallVectorImpl<unsigned> &SortedIndices) {
4791 assert(llvm::all_of(
4792 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4793 "Expected list of pointer operands.");
4794 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4795 // Ptr into, sort and return the sorted indices with values next to one
4796 // another.
4797 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4798 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4799
4800 unsigned Cnt = 1;
4801 for (Value *Ptr : VL.drop_front()) {
4802 bool Found = any_of(Bases, [&](auto &Base) {
4803 std::optional<int> Diff =
4804 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4805 /*StrictCheck=*/true);
4806 if (!Diff)
4807 return false;
4808
4809 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4810 return true;
4811 });
4812
4813 if (!Found) {
4814 // If we haven't found enough to usefully cluster, return early.
4815 if (Bases.size() > VL.size() / 2 - 1)
4816 return false;
4817
4818 // Not found already - add a new Base
4819 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4820 }
4821 }
4822
4823 // For each of the bases sort the pointers by Offset and check if any of the
4824 // bases become consecutive.
4825 bool AnyConsecutive = false;
4826 for (auto &Base : Bases) {
4827 auto &Vec = Base.second;
4828 if (Vec.size() > 1) {
4829 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4830 const std::tuple<Value *, int, unsigned> &Y) {
4831 return std::get<1>(X) < std::get<1>(Y);
4832 });
4833 int InitialOffset = std::get<1>(Vec[0]);
4834 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4835 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4836 });
4837 }
4838 }
4839
4840 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4841 SortedIndices.clear();
4842 if (!AnyConsecutive)
4843 return false;
4844
4845 for (auto &Base : Bases) {
4846 for (auto &T : Base.second)
4847 SortedIndices.push_back(std::get<2>(T));
4848 }
4849
4850 assert(SortedIndices.size() == VL.size() &&
4851 "Expected SortedIndices to be the size of VL");
4852 return true;
4853}
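// Illustrative sketch, not part of the original pass: for the hypothetical
// i32 pointer list {A+2, B+1, A+1, B+0} the bases are keyed by the first
// pointer seen for each underlying object:
//   A+2 -> {(A+2, 0, idx 0), (A+1, -1, idx 2)}
//   B+1 -> {(B+1, 0, idx 1), (B+0, -1, idx 3)}
// Sorting each run by offset makes both runs consecutive, so the function
// returns true and (assuming the bases are visited in insertion order)
// SortedIndices = {2, 0, 3, 1}, i.e. the accesses regrouped as
// A+1, A+2, B+0, B+1.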
4854
4855std::optional<BoUpSLP::OrdersType>
4856BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4857 assert(TE.isGather() && "Expected gather node only.");
4858 Type *ScalarTy = TE.Scalars[0]->getType();
4859
4860 SmallVector<Value *> Ptrs;
4861 Ptrs.reserve(TE.Scalars.size());
4862 for (Value *V : TE.Scalars) {
4863 auto *L = dyn_cast<LoadInst>(V);
4864 if (!L || !L->isSimple())
4865 return std::nullopt;
4866 Ptrs.push_back(L->getPointerOperand());
4867 }
4868
4869 BoUpSLP::OrdersType Order;
4870 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4871 return std::move(Order);
4872 return std::nullopt;
4873}
4874
4875/// Check if two insertelement instructions are from the same buildvector.
4876 static bool areTwoInsertFromSameBuildVector(
4877 InsertElementInst *VU, InsertElementInst *V,
4878 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4879 // Instructions must be from the same basic blocks.
4880 if (VU->getParent() != V->getParent())
4881 return false;
4882 // Checks if 2 insertelements are from the same buildvector.
4883 if (VU->getType() != V->getType())
4884 return false;
4885 // Multiple used inserts are separate nodes.
4886 if (!VU->hasOneUse() && !V->hasOneUse())
4887 return false;
4888 auto *IE1 = VU;
4889 auto *IE2 = V;
4890 std::optional<unsigned> Idx1 = getElementIndex(IE1);
4891 std::optional<unsigned> Idx2 = getElementIndex(IE2);
4892 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4893 return false;
4894 // Go through the vector operand of insertelement instructions trying to find
4895 // either VU as the original vector for IE2 or V as the original vector for
4896 // IE1.
4897 SmallBitVector ReusedIdx(
4898 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4899 bool IsReusedIdx = false;
4900 do {
4901 if (IE2 == VU && !IE1)
4902 return VU->hasOneUse();
4903 if (IE1 == V && !IE2)
4904 return V->hasOneUse();
4905 if (IE1 && IE1 != V) {
4906 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
4907 IsReusedIdx |= ReusedIdx.test(Idx1);
4908 ReusedIdx.set(Idx1);
4909 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4910 IE1 = nullptr;
4911 else
4912 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4913 }
4914 if (IE2 && IE2 != VU) {
4915 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
4916 IsReusedIdx |= ReusedIdx.test(Idx2);
4917 ReusedIdx.set(Idx2);
4918 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4919 IE2 = nullptr;
4920 else
4921 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4922 }
4923 } while (!IsReusedIdx && (IE1 || IE2));
4924 return false;
4925}
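// Illustrative sketch, not part of the original pass: a buildvector chain the
// helper above recognizes (values are hypothetical):
//
//   %v0 = insertelement <4 x float> poison, float %a, i32 0
//   %v1 = insertelement <4 x float> %v0,    float %b, i32 1
//   %v2 = insertelement <4 x float> %v1,    float %c, i32 2
//
// Walking the vector operands from %v2 reaches %v0, no lane index repeats,
// and each insert in the chain has a single use, so %v0 and %v2 are treated
// as parts of the same buildvector sequence.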
4926
4927std::optional<BoUpSLP::OrdersType>
4928BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4929 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4930 if (TE.isNonPowOf2Vec())
4931 return std::nullopt;
4932
4933 // No need to reorder if need to shuffle reuses, still need to shuffle the
4934 // node.
4935 if (!TE.ReuseShuffleIndices.empty()) {
4936 if (isSplat(TE.Scalars))
4937 return std::nullopt;
4938 // Check if reuse shuffle indices can be improved by reordering.
4939 // For this, check that reuse mask is "clustered", i.e. each scalar values
4940 // is used once in each submask of size <number_of_scalars>.
4941 // Example: 4 scalar values.
4942 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4943 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4944 // element 3 is used twice in the second submask.
4945 unsigned Sz = TE.Scalars.size();
4946 if (TE.isGather()) {
4947 if (std::optional<OrdersType> CurrentOrder =
4948 findReusedOrderedScalars(TE)) {
4949 SmallVector<int> Mask;
4950 fixupOrderingIndices(*CurrentOrder);
4951 inversePermutation(*CurrentOrder, Mask);
4952 ::addMask(Mask, TE.ReuseShuffleIndices);
4953 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4954 unsigned Sz = TE.Scalars.size();
4955 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4956 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4957 if (Idx != PoisonMaskElem)
4958 Res[Idx + K * Sz] = I + K * Sz;
4959 }
4960 return std::move(Res);
4961 }
4962 }
4963 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4964 TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
4965 2 * TE.getVectorFactor())) == 1)
4966 return std::nullopt;
4967 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4968 Sz)) {
4969 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4970 if (TE.ReorderIndices.empty())
4971 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4972 else
4973 inversePermutation(TE.ReorderIndices, ReorderMask);
4974 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4975 unsigned VF = ReorderMask.size();
4976 OrdersType ResOrder(VF, VF);
4977 unsigned NumParts = divideCeil(VF, Sz);
4978 SmallBitVector UsedVals(NumParts);
4979 for (unsigned I = 0; I < VF; I += Sz) {
4980 int Val = PoisonMaskElem;
4981 unsigned UndefCnt = 0;
4982 unsigned Limit = std::min(Sz, VF - I);
4983 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
4984 [&](int Idx) {
4985 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4986 Val = Idx;
4987 if (Idx == PoisonMaskElem)
4988 ++UndefCnt;
4989 return Idx != PoisonMaskElem && Idx != Val;
4990 }) ||
4991 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4992 UndefCnt > Sz / 2)
4993 return std::nullopt;
4994 UsedVals.set(Val);
4995 for (unsigned K = 0; K < NumParts; ++K)
4996 ResOrder[Val + Sz * K] = I + K;
4997 }
4998 return std::move(ResOrder);
4999 }
5000 unsigned VF = TE.getVectorFactor();
5001 // Try build correct order for extractelement instructions.
5002 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5003 TE.ReuseShuffleIndices.end());
5004 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
5005 all_of(TE.Scalars, [Sz](Value *V) {
5006 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5007 return Idx && *Idx < Sz;
5008 })) {
5009 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5010 if (TE.ReorderIndices.empty())
5011 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5012 else
5013 inversePermutation(TE.ReorderIndices, ReorderMask);
5014 for (unsigned I = 0; I < VF; ++I) {
5015 int &Idx = ReusedMask[I];
5016 if (Idx == PoisonMaskElem)
5017 continue;
5018 Value *V = TE.Scalars[ReorderMask[Idx]];
5019 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5020 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5021 }
5022 }
5023 // Build the order of the VF size; need to reorder the reuse shuffles, as they are
5024 // always of VF size.
5025 OrdersType ResOrder(VF);
5026 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5027 auto *It = ResOrder.begin();
5028 for (unsigned K = 0; K < VF; K += Sz) {
5029 OrdersType CurrentOrder(TE.ReorderIndices);
5030 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5031 if (SubMask.front() == PoisonMaskElem)
5032 std::iota(SubMask.begin(), SubMask.end(), 0);
5033 reorderOrder(CurrentOrder, SubMask);
5034 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5035 std::advance(It, Sz);
5036 }
5037 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5038 return Data.index() == Data.value();
5039 }))
5040 return std::nullopt; // No need to reorder.
5041 return std::move(ResOrder);
5042 }
5043 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5044 any_of(TE.UserTreeIndices,
5045 [](const EdgeInfo &EI) {
5046 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5047 }) &&
5048 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5049 return std::nullopt;
5050 if ((TE.State == TreeEntry::Vectorize ||
5051 TE.State == TreeEntry::StridedVectorize) &&
5052 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5053 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
5054 !TE.isAltShuffle())
5055 return TE.ReorderIndices;
5056 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5057 auto PHICompare = [&](unsigned I1, unsigned I2) {
5058 Value *V1 = TE.Scalars[I1];
5059 Value *V2 = TE.Scalars[I2];
5060 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5061 return false;
5062 if (V1->getNumUses() < V2->getNumUses())
5063 return true;
5064 if (V1->getNumUses() > V2->getNumUses())
5065 return false;
5066 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5067 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5068 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
5069 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
5070 if (areTwoInsertFromSameBuildVector(
5071 IE1, IE2,
5072 [](InsertElementInst *II) { return II->getOperand(0); }))
5073 return I1 < I2;
5074 return getElementIndex(IE1) < getElementIndex(IE2);
5075 }
5076 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
5077 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
5078 if (EE1->getOperand(0) != EE2->getOperand(0))
5079 return I1 < I2;
5080 return getElementIndex(EE1) < getElementIndex(EE2);
5081 }
5082 return I1 < I2;
5083 };
5084 auto IsIdentityOrder = [](const OrdersType &Order) {
5085 for (unsigned Idx : seq<unsigned>(0, Order.size()))
5086 if (Idx != Order[Idx])
5087 return false;
5088 return true;
5089 };
5090 if (!TE.ReorderIndices.empty())
5091 return TE.ReorderIndices;
5092 DenseMap<unsigned, unsigned> PhiToId;
5093 SmallVector<unsigned> Phis(TE.Scalars.size());
5094 std::iota(Phis.begin(), Phis.end(), 0);
5095 OrdersType ResOrder(TE.Scalars.size());
5096 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
5097 PhiToId[Id] = Id;
5098 stable_sort(Phis, PHICompare);
5099 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
5100 ResOrder[Id] = PhiToId[Phis[Id]];
5101 if (IsIdentityOrder(ResOrder))
5102 return std::nullopt; // No need to reorder.
5103 return std::move(ResOrder);
5104 }
5105 if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
5106 // TODO: add analysis of other gather nodes with extractelement
5107 // instructions and other values/instructions, not only undefs.
5108 if ((TE.getOpcode() == Instruction::ExtractElement ||
5109 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5110 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5111 all_of(TE.Scalars, [](Value *V) {
5112 auto *EE = dyn_cast<ExtractElementInst>(V);
5113 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5114 })) {
5115 // Check that gather of extractelements can be represented as
5116 // just a shuffle of a single vector.
5117 OrdersType CurrentOrder;
5118 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5119 /*ResizeAllowed=*/true);
5120 if (Reuse || !CurrentOrder.empty())
5121 return std::move(CurrentOrder);
5122 }
5123 // If the gather node is <undef, v, .., poison> and
5124 // insertelement poison, v, 0 [+ permute]
5125 // is cheaper than
5126 // insertelement poison, v, n - try to reorder.
5127 // If rotating the whole graph, exclude the permute cost, the whole graph
5128 // might be transformed.
5129 int Sz = TE.Scalars.size();
5130 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5131 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5132 const auto *It =
5133 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5134 if (It == TE.Scalars.begin())
5135 return OrdersType();
5136 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5137 if (It != TE.Scalars.end()) {
5138 OrdersType Order(Sz, Sz);
5139 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5140 Order[Idx] = 0;
5141 fixupOrderingIndices(Order);
5142 SmallVector<int> Mask;
5143 inversePermutation(Order, Mask);
5144 InstructionCost PermuteCost =
5145 TopToBottom
5146 ? 0
5147 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
5148 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5149 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5150 PoisonValue::get(Ty), *It);
5151 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5152 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5153 PoisonValue::get(Ty), *It);
5154 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5155 OrdersType Order(Sz, Sz);
5156 Order[Idx] = 0;
5157 return std::move(Order);
5158 }
5159 }
5160 }
5161 if (isSplat(TE.Scalars))
5162 return std::nullopt;
5163 if (TE.Scalars.size() >= 4)
5164 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5165 return Order;
5166 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5167 return CurrentOrder;
5168 }
5169 return std::nullopt;
5170}
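// Illustrative sketch, not part of the original pass: for a gather node with
// scalars {poison, %v, poison, poison} the code above compares the cost of
//   insertelement <4 x ty> poison, ty %v, i32 1      ; insert at lane 1
// against
//   insertelement <4 x ty> poison, ty %v, i32 0      ; insert at lane 0
// plus (when not rotating the whole graph) a single-source permute, and, when
// the second form is cheaper, returns an order with Order[1] = 0 so that %v
// is rotated into lane 0.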
5171
5172/// Checks if the given mask is a "clustered" mask with the same clusters of
5173/// size \p Sz, which are not identity submasks.
5174 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5175 unsigned Sz) {
5176 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5177 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5178 return false;
5179 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5180 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5181 if (Cluster != FirstCluster)
5182 return false;
5183 }
5184 return true;
5185}
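// Illustrative examples (not from the original source), with Sz == 4:
//   {1, 0, 3, 2, 1, 0, 3, 2} -> true  (same non-identity cluster repeated)
//   {0, 1, 2, 3, 0, 1, 2, 3} -> false (the first cluster is an identity mask)
//   {1, 0, 3, 2, 3, 2, 1, 0} -> false (second cluster differs from the first)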
5186
5187void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5188 // Reorder reuses mask.
5189 reorderReuses(TE.ReuseShuffleIndices, Mask);
5190 const unsigned Sz = TE.Scalars.size();
5191 // For vectorized nodes and non-clustered reuses, no need to do anything else.
5192 if (!TE.isGather() ||
5193 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5194 Sz) ||
5195 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5196 return;
5197 SmallVector<int> NewMask;
5198 inversePermutation(TE.ReorderIndices, NewMask);
5199 addMask(NewMask, TE.ReuseShuffleIndices);
5200 // Clear reorder since it is going to be applied to the new mask.
5201 TE.ReorderIndices.clear();
5202 // Try to improve gathered nodes with clustered reuses, if possible.
5203 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5204 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
5205 inversePermutation(NewOrder, NewMask);
5206 reorderScalars(TE.Scalars, NewMask);
5207 // Fill the reuses mask with the identity submasks.
5208 for (auto *It = TE.ReuseShuffleIndices.begin(),
5209 *End = TE.ReuseShuffleIndices.end();
5210 It != End; std::advance(It, Sz))
5211 std::iota(It, std::next(It, Sz), 0);
5212}
5213
5213
5214 static void combineOrders(MutableArrayRef<unsigned> Order,
5215 ArrayRef<unsigned> SecondaryOrder) {
5216 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5217 "Expected same size of orders");
5218 unsigned Sz = Order.size();
5219 SmallBitVector UsedIndices(Sz);
5220 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5221 if (Order[Idx] != Sz)
5222 UsedIndices.set(Order[Idx]);
5223 }
5224 if (SecondaryOrder.empty()) {
5225 for (unsigned Idx : seq<unsigned>(0, Sz))
5226 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5227 Order[Idx] = Idx;
5228 } else {
5229 for (unsigned Idx : seq<unsigned>(0, Sz))
5230 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5231 !UsedIndices.test(SecondaryOrder[Idx]))
5232 Order[Idx] = SecondaryOrder[Idx];
5233 }
5234}
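// Illustrative sketch, not part of the original pass, using Sz == 4 as the
// "unset" marker: combining Order = {4, 2, 4, 4} with SecondaryOrder =
// {0, 1, 2, 3} fills only the unset slots whose target index is still unused,
// giving {0, 2, 4, 3}; slot 2 stays unset because index 2 is already taken by
// Order[1].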
5235
5236 void BoUpSLP::reorderTopToBottom() {
5237 // Maps VF to the graph nodes.
5238 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5239 // ExtractElement gather nodes which can be vectorized and need to handle
5240 // their ordering.
5241 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5242
5243 // Phi nodes can have preferred ordering based on their result users
5244 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5245
5246 // AltShuffles can also have a preferred ordering that leads to fewer
5247 // instructions, e.g., the addsub instruction in x86.
5248 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5249
5250 // Maps a TreeEntry to the reorder indices of external users.
5251 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5252 ExternalUserReorderMap;
5253 // Find all reorderable nodes with the given VF.
5254 // Currently these are vectorized stores, loads, extracts + some gathering of
5255 // extracts.
5256 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5257 const std::unique_ptr<TreeEntry> &TE) {
5258 // Look for external users that will probably be vectorized.
5259 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5260 findExternalStoreUsersReorderIndices(TE.get());
5261 if (!ExternalUserReorderIndices.empty()) {
5262 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5263 ExternalUserReorderMap.try_emplace(TE.get(),
5264 std::move(ExternalUserReorderIndices));
5265 }
5266
5267 // Patterns like [fadd,fsub] can be combined into a single instruction in
5268 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5269 // to take into account their order when looking for the most used order.
5270 if (TE->isAltShuffle()) {
5271 VectorType *VecTy =
5272 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5273 unsigned Opcode0 = TE->getOpcode();
5274 unsigned Opcode1 = TE->getAltOpcode();
5275 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5276 // If this pattern is supported by the target then we consider the order.
5277 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5278 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5279 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5280 }
5281 // TODO: Check the reverse order too.
5282 }
5283
5284 if (std::optional<OrdersType> CurrentOrder =
5285 getReorderingData(*TE, /*TopToBottom=*/true)) {
5286 // Do not include ordering for nodes used in the alt opcode vectorization,
5287 // better to reorder them during bottom-to-top stage. If follow the order
5288 // here, it causes reordering of the whole graph though actually it is
5289 // profitable just to reorder the subgraph that starts from the alternate
5290 // opcode vectorization node. Such nodes already end-up with the shuffle
5291 // instruction and it is just enough to change this shuffle rather than
5292 // rotate the scalars for the whole graph.
5293 unsigned Cnt = 0;
5294 const TreeEntry *UserTE = TE.get();
5295 while (UserTE && Cnt < RecursionMaxDepth) {
5296 if (UserTE->UserTreeIndices.size() != 1)
5297 break;
5298 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5299 return EI.UserTE->State == TreeEntry::Vectorize &&
5300 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5301 }))
5302 return;
5303 UserTE = UserTE->UserTreeIndices.back().UserTE;
5304 ++Cnt;
5305 }
5306 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5307 if (!(TE->State == TreeEntry::Vectorize ||
5308 TE->State == TreeEntry::StridedVectorize) ||
5309 !TE->ReuseShuffleIndices.empty())
5310 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5311 if (TE->State == TreeEntry::Vectorize &&
5312 TE->getOpcode() == Instruction::PHI)
5313 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5314 }
5315 });
5316
5317 // Reorder the graph nodes according to their vectorization factor.
5318 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5319 VF /= 2) {
5320 auto It = VFToOrderedEntries.find(VF);
5321 if (It == VFToOrderedEntries.end())
5322 continue;
5323 // Try to find the most profitable order. We just are looking for the most
5324 // used order and reorder scalar elements in the nodes according to this
5325 // mostly used order.
5326 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5327 // All operands are reordered and used only in this node - propagate the
5328 // most used order to the user node.
5329 MapVector<OrdersType, unsigned,
5330 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5331 OrdersUses;
5333 for (const TreeEntry *OpTE : OrderedEntries) {
5334 // No need to reorder these nodes; still need to extend and to use shuffle,
5335 // just need to merge reordering shuffle and the reuse shuffle.
5336 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5337 continue;
5338 // Count number of orders uses.
5339 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5340 &PhisToOrders]() -> const OrdersType & {
5341 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5342 auto It = GathersToOrders.find(OpTE);
5343 if (It != GathersToOrders.end())
5344 return It->second;
5345 }
5346 if (OpTE->isAltShuffle()) {
5347 auto It = AltShufflesToOrders.find(OpTE);
5348 if (It != AltShufflesToOrders.end())
5349 return It->second;
5350 }
5351 if (OpTE->State == TreeEntry::Vectorize &&
5352 OpTE->getOpcode() == Instruction::PHI) {
5353 auto It = PhisToOrders.find(OpTE);
5354 if (It != PhisToOrders.end())
5355 return It->second;
5356 }
5357 return OpTE->ReorderIndices;
5358 }();
5359 // First consider the order of the external scalar users.
5360 auto It = ExternalUserReorderMap.find(OpTE);
5361 if (It != ExternalUserReorderMap.end()) {
5362 const auto &ExternalUserReorderIndices = It->second;
5363 // If the OpTE vector factor != number of scalars - use natural order,
5364 // it is an attempt to reorder node with reused scalars but with
5365 // external uses.
5366 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5367 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5368 ExternalUserReorderIndices.size();
5369 } else {
5370 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5371 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5372 }
5373 // No other useful reorder data in this entry.
5374 if (Order.empty())
5375 continue;
5376 }
5377 // Stores actually store the mask, not the order, need to invert.
5378 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5379 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5380 SmallVector<int> Mask;
5381 inversePermutation(Order, Mask);
5382 unsigned E = Order.size();
5383 OrdersType CurrentOrder(E, E);
5384 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5385 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5386 });
5387 fixupOrderingIndices(CurrentOrder);
5388 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5389 } else {
5390 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5391 }
5392 }
5393 if (OrdersUses.empty())
5394 continue;
5395 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5396 const unsigned Sz = Order.size();
5397 for (unsigned Idx : seq<unsigned>(0, Sz))
5398 if (Idx != Order[Idx] && Order[Idx] != Sz)
5399 return false;
5400 return true;
5401 };
5402 // Choose the most used order.
5403 unsigned IdentityCnt = 0;
5404 unsigned FilledIdentityCnt = 0;
5405 OrdersType IdentityOrder(VF, VF);
5406 for (auto &Pair : OrdersUses) {
5407 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5408 if (!Pair.first.empty())
5409 FilledIdentityCnt += Pair.second;
5410 IdentityCnt += Pair.second;
5411 combineOrders(IdentityOrder, Pair.first);
5412 }
5413 }
5414 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5415 unsigned Cnt = IdentityCnt;
5416 for (auto &Pair : OrdersUses) {
5417 // Prefer the identity order. But if a filled identity order (non-empty)
5418 // has the same number of uses as the new candidate order, we can choose
5419 // the candidate order instead.
5420 if (Cnt < Pair.second ||
5421 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5422 Cnt == Pair.second && !BestOrder.empty() &&
5423 IsIdentityOrder(BestOrder))) {
5424 combineOrders(Pair.first, BestOrder);
5425 BestOrder = Pair.first;
5426 Cnt = Pair.second;
5427 } else {
5428 combineOrders(BestOrder, Pair.first);
5429 }
5430 }
5431 // Set order of the user node.
5432 if (IsIdentityOrder(BestOrder))
5433 continue;
5434 fixupOrderingIndices(BestOrder);
5435 SmallVector<int> Mask;
5436 inversePermutation(BestOrder, Mask);
5437 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5438 unsigned E = BestOrder.size();
5439 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5440 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5441 });
5442 // Do an actual reordering, if profitable.
5443 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5444 // Just do the reordering for the nodes with the given VF.
5445 if (TE->Scalars.size() != VF) {
5446 if (TE->ReuseShuffleIndices.size() == VF) {
5447 // Need to reorder the reuses masks of the operands with smaller VF to
5448 // be able to find the match between the graph nodes and scalar
5449 // operands of the given node during vectorization/cost estimation.
5450 assert(all_of(TE->UserTreeIndices,
5451 [VF, &TE](const EdgeInfo &EI) {
5452 return EI.UserTE->Scalars.size() == VF ||
5453 EI.UserTE->Scalars.size() ==
5454 TE->Scalars.size();
5455 }) &&
5456 "All users must be of VF size.");
5457 // Update ordering of the operands with the smaller VF than the given
5458 // one.
5459 reorderNodeWithReuses(*TE, Mask);
5460 }
5461 continue;
5462 }
5463 if ((TE->State == TreeEntry::Vectorize ||
5464 TE->State == TreeEntry::StridedVectorize) &&
5466 InsertElementInst>(TE->getMainOp()) &&
5467 !TE->isAltShuffle()) {
5468 // Build correct orders for extract{element,value}, loads and
5469 // stores.
5470 reorderOrder(TE->ReorderIndices, Mask);
5471 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5472 TE->reorderOperands(Mask);
5473 } else {
5474 // Reorder the node and its operands.
5475 TE->reorderOperands(Mask);
5476 assert(TE->ReorderIndices.empty() &&
5477 "Expected empty reorder sequence.");
5478 reorderScalars(TE->Scalars, Mask);
5479 }
5480 if (!TE->ReuseShuffleIndices.empty()) {
5481 // Apply reversed order to keep the original ordering of the reused
5482 // elements to avoid extra reorder indices shuffling.
5483 OrdersType CurrentOrder;
5484 reorderOrder(CurrentOrder, MaskOrder);
5485 SmallVector<int> NewReuses;
5486 inversePermutation(CurrentOrder, NewReuses);
5487 addMask(NewReuses, TE->ReuseShuffleIndices);
5488 TE->ReuseShuffleIndices.swap(NewReuses);
5489 }
5490 }
5491 }
5492}
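// Illustrative sketch, not part of the original pass: if, for some VF, three
// nodes report the order {1, 0, 3, 2} and one reports the identity, OrdersUses
// ends up as {{1, 0, 3, 2} -> 3, identity -> 1}. The non-identity order wins,
// so the three nodes that asked for it no longer need a reorder shuffle, at
// the cost of permuting the single node that preferred the identity.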
5493
5494bool BoUpSLP::canReorderOperands(
5495 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5496 ArrayRef<TreeEntry *> ReorderableGathers,
5497 SmallVectorImpl<TreeEntry *> &GatherOps) {
5498 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5499 if (UserTE->isNonPowOf2Vec())
5500 return false;
5501
5502 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5503 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5504 return OpData.first == I &&
5505 (OpData.second->State == TreeEntry::Vectorize ||
5506 OpData.second->State == TreeEntry::StridedVectorize);
5507 }))
5508 continue;
5509 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5510 // Do not reorder if operand node is used by many user nodes.
5511 if (any_of(TE->UserTreeIndices,
5512 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5513 return false;
5514 // Add the node to the list of the ordered nodes with the identity
5515 // order.
5516 Edges.emplace_back(I, TE);
5517 // Add ScatterVectorize nodes to the list of operands, where just
5518 // reordering of the scalars is required. Similar to the gathers, so
5519 // simply add to the list of gathered ops.
5520 // If there are reused scalars, process this node as a regular vectorize
5521 // node, just reorder reuses mask.
5522 if (TE->State != TreeEntry::Vectorize &&
5523 TE->State != TreeEntry::StridedVectorize &&
5524 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5525 GatherOps.push_back(TE);
5526 continue;
5527 }
5528 TreeEntry *Gather = nullptr;
5529 if (count_if(ReorderableGathers,
5530 [&Gather, UserTE, I](TreeEntry *TE) {
5531 assert(TE->State != TreeEntry::Vectorize &&
5532 TE->State != TreeEntry::StridedVectorize &&
5533 "Only non-vectorized nodes are expected.");
5534 if (any_of(TE->UserTreeIndices,
5535 [UserTE, I](const EdgeInfo &EI) {
5536 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5537 })) {
5538 assert(TE->isSame(UserTE->getOperand(I)) &&
5539 "Operand entry does not match operands.");
5540 Gather = TE;
5541 return true;
5542 }
5543 return false;
5544 }) > 1 &&
5545 !allConstant(UserTE->getOperand(I)))
5546 return false;
5547 if (Gather)
5548 GatherOps.push_back(Gather);
5549 }
5550 return true;
5551}
5552
5553void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5554 SetVector<TreeEntry *> OrderedEntries;
5555 DenseSet<const TreeEntry *> GathersToOrders;
5556 // Find all reorderable leaf nodes with the given VF.
5557 // Currently these are vectorized loads, extracts without alternate operands +
5558 // some gathering of extracts.
5559 SmallVector<TreeEntry *> NonVectorized;
5560 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5561 if (TE->State != TreeEntry::Vectorize &&
5562 TE->State != TreeEntry::StridedVectorize)
5563 NonVectorized.push_back(TE.get());
5564 if (std::optional<OrdersType> CurrentOrder =
5565 getReorderingData(*TE, /*TopToBottom=*/false)) {
5566 OrderedEntries.insert(TE.get());
5567 if (!(TE->State == TreeEntry::Vectorize ||
5568 TE->State == TreeEntry::StridedVectorize) ||
5569 !TE->ReuseShuffleIndices.empty())
5570 GathersToOrders.insert(TE.get());
5571 }
5572 }
5573
5574 // 1. Propagate order to the graph nodes, which use only reordered nodes.
5575 // I.e., if the node has operands, that are reordered, try to make at least
5576 // one operand order in the natural order and reorder others + reorder the
5577 // user node itself.
5578 SmallPtrSet<const TreeEntry *, 4> Visited;
5579 while (!OrderedEntries.empty()) {
5580 // 1. Filter out only reordered nodes.
5581 // 2. If the entry has multiple uses - skip it and jump to the next node.
5582 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5583 SmallVector<TreeEntry *> Filtered;
5584 for (TreeEntry *TE : OrderedEntries) {
5585 if (!(TE->State == TreeEntry::Vectorize ||
5586 TE->State == TreeEntry::StridedVectorize ||
5587 (TE->isGather() && GathersToOrders.contains(TE))) ||
5588 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5589 !all_of(drop_begin(TE->UserTreeIndices),
5590 [TE](const EdgeInfo &EI) {
5591 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5592 }) ||
5593 !Visited.insert(TE).second) {
5594 Filtered.push_back(TE);
5595 continue;
5596 }
5597 // Build a map between user nodes and their operand order to speed up
5598 // search. The graph currently does not provide this dependency directly.
5599 for (EdgeInfo &EI : TE->UserTreeIndices) {
5600 TreeEntry *UserTE = EI.UserTE;
5601 auto It = Users.find(UserTE);
5602 if (It == Users.end())
5603 It = Users.insert({UserTE, {}}).first;
5604 It->second.emplace_back(EI.EdgeIdx, TE);
5605 }
5606 }
5607 // Erase filtered entries.
5608 for (TreeEntry *TE : Filtered)
5609 OrderedEntries.remove(TE);
5610 SmallVector<
5611 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5612 UsersVec(Users.begin(), Users.end());
5613 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5614 return Data1.first->Idx > Data2.first->Idx;
5615 });
5616 for (auto &Data : UsersVec) {
5617 // Check that operands are used only in the User node.
5618 SmallVector<TreeEntry *> GatherOps;
5619 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5620 GatherOps)) {
5621 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5622 OrderedEntries.remove(Op.second);
5623 continue;
5624 }
5625 // All operands are reordered and used only in this node - propagate the
5626 // most used order to the user node.
5627 MapVector<OrdersType, unsigned,
5628 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5629 OrdersUses;
5630 // Do the analysis for each tree entry only once, otherwise the order of
5631 // the same node may be considered several times, though it might not be
5632 // profitable.
5633 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5634 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
5635 for (const auto &Op : Data.second) {
5636 TreeEntry *OpTE = Op.second;
5637 if (!VisitedOps.insert(OpTE).second)
5638 continue;
5639 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5640 continue;
5641 const auto Order = [&]() -> const OrdersType {
5642 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
5643 return getReorderingData(*OpTE, /*TopToBottom=*/false)
5644 .value_or(OrdersType(1));
5645 return OpTE->ReorderIndices;
5646 }();
5647 // The order is partially ordered, skip it in favor of fully non-ordered
5648 // orders.
5649 if (Order.size() == 1)
5650 continue;
5651 unsigned NumOps = count_if(
5652 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5653 return P.second == OpTE;
5654 });
5655 // Stores actually store the mask, not the order, need to invert.
5656 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5657 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5658 SmallVector<int> Mask;
5659 inversePermutation(Order, Mask);
5660 unsigned E = Order.size();
5661 OrdersType CurrentOrder(E, E);
5662 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5663 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5664 });
5665 fixupOrderingIndices(CurrentOrder);
5666 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5667 NumOps;
5668 } else {
5669 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5670 }
5671 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5672 const auto AllowsReordering = [&](const TreeEntry *TE) {
5673 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5674 if (TE->isNonPowOf2Vec())
5675 return false;
5676 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5677 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5678 (IgnoreReorder && TE->Idx == 0))
5679 return true;
5680 if (TE->isGather()) {
5681 if (GathersToOrders.contains(TE))
5682 return !getReorderingData(*TE, /*TopToBottom=*/false)
5683 .value_or(OrdersType(1))
5684 .empty();
5685 return true;
5686 }
5687 return false;
5688 };
5689 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5690 TreeEntry *UserTE = EI.UserTE;
5691 if (!VisitedUsers.insert(UserTE).second)
5692 continue;
5693 // May reorder user node if it requires reordering, has reused
5694 // scalars, is an alternate op vectorize node or its op nodes require
5695 // reordering.
5696 if (AllowsReordering(UserTE))
5697 continue;
5698 // Check if users allow reordering.
5699 // Currently look up just 1 level of operands to avoid increase of
5700 // the compile time.
5701 // Profitable to reorder if definitely more operands allow
5702 // reordering rather than those with natural order.
5703 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
5704 if (static_cast<unsigned>(count_if(
5705 Ops, [UserTE, &AllowsReordering](
5706 const std::pair<unsigned, TreeEntry *> &Op) {
5707 return AllowsReordering(Op.second) &&
5708 all_of(Op.second->UserTreeIndices,
5709 [UserTE](const EdgeInfo &EI) {
5710 return EI.UserTE == UserTE;
5711 });
5712 })) <= Ops.size() / 2)
5713 ++Res.first->second;
5714 }
5715 }
5716 if (OrdersUses.empty()) {
5717 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5718 OrderedEntries.remove(Op.second);
5719 continue;
5720 }
5721 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5722 const unsigned Sz = Order.size();
5723 for (unsigned Idx : seq<unsigned>(0, Sz))
5724 if (Idx != Order[Idx] && Order[Idx] != Sz)
5725 return false;
5726 return true;
5727 };
5728 // Choose the most used order.
5729 unsigned IdentityCnt = 0;
5730 unsigned VF = Data.second.front().second->getVectorFactor();
5731 OrdersType IdentityOrder(VF, VF);
5732 for (auto &Pair : OrdersUses) {
5733 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5734 IdentityCnt += Pair.second;
5735 combineOrders(IdentityOrder, Pair.first);
5736 }
5737 }
5738 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5739 unsigned Cnt = IdentityCnt;
5740 for (auto &Pair : OrdersUses) {
5741 // Prefer the identity order. But if a filled identity order (non-empty)
5742 // has the same number of uses as the new candidate order, we can choose
5743 // the candidate order instead.
5744 if (Cnt < Pair.second) {
5745 combineOrders(Pair.first, BestOrder);
5746 BestOrder = Pair.first;
5747 Cnt = Pair.second;
5748 } else {
5749 combineOrders(BestOrder, Pair.first);
5750 }
5751 }
5752 // Set order of the user node.
5753 if (IsIdentityOrder(BestOrder)) {
5754 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5755 OrderedEntries.remove(Op.second);
5756 continue;
5757 }
5758 fixupOrderingIndices(BestOrder);
5759 // Erase operands from OrderedEntries list and adjust their orders.
5760 VisitedOps.clear();
5761 SmallVector<int> Mask;
5762 inversePermutation(BestOrder, Mask);
5763 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5764 unsigned E = BestOrder.size();
5765 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5766 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5767 });
5768 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5769 TreeEntry *TE = Op.second;
5770 OrderedEntries.remove(TE);
5771 if (!VisitedOps.insert(TE).second)
5772 continue;
5773 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5774 reorderNodeWithReuses(*TE, Mask);
5775 continue;
5776 }
5777 // Gathers are processed separately.
5778 if (TE->State != TreeEntry::Vectorize &&
5779 TE->State != TreeEntry::StridedVectorize &&
5780 (TE->State != TreeEntry::ScatterVectorize ||
5781 TE->ReorderIndices.empty()))
5782 continue;
5783 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5784 TE->ReorderIndices.empty()) &&
5785 "Non-matching sizes of user/operand entries.");
5786 reorderOrder(TE->ReorderIndices, Mask);
5787 if (IgnoreReorder && TE == VectorizableTree.front().get())
5788 IgnoreReorder = false;
5789 }
5790 // For gathers just need to reorder its scalars.
5791 for (TreeEntry *Gather : GatherOps) {
5792 assert(Gather->ReorderIndices.empty() &&
5793 "Unexpected reordering of gathers.");
5794 if (!Gather->ReuseShuffleIndices.empty()) {
5795 // Just reorder reuses indices.
5796 reorderReuses(Gather->ReuseShuffleIndices, Mask);
5797 continue;
5798 }
5799 reorderScalars(Gather->Scalars, Mask);
5800 OrderedEntries.remove(Gather);
5801 }
5802 // Reorder operands of the user node and set the ordering for the user
5803 // node itself.
5804 if (Data.first->State != TreeEntry::Vectorize ||
5805 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5806 Data.first->getMainOp()) ||
5807 Data.first->isAltShuffle())
5808 Data.first->reorderOperands(Mask);
5809 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5810 Data.first->isAltShuffle() ||
5811 Data.first->State == TreeEntry::StridedVectorize) {
5812 reorderScalars(Data.first->Scalars, Mask);
5813 reorderOrder(Data.first->ReorderIndices, MaskOrder,
5814 /*BottomOrder=*/true);
5815 if (Data.first->ReuseShuffleIndices.empty() &&
5816 !Data.first->ReorderIndices.empty() &&
5817 !Data.first->isAltShuffle()) {
5818 // Insert user node to the list to try to sink reordering deeper in
5819 // the graph.
5820 OrderedEntries.insert(Data.first);
5821 }
5822 } else {
5823 reorderOrder(Data.first->ReorderIndices, Mask);
5824 }
5825 }
5826 }
5827 // If the reordering is unnecessary, just remove the reorder.
5828 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5829 VectorizableTree.front()->ReuseShuffleIndices.empty())
5830 VectorizableTree.front()->ReorderIndices.clear();
5831}
5832
5833 void BoUpSLP::buildExternalUses(
5834 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5835 DenseMap<Value *, unsigned> ScalarToExtUses;
5836 // Collect the values that we need to extract from the tree.
5837 for (auto &TEPtr : VectorizableTree) {
5838 TreeEntry *Entry = TEPtr.get();
5839
5840 // No need to handle users of gathered values.
5841 if (Entry->isGather())
5842 continue;
5843
5844 // For each lane:
5845 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5846 Value *Scalar = Entry->Scalars[Lane];
5847 if (!isa<Instruction>(Scalar))
5848 continue;
5849 // All uses must be replaced already? No need to do it again.
5850 auto It = ScalarToExtUses.find(Scalar);
5851 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5852 continue;
5853
5854 // Check if the scalar is externally used as an extra arg.
5855 const auto *ExtI = ExternallyUsedValues.find(Scalar);
5856 if (ExtI != ExternallyUsedValues.end()) {
5857 int FoundLane = Entry->findLaneForValue(Scalar);
5858 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5859 << FoundLane << " from " << *Scalar << ".\n");
5860 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
5861 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5862 continue;
5863 }
5864 for (User *U : Scalar->users()) {
5865 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5866
5867 Instruction *UserInst = dyn_cast<Instruction>(U);
5868 if (!UserInst || isDeleted(UserInst))
5869 continue;
5870
5871 // Ignore users in the user ignore list.
5872 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5873 continue;
5874
5875 // Skip in-tree scalars that become vectors
5876 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5877 // Some in-tree scalars will remain as scalar in vectorized
5878 // instructions. If that is the case, the one in FoundLane will
5879 // be used.
5880 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5881 !doesInTreeUserNeedToExtract(
5882 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5883 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5884 << ".\n");
5885 assert(!UseEntry->isGather() && "Bad state");
5886 continue;
5887 }
5888 U = nullptr;
5889 if (It != ScalarToExtUses.end()) {
5890 ExternalUses[It->second].User = nullptr;
5891 break;
5892 }
5893 }
5894
5895 if (U && Scalar->hasNUsesOrMore(UsesLimit))
5896 U = nullptr;
5897 int FoundLane = Entry->findLaneForValue(Scalar);
5898 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5899 << " from lane " << FoundLane << " from " << *Scalar
5900 << ".\n");
5901 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
5902 ExternalUses.emplace_back(Scalar, U, FoundLane);
5903 if (!U)
5904 break;
5905 }
5906 }
5907 }
5908}
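// Illustrative sketch, not part of the original pass: if the tree vectorizes
// the loads %a and %b but %a also feeds a scalar instruction outside the
// tree (values are hypothetical),
//
//   %a = load float, ptr %p
//   %b = load float, ptr %q
//   %use = fmul float %a, 2.0        ; scalar user outside the tree
//
// then an external-use entry (%a, %use, lane of %a) is recorded so that an
// extractelement from the vectorized value can later replace the scalar
// operand of %use.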
5909
5910 DenseMap<Value *, SmallVector<StoreInst *>>
5911 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5912 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5913 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5914 Value *V = TE->Scalars[Lane];
5915 // To save compilation time we don't visit if we have too many users.
5916 if (V->hasNUsesOrMore(UsesLimit))
5917 break;
5918
5919 // Collect stores per pointer object.
5920 for (User *U : V->users()) {
5921 auto *SI = dyn_cast<StoreInst>(U);
5922 if (SI == nullptr || !SI->isSimple() ||
5923 !isValidElementType(SI->getValueOperand()->getType()))
5924 continue;
5925 // Skip the store if it already has a tree entry.
5926 if (getTreeEntry(U))
5927 continue;
5928
5929 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5930 auto &StoresVec = PtrToStoresMap[Ptr];
5931 // For now just keep one store per pointer object per lane.
5932 // TODO: Extend this to support multiple stores per pointer per lane
5933 if (StoresVec.size() > Lane)
5934 continue;
5935 // Skip if in different BBs.
5936 if (!StoresVec.empty() &&
5937 SI->getParent() != StoresVec.back()->getParent())
5938 continue;
5939 // Make sure that the stores are of the same type.
5940 if (!StoresVec.empty() &&
5941 SI->getValueOperand()->getType() !=
5942 StoresVec.back()->getValueOperand()->getType())
5943 continue;
5944 StoresVec.push_back(SI);
5945 }
5946 }
5947 return PtrToStoresMap;
5948}
5949
5950bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5951 OrdersType &ReorderIndices) const {
5952 // We check whether the stores in StoreVec can form a vector by sorting them
5953 // and checking whether they are consecutive.
5954
5955 // To avoid calling getPointersDiff() while sorting we create a vector of
5956 // pairs {store, offset from first} and sort this instead.
5957 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5958 StoreInst *S0 = StoresVec[0];
5959 StoreOffsetVec[0] = {S0, 0};
5960 Type *S0Ty = S0->getValueOperand()->getType();
5961 Value *S0Ptr = S0->getPointerOperand();
5962 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5963 StoreInst *SI = StoresVec[Idx];
5964 std::optional<int> Diff =
5965 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5966 SI->getPointerOperand(), *DL, *SE,
5967 /*StrictCheck=*/true);
5968 // We failed to compare the pointers so just abandon this StoresVec.
5969 if (!Diff)
5970 return false;
5971 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5972 }
5973
5974 // Sort the vector based on the pointers. We create a copy because we may
5975 // need the original later for calculating the reorder (shuffle) indices.
5976 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5977 const std::pair<StoreInst *, int> &Pair2) {
5978 int Offset1 = Pair1.second;
5979 int Offset2 = Pair2.second;
5980 return Offset1 < Offset2;
5981 });
5982
5983 // Check if the stores are consecutive by checking if their difference is 1.
5984 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5985 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5986 return false;
5987
5988 // Calculate the shuffle indices according to their offset against the sorted
5989 // StoreOffsetVec.
5990 ReorderIndices.reserve(StoresVec.size());
5991 for (StoreInst *SI : StoresVec) {
5992 unsigned Idx = find_if(StoreOffsetVec,
5993 [SI](const std::pair<StoreInst *, int> &Pair) {
5994 return Pair.first == SI;
5995 }) -
5996 StoreOffsetVec.begin();
5997 ReorderIndices.push_back(Idx);
5998 }
5999 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6000 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6001 // same convention here.
6002 auto IsIdentityOrder = [](const OrdersType &Order) {
6003 for (unsigned Idx : seq<unsigned>(0, Order.size()))
6004 if (Idx != Order[Idx])
6005 return false;
6006 return true;
6007 };
6008 if (IsIdentityOrder(ReorderIndices))
6009 ReorderIndices.clear();
6010
6011 return true;
6012}
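// Illustrative sketch, not part of the original pass: for hypothetical stores
// to A[2], A[0], A[3], A[1] the offsets against the first store are
// {0, -2, 1, -1}; sorted they form the consecutive run -2, -1, 0, 1, so the
// function succeeds with ReorderIndices = {2, 0, 3, 1}, i.e. each store's
// position in memory order.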
6013
6014#ifndef NDEBUG
6015 LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6016 for (unsigned Idx : Order)
6017 dbgs() << Idx << ", ";
6018 dbgs() << "\n";
6019}
6020#endif
6021
6022 SmallVector<BoUpSLP::OrdersType, 1>
6023 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6024 unsigned NumLanes = TE->Scalars.size();
6025
6026 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
6027 collectUserStores(TE);
6028
6029 // Holds the reorder indices for each candidate store vector that is a user of
6030 // the current TreeEntry.
6031 SmallVector<OrdersType, 1> ExternalReorderIndices;
6032
6033 // Now inspect the stores collected per pointer and look for vectorization
6034 // candidates. For each candidate calculate the reorder index vector and push
6035 // it into `ExternalReorderIndices`
6036 for (const auto &Pair : PtrToStoresMap) {
6037 auto &StoresVec = Pair.second;
6038 // If we have fewer than NumLanes stores, then we can't form a vector.
6039 if (StoresVec.size() != NumLanes)
6040 continue;
6041
6042 // If the stores are not consecutive then abandon this StoresVec.
6043 OrdersType ReorderIndices;
6044 if (!canFormVector(StoresVec, ReorderIndices))
6045 continue;
6046
6047 // We now know that the scalars in StoresVec can form a vector instruction,
6048 // so set the reorder indices.
6049 ExternalReorderIndices.push_back(ReorderIndices);
6050 }
6051 return ExternalReorderIndices;
6052}
6053
6054void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6055 const SmallDenseSet<Value *> &UserIgnoreLst) {
6056 deleteTree();
6057 UserIgnoreList = &UserIgnoreLst;
6058 if (!allSameType(Roots))
6059 return;
6060 buildTree_rec(Roots, 0, EdgeInfo());
6061}
6062
6063void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6064 deleteTree();
6065 if (!allSameType(Roots))
6066 return;
6067 buildTree_rec(Roots, 0, EdgeInfo());
6068}
6069
6070/// \return true if the specified list of values has only one instruction that
6071/// requires scheduling, false otherwise.
6072#ifndef NDEBUG
6073static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
6074 Value *NeedsScheduling = nullptr;
6075 for (Value *V : VL) {
6076 if (doesNotNeedToBeScheduled(V))
6077 continue;
6078 if (!NeedsScheduling) {
6079 NeedsScheduling = V;
6080 continue;
6081 }
6082 return false;
6083 }
6084 return NeedsScheduling;
6085}
6086#endif
6087
6088/// Generates a key/subkey pair for the given value to provide effective sorting
6089/// of the values and better detection of vectorizable value sequences. The
6090/// keys/subkeys can be used for better sorting of the values themselves (keys)
6091/// and in values subgroups (subkeys).
6092static std::pair<size_t, size_t> generateKeySubkey(
6093 Value *V, const TargetLibraryInfo *TLI,
6094 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
6095 bool AllowAlternate) {
6096 hash_code Key = hash_value(V->getValueID() + 2);
6097 hash_code SubKey = hash_value(0);
6098 // Sort the loads by the distance between the pointers.
6099 if (auto *LI = dyn_cast<LoadInst>(V)) {
6100 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
6101 if (LI->isSimple())
6102 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
6103 else
6104 Key = SubKey = hash_value(LI);
6105 } else if (isVectorLikeInstWithConstOps(V)) {
6106 // Sort extracts by the vector operands.
6107 if (isa<ExtractElementInst, UndefValue>(V))
6108 Key = hash_value(Value::UndefValueVal + 1);
6109 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
6110 if (!isUndefVector(EI->getVectorOperand()).all() &&
6111 !isa<UndefValue>(EI->getIndexOperand()))
6112 SubKey = hash_value(EI->getVectorOperand());
6113 }
6114 } else if (auto *I = dyn_cast<Instruction>(V)) {
6115 // Sort other instructions just by the opcodes except for CMPInst.
6116 // For CMP also sort by the predicate kind.
6117 if ((isa<BinaryOperator, CastInst>(I)) &&
6118 isValidForAlternation(I->getOpcode())) {
6119 if (AllowAlternate)
6120 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
6121 else
6122 Key = hash_combine(hash_value(I->getOpcode()), Key);
6123 SubKey = hash_combine(
6124 hash_value(I->getOpcode()), hash_value(I->getType()),
6125 hash_value(isa<BinaryOperator>(I)
6126 ? I->getType()
6127 : cast<CastInst>(I)->getOperand(0)->getType()));
6128 // For casts, look through the only operand to improve compile time.
6129 if (isa<CastInst>(I)) {
6130 std::pair<size_t, size_t> OpVals =
6131 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
6132 /*AllowAlternate=*/true);
6133 Key = hash_combine(OpVals.first, Key);
6134 SubKey = hash_combine(OpVals.first, SubKey);
6135 }
6136 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
6137 CmpInst::Predicate Pred = CI->getPredicate();
6138 if (CI->isCommutative())
6139 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
6140 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
6141 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
6142 hash_value(SwapPred),
6143 hash_value(CI->getOperand(0)->getType()));
6144 } else if (auto *Call = dyn_cast<CallInst>(I)) {
6145 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
6146 if (isTriviallyVectorizable(ID)) {
6147 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
6148 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
6149 SubKey = hash_combine(hash_value(I->getOpcode()),
6150 hash_value(Call->getCalledFunction()));
6151 } else {
6152 Key = hash_combine(hash_value(Call), Key);
6153 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
6154 }
6155 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
6156 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
6157 hash_value(Op.Tag), SubKey);
6158 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
6159 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
6160 SubKey = hash_value(Gep->getPointerOperand());
6161 else
6162 SubKey = hash_value(Gep);
6163 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
6164 !isa<ConstantInt>(I->getOperand(1))) {
6165 // Do not try to vectorize instructions with potentially high cost.
6166 SubKey = hash_value(I);
6167 } else {
6168 SubKey = hash_value(I->getOpcode());
6169 }
6170 Key = hash_combine(hash_value(I->getParent()), Key);
6171 }
6172 return std::make_pair(Key, SubKey);
6173}
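// Illustrative consequence of the hashing above (a sketch, not exhaustive):
// two simple loads of the same type hash to the same Key, and their SubKey is
// produced by LoadsSubkeyGenerator (typically from the distance between their
// pointers), so nearby loads land in the same subgroup; a volatile or atomic
// load instead uses its own identity for both Key and SubKey and therefore
// never groups with any other value.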
6174
6175/// Checks if the specified instruction \p I is an alternate operation for
6176/// the given \p MainOp and \p AltOp instructions.
6177static bool isAlternateInstruction(const Instruction *I,
6178 const Instruction *MainOp,
6179 const Instruction *AltOp,
6180 const TargetLibraryInfo &TLI);
6181
6182bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
6183 ArrayRef<Value *> VL) const {
6184 unsigned Opcode0 = S.getOpcode();
6185 unsigned Opcode1 = S.getAltOpcode();
6186 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
6187 // If this pattern is supported by the target then consider it profitable.
6188 if (TTI->isLegalAltInstr(getWidenedType(S.MainOp->getType(), VL.size()),
6189 Opcode0, Opcode1, OpcodeMask))
6190 return true;
6191 SmallVector<ValueList> Operands;
6192 for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
6193 Operands.emplace_back();
6194 // Prepare the operand vector.
6195 for (Value *V : VL)
6196 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
6197 }
6198 if (Operands.size() == 2) {
6199 // Try to find the best operand candidates.
6200 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
6201 SmallVector<std::pair<Value *, Value *>> Candidates(3);
6202 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
6203 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
6204 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
6205 std::optional<int> Res = findBestRootPair(Candidates);
6206 switch (Res.value_or(0)) {
6207 case 0:
6208 break;
6209 case 1:
6210 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
6211 break;
6212 case 2:
6213 std::swap(Operands[0][I], Operands[1][I]);
6214 break;
6215 default:
6216 llvm_unreachable("Unexpected index.");
6217 }
6218 }
6219 }
6220 DenseSet<unsigned> UniqueOpcodes;
6221 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
6222 unsigned NonInstCnt = 0;
6223 // Estimate the number of instructions required for the vectorized node and for
6224 // the buildvector node.
6225 unsigned UndefCnt = 0;
6226 // Count the number of extra shuffles, required for vector nodes.
6227 unsigned ExtraShuffleInsts = 0;
6228 // Check whether the operands contain the same values; if so, form either a
6229 // perfect diamond match or a shuffled match.
6230 if (Operands.size() == 2) {
6231 // Do not count same operands twice.
6232 if (Operands.front() == Operands.back()) {
6233 Operands.erase(Operands.begin());
6234 } else if (!allConstant(Operands.front()) &&
6235 all_of(Operands.front(), [&](Value *V) {
6236 return is_contained(Operands.back(), V);
6237 })) {
6238 Operands.erase(Operands.begin());
6239 ++ExtraShuffleInsts;
6240 }
6241 }
6242 const Loop *L = LI->getLoopFor(S.MainOp->getParent());
6243 // Vectorize the node if:
6244 // 1. At least a single operand is constant or splat.
6245 // 2. Operands have many loop invariants (the instructions are not loop
6246 // invariant).
6247 // 3. At least a single unique operand is supposed to be vectorized.
6248 return none_of(Operands,
6249 [&](ArrayRef<Value *> Op) {
6250 if (allConstant(Op) ||
6251 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
6252 getSameOpcode(Op, *TLI).MainOp))
6253 return false;
6254 DenseMap<Value *, unsigned> Uniques;
6255 for (Value *V : Op) {
6256 if (isa<Constant, ExtractElementInst>(V) ||
6257 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
6258 if (isa<UndefValue>(V))
6259 ++UndefCnt;
6260 continue;
6261 }
6262 auto Res = Uniques.try_emplace(V, 0);
6263 // Found first duplicate - need to add shuffle.
6264 if (!Res.second && Res.first->second == 1)
6265 ++ExtraShuffleInsts;
6266 ++Res.first->getSecond();
6267 if (auto *I = dyn_cast<Instruction>(V))
6268 UniqueOpcodes.insert(I->getOpcode());
6269 else if (Res.second)
6270 ++NonInstCnt;
6271 }
6272 return none_of(Uniques, [&](const auto &P) {
6273 return P.first->hasNUsesOrMore(P.second + 1) &&
6274 none_of(P.first->users(), [&](User *U) {
6275 return getTreeEntry(U) || Uniques.contains(U);
6276 });
6277 });
6278 }) ||
6279 // Do not vectorize node, if estimated number of vector instructions is
6280 // more than estimated number of buildvector instructions. Number of
6281 // vector operands is number of vector instructions + number of vector
6282 // instructions for operands (buildvectors). Number of buildvector
6283 // instructions is just number_of_operands * number_of_scalars.
6284 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6285 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
6286 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6287}
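// Worked example of the final estimate above (illustrative numbers): for a
// 4-wide bundle of a binary alternate opcode (VL.size() == 4, two operands),
// the buildvector side is estimated as 2 * 4 = 8 instructions. If the operand
// values use 3 distinct opcodes, contain no non-instruction uniques and no
// undefs, and need one extra shuffle, the vector side is 3 + 0 + 1 + 3 = 7
// (the trailing 3 being main op + alternate op + blending shuffle), so 7 < 8
// and the alternate node is considered worth forming.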
6288
6289BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6290 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
6291 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
6292 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
6293
6294 unsigned ShuffleOrOp =
6295 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
6296 auto *VL0 = cast<Instruction>(S.OpValue);
6297 switch (ShuffleOrOp) {
6298 case Instruction::PHI: {
6299 // Too many operands - gather, most probably won't be vectorized.
6300 if (VL0->getNumOperands() > MaxPHINumOperands)
6301 return TreeEntry::NeedToGather;
6302 // Check for terminator values (e.g. invoke).
6303 for (Value *V : VL)
6304 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
6305 Instruction *Term = dyn_cast<Instruction>(Incoming);
6306 if (Term && Term->isTerminator()) {
6308 << "SLP: Need to swizzle PHINodes (terminator use).\n");
6309 return TreeEntry::NeedToGather;
6310 }
6311 }
6312
6313 return TreeEntry::Vectorize;
6314 }
6315 case Instruction::ExtractValue:
6316 case Instruction::ExtractElement: {
6317 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6318 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6319 if (!isPowerOf2_32(VL.size()))
6320 return TreeEntry::NeedToGather;
6321 if (Reuse || !CurrentOrder.empty())
6322 return TreeEntry::Vectorize;
6323 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6324 return TreeEntry::NeedToGather;
6325 }
6326 case Instruction::InsertElement: {
6327 // Check that we have a buildvector and not a shuffle of 2 or more
6328 // different vectors.
6329 ValueSet SourceVectors;
6330 for (Value *V : VL) {
6331 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
6332 assert(getElementIndex(V) != std::nullopt &&
6333 "Non-constant or undef index?");
6334 }
6335
6336 if (count_if(VL, [&SourceVectors](Value *V) {
6337 return !SourceVectors.contains(V);
6338 }) >= 2) {
6339 // Found 2nd source vector - cancel.
6340 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6341 "different source vectors.\n");
6342 return TreeEntry::NeedToGather;
6343 }
6344
6345 return TreeEntry::Vectorize;
6346 }
6347 case Instruction::Load: {
6348 // Check that a vectorized load would load the same memory as a scalar
6349 // load. For example, we don't want to vectorize loads that are smaller
6350 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6351 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6352 // from such a struct, we read/write packed bits disagreeing with the
6353 // unvectorized version.
6354 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
6355 case LoadsState::Vectorize:
6356 return TreeEntry::Vectorize;
6357 case LoadsState::ScatterVectorize:
6358 return TreeEntry::ScatterVectorize;
6359 case LoadsState::StridedVectorize:
6360 return TreeEntry::StridedVectorize;
6361 case LoadsState::Gather:
6362#ifndef NDEBUG
6363 Type *ScalarTy = VL0->getType();
6364 if (DL->getTypeSizeInBits(ScalarTy) !=
6365 DL->getTypeAllocSizeInBits(ScalarTy))
6366 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6367 else if (any_of(VL,
6368 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6369 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6370 else
6371 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6372#endif // NDEBUG
6373 return TreeEntry::NeedToGather;
6374 }
6375 llvm_unreachable("Unexpected state of loads");
6376 }
6377 case Instruction::ZExt:
6378 case Instruction::SExt:
6379 case Instruction::FPToUI:
6380 case Instruction::FPToSI:
6381 case Instruction::FPExt:
6382 case Instruction::PtrToInt:
6383 case Instruction::IntToPtr:
6384 case Instruction::SIToFP:
6385 case Instruction::UIToFP:
6386 case Instruction::Trunc:
6387 case Instruction::FPTrunc:
6388 case Instruction::BitCast: {
6389 Type *SrcTy = VL0->getOperand(0)->getType();
6390 for (Value *V : VL) {
6391 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6392 if (Ty != SrcTy || !isValidElementType(Ty)) {
6393 LLVM_DEBUG(
6394 dbgs() << "SLP: Gathering casts with different src types.\n");
6395 return TreeEntry::NeedToGather;
6396 }
6397 }
6398 return TreeEntry::Vectorize;
6399 }
6400 case Instruction::ICmp:
6401 case Instruction::FCmp: {
6402 // Check that all of the compares have the same predicate.
6403 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6404 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
6405 Type *ComparedTy = VL0->getOperand(0)->getType();
6406 for (Value *V : VL) {
6407 CmpInst *Cmp = cast<CmpInst>(V);
6408 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6409 Cmp->getOperand(0)->getType() != ComparedTy) {
6410 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6411 return TreeEntry::NeedToGather;
6412 }
6413 }
6414 return TreeEntry::Vectorize;
6415 }
6416 case Instruction::Select:
6417 case Instruction::FNeg:
6418 case Instruction::Add:
6419 case Instruction::FAdd:
6420 case Instruction::Sub:
6421 case Instruction::FSub:
6422 case Instruction::Mul:
6423 case Instruction::FMul:
6424 case Instruction::UDiv:
6425 case Instruction::SDiv:
6426 case Instruction::FDiv:
6427 case Instruction::URem:
6428 case Instruction::SRem:
6429 case Instruction::FRem:
6430 case Instruction::Shl:
6431 case Instruction::LShr:
6432 case Instruction::AShr:
6433 case Instruction::And:
6434 case Instruction::Or:
6435 case Instruction::Xor:
6436 return TreeEntry::Vectorize;
6437 case Instruction::GetElementPtr: {
6438 // We don't combine GEPs with complicated (nested) indexing.
6439 for (Value *V : VL) {
6440 auto *I = dyn_cast<GetElementPtrInst>(V);
6441 if (!I)
6442 continue;
6443 if (I->getNumOperands() != 2) {
6444 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6445 return TreeEntry::NeedToGather;
6446 }
6447 }
6448
6449 // We can't combine several GEPs into one vector if they operate on
6450 // different types.
6451 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6452 for (Value *V : VL) {
6453 auto *GEP = dyn_cast<GEPOperator>(V);
6454 if (!GEP)
6455 continue;
6456 Type *CurTy = GEP->getSourceElementType();
6457 if (Ty0 != CurTy) {
6458 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6459 return TreeEntry::NeedToGather;
6460 }
6461 }
6462
6463 // We don't combine GEPs with non-constant indexes.
6464 Type *Ty1 = VL0->getOperand(1)->getType();
6465 for (Value *V : VL) {
6466 auto *I = dyn_cast<GetElementPtrInst>(V);
6467 if (!I)
6468 continue;
6469 auto *Op = I->getOperand(1);
6470 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6471 (Op->getType() != Ty1 &&
6472 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6473 Op->getType()->getScalarSizeInBits() >
6474 DL->getIndexSizeInBits(
6475 V->getType()->getPointerAddressSpace())))) {
6476 LLVM_DEBUG(
6477 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6478 return TreeEntry::NeedToGather;
6479 }
6480 }
6481
6482 return TreeEntry::Vectorize;
6483 }
6484 case Instruction::Store: {
6485 // Check if the stores are consecutive or if we need to swizzle them.
6486 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6487 // Avoid types that are padded when being allocated as scalars, while
6488 // being packed together in a vector (such as i1).
6489 if (DL->getTypeSizeInBits(ScalarTy) !=
6490 DL->getTypeAllocSizeInBits(ScalarTy)) {
6491 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6492 return TreeEntry::NeedToGather;
6493 }
6494 // Make sure all stores in the bundle are simple - we can't vectorize
6495 // atomic or volatile stores.
6496 for (Value *V : VL) {
6497 auto *SI = cast<StoreInst>(V);
6498 if (!SI->isSimple()) {
6499 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6500 return TreeEntry::NeedToGather;
6501 }
6502 PointerOps.push_back(SI->getPointerOperand());
6503 }
6504
6505 // Check the order of pointer operands.
6506 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
6507 Value *Ptr0;
6508 Value *PtrN;
6509 if (CurrentOrder.empty()) {
6510 Ptr0 = PointerOps.front();
6511 PtrN = PointerOps.back();
6512 } else {
6513 Ptr0 = PointerOps[CurrentOrder.front()];
6514 PtrN = PointerOps[CurrentOrder.back()];
6515 }
6516 std::optional<int> Dist =
6517 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6518 // Check that the sorted pointer operands are consecutive.
6519 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6520 return TreeEntry::Vectorize;
6521 }
6522
6523 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6524 return TreeEntry::NeedToGather;
6525 }
6526 case Instruction::Call: {
6527 // Check if the calls are all to the same vectorizable intrinsic or
6528 // library function.
6529 CallInst *CI = cast<CallInst>(VL0);
6530 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6531
6532 VFShape Shape = VFShape::get(
6533 CI->getFunctionType(),
6534 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
6535 false /*HasGlobalPred*/);
6536 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6537
6538 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6539 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6540 return TreeEntry::NeedToGather;
6541 }
6542 Function *F = CI->getCalledFunction();
6543 unsigned NumArgs = CI->arg_size();
6544 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6545 for (unsigned J = 0; J != NumArgs; ++J)
6546 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
6547 ScalarArgs[J] = CI->getArgOperand(J);
6548 for (Value *V : VL) {
6549 CallInst *CI2 = dyn_cast<CallInst>(V);
6550 if (!CI2 || CI2->getCalledFunction() != F ||
6551 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
6552 (VecFunc &&
6553 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6554 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
6555 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6556 << "\n");
6557 return TreeEntry::NeedToGather;
6558 }
6559 // Some intrinsics have scalar arguments and should be same in order for
6560 // them to be vectorized.
6561 for (unsigned J = 0; J != NumArgs; ++J) {
6562 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
6563 Value *A1J = CI2->getArgOperand(J);
6564 if (ScalarArgs[J] != A1J) {
6566 << "SLP: mismatched arguments in call:" << *CI
6567 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6568 return TreeEntry::NeedToGather;
6569 }
6570 }
6571 }
6572 // Verify that the bundle operands are identical between the two calls.
6573 if (CI->hasOperandBundles() &&
6574 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6575 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6576 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6577 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6578 << "!=" << *V << '\n');
6579 return TreeEntry::NeedToGather;
6580 }
6581 }
6582
6583 return TreeEntry::Vectorize;
6584 }
6585 case Instruction::ShuffleVector: {
6586 // If this is not an alternate sequence of opcodes like add-sub
6587 // then do not vectorize this instruction.
6588 if (!S.isAltShuffle()) {
6589 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6590 return TreeEntry::NeedToGather;
6591 }
6592 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6593 LLVM_DEBUG(
6594 dbgs()
6595 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6596 "the whole alt sequence is not profitable.\n");
6597 return TreeEntry::NeedToGather;
6598 }
6599
6600 return TreeEntry::Vectorize;
6601 }
6602 default:
6603 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6604 return TreeEntry::NeedToGather;
6605 }
6606}
6607
6608namespace {
6609/// Allows to correctly handle operands of the phi nodes based on the \p Main
6610/// PHINode order of incoming basic blocks/values.
6611class PHIHandler {
6612 DominatorTree &DT;
6613 PHINode *Main = nullptr;
6614 SmallVector<Value *> Phis;
6615 SmallVector<SmallVector<Value *>> Operands;
6616
6617public:
6618 PHIHandler() = delete;
6619 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
6620 : DT(DT), Main(Main), Phis(Phis),
6621 Operands(Main->getNumIncomingValues(),
6622 SmallVector<Value *>(Phis.size(), nullptr)) {}
6623 void buildOperands() {
6624 constexpr unsigned FastLimit = 4;
6625 if (Main->getNumIncomingValues() <= FastLimit) {
6626 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6627 BasicBlock *InBB = Main->getIncomingBlock(I);
6628 if (!DT.isReachableFromEntry(InBB)) {
6629 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6630 continue;
6631 }
6632 // Prepare the operand vector.
6633 for (auto [Idx, V] : enumerate(Phis)) {
6634 auto *P = cast<PHINode>(V);
6635 if (P->getIncomingBlock(I) == InBB)
6636 Operands[I][Idx] = P->getIncomingValue(I);
6637 else
6638 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
6639 }
6640 }
6641 return;
6642 }
6643 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
6644 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6645 BasicBlock *InBB = Main->getIncomingBlock(I);
6646 if (!DT.isReachableFromEntry(InBB)) {
6647 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6648 continue;
6649 }
6650 Blocks.try_emplace(InBB).first->second.push_back(I);
6651 }
6652 for (auto [Idx, V] : enumerate(Phis)) {
6653 auto *P = cast<PHINode>(V);
6654 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
6655 BasicBlock *InBB = P->getIncomingBlock(I);
6656 if (InBB == Main->getIncomingBlock(I)) {
6657 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
6658 continue;
6659 Operands[I][Idx] = P->getIncomingValue(I);
6660 continue;
6661 }
6662 auto It = Blocks.find(InBB);
6663 if (It == Blocks.end())
6664 continue;
6665 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
6666 }
6667 }
6668 for (const auto &P : Blocks) {
6669 if (P.getSecond().size() <= 1)
6670 continue;
6671 unsigned BasicI = P.getSecond().front();
6672 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
6673 assert(all_of(enumerate(Operands[I]),
6674 [&](const auto &Data) {
6675 return !Data.value() ||
6676 Data.value() == Operands[BasicI][Data.index()];
6677 }) &&
6678 "Expected empty operands list.");
6679 Operands[I] = Operands[BasicI];
6680 }
6681 }
6682 }
6683 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
6684};
6685} // namespace
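// Illustrative example (hypothetical phis): given Main = phi [a0, BB0], [a1, BB1]
// and a second phi Q = phi [b1, BB1], [b0, BB0] in Phis, buildOperands() lines
// both up by incoming block rather than by operand position, yielding
// getOperands(0) == {a0, b0} (values flowing in from BB0) and
// getOperands(1) == {a1, b1} (values flowing in from BB1); slots for
// unreachable predecessor blocks are filled with poison instead.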
6686
6687void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6688 const EdgeInfo &UserTreeIdx) {
6689 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6690
6691 SmallVector<int> ReuseShuffleIndices;
6692 SmallVector<Value *> UniqueValues;
6693 SmallVector<Value *> NonUniqueValueVL;
6694 auto TryToFindDuplicates = [&](const InstructionsState &S,
6695 bool DoNotFail = false) {
6696 // Check that every instruction appears once in this bundle.
6697 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6698 for (Value *V : VL) {
6699 if (isConstant(V)) {
6700 ReuseShuffleIndices.emplace_back(
6701 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6702 UniqueValues.emplace_back(V);
6703 continue;
6704 }
6705 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6706 ReuseShuffleIndices.emplace_back(Res.first->second);
6707 if (Res.second)
6708 UniqueValues.emplace_back(V);
6709 }
6710 size_t NumUniqueScalarValues = UniqueValues.size();
6711 if (NumUniqueScalarValues == VL.size()) {
6712 ReuseShuffleIndices.clear();
6713 } else {
6714 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6715 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6716 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6717 "for nodes with padding.\n");
6718 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6719 return false;
6720 }
6721 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6722 if (NumUniqueScalarValues <= 1 ||
6723 (UniquePositions.size() == 1 && all_of(UniqueValues,
6724 [](Value *V) {
6725 return isa<UndefValue>(V) ||
6726 !isConstant(V);
6727 })) ||
6728 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6729 if (DoNotFail && UniquePositions.size() > 1 &&
6730 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6731 all_of(UniqueValues, [=](Value *V) {
6732 return isa<ExtractElementInst>(V) ||
6733 areAllUsersVectorized(cast<Instruction>(V),
6734 UserIgnoreList);
6735 })) {
6736 unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6737 if (PWSz == VL.size()) {
6738 ReuseShuffleIndices.clear();
6739 } else {
6740 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6741 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6742 UniqueValues.back());
6743 VL = NonUniqueValueVL;
6744 }
6745 return true;
6746 }
6747 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6748 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6749 return false;
6750 }
6751 VL = UniqueValues;
6752 }
6753 return true;
6754 };
6755
6756 InstructionsState S = getSameOpcode(VL, *TLI);
6757
6758 // Don't vectorize ephemeral values.
6759 if (!EphValues.empty()) {
6760 for (Value *V : VL) {
6761 if (EphValues.count(V)) {
6762 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6763 << ") is ephemeral.\n");
6764 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6765 return;
6766 }
6767 }
6768 }
6769
6770 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6771 // a load), in which case peek through to include it in the tree, without
6772 // ballooning over-budget.
6773 if (Depth >= RecursionMaxDepth &&
6774 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6775 VL.size() >= 4 &&
6776 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6777 return match(I,
6778 m_OneUse(m_ZExtOrSExtOrSelf(m_Load(m_Value())))) &&
6779 cast<Instruction>(I)->getOpcode() ==
6780 cast<Instruction>(S.MainOp)->getOpcode();
6781 })))) {
6782 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6783 if (TryToFindDuplicates(S))
6784 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6785 ReuseShuffleIndices);
6786 return;
6787 }
6788
6789 // Don't handle scalable vectors
6790 if (S.getOpcode() == Instruction::ExtractElement &&
6791 isa<ScalableVectorType>(
6792 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6793 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6794 if (TryToFindDuplicates(S))
6795 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6796 ReuseShuffleIndices);
6797 return;
6798 }
6799
6800 // Don't handle vectors.
6801 if (!SLPReVec && S.OpValue->getType()->isVectorTy() &&
6802 !isa<InsertElementInst>(S.OpValue)) {
6803 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6804 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6805 return;
6806 }
6807
6808 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6809 if (!SLPReVec && SI->getValueOperand()->getType()->isVectorTy()) {
6810 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6811 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6812 return;
6813 }
6814
6815 // If all of the operands are identical or constant we have a simple solution.
6816 // If we deal with insert/extract instructions, they all must have constant
6817 // indices, otherwise we should gather them, not try to vectorize.
6818 // If this is an alternate op node with 2 elements and gathered operands - do
6819 // not vectorize.
6820 auto &&NotProfitableForVectorization = [&S, this,
6821 Depth](ArrayRef<Value *> VL) {
6822 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6823 return false;
6824 if (VectorizableTree.size() < MinTreeSize)
6825 return false;
6826 if (Depth >= RecursionMaxDepth - 1)
6827 return true;
6828 // Check if all operands are extracts, part of vector node or can build a
6829 // regular vectorize node.
6830 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6831 for (Value *V : VL) {
6832 auto *I = cast<Instruction>(V);
6833 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6834 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6835 }));
6836 }
6837 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6838 if ((IsCommutative &&
6839 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6840 (!IsCommutative &&
6841 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6842 return true;
6843 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6844 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6845 auto *I1 = cast<Instruction>(VL.front());
6846 auto *I2 = cast<Instruction>(VL.back());
6847 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6848 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6849 I2->getOperand(Op));
6850 if (static_cast<unsigned>(count_if(
6851 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6852 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6853 })) >= S.MainOp->getNumOperands() / 2)
6854 return false;
6855 if (S.MainOp->getNumOperands() > 2)
6856 return true;
6857 if (IsCommutative) {
6858 // Check permuted operands.
6859 Candidates.clear();
6860 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6861 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6862 I2->getOperand((Op + 1) % E));
6863 if (any_of(
6864 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6865 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6866 }))
6867 return false;
6868 }
6869 return true;
6870 };
6871 SmallVector<unsigned> SortedIndices;
6872 BasicBlock *BB = nullptr;
6873 bool IsScatterVectorizeUserTE =
6874 UserTreeIdx.UserTE &&
6875 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6876 bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
6877 bool AreScatterAllGEPSameBlock =
6878 (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() &&
6879 VL.size() > 2 &&
6880 all_of(VL,
6881 [&BB](Value *V) {
6882 auto *I = dyn_cast<GetElementPtrInst>(V);
6883 if (!I)
6884 return doesNotNeedToBeScheduled(V);
6885 if (!BB)
6886 BB = I->getParent();
6887 return BB == I->getParent() && I->getNumOperands() == 2;
6888 }) &&
6889 BB &&
6890 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6891 SortedIndices));
6892 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
6893 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6894 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6895 S.OpValue) &&
6896 !all_of(VL, isVectorLikeInstWithConstOps)) ||
6897 NotProfitableForVectorization(VL)) {
6898 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6899 if (TryToFindDuplicates(S))
6900 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6901 ReuseShuffleIndices);
6902 return;
6903 }
6904
6905 // We now know that this is a vector of instructions of the same type from
6906 // the same block.
6907
6908 // Check if this is a duplicate of another entry.
6909 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6910 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6911 if (!E->isSame(VL)) {
6912 auto It = MultiNodeScalars.find(S.OpValue);
6913 if (It != MultiNodeScalars.end()) {
6914 auto *TEIt = find_if(It->getSecond(),
6915 [&](TreeEntry *ME) { return ME->isSame(VL); });
6916 if (TEIt != It->getSecond().end())
6917 E = *TEIt;
6918 else
6919 E = nullptr;
6920 } else {
6921 E = nullptr;
6922 }
6923 }
6924 if (!E) {
6925 if (!doesNotNeedToBeScheduled(S.OpValue)) {
6926 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6927 if (TryToFindDuplicates(S))
6928 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6929 ReuseShuffleIndices);
6930 return;
6931 }
6932 } else {
6933 // Record the reuse of the tree node. FIXME, currently this is only used
6934 // to properly draw the graph rather than for the actual vectorization.
6935 E->UserTreeIndices.push_back(UserTreeIdx);
6936 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6937 << ".\n");
6938 return;
6939 }
6940 }
6941
6942 // Check that none of the instructions in the bundle are already in the tree.
6943 for (Value *V : VL) {
6944 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6945 doesNotNeedToBeScheduled(V))
6946 continue;
6947 if (getTreeEntry(V)) {
6948 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6949 << ") is already in tree.\n");
6950 if (TryToFindDuplicates(S))
6951 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6952 ReuseShuffleIndices);
6953 return;
6954 }
6955 }
6956
6957 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6958 if (UserIgnoreList && !UserIgnoreList->empty()) {
6959 for (Value *V : VL) {
6960 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6961 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6962 if (TryToFindDuplicates(S))
6963 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6964 ReuseShuffleIndices);
6965 return;
6966 }
6967 }
6968 }
6969
6970 // Special processing for sorted pointers for ScatterVectorize node with
6971 // constant indices only.
6972 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
6973 assert(S.OpValue->getType()->isPointerTy() &&
6974 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6975 "Expected pointers only.");
6976 // Reset S to make it GetElementPtr kind of node.
6977 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
6978 assert(It != VL.end() && "Expected at least one GEP.");
6979 S = getSameOpcode(*It, *TLI);
6980 }
6981
6982 // Check that all of the users of the scalars that we want to vectorize are
6983 // schedulable.
6984 auto *VL0 = cast<Instruction>(S.OpValue);
6985 BB = VL0->getParent();
6986
6987 if (!DT->isReachableFromEntry(BB)) {
6988 // Don't go into unreachable blocks. They may contain instructions with
6989 // dependency cycles which confuse the final scheduling.
6990 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6991 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6992 return;
6993 }
6994
6995 // Don't go into catchswitch blocks, which can happen with PHIs.
6996 // Such blocks can only have PHIs and the catchswitch. There is no
6997 // place to insert a shuffle if we need to, so just avoid that issue.
6998 if (isa<CatchSwitchInst>(BB->getTerminator())) {
6999 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
7000 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7001 return;
7002 }
7003
7004 // Check that every instruction appears once in this bundle.
7005 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
7006 return;
7007
7008 // Perform specific checks for each particular instruction kind.
7009 OrdersType CurrentOrder;
7010 SmallVector<Value *> PointerOps;
7011 TreeEntry::EntryState State = getScalarsVectorizationState(
7012 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
7013 if (State == TreeEntry::NeedToGather) {
7014 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7015 ReuseShuffleIndices);
7016 return;
7017 }
7018
7019 auto &BSRef = BlocksSchedules[BB];
7020 if (!BSRef)
7021 BSRef = std::make_unique<BlockScheduling>(BB);
7022
7023 BlockScheduling &BS = *BSRef;
7024
7025 std::optional<ScheduleData *> Bundle =
7026 BS.tryScheduleBundle(UniqueValues, this, S);
7027#ifdef EXPENSIVE_CHECKS
7028 // Make sure we didn't break any internal invariants
7029 BS.verify();
7030#endif
7031 if (!Bundle) {
7032 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
7033 assert((!BS.getScheduleData(VL0) ||
7034 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
7035 "tryScheduleBundle should cancelScheduling on failure");
7036 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7037 ReuseShuffleIndices);
7038 NonScheduledFirst.insert(VL.front());
7039 return;
7040 }
7041 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
7042
7043 unsigned ShuffleOrOp = S.isAltShuffle() ?
7044 (unsigned) Instruction::ShuffleVector : S.getOpcode();
7045 switch (ShuffleOrOp) {
7046 case Instruction::PHI: {
7047 auto *PH = cast<PHINode>(VL0);
7048
7049 TreeEntry *TE =
7050 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
7051 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
7052
7053 // Keeps the reordered operands to avoid code duplication.
7054 PHIHandler Handler(*DT, PH, VL);
7055 Handler.buildOperands();
7056 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7057 TE->setOperand(I, Handler.getOperands(I));
7058 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7059 buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I});
7060 return;
7061 }
7062 case Instruction::ExtractValue:
7063 case Instruction::ExtractElement: {
7064 if (CurrentOrder.empty()) {
7065 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
7066 } else {
7067 LLVM_DEBUG({
7068 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
7069 "with order";
7070 for (unsigned Idx : CurrentOrder)
7071 dbgs() << " " << Idx;
7072 dbgs() << "\n";
7073 });
7074 fixupOrderingIndices(CurrentOrder);
7075 }
7076 // Insert new order with initial value 0, if it does not exist,
7077 // otherwise return the iterator to the existing one.
7078 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7079 ReuseShuffleIndices, CurrentOrder);
7080 // This is a special case, as it does not gather, but at the same time
7081 // we are not extending buildTree_rec() towards the operands.
7082 ValueList Op0;
7083 Op0.assign(VL.size(), VL0->getOperand(0));
7084 VectorizableTree.back()->setOperand(0, Op0);
7085 return;
7086 }
7087 case Instruction::InsertElement: {
7088 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
7089
7090 auto OrdCompare = [](const std::pair<int, int> &P1,
7091 const std::pair<int, int> &P2) {
7092 return P1.first > P2.first;
7093 };
7094 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
7095 decltype(OrdCompare)>
7096 Indices(OrdCompare);
7097 for (int I = 0, E = VL.size(); I < E; ++I) {
7098 unsigned Idx = *getElementIndex(VL[I]);
7099 Indices.emplace(Idx, I);
7100 }
7101 OrdersType CurrentOrder(VL.size(), VL.size());
7102 bool IsIdentity = true;
7103 for (int I = 0, E = VL.size(); I < E; ++I) {
7104 CurrentOrder[Indices.top().second] = I;
7105 IsIdentity &= Indices.top().second == I;
7106 Indices.pop();
7107 }
7108 if (IsIdentity)
7109 CurrentOrder.clear();
7110 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7111 std::nullopt, CurrentOrder);
7112 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
7113
7114 TE->setOperandsInOrder();
7115 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
7116 return;
7117 }
7118 case Instruction::Load: {
7119 // Check that a vectorized load would load the same memory as a scalar
7120 // load. For example, we don't want to vectorize loads that are smaller
7121 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7122 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7123 // from such a struct, we read/write packed bits disagreeing with the
7124 // unvectorized version.
7125 TreeEntry *TE = nullptr;
7126 fixupOrderingIndices(CurrentOrder);
7127 switch (State) {
7128 case TreeEntry::Vectorize:
7129 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7130 ReuseShuffleIndices, CurrentOrder);
7131 if (CurrentOrder.empty())
7132 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
7133 else
7134 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
7135 TE->setOperandsInOrder();
7136 break;
7137 case TreeEntry::StridedVectorize:
7138 // Vectorizing non-consecutive loads as strided loads.
7139 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
7140 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
7141 TE->setOperandsInOrder();
7142 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
7143 break;
7144 case TreeEntry::ScatterVectorize:
7145 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
7146 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
7147 UserTreeIdx, ReuseShuffleIndices);
7148 TE->setOperandsInOrder();
7149 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
7150 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
7151 break;
7152 case TreeEntry::NeedToGather:
7153 llvm_unreachable("Unexpected loads state.");
7154 }
7155 return;
7156 }
7157 case Instruction::ZExt:
7158 case Instruction::SExt:
7159 case Instruction::FPToUI:
7160 case Instruction::FPToSI:
7161 case Instruction::FPExt:
7162 case Instruction::PtrToInt:
7163 case Instruction::IntToPtr:
7164 case Instruction::SIToFP:
7165 case Instruction::UIToFP:
7166 case Instruction::Trunc:
7167 case Instruction::FPTrunc:
7168 case Instruction::BitCast: {
7169 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
7170 std::make_pair(std::numeric_limits<unsigned>::min(),
7171 std::numeric_limits<unsigned>::max()));
7172 if (ShuffleOrOp == Instruction::ZExt ||
7173 ShuffleOrOp == Instruction::SExt) {
7174 CastMaxMinBWSizes = std::make_pair(
7175 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7176 PrevMaxBW),
7177 std::min<unsigned>(
7178 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7179 PrevMinBW));
7180 } else if (ShuffleOrOp == Instruction::Trunc) {
7181 CastMaxMinBWSizes = std::make_pair(
7182 std::max<unsigned>(
7183 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7184 PrevMaxBW),
7185 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7186 PrevMinBW));
7187 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7188 } else if (ShuffleOrOp == Instruction::SIToFP ||
7189 ShuffleOrOp == Instruction::UIToFP) {
7190 unsigned NumSignBits =
7191 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7192 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7193 APInt Mask = DB->getDemandedBits(OpI);
7194 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
7195 }
7196 if (NumSignBits * 2 >=
7197 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7198 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7199 }
7200 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7201 ReuseShuffleIndices);
7202 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
7203
7204 TE->setOperandsInOrder();
7205 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7206 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7207 return;
7208 }
7209 case Instruction::ICmp:
7210 case Instruction::FCmp: {
7211 // Check that all of the compares have the same predicate.
7212 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7213 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7214 ReuseShuffleIndices);
7215 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
7216
7217 ValueList Left, Right;
7218 if (cast<CmpInst>(VL0)->isCommutative()) {
7219 // Commutative predicate - collect + sort operands of the instructions
7220 // so that each side is more likely to have the same opcode.
7222 "Commutative Predicate mismatch");
7223 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7224 } else {
7225 // Collect operands - commute if it uses the swapped predicate.
7226 for (Value *V : VL) {
7227 auto *Cmp = cast<CmpInst>(V);
7228 Value *LHS = Cmp->getOperand(0);
7229 Value *RHS = Cmp->getOperand(1);
7230 if (Cmp->getPredicate() != P0)
7231 std::swap(LHS, RHS);
7232 Left.push_back(LHS);
7233 Right.push_back(RHS);
7234 }
7235 }
7236 TE->setOperand(0, Left);
7237 TE->setOperand(1, Right);
7238 buildTree_rec(Left, Depth + 1, {TE, 0});
7239 buildTree_rec(Right, Depth + 1, {TE, 1});
7240 if (ShuffleOrOp == Instruction::ICmp) {
7241 unsigned NumSignBits0 =
7242 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7243 if (NumSignBits0 * 2 >=
7244 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7245 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
7246 unsigned NumSignBits1 =
7247 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
7248 if (NumSignBits1 * 2 >=
7249 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
7250 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
7251 }
7252 return;
7253 }
7254 case Instruction::Select:
7255 case Instruction::FNeg:
7256 case Instruction::Add:
7257 case Instruction::FAdd:
7258 case Instruction::Sub:
7259 case Instruction::FSub:
7260 case Instruction::Mul:
7261 case Instruction::FMul:
7262 case Instruction::UDiv:
7263 case Instruction::SDiv:
7264 case Instruction::FDiv:
7265 case Instruction::URem:
7266 case Instruction::SRem:
7267 case Instruction::FRem:
7268 case Instruction::Shl:
7269 case Instruction::LShr:
7270 case Instruction::AShr:
7271 case Instruction::And:
7272 case Instruction::Or:
7273 case Instruction::Xor: {
7274 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7275 ReuseShuffleIndices);
7276 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
7277
7278 // Sort operands of the instructions so that each side is more likely to
7279 // have the same opcode.
7280 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
7281 ValueList Left, Right;
7282 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7283 TE->setOperand(0, Left);
7284 TE->setOperand(1, Right);
7285 buildTree_rec(Left, Depth + 1, {TE, 0});
7286 buildTree_rec(Right, Depth + 1, {TE, 1});
7287 return;
7288 }
7289
7290 TE->setOperandsInOrder();
7291 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7292 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7293 return;
7294 }
7295 case Instruction::GetElementPtr: {
7296 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7297 ReuseShuffleIndices);
7298 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
7299 SmallVector<ValueList, 2> Operands(2);
7300 // Prepare the operand vector for pointer operands.
7301 for (Value *V : VL) {
7302 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7303 if (!GEP) {
7304 Operands.front().push_back(V);
7305 continue;
7306 }
7307 Operands.front().push_back(GEP->getPointerOperand());
7308 }
7309 TE->setOperand(0, Operands.front());
7310 // Need to cast all indices to the same type before vectorization to
7311 // avoid a crash.
7312 // Required to be able to find correct matches between different gather
7313 // nodes and reuse the vectorized values rather than trying to gather them
7314 // again.
7315 int IndexIdx = 1;
7316 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7317 Type *Ty = all_of(VL,
7318 [VL0Ty, IndexIdx](Value *V) {
7319 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7320 if (!GEP)
7321 return true;
7322 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
7323 })
7324 ? VL0Ty
7325 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
7326 ->getPointerOperandType()
7327 ->getScalarType());
7328 // Prepare the operand vector.
7329 for (Value *V : VL) {
7330 auto *I = dyn_cast<GetElementPtrInst>(V);
7331 if (!I) {
7332 Operands.back().push_back(
7333 ConstantInt::get(Ty, 0, /*isSigned=*/false));
7334 continue;
7335 }
7336 auto *Op = I->getOperand(IndexIdx);
7337 auto *CI = dyn_cast<ConstantInt>(Op);
7338 if (!CI)
7339 Operands.back().push_back(Op);
7340 else
7341 Operands.back().push_back(ConstantFoldIntegerCast(
7342 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7343 }
7344 TE->setOperand(IndexIdx, Operands.back());
7345
7346 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7347 buildTree_rec(Operands[I], Depth + 1, {TE, I});
7348 return;
7349 }
7350 case Instruction::Store: {
7351 bool Consecutive = CurrentOrder.empty();
7352 if (!Consecutive)
7353 fixupOrderingIndices(CurrentOrder);
7354 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7355 ReuseShuffleIndices, CurrentOrder);
7356 TE->setOperandsInOrder();
7357 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
7358 if (Consecutive)
7359 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7360 else
7361 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7362 return;
7363 }
7364 case Instruction::Call: {
7365 // Check if the calls are all to the same vectorizable intrinsic or
7366 // library function.
7367 CallInst *CI = cast<CallInst>(VL0);
7368 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7369
7370 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7371 ReuseShuffleIndices);
7372 // Sort operands of the instructions so that each side is more likely to
7373 // have the same opcode.
7374 if (isCommutative(VL0)) {
7375 ValueList Left, Right;
7376 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7377 TE->setOperand(0, Left);
7378 TE->setOperand(1, Right);
7379 SmallVector<ValueList> Operands;
7380 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7381 Operands.emplace_back();
7382 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7383 continue;
7384 for (Value *V : VL) {
7385 auto *CI2 = cast<CallInst>(V);
7386 Operands.back().push_back(CI2->getArgOperand(I));
7387 }
7388 TE->setOperand(I, Operands.back());
7389 }
7390 buildTree_rec(Left, Depth + 1, {TE, 0});
7391 buildTree_rec(Right, Depth + 1, {TE, 1});
7392 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7393 if (Operands[I - 2].empty())
7394 continue;
7395 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7396 }
7397 return;
7398 }
7399 TE->setOperandsInOrder();
7400 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7401 // For scalar operands no need to create an entry since no need to
7402 // vectorize it.
7403 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7404 continue;
7405 ValueList Operands;
7406 // Prepare the operand vector.
7407 for (Value *V : VL) {
7408 auto *CI2 = cast<CallInst>(V);
7409 Operands.push_back(CI2->getArgOperand(I));
7410 }
7411 buildTree_rec(Operands, Depth + 1, {TE, I});
7412 }
7413 return;
7414 }
7415 case Instruction::ShuffleVector: {
7416 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7417 ReuseShuffleIndices);
7418 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7419
7420 // Reorder operands if reordering would enable vectorization.
7421 auto *CI = dyn_cast<CmpInst>(VL0);
7422 if (isa<BinaryOperator>(VL0) || CI) {
7423 ValueList Left, Right;
7424 if (!CI || all_of(VL, [](Value *V) {
7425 return cast<CmpInst>(V)->isCommutative();
7426 })) {
7427 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7428 } else {
7429 auto *MainCI = cast<CmpInst>(S.MainOp);
7430 auto *AltCI = cast<CmpInst>(S.AltOp);
7431 CmpInst::Predicate MainP = MainCI->getPredicate();
7432 CmpInst::Predicate AltP = AltCI->getPredicate();
7433 assert(MainP != AltP &&
7434 "Expected different main/alternate predicates.");
7435 // Collect operands - commute if it uses the swapped predicate or
7436 // alternate operation.
7437 for (Value *V : VL) {
7438 auto *Cmp = cast<CmpInst>(V);
7439 Value *LHS = Cmp->getOperand(0);
7440 Value *RHS = Cmp->getOperand(1);
7441
7442 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7443 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7444 std::swap(LHS, RHS);
7445 } else {
7446 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7447 std::swap(LHS, RHS);
7448 }
7449 Left.push_back(LHS);
7450 Right.push_back(RHS);
7451 }
7452 }
7453 TE->setOperand(0, Left);
7454 TE->setOperand(1, Right);
7455 buildTree_rec(Left, Depth + 1, {TE, 0});
7456 buildTree_rec(Right, Depth + 1, {TE, 1});
7457 return;
7458 }
7459
7460 TE->setOperandsInOrder();
7461 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7462 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7463 return;
7464 }
7465 default:
7466 break;
7467 }
7468 llvm_unreachable("Unexpected vectorization of the instructions.");
7469}
7470
7471unsigned BoUpSLP::canMapToVector(Type *T) const {
7472 unsigned N = 1;
7473 Type *EltTy = T;
7474
7475 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7476 if (auto *ST = dyn_cast<StructType>(EltTy)) {
7477 // Check that struct is homogeneous.
7478 for (const auto *Ty : ST->elements())
7479 if (Ty != *ST->element_begin())
7480 return 0;
7481 N *= ST->getNumElements();
7482 EltTy = *ST->element_begin();
7483 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7484 N *= AT->getNumElements();
7485 EltTy = AT->getElementType();
7486 } else {
7487 auto *VT = cast<FixedVectorType>(EltTy);
7488 N *= VT->getNumElements();
7489 EltTy = VT->getElementType();
7490 }
7491 }
7492
7493 if (!isValidElementType(EltTy))
7494 return 0;
7495 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
7496 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7497 VTSize != DL->getTypeStoreSizeInBits(T))
7498 return 0;
7499 return N;
7500}
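// Worked example (illustrative; assumes MinVecRegSize = 128 and
// MaxVecRegSize = 256 bits): for T = [4 x float], N becomes 4 and EltTy float;
// the widened type <4 x float> has a 128-bit store size, which matches the
// store size of T and lies within the register-size bounds, so 4 is returned.
// A non-homogeneous aggregate such as { i32, float } fails the homogeneity
// check and yields 0.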
7501
7502bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7503 SmallVectorImpl<unsigned> &CurrentOrder,
7504 bool ResizeAllowed) const {
7505 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7506 assert(It != VL.end() && "Expected at least one extract instruction.");
7507 auto *E0 = cast<Instruction>(*It);
7508 assert(
7509 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7510 "Invalid opcode");
7511 // Check if all of the extracts come from the same vector and from the
7512 // correct offset.
7513 Value *Vec = E0->getOperand(0);
7514
7515 CurrentOrder.clear();
7516
7517 // We have to extract from a vector/aggregate with the same number of elements.
7518 unsigned NElts;
7519 if (E0->getOpcode() == Instruction::ExtractValue) {
7520 NElts = canMapToVector(Vec->getType());
7521 if (!NElts)
7522 return false;
7523 // Check if load can be rewritten as load of vector.
7524 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7525 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7526 return false;
7527 } else {
7528 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7529 }
7530
7531 unsigned E = VL.size();
7532 if (!ResizeAllowed && NElts != E)
7533 return false;
7534 SmallVector<int> Indices(E, PoisonMaskElem);
7535 unsigned MinIdx = NElts, MaxIdx = 0;
7536 for (auto [I, V] : enumerate(VL)) {
7537 auto *Inst = dyn_cast<Instruction>(V);
7538 if (!Inst)
7539 continue;
7540 if (Inst->getOperand(0) != Vec)
7541 return false;
7542 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7543 if (isa<UndefValue>(EE->getIndexOperand()))
7544 continue;
7545 std::optional<unsigned> Idx = getExtractIndex(Inst);
7546 if (!Idx)
7547 return false;
7548 const unsigned ExtIdx = *Idx;
7549 if (ExtIdx >= NElts)
7550 continue;
7551 Indices[I] = ExtIdx;
7552 if (MinIdx > ExtIdx)
7553 MinIdx = ExtIdx;
7554 if (MaxIdx < ExtIdx)
7555 MaxIdx = ExtIdx;
7556 }
7557 if (MaxIdx - MinIdx + 1 > E)
7558 return false;
7559 if (MaxIdx + 1 <= E)
7560 MinIdx = 0;
7561
7562 // Check that all of the indices extract from the correct offset.
7563 bool ShouldKeepOrder = true;
7564 // Assign to all items the initial value E so we can check if the extract
7565 // instruction index was used already.
7566 // Also, later we can check that all the indices are used and we have a
7567 // consecutive access in the extract instructions, by checking that no
7568 // element of CurrentOrder still has value E.
7569 CurrentOrder.assign(E, E);
7570 for (unsigned I = 0; I < E; ++I) {
7571 if (Indices[I] == PoisonMaskElem)
7572 continue;
7573 const unsigned ExtIdx = Indices[I] - MinIdx;
7574 if (CurrentOrder[ExtIdx] != E) {
7575 CurrentOrder.clear();
7576 return false;
7577 }
7578 ShouldKeepOrder &= ExtIdx == I;
7579 CurrentOrder[ExtIdx] = I;
7580 }
7581 if (ShouldKeepOrder)
7582 CurrentOrder.clear();
7583
7584 return ShouldKeepOrder;
7585}
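// Worked example (illustrative): for VL of four extractelement instructions
// reading lanes {2, 0, 1, 3} of the same 4-element vector, the indices form a
// non-identity permutation, so the function returns false with
// CurrentOrder = {1, 2, 0, 3} (CurrentOrder[lane] = position in VL of the
// extract reading that lane); for lanes {0, 1, 2, 3} the order is the
// identity, CurrentOrder is cleared and true is returned, meaning the extracts
// can be reused without an extra shuffle.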
7586
7587bool BoUpSLP::areAllUsersVectorized(
7588 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7589 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7590 all_of(I->users(), [this](User *U) {
7591 return ScalarToTreeEntry.contains(U) ||
7592 isVectorLikeInstWithConstOps(U) ||
7593 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7594 });
7595}
7596
7597static std::pair<InstructionCost, InstructionCost>
7598getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7599 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7600 ArrayRef<Type *> ArgTys) {
7601 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7602
7603 // Calculate the cost of the scalar and vector calls.
7604 FastMathFlags FMF;
7605 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7606 FMF = FPCI->getFastMathFlags();
7607 SmallVector<const Value *> Arguments(CI->args());
7608 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7609 dyn_cast<IntrinsicInst>(CI));
7610 auto IntrinsicCost =
7611 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
7612
7613 auto Shape = VFShape::get(CI->getFunctionType(),
7614 ElementCount::getFixed(VecTy->getNumElements()),
7615 false /*HasGlobalPred*/);
7616 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7617 auto LibCost = IntrinsicCost;
7618 if (!CI->isNoBuiltin() && VecFunc) {
7619 // Calculate the cost of the vector library call.
7620 // If the corresponding vector call is cheaper, return its cost.
7621 LibCost =
7622 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7623 }
7624 return {IntrinsicCost, LibCost};
7625}
7626
7627void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7628 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7629 SmallVectorImpl<Value *> *OpScalars,
7630 SmallVectorImpl<Value *> *AltScalars) const {
7631 unsigned Sz = Scalars.size();
7632 Mask.assign(Sz, PoisonMaskElem);
7633 SmallVector<int> OrderMask;
7634 if (!ReorderIndices.empty())
7635 inversePermutation(ReorderIndices, OrderMask);
7636 for (unsigned I = 0; I < Sz; ++I) {
7637 unsigned Idx = I;
7638 if (!ReorderIndices.empty())
7639 Idx = OrderMask[I];
7640 auto *OpInst = cast<Instruction>(Scalars[Idx]);
7641 if (IsAltOp(OpInst)) {
7642 Mask[I] = Sz + Idx;
7643 if (AltScalars)
7644 AltScalars->push_back(OpInst);
7645 } else {
7646 Mask[I] = Idx;
7647 if (OpScalars)
7648 OpScalars->push_back(OpInst);
7649 }
7650 }
7651 if (!ReuseShuffleIndices.empty()) {
7652 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7653 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7654 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7655 });
7656 Mask.swap(NewMask);
7657 }
7658}
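// Example for buildAltOpShuffleMask above: for Scalars = {add, sub, add, sub}
// with no reordering and IsAltOp matching the subs, the resulting Mask is
// {0, Sz + 1, 2, Sz + 3} = {0, 5, 2, 7} for Sz = 4, i.e. main-opcode lanes
// select from the first vector and alternate-opcode lanes from the second.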
7659
7660static bool isAlternateInstruction(const Instruction *I,
7661 const Instruction *MainOp,
7662 const Instruction *AltOp,
7663 const TargetLibraryInfo &TLI) {
7664 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7665 auto *AltCI = cast<CmpInst>(AltOp);
7666 CmpInst::Predicate MainP = MainCI->getPredicate();
7667 CmpInst::Predicate AltP = AltCI->getPredicate();
7668 assert(MainP != AltP && "Expected different main/alternate predicates.");
7669 auto *CI = cast<CmpInst>(I);
7670 if (isCmpSameOrSwapped(MainCI, CI, TLI))
7671 return false;
7672 if (isCmpSameOrSwapped(AltCI, CI, TLI))
7673 return true;
7674 CmpInst::Predicate P = CI->getPredicate();
7675 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7676
7677 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7678 "CmpInst expected to match either main or alternate predicate or "
7679 "their swap.");
7680 (void)AltP;
7681 return MainP != P && MainP != SwappedP;
7682 }
7683 return I->getOpcode() == AltOp->getOpcode();
7684}
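// Example for the non-compare path of isAlternateInstruction above: with
// MainOp = add and AltOp = sub, an add instruction returns false (it matches
// the main opcode) and a sub instruction returns true (the alternate opcode).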
7685
7686TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7687 assert(!Ops.empty());
7688 const auto *Op0 = Ops.front();
7689
7690 const bool IsConstant = all_of(Ops, [](Value *V) {
7691 // TODO: We should allow undef elements here
7692 return isConstant(V) && !isa<UndefValue>(V);
7693 });
7694 const bool IsUniform = all_of(Ops, [=](Value *V) {
7695 // TODO: We should allow undef elements here
7696 return V == Op0;
7697 });
7698 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7699 // TODO: We should allow undef elements here
7700 if (auto *CI = dyn_cast<ConstantInt>(V))
7701 return CI->getValue().isPowerOf2();
7702 return false;
7703 });
7704 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7705 // TODO: We should allow undef elements here
7706 if (auto *CI = dyn_cast<ConstantInt>(V))
7707 return CI->getValue().isNegatedPowerOf2();
7708 return false;
7709 });
7710
7711 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7712 if (IsConstant && IsUniform)
7713 VK = TTI::OK_UniformConstantValue;
7714 else if (IsConstant)
7715 VK = TTI::OK_NonUniformConstantValue;
7716 else if (IsUniform)
7717 VK = TTI::OK_UniformValue;
7718
7719 TTI::OperandValueProperties VP = TTI::OP_None;
7720 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7721 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7722
7723 return {VK, VP};
7724}
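// Example for getOperandInfo above: Ops = {i32 4, i32 4, i32 4, i32 4} yields
// {OK_UniformConstantValue, OP_PowerOf2}, while four copies of the same
// non-constant value yield {OK_UniformValue, OP_None}.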
7725
7726namespace {
7727/// The base class for shuffle instruction emission and shuffle cost estimation.
7728class BaseShuffleAnalysis {
7729protected:
7730 /// Checks if the mask is an identity mask.
7731 /// \param IsStrict if is true the function returns false if mask size does
7732 /// not match vector size.
7733 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7734 bool IsStrict) {
7735 int Limit = Mask.size();
7736 int VF = VecTy->getNumElements();
7737 int Index = -1;
7738 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7739 return true;
7740 if (!IsStrict) {
7741 // Consider extract subvector starting from index 0.
7743 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) && Index == 0)
7744 return true;
7745 // All VF-size submasks are identity (e.g.
7746 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7747 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7748 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7749 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7750 ShuffleVectorInst::isIdentityMask(Slice, VF);
7751 }))
7752 return true;
7753 }
7754 return false;
7755 }
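// Example for isIdentityMask above: with IsStrict == false and VF == 4, the
// mask <0, 1, 2, 3, poison, poison, poison, poison> is accepted as identity,
// since every VF-sized submask is either all-poison or itself an identity.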
7756
7757 /// Tries to combine 2 different masks into a single one.
7758 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7759 /// change the size of the vector, \p LocalVF is the original size of the
7760 /// shuffled vector.
7761 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7762 ArrayRef<int> ExtMask) {
7763 unsigned VF = Mask.size();
7764 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7765 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7766 if (ExtMask[I] == PoisonMaskElem)
7767 continue;
7768 int MaskedIdx = Mask[ExtMask[I] % VF];
7769 NewMask[I] =
7770 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7771 }
7772 Mask.swap(NewMask);
7773 }
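// Example for combineMasks above: for LocalVF = 4, Mask = <2, 3, 0, 1> and
// ExtMask = <1, 0, 3, 2>, the combined mask is <Mask[1], Mask[0], Mask[3],
// Mask[2]> = <3, 2, 1, 0>, i.e. a single shuffle applying Mask first and
// ExtMask second.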
7774
7775 /// Looks through shuffles trying to reduce the final number of shuffles in the
7776 /// code. The function looks through the previously emitted shuffle
7777 /// instructions and properly marks indices in the mask as undef.
7778 /// For example, given the code
7779 /// \code
7780 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7781 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7782 /// \endcode
7783 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7784 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7785 /// <0, 1, 2, 3> for the shuffle.
7786 /// If 2 operands are of different size, the smallest one will be resized and
7787 /// the mask recalculated properly.
7788 /// For example, given the code
7789 /// \code
7790 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7791 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7792 /// \endcode
7793 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
7794 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7795 /// <0, 1, 2, 3> for the shuffle.
7796 /// So, it tries to transform permutations to simple vector merge, if
7797 /// possible.
7798 /// \param V The input vector which must be shuffled using the given \p Mask.
7799 /// If the better candidate is found, \p V is set to this best candidate
7800 /// vector.
7801 /// \param Mask The input mask for the shuffle. If the best candidate is found
7802 /// during looking-through-shuffles attempt, it is updated accordingly.
7803 /// \param SinglePermute true if the shuffle operation is originally a
7804 /// single-value-permutation. In this case the look-through-shuffles procedure
7805 /// may look for resizing shuffles as the best candidates.
7806 /// \return true if the shuffle results in the non-resizing identity shuffle
7807 /// (and thus can be ignored), false - otherwise.
7808 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7809 bool SinglePermute) {
7810 Value *Op = V;
7811 ShuffleVectorInst *IdentityOp = nullptr;
7812 SmallVector<int> IdentityMask;
7813 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7814 // Exit if not a fixed vector type or changing size shuffle.
7815 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7816 if (!SVTy)
7817 break;
7818 // Remember the identity or broadcast mask, if it is not a resizing
7819 // shuffle. If no better candidates are found, this Op and Mask will be
7820 // used in the final shuffle.
7821 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7822 if (!IdentityOp || !SinglePermute ||
7823 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7824 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7825 IdentityMask.size()))) {
7826 IdentityOp = SV;
7827 // Store the current mask in IdentityMask so that we do not lose this
7828 // info later if IdentityOp is selected as the best candidate for the
7829 // permutation.
7830 IdentityMask.assign(Mask);
7831 }
7832 }
7833 // Remember the broadcast mask. If no better candidates are found, this Op
7834 // and Mask will be used in the final shuffle.
7835 // Zero splat can be used as identity too, since it might be used with
7836 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7837 // E.g., if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
7838 // expensive, and the analysis finds out that the source vector is just a
7839 // broadcast, this original mask can be transformed to the identity mask <0,
7840 // 1, 2, 3>.
7841 // \code
7842 // %0 = shuffle %v, poison, zeroinitializer
7843 // %res = shuffle %0, poison, <3, 1, 2, 0>
7844 // \endcode
7845 // may be transformed to
7846 // \code
7847 // %0 = shuffle %v, poison, zeroinitializer
7848 // %res = shuffle %0, poison, <0, 1, 2, 3>
7849 // \endcode
7850 if (SV->isZeroEltSplat()) {
7851 IdentityOp = SV;
7852 IdentityMask.assign(Mask);
7853 }
7854 int LocalVF = Mask.size();
7855 if (auto *SVOpTy =
7856 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7857 LocalVF = SVOpTy->getNumElements();
7858 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7859 for (auto [Idx, I] : enumerate(Mask)) {
7860 if (I == PoisonMaskElem ||
7861 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7862 continue;
7863 ExtMask[Idx] = SV->getMaskValue(I);
7864 }
7865 bool IsOp1Undef =
7866 isUndefVector(SV->getOperand(0),
7867 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7868 .all();
7869 bool IsOp2Undef =
7870 isUndefVector(SV->getOperand(1),
7871 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7872 .all();
7873 if (!IsOp1Undef && !IsOp2Undef) {
7874 // Update mask and mark undef elems.
7875 for (int &I : Mask) {
7876 if (I == PoisonMaskElem)
7877 continue;
7878 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
7879 PoisonMaskElem)
7880 I = PoisonMaskElem;
7881 }
7882 break;
7883 }
7884 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7885 SV->getShuffleMask().end());
7886 combineMasks(LocalVF, ShuffleMask, Mask);
7887 Mask.swap(ShuffleMask);
7888 if (IsOp2Undef)
7889 Op = SV->getOperand(0);
7890 else
7891 Op = SV->getOperand(1);
7892 }
7893 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
7894 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7895 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
7896 if (IdentityOp) {
7897 V = IdentityOp;
7898 assert(Mask.size() == IdentityMask.size() &&
7899 "Expected masks of same sizes.");
7900 // Clear known poison elements.
7901 for (auto [I, Idx] : enumerate(Mask))
7902 if (Idx == PoisonMaskElem)
7903 IdentityMask[I] = PoisonMaskElem;
7904 Mask.swap(IdentityMask);
7905 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7906 return SinglePermute &&
7907 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
7908 /*IsStrict=*/true) ||
7909 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7910 Shuffle->isZeroEltSplat() &&
7911 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
7912 }
7913 V = Op;
7914 return false;
7915 }
7916 V = Op;
7917 return true;
7918 }
7919
7920 /// Smart shuffle instruction emission, walks through shuffles trees and
7921 /// tries to find the best matching vector for the actual shuffle
7922 /// instruction.
7923 template <typename T, typename ShuffleBuilderTy>
7924 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7925 ShuffleBuilderTy &Builder) {
7926 assert(V1 && "Expected at least one vector value.");
7927 if (V2)
7928 Builder.resizeToMatch(V1, V2);
7929 int VF = Mask.size();
7930 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
7931 VF = FTy->getNumElements();
7932 if (V2 &&
7933 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
7934 // Peek through shuffles.
7935 Value *Op1 = V1;
7936 Value *Op2 = V2;
7937 int VF =
7938 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7939 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7940 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7941 for (int I = 0, E = Mask.size(); I < E; ++I) {
7942 if (Mask[I] < VF)
7943 CombinedMask1[I] = Mask[I];
7944 else
7945 CombinedMask2[I] = Mask[I] - VF;
7946 }
7947 Value *PrevOp1;
7948 Value *PrevOp2;
7949 do {
7950 PrevOp1 = Op1;
7951 PrevOp2 = Op2;
7952 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
7953 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
7954 // Check if we have 2 resizing shuffles - need to peek through operands
7955 // again.
7956 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7957 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7958 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7959 for (auto [Idx, I] : enumerate(CombinedMask1)) {
7960 if (I == PoisonMaskElem)
7961 continue;
7962 ExtMask1[Idx] = SV1->getMaskValue(I);
7963 }
7964 SmallBitVector UseMask1 = buildUseMask(
7965 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7966 ->getNumElements(),
7967 ExtMask1, UseMask::SecondArg);
7968 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7969 for (auto [Idx, I] : enumerate(CombinedMask2)) {
7970 if (I == PoisonMaskElem)
7971 continue;
7972 ExtMask2[Idx] = SV2->getMaskValue(I);
7973 }
7974 SmallBitVector UseMask2 = buildUseMask(
7975 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7976 ->getNumElements(),
7977 ExtMask2, UseMask::SecondArg);
7978 if (SV1->getOperand(0)->getType() ==
7979 SV2->getOperand(0)->getType() &&
7980 SV1->getOperand(0)->getType() != SV1->getType() &&
7981 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
7982 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
7983 Op1 = SV1->getOperand(0);
7984 Op2 = SV2->getOperand(0);
7985 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7986 SV1->getShuffleMask().end());
7987 int LocalVF = ShuffleMask1.size();
7988 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
7989 LocalVF = FTy->getNumElements();
7990 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7991 CombinedMask1.swap(ShuffleMask1);
7992 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7993 SV2->getShuffleMask().end());
7994 LocalVF = ShuffleMask2.size();
7995 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
7996 LocalVF = FTy->getNumElements();
7997 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7998 CombinedMask2.swap(ShuffleMask2);
7999 }
8000 }
8001 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
8002 Builder.resizeToMatch(Op1, Op2);
8003 VF = std::max(cast<VectorType>(Op1->getType())
8004 ->getElementCount()
8005 .getKnownMinValue(),
8006 cast<VectorType>(Op2->getType())
8007 ->getElementCount()
8008 .getKnownMinValue());
8009 for (int I = 0, E = Mask.size(); I < E; ++I) {
8010 if (CombinedMask2[I] != PoisonMaskElem) {
8011 assert(CombinedMask1[I] == PoisonMaskElem &&
8012 "Expected undefined mask element");
8013 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
8014 }
8015 }
8016 if (Op1 == Op2 &&
8017 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
8018 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
8019 isa<ShuffleVectorInst>(Op1) &&
8020 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
8021 ArrayRef(CombinedMask1))))
8022 return Builder.createIdentity(Op1);
8023 return Builder.createShuffleVector(
8024 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
8025 CombinedMask1);
8026 }
8027 if (isa<PoisonValue>(V1))
8028 return Builder.createPoison(
8029 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
8030 SmallVector<int> NewMask(Mask.begin(), Mask.end());
8031 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
8032 assert(V1 && "Expected non-null value after looking through shuffles.");
8033
8034 if (!IsIdentity)
8035 return Builder.createShuffleVector(V1, NewMask);
8036 return Builder.createIdentity(V1);
8037 }
8038};
8039} // namespace
8040
8041/// Returns the cost of the shuffle instructions with the given \p Kind, vector
8042/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
8043/// subvector pattern.
8044static InstructionCost
8045getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
8046 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
8047 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
8048 int Index = 0, VectorType *SubTp = nullptr,
8049 ArrayRef<const Value *> Args = std::nullopt) {
8050 if (Kind != TTI::SK_PermuteTwoSrc)
8051 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8052 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
8053 int NumSubElts;
8054 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
8055 Mask, NumSrcElts, NumSubElts, Index)) {
8056 if (Index + NumSubElts > NumSrcElts &&
8057 Index + NumSrcElts <= static_cast<int>(Mask.size()))
8058 return TTI.getShuffleCost(
8059 TTI::SK_InsertSubvector,
8060 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
8061 TTI::TCK_RecipThroughput, Index, SubTp);
8062 }
8063 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8064}
8065
8066/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
8067static std::pair<InstructionCost, InstructionCost>
8069 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
8070 Type *ScalarTy, VectorType *VecTy) {
8071 InstructionCost ScalarCost = 0;
8072 InstructionCost VecCost = 0;
8073 // Here we differentiate two cases: (1) when Ptrs represent a regular
8074 // vectorization tree node (as they are pointer arguments of scattered
8075 // loads) or (2) when Ptrs are the arguments of loads or stores being
8076 // vectorized as a plain wide unit-stride load/store since all the
8077 // loads/stores are known to be from/to adjacent locations.
8078 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
8079 // Case 2: estimate costs for pointer related costs when vectorizing to
8080 // a wide load/store.
8081 // Scalar cost is estimated as a set of pointers with known relationship
8082 // between them.
8083 // For vector code we will use BasePtr as argument for the wide load/store
8084 // but we also need to account all the instructions which are going to
8085 // stay in vectorized code due to uses outside of these scalar
8086 // loads/stores.
8087 ScalarCost = TTI.getPointersChainCost(
8088 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
8089 CostKind);
8090
8091 SmallVector<const Value *> PtrsRetainedInVecCode;
8092 for (Value *V : Ptrs) {
8093 if (V == BasePtr) {
8094 PtrsRetainedInVecCode.push_back(V);
8095 continue;
8096 }
8097 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8098 // For simplicity, assume Ptr stays in vectorized code if it's not a
8099 // GEP instruction. We don't care, since its cost is considered free.
8100 // TODO: We should check for any uses outside of vectorizable tree
8101 // rather than just single use.
8102 if (!Ptr || !Ptr->hasOneUse())
8103 PtrsRetainedInVecCode.push_back(V);
8104 }
8105
8106 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
8107 // If all pointers stay in vectorized code then we don't have
8108 // any savings on that.
8109 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
8110 }
8111 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
8112 TTI::PointersChainInfo::getKnownStride(),
8113 VecTy, CostKind);
8114 } else {
8115 // Case 1: Ptrs are the arguments of loads that we are going to transform
8116 // into masked gather load intrinsic.
8117 // All the scalar GEPs will be removed as a result of vectorization.
8118 // For any external uses of some lanes extract element instructions will
8119 // be generated (which cost is estimated separately).
8120 TTI::PointersChainInfo PtrsInfo =
8121 all_of(Ptrs,
8122 [](const Value *V) {
8123 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8124 return Ptr && !Ptr->hasAllConstantIndices();
8125 })
8126 ? TTI::PointersChainInfo::getUnknownStride()
8127 : TTI::PointersChainInfo::getKnownStride();
8128
8129 ScalarCost =
8130 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
8131 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
8132 if (!BaseGEP) {
8133 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
8134 if (It != Ptrs.end())
8135 BaseGEP = cast<GEPOperator>(*It);
8136 }
8137 if (BaseGEP) {
8138 SmallVector<const Value *> Indices(BaseGEP->indices());
8139 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
8140 BaseGEP->getPointerOperand(), Indices, VecTy,
8141 CostKind);
8142 }
8143 }
8144
8145 return std::make_pair(ScalarCost, VecCost);
8146}
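// Example for getGEPCosts above: for four loads from %p, %p+1, %p+2, %p+3
// vectorized as one wide load, the scalar cost covers the whole unit-stride
// pointer chain, while the vector cost only accounts for the pointers that
// stay in the vectorized code (the base pointer and any GEP with more than
// one use); if every pointer stays, both costs are reported as free.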
8147
8148void BoUpSLP::transformNodes() {
8149 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8150 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8151 TreeEntry &E = *TE;
8152 switch (E.getOpcode()) {
8153 case Instruction::Load: {
8154 // No need to reorder masked gather loads, just reorder the scalar
8155 // operands.
8156 if (E.State != TreeEntry::Vectorize)
8157 break;
8158 Type *ScalarTy = E.getMainOp()->getType();
8159 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
8160 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
8161 // Check if profitable to represent consecutive load + reverse as strided
8162 // load with stride -1.
8163 if (isReverseOrder(E.ReorderIndices) &&
8164 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8165 SmallVector<int> Mask;
8166 inversePermutation(E.ReorderIndices, Mask);
8167 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
8168 InstructionCost OriginalVecCost =
8169 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
8170 BaseLI->getPointerAddressSpace(), CostKind,
8171 TTI::OperandValueInfo()) +
8172 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8173 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8174 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
8175 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
8176 if (StridedCost < OriginalVecCost)
8177 // Strided load is more profitable than consecutive load + reverse -
8178 // transform the node to strided load.
8179 E.State = TreeEntry::StridedVectorize;
8180 }
8181 break;
8182 }
8183 case Instruction::Store: {
8184 Type *ScalarTy =
8185 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
8186 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
8187 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
8188 // Check if profitable to represent consecutive store + reverse as strided
8189 // store with stride -1.
8190 if (isReverseOrder(E.ReorderIndices) &&
8191 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8192 SmallVector<int> Mask;
8193 inversePermutation(E.ReorderIndices, Mask);
8194 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
8195 InstructionCost OriginalVecCost =
8196 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
8197 BaseSI->getPointerAddressSpace(), CostKind,
8198 TTI::OperandValueInfo()) +
8199 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8200 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8201 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
8202 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
8203 if (StridedCost < OriginalVecCost)
8204 // Strided store is more profitable than consecutive store + reverse -
8205 // transform the node to strided store.
8206 E.State = TreeEntry::StridedVectorize;
8207 }
8208 break;
8209 }
8210 default:
8211 break;
8212 }
8213 }
8214}
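// Example for transformNodes above: a node of consecutive loads taken in
// reverse order (a[3], a[2], a[1], a[0]) is normally a wide load of a[0..3]
// plus a reverse shuffle; when the target supports strided accesses and the
// strided cost is lower, the node is switched to StridedVectorize, i.e. a
// single strided load with stride -1 (and likewise for reversed stores).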
8215
8216/// Merges shuffle masks and emits the final shuffle instruction, if required.
8217/// It supports shuffling of 2 input vectors. It implements lazy shuffle
8218/// emission: the actual shuffle instruction is generated only if it is really
8219/// required. Otherwise, the shuffle instruction emission is delayed till the
8220/// end of the process, to reduce the number of emitted instructions and enable
8221/// further analysis/transformations.
8222class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
8223 bool IsFinalized = false;
8224 SmallVector<int> CommonMask;
8225 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
8226 Type *ScalarTy = nullptr;
8227 const TargetTransformInfo &TTI;
8228 InstructionCost Cost = 0;
8229 SmallDenseSet<Value *> VectorizedVals;
8230 BoUpSLP &R;
8231 SmallPtrSetImpl<Value *> &CheckedExtracts;
8232 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8233 /// While set, we are still trying to estimate the cost for the same nodes and
8234 /// can delay the actual cost estimation (virtual shuffle instruction emission).
8235 /// This may help to better estimate the cost if the same nodes must be permuted
8236 /// and allows moving most of the long shuffle cost estimation to TTI.
8237 bool SameNodesEstimated = true;
8238
8239 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
8240 if (Ty->getScalarType()->isPointerTy()) {
8241 Constant *Res = ConstantExpr::getIntToPtr(
8242 ConstantInt::getAllOnesValue(IntegerType::get(
8243 Ty->getContext(),
8244 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
8245 Ty->getScalarType());
8246 if (auto *VTy = dyn_cast<VectorType>(Ty))
8247 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
8248 return Res;
8249 }
8250 return Constant::getAllOnesValue(Ty);
8251 }
8252
8253 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
8254 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
8255 return TTI::TCC_Free;
8256 auto *VecTy = getWidenedType(ScalarTy, VL.size());
8257 InstructionCost GatherCost = 0;
8258 SmallVector<Value *> Gathers(VL.begin(), VL.end());
8259 // Improve gather cost for gather of loads, if we can group some of the
8260 // loads into vector loads.
8261 InstructionsState S = getSameOpcode(VL, *R.TLI);
8262 const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
8263 unsigned MinVF = R.getMinVF(2 * Sz);
8264 if (VL.size() > 2 &&
8265 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8266 (InVectors.empty() &&
8267 any_of(seq<unsigned>(0, VL.size() / MinVF),
8268 [&](unsigned Idx) {
8269 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
8270 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
8271 return S.getOpcode() == Instruction::Load &&
8272 !S.isAltShuffle();
8273 }))) &&
8274 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
8275 !isSplat(Gathers)) {
8276 InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
8277 SetVector<Value *> VectorizedLoads;
8278 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
8279 SmallVector<unsigned> ScatterVectorized;
8280 unsigned StartIdx = 0;
8281 unsigned VF = VL.size() / 2;
8282 for (; VF >= MinVF; VF /= 2) {
8283 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
8284 Cnt += VF) {
8285 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
8286 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8287 InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
8288 if (SliceS.getOpcode() != Instruction::Load ||
8289 SliceS.isAltShuffle())
8290 continue;
8291 }
8292 if (!VectorizedLoads.count(Slice.front()) &&
8293 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
8294 SmallVector<Value *> PointerOps;
8295 OrdersType CurrentOrder;
8296 LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
8297 CurrentOrder, PointerOps);
8298 switch (LS) {
8299 case LoadsState::Vectorize:
8300 case LoadsState::ScatterVectorize:
8301 case LoadsState::StridedVectorize:
8302 // Mark the vectorized loads so that we don't vectorize them
8303 // again.
8304 // TODO: better handling of loads with reorders.
8305 if (((LS == LoadsState::Vectorize ||
8306 LS == LoadsState::StridedVectorize) &&
8307 CurrentOrder.empty()) ||
8308 (LS == LoadsState::StridedVectorize &&
8309 isReverseOrder(CurrentOrder)))
8310 VectorizedStarts.emplace_back(Cnt, LS);
8311 else
8312 ScatterVectorized.push_back(Cnt);
8313 VectorizedLoads.insert(Slice.begin(), Slice.end());
8314 // If we vectorized initial block, no need to try to vectorize
8315 // it again.
8316 if (Cnt == StartIdx)
8317 StartIdx += VF;
8318 break;
8319 case LoadsState::Gather:
8320 break;
8321 }
8322 }
8323 }
8324 // Check if the whole array was vectorized already - exit.
8325 if (StartIdx >= VL.size())
8326 break;
8327 // Found vectorizable parts - exit.
8328 if (!VectorizedLoads.empty())
8329 break;
8330 }
8331 if (!VectorizedLoads.empty()) {
8332 unsigned NumParts = TTI.getNumberOfParts(VecTy);
8333 bool NeedInsertSubvectorAnalysis =
8334 !NumParts || (VL.size() / VF) > NumParts;
8335 // Get the cost for gathered loads.
8336 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
8337 if (VectorizedLoads.contains(VL[I]))
8338 continue;
8339 GatherCost +=
8340 getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
8341 }
8342 // Exclude potentially vectorized loads from list of gathered
8343 // scalars.
8344 Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
8345 // The cost for vectorized loads.
8346 InstructionCost ScalarsCost = 0;
8347 for (Value *V : VectorizedLoads) {
8348 auto *LI = cast<LoadInst>(V);
8349 ScalarsCost +=
8350 TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
8351 LI->getAlign(), LI->getPointerAddressSpace(),
8352 CostKind, TTI::OperandValueInfo(), LI);
8353 }
8354 auto *LoadTy = getWidenedType(VL.front()->getType(), VF);
8355 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
8356 auto *LI = cast<LoadInst>(VL[P.first]);
8357 Align Alignment = LI->getAlign();
8358 GatherCost +=
8359 P.second == LoadsState::Vectorize
8360 ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
8361 LI->getPointerAddressSpace(), CostKind,
8362 TTI::OperandValueInfo(), LI)
8363 : TTI.getStridedMemoryOpCost(
8364 Instruction::Load, LoadTy, LI->getPointerOperand(),
8365 /*VariableMask=*/false, Alignment, CostKind, LI);
8366 // Estimate GEP cost.
8367 SmallVector<Value *> PointerOps(VF);
8368 for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
8369 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8370 auto [ScalarGEPCost, VectorGEPCost] =
8371 getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
8372 Instruction::Load, CostKind, LI->getType(), LoadTy);
8373 GatherCost += VectorGEPCost - ScalarGEPCost;
8374 }
8375 for (unsigned P : ScatterVectorized) {
8376 auto *LI0 = cast<LoadInst>(VL[P]);
8377 ArrayRef<Value *> Slice = VL.slice(P, VF);
8378 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8379 GatherCost += TTI.getGatherScatterOpCost(
8380 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8381 /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
8382 // Estimate GEP cost.
8383 SmallVector<Value *> PointerOps(VF);
8384 for (auto [I, V] : enumerate(Slice))
8385 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8386 OrdersType Order;
8387 if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
8388 Order)) {
8389 // TODO: improve checks if GEPs can be vectorized.
8390 Value *Ptr0 = PointerOps.front();
8391 Type *ScalarTy = Ptr0->getType();
8392 auto *VecTy = getWidenedType(ScalarTy, VF);
8393 auto [ScalarGEPCost, VectorGEPCost] =
8394 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
8395 CostKind, ScalarTy, VecTy);
8396 GatherCost += VectorGEPCost - ScalarGEPCost;
8397 if (!Order.empty()) {
8398 SmallVector<int> Mask;
8399 inversePermutation(Order, Mask);
8400 GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8401 VecTy, Mask, CostKind);
8402 }
8403 } else {
8404 GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
8405 PointerOps.front()->getType());
8406 }
8407 }
8408 if (NeedInsertSubvectorAnalysis) {
8409 // Add the cost for the subvectors insert.
8410 SmallVector<int> ShuffleMask(VL.size());
8411 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8412 for (unsigned Idx : seq<unsigned>(0, E))
8413 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8414 GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
8415 ShuffleMask, CostKind, I, LoadTy);
8416 }
8417 }
8418 GatherCost -= ScalarsCost;
8419 }
8420 GatherCost = std::min(BaseCost, GatherCost);
8421 } else if (!Root && isSplat(VL)) {
8422 // Found a broadcast of a single scalar; calculate the cost as
8423 // the broadcast.
8424 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8425 assert(It != VL.end() && "Expected at least one non-undef value.");
8426 // Add broadcast for non-identity shuffle only.
8427 bool NeedShuffle =
8428 count(VL, *It) > 1 &&
8429 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
8430 if (!NeedShuffle)
8431 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
8432 CostKind, std::distance(VL.begin(), It),
8433 PoisonValue::get(VecTy), *It);
8434
8435 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8436 transform(VL, ShuffleMask.begin(), [](Value *V) {
8437 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8438 });
8439 InstructionCost InsertCost =
8440 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
8441 PoisonValue::get(VecTy), *It);
8442 return InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast,
8443 VecTy, ShuffleMask, CostKind,
8444 /*Index=*/0, /*SubTp=*/nullptr,
8445 /*Args=*/*It);
8446 }
8447 return GatherCost +
8448 (all_of(Gathers, IsaPred<UndefValue>)
8449 ? TTI::TCC_Free
8450 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
8451 ScalarTy));
8452 };
8453
8454 /// Compute the cost of creating a vector containing the extracted values from
8455 /// \p VL.
8456 InstructionCost
8457 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8458 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8459 unsigned NumParts) {
8460 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8461 unsigned NumElts =
8462 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8463 auto *EE = dyn_cast<ExtractElementInst>(V);
8464 if (!EE)
8465 return Sz;
8466 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8467 if (!VecTy)
8468 return Sz;
8469 return std::max(Sz, VecTy->getNumElements());
8470 });
8471 // FIXME: this must be moved to TTI for better estimation.
8472 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
8473 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
8474 SmallVectorImpl<unsigned> &Indices)
8475 -> std::optional<TTI::ShuffleKind> {
8476 if (NumElts <= EltsPerVector)
8477 return std::nullopt;
8478 int OffsetReg0 =
8479 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
8480 [](int S, int I) {
8481 if (I == PoisonMaskElem)
8482 return S;
8483 return std::min(S, I);
8484 }),
8485 EltsPerVector);
8486 int OffsetReg1 = OffsetReg0;
8487 DenseSet<int> RegIndices;
8488 // Check if we are trying to permute the same single/two input vectors.
8489 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8490 int FirstRegId = -1;
8491 Indices.assign(1, OffsetReg0);
8492 for (auto [Pos, I] : enumerate(Mask)) {
8493 if (I == PoisonMaskElem)
8494 continue;
8495 int Idx = I - OffsetReg0;
8496 int RegId =
8497 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
8498 if (FirstRegId < 0)
8499 FirstRegId = RegId;
8500 RegIndices.insert(RegId);
8501 if (RegIndices.size() > 2)
8502 return std::nullopt;
8503 if (RegIndices.size() == 2) {
8504 ShuffleKind = TTI::SK_PermuteTwoSrc;
8505 if (Indices.size() == 1) {
8506 OffsetReg1 = alignDown(
8507 std::accumulate(
8508 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
8509 [&](int S, int I) {
8510 if (I == PoisonMaskElem)
8511 return S;
8512 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
8513 ((I - OffsetReg0) % NumElts) / EltsPerVector;
8514 if (RegId == FirstRegId)
8515 return S;
8516 return std::min(S, I);
8517 }),
8518 EltsPerVector);
8519 Indices.push_back(OffsetReg1 % NumElts);
8520 }
8521 Idx = I - OffsetReg1;
8522 }
8523 I = (Idx % NumElts) % EltsPerVector +
8524 (RegId == FirstRegId ? 0 : EltsPerVector);
8525 }
8526 return ShuffleKind;
8527 };
8528 InstructionCost Cost = 0;
8529
8530 // Process extracts in blocks of EltsPerVector to check if the source vector
8531 // operand can be re-used directly. If not, add the cost of creating a
8532 // shuffle to extract the values into a vector register.
8533 for (unsigned Part : seq<unsigned>(NumParts)) {
8534 if (!ShuffleKinds[Part])
8535 continue;
8536 ArrayRef<int> MaskSlice = Mask.slice(
8537 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
8538 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8539 copy(MaskSlice, SubMask.begin());
8540 SmallVector<unsigned, 2> Indices;
8541 std::optional<TTI::ShuffleKind> RegShuffleKind =
8542 CheckPerRegistersShuffle(SubMask, Indices);
8543 if (!RegShuffleKind) {
8544 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
8545 !ShuffleVectorInst::isIdentityMask(
8546 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
8547 Cost +=
8548 ::getShuffleCost(TTI, *ShuffleKinds[Part],
8549 getWidenedType(ScalarTy, NumElts), MaskSlice);
8550 continue;
8551 }
8552 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8553 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
8554 Cost +=
8555 ::getShuffleCost(TTI, *RegShuffleKind,
8556 getWidenedType(ScalarTy, EltsPerVector), SubMask);
8557 }
8558 for (unsigned Idx : Indices) {
8559 assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
8560 "SK_ExtractSubvector index out of range");
8561 Cost += ::getShuffleCost(
8562 TTI, TTI::SK_ExtractSubvector,
8563 getWidenedType(ScalarTy, alignTo(NumElts, EltsPerVector)),
8564 std::nullopt, CostKind, Idx,
8565 getWidenedType(ScalarTy, EltsPerVector));
8566 }
8567 // Second attempt to check if just a permute has a better cost estimate
8568 // than a subvector extract.
8569 SubMask.assign(NumElts, PoisonMaskElem);
8570 copy(MaskSlice, SubMask.begin());
8571 InstructionCost OriginalCost = ::getShuffleCost(
8572 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
8573 if (OriginalCost < Cost)
8574 Cost = OriginalCost;
8575 }
8576 return Cost;
8577 }
8578 /// Transforms the mask \p CommonMask according to the given \p Mask to make it
8579 /// the proper mask after shuffle emission.
8580 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8581 ArrayRef<int> Mask) {
8582 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8583 if (Mask[Idx] != PoisonMaskElem)
8584 CommonMask[Idx] = Idx;
8585 }
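// Example for transformMaskAfterShuffle above: once a shuffle for
// CommonMask = <2, poison, 0, 1> has been accounted for, the mask becomes
// <0, poison, 2, 3>, so subsequent shuffles address the lanes of the
// just-produced vector directly.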
8586 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
8587 /// mask \p Mask, register number \p Part, that includes \p SliceSize
8588 /// elements.
8589 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8590 ArrayRef<int> Mask, unsigned Part,
8591 unsigned SliceSize) {
8592 if (SameNodesEstimated) {
8593 // Delay the cost estimation if the same nodes are being reshuffled.
8594 // If we already requested the cost of reshuffling of E1 and E2 before, no
8595 // need to estimate another cost with the sub-Mask, instead include this
8596 // sub-Mask into the CommonMask to estimate it later and avoid double cost
8597 // estimation.
8598 if ((InVectors.size() == 2 &&
8599 InVectors.front().get<const TreeEntry *>() == &E1 &&
8600 InVectors.back().get<const TreeEntry *>() == E2) ||
8601 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8602 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
8603 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
8604 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8605 "Expected all poisoned elements.");
8606 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
8607 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8608 return;
8609 }
8610 // Found non-matching nodes - need to estimate the cost for the matched
8611 // and transform mask.
8612 Cost += createShuffle(InVectors.front(),
8613 InVectors.size() == 1 ? nullptr : InVectors.back(),
8614 CommonMask);
8615 transformMaskAfterShuffle(CommonMask, CommonMask);
8616 }
8617 SameNodesEstimated = false;
8618 if (!E2 && InVectors.size() == 1) {
8619 unsigned VF = E1.getVectorFactor();
8620 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8621 VF = std::max(VF,
8622 cast<FixedVectorType>(V1->getType())->getNumElements());
8623 } else {
8624 const auto *E = InVectors.front().get<const TreeEntry *>();
8625 VF = std::max(VF, E->getVectorFactor());
8626 }
8627 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8628 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8629 CommonMask[Idx] = Mask[Idx] + VF;
8630 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8631 transformMaskAfterShuffle(CommonMask, CommonMask);
8632 } else {
8633 Cost += createShuffle(&E1, E2, Mask);
8634 transformMaskAfterShuffle(CommonMask, Mask);
8635 }
8636 }
8637
8638 class ShuffleCostBuilder {
8639 const TargetTransformInfo &TTI;
8640
8641 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8642 int Index = -1;
8643 return Mask.empty() ||
8644 (VF == Mask.size() &&
8645 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
8646 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
8647 Index == 0);
8648 }
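// Example for isEmptyOrIdentity above: an empty mask, the mask <0, 1, 2, 3>
// with VF == 4, or an extract-subvector mask starting at index 0 such as
// <0, 1> are all treated as identity, so the shuffles below are costed as
// TCC_Free.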
8649
8650 public:
8651 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8652 ~ShuffleCostBuilder() = default;
8653 InstructionCost createShuffleVector(Value *V1, Value *,
8654 ArrayRef<int> Mask) const {
8655 // Empty mask or identity mask are free.
8656 unsigned VF =
8657 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8658 if (isEmptyOrIdentity(Mask, VF))
8659 return TTI::TCC_Free;
8660 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8661 cast<VectorType>(V1->getType()), Mask);
8662 }
8663 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8664 // Empty mask or identity mask are free.
8665 unsigned VF =
8666 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8667 if (isEmptyOrIdentity(Mask, VF))
8668 return TTI::TCC_Free;
8669 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8670 cast<VectorType>(V1->getType()), Mask);
8671 }
8672 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8673 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8674 return TTI::TCC_Free;
8675 }
8676 void resizeToMatch(Value *&, Value *&) const {}
8677 };
8678
8679 /// Smart shuffle instruction emission, walks through shuffles trees and
8680 /// tries to find the best matching vector for the actual shuffle
8681 /// instruction.
8682 InstructionCost
8683 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8684 const PointerUnion<Value *, const TreeEntry *> &P2,
8685 ArrayRef<int> Mask) {
8686 ShuffleCostBuilder Builder(TTI);
8687 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8688 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8689 unsigned CommonVF = Mask.size();
8690 InstructionCost ExtraCost = 0;
8691 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
8692 unsigned VF) -> InstructionCost {
8693 if (E.isGather() && allConstant(E.Scalars))
8694 return TTI::TCC_Free;
8695 Type *EScalarTy = E.Scalars.front()->getType();
8696 bool IsSigned = true;
8697 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8698 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
8699 IsSigned = It->second.second;
8700 }
8701 if (EScalarTy != ScalarTy) {
8702 unsigned CastOpcode = Instruction::Trunc;
8703 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8704 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8705 if (DstSz > SrcSz)
8706 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8707 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
8708 getWidenedType(EScalarTy, VF),
8709 TTI::CastContextHint::None, CostKind);
8710 }
8711 return TTI::TCC_Free;
8712 };
8713 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
8714 if (isa<Constant>(V))
8715 return TTI::TCC_Free;
8716 auto *VecTy = cast<VectorType>(V->getType());
8717 Type *EScalarTy = VecTy->getElementType();
8718 if (EScalarTy != ScalarTy) {
8719 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
8720 unsigned CastOpcode = Instruction::Trunc;
8721 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8722 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8723 if (DstSz > SrcSz)
8724 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8725 return TTI.getCastInstrCost(
8726 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
8727 VecTy, TTI::CastContextHint::None, CostKind);
8728 }
8729 return TTI::TCC_Free;
8730 };
8731 if (!V1 && !V2 && !P2.isNull()) {
8732 // Shuffle 2 entry nodes.
8733 const TreeEntry *E = P1.get<const TreeEntry *>();
8734 unsigned VF = E->getVectorFactor();
8735 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8736 CommonVF = std::max(VF, E2->getVectorFactor());
8737 assert(all_of(Mask,
8738 [=](int Idx) {
8739 return Idx < 2 * static_cast<int>(CommonVF);
8740 }) &&
8741 "All elements in mask must be less than 2 * CommonVF.");
8742 if (E->Scalars.size() == E2->Scalars.size()) {
8743 SmallVector<int> EMask = E->getCommonMask();
8744 SmallVector<int> E2Mask = E2->getCommonMask();
8745 if (!EMask.empty() || !E2Mask.empty()) {
8746 for (int &Idx : CommonMask) {
8747 if (Idx == PoisonMaskElem)
8748 continue;
8749 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8750 Idx = EMask[Idx];
8751 else if (Idx >= static_cast<int>(CommonVF))
8752 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8753 E->Scalars.size();
8754 }
8755 }
8756 CommonVF = E->Scalars.size();
8757 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8758 GetNodeMinBWAffectedCost(*E2, CommonVF);
8759 } else {
8760 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8761 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8762 }
8763 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8764 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8765 } else if (!V1 && P2.isNull()) {
8766 // Shuffle single entry node.
8767 const TreeEntry *E = P1.get<const TreeEntry *>();
8768 unsigned VF = E->getVectorFactor();
8769 CommonVF = VF;
8770 assert(
8771 all_of(Mask,
8772 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8773 "All elements in mask must be less than CommonVF.");
8774 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8775 SmallVector<int> EMask = E->getCommonMask();
8776 assert(!EMask.empty() && "Expected non-empty common mask.");
8777 for (int &Idx : CommonMask) {
8778 if (Idx != PoisonMaskElem)
8779 Idx = EMask[Idx];
8780 }
8781 CommonVF = E->Scalars.size();
8782 }
8783 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8784 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8785 // Not identity/broadcast? Try to see if the original vector is better.
8786 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8787 CommonVF == CommonMask.size() &&
8788 any_of(enumerate(CommonMask),
8789 [](const auto &&P) {
8790 return P.value() != PoisonMaskElem &&
8791 static_cast<unsigned>(P.value()) != P.index();
8792 }) &&
8793 any_of(CommonMask,
8794 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8795 SmallVector<int> ReorderMask;
8796 inversePermutation(E->ReorderIndices, ReorderMask);
8797 ::addMask(CommonMask, ReorderMask);
8798 }
8799 } else if (V1 && P2.isNull()) {
8800 // Shuffle single vector.
8801 ExtraCost += GetValueMinBWAffectedCost(V1);
8802 CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
8803 assert(
8804 all_of(Mask,
8805 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8806 "All elements in mask must be less than CommonVF.");
8807 } else if (V1 && !V2) {
8808 // Shuffle vector and tree node.
8809 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8810 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8811 CommonVF = std::max(VF, E2->getVectorFactor());
8812 assert(all_of(Mask,
8813 [=](int Idx) {
8814 return Idx < 2 * static_cast<int>(CommonVF);
8815 }) &&
8816 "All elements in mask must be less than 2 * CommonVF.");
8817 if (E2->Scalars.size() == VF && VF != CommonVF) {
8818 SmallVector<int> E2Mask = E2->getCommonMask();
8819 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8820 for (int &Idx : CommonMask) {
8821 if (Idx == PoisonMaskElem)
8822 continue;
8823 if (Idx >= static_cast<int>(CommonVF))
8824 Idx = E2Mask[Idx - CommonVF] + VF;
8825 }
8826 CommonVF = VF;
8827 }
8828 ExtraCost += GetValueMinBWAffectedCost(V1);
8829 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8830 ExtraCost += GetNodeMinBWAffectedCost(
8831 *E2, std::min(CommonVF, E2->getVectorFactor()));
8832 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8833 } else if (!V1 && V2) {
8834 // Shuffle vector and tree node.
8835 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8836 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8837 CommonVF = std::max(VF, E1->getVectorFactor());
8838 assert(all_of(Mask,
8839 [=](int Idx) {
8840 return Idx < 2 * static_cast<int>(CommonVF);
8841 }) &&
8842 "All elements in mask must be less than 2 * CommonVF.");
8843 if (E1->Scalars.size() == VF && VF != CommonVF) {
8844 SmallVector<int> E1Mask = E1->getCommonMask();
8845 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8846 for (int &Idx : CommonMask) {
8847 if (Idx == PoisonMaskElem)
8848 continue;
8849 if (Idx >= static_cast<int>(CommonVF))
8850 Idx = E1Mask[Idx - CommonVF] + VF;
8851 else
8852 Idx = E1Mask[Idx];
8853 }
8854 CommonVF = VF;
8855 }
8856 ExtraCost += GetNodeMinBWAffectedCost(
8857 *E1, std::min(CommonVF, E1->getVectorFactor()));
8858 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8859 ExtraCost += GetValueMinBWAffectedCost(V2);
8860 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8861 } else {
8862 assert(V1 && V2 && "Expected both vectors.");
8863 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8864 CommonVF =
8865 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8866 assert(all_of(Mask,
8867 [=](int Idx) {
8868 return Idx < 2 * static_cast<int>(CommonVF);
8869 }) &&
8870 "All elements in mask must be less than 2 * CommonVF.");
8871 ExtraCost +=
8872 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8873 if (V1->getType() != V2->getType()) {
8874 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8875 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8876 } else {
8877 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
8878 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
8879 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
8880 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
8881 }
8882 }
8883 InVectors.front() =
8884 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
8885 if (InVectors.size() == 2)
8886 InVectors.pop_back();
8887 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8888 V1, V2, CommonMask, Builder);
8889 }
8890
8891public:
8892 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
8893 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8894 SmallPtrSetImpl<Value *> &CheckedExtracts)
8895 : ScalarTy(ScalarTy), TTI(TTI),
8896 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8897 CheckedExtracts(CheckedExtracts) {}
8898 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8899 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8900 unsigned NumParts, bool &UseVecBaseAsInput) {
8901 UseVecBaseAsInput = false;
8902 if (Mask.empty())
8903 return nullptr;
8904 Value *VecBase = nullptr;
8905 ArrayRef<Value *> VL = E->Scalars;
8906 // If the resulting type is scalarized, do not adjust the cost.
8907 if (NumParts == VL.size())
8908 return nullptr;
8909 // Check if it can be considered reused if same extractelements were
8910 // vectorized already.
8911 bool PrevNodeFound = any_of(
8912 ArrayRef(R.VectorizableTree).take_front(E->Idx),
8913 [&](const std::unique_ptr<TreeEntry> &TE) {
8914 return ((!TE->isAltShuffle() &&
8915 TE->getOpcode() == Instruction::ExtractElement) ||
8916 TE->isGather()) &&
8917 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8918 return VL.size() > Data.index() &&
8919 (Mask[Data.index()] == PoisonMaskElem ||
8920 isa<UndefValue>(VL[Data.index()]) ||
8921 Data.value() == VL[Data.index()]);
8922 });
8923 });
8924 SmallPtrSet<Value *, 4> UniqueBases;
8925 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
8926 for (unsigned Part : seq<unsigned>(NumParts)) {
8927 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
8928 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
8929 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
8930 // Ignore non-extractelement scalars.
8931 if (isa<UndefValue>(V) ||
8932 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8933 continue;
8934 // If all users of the instruction are going to be vectorized and this
8935 // instruction itself is not going to be vectorized, consider this
8936 // instruction as dead and remove its cost from the final cost of the
8937 // vectorized tree.
8938 // Also, avoid adjusting the cost for extractelements with multiple uses
8939 // in different graph entries.
8940 auto *EE = cast<ExtractElementInst>(V);
8941 VecBase = EE->getVectorOperand();
8942 UniqueBases.insert(VecBase);
8943 const TreeEntry *VE = R.getTreeEntry(V);
8944 if (!CheckedExtracts.insert(V).second ||
8945 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8946 any_of(EE->users(),
8947 [&](User *U) {
8948 return isa<GetElementPtrInst>(U) &&
8949 !R.areAllUsersVectorized(cast<Instruction>(U),
8950 &VectorizedVals);
8951 }) ||
8952 (VE && VE != E))
8953 continue;
8954 std::optional<unsigned> EEIdx = getExtractIndex(EE);
8955 if (!EEIdx)
8956 continue;
8957 unsigned Idx = *EEIdx;
8958 // Take credit for instruction that will become dead.
8959 if (EE->hasOneUse() || !PrevNodeFound) {
8960 Instruction *Ext = EE->user_back();
8961 if (isa<SExtInst, ZExtInst>(Ext) &&
8962 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8963 // Use getExtractWithExtendCost() to calculate the cost of
8964 // extractelement/ext pair.
8965 Cost -=
8966 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
8967 EE->getVectorOperandType(), Idx);
8968 // Add back the cost of s|zext which is subtracted separately.
8969 Cost += TTI.getCastInstrCost(
8970 Ext->getOpcode(), Ext->getType(), EE->getType(),
8971 TTI::getCastContextHint(Ext), CostKind, Ext);
8972 continue;
8973 }
8974 }
8975 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
8976 CostKind, Idx);
8977 }
8978 }
8979 // Check that the gather of extractelements can be represented as just a
8980 // shuffle of a single vector or of two vectors the scalars are extracted from.
8981 // We have found the bunch of extractelement instructions that must be gathered
8982 // into a vector and can be represented as a permutation of the elements of a
8983 // single input vector or of 2 input vectors.
8984 // This is already accounted for if the same extractelements were vectorized before.
8985 if (!PrevNodeFound)
8986 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8987 InVectors.assign(1, E);
8988 CommonMask.assign(Mask.begin(), Mask.end());
8989 transformMaskAfterShuffle(CommonMask, CommonMask);
8990 SameNodesEstimated = false;
8991 if (NumParts != 1 && UniqueBases.size() != 1) {
8992 UseVecBaseAsInput = true;
8993 VecBase =
8994 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
8995 }
8996 return VecBase;
8997 }
8998 /// Checks if the specified entry \p E needs to be delayed because of its
8999 /// dependency nodes.
9000 std::optional<InstructionCost>
9001 needToDelay(const TreeEntry *,
9002 ArrayRef<SmallVector<const TreeEntry *>>) const {
9003 // No need to delay the cost estimation during analysis.
9004 return std::nullopt;
9005 }
9006 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
9007 if (&E1 == &E2) {
9008 assert(all_of(Mask,
9009 [&](int Idx) {
9010 return Idx < static_cast<int>(E1.getVectorFactor());
9011 }) &&
9012 "Expected single vector shuffle mask.");
9013 add(E1, Mask);
9014 return;
9015 }
9016 if (InVectors.empty()) {
9017 CommonMask.assign(Mask.begin(), Mask.end());
9018 InVectors.assign({&E1, &E2});
9019 return;
9020 }
9021 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9022 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9023 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9024 if (NumParts == 0 || NumParts >= Mask.size())
9025 NumParts = 1;
9026 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9027 const auto *It =
9028 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9029 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9030 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
9031 }
9032 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
9033 if (InVectors.empty()) {
9034 CommonMask.assign(Mask.begin(), Mask.end());
9035 InVectors.assign(1, &E1);
9036 return;
9037 }
9038 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9039 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9040 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9041 if (NumParts == 0 || NumParts >= Mask.size())
9042 NumParts = 1;
9043 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9044 const auto *It =
9045 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9046 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9047 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
9048 if (!SameNodesEstimated && InVectors.size() == 1)
9049 InVectors.emplace_back(&E1);
9050 }
9051 /// Adds 2 input vectors and the mask for their shuffling.
9052 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
9053 // This may come only for shuffling of 2 vectors with extractelements, which
9054 // is already handled in adjustExtracts.
9055 assert(InVectors.size() == 1 &&
9056 all_of(enumerate(CommonMask),
9057 [&](auto P) {
9058 if (P.value() == PoisonMaskElem)
9059 return Mask[P.index()] == PoisonMaskElem;
9060 auto *EI =
9061 cast<ExtractElementInst>(InVectors.front()
9062 .get<const TreeEntry *>()
9063 ->Scalars[P.index()]);
9064 return EI->getVectorOperand() == V1 ||
9065 EI->getVectorOperand() == V2;
9066 }) &&
9067 "Expected extractelement vectors.");
9068 }
9069 /// Adds another one input vector and the mask for the shuffling.
9070 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
9071 if (InVectors.empty()) {
9072 assert(CommonMask.empty() && !ForExtracts &&
9073 "Expected empty input mask/vectors.");
9074 CommonMask.assign(Mask.begin(), Mask.end());
9075 InVectors.assign(1, V1);
9076 return;
9077 }
9078 if (ForExtracts) {
9079 // No need to add vectors here, already handled them in adjustExtracts.
9080 assert(InVectors.size() == 1 &&
9081 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
9082 all_of(enumerate(CommonMask),
9083 [&](auto P) {
9084 Value *Scalar = InVectors.front()
9085 .get<const TreeEntry *>()
9086 ->Scalars[P.index()];
9087 if (P.value() == PoisonMaskElem)
9088 return P.value() == Mask[P.index()] ||
9089 isa<UndefValue>(Scalar);
9090 if (isa<Constant>(V1))
9091 return true;
9092 auto *EI = cast<ExtractElementInst>(Scalar);
9093 return EI->getVectorOperand() == V1;
9094 }) &&
9095 "Expected only tree entry for extractelement vectors.");
9096 return;
9097 }
9098 assert(!InVectors.empty() && !CommonMask.empty() &&
9099 "Expected only tree entries from extracts/reused buildvectors.");
9100 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
9101 if (InVectors.size() == 2) {
9102 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
9103 transformMaskAfterShuffle(CommonMask, CommonMask);
9104 VF = std::max<unsigned>(VF, CommonMask.size());
9105 } else if (const auto *InTE =
9106 InVectors.front().dyn_cast<const TreeEntry *>()) {
9107 VF = std::max(VF, InTE->getVectorFactor());
9108 } else {
9109 VF = std::max(
9110 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
9111 ->getNumElements());
9112 }
9113 InVectors.push_back(V1);
9114 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9115 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
9116 CommonMask[Idx] = Mask[Idx] + VF;
9117 }
9118 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
9119 Value *Root = nullptr) {
9120 Cost += getBuildVectorCost(VL, Root);
9121 if (!Root) {
9122 // FIXME: Need to find a way to avoid use of getNullValue here.
9123 SmallVector<Constant *> Vals;
9124 unsigned VF = VL.size();
9125 if (MaskVF != 0)
9126 VF = std::min(VF, MaskVF);
9127 for (Value *V : VL.take_front(VF)) {
9128 if (isa<UndefValue>(V)) {
9129 Vals.push_back(cast<Constant>(V));
9130 continue;
9131 }
9132 Vals.push_back(Constant::getNullValue(V->getType()));
9133 }
9134 return ConstantVector::get(Vals);
9135 }
9136 return ConstantVector::getSplat(
9137 ElementCount::getFixed(
9138 cast<FixedVectorType>(Root->getType())->getNumElements()),
9139 getAllOnesValue(*R.DL, ScalarTy));
9140 }
9142 /// Finalize emission of the shuffles.
9143 InstructionCost
9144 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
9145 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
9146 IsFinalized = true;
9147 if (Action) {
9148 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
9149 if (InVectors.size() == 2)
9150 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
9151 else
9152 Cost += createShuffle(Vec, nullptr, CommonMask);
9153 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9154 if (CommonMask[Idx] != PoisonMaskElem)
9155 CommonMask[Idx] = Idx;
9156 assert(VF > 0 &&
9157 "Expected vector length for the final value before action.");
9158 Value *V = Vec.get<Value *>();
9159 Action(V, CommonMask);
9160 InVectors.front() = V;
9161 }
9162 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
9163 if (CommonMask.empty()) {
9164 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
9165 return Cost;
9166 }
9167 return Cost +
9168 createShuffle(InVectors.front(),
9169 InVectors.size() == 2 ? InVectors.back() : nullptr,
9170 CommonMask);
9171 }
9172
9173 ~ShuffleCostEstimator() {
9174 assert((IsFinalized || CommonMask.empty()) &&
9175 "Shuffle construction must be finalized.");
9176 }
9177};
9178
9179const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
9180 unsigned Idx) const {
9181 Value *Op = E->getOperand(Idx).front();
9182 if (const TreeEntry *TE = getTreeEntry(Op)) {
9183 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9184 return EI.EdgeIdx == Idx && EI.UserTE == E;
9185 }) != TE->UserTreeIndices.end())
9186 return TE;
9187 auto MIt = MultiNodeScalars.find(Op);
9188 if (MIt != MultiNodeScalars.end()) {
9189 for (const TreeEntry *TE : MIt->second) {
9190 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9191 return EI.EdgeIdx == Idx && EI.UserTE == E;
9192 }) != TE->UserTreeIndices.end())
9193 return TE;
9194 }
9195 }
9196 }
9197 const auto *It =
9198 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9199 return TE->isGather() &&
9200 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9201 return EI.EdgeIdx == Idx && EI.UserTE == E;
9202 }) != TE->UserTreeIndices.end();
9203 });
9204 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
9205 return It->get();
9206}
9207
9208TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
9209 if (TE.State == TreeEntry::ScatterVectorize ||
9210 TE.State == TreeEntry::StridedVectorize)
9211 return TTI::CastContextHint::GatherScatter;
9212 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
9213 !TE.isAltShuffle()) {
9214 if (TE.ReorderIndices.empty())
9215 return TTI::CastContextHint::Normal;
9216 SmallVector<int> Mask;
9217 inversePermutation(TE.ReorderIndices, Mask);
9218 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
9219 return TTI::CastContextHint::Reversed;
9220 }
9221 return TTI::CastContextHint::None;
9222}
9223
9224/// Builds the arguments types vector for the given call instruction with the
9225/// given \p ID for the specified vector factor.
9226 static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
9227 const Intrinsic::ID ID,
9228 const unsigned VF,
9229 unsigned MinBW) {
9230 SmallVector<Type *> ArgTys;
9231 for (auto [Idx, Arg] : enumerate(CI->args())) {
9232 if (ID != Intrinsic::not_intrinsic) {
9233 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
9234 ArgTys.push_back(Arg->getType());
9235 continue;
9236 }
9237 if (MinBW > 0) {
9238 ArgTys.push_back(
9239 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9240 continue;
9241 }
9242 }
9243 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9244 }
9245 return ArgTys;
9246}
9247
9248 InstructionCost
9249 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9250 SmallPtrSetImpl<Value *> &CheckedExtracts) {
9251 ArrayRef<Value *> VL = E->Scalars;
9252
9253 Type *ScalarTy = VL[0]->getType();
9254 if (!E->isGather()) {
9255 if (auto *SI = dyn_cast<StoreInst>(VL[0]))
9256 ScalarTy = SI->getValueOperand()->getType();
9257 else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
9258 ScalarTy = CI->getOperand(0)->getType();
9259 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
9260 ScalarTy = IE->getOperand(1)->getType();
9261 }
9262 if (!isValidElementType(ScalarTy))
9263 return InstructionCost::getInvalid();
9264 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9265
9266 // If we have computed a smaller type for the expression, update VecTy so
9267 // that the costs will be accurate.
9268 auto It = MinBWs.find(E);
9269 Type *OrigScalarTy = ScalarTy;
9270 if (It != MinBWs.end())
9271 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
9272 auto *VecTy = getWidenedType(ScalarTy, VL.size());
9273 unsigned EntryVF = E->getVectorFactor();
9274 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
9275
9276 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9277 if (E->isGather()) {
9278 if (allConstant(VL))
9279 return 0;
9280 if (isa<InsertElementInst>(VL[0]))
9281 return InstructionCost::getInvalid();
9282 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9283 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
9284 }
9285 InstructionCost CommonCost = 0;
9286 SmallVector<int> Mask;
9287 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
9288 if (!E->ReorderIndices.empty() &&
9289 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9290 SmallVector<int> NewMask;
9291 if (E->getOpcode() == Instruction::Store) {
9292 // For stores the order is actually a mask.
9293 NewMask.resize(E->ReorderIndices.size());
9294 copy(E->ReorderIndices, NewMask.begin());
9295 } else {
9296 inversePermutation(E->ReorderIndices, NewMask);
9297 }
9298 ::addMask(Mask, NewMask);
9299 }
9300 if (NeedToShuffleReuses)
9301 ::addMask(Mask, E->ReuseShuffleIndices);
9302 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
9303 CommonCost =
9304 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
9305 assert((E->State == TreeEntry::Vectorize ||
9306 E->State == TreeEntry::ScatterVectorize ||
9307 E->State == TreeEntry::StridedVectorize) &&
9308 "Unhandled state");
9309 assert(E->getOpcode() &&
9310 ((allSameType(VL) && allSameBlock(VL)) ||
9311 (E->getOpcode() == Instruction::GetElementPtr &&
9312 E->getMainOp()->getType()->isPointerTy())) &&
9313 "Invalid VL");
9314 Instruction *VL0 = E->getMainOp();
9315 unsigned ShuffleOrOp =
9316 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
9317 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
9318 const unsigned Sz = UniqueValues.size();
9319 SmallBitVector UsedScalars(Sz, false);
9320 for (unsigned I = 0; I < Sz; ++I) {
9321 if (getTreeEntry(UniqueValues[I]) == E)
9322 continue;
9323 UsedScalars.set(I);
9324 }
9325 auto GetCastContextHint = [&](Value *V) {
9326 if (const TreeEntry *OpTE = getTreeEntry(V))
9327 return getCastContextHint(*OpTE);
9328 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
9329 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9330 return TTI::CastContextHint::GatherScatter;
9331 return TTI::CastContextHint::None;
9332 };
9333 auto GetCostDiff =
9334 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
9335 function_ref<InstructionCost(InstructionCost)> VectorCost) {
9336 // Calculate the cost of this instruction.
9337 InstructionCost ScalarCost = 0;
9338 if (isa<CastInst, CallInst>(VL0)) {
9339 // For some of the instructions there is no need to calculate the cost for
9340 // each particular instruction; we can use the cost of a single
9341 // instruction multiplied by the total number of scalar instructions.
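// (Illustrative arithmetic, not from the original source: with Sz == 4 unique
// scalars of which 1 is already counted in another tree entry, the scalar cost
// of a cast/call bundle is computed as (4 - 1) * ScalarEltCost(0).)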
9342 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9343 } else {
9344 for (unsigned I = 0; I < Sz; ++I) {
9345 if (UsedScalars.test(I))
9346 continue;
9347 ScalarCost += ScalarEltCost(I);
9348 }
9349 }
9350
9351 InstructionCost VecCost = VectorCost(CommonCost);
9352 // Check if the current node must be resized, if the parent node is not
9353 // resized.
9354 if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
9355 const EdgeInfo &EI = E->UserTreeIndices.front();
9356 if ((EI.UserTE->getOpcode() != Instruction::Select ||
9357 EI.EdgeIdx != 0) &&
9358 It != MinBWs.end()) {
9359 auto UserBWIt = MinBWs.find(EI.UserTE);
9360 Type *UserScalarTy =
9361 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
9362 if (UserBWIt != MinBWs.end())
9363 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
9364 UserBWIt->second.first);
9365 if (ScalarTy != UserScalarTy) {
9366 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9367 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
9368 unsigned VecOpcode;
9369 auto *UserVecTy =
9370 getWidenedType(UserScalarTy, E->getVectorFactor());
9371 if (BWSz > SrcBWSz)
9372 VecOpcode = Instruction::Trunc;
9373 else
9374 VecOpcode =
9375 It->second.second ? Instruction::SExt : Instruction::ZExt;
9376 TTI::CastContextHint CCH = GetCastContextHint(VL0);
9377 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
9378 CostKind);
9379 }
9380 }
9381 }
9382 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9383 ScalarCost, "Calculated costs for Tree"));
9384 return VecCost - ScalarCost;
9385 };
9386 // Calculate cost difference from vectorizing set of GEPs.
9387 // Negative value means vectorizing is profitable.
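// (Illustrative arithmetic, not from the original source: if the scalar GEPs
// cost 4 in total and the single vectorized address computation costs 1, the
// lambda returns 1 - 4 = -3, i.e. vectorizing the addresses is profitable.)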
9388 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
9389 assert((E->State == TreeEntry::Vectorize ||
9390 E->State == TreeEntry::StridedVectorize) &&
9391 "Entry state expected to be Vectorize or StridedVectorize here.");
9392 InstructionCost ScalarCost = 0;
9393 InstructionCost VecCost = 0;
9394 std::tie(ScalarCost, VecCost) = getGEPCosts(
9395 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
9396 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9397 "Calculated GEPs cost for Tree"));
9398
9399 return VecCost - ScalarCost;
9400 };
9401
9402 switch (ShuffleOrOp) {
9403 case Instruction::PHI: {
9404 // Count reused scalars.
9405 InstructionCost ScalarCost = 0;
9406 SmallPtrSet<const TreeEntry *, 4> CountedOps;
9407 for (Value *V : UniqueValues) {
9408 auto *PHI = dyn_cast<PHINode>(V);
9409 if (!PHI)
9410 continue;
9411
9412 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
9413 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
9414 Value *Op = PHI->getIncomingValue(I);
9415 Operands[I] = Op;
9416 }
9417 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
9418 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
9419 if (!OpTE->ReuseShuffleIndices.empty())
9420 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9421 OpTE->Scalars.size());
9422 }
9423
9424 return CommonCost - ScalarCost;
9425 }
9426 case Instruction::ExtractValue:
9427 case Instruction::ExtractElement: {
9428 auto GetScalarCost = [&](unsigned Idx) {
9429 auto *I = cast<Instruction>(UniqueValues[Idx]);
9430 VectorType *SrcVecTy;
9431 if (ShuffleOrOp == Instruction::ExtractElement) {
9432 auto *EE = cast<ExtractElementInst>(I);
9433 SrcVecTy = EE->getVectorOperandType();
9434 } else {
9435 auto *EV = cast<ExtractValueInst>(I);
9436 Type *AggregateTy = EV->getAggregateOperand()->getType();
9437 unsigned NumElts;
9438 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9439 NumElts = ATy->getNumElements();
9440 else
9441 NumElts = AggregateTy->getStructNumElements();
9442 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
9443 }
9444 if (I->hasOneUse()) {
9445 Instruction *Ext = I->user_back();
9446 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9447 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9448 // Use getExtractWithExtendCost() to calculate the cost of
9449 // extractelement/ext pair.
9450 InstructionCost Cost = TTI->getExtractWithExtendCost(
9451 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
9452 // Subtract the cost of s|zext which is subtracted separately.
9453 Cost -= TTI->getCastInstrCost(
9454 Ext->getOpcode(), Ext->getType(), I->getType(),
9455 TTI::getCastContextHint(Ext), CostKind, Ext);
9456 return Cost;
9457 }
9458 }
9459 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
9460 CostKind, *getExtractIndex(I));
9461 };
9462 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9463 return GetCostDiff(GetScalarCost, GetVectorCost);
9464 }
9465 case Instruction::InsertElement: {
9466 assert(E->ReuseShuffleIndices.empty() &&
9467 "Unique insertelements only are expected.");
9468 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
9469 unsigned const NumElts = SrcVecTy->getNumElements();
9470 unsigned const NumScalars = VL.size();
9471
9472 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
9473
9474 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9475 unsigned OffsetBeg = *getElementIndex(VL.front());
9476 unsigned OffsetEnd = OffsetBeg;
9477 InsertMask[OffsetBeg] = 0;
9478 for (auto [I, V] : enumerate(VL.drop_front())) {
9479 unsigned Idx = *getElementIndex(V);
9480 if (OffsetBeg > Idx)
9481 OffsetBeg = Idx;
9482 else if (OffsetEnd < Idx)
9483 OffsetEnd = Idx;
9484 InsertMask[Idx] = I + 1;
9485 }
9486 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
9487 if (NumOfParts > 0)
9488 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9489 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9490 VecScalarsSz;
9491 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9492 unsigned InsertVecSz = std::min<unsigned>(
9493 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
9494 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9495 bool IsWholeSubvector =
9496 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9497 // Check if we can safely insert a subvector. If it is not possible, just
9498 // generate a whole-sized vector and shuffle the source vector and the new
9499 // subvector.
9500 if (OffsetBeg + InsertVecSz > VecSz) {
9501 // Align OffsetBeg to generate correct mask.
9502 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
9503 InsertVecSz = VecSz;
9504 }
9505
9506 APInt DemandedElts = APInt::getZero(NumElts);
9507 // TODO: Add support for Instruction::InsertValue.
9508 SmallVector<int> Mask;
9509 if (!E->ReorderIndices.empty()) {
9510 inversePermutation(E->ReorderIndices, Mask);
9511 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
9512 } else {
9513 Mask.assign(VecSz, PoisonMaskElem);
9514 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9515 }
9516 bool IsIdentity = true;
9517 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9518 Mask.swap(PrevMask);
9519 for (unsigned I = 0; I < NumScalars; ++I) {
9520 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
9521 DemandedElts.setBit(InsertIdx);
9522 IsIdentity &= InsertIdx - OffsetBeg == I;
9523 Mask[InsertIdx - OffsetBeg] = I;
9524 }
9525 assert(Offset < NumElts && "Failed to find vector index offset");
9526
9527 InstructionCost Cost = 0;
9528 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
9529 /*Insert*/ true, /*Extract*/ false,
9530 CostKind);
9531
9532 // First cost - resize to actual vector size if not identity shuffle or
9533 // need to shift the vector.
9534 // Do not calculate the cost if the actual size is the register size and
9535 // we can merge this shuffle with the following SK_Select.
9536 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
9537 if (!IsIdentity)
9538 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
9539 InsertVecTy, Mask);
9540 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9541 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9542 }));
9543 // Second cost - permutation with subvector, if some elements are from the
9544 // initial vector or inserting a subvector.
9545 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9546 // subvector of ActualVecTy.
9547 SmallBitVector InMask =
9548 isUndefVector(FirstInsert->getOperand(0),
9549 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9550 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9551 if (InsertVecSz != VecSz) {
9552 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
9553 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy,
9554 std::nullopt, CostKind, OffsetBeg - Offset,
9555 InsertVecTy);
9556 } else {
9557 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9558 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
9559 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9560 I <= End; ++I)
9561 if (Mask[I] != PoisonMaskElem)
9562 Mask[I] = I + VecSz;
9563 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9564 Mask[I] =
9565 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
9566 Cost +=
9567 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
9568 }
9569 }
9570 return Cost;
9571 }
9572 case Instruction::ZExt:
9573 case Instruction::SExt:
9574 case Instruction::FPToUI:
9575 case Instruction::FPToSI:
9576 case Instruction::FPExt:
9577 case Instruction::PtrToInt:
9578 case Instruction::IntToPtr:
9579 case Instruction::SIToFP:
9580 case Instruction::UIToFP:
9581 case Instruction::Trunc:
9582 case Instruction::FPTrunc:
9583 case Instruction::BitCast: {
9584 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9585 Type *SrcScalarTy = VL0->getOperand(0)->getType();
9586 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
9587 unsigned Opcode = ShuffleOrOp;
9588 unsigned VecOpcode = Opcode;
9589 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9590 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9591 // Check if the values are candidates to demote.
9592 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
9593 if (SrcIt != MinBWs.end()) {
9594 SrcBWSz = SrcIt->second.first;
9595 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
9596 SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
9597 }
9598 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9599 if (BWSz == SrcBWSz) {
9600 VecOpcode = Instruction::BitCast;
9601 } else if (BWSz < SrcBWSz) {
9602 VecOpcode = Instruction::Trunc;
9603 } else if (It != MinBWs.end()) {
9604 assert(BWSz > SrcBWSz && "Invalid cast!");
9605 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9606 } else if (SrcIt != MinBWs.end()) {
9607 assert(BWSz > SrcBWSz && "Invalid cast!");
9608 VecOpcode =
9609 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9610 }
9611 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9612 !SrcIt->second.second) {
9613 VecOpcode = Instruction::UIToFP;
9614 }
9615 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9616 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9617 return TTI->getCastInstrCost(Opcode, VL0->getType(),
9618 VL0->getOperand(0)->getType(),
9619 TTI::getCastContextHint(VI), CostKind, VI);
9620 };
9621 auto GetVectorCost = [=](InstructionCost CommonCost) {
9622 // Do not count cost here if minimum bitwidth is in effect and it is just
9623 // a bitcast (here it is just a noop).
9624 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9625 return CommonCost;
9626 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9627 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
9628 return CommonCost +
9629 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
9630 VecOpcode == Opcode ? VI : nullptr);
9631 };
9632 return GetCostDiff(GetScalarCost, GetVectorCost);
9633 }
9634 case Instruction::FCmp:
9635 case Instruction::ICmp:
9636 case Instruction::Select: {
9637 CmpInst::Predicate VecPred, SwappedVecPred;
9638 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
9639 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
9640 match(VL0, MatchCmp))
9641 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
9642 else
9643 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9644 ? CmpInst::BAD_FCMP_PREDICATE
9645 : CmpInst::BAD_ICMP_PREDICATE;
9646 auto GetScalarCost = [&](unsigned Idx) {
9647 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9648 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9649 ? CmpInst::BAD_FCMP_PREDICATE
9650 : CmpInst::BAD_ICMP_PREDICATE;
9651 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
9652 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
9653 !match(VI, MatchCmp)) ||
9654 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9655 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9656 ? CmpInst::BAD_FCMP_PREDICATE
9657 : CmpInst::BAD_ICMP_PREDICATE;
9658
9659 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
9660 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
9661 CostKind, VI);
9662 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI);
9663 if (MinMaxID != Intrinsic::not_intrinsic) {
9664 Type *CanonicalType = OrigScalarTy;
9665 if (CanonicalType->isPtrOrPtrVectorTy())
9666 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9667 CanonicalType->getContext(),
9668 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9669
9670 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9671 {CanonicalType, CanonicalType});
9672 InstructionCost IntrinsicCost =
9673 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9674 // If the selects are the only uses of the compares, they will be
9675 // dead and we can adjust the cost by removing their cost.
9676 if (SelectOnly) {
9677 auto *CI = cast<CmpInst>(VI->getOperand(0));
9678 IntrinsicCost -= TTI->getCmpSelInstrCost(
9679 CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(),
9680 CI->getPredicate(), CostKind, CI);
9681 }
9682 ScalarCost = std::min(ScalarCost, IntrinsicCost);
9683 }
9684
9685 return ScalarCost;
9686 };
9687 auto GetVectorCost = [&](InstructionCost CommonCost) {
9688 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
9689
9690 InstructionCost VecCost = TTI->getCmpSelInstrCost(
9691 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9692 // Check if it is possible and profitable to use min/max for selects
9693 // in VL.
9694 //
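// (Illustrative example, not from the original source: a lane such as
//   %c = icmp slt i32 %a, %b
//   %s = select i1 %c, i32 %a, i32 %b
// may instead be costed as a call to @llvm.smin.i32 if the target makes the
// intrinsic cheaper than the compare+select pair.)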
9695 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL);
9696 if (MinMaxID != Intrinsic::not_intrinsic) {
9697 Type *CanonicalType = VecTy;
9698 if (CanonicalType->isPtrOrPtrVectorTy())
9699 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9700 CanonicalType->getContext(),
9701 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9702 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9703 {CanonicalType, CanonicalType});
9704 InstructionCost IntrinsicCost =
9705 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9706 // If the selects are the only uses of the compares, they will be
9707 // dead and we can adjust the cost by removing their cost.
9708 if (SelectOnly) {
9709 auto *CI =
9710 cast<CmpInst>(cast<Instruction>(VL.front())->getOperand(0));
9711 IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy,
9712 MaskTy, VecPred, CostKind);
9713 }
9714 VecCost = std::min(VecCost, IntrinsicCost);
9715 }
9716 return VecCost + CommonCost;
9717 };
9718 return GetCostDiff(GetScalarCost, GetVectorCost);
9719 }
9720 case Instruction::FNeg:
9721 case Instruction::Add:
9722 case Instruction::FAdd:
9723 case Instruction::Sub:
9724 case Instruction::FSub:
9725 case Instruction::Mul:
9726 case Instruction::FMul:
9727 case Instruction::UDiv:
9728 case Instruction::SDiv:
9729 case Instruction::FDiv:
9730 case Instruction::URem:
9731 case Instruction::SRem:
9732 case Instruction::FRem:
9733 case Instruction::Shl:
9734 case Instruction::LShr:
9735 case Instruction::AShr:
9736 case Instruction::And:
9737 case Instruction::Or:
9738 case Instruction::Xor: {
9739 auto GetScalarCost = [&](unsigned Idx) {
9740 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9741 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9742 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
9743 TTI::OperandValueInfo Op2Info =
9744 TTI::getOperandInfo(VI->getOperand(OpIdx));
9745 SmallVector<const Value *> Operands(VI->operand_values());
9746 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
9747 Op1Info, Op2Info, Operands, VI);
9748 };
9749 auto GetVectorCost = [=](InstructionCost CommonCost) {
9750 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
9751 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
9752 ArrayRef<Value *> Ops = E->getOperand(I);
9753 if (all_of(Ops, [&](Value *Op) {
9754 auto *CI = dyn_cast<ConstantInt>(Op);
9755 return CI && CI->getValue().countr_one() >= It->second.first;
9756 }))
9757 return CommonCost;
9758 }
9759 }
9760 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9761 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
9762 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
9763 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
9764 Op2Info, std::nullopt, nullptr, TLI) +
9765 CommonCost;
9766 };
9767 return GetCostDiff(GetScalarCost, GetVectorCost);
9768 }
9769 case Instruction::GetElementPtr: {
9770 return CommonCost + GetGEPCostDiff(VL, VL0);
9771 }
9772 case Instruction::Load: {
9773 auto GetScalarCost = [&](unsigned Idx) {
9774 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
9775 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
9776 VI->getAlign(), VI->getPointerAddressSpace(),
9777 CostKind, TTI::OperandValueInfo(), VI);
9778 };
9779 auto *LI0 = cast<LoadInst>(VL0);
9780 auto GetVectorCost = [&](InstructionCost CommonCost) {
9781 InstructionCost VecLdCost;
9782 if (E->State == TreeEntry::Vectorize) {
9783 VecLdCost = TTI->getMemoryOpCost(
9784 Instruction::Load, VecTy, LI0->getAlign(),
9785 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
9786 } else if (E->State == TreeEntry::StridedVectorize) {
9787 Align CommonAlignment =
9788 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9789 VecLdCost = TTI->getStridedMemoryOpCost(
9790 Instruction::Load, VecTy, LI0->getPointerOperand(),
9791 /*VariableMask=*/false, CommonAlignment, CostKind);
9792 } else {
9793 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9794 Align CommonAlignment =
9795 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9796 VecLdCost = TTI->getGatherScatterOpCost(
9797 Instruction::Load, VecTy, LI0->getPointerOperand(),
9798 /*VariableMask=*/false, CommonAlignment, CostKind);
9799 }
9800 return VecLdCost + CommonCost;
9801 };
9802
9803 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9804 // If this node generates a masked gather load then it is not a terminal node.
9805 // Hence the address operand cost is estimated separately.
9806 if (E->State == TreeEntry::ScatterVectorize)
9807 return Cost;
9808
9809 // Estimate cost of GEPs since this tree node is a terminator.
9810 SmallVector<Value *> PointerOps(VL.size());
9811 for (auto [I, V] : enumerate(VL))
9812 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
9813 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9814 }
9815 case Instruction::Store: {
9816 bool IsReorder = !E->ReorderIndices.empty();
9817 auto GetScalarCost = [=](unsigned Idx) {
9818 auto *VI = cast<StoreInst>(VL[Idx]);
9819 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
9820 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
9821 VI->getAlign(), VI->getPointerAddressSpace(),
9822 CostKind, OpInfo, VI);
9823 };
9824 auto *BaseSI =
9825 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9826 auto GetVectorCost = [=](InstructionCost CommonCost) {
9827 // We know that we can merge the stores. Calculate the cost.
9828 InstructionCost VecStCost;
9829 if (E->State == TreeEntry::StridedVectorize) {
9830 Align CommonAlignment =
9831 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
9832 VecStCost = TTI->getStridedMemoryOpCost(
9833 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9834 /*VariableMask=*/false, CommonAlignment, CostKind);
9835 } else {
9836 assert(E->State == TreeEntry::Vectorize &&
9837 "Expected either strided or consecutive stores.");
9838 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
9839 VecStCost = TTI->getMemoryOpCost(
9840 Instruction::Store, VecTy, BaseSI->getAlign(),
9841 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
9842 }
9843 return VecStCost + CommonCost;
9844 };
9845 SmallVector<Value *> PointerOps(VL.size());
9846 for (auto [I, V] : enumerate(VL)) {
9847 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9848 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
9849 }
9850
9851 return GetCostDiff(GetScalarCost, GetVectorCost) +
9852 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9853 }
9854 case Instruction::Call: {
9855 auto GetScalarCost = [&](unsigned Idx) {
9856 auto *CI = cast<CallInst>(UniqueValues[Idx]);
9857 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9858 if (ID != Intrinsic::not_intrinsic) {
9859 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9860 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9861 }
9862 return TTI->getCallInstrCost(CI->getCalledFunction(),
9863 CI->getFunctionType()->getReturnType(),
9864 CI->getFunctionType()->params(), CostKind);
9865 };
9866 auto GetVectorCost = [=](InstructionCost CommonCost) {
9867 auto *CI = cast<CallInst>(VL0);
9868 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9869 SmallVector<Type *> ArgTys =
9870 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
9871 It != MinBWs.end() ? It->second.first : 0);
9872 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9873 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9874 };
9875 return GetCostDiff(GetScalarCost, GetVectorCost);
9876 }
9877 case Instruction::ShuffleVector: {
9878 assert(E->isAltShuffle() &&
9879 ((Instruction::isBinaryOp(E->getOpcode()) &&
9880 Instruction::isBinaryOp(E->getAltOpcode())) ||
9881 (Instruction::isCast(E->getOpcode()) &&
9882 Instruction::isCast(E->getAltOpcode())) ||
9883 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9884 "Invalid Shuffle Vector Operand");
9885 // Try to find the previous shuffle node with the same operands and same
9886 // main/alternate ops.
9887 auto TryFindNodeWithEqualOperands = [=]() {
9888 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9889 if (TE.get() == E)
9890 break;
9891 if (TE->isAltShuffle() &&
9892 ((TE->getOpcode() == E->getOpcode() &&
9893 TE->getAltOpcode() == E->getAltOpcode()) ||
9894 (TE->getOpcode() == E->getAltOpcode() &&
9895 TE->getAltOpcode() == E->getOpcode())) &&
9896 TE->hasEqualOperands(*E))
9897 return true;
9898 }
9899 return false;
9900 };
9901 auto GetScalarCost = [&](unsigned Idx) {
9902 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9903 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9904 (void)E;
9905 return TTI->getInstructionCost(VI, CostKind);
9906 };
9907 // Need to clear CommonCost since the final shuffle cost is included into
9908 // vector cost.
9909 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9910 // VecCost is equal to sum of the cost of creating 2 vectors
9911 // and the cost of creating shuffle.
9912 InstructionCost VecCost = 0;
9913 if (TryFindNodeWithEqualOperands()) {
9914 LLVM_DEBUG({
9915 dbgs() << "SLP: diamond match for alternate node found.\n";
9916 E->dump();
9917 });
9918 // No need to add new vector costs here since we're going to reuse
9919 // same main/alternate vector ops, just do different shuffling.
9920 } else if (Instruction::isBinaryOp(E->getOpcode())) {
9921 VecCost =
9922 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
9923 VecCost +=
9924 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
9925 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9926 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
9927 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9928 CI0->getPredicate(), CostKind, VL0);
9929 VecCost += TTIRef.getCmpSelInstrCost(
9930 E->getOpcode(), VecTy, MaskTy,
9931 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
9932 E->getAltOp());
9933 } else {
9934 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9935 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
9936 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9937 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9938 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9939 unsigned SrcBWSz =
9940 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9941 if (SrcIt != MinBWs.end()) {
9942 SrcBWSz = SrcIt->second.first;
9943 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
9944 SrcTy = getWidenedType(SrcSclTy, VL.size());
9945 }
9946 if (BWSz <= SrcBWSz) {
9947 if (BWSz < SrcBWSz)
9948 VecCost =
9949 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9950 TTI::CastContextHint::None, CostKind);
9951 LLVM_DEBUG({
9952 dbgs()
9953 << "SLP: alternate extension, which should be truncated.\n";
9954 E->dump();
9955 });
9956 return VecCost;
9957 }
9958 }
9959 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9960 TTI::CastContextHint::None, CostKind);
9961 VecCost +=
9962 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9963 TTI::CastContextHint::None, CostKind);
9964 }
9965 SmallVector<int> Mask;
9966 E->buildAltOpShuffleMask(
9967 [E](Instruction *I) {
9968 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9969 return I->getOpcode() == E->getAltOpcode();
9970 },
9971 Mask);
9972 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
9973 FinalVecTy, Mask);
9974 // Patterns like [fadd,fsub] can be combined into a single instruction
9975 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9976 // need to take into account their order when looking for the most used
9977 // order.
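// (Illustrative sketch, not from the original source: a bundle whose lanes
// alternate between fadd and fsub can map to a single addsub-style instruction
// on x86, while the swapped lane order may not; the opcode mask built below
// records which lanes use the alternate opcode so isLegalAltInstr can check
// the exact pattern.)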
9978 unsigned Opcode0 = E->getOpcode();
9979 unsigned Opcode1 = E->getAltOpcode();
9980 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
9981 // If this pattern is supported by the target then we consider the
9982 // order.
9983 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9984 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9985 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9986 return AltVecCost < VecCost ? AltVecCost : VecCost;
9987 }
9988 // TODO: Check the reverse order too.
9989 return VecCost;
9990 };
9991 return GetCostDiff(GetScalarCost, GetVectorCost);
9992 }
9993 default:
9994 llvm_unreachable("Unknown instruction");
9995 }
9996}
9997
9998bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9999 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
10000 << VectorizableTree.size() << " is fully vectorizable.\n");
10001
10002 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
10003 SmallVector<int> Mask;
10004 return TE->isGather() &&
10005 !any_of(TE->Scalars,
10006 [this](Value *V) { return EphValues.contains(V); }) &&
10007 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
10008 TE->Scalars.size() < Limit ||
10009 ((TE->getOpcode() == Instruction::ExtractElement ||
10010 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
10011 isFixedVectorShuffle(TE->Scalars, Mask)) ||
10012 (TE->isGather() && TE->getOpcode() == Instruction::Load &&
10013 !TE->isAltShuffle()));
10014 };
10015
10016 // We only handle trees of heights 1 and 2.
10017 if (VectorizableTree.size() == 1 &&
10018 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
10019 (ForReduction &&
10020 AreVectorizableGathers(VectorizableTree[0].get(),
10021 VectorizableTree[0]->Scalars.size()) &&
10022 VectorizableTree[0]->getVectorFactor() > 2)))
10023 return true;
10024
10025 if (VectorizableTree.size() != 2)
10026 return false;
10027
10028 // Handle splat and all-constants stores. Also try to vectorize tiny trees
10029 // with second gather nodes if they have fewer scalar operands than the
10030 // initial tree element (it may be profitable to shuffle the second gather)
10031 // or if they are extractelements, which form a shuffle.
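// (Illustrative example, not from the original source: a two-node tree made of
// a vectorizable store bundle whose value operand is a gather of identical or
// all-constant scalars falls into this category and is still treated as fully
// vectorizable.)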
10033 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
10034 AreVectorizableGathers(VectorizableTree[1].get(),
10035 VectorizableTree[0]->Scalars.size()))
10036 return true;
10037
10038 // Gathering cost would be too much for tiny trees.
10039 if (VectorizableTree[0]->isGather() ||
10040 (VectorizableTree[1]->isGather() &&
10041 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
10042 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
10043 return false;
10044
10045 return true;
10046}
10047
10048static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
10049 TargetTransformInfo *TTI,
10050 bool MustMatchOrInst) {
10051 // Look past the root to find a source value. Arbitrarily follow the
10052 // path through operand 0 of any 'or'. Also, peek through optional
10053 // shift-left-by-multiple-of-8-bits.
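// (Illustrative pattern, not from the original source:
//   %z0 = zext i8 %l0 to i32
//   %z1 = zext i8 %l1 to i32
//   %s1 = shl i32 %z1, 8
//   %or = or i32 %s1, %z0
// walking operand 0 of the 'or' and looking through the byte-aligned 'shl'
// reaches a zext'ed load, which the backend can usually fold into one wider
// load.)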
10054 Value *ZextLoad = Root;
10055 const APInt *ShAmtC;
10056 bool FoundOr = false;
10057 while (!isa<ConstantExpr>(ZextLoad) &&
10058 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
10059 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
10060 ShAmtC->urem(8) == 0))) {
10061 auto *BinOp = cast<BinaryOperator>(ZextLoad);
10062 ZextLoad = BinOp->getOperand(0);
10063 if (BinOp->getOpcode() == Instruction::Or)
10064 FoundOr = true;
10065 }
10066 // Check if the input is an extended load of the required or/shift expression.
10067 Value *Load;
10068 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
10069 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
10070 return false;
10071
10072 // Require that the total load bit width is a legal integer type.
10073 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
10074 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
10075 Type *SrcTy = Load->getType();
10076 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
10077 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
10078 return false;
10079
10080 // Everything matched - assume that we can fold the whole sequence using
10081 // load combining.
10082 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
10083 << *(cast<Instruction>(Root)) << "\n");
10084
10085 return true;
10086}
10087
10088 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
10089 if (RdxKind != RecurKind::Or)
10090 return false;
10091
10092 unsigned NumElts = VectorizableTree[0]->Scalars.size();
10093 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
10094 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
10095 /* MatchOr */ false);
10096}
10097
10098 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
10099 // Peek through a final sequence of stores and check if all operations are
10100 // likely to be load-combined.
10101 unsigned NumElts = Stores.size();
10102 for (Value *Scalar : Stores) {
10103 Value *X;
10104 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
10105 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
10106 return false;
10107 }
10108 return true;
10109}
10110
10111bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
10112 // No need to vectorize inserts of gathered values.
10113 if (VectorizableTree.size() == 2 &&
10114 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
10115 VectorizableTree[1]->isGather() &&
10116 (VectorizableTree[1]->getVectorFactor() <= 2 ||
10117 !(isSplat(VectorizableTree[1]->Scalars) ||
10118 allConstant(VectorizableTree[1]->Scalars))))
10119 return true;
10120
10121 // If the graph includes only PHI nodes and gathers, it is definitely not
10122 // profitable for vectorization and we can skip it, if the cost threshold is
10123 // the default. The cost of vectorized PHI nodes is almost always 0 plus the
10124 // cost of the gathers/buildvectors.
10125 constexpr int Limit = 4;
10126 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
10127 !VectorizableTree.empty() &&
10128 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10129 return (TE->isGather() &&
10130 TE->getOpcode() != Instruction::ExtractElement &&
10131 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
10132 TE->getOpcode() == Instruction::PHI;
10133 }))
10134 return true;
10135
10136 // We can vectorize the tree if its size is greater than or equal to the
10137 // minimum size specified by the MinTreeSize command line option.
10138 if (VectorizableTree.size() >= MinTreeSize)
10139 return false;
10140
10141 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
10142 // can vectorize it if we can prove it fully vectorizable.
10143 if (isFullyVectorizableTinyTree(ForReduction))
10144 return false;
10145
10146 // Check if any of the gather node forms an insertelement buildvector
10147 // somewhere.
10148 bool IsAllowedSingleBVNode =
10149 VectorizableTree.size() > 1 ||
10150 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
10151 !VectorizableTree.front()->isAltShuffle() &&
10152 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
10153 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
10154 allSameBlock(VectorizableTree.front()->Scalars));
10155 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10156 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
10157 return isa<ExtractElementInst, UndefValue>(V) ||
10158 (IsAllowedSingleBVNode &&
10159 !V->hasNUsesOrMore(UsesLimit) &&
10160 any_of(V->users(), IsaPred<InsertElementInst>));
10161 });
10162 }))
10163 return false;
10164
10165 assert(VectorizableTree.empty()
10166 ? ExternalUses.empty()
10167 : true && "We shouldn't have any external users");
10168
10169 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
10170 // vectorizable.
10171 return true;
10172}
10173
10174 InstructionCost BoUpSLP::getSpillCost() const {
10175 // Walk from the bottom of the tree to the top, tracking which values are
10176 // live. When we see a call instruction that is not part of our tree,
10177 // query TTI to see if there is a cost to keeping values live over it
10178 // (for example, if spills and fills are required).
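// (Illustrative sketch, not from the original source: if a tree value is live
// across an unrelated call,
//   %v = load i32, ...        ; part of the tree
//   call void @foo()          ; not in the tree
//   ... later tree use of %v ...
// the target may need to spill and reload the vectorized value around the
// call; that cost is added below via getCostOfKeepingLiveOverCall.)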
10179 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
10180 InstructionCost Cost = 0;
10181
10182 SmallPtrSet<Instruction *, 4> LiveValues;
10183 Instruction *PrevInst = nullptr;
10184
10185 // The entries in VectorizableTree are not necessarily ordered by their
10186 // position in basic blocks. Collect them and order them by dominance so later
10187 // instructions are guaranteed to be visited first. For instructions in
10188 // different basic blocks, we only scan to the beginning of the block, so
10189 // their order does not matter, as long as all instructions in a basic block
10190 // are grouped together. Using dominance ensures a deterministic order.
10191 SmallVector<Instruction *, 16> OrderedScalars;
10192 for (const auto &TEPtr : VectorizableTree) {
10193 if (TEPtr->State != TreeEntry::Vectorize)
10194 continue;
10195 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
10196 if (!Inst)
10197 continue;
10198 OrderedScalars.push_back(Inst);
10199 }
10200 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
10201 auto *NodeA = DT->getNode(A->getParent());
10202 auto *NodeB = DT->getNode(B->getParent());
10203 assert(NodeA && "Should only process reachable instructions");
10204 assert(NodeB && "Should only process reachable instructions");
10205 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10206 "Different nodes should have different DFS numbers");
10207 if (NodeA != NodeB)
10208 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
10209 return B->comesBefore(A);
10210 });
10211
10212 for (Instruction *Inst : OrderedScalars) {
10213 if (!PrevInst) {
10214 PrevInst = Inst;
10215 continue;
10216 }
10217
10218 // Update LiveValues.
10219 LiveValues.erase(PrevInst);
10220 for (auto &J : PrevInst->operands()) {
10221 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
10222 LiveValues.insert(cast<Instruction>(&*J));
10223 }
10224
10225 LLVM_DEBUG({
10226 dbgs() << "SLP: #LV: " << LiveValues.size();
10227 for (auto *X : LiveValues)
10228 dbgs() << " " << X->getName();
10229 dbgs() << ", Looking at ";
10230 Inst->dump();
10231 });
10232
10233 // Now find the sequence of instructions between PrevInst and Inst.
10234 unsigned NumCalls = 0;
10235 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
10236 PrevInstIt =
10237 PrevInst->getIterator().getReverse();
10238 while (InstIt != PrevInstIt) {
10239 if (PrevInstIt == PrevInst->getParent()->rend()) {
10240 PrevInstIt = Inst->getParent()->rbegin();
10241 continue;
10242 }
10243
10244 auto NoCallIntrinsic = [this](Instruction *I) {
10245 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
10246 if (II->isAssumeLikeIntrinsic())
10247 return true;
10248 FastMathFlags FMF;
10249 SmallVector<Type *, 4> Tys;
10250 for (auto &ArgOp : II->args())
10251 Tys.push_back(ArgOp->getType());
10252 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
10253 FMF = FPMO->getFastMathFlags();
10254 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
10255 FMF);
10256 InstructionCost IntrCost =
10257 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
10258 InstructionCost CallCost = TTI->getCallInstrCost(
10259 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
10260 if (IntrCost < CallCost)
10261 return true;
10262 }
10263 return false;
10264 };
10265
10266 // Debug information does not impact spill cost.
10267 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10268 &*PrevInstIt != PrevInst)
10269 NumCalls++;
10270
10271 ++PrevInstIt;
10272 }
10273
10274 if (NumCalls) {
10275 SmallVector<Type *, 4> V;
10276 for (auto *II : LiveValues) {
10277 auto *ScalarTy = II->getType();
10278 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
10279 ScalarTy = VectorTy->getElementType();
10280 V.push_back(getWidenedType(ScalarTy, BundleWidth));
10281 }
10282 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
10283 }
10284
10285 PrevInst = Inst;
10286 }
10287
10288 return Cost;
10289}
10290
10291 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
10292 /// the buildvector sequence.
10293 static bool isFirstInsertElement(const InsertElementInst *IE1,
10294 const InsertElementInst *IE2) {
10295 if (IE1 == IE2)
10296 return false;
10297 const auto *I1 = IE1;
10298 const auto *I2 = IE2;
10299 const InsertElementInst *PrevI1;
10300 const InsertElementInst *PrevI2;
10301 unsigned Idx1 = *getElementIndex(IE1);
10302 unsigned Idx2 = *getElementIndex(IE2);
10303 do {
10304 if (I2 == IE1)
10305 return true;
10306 if (I1 == IE2)
10307 return false;
10308 PrevI1 = I1;
10309 PrevI2 = I2;
10310 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10311 getElementIndex(I1).value_or(Idx2) != Idx2)
10312 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
10313 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
10314 getElementIndex(I2).value_or(Idx1) != Idx1)
10315 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
10316 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10317 llvm_unreachable("Two different buildvectors not expected.");
10318}
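// (Illustrative example, not from the original source: for the chain
//   %b0 = insertelement <2 x i32> poison, i32 %x, i32 0
//   %b1 = insertelement <2 x i32> %b0, i32 %y, i32 1
// %b0 is followed by %b1 in the buildvector sequence, so
// isFirstInsertElement(%b0, %b1) is expected to return true.)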
10319
10320namespace {
10321/// Returns incoming Value *, if the requested type is Value * too, or a default
10322/// value, otherwise.
10323struct ValueSelect {
10324 template <typename U>
10325 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
10326 return V;
10327 }
10328 template <typename U>
10329 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
10330 return U();
10331 }
10332};
10333} // namespace
10334
10335 /// Does the analysis of the provided shuffle masks and performs the requested
10336 /// actions on the vectors with the given shuffle masks. It tries to do it in
10337 /// several steps.
10338 /// 1. If the Base vector is not an undef vector, resize the very first mask to
10339 /// have a common VF and perform the action for 2 input vectors (including the
10340 /// non-undef Base). Other shuffle masks are combined with the result of the
10341 /// first stage and processed as a shuffle of 2 elements.
10342 /// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
10343 /// the action only for 1 vector with the given mask, if it is not the identity
10344 /// mask.
10345 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
10346 /// vectors, combining the masks properly between the steps.
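/// (Illustrative example, not from the original source: with an undef Base and
/// two shuffle masks over vectors V1 and V2 of equal VF, step 3 combines V1
/// and V2 with a single two-source mask, and any remaining masks are folded
/// into that result one vector at a time.)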
10347template <typename T>
10349 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
10350 function_ref<unsigned(T *)> GetVF,
10351 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
10353 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
10354 SmallVector<int> Mask(ShuffleMask.begin()->second);
10355 auto VMIt = std::next(ShuffleMask.begin());
10356 T *Prev = nullptr;
10357 SmallBitVector UseMask =
10358 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
10359 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
10360 if (!IsBaseUndef.all()) {
10361 // Base is not undef, need to combine it with the next subvectors.
10362 std::pair<T *, bool> Res =
10363 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
10364 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
10365 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
10366 if (Mask[Idx] == PoisonMaskElem)
10367 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
10368 else
10369 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
10370 }
10371 auto *V = ValueSelect::get<T *>(Base);
10372 (void)V;
10373 assert((!V || GetVF(V) == Mask.size()) &&
10374 "Expected base vector of VF number of elements.");
10375 Prev = Action(Mask, {nullptr, Res.first});
10376 } else if (ShuffleMask.size() == 1) {
10377 // Base is undef and only 1 vector is shuffled - perform the action only for
10378 // a single vector, if the mask is not the identity mask.
10379 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10380 /*ForSingleMask=*/true);
10381 if (Res.second)
10382 // Identity mask is found.
10383 Prev = Res.first;
10384 else
10385 Prev = Action(Mask, {ShuffleMask.begin()->first});
10386 } else {
10387 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
10388 // shuffles step by step, combining shuffle between the steps.
10389 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10390 unsigned Vec2VF = GetVF(VMIt->first);
10391 if (Vec1VF == Vec2VF) {
10392 // No need to resize the input vectors since they are of the same size, we
10393 // can shuffle them directly.
10394 ArrayRef<int> SecMask = VMIt->second;
10395 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10396 if (SecMask[I] != PoisonMaskElem) {
10397 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10398 Mask[I] = SecMask[I] + Vec1VF;
10399 }
10400 }
10401 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10402 } else {
10403 // Vectors of different sizes - resize and reshuffle.
10404 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10405 /*ForSingleMask=*/false);
10406 std::pair<T *, bool> Res2 =
10407 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10408 ArrayRef<int> SecMask = VMIt->second;
10409 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10410 if (Mask[I] != PoisonMaskElem) {
10411 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10412 if (Res1.second)
10413 Mask[I] = I;
10414 } else if (SecMask[I] != PoisonMaskElem) {
10415 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10416 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
10417 }
10418 }
10419 Prev = Action(Mask, {Res1.first, Res2.first});
10420 }
10421 VMIt = std::next(VMIt);
10422 }
10423 bool IsBaseNotUndef = !IsBaseUndef.all();
10424 (void)IsBaseNotUndef;
10425 // Perform requested actions for the remaining masks/vectors.
10426 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10427 // Shuffle other input vectors, if any.
10428 std::pair<T *, bool> Res =
10429 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10430 ArrayRef<int> SecMask = VMIt->second;
10431 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10432 if (SecMask[I] != PoisonMaskElem) {
10433 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
10434 "Multiple uses of scalars.");
10435 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
10436 } else if (Mask[I] != PoisonMaskElem) {
10437 Mask[I] = I;
10438 }
10439 }
10440 Prev = Action(Mask, {Prev, Res.first});
10441 }
10442 return Prev;
10443}
10444
10445 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
10446 InstructionCost Cost = 0;
10447 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
10448 << VectorizableTree.size() << ".\n");
10449
10450 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10451
10452 SmallPtrSet<Value *, 4> CheckedExtracts;
10453 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10454 TreeEntry &TE = *VectorizableTree[I];
10455 if (TE.isGather()) {
10456 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
10457 E && E->getVectorFactor() == TE.getVectorFactor() &&
10458 E->isSame(TE.Scalars)) {
10459 // Some gather nodes might be absolutely the same as some vectorizable
10460 // nodes after reordering, need to handle it.
10461 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
10462 << shortBundleName(TE.Scalars) << ".\n"
10463 << "SLP: Current total cost = " << Cost << "\n");
10464 continue;
10465 }
10466 }
10467
10468 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
10469 Cost += C;
10470 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
10471 << shortBundleName(TE.Scalars) << ".\n"
10472 << "SLP: Current total cost = " << Cost << "\n");
10473 }
10474
10475 SmallPtrSet<Value *, 16> ExtractCostCalculated;
10476 InstructionCost ExtractCost = 0;
10477 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
10478 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
10479 SmallVector<APInt> DemandedElts;
10480 SmallDenseSet<Value *, 4> UsedInserts;
10481 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
10482 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10483 for (ExternalUser &EU : ExternalUses) {
10484 // We only add extract cost once for the same scalar.
10485 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10486 !ExtractCostCalculated.insert(EU.Scalar).second)
10487 continue;
10488
10489 // Uses by ephemeral values are free (because the ephemeral value will be
10490 // removed prior to code generation, and so the extraction will be
10491 // removed as well).
10492 if (EphValues.count(EU.User))
10493 continue;
10494
10495 // No extract cost for vector "scalar"
10496 if (isa<FixedVectorType>(EU.Scalar->getType()))
10497 continue;
10498
10499 // If found user is an insertelement, do not calculate extract cost but try
10500 // to detect it as a final shuffled/identity match.
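// (Illustrative sketch, not from the original source: for an external user
//   %ins = insertelement <4 x float> %vec, float %scalar, i32 2
// where %scalar is a lane of a vectorized bundle, the extract can often be
// folded into the final shuffle that feeds the buildvector, so it is modeled
// through the shuffle masks below instead of as a standalone extractelement.)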
10501 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
10502 VU && VU->getOperand(1) == EU.Scalar) {
10503 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
10504 if (!UsedInserts.insert(VU).second)
10505 continue;
10506 std::optional<unsigned> InsertIdx = getElementIndex(VU);
10507 if (InsertIdx) {
10508 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10509 auto *It = find_if(
10510 FirstUsers,
10511 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10512 return areTwoInsertFromSameBuildVector(
10513 VU, cast<InsertElementInst>(Pair.first),
10514 [this](InsertElementInst *II) -> Value * {
10515 Value *Op0 = II->getOperand(0);
10516 if (getTreeEntry(II) && !getTreeEntry(Op0))
10517 return nullptr;
10518 return Op0;
10519 });
10520 });
10521 int VecId = -1;
10522 if (It == FirstUsers.end()) {
10523 (void)ShuffleMasks.emplace_back();
10524 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10525 if (Mask.empty())
10526 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10527 // Find the insertvector, vectorized in tree, if any.
10528 Value *Base = VU;
10529 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10530 if (IEBase != EU.User &&
10531 (!IEBase->hasOneUse() ||
10532 getElementIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10533 break;
10534 // Build the mask for the vectorized insertelement instructions.
10535 if (const TreeEntry *E = getTreeEntry(IEBase)) {
10536 VU = IEBase;
10537 do {
10538 IEBase = cast<InsertElementInst>(Base);
10539 int Idx = *getElementIndex(IEBase);
10540 assert(Mask[Idx] == PoisonMaskElem &&
10541 "InsertElementInstruction used already.");
10542 Mask[Idx] = Idx;
10543 Base = IEBase->getOperand(0);
10544 } while (E == getTreeEntry(Base));
10545 break;
10546 }
10547 Base = cast<InsertElementInst>(Base)->getOperand(0);
10548 }
10549 FirstUsers.emplace_back(VU, ScalarTE);
10550 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10551 VecId = FirstUsers.size() - 1;
10552 auto It = MinBWs.find(ScalarTE);
10553 if (It != MinBWs.end() &&
10554 VectorCasts
10555 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10556 .second) {
10557 unsigned BWSz = It->second.first;
10558 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10559 unsigned VecOpcode;
10560 if (DstBWSz < BWSz)
10561 VecOpcode = Instruction::Trunc;
10562 else
10563 VecOpcode =
10564 It->second.second ? Instruction::SExt : Instruction::ZExt;
10566 InstructionCost C = TTI->getCastInstrCost(
10567 VecOpcode, FTy,
10568 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
10569 FTy->getNumElements()),
10570 TTI::CastContextHint::None, TTI::TCK_RecipThroughput);
10571 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10572 << " for extending externally used vector with "
10573 "non-equal minimum bitwidth.\n");
10574 Cost += C;
10575 }
10576 } else {
10577 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10578 It->first = VU;
10579 VecId = std::distance(FirstUsers.begin(), It);
10580 }
10581 int InIdx = *InsertIdx;
10582 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10583 if (Mask.empty())
10584 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10585 Mask[InIdx] = EU.Lane;
10586 DemandedElts[VecId].setBit(InIdx);
10587 continue;
10588 }
10589 }
10590 }
10591 // Leave the GEPs as-is; they are free in most cases and it is better to keep
10592 // them as GEPs.
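// E.g. (illustrative): a scalar GEP whose pointer and index operands are
// either not vectorized or already extracted anyway can simply be kept; we
// then charge the cost of the GEP itself instead of an extractelement for it.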
10594 if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10595 if (!ValueToExtUses) {
10596 ValueToExtUses.emplace();
10597 for_each(enumerate(ExternalUses), [&](const auto &P) {
10598 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10599 });
10600 }
10601 // The original GEP can be used if none of its operands are vectorized or
10602 // they are already marked as externally used.
10603 bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10604 if (!getTreeEntry(V))
10605 return true;
10606 auto It = ValueToExtUses->find(V);
10607 if (It != ValueToExtUses->end()) {
10608 // Replace all uses to avoid compiler crash.
10609 ExternalUses[It->second].User = nullptr;
10610 return true;
10611 }
10612 return false;
10613 });
10614 if (CanBeUsedAsGEP) {
10615 ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10616 ExternalUsesAsGEPs.insert(EU.Scalar);
10617 continue;
10618 }
10619 }
10620
10621 // If we plan to rewrite the tree in a smaller type, we will need to sign
10622 // extend the extracted value back to the original type. Here, we account
10623 // for the extract and the added cost of the sign extend if needed.
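// E.g. (illustrative): if the tree entry of EU.Scalar was minimized to i16
// while the external user consumes i32, the cost below is
// getExtractWithExtendCost(SExt/ZExt, i32, <BundleWidth x i16>, EU.Lane)
// rather than a plain extractelement.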
10624 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
10625 auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10626 if (It != MinBWs.end()) {
10627 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10628 unsigned Extend =
10629 It->second.second ? Instruction::SExt : Instruction::ZExt;
10630 VecTy = getWidenedType(MinTy, BundleWidth);
10631 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10632 VecTy, EU.Lane);
10633 } else {
10634 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10635 CostKind, EU.Lane);
10636 }
10637 }
10638 // Add reduced value cost, if resized.
10639 if (!VectorizedVals.empty()) {
10640 const TreeEntry &Root = *VectorizableTree.front();
10641 auto BWIt = MinBWs.find(&Root);
10642 if (BWIt != MinBWs.end()) {
10643 Type *DstTy = Root.Scalars.front()->getType();
10644 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10645 unsigned SrcSz =
10646 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10647 if (OriginalSz != SrcSz) {
10648 unsigned Opcode = Instruction::Trunc;
10649 if (OriginalSz > SrcSz)
10650 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10651 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10652 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10653 TTI::CastContextHint::None,
10654 TTI::TCK_RecipThroughput);
10655 }
10656 }
10657 }
10658
10659 InstructionCost SpillCost = getSpillCost();
10660 Cost += SpillCost + ExtractCost;
10661 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10662 bool) {
10663 InstructionCost C = 0;
10664 unsigned VF = Mask.size();
10665 unsigned VecVF = TE->getVectorFactor();
10666 if (VF != VecVF &&
10667 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10668 !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
10669 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10670 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10671 OrigMask.begin());
10672 C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
10673 getWidenedType(TE->getMainOp()->getType(), VecVF),
10674 OrigMask);
10675 LLVM_DEBUG(
10676 dbgs() << "SLP: Adding cost " << C
10677 << " for final shuffle of insertelement external users.\n";
10678 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10679 Cost += C;
10680 return std::make_pair(TE, true);
10681 }
10682 return std::make_pair(TE, false);
10683 };
10684 // Calculate the cost of the reshuffled vectors, if any.
10685 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10686 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10687 auto Vector = ShuffleMasks[I].takeVector();
10688 unsigned VF = 0;
10689 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10690 ArrayRef<const TreeEntry *> TEs) {
10691 assert((TEs.size() == 1 || TEs.size() == 2) &&
10692 "Expected exactly 1 or 2 tree entries.");
10693 if (TEs.size() == 1) {
10694 if (VF == 0)
10695 VF = TEs.front()->getVectorFactor();
10696 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
10697 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
10698 !all_of(enumerate(Mask), [=](const auto &Data) {
10699 return Data.value() == PoisonMaskElem ||
10700 (Data.index() < VF &&
10701 static_cast<int>(Data.index()) == Data.value());
10702 })) {
10703 InstructionCost C =
10704 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
10705 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10706 << " for final shuffle of insertelement "
10707 "external users.\n";
10708 TEs.front()->dump();
10709 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10710 Cost += C;
10711 }
10712 } else {
10713 if (VF == 0) {
10714 if (TEs.front() &&
10715 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10716 VF = TEs.front()->getVectorFactor();
10717 else
10718 VF = Mask.size();
10719 }
10720 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
10721 InstructionCost C =
10722 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
10723 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10724 << " for final shuffle of vector node and external "
10725 "insertelement users.\n";
10726 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10727 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10728 Cost += C;
10729 }
10730 VF = Mask.size();
10731 return TEs.back();
10732 };
10733 (void)performExtractsShuffleAction<const TreeEntry>(
10734 MutableArrayRef(Vector.data(), Vector.size()), Base,
10735 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10736 EstimateShufflesCost);
10737 InstructionCost InsertCost = TTI->getScalarizationOverhead(
10738 cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10739 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
10740 Cost -= InsertCost;
10741 }
10742
10743 // Add the cost for reduced value resize (if required).
10744 if (ReductionBitWidth != 0) {
10745 assert(UserIgnoreList && "Expected reduction tree.");
10746 const TreeEntry &E = *VectorizableTree.front();
10747 auto It = MinBWs.find(&E);
10748 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10749 unsigned SrcSize = It->second.first;
10750 unsigned DstSize = ReductionBitWidth;
10751 unsigned Opcode = Instruction::Trunc;
10752 if (SrcSize < DstSize)
10753 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10754 auto *SrcVecTy =
10755 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
10756 auto *DstVecTy =
10757 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
10758 TTI::CastContextHint CCH = getCastContextHint(E);
10759 InstructionCost CastCost;
10760 switch (E.getOpcode()) {
10761 case Instruction::SExt:
10762 case Instruction::ZExt:
10763 case Instruction::Trunc: {
10764 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10765 CCH = getCastContextHint(*OpTE);
10766 break;
10767 }
10768 default:
10769 break;
10770 }
10771 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
10772 TTI::TCK_RecipThroughput);
10773 Cost += CastCost;
10774 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10775 << " for final resize for reduction from " << SrcVecTy
10776 << " to " << DstVecTy << "\n";
10777 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10778 }
10779 }
10780
10781#ifndef NDEBUG
10782 SmallString<256> Str;
10783 {
10784 raw_svector_ostream OS(Str);
10785 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10786 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10787 << "SLP: Total Cost = " << Cost << ".\n";
10788 }
10789 LLVM_DEBUG(dbgs() << Str);
10790 if (ViewSLPTree)
10791 ViewGraph(this, "SLP" + F->getName(), false, Str);
10792#endif
10793
10794 return Cost;
10795}
10796
10797 /// Tries to find extractelement instructions with constant indices from a
10798 /// fixed vector type and gathers such instructions into a group, which is
10799 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
10800 /// attempt was successful, the matched scalars are replaced by poison values
10801 /// in \p VL for future analysis.
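/// For example (illustrative IR), a gather of
/// \code
/// %e1 = extractelement <4 x i32> %v, i32 1
/// %e0 = extractelement <4 x i32> %v, i32 0
/// \endcode
/// can be represented as a single-source shuffle of %v with mask <1, 0>.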
10802std::optional<TTI::ShuffleKind>
10803 BoUpSLP::tryToGatherSingleRegisterExtractElements(
10804 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10805 // Scan list of gathered scalars for extractelements that can be represented
10806 // as shuffles.
10807 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10808 SmallVector<int> UndefVectorExtracts;
10809 for (int I = 0, E = VL.size(); I < E; ++I) {
10810 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10811 if (!EI) {
10812 if (isa<UndefValue>(VL[I]))
10813 UndefVectorExtracts.push_back(I);
10814 continue;
10815 }
10816 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10817 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10818 continue;
10819 std::optional<unsigned> Idx = getExtractIndex(EI);
10820 // Undefined index.
10821 if (!Idx) {
10822 UndefVectorExtracts.push_back(I);
10823 continue;
10824 }
10825 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10826 ExtractMask.reset(*Idx);
10827 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
10828 UndefVectorExtracts.push_back(I);
10829 continue;
10830 }
10831 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10832 }
10833 // Sort the vector operands by the maximum number of uses in extractelements.
10834 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
10835 VectorOpToIdx.takeVector();
10836 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
10837 return P1.second.size() > P2.second.size();
10838 });
10839 // Find the best pair of the vectors or a single vector.
10840 const int UndefSz = UndefVectorExtracts.size();
10841 unsigned SingleMax = 0;
10842 unsigned PairMax = 0;
10843 if (!Vectors.empty()) {
10844 SingleMax = Vectors.front().second.size() + UndefSz;
10845 if (Vectors.size() > 1) {
10846 auto *ItNext = std::next(Vectors.begin());
10847 PairMax = SingleMax + ItNext->second.size();
10848 }
10849 }
10850 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10851 return std::nullopt;
10852 // Check if it is better to perform a shuffle of 2 vectors or just of a
10853 // single vector.
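// E.g. (illustrative): 5 extracts from %v0, 2 extracts from %v1 and 1 undef
// extract give SingleMax = 6 and PairMax = 8, so the two-vector shuffle is
// preferred.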
10854 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10855 SmallVector<Value *> GatheredExtracts(
10856 VL.size(), PoisonValue::get(VL.front()->getType()));
10857 if (SingleMax >= PairMax && SingleMax) {
10858 for (int Idx : Vectors.front().second)
10859 std::swap(GatheredExtracts[Idx], VL[Idx]);
10860 } else if (!Vectors.empty()) {
10861 for (unsigned Idx : {0, 1})
10862 for (int Idx : Vectors[Idx].second)
10863 std::swap(GatheredExtracts[Idx], VL[Idx]);
10864 }
10865 // Add extracts from undefs too.
10866 for (int Idx : UndefVectorExtracts)
10867 std::swap(GatheredExtracts[Idx], VL[Idx]);
10868 // Check that gather of extractelements can be represented as just a
10869 // shuffle of a single/two vectors the scalars are extracted from.
10870 std::optional<TTI::ShuffleKind> Res =
10871 isFixedVectorShuffle(GatheredExtracts, Mask);
10872 if (!Res) {
10873 // TODO: try to check other subsets if possible.
10874 // Restore the original VL if attempt was not successful.
10875 copy(SavedVL, VL.begin());
10876 return std::nullopt;
10877 }
10878 // Restore unused scalars from mask, if some of the extractelements were not
10879 // selected for shuffle.
10880 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10881 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
10882 isa<UndefValue>(GatheredExtracts[I])) {
10883 std::swap(VL[I], GatheredExtracts[I]);
10884 continue;
10885 }
10886 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10887 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10888 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10889 is_contained(UndefVectorExtracts, I))
10890 continue;
10891 }
10892 return Res;
10893}
10894
10895 /// Tries to find extractelement instructions with constant indices from a
10896 /// fixed vector type and gathers such instructions into a group, which is
10897 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
10898 /// attempt was successful, the matched scalars are replaced by poison values
10899 /// in \p VL for future analysis.
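/// For example (illustrative), with 8 gathered scalars and NumParts == 2, the
/// two slices of 4 scalars are matched against their source vectors
/// independently and the resulting sub-masks are copied into the corresponding
/// halves of \p Mask.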
10900 SmallVector<std::optional<TTI::ShuffleKind>>
10901 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10902 SmallVectorImpl<int> &Mask,
10903 unsigned NumParts) const {
10904 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
10905 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10906 Mask.assign(VL.size(), PoisonMaskElem);
10907 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10908 for (unsigned Part : seq<unsigned>(NumParts)) {
10909 // Scan list of gathered scalars for extractelements that can be represented
10910 // as shuffles.
10911 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
10912 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
10913 SmallVector<int> SubMask;
10914 std::optional<TTI::ShuffleKind> Res =
10915 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10916 ShufflesRes[Part] = Res;
10917 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
10918 }
10919 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
10920 return Res.has_value();
10921 }))
10922 ShufflesRes.clear();
10923 return ShufflesRes;
10924}
10925
10926std::optional<TargetTransformInfo::ShuffleKind>
10927BoUpSLP::isGatherShuffledSingleRegisterEntry(
10928 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10929 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10930 Entries.clear();
10931 // TODO: currently checking only for Scalars in the tree entry, need to count
10932 // reused elements too for better cost estimation.
10933 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10934 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10935 const BasicBlock *TEInsertBlock = nullptr;
10936 // Main node of PHI entries keeps the correct order of operands/incoming
10937 // blocks.
10938 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10939 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10940 TEInsertPt = TEInsertBlock->getTerminator();
10941 } else {
10942 TEInsertBlock = TEInsertPt->getParent();
10943 }
10944 if (!DT->isReachableFromEntry(TEInsertBlock))
10945 return std::nullopt;
10946 auto *NodeUI = DT->getNode(TEInsertBlock);
10947 assert(NodeUI && "Should only process reachable instructions");
10948 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10949 auto CheckOrdering = [&](const Instruction *InsertPt) {
10950 // Argument InsertPt is an instruction where vector code for some other
10951 // tree entry (one that shares one or more scalars with TE) is going to be
10952 // generated. This lambda returns true if insertion point of vector code
10953 // for the TE dominates that point (otherwise dependency is the other way
10954 // around). The other node is not limited to be of a gather kind. Gather
10955 // nodes are not scheduled and their vector code is inserted before their
10956 // first user. If user is PHI, that is supposed to be at the end of a
10957 // predecessor block. Otherwise it is the last instruction among scalars of
10958 // the user node. So, instead of checking dependency between instructions
10959 // themselves, we check dependency between their insertion points for vector
10960 // code (since each scalar instruction ends up as a lane of a vector
10961 // instruction).
10962 const BasicBlock *InsertBlock = InsertPt->getParent();
10963 auto *NodeEUI = DT->getNode(InsertBlock);
10964 if (!NodeEUI)
10965 return false;
10966 assert((NodeUI == NodeEUI) ==
10967 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10968 "Different nodes should have different DFS numbers");
10969 // Check the order of the gather nodes users.
10970 if (TEInsertPt->getParent() != InsertBlock &&
10971 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
10972 return false;
10973 if (TEInsertPt->getParent() == InsertBlock &&
10974 TEInsertPt->comesBefore(InsertPt))
10975 return false;
10976 return true;
10977 };
10978 // Find all tree entries used by the gathered values. If no common entries
10979 // found - not a shuffle.
10980 // Here we build a set of tree nodes for each gathered value and try to
10981 // find the intersection between these sets. If we have at least one common
10982 // tree node for each gathered value - we have just a permutation of the
10983 // single vector. If we have 2 different sets, we're in a situation where we
10984 // have a permutation of 2 input vectors.
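// E.g. (illustrative): for VL = {a, b, c, d}, if a and c are found only in
// tree entry E1 while b and d are found only in E2, UsedTEs keeps two sets and
// the gather becomes a permutation of the two source vectors.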
10985 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10986 DenseMap<Value *, int> UsedValuesEntry;
10987 for (Value *V : VL) {
10988 if (isConstant(V))
10989 continue;
10990 // Build a list of tree entries where V is used.
10991 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10992 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10993 if (TEPtr == TE)
10994 continue;
10995 assert(any_of(TEPtr->Scalars,
10996 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10997 "Must contain at least single gathered value.");
10998 assert(TEPtr->UserTreeIndices.size() == 1 &&
10999 "Expected only single user of a gather node.");
11000 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
11001
11002 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
11003 const Instruction *InsertPt =
11004 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
11005 : &getLastInstructionInBundle(UseEI.UserTE);
11006 if (TEInsertPt == InsertPt) {
11007 // If 2 gathers are operands of the same entry (regardless of whether
11008 // user is PHI or else), compare operands indices, use the earlier one
11009 // as the base.
11010 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
11011 continue;
11012 // If the user instruction is used for some reason in different
11013 // vectorized nodes - make it depend on index.
11014 if (TEUseEI.UserTE != UseEI.UserTE &&
11015 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
11016 continue;
11017 }
11018
11019 // Check if the user node of the TE comes after user node of TEPtr,
11020 // otherwise TEPtr depends on TE.
11021 if ((TEInsertBlock != InsertPt->getParent() ||
11022 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
11023 !CheckOrdering(InsertPt))
11024 continue;
11025 VToTEs.insert(TEPtr);
11026 }
11027 if (const TreeEntry *VTE = getTreeEntry(V)) {
11028 if (ForOrder) {
11029 if (VTE->State != TreeEntry::Vectorize) {
11030 auto It = MultiNodeScalars.find(V);
11031 if (It == MultiNodeScalars.end())
11032 continue;
11033 VTE = *It->getSecond().begin();
11034 // Iterate through all vectorized nodes.
11035 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
11036 return MTE->State == TreeEntry::Vectorize;
11037 });
11038 if (MIt == It->getSecond().end())
11039 continue;
11040 VTE = *MIt;
11041 }
11042 }
11043 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
11044 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
11045 continue;
11046 VToTEs.insert(VTE);
11047 }
11048 if (VToTEs.empty())
11049 continue;
11050 if (UsedTEs.empty()) {
11051 // The first iteration, just insert the list of nodes to vector.
11052 UsedTEs.push_back(VToTEs);
11053 UsedValuesEntry.try_emplace(V, 0);
11054 } else {
11055 // Need to check if there are any previously used tree nodes which use V.
11056 // If there are no such nodes, consider that we have another input
11057 // vector.
11058 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
11059 unsigned Idx = 0;
11060 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
11061 // Do we have a non-empty intersection of previously listed tree entries
11062 // and tree entries using current V?
11063 set_intersect(VToTEs, Set);
11064 if (!VToTEs.empty()) {
11065 // Yes, write the new subset and continue analysis for the next
11066 // scalar.
11067 Set.swap(VToTEs);
11068 break;
11069 }
11070 VToTEs = SavedVToTEs;
11071 ++Idx;
11072 }
11073 // No non-empty intersection found - need to add a second set of possible
11074 // source vectors.
11075 if (Idx == UsedTEs.size()) {
11076 // If the number of input vectors is greater than 2 - not a permutation,
11077 // fallback to the regular gather.
11078 // TODO: support multiple reshuffled nodes.
11079 if (UsedTEs.size() == 2)
11080 continue;
11081 UsedTEs.push_back(SavedVToTEs);
11082 Idx = UsedTEs.size() - 1;
11083 }
11084 UsedValuesEntry.try_emplace(V, Idx);
11085 }
11086 }
11087
11088 if (UsedTEs.empty()) {
11089 Entries.clear();
11090 return std::nullopt;
11091 }
11092
11093 unsigned VF = 0;
11094 if (UsedTEs.size() == 1) {
11095 // Keep the order to avoid non-determinism.
11096 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
11097 UsedTEs.front().end());
11098 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
11099 return TE1->Idx < TE2->Idx;
11100 });
11101 // Try to find the perfect match in another gather node at first.
11102 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
11103 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
11104 });
11105 if (It != FirstEntries.end() &&
11106 ((*It)->getVectorFactor() == VL.size() ||
11107 ((*It)->getVectorFactor() == TE->Scalars.size() &&
11108 TE->ReuseShuffleIndices.size() == VL.size() &&
11109 (*It)->isSame(TE->Scalars)))) {
11110 Entries.push_back(*It);
11111 if ((*It)->getVectorFactor() == VL.size()) {
11112 std::iota(std::next(Mask.begin(), Part * VL.size()),
11113 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
11114 } else {
11115 SmallVector<int> CommonMask = TE->getCommonMask();
11116 copy(CommonMask, Mask.begin());
11117 }
11118 // Clear undef scalars.
11119 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11120 if (isa<PoisonValue>(VL[I]))
11121 Mask[I + Part * VL.size()] = PoisonMaskElem;
11122 return TargetTransformInfo::SK_PermuteSingleSrc;
11123 }
11124 // No perfect match, just shuffle, so choose the first tree node from the
11125 // tree.
11126 Entries.push_back(FirstEntries.front());
11127 } else {
11128 // Try to find nodes with the same vector factor.
11129 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
11130 // Keep the order of tree nodes to avoid non-determinism.
11131 DenseMap<unsigned, const TreeEntry *> VFToTE;
11132 for (const TreeEntry *TE : UsedTEs.front()) {
11133 unsigned VF = TE->getVectorFactor();
11134 auto It = VFToTE.find(VF);
11135 if (It != VFToTE.end()) {
11136 if (It->second->Idx > TE->Idx)
11137 It->getSecond() = TE;
11138 continue;
11139 }
11140 VFToTE.try_emplace(VF, TE);
11141 }
11142 // Same, keep the order to avoid non-determinism.
11143 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
11144 UsedTEs.back().end());
11145 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
11146 return TE1->Idx < TE2->Idx;
11147 });
11148 for (const TreeEntry *TE : SecondEntries) {
11149 auto It = VFToTE.find(TE->getVectorFactor());
11150 if (It != VFToTE.end()) {
11151 VF = It->first;
11152 Entries.push_back(It->second);
11153 Entries.push_back(TE);
11154 break;
11155 }
11156 }
11157 // No 2 source vectors with the same vector factor - just choose 2 with max
11158 // index.
11159 if (Entries.empty()) {
11160 Entries.push_back(*llvm::max_element(
11161 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
11162 return TE1->Idx < TE2->Idx;
11163 }));
11164 Entries.push_back(SecondEntries.front());
11165 VF = std::max(Entries.front()->getVectorFactor(),
11166 Entries.back()->getVectorFactor());
11167 }
11168 }
11169
11170 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
11171 // Checks if the 2 PHIs are compatible in terms of high possibility to be
11172 // vectorized.
11173 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
11174 auto *PHI = cast<PHINode>(V);
11175 auto *PHI1 = cast<PHINode>(V1);
11176 // Check that all incoming values are compatible/from same parent (if they
11177 // are instructions).
11178 // The incoming values are compatible if they all are constants, or
11179 // instruction with the same/alternate opcodes from the same basic block.
11180 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
11181 Value *In = PHI->getIncomingValue(I);
11182 Value *In1 = PHI1->getIncomingValue(I);
11183 if (isConstant(In) && isConstant(In1))
11184 continue;
11185 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
11186 return false;
11187 if (cast<Instruction>(In)->getParent() !=
11188 cast<Instruction>(In1)->getParent())
11189 return false;
11190 }
11191 return true;
11192 };
11193 // Check if the value can be ignored during analysis for shuffled gathers.
11194 // We suppose it is better to ignore instructions which do not form splats,
11195 // are not vectorized/not extractelements (these instructions will be handled
11196 // by extractelements processing) or may form a vector node in the future.
11197 auto MightBeIgnored = [=](Value *V) {
11198 auto *I = dyn_cast<Instruction>(V);
11199 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
11200 !isVectorLikeInstWithConstOps(I) &&
11201 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
11202 };
11203 // Check that the neighbor instruction may form a full vector node with the
11204 // current instruction V. It is possible, if they have same/alternate opcode
11205 // and same parent basic block.
11206 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
11207 Value *V1 = VL[Idx];
11208 bool UsedInSameVTE = false;
11209 auto It = UsedValuesEntry.find(V1);
11210 if (It != UsedValuesEntry.end())
11211 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
11212 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11213 getSameOpcode({V, V1}, *TLI).getOpcode() &&
11214 cast<Instruction>(V)->getParent() ==
11215 cast<Instruction>(V1)->getParent() &&
11216 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
11217 };
11218 // Build a shuffle mask for better cost estimation and vector emission.
11219 SmallBitVector UsedIdxs(Entries.size());
11220 SmallVector<std::pair<unsigned, int>> EntryLanes;
11221 for (int I = 0, E = VL.size(); I < E; ++I) {
11222 Value *V = VL[I];
11223 auto It = UsedValuesEntry.find(V);
11224 if (It == UsedValuesEntry.end())
11225 continue;
11226 // Do not try to shuffle scalars, if they are constants, or instructions
11227 // that can be vectorized as a result of the following vector build
11228 // vectorization.
11229 if (isConstant(V) || (MightBeIgnored(V) &&
11230 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
11231 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
11232 continue;
11233 unsigned Idx = It->second;
11234 EntryLanes.emplace_back(Idx, I);
11235 UsedIdxs.set(Idx);
11236 }
11237 // Iterate through all shuffled scalars and select entries, which can be used
11238 // for final shuffle.
11239 SmallVector<const TreeEntry *> TempEntries;
11240 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
11241 if (!UsedIdxs.test(I))
11242 continue;
11243 // Fix the entry number for the given scalar. If it is the first entry, set
11244 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
11245 // These indices are used when calculating final shuffle mask as the vector
11246 // offset.
11247 for (std::pair<unsigned, int> &Pair : EntryLanes)
11248 if (Pair.first == I)
11249 Pair.first = TempEntries.size();
11250 TempEntries.push_back(Entries[I]);
11251 }
11252 Entries.swap(TempEntries);
11253 if (EntryLanes.size() == Entries.size() &&
11254 !VL.equals(ArrayRef(TE->Scalars)
11255 .slice(Part * VL.size(),
11256 std::min<int>(VL.size(), TE->Scalars.size())))) {
11257 // We may have only 1 or 2 entries here. If the number of scalars is equal
11258 // to the number of entries, there is no need to do the analysis, it is not
11259 // very profitable. Since VL is not the same as TE->Scalars, it means we
11260 // already have some shuffles before. Cut off the unprofitable case.
11261 Entries.clear();
11262 return std::nullopt;
11263 }
11264 // Build the final mask, check for the identity shuffle, if possible.
11265 bool IsIdentity = Entries.size() == 1;
11266 // Pair.first is the offset to the vector, while Pair.second is the index of
11267 // scalar in the list.
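// E.g. (illustrative): with VF == 4, a scalar at list position 2 taken from
// lane 1 of the second selected entry (Pair.first == 1) produces
// Mask[Part * VL.size() + 2] = 1 * 4 + 1 == 5.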
11268 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
11269 unsigned Idx = Part * VL.size() + Pair.second;
11270 Mask[Idx] =
11271 Pair.first * VF +
11272 (ForOrder ? std::distance(
11273 Entries[Pair.first]->Scalars.begin(),
11274 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
11275 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
11276 IsIdentity &= Mask[Idx] == Pair.second;
11277 }
11278 switch (Entries.size()) {
11279 case 1:
11280 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11281 return TargetTransformInfo::SK_PermuteSingleSrc;
11282 break;
11283 case 2:
11284 if (EntryLanes.size() > 2 || VL.size() <= 2)
11285 return TargetTransformInfo::SK_PermuteTwoSrc;
11286 break;
11287 default:
11288 break;
11289 }
11290 Entries.clear();
11291 // Clear the corresponding mask elements.
11292 std::fill(std::next(Mask.begin(), Part * VL.size()),
11293 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
11294 return std::nullopt;
11295}
11296
11297 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
11298 BoUpSLP::isGatherShuffledEntry(
11299 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
11300 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
11301 bool ForOrder) {
11302 assert(NumParts > 0 && NumParts < VL.size() &&
11303 "Expected positive number of registers.");
11304 Entries.clear();
11305 // No need to check for the topmost gather node.
11306 if (TE == VectorizableTree.front().get())
11307 return {};
11308 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11309 if (TE->isNonPowOf2Vec())
11310 return {};
11311 Mask.assign(VL.size(), PoisonMaskElem);
11312 assert(TE->UserTreeIndices.size() == 1 &&
11313 "Expected only single user of the gather node.");
11314 assert(VL.size() % NumParts == 0 &&
11315 "Number of scalars must be divisible by NumParts.");
11316 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
11317 SmallVector<std::optional<TTI::ShuffleKind>> Res;
11318 for (unsigned Part : seq<unsigned>(NumParts)) {
11319 ArrayRef<Value *> SubVL =
11320 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
11321 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
11322 std::optional<TTI::ShuffleKind> SubRes =
11323 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
11324 ForOrder);
11325 if (!SubRes)
11326 SubEntries.clear();
11327 Res.push_back(SubRes);
11328 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
11329 SubEntries.front()->getVectorFactor() == VL.size() &&
11330 (SubEntries.front()->isSame(TE->Scalars) ||
11331 SubEntries.front()->isSame(VL))) {
11332 SmallVector<const TreeEntry *> LocalSubEntries;
11333 LocalSubEntries.swap(SubEntries);
11334 Entries.clear();
11335 Res.clear();
11336 std::iota(Mask.begin(), Mask.end(), 0);
11337 // Clear undef scalars.
11338 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11339 if (isa<PoisonValue>(VL[I]))
11340 Mask[I] = PoisonMaskElem;
11341 Entries.emplace_back(1, LocalSubEntries.front());
11342 Res.push_back(TTI::SK_PermuteSingleSrc);
11343 return Res;
11344 }
11345 }
11346 if (all_of(Res,
11347 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
11348 Entries.clear();
11349 return {};
11350 }
11351 return Res;
11352}
11353
11354InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
11355 Type *ScalarTy) const {
11356 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11357 bool DuplicateNonConst = false;
11358 // Find the cost of inserting/extracting values from the vector.
11359 // Check if the same elements are inserted several times and count them as
11360 // shuffle candidates.
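// E.g. (illustrative): for VL = {a, b, a, c} the repeated 'a' is not costed as
// another insert; its bit is set in ShuffledElements and ShuffleMask[2] = 0, so
// the duplicate lane is produced by the final single-source permute instead.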
11361 APInt ShuffledElements = APInt::getZero(VL.size());
11362 DenseMap<Value *, unsigned> UniqueElements;
11363 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11364 InstructionCost Cost;
11365 auto EstimateInsertCost = [&](unsigned I, Value *V) {
11366 if (V->getType() != ScalarTy) {
11367 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
11368 TTI::CastContextHint::None, CostKind);
11369 V = nullptr;
11370 }
11371 if (!ForPoisonSrc)
11372 Cost +=
11373 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
11374 I, Constant::getNullValue(VecTy), V);
11375 };
11376 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
11377 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
11378 Value *V = VL[I];
11379 // No need to shuffle duplicates for constants.
11380 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
11381 ShuffledElements.setBit(I);
11382 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
11383 continue;
11384 }
11385
11386 auto Res = UniqueElements.try_emplace(V, I);
11387 if (Res.second) {
11388 EstimateInsertCost(I, V);
11389 ShuffleMask[I] = I;
11390 continue;
11391 }
11392
11393 DuplicateNonConst = true;
11394 ShuffledElements.setBit(I);
11395 ShuffleMask[I] = Res.first->second;
11396 }
11397 if (ForPoisonSrc)
11398 Cost =
11399 TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
11400 /*Extract*/ false, CostKind);
11401 if (DuplicateNonConst)
11402 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
11403 VecTy, ShuffleMask);
11404 return Cost;
11405}
11406
11407// Perform operand reordering on the instructions in VL and return the reordered
11408// operands in Left and Right.
11409void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
11410 SmallVectorImpl<Value *> &Left,
11411 SmallVectorImpl<Value *> &Right,
11412 const BoUpSLP &R) {
11413 if (VL.empty())
11414 return;
11415 VLOperands Ops(VL, R);
11416 // Reorder the operands in place.
11417 Ops.reorder();
11418 Left = Ops.getVL(0);
11419 Right = Ops.getVL(1);
11420}
11421
11422Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
11423 auto &Res = EntryToLastInstruction.FindAndConstruct(E);
11424 if (Res.second)
11425 return *Res.second;
11426 // Get the basic block this bundle is in. All instructions in the bundle
11427 // should be in this block (except for extractelement-like instructions with
11428 // constant indices).
11429 auto *Front = E->getMainOp();
11430 auto *BB = Front->getParent();
11431 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
11432 if (E->getOpcode() == Instruction::GetElementPtr &&
11433 !isa<GetElementPtrInst>(V))
11434 return true;
11435 auto *I = cast<Instruction>(V);
11436 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11437 isVectorLikeInstWithConstOps(I);
11438 }));
11439
11440 auto FindLastInst = [&]() {
11441 Instruction *LastInst = Front;
11442 for (Value *V : E->Scalars) {
11443 auto *I = dyn_cast<Instruction>(V);
11444 if (!I)
11445 continue;
11446 if (LastInst->getParent() == I->getParent()) {
11447 if (LastInst->comesBefore(I))
11448 LastInst = I;
11449 continue;
11450 }
11451 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11452 !isa<GetElementPtrInst>(I)) ||
11453 (isVectorLikeInstWithConstOps(LastInst) &&
11455 "Expected vector-like or non-GEP in GEP node insts only.");
11456 if (!DT->isReachableFromEntry(LastInst->getParent())) {
11457 LastInst = I;
11458 continue;
11459 }
11460 if (!DT->isReachableFromEntry(I->getParent()))
11461 continue;
11462 auto *NodeA = DT->getNode(LastInst->getParent());
11463 auto *NodeB = DT->getNode(I->getParent());
11464 assert(NodeA && "Should only process reachable instructions");
11465 assert(NodeB && "Should only process reachable instructions");
11466 assert((NodeA == NodeB) ==
11467 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11468 "Different nodes should have different DFS numbers");
11469 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11470 LastInst = I;
11471 }
11472 BB = LastInst->getParent();
11473 return LastInst;
11474 };
11475
11476 auto FindFirstInst = [&]() {
11477 Instruction *FirstInst = Front;
11478 for (Value *V : E->Scalars) {
11479 auto *I = dyn_cast<Instruction>(V);
11480 if (!I)
11481 continue;
11482 if (FirstInst->getParent() == I->getParent()) {
11483 if (I->comesBefore(FirstInst))
11484 FirstInst = I;
11485 continue;
11486 }
11487 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11488 !isa<GetElementPtrInst>(I)) ||
11489 (isVectorLikeInstWithConstOps(FirstInst) &&
11491 "Expected vector-like or non-GEP in GEP node insts only.");
11492 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
11493 FirstInst = I;
11494 continue;
11495 }
11496 if (!DT->isReachableFromEntry(I->getParent()))
11497 continue;
11498 auto *NodeA = DT->getNode(FirstInst->getParent());
11499 auto *NodeB = DT->getNode(I->getParent());
11500 assert(NodeA && "Should only process reachable instructions");
11501 assert(NodeB && "Should only process reachable instructions");
11502 assert((NodeA == NodeB) ==
11503 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11504 "Different nodes should have different DFS numbers");
11505 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11506 FirstInst = I;
11507 }
11508 return FirstInst;
11509 };
11510
11511 // Set the insert point to the beginning of the basic block if the entry
11512 // should not be scheduled.
11513 if (doesNotNeedToSchedule(E->Scalars) ||
11514 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
11515 if ((E->getOpcode() == Instruction::GetElementPtr &&
11516 any_of(E->Scalars,
11517 [](Value *V) {
11518 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11519 })) ||
11520 all_of(E->Scalars,
11521 [](Value *V) {
11522 return !isVectorLikeInstWithConstOps(V) &&
11523 isUsedOutsideBlock(V);
11524 }) ||
11525 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
11526 return isa<ExtractElementInst, UndefValue>(V) ||
11527 areAllOperandsNonInsts(V);
11528 })))
11529 Res.second = FindLastInst();
11530 else
11531 Res.second = FindFirstInst();
11532 return *Res.second;
11533 }
11534
11535 // Find the last instruction. The common case should be that BB has been
11536 // scheduled, and the last instruction is VL.back(). So we start with
11537 // VL.back() and iterate over schedule data until we reach the end of the
11538 // bundle. The end of the bundle is marked by null ScheduleData.
11539 if (BlocksSchedules.count(BB)) {
11540 Value *V = E->isOneOf(E->Scalars.back());
11541 if (doesNotNeedToBeScheduled(V))
11542 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11543 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11544 if (Bundle && Bundle->isPartOfBundle())
11545 for (; Bundle; Bundle = Bundle->NextInBundle)
11546 if (Bundle->OpValue == Bundle->Inst)
11547 Res.second = Bundle->Inst;
11548 }
11549
11550 // LastInst can still be null at this point if there's either not an entry
11551 // for BB in BlocksSchedules or there's no ScheduleData available for
11552 // VL.back(). This can be the case if buildTree_rec aborts for various
11553 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11554 // size is reached, etc.). ScheduleData is initialized in the scheduling
11555 // "dry-run".
11556 //
11557 // If this happens, we can still find the last instruction by brute force. We
11558 // iterate forwards from Front (inclusive) until we either see all
11559 // instructions in the bundle or reach the end of the block. If Front is the
11560 // last instruction in program order, LastInst will be set to Front, and we
11561 // will visit all the remaining instructions in the block.
11562 //
11563 // One of the reasons we exit early from buildTree_rec is to place an upper
11564 // bound on compile-time. Thus, taking an additional compile-time hit here is
11565 // not ideal. However, this should be exceedingly rare since it requires that
11566 // we both exit early from buildTree_rec and that the bundle be out-of-order
11567 // (causing us to iterate all the way to the end of the block).
11568 if (!Res.second)
11569 Res.second = FindLastInst();
11570 assert(Res.second && "Failed to find last instruction in bundle");
11571 return *Res.second;
11572}
11573
11574void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11575 auto *Front = E->getMainOp();
11576 Instruction *LastInst = &getLastInstructionInBundle(E);
11577 assert(LastInst && "Failed to find last instruction in bundle");
11578 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11579 // If the instruction is PHI, set the insert point after all the PHIs.
11580 bool IsPHI = isa<PHINode>(LastInst);
11581 if (IsPHI)
11582 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11583 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
11584 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11585 } else {
11586 // Set the insertion point after the last instruction in the bundle. Set the
11587 // debug location to Front.
11588 Builder.SetInsertPoint(
11589 LastInst->getParent(),
11590 std::next(LastInst->getIterator()));
11591 }
11592 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11593}
11594
11595Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
11596 // List of instructions/lanes from current block and/or the blocks which are
11597 // part of the current loop. These instructions will be inserted at the end to
11598 // make it possible to optimize loops and hoist invariant instructions out of
11599 // the loop's body with better chances for success.
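// E.g. (illustrative): when gathering {%inv, %x} inside a loop where %x is
// defined in the loop body and %inv is loop-invariant, the insertelement for
// %x is postponed to the end of the sequence so that the invariant part of the
// build vector can still be hoisted out of the loop.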
11600 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
11601 SmallSet<int, 4> PostponedIndices;
11602 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11603 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11605 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11606 InsertBB = InsertBB->getSinglePredecessor();
11607 return InsertBB && InsertBB == InstBB;
11608 };
11609 for (int I = 0, E = VL.size(); I < E; ++I) {
11610 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11611 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11612 getTreeEntry(Inst) ||
11613 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11614 PostponedIndices.insert(I).second)
11615 PostponedInsts.emplace_back(Inst, I);
11616 }
11617
11618 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11619 Type *Ty) {
11620 Value *Scalar = V;
11621 if (Scalar->getType() != Ty) {
11622 assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11623 "Expected integer types only.");
11624 Value *V = Scalar;
11625 if (auto *CI = dyn_cast<CastInst>(Scalar);
11626 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
11627 Value *Op = CI->getOperand(0);
11628 if (auto *IOp = dyn_cast<Instruction>(Op);
11629 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
11630 V = Op;
11631 }
11632 Scalar = Builder.CreateIntCast(
11633 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
11634 }
11635
11636 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11637 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11638 if (!InsElt)
11639 return Vec;
11640 GatherShuffleExtractSeq.insert(InsElt);
11641 CSEBlocks.insert(InsElt->getParent());
11642 // Add to our 'need-to-extract' list.
11643 if (isa<Instruction>(V)) {
11644 if (TreeEntry *Entry = getTreeEntry(V)) {
11645 // Find which lane we need to extract.
11646 User *UserOp = nullptr;
11647 if (Scalar != V) {
11648 if (auto *SI = dyn_cast<Instruction>(Scalar))
11649 UserOp = SI;
11650 } else {
11651 UserOp = InsElt;
11652 }
11653 if (UserOp) {
11654 unsigned FoundLane = Entry->findLaneForValue(V);
11655 ExternalUses.emplace_back(V, UserOp, FoundLane);
11656 }
11657 }
11658 }
11659 return Vec;
11660 };
11661 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11662 Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11663 SmallVector<int> NonConsts;
11664 // Insert constant values at first.
11665 for (int I = 0, E = VL.size(); I < E; ++I) {
11666 if (PostponedIndices.contains(I))
11667 continue;
11668 if (!isConstant(VL[I])) {
11669 NonConsts.push_back(I);
11670 continue;
11671 }
11672 if (Root) {
11673 if (!isa<UndefValue>(VL[I])) {
11674 NonConsts.push_back(I);
11675 continue;
11676 }
11677 if (isa<PoisonValue>(VL[I]))
11678 continue;
11679 if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11680 if (SV->getMaskValue(I) == PoisonMaskElem)
11681 continue;
11682 }
11683 }
11684 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11685 }
11686 // Insert non-constant values.
11687 for (int I : NonConsts)
11688 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11689 // Append instructions, which are/may be part of the loop, in the end to make
11690 // it possible to hoist non-loop-based instructions.
11691 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11692 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11693
11694 return Vec;
11695}
11696
11697 /// Merges shuffle masks and emits the final shuffle instruction, if required.
11698 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
11699 /// emission: the actual shuffle instruction is generated only if it is really
11700 /// required. Otherwise, the shuffle instruction emission is delayed till the
11701 /// end of the process, to reduce the number of emitted instructions and further
11702 /// analysis/transformations.
11703 /// The class will also look through the previously emitted shuffle instructions
11704 /// and properly mark indices in the mask as undef.
11705/// For example, given the code
11706/// \code
11707/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11708/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11709/// \endcode
11710 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
11711/// look through %s1 and %s2 and emit
11712/// \code
11713/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11714/// \endcode
11715/// instead.
11716/// If 2 operands are of different size, the smallest one will be resized and
11717/// the mask recalculated properly.
11718/// For example, given the code
11719/// \code
11720/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11721/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11722/// \endcode
11723 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
11724/// look through %s1 and %s2 and emit
11725/// \code
11726/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11727/// \endcode
11728/// instead.
11729class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11730 bool IsFinalized = false;
11731 /// Combined mask for all applied operands and masks. It is built during
11732 /// analysis and actual emission of shuffle vector instructions.
11733 SmallVector<int> CommonMask;
11734 /// List of operands for the shuffle vector instruction. It holds at most 2
11735 /// operands; if a 3rd is going to be added, the first 2 are combined into a
11736 /// shuffle with the \p CommonMask mask, the first operand is set to be the
11737 /// resulting shuffle and the second operand is set to be the newly added
11738 /// operand. The \p CommonMask is transformed in the proper way after that.
11739 SmallVector<Value *, 2> InVectors;
11740 Type *ScalarTy = nullptr;
11741 IRBuilderBase &Builder;
11742 BoUpSLP &R;
11743
11744 class ShuffleIRBuilder {
11745 IRBuilderBase &Builder;
11746 /// Holds all of the instructions that we gathered.
11747 SetVector<Instruction *> &GatherShuffleExtractSeq;
11748 /// A list of blocks that we are going to CSE.
11749 DenseSet<BasicBlock *> &CSEBlocks;
11750 /// Data layout.
11751 const DataLayout &DL;
11752
11753 public:
11754 ShuffleIRBuilder(IRBuilderBase &Builder,
11755 SetVector<Instruction *> &GatherShuffleExtractSeq,
11756 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11757 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11758 CSEBlocks(CSEBlocks), DL(DL) {}
11759 ~ShuffleIRBuilder() = default;
11760 /// Creates shufflevector for the 2 operands with the given mask.
11761 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11762 if (V1->getType() != V2->getType()) {
11763 assert(V1->getType()->isIntOrIntVectorTy() &&
11764 V2->getType()->isIntOrIntVectorTy() &&
11765 "Expected integer vector types only.");
11766 if (V1->getType() != V2->getType()) {
11767 if (cast<VectorType>(V2->getType())
11768 ->getElementType()
11769 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
11770 ->getElementType()
11771 ->getIntegerBitWidth())
11772 V2 = Builder.CreateIntCast(
11773 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
11774 else
11775 V1 = Builder.CreateIntCast(
11776 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
11777 }
11778 }
11779 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11780 if (auto *I = dyn_cast<Instruction>(Vec)) {
11781 GatherShuffleExtractSeq.insert(I);
11782 CSEBlocks.insert(I->getParent());
11783 }
11784 return Vec;
11785 }
11786 /// Creates permutation of the single vector operand with the given mask, if
11787 /// it is not identity mask.
11788 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11789 if (Mask.empty())
11790 return V1;
11791 unsigned VF = Mask.size();
11792 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
11793 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
11794 return V1;
11795 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
11796 if (auto *I = dyn_cast<Instruction>(Vec)) {
11797 GatherShuffleExtractSeq.insert(I);
11798 CSEBlocks.insert(I->getParent());
11799 }
11800 return Vec;
11801 }
11802 Value *createIdentity(Value *V) { return V; }
11803 Value *createPoison(Type *Ty, unsigned VF) {
11804 return PoisonValue::get(getWidenedType(Ty, VF));
11805 }
11806 /// Resizes the 2 input vectors to match in size, if they are not equal
11807 /// yet. The smaller vector is resized to the size of the larger vector.
11808 void resizeToMatch(Value *&V1, Value *&V2) {
11809 if (V1->getType() == V2->getType())
11810 return;
11811 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
11812 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11813 int VF = std::max(V1VF, V2VF);
11814 int MinVF = std::min(V1VF, V2VF);
11815 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11816 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
11817 0);
11818 Value *&Op = MinVF == V1VF ? V1 : V2;
11819 Op = Builder.CreateShuffleVector(Op, IdentityMask);
11820 if (auto *I = dyn_cast<Instruction>(Op)) {
11821 GatherShuffleExtractSeq.insert(I);
11822 CSEBlocks.insert(I->getParent());
11823 }
11824 if (MinVF == V1VF)
11825 V1 = Op;
11826 else
11827 V2 = Op;
11828 }
11829 };
11830
11831 /// Smart shuffle instruction emission, walks through shuffles trees and
11832 /// tries to find the best matching vector for the actual shuffle
11833 /// instruction.
11834 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11835 assert(V1 && "Expected at least one vector value.");
11836 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11837 R.CSEBlocks, *R.DL);
11838 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11839 ShuffleBuilder);
11840 }
11841
11842 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
11843 /// shuffle emission.
11844 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11845 ArrayRef<int> Mask) {
11846 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11847 if (Mask[Idx] != PoisonMaskElem)
11848 CommonMask[Idx] = Idx;
11849 }
11850
11851 /// Cast value \p V to the vector type with the same number of elements, but
11852 /// the base type \p ScalarTy.
11853 Value *castToScalarTyElem(Value *V,
11854 std::optional<bool> IsSigned = std::nullopt) {
11855 auto *VecTy = cast<VectorType>(V->getType());
11856 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
11857 if (VecTy->getElementType() == ScalarTy->getScalarType())
11858 return V;
11859 return Builder.CreateIntCast(
11860 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
11861 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
11862 }
11863
11864public:
11865 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
11866 : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11867
11868 /// Adjusts extractelements after reusing them.
11869 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11870 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11871 unsigned NumParts, bool &UseVecBaseAsInput) {
11872 UseVecBaseAsInput = false;
11873 SmallPtrSet<Value *, 4> UniqueBases;
11874 Value *VecBase = nullptr;
11875 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11876 int Idx = Mask[I];
11877 if (Idx == PoisonMaskElem)
11878 continue;
11879 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
11880 VecBase = EI->getVectorOperand();
11881 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
11882 VecBase = TE->VectorizedValue;
11883 assert(VecBase && "Expected vectorized value.");
11884 UniqueBases.insert(VecBase);
11885 // If the only use is vectorized - the extractelement itself can be
11886 // deleted.
11887 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
11888 any_of(EI->users(), [&](User *U) {
11889 const TreeEntry *UTE = R.getTreeEntry(U);
11890 return !UTE || R.MultiNodeScalars.contains(U) ||
11891 (isa<GetElementPtrInst>(U) &&
11892 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
11893 count_if(R.VectorizableTree,
11894 [&](const std::unique_ptr<TreeEntry> &TE) {
11895 return any_of(TE->UserTreeIndices,
11896 [&](const EdgeInfo &Edge) {
11897 return Edge.UserTE == UTE;
11898 }) &&
11899 is_contained(TE->Scalars, EI);
11900 }) != 1;
11901 }))
11902 continue;
11903 R.eraseInstruction(EI);
11904 }
11905 if (NumParts == 1 || UniqueBases.size() == 1) {
11906 assert(VecBase && "Expected vectorized value.");
11907 return castToScalarTyElem(VecBase);
11908 }
11909 UseVecBaseAsInput = true;
11910 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11911 for (auto [I, Idx] : enumerate(Mask))
11912 if (Idx != PoisonMaskElem)
11913 Idx = I;
11914 };
11915 // Perform a multi-register vector shuffle, joining the parts into a single
11916 // virtual long vector.
11917 // Need to shuffle each part independently and then insert all these parts
11918 // into a long virtual vector register, forming the original vector.
11919 Value *Vec = nullptr;
11920 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11921 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
11922 for (unsigned Part : seq<unsigned>(NumParts)) {
11923 unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
11924 ArrayRef<Value *> VL =
11925 ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
11926 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
11927 constexpr int MaxBases = 2;
11928 SmallVector<Value *, MaxBases> Bases(MaxBases);
11929 auto VLMask = zip(VL, SubMask);
11930 const unsigned VF = std::accumulate(
11931 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
11932 if (std::get<1>(D) == PoisonMaskElem)
11933 return S;
11934 Value *VecOp =
11935 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
11936 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11937 VecOp = TE->VectorizedValue;
11938 assert(VecOp && "Expected vectorized value.");
11939 const unsigned Size =
11940 cast<FixedVectorType>(VecOp->getType())->getNumElements();
11941 return std::max(S, Size);
11942 });
11943 for (const auto [V, I] : VLMask) {
11944 if (I == PoisonMaskElem)
11945 continue;
11946 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11947 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11948 VecOp = TE->VectorizedValue;
11949 assert(VecOp && "Expected vectorized value.");
11950 VecOp = castToScalarTyElem(VecOp);
11951 Bases[I / VF] = VecOp;
11952 }
11953 if (!Bases.front())
11954 continue;
11955 Value *SubVec;
11956 if (Bases.back()) {
11957 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11958 TransformToIdentity(SubMask);
11959 } else {
11960 SubVec = Bases.front();
11961 }
11962 if (!Vec) {
11963 Vec = SubVec;
11964 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11965 [&](unsigned P) {
11966 ArrayRef<int> SubMask =
11967 Mask.slice(P * SliceSize,
11968 getNumElems(Mask.size(),
11969 SliceSize, P));
11970 return all_of(SubMask, [](int Idx) {
11971 return Idx == PoisonMaskElem;
11972 });
11973 })) &&
11974 "Expected first part or all previous parts masked.");
11975 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11976 } else {
11977 unsigned NewVF =
11978 cast<FixedVectorType>(Vec->getType())->getNumElements();
11979 if (Vec->getType() != SubVec->getType()) {
11980 unsigned SubVecVF =
11981 cast<FixedVectorType>(SubVec->getType())->getNumElements();
11982 NewVF = std::max(NewVF, SubVecVF);
11983 }
11984 // Adjust SubMask.
11985 for (int &Idx : SubMask)
11986 if (Idx != PoisonMaskElem)
11987 Idx += NewVF;
11988 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11989 Vec = createShuffle(Vec, SubVec, VecMask);
11990 TransformToIdentity(VecMask);
11991 }
11992 }
11993 copy(VecMask, Mask.begin());
11994 return Vec;
11995 }
11996 /// Checks if the specified entry \p E needs to be delayed because of its
11997 /// dependency nodes.
11998 std::optional<Value *>
11999 needToDelay(const TreeEntry *E,
12000 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
12001 // No need to delay emission if all deps are ready.
12002 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
12003 return all_of(
12004 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
12005 }))
12006 return std::nullopt;
12007 // Postpone gather emission, will be emitted after the end of the
12008 // process to keep correct order.
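// The aligned load from a poison pointer below serves only as a typed
// placeholder value of the right vector width; the postponed gather is
// emitted later in the proper order and the placeholder's uses are then
// replaced.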
12009 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
12010 return Builder.CreateAlignedLoad(
12011 ResVecTy,
12012 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
12013 MaybeAlign());
12014 }
12015 /// Adds 2 input vectors (in the form of tree entries) and the mask for their
12016 /// shuffling.
12017 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
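// If an operand was vectorized at a narrower integer width, cast it up to
// the common scalar element type first; the extension is treated as signed
// whenever any of the entry's scalars may be negative.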
12018 Value *V1 = E1.VectorizedValue;
12019 if (V1->getType()->isIntOrIntVectorTy())
12020 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
12021 return !isKnownNonNegative(
12022 V, SimplifyQuery(*R.DL));
12023 }));
12024 Value *V2 = E2.VectorizedValue;
12025 if (V2->getType()->isIntOrIntVectorTy())
12026 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
12027 return !isKnownNonNegative(
12028 V, SimplifyQuery(*R.DL));
12029 }));
12030 add(V1, V2, Mask);
12031 }
12032 /// Adds a single input vector (in the form of a tree entry) and the mask for
12033 /// its shuffling.
12034 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
12035 Value *V1 = E1.VectorizedValue;
12036 if (V1->getType()->isIntOrIntVectorTy())
12037 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
12038 return !isKnownNonNegative(
12039 V, SimplifyQuery(*R.DL));
12040 }));
12041 add(V1, Mask);
12042 }
12043 /// Adds 2 input vectors and the mask for their shuffling.
12044 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
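// InVectors never holds more than two sources. If two are already queued,
// they are pre-shuffled into one vector and CommonMask is rewritten to
// identity positions before the new pair is folded in; the freshly shuffled
// pair then occupies the second slot, with its lanes referenced past the end
// of the first source (Idx + Sz) in CommonMask.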
12045 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
12046 assert(isa<FixedVectorType>(V1->getType()) &&
12047 isa<FixedVectorType>(V2->getType()) &&
12048 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
12049 V1 = castToScalarTyElem(V1);
12050 V2 = castToScalarTyElem(V2);
12051 if (InVectors.empty()) {
12052 InVectors.push_back(V1);
12053 InVectors.push_back(V2);
12054 CommonMask.assign(Mask.begin(), Mask.end());
12055 return;
12056 }
12057 Value *Vec = InVectors.front();
12058 if (InVectors.size() == 2) {
12059 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
12060 transformMaskAfterShuffle(CommonMask, CommonMask);
12061 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
12062 Mask.size()) {
12063 Vec = createShuffle(Vec, nullptr, CommonMask);
12064 transformMaskAfterShuffle(CommonMask, CommonMask);
12065 }
12066 V1 = createShuffle(V1, V2, Mask);
12067 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12068 if (Mask[Idx] != PoisonMaskElem)
12069 CommonMask[Idx] = Idx + Sz;
12070 InVectors.front() = Vec;
12071 if (InVectors.size() == 2)
12072 InVectors.back() = V1;
12073 else
12074 InVectors.push_back(V1);
12075 }
12076 /// Adds one more input vector and the mask for its shuffling.
12077 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
12078 assert(isa<FixedVectorType>(V1->getType()) &&
12079 "castToScalarTyElem expects V1 to be FixedVectorType");
12080 V1 = castToScalarTyElem(V1);
12081 if (InVectors.empty()) {
12082 InVectors.push_back(V1);
12083 CommonMask.assign(Mask.begin(), Mask.end());
12084 return;
12085 }
12086 const auto *It = find(InVectors, V1);
12087 if (It == InVectors.end()) {
12088 if (InVectors.size() == 2 ||
12089 InVectors.front()->getType() != V1->getType()) {
12090 Value *V = InVectors.front();
12091 if (InVectors.size() == 2) {
12092 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12093 transformMaskAfterShuffle(CommonMask, CommonMask);
12094 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
12095 CommonMask.size()) {
12096 V = createShuffle(InVectors.front(), nullptr, CommonMask);
12097 transformMaskAfterShuffle(CommonMask, CommonMask);
12098 }
12099 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12100 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
12101 CommonMask[Idx] =
12102 V->getType() != V1->getType()
12103 ? Idx + Sz
12104 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
12105 ->getNumElements();
12106 if (V->getType() != V1->getType())
12107 V1 = createShuffle(V1, nullptr, Mask);
12108 InVectors.front() = V;
12109 if (InVectors.size() == 2)
12110 InVectors.back() = V1;
12111 else
12112 InVectors.push_back(V1);
12113 return;
12114 }
12115 // Check if the second vector is actually required: it is needed only if
12116 // some requested element is not already provided by the first one.
12117 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12118 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
12119 InVectors.push_back(V1);
12120 break;
12121 }
12122 }
12123 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
12124 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12125 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
12126 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
12127 }
12128 /// Adds one more input vector and the mask for its shuffling.
12129 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
12130 SmallVector<int> NewMask;
12131 inversePermutation(Order, NewMask);
12132 add(V1, NewMask);
12133 }
12134 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
12135 Value *Root = nullptr) {
12136 return R.gather(VL, Root, ScalarTy);
12137 }
12138 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
12139 /// Finalize emission of the shuffles.
12140 /// \param Action the action (if any) to be performed before the final
12141 /// application of the \p ExtMask mask.
12142 Value *
12143 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
12144 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
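// When an Action callback is supplied, the queued sources are first folded
// into a single vector, widened up to VF with a trailing-poison resize
// shuffle if necessary, and handed to the callback (which may gather extra
// scalars into it and update CommonMask) before the final shuffle with
// ExtMask/CommonMask is emitted.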
12145 IsFinalized = true;
12146 if (Action) {
12147 Value *Vec = InVectors.front();
12148 if (InVectors.size() == 2) {
12149 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
12150 InVectors.pop_back();
12151 } else {
12152 Vec = createShuffle(Vec, nullptr, CommonMask);
12153 }
12154 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12155 if (CommonMask[Idx] != PoisonMaskElem)
12156 CommonMask[Idx] = Idx;
12157 assert(VF > 0 &&
12158 "Expected vector length for the final value before action.");
12159 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
12160 if (VecVF < VF) {
12161 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
12162 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
12163 Vec = createShuffle(Vec, nullptr, ResizeMask);
12164 }
12165 Action(Vec, CommonMask);
12166 InVectors.front() = Vec;
12167 }
12168 if (!ExtMask.empty()) {
12169 if (CommonMask.empty()) {
12170 CommonMask.assign(ExtMask.begin(), ExtMask.end());
12171 } else {
12172 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12173 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12174 if (ExtMask[I] == PoisonMaskElem)
12175 continue;
12176 NewMask[I] = CommonMask[ExtMask[I]];
12177 }
12178 CommonMask.swap(NewMask);
12179 }
12180 }
12181 if (CommonMask.empty()) {
12182 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
12183 return InVectors.front();
12184 }
12185 if (InVectors.size() == 2)
12186 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12187 return createShuffle(InVectors.front(), nullptr, CommonMask);
12188 }
12189
12190 ~ShuffleInstructionBuilder() {
12191 assert((IsFinalized || CommonMask.empty()) &&
12192 "Shuffle construction must be finalized.");
12193 }
12194};
12195
12196Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12197 bool PostponedPHIs) {
12198 ValueList &VL = E->getOperand(NodeIdx);
12199 const unsigned VF = VL.size();
12200 InstructionsState S = getSameOpcode(VL, *TLI);
12201 // Special processing for GEPs bundle, which may include non-gep values.
12202 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
12203 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
12204 if (It != VL.end())
12205 S = getSameOpcode(*It, *TLI);
12206 }
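// If the operand bundle has a common opcode, try to reuse an existing tree
// entry: either one attached to this exact (user, operand-index) edge or one
// whose scalars match an operand gather node for this edge. Only if no such
// entry (or multi-node alternative) is found does the code fall back to
// vectorizing the dedicated gather node at the end of this function.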
12207 if (S.getOpcode()) {
12208 auto CheckSameVE = [&](const TreeEntry *VE) {
12209 return VE->isSame(VL) &&
12210 (any_of(VE->UserTreeIndices,
12211 [E, NodeIdx](const EdgeInfo &EI) {
12212 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12213 }) ||
12214 any_of(VectorizableTree,
12215 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12216 return TE->isOperandGatherNode({E, NodeIdx}) &&
12217 VE->isSame(TE->Scalars);
12218 }));
12219 };
12220 TreeEntry *VE = getTreeEntry(S.OpValue);
12221 bool IsSameVE = VE && CheckSameVE(VE);
12222 if (!IsSameVE) {
12223 auto It = MultiNodeScalars.find(S.OpValue);
12224 if (It != MultiNodeScalars.end()) {
12225 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12226 return TE != VE && CheckSameVE(TE);
12227 });
12228 if (I != It->getSecond().end()) {
12229 VE = *I;
12230 IsSameVE = true;
12231 }
12232 }
12233 }
12234 if (IsSameVE) {
12235 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12236 ShuffleInstructionBuilder ShuffleBuilder(
12237 cast<VectorType>(V->getType())->getElementType(), Builder, *this);
12238 ShuffleBuilder.add(V, Mask);
12239 return ShuffleBuilder.finalize(std::nullopt);
12240 };
12241 Value *V = vectorizeTree(VE, PostponedPHIs);
12242 if (VF * getNumElements(VL[0]->getType()) !=
12243 cast<FixedVectorType>(V->getType())->getNumElements()) {
12244 if (!VE->ReuseShuffleIndices.empty()) {
12245 // Reshuffle to get only unique values.
12246 // If some of the scalars are duplicated in the vectorization
12247 // tree entry, we do not vectorize them but instead generate a
12248 // mask for the reuses. But if there are several users of the
12249 // same entry, they may have different vectorization factors.
12250 // This is especially important for PHI nodes. In this case, we
12251 // need to adapt the resulting instruction for the user
12252 // vectorization factor and have to reshuffle it again to take
12253 // only unique elements of the vector. Without this code the
12254 // function would incorrectly return a reduced vector instruction
12255 // with repeated elements instead of the unique ones.
12256
12257 // block:
12258 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12259 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12260 // ... (use %2)
12261 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12262 // br %block
12263 SmallVector<int> Mask(VF, PoisonMaskElem);
12264 for (auto [I, V] : enumerate(VL)) {
12265 if (isa<PoisonValue>(V))
12266 continue;
12267 Mask[I] = VE->findLaneForValue(V);
12268 }
12269 V = FinalShuffle(V, Mask);
12270 } else {
12271 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12272 "Expected vectorization factor less "
12273 "than original vector size.");
12274 SmallVector<int> UniformMask(VF, 0);
12275 std::iota(UniformMask.begin(), UniformMask.end(), 0);
12276 V = FinalShuffle(V, UniformMask);
12277 }
12278 }
12279 // Need to update the operand gather node if the operand is actually not a
12280 // vectorized node but a buildvector/gather node that matches one of the
12281 // vectorized nodes.
12282 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12283 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12284 }) == VE->UserTreeIndices.end()) {
12285 auto *It = find_if(
12286 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12287 return TE->isGather() &&
12288 TE->UserTreeIndices.front().UserTE == E &&
12289 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12290 });
12291 assert(It != VectorizableTree.end() && "Expected gather node operand.");
12292 (*It)->VectorizedValue = V;
12293 }
12294 return V;
12295 }
12296 }
12297
12298 // Find the corresponding gather entry and vectorize it.
12299 // This allows tree/graph transformations to be more accurate and, in many
12300 // cases, checks that the transformations are correct.
12301 auto *I = find_if(VectorizableTree,
12302 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
12303 return TE->isOperandGatherNode({E, NodeIdx});
12304 });
12305 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
12306 assert(I->get()->UserTreeIndices.size() == 1 &&
12307 "Expected only single user for the gather node.");
12308 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
12309 return vectorizeTree(I->get(), PostponedPHIs);
12310}
12311
12312template <typename BVTy, typename ResTy, typename... Args>
12313ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
12314 Args &...Params) {
12315 assert(E->isGather() && "Expected gather node.");
12316 unsigned VF = E->getVectorFactor();
12317
12318 bool NeedFreeze = false;
12319 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
12320 E->ReuseShuffleIndices.end());
12321 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
12322 // Build a mask out of the reorder indices and reorder scalars per this
12323 // mask.
12324 SmallVector<int> ReorderMask;
12325 inversePermutation(E->ReorderIndices, ReorderMask);
12326 if (!ReorderMask.empty())
12327 reorderScalars(GatheredScalars, ReorderMask);
12328 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
12329 unsigned I, unsigned SliceSize) {
12330 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
12331 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12332 }))
12333 return false;
12334 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12335 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12336 if (UserTE->getNumOperands() != 2)
12337 return false;
12338 auto *It =
12339 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
12340 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
12341 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12342 }) != TE->UserTreeIndices.end();
12343 });
12344 if (It == VectorizableTree.end())
12345 return false;
12346 int Idx;
12347 if ((Mask.size() < InputVF &&
12348 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
12349 Idx == 0) ||
12350 (Mask.size() == InputVF &&
12351 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
12352 std::iota(
12353 std::next(Mask.begin(), I * SliceSize),
12354 std::next(Mask.begin(),
12355 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12356 0);
12357 } else {
12358 unsigned IVal =
12359 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
12360 std::fill(
12361 std::next(Mask.begin(), I * SliceSize),
12362 std::next(Mask.begin(),
12363 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12364 IVal);
12365 }
12366 return true;
12367 };
12368 BVTy ShuffleBuilder(ScalarTy, Params...);
12369 ResTy Res = ResTy();
12370 SmallVector<int> Mask;
12371 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
12372 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
12373 Value *ExtractVecBase = nullptr;
12374 bool UseVecBaseAsInput = false;
12375 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
12376 SmallVector<SmallVector<const TreeEntry *>> Entries;
12377 Type *OrigScalarTy = GatheredScalars.front()->getType();
12378 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
12379 unsigned NumParts = TTI->getNumberOfParts(VecTy);
12380 if (NumParts == 0 || NumParts >= GatheredScalars.size())
12381 NumParts = 1;
12382 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
12383 // Check for gathered extracts.
12384 bool Resized = false;
12385 ExtractShuffles =
12386 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
12387 if (!ExtractShuffles.empty()) {
12388 SmallVector<const TreeEntry *> ExtractEntries;
12389 for (auto [Idx, I] : enumerate(ExtractMask)) {
12390 if (I == PoisonMaskElem)
12391 continue;
12392 if (const auto *TE = getTreeEntry(
12393 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
12394 ExtractEntries.push_back(TE);
12395 }
12396 if (std::optional<ResTy> Delayed =
12397 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12398 // Delay emission of gathers which are not ready yet.
12399 PostponedGathers.insert(E);
12400 // Postpone gather emission, will be emitted after the end of the
12401 // process to keep correct order.
12402 return *Delayed;
12403 }
12404 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
12405 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12406 ExtractVecBase = VecBase;
12407 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
12408 if (VF == VecBaseTy->getNumElements() &&
12409 GatheredScalars.size() != VF) {
12410 Resized = true;
12411 GatheredScalars.append(VF - GatheredScalars.size(),
12412 PoisonValue::get(OrigScalarTy));
12413 }
12414 }
12415 }
12416 // Gather extracts after we check for full matched gathers only.
12417 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
12418 E->isAltShuffle() ||
12419 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
12420 isSplat(E->Scalars) ||
12421 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12422 GatherShuffles =
12423 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
12424 }
12425 if (!GatherShuffles.empty()) {
12426 if (std::optional<ResTy> Delayed =
12427 ShuffleBuilder.needToDelay(E, Entries)) {
12428 // Delay emission of gathers which are not ready yet.
12429 PostponedGathers.insert(E);
12430 // Postpone gather emission, will be emitted after the end of the
12431 // process to keep correct order.
12432 return *Delayed;
12433 }
12434 if (GatherShuffles.size() == 1 &&
12435 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
12436 Entries.front().front()->isSame(E->Scalars)) {
12437 // Perfect match in the graph, will reuse the previously vectorized
12438 // node. Cost is 0.
12439 LLVM_DEBUG(
12440 dbgs()
12441 << "SLP: perfect diamond match for gather bundle "
12442 << shortBundleName(E->Scalars) << ".\n");
12443 // Restore the mask for previous partially matched values.
12444 Mask.resize(E->Scalars.size());
12445 const TreeEntry *FrontTE = Entries.front().front();
12446 if (FrontTE->ReorderIndices.empty() &&
12447 ((FrontTE->ReuseShuffleIndices.empty() &&
12448 E->Scalars.size() == FrontTE->Scalars.size()) ||
12449 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12450 std::iota(Mask.begin(), Mask.end(), 0);
12451 } else {
12452 for (auto [I, V] : enumerate(E->Scalars)) {
12453 if (isa<PoisonValue>(V)) {
12454 Mask[I] = PoisonMaskElem;
12455 continue;
12456 }
12457 Mask[I] = FrontTE->findLaneForValue(V);
12458 }
12459 }
12460 ShuffleBuilder.add(*FrontTE, Mask);
12461 Res = ShuffleBuilder.finalize(E->getCommonMask());
12462 return Res;
12463 }
12464 if (!Resized) {
12465 if (GatheredScalars.size() != VF &&
12466 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
12467 return any_of(TEs, [&](const TreeEntry *TE) {
12468 return TE->getVectorFactor() == VF;
12469 });
12470 }))
12471 GatheredScalars.append(VF - GatheredScalars.size(),
12472 PoisonValue::get(OrigScalarTy));
12473 }
12474 // Remove shuffled elements from list of gathers.
12475 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12476 if (Mask[I] != PoisonMaskElem)
12477 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12478 }
12479 }
12480 }
12481 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
12482 SmallVectorImpl<int> &ReuseMask,
12483 bool IsRootPoison) {
12484 // For splats we can emit broadcasts instead of gathers, so try to find
12485 // such sequences.
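// Illustrative example: gathering {%a, %a, undef, %a} keeps a single copy
// of %a in lane 0 and uses a reuse mask of {0, 0, 0, 0} when a non-poisonous
// broadcast source is available; otherwise the undef lane stays poison in
// the mask and the whole broadcast is frozen afterwards.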
12486 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
12487 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
12488 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
12489 SmallVector<int> UndefPos;
12490 DenseMap<Value *, unsigned> UniquePositions;
12491 // Gather unique non-const values and all constant values.
12492 // For repeated values, just shuffle them.
12493 int NumNonConsts = 0;
12494 int SinglePos = 0;
12495 for (auto [I, V] : enumerate(Scalars)) {
12496 if (isa<UndefValue>(V)) {
12497 if (!isa<PoisonValue>(V)) {
12498 ReuseMask[I] = I;
12499 UndefPos.push_back(I);
12500 }
12501 continue;
12502 }
12503 if (isConstant(V)) {
12504 ReuseMask[I] = I;
12505 continue;
12506 }
12507 ++NumNonConsts;
12508 SinglePos = I;
12509 Value *OrigV = V;
12510 Scalars[I] = PoisonValue::get(OrigScalarTy);
12511 if (IsSplat) {
12512 Scalars.front() = OrigV;
12513 ReuseMask[I] = 0;
12514 } else {
12515 const auto Res = UniquePositions.try_emplace(OrigV, I);
12516 Scalars[Res.first->second] = OrigV;
12517 ReuseMask[I] = Res.first->second;
12518 }
12519 }
12520 if (NumNonConsts == 1) {
12521 // Restore single insert element.
12522 if (IsSplat) {
12523 ReuseMask.assign(VF, PoisonMaskElem);
12524 std::swap(Scalars.front(), Scalars[SinglePos]);
12525 if (!UndefPos.empty() && UndefPos.front() == 0)
12526 Scalars.front() = UndefValue::get(OrigScalarTy);
12527 }
12528 ReuseMask[SinglePos] = SinglePos;
12529 } else if (!UndefPos.empty() && IsSplat) {
12530 // For undef values, try to replace them with the simple broadcast.
12531 // We can do it if the broadcasted value is guaranteed to be
12532 // non-poisonous, or by freezing the incoming scalar value first.
12533 auto *It = find_if(Scalars, [this, E](Value *V) {
12534 return !isa<UndefValue>(V) &&
12535 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12536 (E->UserTreeIndices.size() == 1 &&
12537 any_of(V->uses(), [E](const Use &U) {
12538 // Check if the value is already used in the same operation in
12539 // one of the nodes.
12540 return E->UserTreeIndices.front().EdgeIdx !=
12541 U.getOperandNo() &&
12542 is_contained(
12543 E->UserTreeIndices.front().UserTE->Scalars,
12544 U.getUser());
12545 })));
12546 });
12547 if (It != Scalars.end()) {
12548 // Replace undefs by the non-poisoned scalars and emit broadcast.
12549 int Pos = std::distance(Scalars.begin(), It);
12550 for (int I : UndefPos) {
12551 // Set the undef position to the non-poisoned scalar.
12552 ReuseMask[I] = Pos;
12553 // Replace the undef by poison; in the mask it has already been replaced
12554 // by the non-poisoned scalar.
12555 if (I != Pos)
12556 Scalars[I] = PoisonValue::get(OrigScalarTy);
12557 }
12558 } else {
12559 // Replace undefs by the poisons, emit broadcast and then emit
12560 // freeze.
12561 for (int I : UndefPos) {
12562 ReuseMask[I] = PoisonMaskElem;
12563 if (isa<UndefValue>(Scalars[I]))
12564 Scalars[I] = PoisonValue::get(OrigScalarTy);
12565 }
12566 NeedFreeze = true;
12567 }
12568 }
12569 };
12570 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12571 bool IsNonPoisoned = true;
12572 bool IsUsedInExpr = true;
12573 Value *Vec1 = nullptr;
12574 if (!ExtractShuffles.empty()) {
12575 // A gather of extractelements can be represented as just a shuffle of
12576 // the one or two vectors the scalars are extracted from.
12577 // Find input vectors.
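// Illustrative example: a gather of extractelement %v, i32 0 and
// extractelement %v, i32 2 needs no insertelement chain at all; it becomes a
// single shufflevector of %v (or of the tree node that produces %v) with
// mask {0, 2}.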
12578 Value *Vec2 = nullptr;
12579 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12580 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12581 ExtractMask[I] = PoisonMaskElem;
12582 }
12583 if (UseVecBaseAsInput) {
12584 Vec1 = ExtractVecBase;
12585 } else {
12586 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12587 if (ExtractMask[I] == PoisonMaskElem)
12588 continue;
12589 if (isa<UndefValue>(E->Scalars[I]))
12590 continue;
12591 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12592 Value *VecOp = EI->getVectorOperand();
12593 if (const auto *TE = getTreeEntry(VecOp))
12594 if (TE->VectorizedValue)
12595 VecOp = TE->VectorizedValue;
12596 if (!Vec1) {
12597 Vec1 = VecOp;
12598 } else if (Vec1 != VecOp) {
12599 assert((!Vec2 || Vec2 == VecOp) &&
12600 "Expected only 1 or 2 vectors shuffle.");
12601 Vec2 = VecOp;
12602 }
12603 }
12604 }
12605 if (Vec2) {
12606 IsUsedInExpr = false;
12607 IsNonPoisoned &=
12608 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12609 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12610 } else if (Vec1) {
12611 IsUsedInExpr &= FindReusedSplat(
12612 ExtractMask,
12613 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12614 ExtractMask.size());
12615 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12616 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12617 } else {
12618 IsUsedInExpr = false;
12619 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
12620 /*ForExtracts=*/true);
12621 }
12622 }
12623 if (!GatherShuffles.empty()) {
12624 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
12625 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12626 for (const auto [I, TEs] : enumerate(Entries)) {
12627 if (TEs.empty()) {
12628 assert(!GatherShuffles[I] &&
12629 "No shuffles with empty entries list expected.");
12630 continue;
12631 }
12632 assert((TEs.size() == 1 || TEs.size() == 2) &&
12633 "Expected shuffle of 1 or 2 entries.");
12634 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
12635 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
12636 VecMask.assign(VecMask.size(), PoisonMaskElem);
12637 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12638 if (TEs.size() == 1) {
12639 IsUsedInExpr &= FindReusedSplat(
12640 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12641 ShuffleBuilder.add(*TEs.front(), VecMask);
12642 if (TEs.front()->VectorizedValue)
12643 IsNonPoisoned &=
12644 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12645 } else {
12646 IsUsedInExpr = false;
12647 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12648 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12649 IsNonPoisoned &=
12650 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12651 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12652 }
12653 }
12654 }
12655 // Try to figure out the best way to combine values: build a shuffle and
12656 // insert elements, or just build several shuffles.
12657 // Insert non-constant scalars.
12658 SmallVector<Value *> NonConstants(GatheredScalars);
12659 int EMSz = ExtractMask.size();
12660 int MSz = Mask.size();
12661 // Try to build a constant vector and shuffle with it only if we currently
12662 // have a single permutation and more than one scalar constant.
12663 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12664 bool IsIdentityShuffle =
12665 ((UseVecBaseAsInput ||
12666 all_of(ExtractShuffles,
12667 [](const std::optional<TTI::ShuffleKind> &SK) {
12668 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12669 TTI::SK_PermuteSingleSrc;
12670 })) &&
12671 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12672 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
12673 (!GatherShuffles.empty() &&
12674 all_of(GatherShuffles,
12675 [](const std::optional<TTI::ShuffleKind> &SK) {
12676 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12677 TTI::SK_PermuteSingleSrc;
12678 }) &&
12679 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12680 ShuffleVectorInst::isIdentityMask(Mask, MSz));
12681 bool EnoughConstsForShuffle =
12682 IsSingleShuffle &&
12683 (none_of(GatheredScalars,
12684 [](Value *V) {
12685 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12686 }) ||
12687 any_of(GatheredScalars,
12688 [](Value *V) {
12689 return isa<Constant>(V) && !isa<UndefValue>(V);
12690 })) &&
12691 (!IsIdentityShuffle ||
12692 (GatheredScalars.size() == 2 &&
12693 any_of(GatheredScalars,
12694 [](Value *V) { return !isa<UndefValue>(V); })) ||
12695 count_if(GatheredScalars, [](Value *V) {
12696 return isa<Constant>(V) && !isa<PoisonValue>(V);
12697 }) > 1);
12698 // The NonConstants array holds just the non-constant values; GatheredScalars
12699 // holds only the constants used to build the final vector, which is then shuffled.
12700 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12701 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
12702 NonConstants[I] = PoisonValue::get(OrigScalarTy);
12703 else
12704 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12705 }
12706 // Generate constants for final shuffle and build a mask for them.
12707 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12708 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12709 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12710 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12711 ShuffleBuilder.add(BV, BVMask);
12712 }
12713 if (all_of(NonConstants, [=](Value *V) {
12714 return isa<PoisonValue>(V) ||
12715 (IsSingleShuffle && ((IsIdentityShuffle &&
12716 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12717 }))
12718 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12719 else
12720 Res = ShuffleBuilder.finalize(
12721 E->ReuseShuffleIndices, E->Scalars.size(),
12722 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12723 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12724 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12725 });
12726 } else if (!allConstant(GatheredScalars)) {
12727 // Gather unique scalars and all constants.
12728 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12729 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12730 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12731 ShuffleBuilder.add(BV, ReuseMask);
12732 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12733 } else {
12734 // Gather all constants.
12735 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12736 for (auto [I, V] : enumerate(E->Scalars)) {
12737 if (!isa<PoisonValue>(V))
12738 Mask[I] = I;
12739 }
12740 Value *BV = ShuffleBuilder.gather(E->Scalars);
12741 ShuffleBuilder.add(BV, Mask);
12742 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12743 }
12744
12745 if (NeedFreeze)
12746 Res = ShuffleBuilder.createFreeze(Res);
12747 return Res;
12748}
12749
12750Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
12751 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12752 Builder, *this);
12753}
12754
12755Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12756 IRBuilderBase::InsertPointGuard Guard(Builder);
12757
12758 if (E->VectorizedValue &&
12759 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12760 E->isAltShuffle())) {
12761 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12762 return E->VectorizedValue;
12763 }
12764
12765 Value *V = E->Scalars.front();
12766 Type *ScalarTy = V->getType();
12767 if (auto *Store = dyn_cast<StoreInst>(V))
12768 ScalarTy = Store->getValueOperand()->getType();
12769 else if (auto *IE = dyn_cast<InsertElementInst>(V))
12770 ScalarTy = IE->getOperand(1)->getType();
12771 auto It = MinBWs.find(E);
12772 if (It != MinBWs.end())
12773 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
12774 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
12775 if (E->isGather()) {
12776 // Set insert point for non-reduction initial nodes.
12777 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12778 setInsertPointAfterBundle(E);
12779 Value *Vec = createBuildVector(E, ScalarTy);
12780 E->VectorizedValue = Vec;
12781 return Vec;
12782 }
12783
12784 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
12785 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12786 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
12787 if (E->getOpcode() == Instruction::Store &&
12788 E->State == TreeEntry::Vectorize) {
12789 ArrayRef<int> Mask =
12790 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12791 E->ReorderIndices.size());
12792 ShuffleBuilder.add(V, Mask);
12793 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12794 ShuffleBuilder.addOrdered(V, std::nullopt);
12795 } else {
12796 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12797 }
12798 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12799 };
12800
12801 assert((E->State == TreeEntry::Vectorize ||
12802 E->State == TreeEntry::ScatterVectorize ||
12803 E->State == TreeEntry::StridedVectorize) &&
12804 "Unhandled state");
12805 unsigned ShuffleOrOp =
12806 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12807 Instruction *VL0 = E->getMainOp();
12808 auto GetOperandSignedness = [&](unsigned Idx) {
12809 const TreeEntry *OpE = getOperandEntry(E, Idx);
12810 bool IsSigned = false;
12811 auto It = MinBWs.find(OpE);
12812 if (It != MinBWs.end())
12813 IsSigned = It->second.second;
12814 else
12815 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
12816 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12817 });
12818 return IsSigned;
12819 };
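// MinBWs records tree entries whose values were demoted to a narrower
// integer type, as a (bit-width, needs-sign-extension) pair; operands coming
// from such entries must be cast back to the expected vector type, and the
// helper above picks sext vs. zext for entries with no recorded signedness
// by checking whether any scalar may be negative.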
12820 switch (ShuffleOrOp) {
12821 case Instruction::PHI: {
12822 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12823 E != VectorizableTree.front().get() ||
12824 !E->UserTreeIndices.empty()) &&
12825 "PHI reordering is free.");
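// PHIs are emitted in two phases so that cyclic dependencies are handled:
// the first visit creates an empty vector phi (and, when PostponedPHIs is
// set, returns it immediately); a later visit fills in the incoming values
// once the operand trees themselves have been vectorized.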
12826 if (PostponedPHIs && E->VectorizedValue)
12827 return E->VectorizedValue;
12828 auto *PH = cast<PHINode>(VL0);
12829 Builder.SetInsertPoint(PH->getParent(),
12830 PH->getParent()->getFirstNonPHIIt());
12831 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12832 if (PostponedPHIs || !E->VectorizedValue) {
12833 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
12834 E->PHI = NewPhi;
12835 Value *V = NewPhi;
12836
12837 // Adjust insertion point once all PHI's have been generated.
12838 Builder.SetInsertPoint(PH->getParent(),
12839 PH->getParent()->getFirstInsertionPt());
12840 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12841
12842 V = FinalShuffle(V, E, VecTy);
12843
12844 E->VectorizedValue = V;
12845 if (PostponedPHIs)
12846 return V;
12847 }
12848 PHINode *NewPhi = cast<PHINode>(E->PHI);
12849 // If phi node is fully emitted - exit.
12850 if (NewPhi->getNumIncomingValues() != 0)
12851 return NewPhi;
12852
12853 // PHINodes may have multiple entries from the same block. We want to
12854 // visit every block once.
12855 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12856
12857 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12859 BasicBlock *IBB = PH->getIncomingBlock(I);
12860
12861 // Stop emission if all incoming values are generated.
12862 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12863 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12864 return NewPhi;
12865 }
12866
12867 if (!VisitedBBs.insert(IBB).second) {
12868 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
12869 continue;
12870 }
12871
12872 Builder.SetInsertPoint(IBB->getTerminator());
12873 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12874 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
12875 if (VecTy != Vec->getType()) {
12876 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
12877 MinBWs.contains(getOperandEntry(E, I))) &&
12878 "Expected item in MinBWs.");
12879 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
12880 }
12881 NewPhi->addIncoming(Vec, IBB);
12882 }
12883
12884 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12885 "Invalid number of incoming values");
12886 return NewPhi;
12887 }
12888
12889 case Instruction::ExtractElement: {
12890 Value *V = E->getSingleOperand(0);
12891 if (const TreeEntry *TE = getTreeEntry(V))
12892 V = TE->VectorizedValue;
12893 setInsertPointAfterBundle(E);
12894 V = FinalShuffle(V, E, VecTy);
12895 E->VectorizedValue = V;
12896 return V;
12897 }
12898 case Instruction::ExtractValue: {
12899 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12900 Builder.SetInsertPoint(LI);
12901 Value *Ptr = LI->getPointerOperand();
12902 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
12903 Value *NewV = propagateMetadata(V, E->Scalars);
12904 NewV = FinalShuffle(NewV, E, VecTy);
12905 E->VectorizedValue = NewV;
12906 return NewV;
12907 }
12908 case Instruction::InsertElement: {
12909 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12910 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
12911 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
12912 ArrayRef<Value *> Op = E->getOperand(1);
12913 Type *ScalarTy = Op.front()->getType();
12914 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
12915 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12916 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
12917 assert(Res.first > 0 && "Expected item in MinBWs.");
12918 V = Builder.CreateIntCast(
12919 V,
12920 getWidenedType(
12921 ScalarTy,
12922 cast<FixedVectorType>(V->getType())->getNumElements()),
12923 Res.second);
12924 }
12925
12926 // Create InsertVector shuffle if necessary
12927 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
12928 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12929 }));
12930 const unsigned NumElts =
12931 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12932 const unsigned NumScalars = E->Scalars.size();
12933
12934 unsigned Offset = *getElementIndex(VL0);
12935 assert(Offset < NumElts && "Failed to find vector index offset");
12936
12937 // Create shuffle to resize vector
12938 SmallVector<int> Mask;
12939 if (!E->ReorderIndices.empty()) {
12940 inversePermutation(E->ReorderIndices, Mask);
12941 Mask.append(NumElts - NumScalars, PoisonMaskElem);
12942 } else {
12943 Mask.assign(NumElts, PoisonMaskElem);
12944 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
12945 }
12946 // Create InsertVector shuffle if necessary
12947 bool IsIdentity = true;
12948 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12949 Mask.swap(PrevMask);
12950 for (unsigned I = 0; I < NumScalars; ++I) {
12951 Value *Scalar = E->Scalars[PrevMask[I]];
12952 unsigned InsertIdx = *getElementIndex(Scalar);
12953 IsIdentity &= InsertIdx - Offset == I;
12954 Mask[InsertIdx - Offset] = I;
12955 }
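// After this loop Mask[d] == i means that destination position Offset + d is
// taken from lane i of the (already reordered) vectorized operand; IsIdentity
// stays true only when those positions are filled consecutively starting at
// Offset, in which case the resize/permute shuffle below can be skipped
// unless NumElts != NumScalars.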
12956 if (!IsIdentity || NumElts != NumScalars) {
12957 Value *V2 = nullptr;
12958 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12959 SmallVector<int> InsertMask(Mask);
12960 if (NumElts != NumScalars && Offset == 0) {
12961 // Follow all insert element instructions from the current buildvector
12962 // sequence.
12963 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
12964 do {
12965 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
12966 if (!InsertIdx)
12967 break;
12968 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12969 InsertMask[*InsertIdx] = *InsertIdx;
12970 if (!Ins->hasOneUse())
12971 break;
12972 Ins = dyn_cast_or_null<InsertElementInst>(
12973 Ins->getUniqueUndroppableUser());
12974 } while (Ins);
12975 SmallBitVector UseMask =
12976 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12977 SmallBitVector IsFirstPoison =
12978 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12979 SmallBitVector IsFirstUndef =
12980 isUndefVector(FirstInsert->getOperand(0), UseMask);
12981 if (!IsFirstPoison.all()) {
12982 unsigned Idx = 0;
12983 for (unsigned I = 0; I < NumElts; I++) {
12984 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
12985 IsFirstUndef.test(I)) {
12986 if (IsVNonPoisonous) {
12987 InsertMask[I] = I < NumScalars ? I : 0;
12988 continue;
12989 }
12990 if (!V2)
12991 V2 = UndefValue::get(V->getType());
12992 if (Idx >= NumScalars)
12993 Idx = NumScalars - 1;
12994 InsertMask[I] = NumScalars + Idx;
12995 ++Idx;
12996 } else if (InsertMask[I] != PoisonMaskElem &&
12997 Mask[I] == PoisonMaskElem) {
12998 InsertMask[I] = PoisonMaskElem;
12999 }
13000 }
13001 } else {
13002 InsertMask = Mask;
13003 }
13004 }
13005 if (!V2)
13006 V2 = PoisonValue::get(V->getType());
13007 V = Builder.CreateShuffleVector(V, V2, InsertMask);
13008 if (auto *I = dyn_cast<Instruction>(V)) {
13009 GatherShuffleExtractSeq.insert(I);
13010 CSEBlocks.insert(I->getParent());
13011 }
13012 }
13013
13014 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
13015 for (unsigned I = 0; I < NumElts; I++) {
13016 if (Mask[I] != PoisonMaskElem)
13017 InsertMask[Offset + I] = I;
13018 }
13019 SmallBitVector UseMask =
13020 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
13021 SmallBitVector IsFirstUndef =
13022 isUndefVector(FirstInsert->getOperand(0), UseMask);
13023 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
13024 NumElts != NumScalars) {
13025 if (IsFirstUndef.all()) {
13026 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
13027 SmallBitVector IsFirstPoison =
13028 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13029 if (!IsFirstPoison.all()) {
13030 for (unsigned I = 0; I < NumElts; I++) {
13031 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
13032 InsertMask[I] = I + NumElts;
13033 }
13034 }
13035 V = Builder.CreateShuffleVector(
13036 V,
13037 IsFirstPoison.all() ? PoisonValue::get(V->getType())
13038 : FirstInsert->getOperand(0),
13039 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
13040 if (auto *I = dyn_cast<Instruction>(V)) {
13041 GatherShuffleExtractSeq.insert(I);
13042 CSEBlocks.insert(I->getParent());
13043 }
13044 }
13045 } else {
13046 SmallBitVector IsFirstPoison =
13047 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13048 for (unsigned I = 0; I < NumElts; I++) {
13049 if (InsertMask[I] == PoisonMaskElem)
13050 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
13051 else
13052 InsertMask[I] += NumElts;
13053 }
13054 V = Builder.CreateShuffleVector(
13055 FirstInsert->getOperand(0), V, InsertMask,
13056 cast<Instruction>(E->Scalars.back())->getName());
13057 if (auto *I = dyn_cast<Instruction>(V)) {
13058 GatherShuffleExtractSeq.insert(I);
13059 CSEBlocks.insert(I->getParent());
13060 }
13061 }
13062 }
13063
13064 ++NumVectorInstructions;
13065 E->VectorizedValue = V;
13066 return V;
13067 }
13068 case Instruction::ZExt:
13069 case Instruction::SExt:
13070 case Instruction::FPToUI:
13071 case Instruction::FPToSI:
13072 case Instruction::FPExt:
13073 case Instruction::PtrToInt:
13074 case Instruction::IntToPtr:
13075 case Instruction::SIToFP:
13076 case Instruction::UIToFP:
13077 case Instruction::Trunc:
13078 case Instruction::FPTrunc:
13079 case Instruction::BitCast: {
13080 setInsertPointAfterBundle(E);
13081
13082 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
13083 if (E->VectorizedValue) {
13084 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13085 return E->VectorizedValue;
13086 }
13087
13088 auto *CI = cast<CastInst>(VL0);
13089 Instruction::CastOps VecOpcode = CI->getOpcode();
13090 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
13091 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
13092 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
13093 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
13094 SrcScalarTy != CI->getOperand(0)->getType())) {
13095 // Check if the values are candidates to demote.
13096 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
13097 if (SrcIt != MinBWs.end())
13098 SrcBWSz = SrcIt->second.first;
13099 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13100 if (BWSz == SrcBWSz) {
13101 VecOpcode = Instruction::BitCast;
13102 } else if (BWSz < SrcBWSz) {
13103 VecOpcode = Instruction::Trunc;
13104 } else if (It != MinBWs.end()) {
13105 assert(BWSz > SrcBWSz && "Invalid cast!");
13106 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
13107 } else if (SrcIt != MinBWs.end()) {
13108 assert(BWSz > SrcBWSz && "Invalid cast!");
13109 VecOpcode =
13110 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
13111 }
13112 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
13113 !SrcIt->second.second) {
13114 VecOpcode = Instruction::UIToFP;
13115 }
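// Illustrative example: a zext from i8 whose result is only consumed at
// 16 bits is re-emitted as a cast to <N x i16> instead of the original wide
// type; when source and destination end up at the same demoted width the
// opcode degenerates to BitCast and InVec is reused directly below.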
13116 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
13117 ? InVec
13118 : Builder.CreateCast(VecOpcode, InVec, VecTy);
13119 V = FinalShuffle(V, E, VecTy);
13120
13121 E->VectorizedValue = V;
13122 ++NumVectorInstructions;
13123 return V;
13124 }
13125 case Instruction::FCmp:
13126 case Instruction::ICmp: {
13127 setInsertPointAfterBundle(E);
13128
13129 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
13130 if (E->VectorizedValue) {
13131 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13132 return E->VectorizedValue;
13133 }
13134 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
13135 if (E->VectorizedValue) {
13136 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13137 return E->VectorizedValue;
13138 }
13139 if (L->getType() != R->getType()) {
13140 assert((getOperandEntry(E, 0)->isGather() ||
13141 getOperandEntry(E, 1)->isGather() ||
13142 MinBWs.contains(getOperandEntry(E, 0)) ||
13143 MinBWs.contains(getOperandEntry(E, 1))) &&
13144 "Expected item in MinBWs.");
13145 if (cast<VectorType>(L->getType())
13146 ->getElementType()
13147 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
13148 ->getElementType()
13149 ->getIntegerBitWidth()) {
13150 Type *CastTy = R->getType();
13151 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
13152 } else {
13153 Type *CastTy = L->getType();
13154 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
13155 }
13156 }
13157
13158 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
13159 Value *V = Builder.CreateCmp(P0, L, R);
13160 propagateIRFlags(V, E->Scalars, VL0);
13161 // Do not cast for cmps.
13162 VecTy = cast<FixedVectorType>(V->getType());
13163 V = FinalShuffle(V, E, VecTy);
13164
13165 E->VectorizedValue = V;
13166 ++NumVectorInstructions;
13167 return V;
13168 }
13169 case Instruction::Select: {
13170 setInsertPointAfterBundle(E);
13171
13172 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
13173 if (E->VectorizedValue) {
13174 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13175 return E->VectorizedValue;
13176 }
13177 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
13178 if (E->VectorizedValue) {
13179 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13180 return E->VectorizedValue;
13181 }
13182 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
13183 if (E->VectorizedValue) {
13184 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13185 return E->VectorizedValue;
13186 }
13187 if (True->getType() != VecTy || False->getType() != VecTy) {
13188 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
13189 getOperandEntry(E, 2)->isGather() ||
13190 MinBWs.contains(getOperandEntry(E, 1)) ||
13191 MinBWs.contains(getOperandEntry(E, 2))) &&
13192 "Expected item in MinBWs.");
13193 if (True->getType() != VecTy)
13194 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
13195 if (False->getType() != VecTy)
13196 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
13197 }
13198
13199 Value *V = Builder.CreateSelect(Cond, True, False);
13200 V = FinalShuffle(V, E, VecTy);
13201
13202 E->VectorizedValue = V;
13203 ++NumVectorInstructions;
13204 return V;
13205 }
13206 case Instruction::FNeg: {
13207 setInsertPointAfterBundle(E);
13208
13209 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
13210
13211 if (E->VectorizedValue) {
13212 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13213 return E->VectorizedValue;
13214 }
13215
13216 Value *V = Builder.CreateUnOp(
13217 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
13218 propagateIRFlags(V, E->Scalars, VL0);
13219 if (auto *I = dyn_cast<Instruction>(V))
13220 V = propagateMetadata(I, E->Scalars);
13221
13222 V = FinalShuffle(V, E, VecTy);
13223
13224 E->VectorizedValue = V;
13225 ++NumVectorInstructions;
13226
13227 return V;
13228 }
13229 case Instruction::Add:
13230 case Instruction::FAdd:
13231 case Instruction::Sub:
13232 case Instruction::FSub:
13233 case Instruction::Mul:
13234 case Instruction::FMul:
13235 case Instruction::UDiv:
13236 case Instruction::SDiv:
13237 case Instruction::FDiv:
13238 case Instruction::URem:
13239 case Instruction::SRem:
13240 case Instruction::FRem:
13241 case Instruction::Shl:
13242 case Instruction::LShr:
13243 case Instruction::AShr:
13244 case Instruction::And:
13245 case Instruction::Or:
13246 case Instruction::Xor: {
13247 setInsertPointAfterBundle(E);
13248
13249 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
13250 if (E->VectorizedValue) {
13251 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13252 return E->VectorizedValue;
13253 }
13254 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
13255 if (E->VectorizedValue) {
13256 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13257 return E->VectorizedValue;
13258 }
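// Special case for 'and' with a demoted bit-width: if one operand consists
// of constants whose low It->second.first bits are all ones, the mask is a
// no-op at the demoted width, so the other (shuffled) operand is returned
// directly and no 'and' instruction is emitted.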
13259 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13260 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
13261 ArrayRef<Value *> Ops = E->getOperand(I);
13262 if (all_of(Ops, [&](Value *Op) {
13263 auto *CI = dyn_cast<ConstantInt>(Op);
13264 return CI && CI->getValue().countr_one() >= It->second.first;
13265 })) {
13266 V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
13267 E->VectorizedValue = V;
13268 ++NumVectorInstructions;
13269 return V;
13270 }
13271 }
13272 }
13273 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
13274 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13275 getOperandEntry(E, 1)->isGather() ||
13276 MinBWs.contains(getOperandEntry(E, 0)) ||
13277 MinBWs.contains(getOperandEntry(E, 1))) &&
13278 "Expected item in MinBWs.");
13279 if (LHS->getType() != VecTy)
13280 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
13281 if (RHS->getType() != VecTy)
13282 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
13283 }
13284
13285 Value *V = Builder.CreateBinOp(
13286 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
13287 RHS);
13288 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
13289 if (auto *I = dyn_cast<Instruction>(V)) {
13290 V = propagateMetadata(I, E->Scalars);
13291 // Drop nuw flags for abs(sub(commutative), true).
13292 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
13293 any_of(E->Scalars, [](Value *V) {
13294 return isCommutative(cast<Instruction>(V));
13295 }))
13296 I->setHasNoUnsignedWrap(/*b=*/false);
13297 }
13298
13299 V = FinalShuffle(V, E, VecTy);
13300
13301 E->VectorizedValue = V;
13302 ++NumVectorInstructions;
13303
13304 return V;
13305 }
13306 case Instruction::Load: {
13307 // Loads are inserted at the head of the tree because we don't want to
13308 // sink them all the way down past store instructions.
13309 setInsertPointAfterBundle(E);
13310
13311 LoadInst *LI = cast<LoadInst>(VL0);
13312 Instruction *NewLI;
13313 Value *PO = LI->getPointerOperand();
13314 if (E->State == TreeEntry::Vectorize) {
13315 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
13316 } else if (E->State == TreeEntry::StridedVectorize) {
13317 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
13318 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
13319 PO = IsReverseOrder ? PtrN : Ptr0;
13320 std::optional<int> Diff = getPointersDiff(
13321 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
13322 Type *StrideTy = DL->getIndexType(PO->getType());
13323 Value *StrideVal;
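// Two ways to get the stride: if the pointer difference between the first
// and last scalar is a compile-time constant, the per-element stride is
// Diff / (NumScalars - 1) scaled to bytes (illustrative: four i32 loads at
// byte offsets 0, 8, 16 and 24 give Diff == 6, Stride == 2, i.e. 8 bytes),
// negated for reversed order; otherwise a runtime stride is computed and
// multiplied by the element size.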
13324 if (Diff) {
13325 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
13326 StrideVal =
13327 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
13328 DL->getTypeAllocSize(ScalarTy));
13329 } else {
13330 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
13331 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
13332 return cast<LoadInst>(V)->getPointerOperand();
13333 });
13334 OrdersType Order;
13335 std::optional<Value *> Stride =
13336 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
13337 &*Builder.GetInsertPoint());
13338 Value *NewStride =
13339 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
13340 StrideVal = Builder.CreateMul(
13341 NewStride,
13342 ConstantInt::get(
13343 StrideTy,
13344 (IsReverseOrder ? -1 : 1) *
13345 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
13346 }
13347 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13348 auto *Inst = Builder.CreateIntrinsic(
13349 Intrinsic::experimental_vp_strided_load,
13350 {VecTy, PO->getType(), StrideTy},
13351 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
13352 Builder.getInt32(E->Scalars.size())});
13353 Inst->addParamAttr(
13354 /*ArgNo=*/0,
13355 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13356 NewLI = Inst;
13357 } else {
13358 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
13359 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
13360 if (E->VectorizedValue) {
13361 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13362 return E->VectorizedValue;
13363 }
13364 // Use the minimum alignment of the gathered loads.
13365 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13366 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
13367 }
13368 Value *V = propagateMetadata(NewLI, E->Scalars);
13369
13370 V = FinalShuffle(V, E, VecTy);
13371 E->VectorizedValue = V;
13372 ++NumVectorInstructions;
13373 return V;
13374 }
13375 case Instruction::Store: {
13376 auto *SI = cast<StoreInst>(VL0);
13377
13378 setInsertPointAfterBundle(E);
13379
13380 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
13381 if (VecValue->getType() != VecTy)
13382 VecValue =
13383 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
13384 VecValue = FinalShuffle(VecValue, E, VecTy);
13385
13386 Value *Ptr = SI->getPointerOperand();
13387 Instruction *ST;
13388 if (E->State == TreeEntry::Vectorize) {
13389 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
13390 } else {
13391 assert(E->State == TreeEntry::StridedVectorize &&
13392 "Expected either strided or consecutive stores.");
13393 if (!E->ReorderIndices.empty()) {
13394 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
13395 Ptr = SI->getPointerOperand();
13396 }
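// This strided-store path always emits a constant stride of minus one
// element (e.g. -4 bytes for i32 scalars): the base pointer is taken from
// the scalar picked by the reorder indices, and successive lanes are written
// at decreasing addresses.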
13397 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
13398 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
13399 auto *Inst = Builder.CreateIntrinsic(
13400 Intrinsic::experimental_vp_strided_store,
13401 {VecTy, Ptr->getType(), StrideTy},
13402 {VecValue, Ptr,
13403 ConstantInt::get(
13404 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
13405 Builder.getAllOnesMask(VecTy->getElementCount()),
13406 Builder.getInt32(E->Scalars.size())});
13407 Inst->addParamAttr(
13408 /*ArgNo=*/1,
13409 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13410 ST = Inst;
13411 }
13412
13413 Value *V = propagateMetadata(ST, E->Scalars);
13414
13415 E->VectorizedValue = V;
13416 ++NumVectorInstructions;
13417 return V;
13418 }
13419 case Instruction::GetElementPtr: {
13420 auto *GEP0 = cast<GetElementPtrInst>(VL0);
13421 setInsertPointAfterBundle(E);
13422
13423 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
13424 if (E->VectorizedValue) {
13425 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13426 return E->VectorizedValue;
13427 }
13428
13429 SmallVector<Value *> OpVecs;
13430 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
13431 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
13432 if (E->VectorizedValue) {
13433 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13434 return E->VectorizedValue;
13435 }
13436 OpVecs.push_back(OpVec);
13437 }
13438
13439 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
13440 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
13441 SmallVector<Value *> GEPs;
13442 for (Value *V : E->Scalars) {
13443 if (isa<GetElementPtrInst>(V))
13444 GEPs.push_back(V);
13445 }
13446 V = propagateMetadata(I, GEPs);
13447 }
13448
13449 V = FinalShuffle(V, E, VecTy);
13450
13451 E->VectorizedValue = V;
13452 ++NumVectorInstructions;
13453
13454 return V;
13455 }
13456 case Instruction::Call: {
13457 CallInst *CI = cast<CallInst>(VL0);
13458 setInsertPointAfterBundle(E);
13459
13460 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13461
13462 SmallVector<Type *> ArgTys =
13463 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
13464 It != MinBWs.end() ? It->second.first : 0);
13465 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13466 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
13467 VecCallCosts.first <= VecCallCosts.second;
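// Prefer the intrinsic form when its estimated cost does not exceed the
// library-call cost; otherwise a vectorized library function is looked up in
// the VFDatabase further below.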
13468
13469 Value *ScalarArg = nullptr;
13470 SmallVector<Value *> OpVecs;
13471 SmallVector<Type *, 2> TysForDecl;
13472 // Add return type if intrinsic is overloaded on it.
13473 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
13474 TysForDecl.push_back(VecTy);
13475 auto *CEI = cast<CallInst>(VL0);
13476 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
13477 ValueList OpVL;
13478 // Some intrinsics have scalar arguments. This argument should not be
13479 // vectorized.
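// E.g. the i32 exponent of llvm.powi and the i1 "poison on INT_MIN" flag of
// llvm.abs are scalar operands and are passed through unchanged.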
13480 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
13481 ScalarArg = CEI->getArgOperand(I);
13482 // If it was decided to reduce the bitwidth of the abs intrinsic, its second
13483 // argument must be set to false (do not return poison if the value is the signed minimum).
13484 if (ID == Intrinsic::abs && It != MinBWs.end() &&
13485 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
13486 ScalarArg = Builder.getFalse();
13487 OpVecs.push_back(ScalarArg);
13488 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13489 TysForDecl.push_back(ScalarArg->getType());
13490 continue;
13491 }
13492
13493 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
13494 if (E->VectorizedValue) {
13495 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13496 return E->VectorizedValue;
13497 }
13498 ScalarArg = CEI->getArgOperand(I);
13499 if (cast<VectorType>(OpVec->getType())->getElementType() !=
13500 ScalarArg->getType()->getScalarType() &&
13501 It == MinBWs.end()) {
13502 auto *CastTy =
13503 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
13504 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
13505 } else if (It != MinBWs.end()) {
13506 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
13507 }
13508 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
13509 OpVecs.push_back(OpVec);
13510 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13511 TysForDecl.push_back(OpVec->getType());
13512 }
13513
13514 Function *CF;
13515 if (!UseIntrinsic) {
13516 VFShape Shape =
13517 VFShape::get(CI->getFunctionType(),
13518 ElementCount::getFixed(
13519 static_cast<unsigned>(VecTy->getNumElements())),
13520 false /*HasGlobalPred*/);
13521 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
13522 } else {
13523 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
13524 }
13525
13526 SmallVector<OperandBundleDef, 1> OpBundles;
13527 CI->getOperandBundlesAsDefs(OpBundles);
13528 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
13529
13530 propagateIRFlags(V, E->Scalars, VL0);
13531 V = FinalShuffle(V, E, VecTy);
13532
13533 E->VectorizedValue = V;
13534 ++NumVectorInstructions;
13535 return V;
13536 }
13537 case Instruction::ShuffleVector: {
13538 assert(E->isAltShuffle() &&
13539 ((Instruction::isBinaryOp(E->getOpcode()) &&
13540 Instruction::isBinaryOp(E->getAltOpcode())) ||
13541 (Instruction::isCast(E->getOpcode()) &&
13542 Instruction::isCast(E->getAltOpcode())) ||
13543 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13544 "Invalid Shuffle Vector Operand");
13545
13546 Value *LHS = nullptr, *RHS = nullptr;
13547 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
13548 setInsertPointAfterBundle(E);
13549 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13550 if (E->VectorizedValue) {
13551 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13552 return E->VectorizedValue;
13553 }
13554 RHS = vectorizeOperand(E, 1, PostponedPHIs);
13555 } else {
13556 setInsertPointAfterBundle(E);
13557 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13558 }
13559 if (E->VectorizedValue) {
13560 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13561 return E->VectorizedValue;
13562 }
13563 if (LHS && RHS &&
13564 ((Instruction::isBinaryOp(E->getOpcode()) &&
13565 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13566 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
13567 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13568 getOperandEntry(E, 1)->isGather() ||
13569 MinBWs.contains(getOperandEntry(E, 0)) ||
13570 MinBWs.contains(getOperandEntry(E, 1))) &&
13571 "Expected item in MinBWs.");
13572 Type *CastTy = VecTy;
13573 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
13574 if (cast<VectorType>(LHS->getType())
13575 ->getElementType()
13576 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
13577 ->getElementType()
13578 ->getIntegerBitWidth())
13579 CastTy = RHS->getType();
13580 else
13581 CastTy = LHS->getType();
13582 }
13583 if (LHS->getType() != CastTy)
13584 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
13585 if (RHS->getType() != CastTy)
13586 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
13587 }
13588
13589 Value *V0, *V1;
13590 if (Instruction::isBinaryOp(E->getOpcode())) {
13591 V0 = Builder.CreateBinOp(
13592 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13593 V1 = Builder.CreateBinOp(
13594 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13595 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13596 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
13597 auto *AltCI = cast<CmpInst>(E->getAltOp());
13598 CmpInst::Predicate AltPred = AltCI->getPredicate();
13599 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
13600 } else {
13601 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13602 unsigned SrcBWSz = DL->getTypeSizeInBits(
13603 cast<VectorType>(LHS->getType())->getElementType());
13604 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13605 if (BWSz <= SrcBWSz) {
13606 if (BWSz < SrcBWSz)
13607 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
13608 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13609 if (auto *I = dyn_cast<Instruction>(LHS))
13610 LHS = propagateMetadata(I, E->Scalars);
13611 E->VectorizedValue = LHS;
13612 ++NumVectorInstructions;
13613 return LHS;
13614 }
13615 }
13616 V0 = Builder.CreateCast(
13617 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
13618 V1 = Builder.CreateCast(
13619 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
13620 }
13621 // Add V0 and V1 to later analysis to try to find and remove matching
13622 // instruction, if any.
13623 for (Value *V : {V0, V1}) {
13624 if (auto *I = dyn_cast<Instruction>(V)) {
13625 GatherShuffleExtractSeq.insert(I);
13626 CSEBlocks.insert(I->getParent());
13627 }
13628 }
13629
13630 // Create shuffle to take alternate operations from the vector.
13631 // Also, gather up main and alt scalar ops to propagate IR flags to
13632 // each vector operation.
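// Worked example (illustrative): for scalars <add, sub, add, sub>, V0 is the
// whole-vector add and V1 the whole-vector sub; buildAltOpShuffleMask produces
// a mask like <0, 5, 2, 7>, i.e. even lanes are taken from V0 and odd lanes
// from V1 (indices >= VF select from the second shuffle operand).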
13633 ValueList OpScalars, AltScalars;
13634 SmallVector<int> Mask;
13635 E->buildAltOpShuffleMask(
13636 [E, this](Instruction *I) {
13637 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13638 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
13639 *TLI);
13640 },
13641 Mask, &OpScalars, &AltScalars);
13642
13643 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
13644 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
13645 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13646 // Drop nuw flags for abs(sub(commutative), true).
13647 if (auto *I = dyn_cast<Instruction>(Vec);
13648 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
13649 any_of(E->Scalars, [](Value *V) {
13650 auto *IV = cast<Instruction>(V);
13651 return IV->getOpcode() == Instruction::Sub &&
13652 isCommutative(cast<Instruction>(IV));
13653 }))
13654 I->setHasNoUnsignedWrap(/*b=*/false);
13655 };
13656 DropNuwFlag(V0, E->getOpcode());
13657 DropNuwFlag(V1, E->getAltOpcode());
13658
13659 Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
13660 if (auto *I = dyn_cast<Instruction>(V)) {
13661 V = propagateMetadata(I, E->Scalars);
13662 GatherShuffleExtractSeq.insert(I);
13663 CSEBlocks.insert(I->getParent());
13664 }
13665
13666 E->VectorizedValue = V;
13667 ++NumVectorInstructions;
13668
13669 return V;
13670 }
13671 default:
13672 llvm_unreachable("unknown inst");
13673 }
13674 return nullptr;
13675}
13676
13677 Value *BoUpSLP::vectorizeTree() {
13678 ExtraValueToDebugLocsMap ExternallyUsedValues;
13679 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13680 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13681}
13682
13683namespace {
13684/// Data type for handling buildvector sequences with the reused scalars from
13685/// other tree entries.
13686struct ShuffledInsertData {
13687 /// List of insertelements to be replaced by shuffles.
13688 SmallVector<InsertElementInst *> InsertElements;
13689 /// The parent vectors and shuffle mask for the given list of inserts.
13690 MapVector<Value *, SmallVector<int>> ValueMasks;
13691 };
13692} // namespace
13693
13694 Value *BoUpSLP::vectorizeTree(
13695 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13696 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13697 Instruction *ReductionRoot) {
13698 // All blocks must be scheduled before any instructions are inserted.
13699 for (auto &BSIter : BlocksSchedules) {
13700 scheduleBlock(BSIter.second.get());
13701 }
13702 // Clean the Entry-to-LastInstruction table. It can be affected by scheduling,
13703 // so it needs to be rebuilt.
13704 EntryToLastInstruction.clear();
13705
13706 if (ReductionRoot)
13707 Builder.SetInsertPoint(ReductionRoot->getParent(),
13708 ReductionRoot->getIterator());
13709 else
13710 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13711
13712 // Postpone emission of the PHI operands to avoid cyclic dependency issues.
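// E.g. two vector PHIs in the same block may (transitively) use each other's
// values, so the first pass only creates the PHI nodes themselves and the loop
// below fills in their operands once every entry has a vectorized value.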
13713 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13714 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13715 if (TE->State == TreeEntry::Vectorize &&
13716 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13717 TE->VectorizedValue)
13718 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
13719 // Run through the list of postponed gathers and emit them, replacing the
13720 // temporarily emitted allocas with actual vector instructions.
13721 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13722 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13723 for (const TreeEntry *E : PostponedNodes) {
13724 auto *TE = const_cast<TreeEntry *>(E);
13725 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
13726 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13727 TE->UserTreeIndices.front().EdgeIdx)) &&
13728 VecTE->isSame(TE->Scalars))
13729 // Found a gather node that is exactly the same as one of the
13730 // vectorized nodes. This may happen after reordering.
13731 continue;
13732 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13733 TE->VectorizedValue = nullptr;
13734 auto *UserI =
13735 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13736 // If the user is a PHI node, its vector code has to be inserted right before
13737 // the block terminator. Since the node was delayed, there were some unresolved
13738 // dependencies at the moment the stub instruction was emitted. If any of
13739 // these dependencies turn out to be an operand of another PHI coming from
13740 // this same block, the position of the stub instruction becomes invalid.
13741 // This is because the source vector that is supposed to feed this gather node
13742 // was inserted at the end of the block [after the stub instruction]. So we
13743 // need to adjust the insertion point again, to the end of the block.
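// Rough sketch of the hazard (assumed shapes, for illustration only):
//   %stub = ...                          ; placeholder for the postponed gather
//   %other.phi = phi <4 x i32> [ %src, %pred ], ...
//   %src = shufflevector ...             ; emitted at the end of this block
// If the replacement for %stub stayed at its old position it would precede
// %src, so the insertion point is moved down to the terminator (or just before
// the earliest non-PHI user of the stub in this block).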
13744 if (isa<PHINode>(UserI)) {
13745 // Insert before all users.
13746 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13747 for (User *U : PrevVec->users()) {
13748 if (U == UserI)
13749 continue;
13750 auto *UI = dyn_cast<Instruction>(U);
13751 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
13752 continue;
13753 if (UI->comesBefore(InsertPt))
13754 InsertPt = UI;
13755 }
13756 Builder.SetInsertPoint(InsertPt);
13757 } else {
13758 Builder.SetInsertPoint(PrevVec);
13759 }
13760 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13761 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
13762 if (Vec->getType() != PrevVec->getType()) {
13763 assert(Vec->getType()->isIntOrIntVectorTy() &&
13764 PrevVec->getType()->isIntOrIntVectorTy() &&
13765 "Expected integer vector types only.");
13766 std::optional<bool> IsSigned;
13767 for (Value *V : TE->Scalars) {
13768 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13769 auto It = MinBWs.find(BaseTE);
13770 if (It != MinBWs.end()) {
13771 IsSigned = IsSigned.value_or(false) || It->second.second;
13772 if (*IsSigned)
13773 break;
13774 }
13775 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
13776 auto It = MinBWs.find(MNTE);
13777 if (It != MinBWs.end()) {
13778 IsSigned = IsSigned.value_or(false) || It->second.second;
13779 if (*IsSigned)
13780 break;
13781 }
13782 }
13783 if (IsSigned.value_or(false))
13784 break;
13785 // Scan through gather nodes.
13786 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13787 auto It = MinBWs.find(BVE);
13788 if (It != MinBWs.end()) {
13789 IsSigned = IsSigned.value_or(false) || It->second.second;
13790 if (*IsSigned)
13791 break;
13792 }
13793 }
13794 if (IsSigned.value_or(false))
13795 break;
13796 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
13797 IsSigned =
13798 IsSigned.value_or(false) ||
13799 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
13800 continue;
13801 }
13802 if (IsSigned.value_or(false))
13803 break;
13804 }
13805 }
13806 if (IsSigned.value_or(false)) {
13807 // Final attempt - check user node.
13808 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
13809 if (It != MinBWs.end())
13810 IsSigned = It->second.second;
13811 }
13812 assert(IsSigned &&
13813 "Expected user node or perfect diamond match in MinBWs.");
13814 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
13815 }
13816 PrevVec->replaceAllUsesWith(Vec);
13817 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
13818 // Replace the stub vector node, if it was used before for one of the
13819 // buildvector nodes already.
13820 auto It = PostponedValues.find(PrevVec);
13821 if (It != PostponedValues.end()) {
13822 for (TreeEntry *VTE : It->getSecond())
13823 VTE->VectorizedValue = Vec;
13824 }
13825 eraseInstruction(PrevVec);
13826 }
13827
13828 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13829 << " values .\n");
13830
13831 SmallVector<ShuffledInsertData> ShuffledInserts;
13832 // Maps vector instruction to original insertelement instruction
13833 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13834 // Maps extract Scalar to the corresponding extractelement instruction in the
13835 // basic block. Only one extractelement per block should be emitted.
13836 DenseMap<Value *,
13837 SmallDenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13838 ScalarToEEs;
13839 SmallDenseSet<Value *, 4> UsedInserts;
13840 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13841 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13842 // Extract all of the elements with the external uses.
13843 for (const auto &ExternalUse : ExternalUses) {
13844 Value *Scalar = ExternalUse.Scalar;
13845 llvm::User *User = ExternalUse.User;
13846
13847 // Skip users that we already RAUW. This happens when one instruction
13848 // has multiple uses of the same value.
13849 if (User && !is_contained(Scalar->users(), User))
13850 continue;
13851 TreeEntry *E = getTreeEntry(Scalar);
13852 assert(E && "Invalid scalar");
13853 assert(!E->isGather() && "Extracting from a gather list");
13854 // Non-instruction pointers are not deleted, just skip them.
13855 if (E->getOpcode() == Instruction::GetElementPtr &&
13856 !isa<GetElementPtrInst>(Scalar))
13857 continue;
13858
13859 Value *Vec = E->VectorizedValue;
13860 assert(Vec && "Can't find vectorizable value");
13861
13862 Value *Lane = Builder.getInt32(ExternalUse.Lane);
13863 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13864 if (Scalar->getType() != Vec->getType()) {
13865 Value *Ex = nullptr;
13866 Value *ExV = nullptr;
13867 auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
13868 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
13869 auto It = ScalarToEEs.find(Scalar);
13870 if (It != ScalarToEEs.end()) {
13871 // No need to emit many extracts, just move the only one in the
13872 // current block.
13873 auto EEIt = It->second.find(Builder.GetInsertBlock());
13874 if (EEIt != It->second.end()) {
13875 Instruction *I = EEIt->second.first;
13876 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13877 Builder.GetInsertPoint()->comesBefore(I)) {
13878 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
13879 Builder.GetInsertPoint());
13880 if (auto *CI = EEIt->second.second)
13881 CI->moveAfter(I);
13882 }
13883 Ex = I;
13884 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13885 }
13886 }
13887 if (!Ex) {
13888 // "Reuse" the existing extract to improve final codegen.
13889 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13890 Value *V = ES->getVectorOperand();
13891 if (const TreeEntry *ETE = getTreeEntry(V))
13892 V = ETE->VectorizedValue;
13893 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
13894 } else if (ReplaceGEP) {
13895 // Leave the GEPs as is, they are free in most cases and better to
13896 // keep them as GEPs.
13897 auto *CloneGEP = GEP->clone();
13898 if (isa<Instruction>(Vec))
13899 CloneGEP->insertBefore(*Builder.GetInsertBlock(),
13900 Builder.GetInsertPoint());
13901 else
13902 CloneGEP->insertBefore(GEP);
13903 if (GEP->hasName())
13904 CloneGEP->takeName(GEP);
13905 Ex = CloneGEP;
13906 } else {
13907 Ex = Builder.CreateExtractElement(Vec, Lane);
13908 }
13909 // If necessary, sign-extend or zero-extend ScalarRoot
13910 // to the larger type.
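// E.g. if MinBWs narrowed this entry to i8 but the external user still expects
// i32, the extracted i8 lane is sign- or zero-extended back to i32 here, with
// the signedness taken from the MinBWs entry.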
13911 ExV = Ex;
13912 if (Scalar->getType() != Ex->getType())
13913 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
13914 MinBWs.find(E)->second.second);
13915 if (auto *I = dyn_cast<Instruction>(Ex))
13916 ScalarToEEs[Scalar].try_emplace(
13917 Builder.GetInsertBlock(),
13918 std::make_pair(I, cast<Instruction>(ExV)));
13919 }
13920 // The then-branch of the previous if may produce constants, since operand 0
13921 // might be a constant.
13922 if (auto *ExI = dyn_cast<Instruction>(Ex)) {
13923 GatherShuffleExtractSeq.insert(ExI);
13924 CSEBlocks.insert(ExI->getParent());
13925 }
13926 return ExV;
13927 }
13928 assert(isa<FixedVectorType>(Scalar->getType()) &&
13929 isa<InsertElementInst>(Scalar) &&
13930 "In-tree scalar of vector type is not insertelement?");
13931 auto *IE = cast<InsertElementInst>(Scalar);
13932 VectorToInsertElement.try_emplace(Vec, IE);
13933 return Vec;
13934 };
13935 // If User == nullptr, the Scalar remains as scalar in vectorized
13936 // instructions or is used as extra arg. Generate ExtractElement instruction
13937 // and update the record for this scalar in ExternallyUsedValues.
13938 if (!User) {
13939 if (!ScalarsWithNullptrUser.insert(Scalar).second)
13940 continue;
13941 assert((ExternallyUsedValues.count(Scalar) ||
13942 Scalar->hasNUsesOrMore(UsesLimit) ||
13943 any_of(Scalar->users(),
13944 [&](llvm::User *U) {
13945 if (ExternalUsesAsGEPs.contains(U))
13946 return true;
13947 TreeEntry *UseEntry = getTreeEntry(U);
13948 return UseEntry &&
13949 (UseEntry->State == TreeEntry::Vectorize ||
13950 UseEntry->State ==
13951 TreeEntry::StridedVectorize) &&
13952 (E->State == TreeEntry::Vectorize ||
13953 E->State == TreeEntry::StridedVectorize) &&
13954 doesInTreeUserNeedToExtract(
13955 Scalar,
13956 cast<Instruction>(UseEntry->Scalars.front()),
13957 TLI);
13958 })) &&
13959 "Scalar with nullptr User must be registered in "
13960 "ExternallyUsedValues map or remain as scalar in vectorized "
13961 "instructions");
13962 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13963 if (auto *PHI = dyn_cast<PHINode>(VecI))
13964 Builder.SetInsertPoint(PHI->getParent(),
13965 PHI->getParent()->getFirstNonPHIIt());
13966 else
13967 Builder.SetInsertPoint(VecI->getParent(),
13968 std::next(VecI->getIterator()));
13969 } else {
13970 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13971 }
13972 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13973 // Required to update internally referenced instructions.
13974 Scalar->replaceAllUsesWith(NewInst);
13975 ReplacedExternals.emplace_back(Scalar, NewInst);
13976 continue;
13977 }
13978
13979 if (auto *VU = dyn_cast<InsertElementInst>(User);
13980 VU && VU->getOperand(1) == Scalar) {
13981 // Skip if the scalar is another vector op or Vec is not an instruction.
13982 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13983 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
13984 if (!UsedInserts.insert(VU).second)
13985 continue;
13986 // Need to use original vector, if the root is truncated.
13987 auto BWIt = MinBWs.find(E);
13988 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13989 auto *ScalarTy = FTy->getElementType();
13990 auto Key = std::make_pair(Vec, ScalarTy);
13991 auto VecIt = VectorCasts.find(Key);
13992 if (VecIt == VectorCasts.end()) {
13993 IRBuilderBase::InsertPointGuard Guard(Builder);
13994 if (auto *IVec = dyn_cast<PHINode>(Vec))
13995 Builder.SetInsertPoint(
13996 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
13997 else if (auto *IVec = dyn_cast<Instruction>(Vec))
13998 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
13999 Vec = Builder.CreateIntCast(
14000 Vec,
14001 getWidenedType(
14002 ScalarTy,
14003 cast<FixedVectorType>(Vec->getType())->getNumElements()),
14004 BWIt->second.second);
14005 VectorCasts.try_emplace(Key, Vec);
14006 } else {
14007 Vec = VecIt->second;
14008 }
14009 }
14010
14011 std::optional<unsigned> InsertIdx = getElementIndex(VU);
14012 if (InsertIdx) {
14013 auto *It =
14014 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
14015 // Checks if 2 insertelements are from the same buildvector.
14016 InsertElementInst *VecInsert = Data.InsertElements.front();
14017 return areTwoInsertFromSameBuildVector(
14018 VU, VecInsert,
14019 [](InsertElementInst *II) { return II->getOperand(0); });
14020 });
14021 unsigned Idx = *InsertIdx;
14022 if (It == ShuffledInserts.end()) {
14023 (void)ShuffledInserts.emplace_back();
14024 It = std::next(ShuffledInserts.begin(),
14025 ShuffledInserts.size() - 1);
14026 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14027 if (Mask.empty())
14028 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14029 // Find the insertvector, vectorized in tree, if any.
14030 Value *Base = VU;
14031 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
14032 if (IEBase != User &&
14033 (!IEBase->hasOneUse() ||
14034 getElementIndex(IEBase).value_or(Idx) == Idx))
14035 break;
14036 // Build the mask for the vectorized insertelement instructions.
14037 if (const TreeEntry *E = getTreeEntry(IEBase)) {
14038 do {
14039 IEBase = cast<InsertElementInst>(Base);
14040 int IEIdx = *getElementIndex(IEBase);
14041 assert(Mask[IEIdx] == PoisonMaskElem &&
14042 "InsertElementInstruction used already.");
14043 Mask[IEIdx] = IEIdx;
14044 Base = IEBase->getOperand(0);
14045 } while (E == getTreeEntry(Base));
14046 break;
14047 }
14048 Base = cast<InsertElementInst>(Base)->getOperand(0);
14049 // After the vectorization the def-use chain has changed, need
14050 // to look through original insertelement instructions, if they
14051 // get replaced by vector instructions.
14052 auto It = VectorToInsertElement.find(Base);
14053 if (It != VectorToInsertElement.end())
14054 Base = It->second;
14055 }
14056 }
14057 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14058 if (Mask.empty())
14059 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14060 Mask[Idx] = ExternalUse.Lane;
14061 It->InsertElements.push_back(cast<InsertElementInst>(User));
14062 continue;
14063 }
14064 }
14065 }
14066 }
14067
14068 // Generate extracts for out-of-tree users.
14069 // Find the insertion point for the extractelement lane.
14070 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
14071 if (PHINode *PH = dyn_cast<PHINode>(User)) {
14072 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
14073 if (PH->getIncomingValue(I) == Scalar) {
14074 Instruction *IncomingTerminator =
14075 PH->getIncomingBlock(I)->getTerminator();
14076 if (isa<CatchSwitchInst>(IncomingTerminator)) {
14077 Builder.SetInsertPoint(VecI->getParent(),
14078 std::next(VecI->getIterator()));
14079 } else {
14080 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
14081 }
14082 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14083 PH->setOperand(I, NewInst);
14084 }
14085 }
14086 } else {
14087 Builder.SetInsertPoint(cast<Instruction>(User));
14088 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14089 User->replaceUsesOfWith(Scalar, NewInst);
14090 }
14091 } else {
14092 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
14093 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14094 User->replaceUsesOfWith(Scalar, NewInst);
14095 }
14096
14097 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
14098 }
14099
14100 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14101 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
14102 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
14103 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14104 for (int I = 0, E = Mask.size(); I < E; ++I) {
14105 if (Mask[I] < VF)
14106 CombinedMask1[I] = Mask[I];
14107 else
14108 CombinedMask2[I] = Mask[I] - VF;
14109 }
14110 ShuffleInstructionBuilder ShuffleBuilder(
14111 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
14112 ShuffleBuilder.add(V1, CombinedMask1);
14113 if (V2)
14114 ShuffleBuilder.add(V2, CombinedMask2);
14115 return ShuffleBuilder.finalize(std::nullopt);
14116 };
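// Example of the mask split performed by CreateShuffle (illustrative): with
// VF = 4 and Mask = <0, 5, 2, 7>, CombinedMask1 becomes <0, poison, 2, poison>
// (lanes taken from V1) and CombinedMask2 becomes <poison, 1, poison, 3>
// (lanes taken from V2); ShuffleInstructionBuilder then folds the pair into a
// single shuffle.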
14117
14118 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
14119 bool ForSingleMask) {
14120 unsigned VF = Mask.size();
14121 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14122 if (VF != VecVF) {
14123 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
14124 Vec = CreateShuffle(Vec, nullptr, Mask);
14125 return std::make_pair(Vec, true);
14126 }
14127 if (!ForSingleMask) {
14128 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14129 for (unsigned I = 0; I < VF; ++I) {
14130 if (Mask[I] != PoisonMaskElem)
14131 ResizeMask[Mask[I]] = Mask[I];
14132 }
14133 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
14134 }
14135 }
14136
14137 return std::make_pair(Vec, false);
14138 };
14139 // Perform shuffling of the vectorized tree entries for better handling of
14140 // external extracts.
14141 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
14142 // Find the first and the last instruction in the list of insertelements.
14143 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
14144 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
14145 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
14146 Builder.SetInsertPoint(LastInsert);
14147 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
14148 Value *NewInst = performExtractsShuffleAction<Value>(
14149 MutableArrayRef(Vector.data(), Vector.size()),
14150 FirstInsert->getOperand(0),
14151 [](Value *Vec) {
14152 return cast<VectorType>(Vec->getType())
14153 ->getElementCount()
14154 .getKnownMinValue();
14155 },
14156 ResizeToVF,
14157 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
14158 ArrayRef<Value *> Vals) {
14159 assert((Vals.size() == 1 || Vals.size() == 2) &&
14160 "Expected exactly 1 or 2 input values.");
14161 if (Vals.size() == 1) {
14162 // Do not create shuffle if the mask is a simple identity
14163 // non-resizing mask.
14164 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
14165 ->getNumElements() ||
14166 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14167 return CreateShuffle(Vals.front(), nullptr, Mask);
14168 return Vals.front();
14169 }
14170 return CreateShuffle(Vals.front() ? Vals.front()
14171 : FirstInsert->getOperand(0),
14172 Vals.back(), Mask);
14173 });
14174 auto It = ShuffledInserts[I].InsertElements.rbegin();
14175 // Rebuild buildvector chain.
14176 InsertElementInst *II = nullptr;
14177 if (It != ShuffledInserts[I].InsertElements.rend())
14178 II = *It;
14179 SmallVector<Instruction *> Inserts;
14180 while (It != ShuffledInserts[I].InsertElements.rend()) {
14181 assert(II && "Must be an insertelement instruction.");
14182 if (*It == II)
14183 ++It;
14184 else
14185 Inserts.push_back(cast<Instruction>(II));
14186 II = dyn_cast<InsertElementInst>(II->getOperand(0));
14187 }
14188 for (Instruction *II : reverse(Inserts)) {
14189 II->replaceUsesOfWith(II->getOperand(0), NewInst);
14190 if (auto *NewI = dyn_cast<Instruction>(NewInst))
14191 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
14192 II->moveAfter(NewI);
14193 NewInst = II;
14194 }
14195 LastInsert->replaceAllUsesWith(NewInst);
14196 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
14197 IE->replaceUsesOfWith(IE->getOperand(0),
14198 PoisonValue::get(IE->getOperand(0)->getType()));
14199 IE->replaceUsesOfWith(IE->getOperand(1),
14200 PoisonValue::get(IE->getOperand(1)->getType()));
14201 eraseInstruction(IE);
14202 }
14203 CSEBlocks.insert(LastInsert->getParent());
14204 }
14205
14206 SmallVector<Instruction *> RemovedInsts;
14207 // For each vectorized value:
14208 for (auto &TEPtr : VectorizableTree) {
14209 TreeEntry *Entry = TEPtr.get();
14210
14211 // No need to handle users of gathered values.
14212 if (Entry->isGather())
14213 continue;
14214
14215 assert(Entry->VectorizedValue && "Can't find vectorizable value");
14216
14217 // For each lane:
14218 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14219 Value *Scalar = Entry->Scalars[Lane];
14220
14221 if (Entry->getOpcode() == Instruction::GetElementPtr &&
14222 !isa<GetElementPtrInst>(Scalar))
14223 continue;
14224#ifndef NDEBUG
14225 Type *Ty = Scalar->getType();
14226 if (!Ty->isVoidTy()) {
14227 for (User *U : Scalar->users()) {
14228 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
14229
14230 // It is legal to delete users in the ignorelist.
14231 assert((getTreeEntry(U) ||
14232 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14233 (isa_and_nonnull<Instruction>(U) &&
14234 isDeleted(cast<Instruction>(U)))) &&
14235 "Deleting out-of-tree value");
14236 }
14237 }
14238#endif
14239 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
14240 auto *I = cast<Instruction>(Scalar);
14241 RemovedInsts.push_back(I);
14242 }
14243 }
14244
14245 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
14246 // new vector instruction.
14247 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
14248 V->mergeDIAssignID(RemovedInsts);
14249
14250 // Clear up reduction references, if any.
14251 if (UserIgnoreList) {
14252 for (Instruction *I : RemovedInsts) {
14253 if (getTreeEntry(I)->Idx != 0)
14254 continue;
14255 SmallVector<SelectInst *> LogicalOpSelects;
14256 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
14257 // Do not replace the condition of a logical op in the form 'select <cond>, ...'.
14258 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
14259 (match(U.getUser(), m_LogicalAnd()) ||
14260 match(U.getUser(), m_LogicalOr())) &&
14261 U.getOperandNo() == 0;
14262 if (IsPoisoningLogicalOp) {
14263 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
14264 return false;
14265 }
14266 return UserIgnoreList->contains(U.getUser());
14267 });
14268 // Replace conditions of the poisoning logical ops with the non-poison
14269 // constant value.
14270 for (SelectInst *SI : LogicalOpSelects)
14271 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
14272 }
14273 }
14274 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
14275 // cache correctness.
14276 // NOTE: removeInstructionsAndOperands only marks the instructions for deletion
14277 // - instructions are not deleted until later.
14278 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
14279
14280 Builder.ClearInsertionPoint();
14281 InstrElementSize.clear();
14282
14283 const TreeEntry &RootTE = *VectorizableTree.front();
14284 Value *Vec = RootTE.VectorizedValue;
14285 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
14286 It != MinBWs.end() &&
14287 ReductionBitWidth != It->second.first) {
14288 IRBuilder<>::InsertPointGuard Guard(Builder);
14289 Builder.SetInsertPoint(ReductionRoot->getParent(),
14290 ReductionRoot->getIterator());
14291 Vec = Builder.CreateIntCast(
14292 Vec,
14293 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
14294 cast<VectorType>(Vec->getType())->getElementCount()),
14295 It->second.second);
14296 }
14297 return Vec;
14298}
14299
14301 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
14302 << " gather sequences instructions.\n");
14303 // LICM InsertElementInst sequences.
14304 for (Instruction *I : GatherShuffleExtractSeq) {
14305 if (isDeleted(I))
14306 continue;
14307
14308 // Check if this block is inside a loop.
14309 Loop *L = LI->getLoopFor(I->getParent());
14310 if (!L)
14311 continue;
14312
14313 // Check if it has a preheader.
14314 BasicBlock *PreHeader = L->getLoopPreheader();
14315 if (!PreHeader)
14316 continue;
14317
14318 // If the vector or the element that we insert into it are
14319 // instructions that are defined in this basic block then we can't
14320 // hoist this instruction.
14321 if (any_of(I->operands(), [L](Value *V) {
14322 auto *OpI = dyn_cast<Instruction>(V);
14323 return OpI && L->contains(OpI);
14324 }))
14325 continue;
14326
14327 // We can hoist this instruction. Move it to the pre-header.
14328 I->moveBefore(PreHeader->getTerminator());
14329 CSEBlocks.insert(PreHeader);
14330 }
14331
14332 // Make a list of all reachable blocks in our CSE queue.
14333 SmallVector<const DomTreeNode *, 8> CSEWorkList;
14334 CSEWorkList.reserve(CSEBlocks.size());
14335 for (BasicBlock *BB : CSEBlocks)
14336 if (DomTreeNode *N = DT->getNode(BB)) {
14337 assert(DT->isReachableFromEntry(N));
14338 CSEWorkList.push_back(N);
14339 }
14340
14341 // Sort blocks by domination. This ensures we visit a block after all blocks
14342 // dominating it are visited.
14343 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
14344 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
14345 "Different nodes should have different DFS numbers");
14346 return A->getDFSNumIn() < B->getDFSNumIn();
14347 });
14348
14349 // Less defined shuffles can be replaced by their more defined copies.
14350 // Of two shuffles with the same vector operands, one is less defined if each
14351 // of its mask indices is either the same as in the other one or undef. E.g.
14352 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
14353 // poison, <0, 0, 0, 0>.
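// When such a pair is found, the masks are merged lane by lane, e.g. combining
// <0, poison, 2, poison> with <0, 1, poison, poison> gives <0, 1, 2, poison>,
// and the merged mask is installed on the surviving shuffle via setShuffleMask.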
14354 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
14355 SmallVectorImpl<int> &NewMask) {
14356 if (I1->getType() != I2->getType())
14357 return false;
14358 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
14359 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
14360 if (!SI1 || !SI2)
14361 return I1->isIdenticalTo(I2);
14362 if (SI1->isIdenticalTo(SI2))
14363 return true;
14364 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
14365 if (SI1->getOperand(I) != SI2->getOperand(I))
14366 return false;
14367 // Check if the second instruction is more defined than the first one.
14368 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
14369 ArrayRef<int> SM1 = SI1->getShuffleMask();
14370 // Count trailing undefs in the mask to check the final number of used
14371 // registers.
14372 unsigned LastUndefsCnt = 0;
14373 for (int I = 0, E = NewMask.size(); I < E; ++I) {
14374 if (SM1[I] == PoisonMaskElem)
14375 ++LastUndefsCnt;
14376 else
14377 LastUndefsCnt = 0;
14378 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
14379 NewMask[I] != SM1[I])
14380 return false;
14381 if (NewMask[I] == PoisonMaskElem)
14382 NewMask[I] = SM1[I];
14383 }
14384 // Check if the last undefs actually change the final number of used vector
14385 // registers.
14386 return SM1.size() - LastUndefsCnt > 1 &&
14387 TTI->getNumberOfParts(SI1->getType()) ==
14388 TTI->getNumberOfParts(
14389 getWidenedType(SI1->getType()->getElementType(),
14390 SM1.size() - LastUndefsCnt));
14391 };
14392 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
14393 // instructions. TODO: We can further optimize this scan if we split the
14394 // instructions into different buckets based on the insert lane.
14395 SmallVector<Instruction *, 16> Visited;
14396 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
14397 assert(*I &&
14398 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
14399 "Worklist not sorted properly!");
14400 BasicBlock *BB = (*I)->getBlock();
14401 // For all instructions in blocks containing gather sequences:
14402 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
14403 if (isDeleted(&In))
14404 continue;
14405 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
14406 !GatherShuffleExtractSeq.contains(&In))
14407 continue;
14408
14409 // Check if we can replace this instruction with any of the
14410 // visited instructions.
14411 bool Replaced = false;
14412 for (Instruction *&V : Visited) {
14413 SmallVector<int> NewMask;
14414 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14415 DT->dominates(V->getParent(), In.getParent())) {
14416 In.replaceAllUsesWith(V);
14417 eraseInstruction(&In);
14418 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
14419 if (!NewMask.empty())
14420 SI->setShuffleMask(NewMask);
14421 Replaced = true;
14422 break;
14423 }
14424 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
14425 GatherShuffleExtractSeq.contains(V) &&
14426 IsIdenticalOrLessDefined(V, &In, NewMask) &&
14427 DT->dominates(In.getParent(), V->getParent())) {
14428 In.moveAfter(V);
14429 V->replaceAllUsesWith(&In);
14430 eraseInstruction(V);
14431 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
14432 if (!NewMask.empty())
14433 SI->setShuffleMask(NewMask);
14434 V = &In;
14435 Replaced = true;
14436 break;
14437 }
14438 }
14439 if (!Replaced) {
14440 assert(!is_contained(Visited, &In));
14441 Visited.push_back(&In);
14442 }
14443 }
14444 }
14445 CSEBlocks.clear();
14446 GatherShuffleExtractSeq.clear();
14447}
14448
14449BoUpSLP::ScheduleData *
14450BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
14451 ScheduleData *Bundle = nullptr;
14452 ScheduleData *PrevInBundle = nullptr;
14453 for (Value *V : VL) {
14454 if (doesNotNeedToBeScheduled(V))
14455 continue;
14456 ScheduleData *BundleMember = getScheduleData(V);
14457 assert(BundleMember &&
14458 "no ScheduleData for bundle member "
14459 "(maybe not in same basic block)");
14460 assert(BundleMember->isSchedulingEntity() &&
14461 "bundle member already part of other bundle");
14462 if (PrevInBundle) {
14463 PrevInBundle->NextInBundle = BundleMember;
14464 } else {
14465 Bundle = BundleMember;
14466 }
14467
14468 // Group the instructions to a bundle.
14469 BundleMember->FirstInBundle = Bundle;
14470 PrevInBundle = BundleMember;
14471 }
14472 assert(Bundle && "Failed to find schedule bundle");
14473 return Bundle;
14474}
14475
14476 // Groups the instructions into a bundle (which is then a single scheduling
14477 // entity) and schedules instructions until the bundle gets ready.
14478std::optional<BoUpSLP::ScheduleData *>
14479BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
14480 const InstructionsState &S) {
14481 // No need to schedule PHIs, insertelement, extractelement and extractvalue
14482 // instructions.
14483 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
14484 doesNotNeedToBeScheduled(S.OpValue))
14485 return nullptr;
14486
14487 // Initialize the instruction bundle.
14488 Instruction *OldScheduleEnd = ScheduleEnd;
14489 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
14490
14491 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
14492 ScheduleData *Bundle) {
14493 // The scheduling region got new instructions at the lower end (or it is a
14494 // new region for the first bundle). This makes it necessary to
14495 // recalculate all dependencies.
14496 // It is seldom that this needs to be done a second time after adding the
14497 // initial bundle to the region.
14498 if (ScheduleEnd != OldScheduleEnd) {
14499 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
14500 doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
14501 ReSchedule = true;
14502 }
14503 if (Bundle) {
14504 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
14505 << " in block " << BB->getName() << "\n");
14506 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
14507 }
14508
14509 if (ReSchedule) {
14510 resetSchedule();
14511 initialFillReadyList(ReadyInsts);
14512 }
14513
14514 // Now try to schedule the new bundle or (if no bundle) just calculate
14515 // dependencies. As soon as the bundle is "ready" it means that there are no
14516 // cyclic dependencies and we can schedule it. Note that it's important that
14517 // we don't "schedule" the bundle yet (see cancelScheduling).
14518 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14519 !ReadyInsts.empty()) {
14520 ScheduleData *Picked = ReadyInsts.pop_back_val();
14521 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14522 "must be ready to schedule");
14523 schedule(Picked, ReadyInsts);
14524 }
14525 };
14526
14527 // Make sure that the scheduling region contains all
14528 // instructions of the bundle.
14529 for (Value *V : VL) {
14530 if (doesNotNeedToBeScheduled(V))
14531 continue;
14532 if (!extendSchedulingRegion(V, S)) {
14533 // The scheduling region got new instructions at the lower end (or it
14534 // is a new region for the first bundle). This makes it necessary to
14535 // recalculate all dependencies.
14536 // Otherwise the compiler may crash trying to incorrectly calculate
14537 // dependencies and emit instructions in the wrong order at the actual
14538 // scheduling.
14539 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
14540 return std::nullopt;
14541 }
14542 }
14543
14544 bool ReSchedule = false;
14545 for (Value *V : VL) {
14546 if (doesNotNeedToBeScheduled(V))
14547 continue;
14548 ScheduleData *BundleMember = getScheduleData(V);
14549 assert(BundleMember &&
14550 "no ScheduleData for bundle member (maybe not in same basic block)");
14551
14552 // Make sure we don't leave the pieces of the bundle in the ready list when
14553 // the whole bundle might not be ready.
14554 ReadyInsts.remove(BundleMember);
14555
14556 if (!BundleMember->IsScheduled)
14557 continue;
14558 // A bundle member was scheduled as a single instruction before and now
14559 // needs to be scheduled as part of the bundle. We just get rid of the
14560 // existing schedule.
14561 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
14562 << " was already scheduled\n");
14563 ReSchedule = true;
14564 }
14565
14566 auto *Bundle = buildBundle(VL);
14567 TryScheduleBundleImpl(ReSchedule, Bundle);
14568 if (!Bundle->isReady()) {
14569 cancelScheduling(VL, S.OpValue);
14570 return std::nullopt;
14571 }
14572 return Bundle;
14573}
14574
14575void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
14576 Value *OpValue) {
14577 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
14578 doesNotNeedToBeScheduled(OpValue))
14579 return;
14580
14581 if (doesNotNeedToBeScheduled(OpValue))
14582 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
14583 ScheduleData *Bundle = getScheduleData(OpValue);
14584 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
14585 assert(!Bundle->IsScheduled &&
14586 "Can't cancel bundle which is already scheduled");
14587 assert(Bundle->isSchedulingEntity() &&
14588 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
14589 "tried to unbundle something which is not a bundle");
14590
14591 // Remove the bundle from the ready list.
14592 if (Bundle->isReady())
14593 ReadyInsts.remove(Bundle);
14594
14595 // Un-bundle: make single instructions out of the bundle.
14596 ScheduleData *BundleMember = Bundle;
14597 while (BundleMember) {
14598 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
14599 BundleMember->FirstInBundle = BundleMember;
14600 ScheduleData *Next = BundleMember->NextInBundle;
14601 BundleMember->NextInBundle = nullptr;
14602 BundleMember->TE = nullptr;
14603 if (BundleMember->unscheduledDepsInBundle() == 0) {
14604 ReadyInsts.insert(BundleMember);
14605 }
14606 BundleMember = Next;
14607 }
14608}
14609
14610BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14611 // Allocate a new ScheduleData for the instruction.
14612 if (ChunkPos >= ChunkSize) {
14613 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14614 ChunkPos = 0;
14615 }
14616 return &(ScheduleDataChunks.back()[ChunkPos++]);
14617}
14618
14619bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14620 const InstructionsState &S) {
14621 if (getScheduleData(V, isOneOf(S, V)))
14622 return true;
14623 Instruction *I = dyn_cast<Instruction>(V);
14624 assert(I && "bundle member must be an instruction");
14625 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14626 !doesNotNeedToBeScheduled(I) &&
14627 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14628 "be scheduled");
14629 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14630 ScheduleData *ISD = getScheduleData(I);
14631 if (!ISD)
14632 return false;
14633 assert(isInSchedulingRegion(ISD) &&
14634 "ScheduleData not in scheduling region");
14635 ScheduleData *SD = allocateScheduleDataChunks();
14636 SD->Inst = I;
14637 SD->init(SchedulingRegionID, S.OpValue);
14638 ExtraScheduleDataMap[I][S.OpValue] = SD;
14639 return true;
14640 };
14641 if (CheckScheduleForI(I))
14642 return true;
14643 if (!ScheduleStart) {
14644 // It's the first instruction in the new region.
14645 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
14646 ScheduleStart = I;
14647 ScheduleEnd = I->getNextNode();
14648 if (isOneOf(S, I) != I)
14649 CheckScheduleForI(I);
14650 assert(ScheduleEnd && "tried to vectorize a terminator?");
14651 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
14652 return true;
14653 }
14654 // Search up and down at the same time, because we don't know if the new
14655 // instruction is above or below the existing scheduling region.
14656 // Ignore debug info (and other "AssumeLike" intrinsics) so that they are not
14657 // counted against the budget. Otherwise debug info could affect codegen.
14658 BasicBlock::reverse_iterator UpIter =
14659 ++ScheduleStart->getIterator().getReverse();
14660 BasicBlock::reverse_iterator UpperEnd = BB->rend();
14661 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14662 BasicBlock::iterator LowerEnd = BB->end();
14663 auto IsAssumeLikeIntr = [](const Instruction &I) {
14664 if (auto *II = dyn_cast<IntrinsicInst>(&I))
14665 return II->isAssumeLikeIntrinsic();
14666 return false;
14667 };
14668 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14669 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14670 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14671 &*DownIter != I) {
14672 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14673 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
14674 return false;
14675 }
14676
14677 ++UpIter;
14678 ++DownIter;
14679
14680 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14681 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14682 }
14683 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14684 assert(I->getParent() == ScheduleStart->getParent() &&
14685 "Instruction is in wrong basic block.");
14686 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
14687 ScheduleStart = I;
14688 if (isOneOf(S, I) != I)
14689 CheckScheduleForI(I);
14690 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
14691 << "\n");
14692 return true;
14693 }
14694 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14695 "Expected to reach top of the basic block or instruction down the "
14696 "lower end.");
14697 assert(I->getParent() == ScheduleEnd->getParent() &&
14698 "Instruction is in wrong basic block.");
14699 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
14700 nullptr);
14701 ScheduleEnd = I->getNextNode();
14702 if (isOneOf(S, I) != I)
14703 CheckScheduleForI(I);
14704 assert(ScheduleEnd && "tried to vectorize a terminator?");
14705 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14706 return true;
14707}
14708
14709void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14710 Instruction *ToI,
14711 ScheduleData *PrevLoadStore,
14712 ScheduleData *NextLoadStore) {
14713 ScheduleData *CurrentLoadStore = PrevLoadStore;
14714 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14715 // No need to allocate data for non-schedulable instructions.
14716 if (doesNotNeedToBeScheduled(I))
14717 continue;
14718 ScheduleData *SD = ScheduleDataMap.lookup(I);
14719 if (!SD) {
14720 SD = allocateScheduleDataChunks();
14721 ScheduleDataMap[I] = SD;
14722 SD->Inst = I;
14723 }
14724 assert(!isInSchedulingRegion(SD) &&
14725 "new ScheduleData already in scheduling region");
14726 SD->init(SchedulingRegionID, I);
14727
14728 if (I->mayReadOrWriteMemory() &&
14729 (!isa<IntrinsicInst>(I) ||
14730 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14731 cast<IntrinsicInst>(I)->getIntrinsicID() !=
14732 Intrinsic::pseudoprobe))) {
14733 // Update the linked list of memory accessing instructions.
14734 if (CurrentLoadStore) {
14735 CurrentLoadStore->NextLoadStore = SD;
14736 } else {
14737 FirstLoadStoreInRegion = SD;
14738 }
14739 CurrentLoadStore = SD;
14740 }
14741
14742 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14743 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14744 RegionHasStackSave = true;
14745 }
14746 if (NextLoadStore) {
14747 if (CurrentLoadStore)
14748 CurrentLoadStore->NextLoadStore = NextLoadStore;
14749 } else {
14750 LastLoadStoreInRegion = CurrentLoadStore;
14751 }
14752}
14753
14754void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14755 bool InsertInReadyList,
14756 BoUpSLP *SLP) {
14757 assert(SD->isSchedulingEntity());
14758
14759 SmallVector<ScheduleData *, 10> WorkList;
14760 WorkList.push_back(SD);
14761
14762 while (!WorkList.empty()) {
14763 ScheduleData *SD = WorkList.pop_back_val();
14764 for (ScheduleData *BundleMember = SD; BundleMember;
14765 BundleMember = BundleMember->NextInBundle) {
14766 assert(isInSchedulingRegion(BundleMember));
14767 if (BundleMember->hasValidDependencies())
14768 continue;
14769
14770 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14771 << "\n");
14772 BundleMember->Dependencies = 0;
14773 BundleMember->resetUnscheduledDeps();
14774
14775 // Handle def-use chain dependencies.
14776 if (BundleMember->OpValue != BundleMember->Inst) {
14777 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14778 BundleMember->Dependencies++;
14779 ScheduleData *DestBundle = UseSD->FirstInBundle;
14780 if (!DestBundle->IsScheduled)
14781 BundleMember->incrementUnscheduledDeps(1);
14782 if (!DestBundle->hasValidDependencies())
14783 WorkList.push_back(DestBundle);
14784 }
14785 } else {
14786 for (User *U : BundleMember->Inst->users()) {
14787 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14788 BundleMember->Dependencies++;
14789 ScheduleData *DestBundle = UseSD->FirstInBundle;
14790 if (!DestBundle->IsScheduled)
14791 BundleMember->incrementUnscheduledDeps(1);
14792 if (!DestBundle->hasValidDependencies())
14793 WorkList.push_back(DestBundle);
14794 }
14795 }
14796 }
14797
14798 auto MakeControlDependent = [&](Instruction *I) {
14799 auto *DepDest = getScheduleData(I);
14800 assert(DepDest && "must be in schedule window");
14801 DepDest->ControlDependencies.push_back(BundleMember);
14802 BundleMember->Dependencies++;
14803 ScheduleData *DestBundle = DepDest->FirstInBundle;
14804 if (!DestBundle->IsScheduled)
14805 BundleMember->incrementUnscheduledDeps(1);
14806 if (!DestBundle->hasValidDependencies())
14807 WorkList.push_back(DestBundle);
14808 };
14809
14810 // Any instruction which isn't safe to speculate at the beginning of the
14811 // block is control dependent on any early exit or non-willreturn call
14812 // which precedes it.
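// E.g. a udiv whose divisor might be zero must not be hoisted above a
// call that may never return; the control dependencies added below keep
// such instructions behind the non-transferring instruction during
// scheduling.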
14813 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
14814 for (Instruction *I = BundleMember->Inst->getNextNode();
14815 I != ScheduleEnd; I = I->getNextNode()) {
14816 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
14817 continue;
14818
14819 // Add the dependency
14820 MakeControlDependent(I);
14821
14822 if (!isGuaranteedToTransferExecutionToSuccessor(I))
14823 // Everything past here must be control dependent on I.
14824 break;
14825 }
14826 }
14827
14828 if (RegionHasStackSave) {
14829 // If we have an inalloca alloca instruction, it needs to be scheduled
14830 // after any preceding stacksave. We also need to prevent any alloca
14831 // from reordering above a preceding stackrestore.
14832 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14833 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14834 for (Instruction *I = BundleMember->Inst->getNextNode();
14835 I != ScheduleEnd; I = I->getNextNode()) {
14836 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14837 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14838 // Any allocas past here must be control dependent on I, and I
14839 // must be memory dependent on BundleMember->Inst.
14840 break;
14841
14842 if (!isa<AllocaInst>(I))
14843 continue;
14844
14845 // Add the dependency
14846 MakeControlDependent(I);
14847 }
14848 }
14849
14850 // In addition to the cases handled just above, we need to prevent
14851 // allocas and loads/stores from moving below a stacksave or a
14852 // stackrestore. Avoiding moving allocas below a stackrestore is currently
14853 // thought to be conservatism. Moving loads/stores below a stackrestore
14854 // can lead to incorrect code.
14855 if (isa<AllocaInst>(BundleMember->Inst) ||
14856 BundleMember->Inst->mayReadOrWriteMemory()) {
14857 for (Instruction *I = BundleMember->Inst->getNextNode();
14858 I != ScheduleEnd; I = I->getNextNode()) {
14859 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14860 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14861 continue;
14862
14863 // Add the dependency
14864 MakeControlDependent(I);
14865 break;
14866 }
14867 }
14868 }
14869
14870 // Handle the memory dependencies (if any).
14871 ScheduleData *DepDest = BundleMember->NextLoadStore;
14872 if (!DepDest)
14873 continue;
14874 Instruction *SrcInst = BundleMember->Inst;
14875 assert(SrcInst->mayReadOrWriteMemory() &&
14876 "NextLoadStore list for non memory effecting bundle?");
14877 MemoryLocation SrcLoc = getLocation(SrcInst);
14878 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14879 unsigned NumAliased = 0;
14880 unsigned DistToSrc = 1;
14881
14882 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14883 assert(isInSchedulingRegion(DepDest));
14884
14885 // We have two limits to reduce the complexity:
14886 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14887 // SLP->isAliased (which is the expensive part in this loop).
14888 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14889 // the whole loop (even if the loop is fast, it's quadratic).
14890 // It's important for the loop break condition (see below) to
14891 // check this limit even between two read-only instructions.
14892 if (DistToSrc >= MaxMemDepDistance ||
14893 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14894 (NumAliased >= AliasedCheckLimit ||
14895 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14896
14897 // We increment the counter only if the locations are aliased
14898 // (instead of counting all alias checks). This gives a better
14899 // balance between reduced runtime and accurate dependencies.
14900 NumAliased++;
14901
14902 DepDest->MemoryDependencies.push_back(BundleMember);
14903 BundleMember->Dependencies++;
14904 ScheduleData *DestBundle = DepDest->FirstInBundle;
14905 if (!DestBundle->IsScheduled) {
14906 BundleMember->incrementUnscheduledDeps(1);
14907 }
14908 if (!DestBundle->hasValidDependencies()) {
14909 WorkList.push_back(DestBundle);
14910 }
14911 }
14912
14913 // Example, explaining the loop break condition: Let's assume our
14914 // starting instruction is i0 and MaxMemDepDistance = 3.
14915 //
14916 // +--------v--v--v
14917 // i0,i1,i2,i3,i4,i5,i6,i7,i8
14918 // +--------^--^--^
14919 //
14920 // MaxMemDepDistance let us stop alias-checking at i3 and we add
14921 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14922 // Previously we already added dependencies from i3 to i6,i7,i8
14923 // (because of MaxMemDepDistance). As we added a dependency from
14924 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14925 // and we can abort this loop at i6.
14926 if (DistToSrc >= 2 * MaxMemDepDistance)
14927 break;
14928 DistToSrc++;
14929 }
14930 }
14931 if (InsertInReadyList && SD->isReady()) {
14932 ReadyInsts.insert(SD);
14933 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
14934 << "\n");
14935 }
14936 }
14937}
14938
14939void BoUpSLP::BlockScheduling::resetSchedule() {
14940 assert(ScheduleStart &&
14941 "tried to reset schedule on block which has not been scheduled");
14942 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14943 doForAllOpcodes(I, [&](ScheduleData *SD) {
14944 assert(isInSchedulingRegion(SD) &&
14945 "ScheduleData not in scheduling region");
14946 SD->IsScheduled = false;
14947 SD->resetUnscheduledDeps();
14948 });
14949 }
14950 ReadyInsts.clear();
14951}
14952
14953void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14954 if (!BS->ScheduleStart)
14955 return;
14956
14957 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14958
14959 // A key point - if we got here, pre-scheduling was able to find a valid
14960 // scheduling of the sub-graph of the scheduling window which consists
14961 // of all vector bundles and their transitive users. As such, we do not
14962 // need to reschedule anything *outside of* that subgraph.
14963
14964 BS->resetSchedule();
14965
14966 // For the real scheduling we use a more sophisticated ready-list: it is
14967 // sorted by the original instruction location. This lets the final schedule
14968 // be as close as possible to the original instruction order.
14969 // WARNING: If changing this order causes a correctness issue, that means
14970 // there is some missing dependence edge in the schedule data graph.
14971 struct ScheduleDataCompare {
14972 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14973 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14974 }
14975 };
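// Note: ReadyInsts is ordered by descending SchedulingPriority, so
// *ReadyInsts.begin() is the ready bundle that comes latest in the original
// instruction order. Because the loop below places each picked instruction
// directly in front of the previously placed one (starting from ScheduleEnd
// and growing upwards), preferring the latest ready entry keeps the final
// layout close to the original order.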
14976 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14977
14978 // Ensure that all dependency data is updated (for nodes in the sub-graph)
14979 // and fill the ready-list with initial instructions.
14980 int Idx = 0;
14981 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14982 I = I->getNextNode()) {
14983 BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
14984 TreeEntry *SDTE = getTreeEntry(SD->Inst);
14985 (void)SDTE;
14986 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14987 SD->isPartOfBundle() ==
14988 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14989 "scheduler and vectorizer bundle mismatch");
14990 SD->FirstInBundle->SchedulingPriority = Idx++;
14991
14992 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14993 BS->calculateDependencies(SD, false, this);
14994 });
14995 }
14996 BS->initialFillReadyList(ReadyInsts);
14997
14998 Instruction *LastScheduledInst = BS->ScheduleEnd;
14999
15000 // Do the "real" scheduling.
15001 while (!ReadyInsts.empty()) {
15002 ScheduleData *Picked = *ReadyInsts.begin();
15003 ReadyInsts.erase(ReadyInsts.begin());
15004
15005 // Move the scheduled instruction(s) to their dedicated places, if not
15006 // there yet.
15007 for (ScheduleData *BundleMember = Picked; BundleMember;
15008 BundleMember = BundleMember->NextInBundle) {
15009 Instruction *PickedInst = BundleMember->Inst;
15010 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
15011 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
15012 LastScheduledInst = PickedInst;
15013 }
15014
15015 BS->schedule(Picked, ReadyInsts);
15016 }
15017
15018 // Check that we didn't break any of our invariants.
15019#ifdef EXPENSIVE_CHECKS
15020 BS->verify();
15021#endif
15022
15023#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
15024 // Check that all schedulable entities got scheduled
15025 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
15026 BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
15027 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
15028 assert(SD->IsScheduled && "must be scheduled at this point");
15029 }
15030 });
15031 }
15032#endif
15033
15034 // Avoid duplicate scheduling of the block.
15035 BS->ScheduleStart = nullptr;
15036}
15037
15038unsigned BoUpSLP::getVectorElementSize(Value *V) {
15039 // If V is a store, just return the width of the stored value (or value
15040 // truncated just before storing) without traversing the expression tree.
15041 // This is the common case.
15042 if (auto *Store = dyn_cast<StoreInst>(V))
15043 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
15044
15045 if (auto *IEI = dyn_cast<InsertElementInst>(V))
15046 return getVectorElementSize(IEI->getOperand(1));
15047
15048 auto E = InstrElementSize.find(V);
15049 if (E != InstrElementSize.end())
15050 return E->second;
15051
15052 // If V is not a store, we can traverse the expression tree to find loads
15053 // that feed it. The type of the loaded value may indicate a more suitable
15054 // width than V's type. We want to base the vector element size on the width
15055 // of memory operations where possible.
15056 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
15057 SmallPtrSet<Instruction *, 16> Visited;
15058 if (auto *I = dyn_cast<Instruction>(V)) {
15059 Worklist.emplace_back(I, I->getParent(), 0);
15060 Visited.insert(I);
15061 }
15062
15063 // Traverse the expression tree in bottom-up order looking for loads. If we
15064 // encounter an instruction we don't yet handle, we give up.
15065 auto Width = 0u;
15066 Value *FirstNonBool = nullptr;
15067 while (!Worklist.empty()) {
15068 auto [I, Parent, Level] = Worklist.pop_back_val();
15069
15070 // We should only be looking at scalar instructions here. If the current
15071 // instruction has a vector type, skip.
15072 auto *Ty = I->getType();
15073 if (isa<VectorType>(Ty))
15074 continue;
15075 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
15076 FirstNonBool = I;
15077 if (Level > RecursionMaxDepth)
15078 continue;
15079
15080 // If the current instruction is a load, extractelement or extractvalue,
15081 // update Width to reflect the width of the produced value.
15082 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
15083 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
15084
15085 // Otherwise, we need to visit the operands of the instruction. We only
15086 // handle the interesting cases from buildTree here. If an operand is an
15087 // instruction we haven't yet visited and from the same basic block as the
15088 // user or the use is a PHI node, we add it to the worklist.
15089 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
15090 BinaryOperator, UnaryOperator>(I)) {
15091 for (Use &U : I->operands()) {
15092 if (auto *J = dyn_cast<Instruction>(U.get()))
15093 if (Visited.insert(J).second &&
15094 (isa<PHINode>(I) || J->getParent() == Parent)) {
15095 Worklist.emplace_back(J, J->getParent(), Level + 1);
15096 continue;
15097 }
15098 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
15099 FirstNonBool = U.get();
15100 }
15101 } else {
15102 break;
15103 }
15104 }
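// For example, when V is an i32 add whose operand chain (within the same
// block) goes through a zext to an i8 load, the traversal reaches the load
// and Width becomes 8 instead of 32.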
15105
15106 // If we didn't encounter a memory access in the expression tree, or if we
15107 // gave up for some reason, just return the width of V. Otherwise, return the
15108 // maximum width we found.
15109 if (!Width) {
15110 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
15111 V = FirstNonBool;
15112 Width = DL->getTypeSizeInBits(V->getType());
15113 }
15114
15115 for (Instruction *I : Visited)
15116 InstrElementSize[I] = Width;
15117
15118 return Width;
15119}
15120
15121bool BoUpSLP::collectValuesToDemote(
15122 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
15123 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
15124 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
15125 bool IsTruncRoot) const {
15126 // We can always demote constants.
15127 if (all_of(E.Scalars, IsaPred<Constant>))
15128 return true;
15129
15130 unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
15131 if (OrigBitWidth == BitWidth) {
15132 MaxDepthLevel = 1;
15133 return true;
15134 }
15135
15136 // If the value is not a vectorized instruction in the expression, is not used
15137 // by an insertelement instruction, and is not used in multiple vector nodes,
15138 // it cannot be demoted.
15139 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
15140 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15141 });
15142 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
15143 if (MultiNodeScalars.contains(V))
15144 return false;
15145 // For a last shuffle of sext/zext with many uses we need to check the extra
15146 // bit for unsigned values, otherwise we may end up with incorrect casting
15147 // for reused scalars.
15148 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
15149 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
15150 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15151 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
15152 return true;
15153 }
15154 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15155 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
15156 if (IsSignedNode)
15157 ++BitWidth1;
15158 if (auto *I = dyn_cast<Instruction>(V)) {
15159 APInt Mask = DB->getDemandedBits(I);
15160 unsigned BitWidth2 =
15161 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
15162 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
15163 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
15164 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
15165 break;
15166 BitWidth2 *= 2;
15167 }
15168 BitWidth1 = std::min(BitWidth1, BitWidth2);
15169 }
15170 BitWidth = std::max(BitWidth, BitWidth1);
15171 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
15172 };
15173 using namespace std::placeholders;
15174 auto FinalAnalysis = [&]() {
15175 if (!IsProfitableToDemote)
15176 return false;
15177 bool Res = all_of(
15178 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
15179 // Demote gathers.
15180 if (Res && E.isGather()) {
15181 // Check possible extractelement instructions bases and final vector
15182 // length.
15183 SmallPtrSet<Value *, 4> UniqueBases;
15184 for (Value *V : E.Scalars) {
15185 auto *EE = dyn_cast<ExtractElementInst>(V);
15186 if (!EE)
15187 continue;
15188 UniqueBases.insert(EE->getVectorOperand());
15189 }
15190 const unsigned VF = E.Scalars.size();
15191 Type *OrigScalarTy = E.Scalars.front()->getType();
15192 if (UniqueBases.size() <= 2 ||
15193 TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
15194 TTI->getNumberOfParts(getWidenedType(
15195 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
15196 ToDemote.push_back(E.Idx);
15197 }
15198 return Res;
15199 };
15200 if (E.isGather() || !Visited.insert(&E).second ||
15201 any_of(E.Scalars, [&](Value *V) {
15202 return all_of(V->users(), [&](User *U) {
15203 return isa<InsertElementInst>(U) && !getTreeEntry(U);
15204 });
15205 }))
15206 return FinalAnalysis();
15207
15208 if (any_of(E.Scalars, [&](Value *V) {
15209 return !all_of(V->users(), [=](User *U) {
15210 return getTreeEntry(U) ||
15211 (UserIgnoreList && UserIgnoreList->contains(U)) ||
15212 (!isa<CmpInst>(U) && U->getType()->isSized() &&
15213 !U->getType()->isScalableTy() &&
15214 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
15215 }) && !IsPotentiallyTruncated(V, BitWidth);
15216 }))
15217 return false;
15218
15219 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
15220 bool &NeedToExit) {
15221 NeedToExit = false;
15222 unsigned InitLevel = MaxDepthLevel;
15223 for (const TreeEntry *Op : Operands) {
15224 unsigned Level = InitLevel;
15225 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
15226 ToDemote, Visited, Level, IsProfitableToDemote,
15227 IsTruncRoot)) {
15228 if (!IsProfitableToDemote)
15229 return false;
15230 NeedToExit = true;
15231 if (!FinalAnalysis())
15232 return false;
15233 continue;
15234 }
15235 MaxDepthLevel = std::max(MaxDepthLevel, Level);
15236 }
15237 return true;
15238 };
15239 auto AttemptCheckBitwidth =
15240 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
15241 // Try all bitwidth < OrigBitWidth.
15242 NeedToExit = false;
15243 unsigned BestFailBitwidth = 0;
15244 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
15245 if (Checker(BitWidth, OrigBitWidth))
15246 return true;
15247 if (BestFailBitwidth == 0 && FinalAnalysis())
15248 BestFailBitwidth = BitWidth;
15249 }
15250 if (BitWidth >= OrigBitWidth) {
15251 if (BestFailBitwidth == 0) {
15252 BitWidth = OrigBitWidth;
15253 return false;
15254 }
15255 MaxDepthLevel = 1;
15256 BitWidth = BestFailBitwidth;
15257 NeedToExit = true;
15258 return true;
15259 }
15260 return false;
15261 };
15262 auto TryProcessInstruction =
15263 [&](unsigned &BitWidth,
15264 ArrayRef<const TreeEntry *> Operands = {},
15265 function_ref<bool(unsigned, unsigned)> Checker = {}) {
15266 if (Operands.empty()) {
15267 if (!IsTruncRoot)
15268 MaxDepthLevel = 1;
15269 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15270 std::ref(BitWidth)));
15271 } else {
15272 // Several vectorized uses? Check if we can truncate it, otherwise -
15273 // exit.
15274 if (E.UserTreeIndices.size() > 1 &&
15275 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15276 std::ref(BitWidth))))
15277 return false;
15278 bool NeedToExit = false;
15279 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15280 return false;
15281 if (NeedToExit)
15282 return true;
15283 if (!ProcessOperands(Operands, NeedToExit))
15284 return false;
15285 if (NeedToExit)
15286 return true;
15287 }
15288
15289 ++MaxDepthLevel;
15290 // Record the entry that we can demote.
15291 ToDemote.push_back(E.Idx);
15292 return IsProfitableToDemote;
15293 };
15294 switch (E.getOpcode()) {
15295
15296 // We can always demote truncations and extensions. Since truncations can
15297 // seed additional demotion, we save the truncated value.
15298 case Instruction::Trunc:
15299 if (IsProfitableToDemoteRoot)
15300 IsProfitableToDemote = true;
15301 return TryProcessInstruction(BitWidth);
15302 case Instruction::ZExt:
15303 case Instruction::SExt:
15304 IsProfitableToDemote = true;
15305 return TryProcessInstruction(BitWidth);
15306
15307 // We can demote certain binary operations if we can demote both of their
15308 // operands.
15309 case Instruction::Add:
15310 case Instruction::Sub:
15311 case Instruction::Mul:
15312 case Instruction::And:
15313 case Instruction::Or:
15314 case Instruction::Xor: {
15315 return TryProcessInstruction(
15316 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
15317 }
15318 case Instruction::Shl: {
15319 // If we are truncating the result of this SHL, and if it's a shift of an
15320 // in-range amount, we can always perform a SHL in a smaller type.
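// For example, a trunc to i8 of (shl i32 %x, %s) can instead be done as an
// i8 shift when %s is known to be less than 8.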
15321 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
15322 return all_of(E.Scalars, [&](Value *V) {
15323 auto *I = cast<Instruction>(V);
15324 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15325 return AmtKnownBits.getMaxValue().ult(BitWidth);
15326 });
15327 };
15328 return TryProcessInstruction(
15329 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
15330 }
15331 case Instruction::LShr: {
15332 // If this is a truncate of a logical shr, we can truncate it to a smaller
15333 // lshr iff we know that the bits we would otherwise be shifting in are
15334 // already zeros.
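// For example, a trunc to i8 of (lshr i32 %x, %s) is safe to do in i8 when
// %s is known to be less than 8 and bits 8..31 of %x are known to be zero.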
15335 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15336 return all_of(E.Scalars, [&](Value *V) {
15337 auto *I = cast<Instruction>(V);
15338 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15339 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15340 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15341 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
15342 SimplifyQuery(*DL));
15343 });
15344 };
15345 return TryProcessInstruction(
15346 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15347 LShrChecker);
15348 }
15349 case Instruction::AShr: {
15350 // If this is a truncate of an arithmetic shr, we can truncate it to a
15351 // smaller ashr iff we know that all the bits between the sign bit of the
15352 // original type and the sign bit of the truncated type are sign copies.
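// For example, a trunc to i8 of (ashr i32 %x, %s) is safe to do in i8 when
// %s is known to be less than 8 and %x has more than 24 known sign bits.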
15353 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15354 return all_of(E.Scalars, [&](Value *V) {
15355 auto *I = cast<Instruction>(V);
15356 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15357 unsigned ShiftedBits = OrigBitWidth - BitWidth;
15358 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15359 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15360 nullptr, DT);
15361 });
15362 };
15363 return TryProcessInstruction(
15364 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15365 AShrChecker);
15366 }
15367 case Instruction::UDiv:
15368 case Instruction::URem: {
15369 // UDiv and URem can be truncated if all the truncated bits are zero.
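// For example, a trunc to i8 of (udiv i32 %x, %y) is safe to do in i8 when
// bits 8..31 of both %x and %y are known to be zero.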
15370 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15371 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15372 return all_of(E.Scalars, [&](Value *V) {
15373 auto *I = cast<Instruction>(V);
15374 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15375 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
15376 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15377 });
15378 };
15379 return TryProcessInstruction(
15380 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
15381 }
15382
15383 // We can demote selects if we can demote their true and false values.
15384 case Instruction::Select: {
15385 return TryProcessInstruction(
15386 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
15387 }
15388
15389 // We can demote phis if we can demote all their incoming operands. Note that
15390 // we don't need to worry about cycles since we ensure single use above.
15391 case Instruction::PHI: {
15392 const unsigned NumOps = E.getNumOperands();
15393 SmallVector<const TreeEntry *> Ops(NumOps);
15394 transform(seq<unsigned>(0, NumOps), Ops.begin(),
15395 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
15396
15397 return TryProcessInstruction(BitWidth, Ops);
15398 }
15399
15400 case Instruction::Call: {
15401 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
15402 if (!IC)
15403 break;
15404 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
15405 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
15406 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
15407 break;
15408 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
15409 function_ref<bool(unsigned, unsigned)> CallChecker;
15410 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15411 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15412 return all_of(E.Scalars, [&](Value *V) {
15413 auto *I = cast<Instruction>(V);
15414 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15415 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15416 return MaskedValueIsZero(I->getOperand(0), Mask,
15417 SimplifyQuery(*DL)) &&
15418 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15419 }
15420 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
15421 "Expected min/max intrinsics only.");
15422 unsigned SignBits = OrigBitWidth - BitWidth;
15423 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
15424 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15425 nullptr, DT);
15426 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
15427 nullptr, DT);
15428 return SignBits <= Op0SignBits &&
15429 ((SignBits != Op0SignBits &&
15430 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
15431 MaskedValueIsZero(I->getOperand(0), Mask,
15432 SimplifyQuery(*DL))) &&
15433 SignBits <= Op1SignBits &&
15434 ((SignBits != Op1SignBits &&
15435 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
15436 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
15437 });
15438 };
15439 if (ID != Intrinsic::abs) {
15440 Operands.push_back(getOperandEntry(&E, 1));
15441 CallChecker = CompChecker;
15442 }
15443 InstructionCost BestCost =
15444 std::numeric_limits<InstructionCost::CostType>::max();
15445 unsigned BestBitWidth = BitWidth;
15446 unsigned VF = E.Scalars.size();
15447 // Choose the best bitwidth based on cost estimations.
15448 auto Checker = [&](unsigned BitWidth, unsigned) {
15449 unsigned MinBW = PowerOf2Ceil(BitWidth);
15450 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
15451 auto VecCallCosts = getVectorCallCosts(
15452 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
15453 TTI, TLI, ArgTys);
15454 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
15455 if (Cost < BestCost) {
15456 BestCost = Cost;
15457 BestBitWidth = BitWidth;
15458 }
15459 return false;
15460 };
15461 [[maybe_unused]] bool NeedToExit;
15462 (void)AttemptCheckBitwidth(Checker, NeedToExit);
15463 BitWidth = BestBitWidth;
15464 return TryProcessInstruction(BitWidth, Operands, CallChecker);
15465 }
15466
15467 // Otherwise, conservatively give up.
15468 default:
15469 break;
15470 }
15471 MaxDepthLevel = 1;
15472 return FinalAnalysis();
15473}
15474
15475static RecurKind getRdxKind(Value *V);
15476
15477void BoUpSLP::computeMinimumValueSizes() {
15478 // We only attempt to truncate integer expressions.
15479 bool IsStoreOrInsertElt =
15480 VectorizableTree.front()->getOpcode() == Instruction::Store ||
15481 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15482 if ((IsStoreOrInsertElt || UserIgnoreList) &&
15483 ExtraBitWidthNodes.size() <= 1 &&
15484 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15485 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15486 return;
15487
15488 unsigned NodeIdx = 0;
15489 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
15490 NodeIdx = 1;
15491
15492 // Ensure the roots of the vectorizable tree don't form a cycle.
15493 if (VectorizableTree[NodeIdx]->isGather() ||
15494 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
15495 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15496 [NodeIdx](const EdgeInfo &EI) {
15497 return EI.UserTE->Idx >
15498 static_cast<int>(NodeIdx);
15499 })))
15500 return;
15501
15502 // If the first value node for a store/insertelement is a sext/zext/trunc,
15503 // skip it and resize to the final type.
15504 bool IsTruncRoot = false;
15505 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15506 SmallVector<unsigned> RootDemotes;
15507 if (NodeIdx != 0 &&
15508 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15509 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15510 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
15511 IsTruncRoot = true;
15512 RootDemotes.push_back(NodeIdx);
15513 IsProfitableToDemoteRoot = true;
15514 ++NodeIdx;
15515 }
15516
15517 // The reduction was analyzed already and found not profitable - exit.
15518 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
15519 return;
15520
15521 SmallVector<unsigned> ToDemote;
15522 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
15523 bool IsProfitableToDemoteRoot, unsigned Opcode,
15524 unsigned Limit, bool IsTruncRoot,
15525 bool IsSignedCmp) -> unsigned {
15526 ToDemote.clear();
15527 // If the root is a trunc and the next node is a gather/buildvector, keep
15528 // the trunc in scalars, which is free in most cases.
15529 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
15530 E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
15531 all_of(E.Scalars, [&](Value *V) {
15532 return V->hasOneUse() || isa<Constant>(V) ||
15533 (!V->hasNUsesOrMore(UsesLimit) &&
15534 none_of(V->users(), [&](User *U) {
15535 const TreeEntry *TE = getTreeEntry(U);
15536 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15537 if (TE == UserTE || !TE)
15538 return false;
15539 unsigned UserTESz = DL->getTypeSizeInBits(
15540 UserTE->Scalars.front()->getType());
15541 auto It = MinBWs.find(TE);
15542 if (It != MinBWs.end() && It->second.first > UserTESz)
15543 return true;
15544 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
15545 }));
15546 })) {
15547 ToDemote.push_back(E.Idx);
15548 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15549 auto It = MinBWs.find(UserTE);
15550 if (It != MinBWs.end())
15551 return It->second.first;
15552 unsigned MaxBitWidth =
15553 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
15554 MaxBitWidth = bit_ceil(MaxBitWidth);
15555 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15556 MaxBitWidth = 8;
15557 return MaxBitWidth;
15558 }
15559
15560 unsigned VF = E.getVectorFactor();
15561 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
15562 if (!TreeRootIT || !Opcode)
15563 return 0u;
15564
15565 if (any_of(E.Scalars,
15566 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
15567 return 0u;
15568
15569 unsigned NumParts = TTI->getNumberOfParts(getWidenedType(TreeRootIT, VF));
15570
15571 // The maximum bit width required to represent all the values that can be
15572 // demoted without loss of precision. It would be safe to truncate the roots
15573 // of the expression to this width.
15574 unsigned MaxBitWidth = 1u;
15575
15576 // True if the roots can be zero-extended back to their original type,
15577 // rather than sign-extended. We know that if the leading bits are not
15578 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
15579 // True.
15580 // Determine if the sign bit of all the roots is known to be zero. If not,
15581 // IsKnownPositive is set to False.
15582 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
15583 KnownBits Known = computeKnownBits(R, *DL);
15584 return Known.isNonNegative();
15585 });
15586
15587 // We first check if all the bits of the roots are demanded. If they're not,
15588 // we can truncate the roots to this narrower type.
15589 for (Value *Root : E.Scalars) {
15590 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
15591 TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
15592 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15593 // If we can't prove that the sign bit is zero, we must add one to the
15594 // maximum bit width to account for the unknown sign bit. This preserves
15595 // the existing sign bit so we can safely sign-extend the root back to the
15596 // original type. Otherwise, if we know the sign bit is zero, we will
15597 // zero-extend the root instead.
15598 //
15599 // FIXME: This is somewhat suboptimal, as there will be cases where adding
15600 // one to the maximum bit width will yield a larger-than-necessary
15601 // type. In general, we need to add an extra bit only if we can't
15602 // prove that the upper bit of the original type is equal to the
15603 // upper bit of the proposed smaller type. If these two bits are
15604 // the same (either zero or one) we know that sign-extending from
15605 // the smaller type will result in the same value. Here, since we
15606 // can't yet prove this, we are just making the proposed smaller
15607 // type larger to ensure correctness.
15608 if (!IsKnownPositive)
15609 ++BitWidth1;
15610
15611 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
15612 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15613 MaxBitWidth =
15614 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
15615 }
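// For example, for i32 roots where ComputeNumSignBits proves 25 sign bits,
// BitWidth1 is 32 - 25 = 7 (8 if the sign bit is not known to be zero), and
// if DemandedBits reports only the low 8 bits as demanded, BitWidth2 is 8;
// the root then contributes min(BitWidth1, BitWidth2) to MaxBitWidth.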
15616
15617 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15618 MaxBitWidth = 8;
15619
15620 // If the original type is large but the reduced type does not improve
15621 // register usage - ignore it.
15622 if (NumParts > 1 &&
15623 NumParts ==
15624 TTI->getNumberOfParts(getWidenedType(
15625 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
15626 return 0u;
15627
15628 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15629 Opcode == Instruction::SExt ||
15630 Opcode == Instruction::ZExt || NumParts > 1;
15631 // Conservatively determine if we can actually truncate the roots of the
15632 // expression. Collect the values that can be demoted in ToDemote and
15633 // additional roots that require investigating in Roots.
15634 DenseSet<const TreeEntry *> Visited;
15635 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15636 bool NeedToDemote = IsProfitableToDemote;
15637
15638 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
15639 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
15640 IsTruncRoot) ||
15641 (MaxDepthLevel <= Limit &&
15642 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15643 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15644 DL->getTypeSizeInBits(TreeRootIT) /
15645 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
15646 ->getOperand(0)
15647 ->getType()) >
15648 2)))))
15649 return 0u;
15650 // Round MaxBitWidth up to the next power-of-two.
15651 MaxBitWidth = bit_ceil(MaxBitWidth);
15652
15653 return MaxBitWidth;
15654 };
15655
15656 // If we can truncate the root, we must collect additional values that might
15657 // be demoted as a result. That is, those seeded by truncations we will
15658 // modify.
15659 // Add reduction ops sizes, if any.
15660 if (UserIgnoreList &&
15661 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
15662 for (Value *V : *UserIgnoreList) {
15663 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15664 auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
15665 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15666 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
15667 ++BitWidth1;
15668 unsigned BitWidth2 = BitWidth1;
15669 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
15670 auto Mask = DB->getDemandedBits(cast<Instruction>(V));
15671 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15672 }
15673 ReductionBitWidth =
15674 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15675 }
15676 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15677 ReductionBitWidth = 8;
15678
15679 ReductionBitWidth = bit_ceil(ReductionBitWidth);
15680 }
15681 bool IsTopRoot = NodeIdx == 0;
15682 while (NodeIdx < VectorizableTree.size() &&
15683 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15684 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15685 RootDemotes.push_back(NodeIdx);
15686 ++NodeIdx;
15687 IsTruncRoot = true;
15688 }
15689 bool IsSignedCmp = false;
15690 while (NodeIdx < VectorizableTree.size()) {
15691 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15692 unsigned Limit = 2;
15693 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15694 if (IsTopRoot &&
15695 ReductionBitWidth ==
15696 DL->getTypeSizeInBits(
15697 VectorizableTree.front()->Scalars.front()->getType()))
15698 Limit = 3;
15699 unsigned MaxBitWidth = ComputeMaxBitWidth(
15700 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
15701 Limit, IsTruncRoot, IsSignedCmp);
15702 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15703 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15704 ReductionBitWidth = bit_ceil(MaxBitWidth);
15705 else if (MaxBitWidth == 0)
15706 ReductionBitWidth = 0;
15707 }
15708
15709 for (unsigned Idx : RootDemotes) {
15710 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
15711 uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
15712 if (OrigBitWidth > MaxBitWidth) {
15713 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
15714 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
15715 }
15716 return false;
15717 }))
15718 ToDemote.push_back(Idx);
15719 }
15720 RootDemotes.clear();
15721 IsTopRoot = false;
15722 IsProfitableToDemoteRoot = true;
15723
15724 if (ExtraBitWidthNodes.empty()) {
15725 NodeIdx = VectorizableTree.size();
15726 } else {
15727 unsigned NewIdx = 0;
15728 do {
15729 NewIdx = *ExtraBitWidthNodes.begin();
15730 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
15731 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15732 NodeIdx = NewIdx;
15733 IsTruncRoot =
15734 NodeIdx < VectorizableTree.size() &&
15735 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15736 [](const EdgeInfo &EI) {
15737 return EI.EdgeIdx == 0 &&
15738 EI.UserTE->getOpcode() == Instruction::Trunc &&
15739 !EI.UserTE->isAltShuffle();
15740 });
15741 IsSignedCmp =
15742 NodeIdx < VectorizableTree.size() &&
15743 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15744 [&](const EdgeInfo &EI) {
15745 return EI.UserTE->getOpcode() == Instruction::ICmp &&
15746 any_of(EI.UserTE->Scalars, [&](Value *V) {
15747 auto *IC = dyn_cast<ICmpInst>(V);
15748 return IC &&
15749 (IC->isSigned() ||
15750 !isKnownNonNegative(IC->getOperand(0),
15751 SimplifyQuery(*DL)) ||
15752 !isKnownNonNegative(IC->getOperand(1),
15753 SimplifyQuery(*DL)));
15754 });
15755 });
15756 }
15757
15758 // If the maximum bit width we compute is less than the width of the roots'
15759 // type, we can proceed with the narrowing. Otherwise, do nothing.
15760 if (MaxBitWidth == 0 ||
15761 MaxBitWidth >=
15762 cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
15763 if (UserIgnoreList)
15764 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
15765 continue;
15766 }
15767
15768 // Finally, map the values we can demote to the maximum bit width we
15769 // computed.
15770 for (unsigned Idx : ToDemote) {
15771 TreeEntry *TE = VectorizableTree[Idx].get();
15772 if (MinBWs.contains(TE))
15773 continue;
15774 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
15775 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15776 });
15777 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
15778 }
15779 }
15780}
15781
15782PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15783 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
15784 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
15785 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
15786 auto *AA = &AM.getResult<AAManager>(F);
15787 auto *LI = &AM.getResult<LoopAnalysis>(F);
15788 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
15789 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
15790 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
15791 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
15792
15793 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
15794 if (!Changed)
15795 return PreservedAnalyses::all();
15796
15797 PreservedAnalyses PA;
15798 PA.preserveSet<CFGAnalyses>();
15799 return PA;
15800}
15801
15802bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15803 TargetTransformInfo *TTI_,
15804 TargetLibraryInfo *TLI_, AAResults *AA_,
15805 LoopInfo *LI_, DominatorTree *DT_,
15806 AssumptionCache *AC_, DemandedBits *DB_,
15807 OptimizationRemarkEmitter *ORE_) {
15808 if (!RunSLPVectorization)
15809 return false;
15810 SE = SE_;
15811 TTI = TTI_;
15812 TLI = TLI_;
15813 AA = AA_;
15814 LI = LI_;
15815 DT = DT_;
15816 AC = AC_;
15817 DB = DB_;
15818 DL = &F.getDataLayout();
15819
15820 Stores.clear();
15821 GEPs.clear();
15822 bool Changed = false;
15823
15824 // If the target claims to have no vector registers don't attempt
15825 // vectorization.
15826 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
15827 LLVM_DEBUG(
15828 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15829 return false;
15830 }
15831
15832 // Don't vectorize when the attribute NoImplicitFloat is used.
15833 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
15834 return false;
15835
15836 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15837
15838 // Use the bottom up slp vectorizer to construct chains that start with
15839 // store instructions.
15840 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15841
15842 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15843 // delete instructions.
15844
15845 // Update DFS numbers now so that we can use them for ordering.
15846 DT->updateDFSNumbers();
15847
15848 // Scan the blocks in the function in post order.
15849 for (auto *BB : post_order(&F.getEntryBlock())) {
15850 // Start new block - clear the list of reduction roots.
15851 R.clearReductionData();
15852 collectSeedInstructions(BB);
15853
15854 // Vectorize trees that end at stores.
15855 if (!Stores.empty()) {
15856 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15857 << " underlying objects.\n");
15858 Changed |= vectorizeStoreChains(R);
15859 }
15860
15861 // Vectorize trees that end at reductions.
15862 Changed |= vectorizeChainsInBlock(BB, R);
15863
15864 // Vectorize the index computations of getelementptr instructions. This
15865 // is primarily intended to catch gather-like idioms ending at
15866 // non-consecutive loads.
15867 if (!GEPs.empty()) {
15868 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15869 << " underlying objects.\n");
15870 Changed |= vectorizeGEPIndices(BB, R);
15871 }
15872 }
15873
15874 if (Changed) {
15875 R.optimizeGatherSequence();
15876 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15877 }
15878 return Changed;
15879}
15880
15881std::optional<bool>
15882SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15883 unsigned Idx, unsigned MinVF,
15884 unsigned &Size) {
15885 Size = 0;
15886 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15887 << "\n");
15888 const unsigned Sz = R.getVectorElementSize(Chain[0]);
15889 unsigned VF = Chain.size();
15890
15891 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
15892 // Check if vectorizing with a non-power-of-2 VF should be considered. At
15893 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15894 // all vector lanes are used.
15895 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15896 return false;
15897 }
15898
15899 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15900 << "\n");
15901
15902 SetVector<Value *> ValOps;
15903 for (Value *V : Chain)
15904 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
15905 // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
15906 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
15907 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
15908 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
15909 bool IsPowerOf2 =
15910 isPowerOf2_32(ValOps.size()) ||
15911 (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
15912 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15913 (!S.MainOp->isSafeToRemove() ||
15914 any_of(ValOps.getArrayRef(),
15915 [&](Value *V) {
15916 return !isa<ExtractElementInst>(V) &&
15917 (V->getNumUses() > Chain.size() ||
15918 any_of(V->users(), [&](User *U) {
15919 return !Stores.contains(U);
15920 }));
15921 }))) ||
15922 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
15923 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15924 return false;
15925 }
15926 }
15927 if (R.isLoadCombineCandidate(Chain))
15928 return true;
15929 R.buildTree(Chain);
15930 // Check if the tree is tiny and the store itself or its value is not vectorized.
15931 if (R.isTreeTinyAndNotFullyVectorizable()) {
15932 if (R.isGathered(Chain.front()) ||
15933 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
15934 return std::nullopt;
15935 Size = R.getTreeSize();
15936 return false;
15937 }
15938 R.reorderTopToBottom();
15939 R.reorderBottomToTop();
15940 R.buildExternalUses();
15941
15942 R.computeMinimumValueSizes();
15943 R.transformNodes();
15944
15945 Size = R.getTreeSize();
15946 if (S.getOpcode() == Instruction::Load)
15947 Size = 2; // cut off masked gather small trees
15948 InstructionCost Cost = R.getTreeCost();
15949
15950 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15951 if (Cost < -SLPCostThreshold) {
15952 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15953
15954 using namespace ore;
15955
15956 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
15957 cast<StoreInst>(Chain[0]))
15958 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15959 << " and with tree size "
15960 << NV("TreeSize", R.getTreeSize()));
15961
15962 R.vectorizeTree();
15963 return true;
15964 }
15965
15966 return false;
15967}
15968
15969/// Checks if the standard deviation of the tree sizes stays within roughly 11% (one ninth) of the mean size.
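/// For example, for sizes {4, 3, 4, 5} the mean is 4 and the (integer)
/// variance is (0 + 1 + 0 + 1) / 4 == 0, so the check passes; for sizes
/// {2, 10} the mean is 6 and the variance is 16, and 16 * 81 >= 36, so the
/// check fails. Entries equal to 1 are ignored.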
15970static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
15971 bool First) {
15972 unsigned Num = 0;
15973 uint64_t Sum = std::accumulate(
15974 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
15975 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15976 unsigned Size = First ? Val.first : Val.second;
15977 if (Size == 1)
15978 return V;
15979 ++Num;
15980 return V + Size;
15981 });
15982 if (Num == 0)
15983 return true;
15984 uint64_t Mean = Sum / Num;
15985 if (Mean == 0)
15986 return true;
15987 uint64_t Dev = std::accumulate(
15988 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
15989 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15990 unsigned P = First ? Val.first : Val.second;
15991 if (P == 1)
15992 return V;
15993 return V + (P - Mean) * (P - Mean);
15994 }) /
15995 Num;
15996 return Dev * 81 / (Mean * Mean) == 0;
15997}
15998
15999bool SLPVectorizerPass::vectorizeStores(
16000 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
16001 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
16002 &Visited) {
16003 // We may run into multiple chains that merge into a single chain. We mark the
16004 // stores that we vectorized so that we don't visit the same store twice.
16005 BoUpSLP::ValueSet VectorizedStores;
16006 bool Changed = false;
16007
16008 struct StoreDistCompare {
16009 bool operator()(const std::pair<unsigned, int> &Op1,
16010 const std::pair<unsigned, int> &Op2) const {
16011 return Op1.second < Op2.second;
16012 }
16013 };
16014 // A set of pairs (index of store in Stores array ref, Distance of the store
16015 // address relative to base store address in units).
16016 using StoreIndexToDistSet =
16017 std::set<std::pair<unsigned, int>, StoreDistCompare>;
16018 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
16019 int PrevDist = -1;
16020 BoUpSLP::ValueList Operands;
16021 // Collect the chain into a list.
16022 for (auto [Idx, Data] : enumerate(Set)) {
16023 if (Operands.empty() || Data.second - PrevDist == 1) {
16024 Operands.push_back(Stores[Data.first]);
16025 PrevDist = Data.second;
16026 if (Idx != Set.size() - 1)
16027 continue;
16028 }
16029 auto E = make_scope_exit([&, &DataVar = Data]() {
16030 Operands.clear();
16031 Operands.push_back(Stores[DataVar.first]);
16032 PrevDist = DataVar.second;
16033 });
16034
16035 if (Operands.size() <= 1 ||
16036 !Visited
16037 .insert({Operands.front(),
16038 cast<StoreInst>(Operands.front())->getValueOperand(),
16039 Operands.back(),
16040 cast<StoreInst>(Operands.back())->getValueOperand(),
16041 Operands.size()})
16042 .second)
16043 continue;
16044
16045 unsigned MaxVecRegSize = R.getMaxVecRegSize();
16046 unsigned EltSize = R.getVectorElementSize(Operands[0]);
16047 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
16048
16049 unsigned MaxVF =
16050 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
16051 unsigned MaxRegVF = MaxVF;
16052 auto *Store = cast<StoreInst>(Operands[0]);
16053 Type *StoreTy = Store->getValueOperand()->getType();
16054 Type *ValueTy = StoreTy;
16055 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
16056 ValueTy = Trunc->getSrcTy();
16057 if (ValueTy == StoreTy &&
16058 R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
16059 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
16060 unsigned MinVF = std::max<unsigned>(
16061 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
16062 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
16063 ValueTy)));
16064
16065 if (MaxVF < MinVF) {
16066 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
16067 << ") < "
16068 << "MinVF (" << MinVF << ")\n");
16069 continue;
16070 }
16071
16072 unsigned NonPowerOf2VF = 0;
16073 if (VectorizeNonPowerOf2) {
16074 // First try vectorizing with a non-power-of-2 VF. At the moment, only
16075 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
16076 // lanes are used.
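// For example, 7 or 15 stores qualify (7 + 1 and 15 + 1 are powers of 2,
// subject to MaxRegVF), while 6 stores do not.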
16077 unsigned CandVF = Operands.size();
16078 if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
16079 NonPowerOf2VF = CandVF;
16080 }
16081
16082 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
16083 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
16084 unsigned Size = MinVF;
16085 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
16086 VF = Size > MaxVF ? NonPowerOf2VF : Size;
16087 Size *= 2;
16088 });
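// For example, with MinVF = 2 and MaxVF = 8 and no non-power-of-2 candidate,
// CandidateVFs becomes {8, 4, 2}; a non-power-of-2 candidate, when present,
// fills an extra leading slot.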
16089 unsigned End = Operands.size();
16090 unsigned Repeat = 0;
16091 constexpr unsigned MaxAttempts = 4;
16093 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
16094 P.first = P.second = 1;
16095 });
16097 auto IsNotVectorized = [](bool First,
16098 const std::pair<unsigned, unsigned> &P) {
16099 return First ? P.first > 0 : P.second > 0;
16100 };
16101 auto IsVectorized = [](bool First,
16102 const std::pair<unsigned, unsigned> &P) {
16103 return First ? P.first == 0 : P.second == 0;
16104 };
16105 auto VFIsProfitable = [](bool First, unsigned Size,
16106 const std::pair<unsigned, unsigned> &P) {
16107 return First ? Size >= P.first : Size >= P.second;
16108 };
16109 auto FirstSizeSame = [](unsigned Size,
16110 const std::pair<unsigned, unsigned> &P) {
16111 return Size == P.first;
16112 };
16113 while (true) {
16114 ++Repeat;
16115 bool RepeatChanged = false;
16116 bool AnyProfitableGraph = false;
16117 for (unsigned Size : CandidateVFs) {
16118 AnyProfitableGraph = false;
16119 unsigned StartIdx = std::distance(
16120 RangeSizes.begin(),
16121 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
16122 std::placeholders::_1)));
16123 while (StartIdx < End) {
16124 unsigned EndIdx =
16125 std::distance(RangeSizes.begin(),
16126 find_if(RangeSizes.drop_front(StartIdx),
16127 std::bind(IsVectorized, Size >= MaxRegVF,
16128 std::placeholders::_1)));
16129 unsigned Sz = EndIdx >= End ? End : EndIdx;
16130 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
16131 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
16132 Size >= MaxRegVF)) {
16133 ++Cnt;
16134 continue;
16135 }
16136 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
16137 assert(all_of(Slice,
16138 [&](Value *V) {
16139 return cast<StoreInst>(V)
16140 ->getValueOperand()
16141 ->getType() ==
16142 cast<StoreInst>(Slice.front())
16143 ->getValueOperand()
16144 ->getType();
16145 }) &&
16146 "Expected all operands of same type.");
16147 if (!NonSchedulable.empty()) {
16148 auto [NonSchedSizeMax, NonSchedSizeMin] =
16149 NonSchedulable.lookup(Slice.front());
16150 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
16151 Cnt += NonSchedSizeMax;
16152 continue;
16153 }
16154 }
16155 unsigned TreeSize;
16156 std::optional<bool> Res =
16157 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
16158 if (!Res) {
16159 NonSchedulable
16160 .try_emplace(Slice.front(), std::make_pair(Size, Size))
16161 .first->getSecond()
16162 .second = Size;
16163 } else if (*Res) {
16164 // Mark the vectorized stores so that we don't vectorize them
16165 // again.
16166 VectorizedStores.insert(Slice.begin(), Slice.end());
16169 AnyProfitableGraph = RepeatChanged = Changed = true;
16170 // If we vectorized initial block, no need to try to vectorize
16171 // it again.
16172 for_each(RangeSizes.slice(Cnt, Size),
16173 [](std::pair<unsigned, unsigned> &P) {
16174 P.first = P.second = 0;
16175 });
16176 if (Cnt < StartIdx + MinVF) {
16177 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
16178 [](std::pair<unsigned, unsigned> &P) {
16179 P.first = P.second = 0;
16180 });
16181 StartIdx = Cnt + Size;
16182 }
16183 if (Cnt > Sz - Size - MinVF) {
16184 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
16185 [](std::pair<unsigned, unsigned> &P) {
16186 P.first = P.second = 0;
16187 });
16188 if (Sz == End)
16189 End = Cnt;
16190 Sz = Cnt;
16191 }
16192 Cnt += Size;
16193 continue;
16194 }
16195 if (Size > 2 && Res &&
16196 !all_of(RangeSizes.slice(Cnt, Size),
16197 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
16198 std::placeholders::_1))) {
16199 Cnt += Size;
16200 continue;
16201 }
16202 // For very big VFs, check that we are not rebuilding the same
16203 // trees, just with a larger number of elements.
16204 if (Size > MaxRegVF && TreeSize > 1 &&
16205 all_of(RangeSizes.slice(Cnt, Size),
16206 std::bind(FirstSizeSame, TreeSize,
16207 std::placeholders::_1))) {
16208 Cnt += Size;
16209 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
16210 ++Cnt;
16211 continue;
16212 }
16213 if (TreeSize > 1)
16214 for_each(RangeSizes.slice(Cnt, Size),
16215 [&](std::pair<unsigned, unsigned> &P) {
16216 if (Size >= MaxRegVF)
16217 P.second = std::max(P.second, TreeSize);
16218 else
16219 P.first = std::max(P.first, TreeSize);
16220 });
16221 ++Cnt;
16222 AnyProfitableGraph = true;
16223 }
16224 if (StartIdx >= End)
16225 break;
16226 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
16227 AnyProfitableGraph = true;
16228 StartIdx = std::distance(
16229 RangeSizes.begin(),
16230 find_if(RangeSizes.drop_front(Sz),
16231 std::bind(IsNotVectorized, Size >= MaxRegVF,
16232 std::placeholders::_1)));
16233 }
16234 if (!AnyProfitableGraph && Size >= MaxRegVF)
16235 break;
16236 }
16237 // All values vectorized - exit.
16238 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
16239 return P.first == 0 && P.second == 0;
16240 }))
16241 break;
16242 // Check if we have exhausted all attempts or further attempts are not needed.
16243 if (Repeat >= MaxAttempts ||
16244 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
16245 break;
16246 constexpr unsigned StoresLimit = 64;
16247 const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
16248 Operands.size(),
16249 static_cast<unsigned>(
16250 End -
16251 std::distance(
16252 RangeSizes.begin(),
16253 find_if(RangeSizes, std::bind(IsNotVectorized, true,
16254 std::placeholders::_1))) +
16255 1)));
16256 unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
16257 if (VF > MaxTotalNum || VF >= StoresLimit)
16258 break;
16259 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
16260 if (P.first != 0)
16261 P.first = std::max(P.second, P.first);
16262 });
16263 // Make a last attempt to vectorize the maximum number of elements, if all
16264 // previous attempts were unsuccessful because of cost issues.
16265 CandidateVFs.clear();
16266 CandidateVFs.push_back(VF);
16267 }
16268 }
16269 };
16270
16271 // Each entry stores a pair (first: index of the store in the Stores array
16272 // ref whose address is taken as the base; second: sorted set of pairs
16273 // {index, dist}, which are indices of stores in the set and their store
16274 // location distances relative to the base address).
16275
16276 // Need to store the index of the very first store separately, since the set
16277 // may be reordered after the insertion and the first store may be moved. This
16278 // container allows us to reduce the number of calls to getPointersDiff().
16279 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
16280 // Inserts the specified store SI with the given index Idx to the set of the
16281 // stores. If a store with the same distance has already been found - stop
16282 // insertion and try to vectorize the stores found so far. If some stores from
16283 // this sequence were not vectorized - try to vectorize them together with the
16284 // new store later. But this logic is applied only to the stores that come
16285 // before the previous store with the same distance.
16286 // Example:
16287 // 1. store x, %p
16288 // 2. store y, %p+1
16289 // 3. store z, %p+2
16290 // 4. store a, %p
16291 // 5. store b, %p+3
16292 // - Scan this from the last to first store. The very first bunch of stores is
16293 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
16294 // vector).
16295 // - The next store in the list - #1 - has the same distance from store #5 as
16296 // the store #4.
16297 // - Try to vectorize sequence of stores 4,2,3,5.
16298 // - If all these stores are vectorized - just drop them.
16299 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
16300 // - Start new stores sequence.
16301 // The new bunch of stores is {1, {1, 0}}.
16302 // - Add the stores from previous sequence, that were not vectorized.
16303 // Here we consider the stores in the reversed order, rather than the order
16304 // they are used in the IR (Stores are reversed already, see vectorizeStoreChains()).
16305 // Store #3 can be added -> comes after store #4 with the same distance as
16306 // store #1.
16307 // Store #5 cannot be added - comes before store #4.
16308 // This logic improves compile time: we assume that the stores after the
16309 // previous store with the same distance most likely have memory dependencies,
16310 // so there is no need to waste compile time trying to vectorize them.
16311 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
16312 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
16313 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16314 std::optional<int> Diff = getPointersDiff(
16315 Stores[Set.first]->getValueOperand()->getType(),
16316 Stores[Set.first]->getPointerOperand(),
16317 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
16318 /*StrictCheck=*/true);
16319 if (!Diff)
16320 continue;
16321 auto It = Set.second.find(std::make_pair(Idx, *Diff));
16322 if (It == Set.second.end()) {
16323 Set.second.emplace(Idx, *Diff);
16324 return;
16325 }
16326 // Try to vectorize the first found set to avoid duplicate analysis.
16327 TryToVectorize(Set.second);
16328 StoreIndexToDistSet PrevSet;
16329 PrevSet.swap(Set.second);
16330 Set.first = Idx;
16331 Set.second.emplace(Idx, 0);
16332 // Insert stores that followed previous match to try to vectorize them
16333 // with this store.
16334 unsigned StartIdx = It->first + 1;
16335 SmallBitVector UsedStores(Idx - StartIdx);
16336 // Distances to previously found dup store (or this store, since they
16337 // store to the same addresses).
16338 SmallVector<int> Dists(Idx - StartIdx, 0);
16339 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
16340 // Do not try to vectorize sequences, we already tried.
16341 if (Pair.first <= It->first ||
16342 VectorizedStores.contains(Stores[Pair.first]))
16343 break;
16344 unsigned BI = Pair.first - StartIdx;
16345 UsedStores.set(BI);
16346 Dists[BI] = Pair.second - It->second;
16347 }
16348 for (unsigned I = StartIdx; I < Idx; ++I) {
16349 unsigned BI = I - StartIdx;
16350 if (UsedStores.test(BI))
16351 Set.second.emplace(I, Dists[BI]);
16352 }
16353 return;
16354 }
16355 auto &Res = SortedStores.emplace_back();
16356 Res.first = Idx;
16357 Res.second.emplace(Idx, 0);
16358 };
16359 Type *PrevValTy = nullptr;
16360 for (auto [I, SI] : enumerate(Stores)) {
16361 if (R.isDeleted(SI))
16362 continue;
16363 if (!PrevValTy)
16364 PrevValTy = SI->getValueOperand()->getType();
16365 // Check that we do not try to vectorize stores of different types.
16366 if (PrevValTy != SI->getValueOperand()->getType()) {
16367 for (auto &Set : SortedStores)
16368 TryToVectorize(Set.second);
16369 SortedStores.clear();
16370 PrevValTy = SI->getValueOperand()->getType();
16371 }
16372 FillStoresSet(I, SI);
16373 }
16374
16375 // Final vectorization attempt.
16376 for (auto &Set : SortedStores)
16377 TryToVectorize(Set.second);
16378
16379 return Changed;
16380}
16381
16382void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
16383 // Initialize the collections. We will make a single pass over the block.
16384 Stores.clear();
16385 GEPs.clear();
16386
16387 // Visit the store and getelementptr instructions in BB and organize them in
16388 // Stores and GEPs according to the underlying objects of their pointer
16389 // operands.
16390 for (Instruction &I : *BB) {
16391 // Ignore store instructions that are volatile or have a pointer operand
16392 // that doesn't point to a scalar type.
16393 if (auto *SI = dyn_cast<StoreInst>(&I)) {
16394 if (!SI->isSimple())
16395 continue;
16396 if (!isValidElementType(SI->getValueOperand()->getType()))
16397 continue;
16398 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
16399 }
16400
16401 // Ignore getelementptr instructions that have more than one index, a
16402 // constant index, or a pointer operand that doesn't point to a scalar
16403 // type.
16404 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
16405 if (GEP->getNumIndices() != 1)
16406 continue;
16407 Value *Idx = GEP->idx_begin()->get();
16408 if (isa<Constant>(Idx))
16409 continue;
16410 if (!isValidElementType(Idx->getType()))
16411 continue;
16412 if (GEP->getType()->isVectorTy())
16413 continue;
16414 GEPs[GEP->getPointerOperand()].push_back(GEP);
16415 }
16416 }
16417}
16418
16419bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
16420 bool MaxVFOnly) {
16421 if (VL.size() < 2)
16422 return false;
16423
16424 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
16425 << VL.size() << ".\n");
16426
16427 // Check that all of the parts are instructions of the same type,
16428 // we permit an alternate opcode via InstructionsState.
16429 InstructionsState S = getSameOpcode(VL, *TLI);
16430 if (!S.getOpcode())
16431 return false;
16432
16433 Instruction *I0 = cast<Instruction>(S.OpValue);
16434 // Make sure invalid types (including vector type) are rejected before
16435 // determining vectorization factor for scalar instructions.
16436 for (Value *V : VL) {
16437 Type *Ty = V->getType();
16438 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
16439 // NOTE: the following will give the user an internal LLVM type name, which
16440 // may not be useful.
16441 R.getORE()->emit([&]() {
16442 std::string TypeStr;
16443 llvm::raw_string_ostream rso(TypeStr);
16444 Ty->print(rso);
16445 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
16446 << "Cannot SLP vectorize list: type "
16447 << TypeStr + " is unsupported by vectorizer";
16448 });
16449 return false;
16450 }
16451 }
16452
16453 unsigned Sz = R.getVectorElementSize(I0);
16454 unsigned MinVF = R.getMinVF(Sz);
16455 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
16456 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
16457 if (MaxVF < 2) {
16458 R.getORE()->emit([&]() {
16459 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
16460 << "Cannot SLP vectorize list: vectorization factor "
16461 << "less than 2 is not supported";
16462 });
16463 return false;
16464 }
16465
16466 bool Changed = false;
16467 bool CandidateFound = false;
16468 InstructionCost MinCost = SLPCostThreshold.getValue();
16469 Type *ScalarTy = VL[0]->getType();
16470 if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
16471 ScalarTy = IE->getOperand(1)->getType();
16472
16473 unsigned NextInst = 0, MaxInst = VL.size();
16474 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
16475 // No actual vectorization should happen if the number of parts is the
16476 // same as the provided vectorization factor (i.e., the scalar type is
16477 // used for the vector code during codegen).
16478 auto *VecTy = getWidenedType(ScalarTy, VF);
16479 if (TTI->getNumberOfParts(VecTy) == VF)
16480 continue;
16481 for (unsigned I = NextInst; I < MaxInst; ++I) {
16482 unsigned ActualVF = std::min(MaxInst - I, VF);
16483
16484 if (!isPowerOf2_32(ActualVF))
16485 continue;
16486
16487 if (MaxVFOnly && ActualVF < MaxVF)
16488 break;
16489 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16490 break;
16491
16492 ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
16493 // Check that a previous iteration of this loop did not delete the Value.
16494 if (llvm::any_of(Ops, [&R](Value *V) {
16495 auto *I = dyn_cast<Instruction>(V);
16496 return I && R.isDeleted(I);
16497 }))
16498 continue;
16499
16500 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
16501 << "\n");
16502
16503 R.buildTree(Ops);
16504 if (R.isTreeTinyAndNotFullyVectorizable())
16505 continue;
16506 R.reorderTopToBottom();
16507 R.reorderBottomToTop(
16508 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
16509 !R.doesRootHaveInTreeUses());
16510 R.buildExternalUses();
16511
16512 R.computeMinimumValueSizes();
16513 R.transformNodes();
16514 InstructionCost Cost = R.getTreeCost();
16515 CandidateFound = true;
16516 MinCost = std::min(MinCost, Cost);
16517
16518 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16519 << " for VF=" << ActualVF << "\n");
16520 if (Cost < -SLPCostThreshold) {
16521 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
16522 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
16523 cast<Instruction>(Ops[0]))
16524 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
16525 << " and with tree size "
16526 << ore::NV("TreeSize", R.getTreeSize()));
16527
16528 R.vectorizeTree();
16529 // Move to the next bundle.
16530 I += VF - 1;
16531 NextInst = I + 1;
16532 Changed = true;
16533 }
16534 }
16535 }
16536
16537 if (!Changed && CandidateFound) {
16538 R.getORE()->emit([&]() {
16539 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
16540 << "List vectorization was possible but not beneficial with cost "
16541 << ore::NV("Cost", MinCost) << " >= "
16542 << ore::NV("Threshold", -SLPCostThreshold);
16543 });
16544 } else if (!Changed) {
16545 R.getORE()->emit([&]() {
16546 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
16547 << "Cannot SLP vectorize list: vectorization was impossible"
16548 << " with available vectorization factors";
16549 });
16550 }
16551 return Changed;
16552}
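// For illustration (hypothetical numbers): with VL.size() == 6, MinVF == 2 and
// MaxVF == 4, the loop above roughly tries power-of-two bundles of 4 starting
// at each remaining position and then retries the leftovers with bundles of 2;
// a bundle is only emitted when its tree cost beats the SLP cost threshold.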
16553
16554bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
16555 if (!I)
16556 return false;
16557
16558 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
16559 return false;
16560
16561 Value *P = I->getParent();
16562
16563 // Vectorize in current basic block only.
16564 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
16565 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
16566 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
16567 return false;
16568
16569 // First, collect all possible candidate pairs.
16570 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
16571 Candidates.emplace_back(Op0, Op1);
16572
16573 auto *A = dyn_cast<BinaryOperator>(Op0);
16574 auto *B = dyn_cast<BinaryOperator>(Op1);
16575 // Try to skip B.
16576 if (A && B && B->hasOneUse()) {
16577 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
16578 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
16579 if (B0 && B0->getParent() == P)
16580 Candidates.emplace_back(A, B0);
16581 if (B1 && B1->getParent() == P)
16582 Candidates.emplace_back(A, B1);
16583 }
16584 // Try to skip A.
16585 if (B && A && A->hasOneUse()) {
16586 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
16587 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
16588 if (A0 && A0->getParent() == P)
16589 Candidates.emplace_back(A0, B);
16590 if (A1 && A1->getParent() == P)
16591 Candidates.emplace_back(A1, B);
16592 }
16593
16594 if (Candidates.size() == 1)
16595 return tryToVectorizeList({Op0, Op1}, R);
16596
16597 // We have multiple options. Try to pick the single best.
16598 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
16599 if (!BestCandidate)
16600 return false;
16601 return tryToVectorizeList(
16602 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
16603}
16604
16605namespace {
16606
16607/// Model horizontal reductions.
16608///
16609/// A horizontal reduction is a tree of reduction instructions that has values
16610/// that can be put into a vector as its leaves. For example:
16611///
16612/// mul mul mul mul
16613/// \ / \ /
16614/// + +
16615/// \ /
16616/// +
16617/// This tree has "mul" as its leaf values and "+" as its reduction
16618/// instructions. A reduction can feed into a store or a binary operation
16619/// feeding a phi.
16620/// ...
16621/// \ /
16622/// +
16623/// |
16624/// phi +=
16625///
16626/// Or:
16627/// ...
16628/// \ /
16629/// +
16630/// |
16631/// *p =
16632///
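/// For illustration, a hypothetical scalar pattern that produces such a tree
/// is:
///   s += a[0] * b[0];
///   s += a[1] * b[1];
///   s += a[2] * b[2];
///   s += a[3] * b[3];
/// Here the four multiplies are the vectorizable leaves and the chain of adds
/// is the reduction, which is ultimately emitted as a single vector reduce
/// operation feeding the phi or store.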
16633class HorizontalReduction {
16634 using ReductionOpsType = SmallVector<Value *, 16>;
16635 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
16636 ReductionOpsListType ReductionOps;
16637 /// List of possibly reduced values.
16638 SmallVector<SmallVector<Value *>> ReducedVals;
16639 /// Maps reduced value to the corresponding reduction operation.
16640 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
16641 // Use map vector to make stable output.
16642 MapVector<Instruction *, Value *> ExtraArgs;
16643 WeakTrackingVH ReductionRoot;
16644 /// The type of reduction operation.
16645 RecurKind RdxKind;
16646 /// Checks if the optimization of original scalar identity operations on
16647 /// matched horizontal reductions is enabled and allowed.
16648 bool IsSupportedHorRdxIdentityOp = false;
16649
16650 static bool isCmpSelMinMax(Instruction *I) {
16651 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
16652 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
16653 }
16654
16655 // And/or are potentially poison-safe logical patterns like:
16656 // select x, y, false
16657 // select x, true, y
16658 static bool isBoolLogicOp(Instruction *I) {
16659 return isa<SelectInst>(I) &&
16660 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
16661 }
16662
16663 /// Checks if instruction is associative and can be vectorized.
16664 static bool isVectorizable(RecurKind Kind, Instruction *I) {
16665 if (Kind == RecurKind::None)
16666 return false;
16667
16668 // Integer ops that map to select instructions or intrinsics are fine.
16669 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
16670 isBoolLogicOp(I))
16671 return true;
16672
16673 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16674 // FP min/max are associative except for NaN and -0.0. We do not
16675 // have to rule out -0.0 here because the intrinsic semantics do not
16676 // specify a fixed result for it.
16677 return I->getFastMathFlags().noNaNs();
16678 }
16679
16680 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16681 return true;
16682
16683 return I->isAssociative();
16684 }
16685
16686 static Value *getRdxOperand(Instruction *I, unsigned Index) {
16687 // Poison-safe 'or' takes the form: select X, true, Y
16688 // To make that work with the normal operand processing, we skip the
16689 // true value operand.
16690 // TODO: Change the code and data structures to handle this without a hack.
16691 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
16692 return I->getOperand(2);
16693 return I->getOperand(Index);
16694 }
16695
16696 /// Creates reduction operation with the current opcode.
16697 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
16698 Value *RHS, const Twine &Name, bool UseSelect) {
16699 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
16700 switch (Kind) {
16701 case RecurKind::Or:
16702 if (UseSelect &&
16703 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16704 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
16705 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16706 Name);
16707 case RecurKind::And:
16708 if (UseSelect &&
16709 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16710 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
16711 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16712 Name);
16713 case RecurKind::Add:
16714 case RecurKind::Mul:
16715 case RecurKind::Xor:
16716 case RecurKind::FAdd:
16717 case RecurKind::FMul:
16718 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16719 Name);
16720 case RecurKind::FMax:
16721 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
16722 case RecurKind::FMin:
16723 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
16724 case RecurKind::FMaximum:
16725 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
16726 case RecurKind::FMinimum:
16727 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
16728 case RecurKind::SMax:
16729 if (UseSelect) {
16730 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
16731 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16732 }
16733 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
16734 case RecurKind::SMin:
16735 if (UseSelect) {
16736 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
16737 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16738 }
16739 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
16740 case RecurKind::UMax:
16741 if (UseSelect) {
16742 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
16743 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16744 }
16745 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
16746 case RecurKind::UMin:
16747 if (UseSelect) {
16748 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
16749 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16750 }
16751 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
16752 default:
16753 llvm_unreachable("Unknown reduction operation.");
16754 }
16755 }
16756
16757 /// Creates reduction operation with the current opcode with the IR flags
16758 /// from \p ReductionOps, dropping nuw/nsw flags.
16759 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
16760 Value *RHS, const Twine &Name,
16761 const ReductionOpsListType &ReductionOps) {
16762 bool UseSelect = ReductionOps.size() == 2 ||
16763 // Logical or/and.
16764 (ReductionOps.size() == 1 &&
16765 any_of(ReductionOps.front(), IsaPred<SelectInst>));
16766 assert((!UseSelect || ReductionOps.size() != 2 ||
16767 isa<SelectInst>(ReductionOps[1][0])) &&
16768 "Expected cmp + select pairs for reduction");
16769 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
16770 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
16771 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
16772 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
16773 /*IncludeWrapFlags=*/false);
16774 propagateIRFlags(Op, ReductionOps[1], nullptr,
16775 /*IncludeWrapFlags=*/false);
16776 return Op;
16777 }
16778 }
16779 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
16780 return Op;
16781 }
16782
16783public:
16784 static RecurKind getRdxKind(Value *V) {
16785 auto *I = dyn_cast<Instruction>(V);
16786 if (!I)
16787 return RecurKind::None;
16788 if (match(I, m_Add(m_Value(), m_Value())))
16789 return RecurKind::Add;
16790 if (match(I, m_Mul(m_Value(), m_Value())))
16791 return RecurKind::Mul;
16792 if (match(I, m_And(m_Value(), m_Value())) ||
16793 match(I, m_LogicalAnd(m_Value(), m_Value())))
16794 return RecurKind::And;
16795 if (match(I, m_Or(m_Value(), m_Value())) ||
16796 match(I, m_LogicalOr(m_Value(), m_Value())))
16797 return RecurKind::Or;
16798 if (match(I, m_Xor(m_Value(), m_Value())))
16799 return RecurKind::Xor;
16800 if (match(I, m_FAdd(m_Value(), m_Value())))
16801 return RecurKind::FAdd;
16802 if (match(I, m_FMul(m_Value(), m_Value())))
16803 return RecurKind::FMul;
16804
16805 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
16806 return RecurKind::FMax;
16807 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
16808 return RecurKind::FMin;
16809
16810 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
16811 return RecurKind::FMaximum;
16812 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
16813 return RecurKind::FMinimum;
16814 // This matches either cmp+select or intrinsics. SLP is expected to handle
16815 // either form.
16816 // TODO: If we are canonicalizing to intrinsics, we can remove several
16817 // special-case paths that deal with selects.
16818 if (match(I, m_SMax(m_Value(), m_Value())))
16819 return RecurKind::SMax;
16820 if (match(I, m_SMin(m_Value(), m_Value())))
16821 return RecurKind::SMin;
16822 if (match(I, m_UMax(m_Value(), m_Value())))
16823 return RecurKind::UMax;
16824 if (match(I, m_UMin(m_Value(), m_Value())))
16825 return RecurKind::UMin;
16826
16827 if (auto *Select = dyn_cast<SelectInst>(I)) {
16828 // Try harder: look for min/max pattern based on instructions producing
16829 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
16830 // During the intermediate stages of SLP, it's very common to have
16831 // a pattern like this (since optimizeGatherSequence is run only once
16832 // at the end):
16833 // %1 = extractelement <2 x i32> %a, i32 0
16834 // %2 = extractelement <2 x i32> %a, i32 1
16835 // %cond = icmp sgt i32 %1, %2
16836 // %3 = extractelement <2 x i32> %a, i32 0
16837 // %4 = extractelement <2 x i32> %a, i32 1
16838 // %select = select i1 %cond, i32 %3, i32 %4
16839 CmpInst::Predicate Pred;
16840 Instruction *L1;
16841 Instruction *L2;
16842
16843 Value *LHS = Select->getTrueValue();
16844 Value *RHS = Select->getFalseValue();
16845 Value *Cond = Select->getCondition();
16846
16847 // TODO: Support inverse predicates.
16848 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
16849 if (!isa<ExtractElementInst>(RHS) ||
16850 !L2->isIdenticalTo(cast<Instruction>(RHS)))
16851 return RecurKind::None;
16852 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
16853 if (!isa<ExtractElementInst>(LHS) ||
16854 !L1->isIdenticalTo(cast<Instruction>(LHS)))
16855 return RecurKind::None;
16856 } else {
16857 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
16858 return RecurKind::None;
16859 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
16860 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
16861 !L2->isIdenticalTo(cast<Instruction>(RHS)))
16862 return RecurKind::None;
16863 }
16864
16865 switch (Pred) {
16866 default:
16867 return RecurKind::None;
16868 case CmpInst::ICMP_SGT:
16869 case CmpInst::ICMP_SGE:
16870 return RecurKind::SMax;
16871 case CmpInst::ICMP_SLT:
16872 case CmpInst::ICMP_SLE:
16873 return RecurKind::SMin;
16874 case CmpInst::ICMP_UGT:
16875 case CmpInst::ICMP_UGE:
16876 return RecurKind::UMax;
16877 case CmpInst::ICMP_ULT:
16878 case CmpInst::ICMP_ULE:
16879 return RecurKind::UMin;
16880 }
16881 }
16882 return RecurKind::None;
16883 }
16884
16885 /// Get the index of the first operand.
16886 static unsigned getFirstOperandIndex(Instruction *I) {
16887 return isCmpSelMinMax(I) ? 1 : 0;
16888 }
16889
16890private:
16891 /// Total number of operands in the reduction operation.
16892 static unsigned getNumberOfOperands(Instruction *I) {
16893 return isCmpSelMinMax(I) ? 3 : 2;
16894 }
16895
16896 /// Checks if the instruction is in basic block \p BB.
16897 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16898 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16899 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16900 auto *Sel = cast<SelectInst>(I);
16901 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
16902 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16903 }
16904 return I->getParent() == BB;
16905 }
16906
16907 /// Expected number of uses for reduction operations/reduced values.
16908 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16909 if (IsCmpSelMinMax) {
16910 // The SelectInst must be used twice while the condition op must have a
16911 // single use only.
16912 if (auto *Sel = dyn_cast<SelectInst>(I))
16913 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
16914 return I->hasNUses(2);
16915 }
16916
16917 // Arithmetic reduction operation must be used once only.
16918 return I->hasOneUse();
16919 }
16920
16921 /// Initializes the list of reduction operations.
16922 void initReductionOps(Instruction *I) {
16923 if (isCmpSelMinMax(I))
16924 ReductionOps.assign(2, ReductionOpsType());
16925 else
16926 ReductionOps.assign(1, ReductionOpsType());
16927 }
16928
16929 /// Add all reduction operations for the reduction instruction \p I.
16930 void addReductionOps(Instruction *I) {
16931 if (isCmpSelMinMax(I)) {
16932 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
16933 ReductionOps[1].emplace_back(I);
16934 } else {
16935 ReductionOps[0].emplace_back(I);
16936 }
16937 }
16938
16939 static bool isGoodForReduction(ArrayRef<Value *> Data) {
16940 int Sz = Data.size();
16941 auto *I = dyn_cast<Instruction>(Data.front());
16942 return Sz > 1 || isConstant(Data.front()) ||
16943 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
16944 }
16945
16946public:
16947 HorizontalReduction() = default;
16948
16949 /// Try to find a reduction tree.
16950 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16951 ScalarEvolution &SE, const DataLayout &DL,
16952 const TargetLibraryInfo &TLI) {
16953 RdxKind = HorizontalReduction::getRdxKind(Root);
16954 if (!isVectorizable(RdxKind, Root))
16955 return false;
16956
16957 // Analyze "regular" integer/FP types for reductions - no target-specific
16958 // types or pointers.
16959 Type *Ty = Root->getType();
16960 if (!isValidElementType(Ty) || Ty->isPointerTy())
16961 return false;
16962
16963 // Though the ultimate reduction may have multiple uses, its condition
16964 // must have only a single use.
16965 if (auto *Sel = dyn_cast<SelectInst>(Root))
16966 if (!Sel->getCondition()->hasOneUse())
16967 return false;
16968
16969 ReductionRoot = Root;
16970
16971 // Iterate through all the operands of the possible reduction tree and
16972 // gather all the reduced values, sorting them by their value id.
16973 BasicBlock *BB = Root->getParent();
16974 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
16975 SmallVector<Instruction *> Worklist(1, Root);
16976 // Checks if the operands of the \p TreeN instruction are also reduction
16977 // operations or should be treated as reduced values or an extra argument,
16978 // which is not part of the reduction.
16979 auto CheckOperands = [&](Instruction *TreeN,
16980 SmallVectorImpl<Value *> &ExtraArgs,
16981 SmallVectorImpl<Value *> &PossibleReducedVals,
16982 SmallVectorImpl<Instruction *> &ReductionOps) {
16983 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
16984 getNumberOfOperands(TreeN)))) {
16985 Value *EdgeVal = getRdxOperand(TreeN, I);
16986 ReducedValsToOps[EdgeVal].push_back(TreeN);
16987 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
16988 // Edge has wrong parent - mark as an extra argument.
16989 if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
16990 !hasSameParent(EdgeInst, BB)) {
16991 ExtraArgs.push_back(EdgeVal);
16992 continue;
16993 }
16994 // If the edge is not an instruction, differs from the main reduction
16995 // opcode, or has too many uses, treat it as a possible reduced value.
16996 // Also, do not try to reduce constant values if the operation is not
16997 // foldable.
16998 if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
16999 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
17000 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
17001 !isVectorizable(RdxKind, EdgeInst) ||
17002 (R.isAnalyzedReductionRoot(EdgeInst) &&
17003 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
17004 PossibleReducedVals.push_back(EdgeVal);
17005 continue;
17006 }
17007 ReductionOps.push_back(EdgeInst);
17008 }
17009 };
17010 // Try to regroup the reduced values so that it becomes more profitable
17011 // to try to reduce them. Values are grouped by their value ids,
17012 // instructions by instruction op id and/or alternate op id, plus extra
17013 // analysis is done for loads (grouping them by the distance between
17014 // pointers) and cmp instructions (grouping them by the predicate).
17015 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
17016        PossibleReducedVals;
17017 initReductionOps(Root);
17018 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
17019 SmallSet<size_t, 2> LoadKeyUsed;
17020
17021 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
17022 Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
17023 if (LoadKeyUsed.contains(Key)) {
17024 auto LIt = LoadsMap.find(Ptr);
17025 if (LIt != LoadsMap.end()) {
17026 for (LoadInst *RLI : LIt->second) {
17027 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
17028 LI->getType(), LI->getPointerOperand(), DL, SE,
17029 /*StrictCheck=*/true))
17030 return hash_value(RLI->getPointerOperand());
17031 }
17032 for (LoadInst *RLI : LIt->second) {
17033 if (arePointersCompatible(RLI->getPointerOperand(),
17034 LI->getPointerOperand(), TLI)) {
17035 hash_code SubKey = hash_value(RLI->getPointerOperand());
17036 return SubKey;
17037 }
17038 }
17039 if (LIt->second.size() > 2) {
17040 hash_code SubKey =
17041 hash_value(LIt->second.back()->getPointerOperand());
17042 return SubKey;
17043 }
17044 }
17045 }
17046 LoadKeyUsed.insert(Key);
17047 LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
17048 return hash_value(LI->getPointerOperand());
17049 };
17050
17051 while (!Worklist.empty()) {
17052 Instruction *TreeN = Worklist.pop_back_val();
17053 SmallVector<Value *> Args;
17054 SmallVector<Value *> PossibleRedVals;
17055 SmallVector<Instruction *> PossibleReductionOps;
17056 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
17057 // If too many extra args - mark the instruction itself as a reduction
17058 // value, not a reduction operation.
17059 if (Args.size() < 2) {
17060 addReductionOps(TreeN);
17061 // Add extra args.
17062 if (!Args.empty()) {
17063 assert(Args.size() == 1 && "Expected only single argument.");
17064 ExtraArgs[TreeN] = Args.front();
17065 }
17066 // Add reduction values. The values are sorted for better vectorization
17067 // results.
17068 for (Value *V : PossibleRedVals) {
17069 size_t Key, Idx;
17070 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
17071 /*AllowAlternate=*/false);
17072 ++PossibleReducedVals[Key][Idx]
17073 .insert(std::make_pair(V, 0))
17074 .first->second;
17075 }
17076 Worklist.append(PossibleReductionOps.rbegin(),
17077 PossibleReductionOps.rend());
17078 } else {
17079 size_t Key, Idx;
17080 std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
17081 /*AllowAlternate=*/false);
17082 ++PossibleReducedVals[Key][Idx]
17083 .insert(std::make_pair(TreeN, 0))
17084 .first->second;
17085 }
17086 }
17087 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
17088 // Sort values by the total number of value kinds to start the reduction
17089 // from the longest possible reduced value sequences.
17090 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
17091 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
17092 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
17093 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
17094 It != E; ++It) {
17095 PossibleRedValsVect.emplace_back();
17096 auto RedValsVect = It->second.takeVector();
17097 stable_sort(RedValsVect, llvm::less_second());
17098 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
17099 PossibleRedValsVect.back().append(Data.second, Data.first);
17100 }
17101 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
17102 return P1.size() > P2.size();
17103 });
17104 int NewIdx = -1;
17105 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
17106 if (NewIdx < 0 ||
17107 (!isGoodForReduction(Data) &&
17108 (!isa<LoadInst>(Data.front()) ||
17109 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
17110 getUnderlyingObject(
17111 cast<LoadInst>(Data.front())->getPointerOperand()) !=
17112 getUnderlyingObject(
17113 cast<LoadInst>(ReducedVals[NewIdx].front())
17114 ->getPointerOperand())))) {
17115 NewIdx = ReducedVals.size();
17116 ReducedVals.emplace_back();
17117 }
17118 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
17119 }
17120 }
17121 // Sort the reduced values by the number of same/alternate opcodes and/or
17122 // pointer operands.
17123 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
17124 return P1.size() > P2.size();
17125 });
17126 return true;
17127 }
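  // For illustration (hypothetical IR, not from this file): matching a root
  // such as
  //   %r = add (add (load %p), (load %q)), (add %x, %y)
  // walks the single-use adds as reduction ops and collects the two loads plus
  // %x and %y as reduced values; loads from nearby pointers are grouped by
  // their pointer distance so that the longest candidate sequence is tried
  // first by tryToReduce.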
17128
17129 /// Attempt to vectorize the tree found by matchAssociativeReduction.
17130 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
17131 const TargetLibraryInfo &TLI) {
17132 constexpr int ReductionLimit = 4;
17133 constexpr unsigned RegMaxNumber = 4;
17134 constexpr unsigned RedValsMaxNumber = 128;
17135 // If there are a sufficient number of reduction values, reduce
17136 // to a nearby power-of-2. We can safely generate oversized
17137 // vectors and rely on the backend to split them to legal sizes.
17138 unsigned NumReducedVals =
17139 std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
17140 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
17141 if (!isGoodForReduction(Vals))
17142 return Num;
17143 return Num + Vals.size();
17144 });
17145 if (NumReducedVals < ReductionLimit &&
17146 (!AllowHorRdxIdenityOptimization ||
17147 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
17148 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
17149 }))) {
17150 for (ReductionOpsType &RdxOps : ReductionOps)
17151 for (Value *RdxOp : RdxOps)
17152 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17153 return nullptr;
17154 }
17155
17156 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
17157 TargetFolder(DL));
17158 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
17159
17160 // Track the reduced values in case they are replaced by extractelement
17161 // instructions because of the vectorization.
17162 DenseMap<Value *, Value *> TrackedVals(
17163 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
17164 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
17165 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
17166 ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
17167 // The same extra argument may be used several times, so log each attempt
17168 // to use it.
17169 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
17170 assert(Pair.first && "DebugLoc must be set.");
17171 ExternallyUsedValues[Pair.second].push_back(Pair.first);
17172 TrackedVals.try_emplace(Pair.second, Pair.second);
17173 }
17174
17175 // The compare instruction of a min/max is the insertion point for new
17176 // instructions and may be replaced with a new compare instruction.
17177 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
17178 assert(isa<SelectInst>(RdxRootInst) &&
17179 "Expected min/max reduction to have select root instruction");
17180 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
17181 assert(isa<Instruction>(ScalarCond) &&
17182 "Expected min/max reduction to have compare condition");
17183 return cast<Instruction>(ScalarCond);
17184 };
17185
17186 // Return new VectorizedTree, based on previous value.
17187 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
17188 if (VectorizedTree) {
17189 // Update the final value in the reduction.
17190 Builder.SetCurrentDebugLocation(
17191 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
17192 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
17193 (isGuaranteedNotToBePoison(Res) &&
17194 !isGuaranteedNotToBePoison(VectorizedTree))) {
17195 auto It = ReducedValsToOps.find(Res);
17196 if (It != ReducedValsToOps.end() &&
17197 any_of(It->getSecond(),
17198 [](Instruction *I) { return isBoolLogicOp(I); }))
17199 std::swap(VectorizedTree, Res);
17200 }
17201
17202 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
17203 ReductionOps);
17204 }
17205 // Initialize the final value in the reduction.
17206 return Res;
17207 };
17208 bool AnyBoolLogicOp =
17209 any_of(ReductionOps.back(), [](Value *V) {
17210 return isBoolLogicOp(cast<Instruction>(V));
17211 });
17212 // The reduction root is used as the insertion point for new instructions,
17213 // so set it as externally used to prevent it from being deleted.
17214 ExternallyUsedValues[ReductionRoot];
17215 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
17216 ReductionOps.front().size());
17217 for (ReductionOpsType &RdxOps : ReductionOps)
17218 for (Value *RdxOp : RdxOps) {
17219 if (!RdxOp)
17220 continue;
17221 IgnoreList.insert(RdxOp);
17222 }
17223 // Intersect the fast-math-flags from all reduction operations.
17224 FastMathFlags RdxFMF;
17225 RdxFMF.set();
17226 for (Value *U : IgnoreList)
17227 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
17228 RdxFMF &= FPMO->getFastMathFlags();
17229 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
17230
17231 // Need to track the reduced values, as they may be changed during the
17232 // vectorization of subvectors.
17233 for (ArrayRef<Value *> Candidates : ReducedVals)
17234 for (Value *V : Candidates)
17235 TrackedVals.try_emplace(V, V);
17236
17237 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
17238 // List of the values that were reduced in other trees as part of gather
17239 // nodes and thus require an extract if fully vectorized in other trees.
17240 SmallPtrSet<Value *, 4> RequiredExtract;
17241 Value *VectorizedTree = nullptr;
17242 bool CheckForReusedReductionOps = false;
17243 // Try to vectorize elements based on their type.
17244 SmallVector<InstructionsState> States;
17245 for (ArrayRef<Value *> RV : ReducedVals)
17246 States.push_back(getSameOpcode(RV, TLI));
17247 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
17248 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
17249 InstructionsState S = States[I];
17250 SmallVector<Value *> Candidates;
17251 Candidates.reserve(2 * OrigReducedVals.size());
17252 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
17253 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
17254 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
17255 // Check if the reduction value was not overridden by an extractelement
17256 // instruction because of the vectorization and exclude it if it is not
17257 // compatible with the other values.
17258 // Also check if the instruction was folded to a constant/other value.
17259 auto *Inst = dyn_cast<Instruction>(RdxVal);
17260 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
17261 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
17262 (S.getOpcode() && !Inst))
17263 continue;
17264 Candidates.push_back(RdxVal);
17265 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
17266 }
17267 bool ShuffledExtracts = false;
17268 // Try to handle shuffled extractelements.
17269 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
17270 I + 1 < E) {
17271 InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
17272 if (NextS.getOpcode() == Instruction::ExtractElement &&
17273 !NextS.isAltShuffle()) {
17274 SmallVector<Value *> CommonCandidates(Candidates);
17275 for (Value *RV : ReducedVals[I + 1]) {
17276 Value *RdxVal = TrackedVals.find(RV)->second;
17277 // Check if the reduction value was not overridden by the
17278 // extractelement instruction because of the vectorization and
17279 // exclude it if it is not compatible with the other values.
17280 if (auto *Inst = dyn_cast<Instruction>(RdxVal))
17281 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
17282 continue;
17283 CommonCandidates.push_back(RdxVal);
17284 TrackedToOrig.try_emplace(RdxVal, RV);
17285 }
17286 SmallVector<int> Mask;
17287 if (isFixedVectorShuffle(CommonCandidates, Mask)) {
17288 ++I;
17289 Candidates.swap(CommonCandidates);
17290 ShuffledExtracts = true;
17291 }
17292 }
17293 }
17294
17295 // Emit code for constant values.
17296 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
17297 allConstant(Candidates)) {
17298 Value *Res = Candidates.front();
17299 ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
17300 for (Value *VC : ArrayRef(Candidates).drop_front()) {
17301 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
17302 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
17303 if (auto *ResI = dyn_cast<Instruction>(Res))
17304 V.analyzedReductionRoot(ResI);
17305 }
17306 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17307 continue;
17308 }
17309
17310 unsigned NumReducedVals = Candidates.size();
17311 if (NumReducedVals < ReductionLimit &&
17312 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
17313 !isSplat(Candidates)))
17314 continue;
17315
17316 // Check if we support repeated scalar values processing (optimization of
17317 // original scalar identity operations on matched horizontal reductions).
17318 IsSupportedHorRdxIdentityOp =
17319 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
17320 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17321 // Gather same values.
17322 MapVector<Value *, unsigned> SameValuesCounter;
17323 if (IsSupportedHorRdxIdentityOp)
17324 for (Value *V : Candidates)
17325 ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
17326 // Used to check if the reduced values are used the same number of times.
17327 // In this case the compiler may produce better code. E.g. if the reduced
17328 // values are aabbccdd (8 x values), then the first node of the tree will
17329 // have a node for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
17330 // Plus, the final reduction will be performed on <8 x aabbccdd>.
17331 // Instead the compiler may build a <4 x abcd> tree immediately, plus a
17332 // reduction of (4 x abcd) * 2.
17333 // Currently it only handles add/fadd/xor. and/or/min/max do not require
17334 // this analysis; other operations may require an extra estimation of
17335 // the profitability.
17336 bool SameScaleFactor = false;
17337 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17338 SameValuesCounter.size() != Candidates.size();
17339 if (OptReusedScalars) {
17340 SameScaleFactor =
17341 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17342 RdxKind == RecurKind::Xor) &&
17343 all_of(drop_begin(SameValuesCounter),
17344 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
17345 return P.second == SameValuesCounter.front().second;
17346 });
17347 Candidates.resize(SameValuesCounter.size());
17348 transform(SameValuesCounter, Candidates.begin(),
17349 [](const auto &P) { return P.first; });
17350 NumReducedVals = Candidates.size();
17351 // Have a reduction of the same element.
17352 if (NumReducedVals == 1) {
17353 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
17354 unsigned Cnt = SameValuesCounter.lookup(OrigV);
17355 Value *RedVal =
17356 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
17357 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17358 VectorizedVals.try_emplace(OrigV, Cnt);
17359 continue;
17360 }
17361 }
17362
17363 unsigned MaxVecRegSize = V.getMaxVecRegSize();
17364 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
17365 unsigned MaxElts =
17366 RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
17367
17368 unsigned ReduxWidth = std::min<unsigned>(
17369 llvm::bit_floor(NumReducedVals),
17370 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
17371 RegMaxNumber * RedValsMaxNumber));
17372 unsigned Start = 0;
17373 unsigned Pos = Start;
17374 // Restarts vectorization attempt with lower vector factor.
17375 unsigned PrevReduxWidth = ReduxWidth;
17376 bool CheckForReusedReductionOpsLocal = false;
17377 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17378 &CheckForReusedReductionOpsLocal,
17379 &PrevReduxWidth, &V,
17380 &IgnoreList](bool IgnoreVL = false) {
17381 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
17382 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
17383 // Check if any of the reduction ops are gathered. If so, it is
17384 // worth trying again with a smaller number of reduction ops.
17385 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17386 }
17387 ++Pos;
17388 if (Pos < NumReducedVals - ReduxWidth + 1)
17389 return IsAnyRedOpGathered;
17390 Pos = Start;
17391 ReduxWidth /= 2;
17392 return IsAnyRedOpGathered;
17393 };
17394 bool AnyVectorized = false;
17395 while (Pos < NumReducedVals - ReduxWidth + 1 &&
17396 ReduxWidth >= ReductionLimit) {
17397 // Dependency in the tree of the reduction ops - drop this attempt,
17398 // try later.
17399 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17400 Start == 0) {
17401 CheckForReusedReductionOps = true;
17402 break;
17403 }
17404 PrevReduxWidth = ReduxWidth;
17405 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
17406 // Being analyzed already - skip.
17407 if (V.areAnalyzedReductionVals(VL)) {
17408 (void)AdjustReducedVals(/*IgnoreVL=*/true);
17409 continue;
17410 }
17411 // Early exit if any of the reduction values were deleted during
17412 // previous vectorization attempts.
17413 if (any_of(VL, [&V](Value *RedVal) {
17414 auto *RedValI = dyn_cast<Instruction>(RedVal);
17415 if (!RedValI)
17416 return false;
17417 return V.isDeleted(RedValI);
17418 }))
17419 break;
17420 V.buildTree(VL, IgnoreList);
17421 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
17422 if (!AdjustReducedVals())
17423 V.analyzedReductionVals(VL);
17424 continue;
17425 }
17426 if (V.isLoadCombineReductionCandidate(RdxKind)) {
17427 if (!AdjustReducedVals())
17428 V.analyzedReductionVals(VL);
17429 continue;
17430 }
17431 V.reorderTopToBottom();
17432 // No need to reorder the root node at all.
17433 V.reorderBottomToTop(/*IgnoreReorder=*/true);
17434 // Keep extracted other reduction values, if they are used in the
17435 // vectorization trees.
17436 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
17437 ExternallyUsedValues);
17438 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
17439 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
17440 continue;
17441 for (Value *V : ReducedVals[Cnt])
17442 if (isa<Instruction>(V))
17443 LocalExternallyUsedValues[TrackedVals[V]];
17444 }
17445 if (!IsSupportedHorRdxIdentityOp) {
17446 // Number of uses of the candidates in the vector of values.
17447 assert(SameValuesCounter.empty() &&
17448 "Reused values counter map is not empty");
17449 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17450 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17451 continue;
17452 Value *V = Candidates[Cnt];
17453 Value *OrigV = TrackedToOrig.find(V)->second;
17454 ++SameValuesCounter[OrigV];
17455 }
17456 }
17457 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
17458 // Gather externally used values.
17459 SmallPtrSet<Value *, 4> Visited;
17460 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17461 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17462 continue;
17463 Value *RdxVal = Candidates[Cnt];
17464 if (!Visited.insert(RdxVal).second)
17465 continue;
17466 // Check if the scalar was vectorized as part of the vectorization
17467 // tree but not the top node.
17468 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
17469 LocalExternallyUsedValues[RdxVal];
17470 continue;
17471 }
17472 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17473 unsigned NumOps =
17474 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
17475 if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
17476 LocalExternallyUsedValues[RdxVal];
17477 }
17478 // Do not need the list of reused scalars in regular mode anymore.
17479 if (!IsSupportedHorRdxIdentityOp)
17480 SameValuesCounter.clear();
17481 for (Value *RdxVal : VL)
17482 if (RequiredExtract.contains(RdxVal))
17483 LocalExternallyUsedValues[RdxVal];
17484 // Update LocalExternallyUsedValues for the scalar, replaced by
17485 // extractelement instructions.
17486 DenseMap<Value *, Value *> ReplacementToExternal;
17487 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17488 ReplacementToExternal.try_emplace(Pair.second, Pair.first);
17489 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17490 Value *Ext = Pair.first;
17491 auto RIt = ReplacementToExternal.find(Ext);
17492 while (RIt != ReplacementToExternal.end()) {
17493 Ext = RIt->second;
17494 RIt = ReplacementToExternal.find(Ext);
17495 }
17496 auto *It = ExternallyUsedValues.find(Ext);
17497 if (It == ExternallyUsedValues.end())
17498 continue;
17499 LocalExternallyUsedValues[Pair.second].append(It->second);
17500 }
17501 V.buildExternalUses(LocalExternallyUsedValues);
17502
17503 V.computeMinimumValueSizes();
17504 V.transformNodes();
17505
17506 // Estimate cost.
17507 InstructionCost TreeCost = V.getTreeCost(VL);
17508 InstructionCost ReductionCost =
17509 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
17510 InstructionCost Cost = TreeCost + ReductionCost;
17511 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
17512 << " for reduction\n");
17513 if (!Cost.isValid())
17514 break;
17515 if (Cost >= -SLPCostThreshold) {
17516 V.getORE()->emit([&]() {
17517 return OptimizationRemarkMissed(
17518 SV_NAME, "HorSLPNotBeneficial",
17519 ReducedValsToOps.find(VL[0])->second.front())
17520 << "Vectorizing horizontal reduction is possible "
17521 << "but not beneficial with cost " << ore::NV("Cost", Cost)
17522 << " and threshold "
17523 << ore::NV("Threshold", -SLPCostThreshold);
17524 });
17525 if (!AdjustReducedVals())
17526 V.analyzedReductionVals(VL);
17527 continue;
17528 }
17529
17530 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
17531 << Cost << ". (HorRdx)\n");
17532 V.getORE()->emit([&]() {
17533 return OptimizationRemark(
17534 SV_NAME, "VectorizedHorizontalReduction",
17535 ReducedValsToOps.find(VL[0])->second.front())
17536 << "Vectorized horizontal reduction with cost "
17537 << ore::NV("Cost", Cost) << " and with tree size "
17538 << ore::NV("TreeSize", V.getTreeSize());
17539 });
17540
17541 Builder.setFastMathFlags(RdxFMF);
17542
17543 // Emit a reduction. If the root is a select (min/max idiom), the insert
17544 // point is the compare condition of that select.
17545 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
17546 Instruction *InsertPt = RdxRootInst;
17547 if (IsCmpSelMinMax)
17548 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17549
17550 // Vectorize a tree.
17551 Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
17552 ReplacedExternals, InsertPt);
17553
17554 Builder.SetInsertPoint(InsertPt);
17555
17556 // To prevent poison from leaking across what used to be sequential,
17557 // safe, scalar boolean logic operations, the reduction operand must be
17558 // frozen.
17559 if ((isBoolLogicOp(RdxRootInst) ||
17560 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17561 !isGuaranteedNotToBePoison(VectorizedRoot))
17562 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
17563
17564 // Emit code to correctly handle reused reduced values, if required.
17565 if (OptReusedScalars && !SameScaleFactor) {
17566 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
17567 SameValuesCounter, TrackedToOrig);
17568 }
17569
17570 Value *ReducedSubTree =
17571 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
17572 if (ReducedSubTree->getType() != VL.front()->getType()) {
17573 assert(ReducedSubTree->getType() != VL.front()->getType() &&
17574 "Expected different reduction type.");
17575 ReducedSubTree =
17576 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
17577 V.isSignedMinBitwidthRootNode());
17578 }
17579
17580 // Improved analysis for add/fadd/xor reductions with same scale factor
17581 // for all operands of reductions. We can emit scalar ops for them
17582 // instead.
17583 if (OptReusedScalars && SameScaleFactor)
17584 ReducedSubTree = emitScaleForReusedOps(
17585 ReducedSubTree, Builder, SameValuesCounter.front().second);
17586
17587 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17588 // Count vectorized reduced values to exclude them from final reduction.
17589 for (Value *RdxVal : VL) {
17590 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17591 if (IsSupportedHorRdxIdentityOp) {
17592 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17593 continue;
17594 }
17595 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17596 if (!V.isVectorized(RdxVal))
17597 RequiredExtract.insert(RdxVal);
17598 }
17599 Pos += ReduxWidth;
17600 Start = Pos;
17601 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
17602 AnyVectorized = true;
17603 }
17604 if (OptReusedScalars && !AnyVectorized) {
17605 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
17606 Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
17607 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17608 Value *OrigV = TrackedToOrig.find(P.first)->second;
17609 VectorizedVals.try_emplace(OrigV, P.second);
17610 }
17611 continue;
17612 }
17613 }
17614 if (VectorizedTree) {
17615 // Reorder operands of bool logical op in the natural order to avoid
17616 // possible problem with poison propagation. If not possible to reorder
17617 // (both operands are originally RHS), emit an extra freeze instruction
17618 // for the LHS operand.
17619 // I.e., if we have original code like this:
17620 // RedOp1 = select i1 ?, i1 LHS, i1 false
17621 // RedOp2 = select i1 RHS, i1 ?, i1 false
17622
17623 // Then, we swap LHS/RHS to create a new op that matches the poison
17624 // semantics of the original code.
17625
17626 // If we have original code like this and both values could be poison:
17627 // RedOp1 = select i1 ?, i1 LHS, i1 false
17628 // RedOp2 = select i1 ?, i1 RHS, i1 false
17629
17630 // Then, we must freeze LHS in the new op.
17631 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
17632 Instruction *RedOp1,
17633 Instruction *RedOp2,
17634 bool InitStep) {
17635 if (!AnyBoolLogicOp)
17636 return;
17637 if (isBoolLogicOp(RedOp1) &&
17638 ((!InitStep && LHS == VectorizedTree) ||
17639 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
17640 return;
17641 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
17642 getRdxOperand(RedOp2, 0) == RHS ||
17643 isGuaranteedNotToBePoison(RHS))) {
17644 std::swap(LHS, RHS);
17645 return;
17646 }
17647 if (LHS != VectorizedTree)
17648 LHS = Builder.CreateFreeze(LHS);
17649 };
17650 // Finish the reduction.
17651 // Need to add extra arguments and not vectorized possible reduction
17652 // values.
17653 // Try to avoid dependencies between the scalar remainders after
17654 // reductions.
17655 auto FinalGen =
17656 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
17657 bool InitStep) {
17658 unsigned Sz = InstVals.size();
17659 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
17660 Sz % 2);
17661 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
17662 Instruction *RedOp = InstVals[I + 1].first;
17663 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
17664 Value *RdxVal1 = InstVals[I].second;
17665 Value *StableRdxVal1 = RdxVal1;
17666 auto It1 = TrackedVals.find(RdxVal1);
17667 if (It1 != TrackedVals.end())
17668 StableRdxVal1 = It1->second;
17669 Value *RdxVal2 = InstVals[I + 1].second;
17670 Value *StableRdxVal2 = RdxVal2;
17671 auto It2 = TrackedVals.find(RdxVal2);
17672 if (It2 != TrackedVals.end())
17673 StableRdxVal2 = It2->second;
17674 // To prevent poison from leaking across what used to be
17675 // sequential, safe, scalar boolean logic operations, the
17676 // reduction operand must be frozen.
17677 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
17678 RedOp, InitStep);
17679 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17680 StableRdxVal2, "op.rdx", ReductionOps);
17681 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
17682 }
17683 if (Sz % 2 == 1)
17684 ExtraReds[Sz / 2] = InstVals.back();
17685 return ExtraReds;
17686 };
17687 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
17688 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
17689 VectorizedTree);
17690 SmallPtrSet<Value *, 8> Visited;
17691 for (ArrayRef<Value *> Candidates : ReducedVals) {
17692 for (Value *RdxVal : Candidates) {
17693 if (!Visited.insert(RdxVal).second)
17694 continue;
17695 unsigned NumOps = VectorizedVals.lookup(RdxVal);
17696 for (Instruction *RedOp :
17697 ArrayRef(ReducedValsToOps.find(RdxVal)->second)
17698 .drop_back(NumOps))
17699 ExtraReductions.emplace_back(RedOp, RdxVal);
17700 }
17701 }
17702 for (auto &Pair : ExternallyUsedValues) {
17703 // Add each externally used value to the final reduction.
17704 for (auto *I : Pair.second)
17705 ExtraReductions.emplace_back(I, Pair.first);
17706 }
17707 // Iterate through all not-vectorized reduction values/extra arguments.
17708 bool InitStep = true;
17709 while (ExtraReductions.size() > 1) {
17710 SmallVector<std::pair<Instruction *, Value *>> NewReds =
17711 FinalGen(ExtraReductions, InitStep);
17712 ExtraReductions.swap(NewReds);
17713 InitStep = false;
17714 }
17715 VectorizedTree = ExtraReductions.front().second;
17716
17717 ReductionRoot->replaceAllUsesWith(VectorizedTree);
17718
17719 // The original scalar reduction is expected to have no remaining
17720 // uses outside the reduction tree itself. Assert that we got this
17721 // correct, replace internal uses with undef, and mark for eventual
17722 // deletion.
17723#ifndef NDEBUG
17724 SmallSet<Value *, 4> IgnoreSet;
17725 for (ArrayRef<Value *> RdxOps : ReductionOps)
17726 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
17727#endif
17728 for (ArrayRef<Value *> RdxOps : ReductionOps) {
17729 for (Value *Ignore : RdxOps) {
17730 if (!Ignore)
17731 continue;
17732#ifndef NDEBUG
17733 for (auto *U : Ignore->users()) {
17734 assert(IgnoreSet.count(U) &&
17735 "All users must be either in the reduction ops list.");
17736 }
17737#endif
17738 if (!Ignore->use_empty()) {
17739 Value *P = PoisonValue::get(Ignore->getType());
17740 Ignore->replaceAllUsesWith(P);
17741 }
17742 }
17743 V.removeInstructionsAndOperands(RdxOps);
17744 }
17745 } else if (!CheckForReusedReductionOps) {
17746 for (ReductionOpsType &RdxOps : ReductionOps)
17747 for (Value *RdxOp : RdxOps)
17748 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17749 }
17750 return VectorizedTree;
17751 }
17752
17753private:
17754 /// Calculate the cost of a reduction.
17755 InstructionCost getReductionCost(TargetTransformInfo *TTI,
17756 ArrayRef<Value *> ReducedVals,
17757 bool IsCmpSelMinMax, unsigned ReduxWidth,
17758 FastMathFlags FMF) {
17759 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17760 Type *ScalarTy = ReducedVals.front()->getType();
17761 FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
17762 InstructionCost VectorCost = 0, ScalarCost;
17763 // If all of the reduced values are constant, the vector cost is 0, since
17764 // the reduction value can be calculated at compile time.
17765 bool AllConsts = allConstant(ReducedVals);
17766 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
17767 InstructionCost Cost;
17768 // Scalar cost is repeated for N-1 elements.
17769 int Cnt = ReducedVals.size();
17770 for (Value *RdxVal : ReducedVals) {
17771 if (Cnt == 1)
17772 break;
17773 --Cnt;
17774 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
17775 Cost += GenCostFn();
17776 continue;
17777 }
17778 InstructionCost ScalarCost = 0;
17779 for (User *U : RdxVal->users()) {
17780 auto *RdxOp = cast<Instruction>(U);
17781 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17782 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
17783 continue;
17784 }
17785 ScalarCost = InstructionCost::getInvalid();
17786 break;
17787 }
17788 if (ScalarCost.isValid())
17789 Cost += ScalarCost;
17790 else
17791 Cost += GenCostFn();
17792 }
17793 return Cost;
17794 };
17795 switch (RdxKind) {
17796 case RecurKind::Add:
17797 case RecurKind::Mul:
17798 case RecurKind::Or:
17799 case RecurKind::And:
17800 case RecurKind::Xor:
17801 case RecurKind::FAdd:
17802 case RecurKind::FMul: {
17803 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
17804 if (!AllConsts)
17805 VectorCost =
17806 TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
17807 ScalarCost = EvaluateScalarCost([&]() {
17808 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
17809 });
17810 break;
17811 }
17812 case RecurKind::FMax:
17813 case RecurKind::FMin:
17814 case RecurKind::FMaximum:
17815 case RecurKind::FMinimum:
17816 case RecurKind::SMax:
17817 case RecurKind::SMin:
17818 case RecurKind::UMax:
17819 case RecurKind::UMin: {
17820 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
17821 if (!AllConsts)
17822 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
17823 ScalarCost = EvaluateScalarCost([&]() {
17824 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
17825 return TTI->getIntrinsicInstrCost(ICA, CostKind);
17826 });
17827 break;
17828 }
17829 default:
17830 llvm_unreachable("Expected arithmetic or min/max reduction operation");
17831 }
17832
17833 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
17834 << " for reduction of " << shortBundleName(ReducedVals)
17835 << " (It is a splitting reduction)\n");
17836 return VectorCost - ScalarCost;
17837 }
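  // For illustration (hypothetical numbers): if the vector reduction above
  // costs 6 and the scalar chain it replaces costs 10, the returned cost is
  // -4, which passes the Cost < -SLPCostThreshold profitability check in
  // tryToReduce.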
17838
17839 /// Emit a horizontal reduction of the vectorized value.
17840 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
17841 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
17842 assert(VectorizedValue && "Need to have a vectorized tree node");
17843 assert(isPowerOf2_32(ReduxWidth) &&
17844 "We only handle power-of-two reductions for now");
17845 assert(RdxKind != RecurKind::FMulAdd &&
17846 "A call to the llvm.fmuladd intrinsic is not handled yet");
17847
17848 ++NumVectorInstructions;
17849 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
17850 }
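  // For illustration (hypothetical types): for RdxKind == RecurKind::Add and a
  // <4 x i32> input, createSimpleTargetReduction above emits IR along the
  // lines of
  //   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec)
  // which the backend then legalizes for the target.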
17851
17852 /// Emits optimized code for unique scalar value reused \p Cnt times.
17853 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17854 unsigned Cnt) {
17855 assert(IsSupportedHorRdxIdentityOp &&
17856 "The optimization of matched scalar identity horizontal reductions "
17857 "must be supported.");
17858 switch (RdxKind) {
17859 case RecurKind::Add: {
17860 // res = mul vv, n
17861 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
17862 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
17863 << VectorizedValue << ". (HorRdx)\n");
17864 return Builder.CreateMul(VectorizedValue, Scale);
17865 }
17866 case RecurKind::Xor: {
17867 // res = n % 2 ? 0 : vv
17868 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
17869 << ". (HorRdx)\n");
17870 if (Cnt % 2 == 0)
17871 return Constant::getNullValue(VectorizedValue->getType());
17872 return VectorizedValue;
17873 }
17874 case RecurKind::FAdd: {
17875 // res = fmul v, n
17876 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
17877 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
17878 << VectorizedValue << ". (HorRdx)\n");
17879 return Builder.CreateFMul(VectorizedValue, Scale);
17880 }
17881 case RecurKind::And:
17882 case RecurKind::Or:
17883 case RecurKind::SMax:
17884 case RecurKind::SMin:
17885 case RecurKind::UMax:
17886 case RecurKind::UMin:
17887 case RecurKind::FMax:
17888 case RecurKind::FMin:
17889 case RecurKind::FMaximum:
17890 case RecurKind::FMinimum:
17891 // res = vv
17892 return VectorizedValue;
17893 case RecurKind::Mul:
17894 case RecurKind::FMul:
17895 case RecurKind::FMulAdd:
17896 case RecurKind::IAnyOf:
17897 case RecurKind::FAnyOf:
17898 case RecurKind::None:
17899 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17900 }
17901 return nullptr;
17902 }
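  // For illustration (hypothetical values): if the matched scalar reduction
  // was x + x + x, the Add case above emits x * 3 instead of two extra adds,
  // while for Xor the repeat count only decides between 0 (even repeats) and
  // x (odd repeats).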
17903
17904 /// Emits actual operation for the scalar identity values, found during
17905 /// horizontal reduction analysis.
17906 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17907 BoUpSLP &R,
17908 const MapVector<Value *, unsigned> &SameValuesCounter,
17909 const DenseMap<Value *, Value *> &TrackedToOrig) {
17910 assert(IsSupportedHorRdxIdentityOp &&
17911 "The optimization of matched scalar identity horizontal reductions "
17912 "must be supported.");
17913 ArrayRef<Value *> VL = R.getRootNodeScalars();
17914 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
17915 if (VTy->getElementType() != VL.front()->getType()) {
17916 VectorizedValue = Builder.CreateIntCast(
17917 VectorizedValue,
17918 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
17919 R.isSignedMinBitwidthRootNode());
17920 }
17921 switch (RdxKind) {
17922 case RecurKind::Add: {
17923 // root = mul prev_root, <1, 1, n, 1>
17924 SmallVector<Constant *> Vals;
17925 for (Value *V : VL) {
17926 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17927 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
17928 }
17929 auto *Scale = ConstantVector::get(Vals);
17930 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
17931 << VectorizedValue << ". (HorRdx)\n");
17932 return Builder.CreateMul(VectorizedValue, Scale);
17933 }
17934 case RecurKind::And:
17935 case RecurKind::Or:
17936 // No need for multiple or/and(s).
17937 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17938 << ". (HorRdx)\n");
17939 return VectorizedValue;
17940 case RecurKind::SMax:
17941 case RecurKind::SMin:
17942 case RecurKind::UMax:
17943 case RecurKind::UMin:
17944 case RecurKind::FMax:
17945 case RecurKind::FMin:
17946 case RecurKind::FMaximum:
17947 case RecurKind::FMinimum:
17948 // No need for multiple min/max(s) of the same value.
17949 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17950 << ". (HorRdx)\n");
17951 return VectorizedValue;
17952 case RecurKind::Xor: {
17953 // Replace values with even number of repeats with 0, since
17954 // x xor x = 0.
17955 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
17956 // 7>, if the 4th and 6th elements have an even number of repeats.
17957 SmallVector<int> Mask(
17958 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
17959 PoisonMaskElem);
17960 std::iota(Mask.begin(), Mask.end(), 0);
17961 bool NeedShuffle = false;
17962 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17963 Value *V = VL[I];
17964 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17965 if (Cnt % 2 == 0) {
17966 Mask[I] = VF;
17967 NeedShuffle = true;
17968 }
17969 }
17970 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
17971 : Mask) dbgs()
17972 << I << " ";
17973 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17974 if (NeedShuffle)
17975 VectorizedValue = Builder.CreateShuffleVector(
17976 VectorizedValue,
17977 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
17978 return VectorizedValue;
17979 }
17980 case RecurKind::FAdd: {
17981 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
17982 SmallVector<Constant *> Vals;
17983 for (Value *V : VL) {
17984 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17985 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
17986 }
17987 auto *Scale = ConstantVector::get(Vals);
17988 return Builder.CreateFMul(VectorizedValue, Scale);
17989 }
17990 case RecurKind::Mul:
17991 case RecurKind::FMul:
17992 case RecurKind::FMulAdd:
17993 case RecurKind::IAnyOf:
17994 case RecurKind::FAnyOf:
17995 case RecurKind::None:
17996 llvm_unreachable("Unexpected reduction kind for reused scalars.");
17997 }
17998 return nullptr;
17999 }
18000};
18001} // end anonymous namespace
18002
18003/// Gets recurrence kind from the specified value.
18004 static RecurKind getRdxKind(Value *V) {
18005 return HorizontalReduction::getRdxKind(V);
18006}
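/// Returns the number of leaf scalar elements in the homogeneous aggregate
/// built by \p InsertInst (e.g. 4 for <4 x float> and also 4 for
/// {{float, float}, {float, float}}), or std::nullopt if the aggregate is
/// not homogeneous.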
18007static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
18008 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
18009 return cast<FixedVectorType>(IE->getType())->getNumElements();
18010
18011 unsigned AggregateSize = 1;
18012 auto *IV = cast<InsertValueInst>(InsertInst);
18013 Type *CurrentType = IV->getType();
18014 do {
18015 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
18016 for (auto *Elt : ST->elements())
18017 if (Elt != ST->getElementType(0)) // check homogeneity
18018 return std::nullopt;
18019 AggregateSize *= ST->getNumElements();
18020 CurrentType = ST->getElementType(0);
18021 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
18022 AggregateSize *= AT->getNumElements();
18023 CurrentType = AT->getElementType();
18024 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
18025 AggregateSize *= VT->getNumElements();
18026 return AggregateSize;
18027 } else if (CurrentType->isSingleValueType()) {
18028 return AggregateSize;
18029 } else {
18030 return std::nullopt;
18031 }
18032 } while (true);
18033}
18034
18035 static void findBuildAggregate_rec(Instruction *LastInsertInst,
18036 TargetTransformInfo *TTI,
18037 SmallVectorImpl<Value *> &BuildVectorOpds,
18038 SmallVectorImpl<Value *> &InsertElts,
18039 unsigned OperandOffset) {
18040 do {
18041 Value *InsertedOperand = LastInsertInst->getOperand(1);
18042 std::optional<unsigned> OperandIndex =
18043 getElementIndex(LastInsertInst, OperandOffset);
18044 if (!OperandIndex)
18045 return;
18046 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
18047 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
18048 BuildVectorOpds, InsertElts, *OperandIndex);
18049
18050 } else {
18051 BuildVectorOpds[*OperandIndex] = InsertedOperand;
18052 InsertElts[*OperandIndex] = LastInsertInst;
18053 }
18054 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
18055 } while (LastInsertInst != nullptr &&
18056 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
18057 LastInsertInst->hasOneUse());
18058}
18059
18060/// Recognize construction of vectors like
18061/// %ra = insertelement <4 x float> poison, float %s0, i32 0
18062/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
18063/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
18064/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
18065/// starting from the last insertelement or insertvalue instruction.
18066///
18067/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
18068/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
18069/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
18070///
18071/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
18072///
18073/// \return true if it matches.
18074 static bool findBuildAggregate(Instruction *LastInsertInst,
18075 TargetTransformInfo *TTI,
18076 SmallVectorImpl<Value *> &BuildVectorOpds,
18077 SmallVectorImpl<Value *> &InsertElts) {
18078
18079 assert((isa<InsertElementInst>(LastInsertInst) ||
18080 isa<InsertValueInst>(LastInsertInst)) &&
18081 "Expected insertelement or insertvalue instruction!");
18082
18083 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
18084 "Expected empty result vectors!");
18085
18086 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
18087 if (!AggregateSize)
18088 return false;
18089 BuildVectorOpds.resize(*AggregateSize);
18090 InsertElts.resize(*AggregateSize);
18091
18092 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
18093 llvm::erase(BuildVectorOpds, nullptr);
18094 llvm::erase(InsertElts, nullptr);
18095 if (BuildVectorOpds.size() >= 2)
18096 return true;
18097
18098 return false;
18099}
18100
18101/// Try and get a reduction instruction from a phi node.
18102///
18103/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
18104/// if they come from either \p ParentBB or a containing loop latch.
18105///
18106/// \returns A candidate reduction value if possible, or \code nullptr \endcode
18107/// if not possible.
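/// For example (illustrative), for a loop-carried reduction
///   %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop.latch ]
/// placed in the loop header, the returned candidate is %sum.next, provided
/// the phi's block dominates the block defining %sum.next.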
18108 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
18109 BasicBlock *ParentBB, LoopInfo *LI) {
18110 // There are situations where the reduction value is not dominated by the
18111 // reduction phi. Vectorizing such cases has been reported to cause
18112 // miscompiles. See PR25787.
18113 auto DominatedReduxValue = [&](Value *R) {
18114 return isa<Instruction>(R) &&
18115 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
18116 };
18117
18118 Instruction *Rdx = nullptr;
18119
18120 // Return the incoming value if it comes from the same BB as the phi node.
18121 if (P->getIncomingBlock(0) == ParentBB) {
18122 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18123 } else if (P->getIncomingBlock(1) == ParentBB) {
18124 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18125 }
18126
18127 if (Rdx && DominatedReduxValue(Rdx))
18128 return Rdx;
18129
18130 // Otherwise, check whether we have a loop latch to look at.
18131 Loop *BBL = LI->getLoopFor(ParentBB);
18132 if (!BBL)
18133 return nullptr;
18134 BasicBlock *BBLatch = BBL->getLoopLatch();
18135 if (!BBLatch)
18136 return nullptr;
18137
18138 // There is a loop latch, return the incoming value if it comes from
18139 // that. This reduction pattern occasionally turns up.
18140 if (P->getIncomingBlock(0) == BBLatch) {
18141 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18142 } else if (P->getIncomingBlock(1) == BBLatch) {
18143 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18144 }
18145
18146 if (Rdx && DominatedReduxValue(Rdx))
18147 return Rdx;
18148
18149 return nullptr;
18150}
18151
18152static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
18153 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
18154 return true;
18155 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
18156 return true;
18157 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
18158 return true;
18159 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
18160 return true;
18161 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
18162 return true;
18163 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
18164 return true;
18165 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
18166 return true;
18167 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
18168 return true;
18169 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
18170 return true;
18171 return false;
18172}
18173
18174/// We could have an initial reduction that is not an add.
18175/// r *= v1 + v2 + v3 + v4
18176/// In such a case start looking for a tree rooted in the first '+'.
18177/// \Returns the new root if found, which may be nullptr if not an instruction.
18178 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
18179 Instruction *Root) {
18180 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
18181 isa<IntrinsicInst>(Root)) &&
18182 "Expected binop, select, or intrinsic for reduction matching");
18183 Value *LHS =
18184 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
18185 Value *RHS =
18186 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
18187 if (LHS == Phi)
18188 return dyn_cast<Instruction>(RHS);
18189 if (RHS == Phi)
18190 return dyn_cast<Instruction>(LHS);
18191 return nullptr;
18192}
18193
18194 /// Returns the first operand of \p I that does not match \p Phi. If the
18195 /// operand is not an instruction, it returns nullptr.
18196 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
18197 Value *Op0 = nullptr;
18198 Value *Op1 = nullptr;
18199 if (!matchRdxBop(I, Op0, Op1))
18200 return nullptr;
18201 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
18202}
18203
18204/// \Returns true if \p I is a candidate instruction for reduction vectorization.
18205 static bool isReductionCandidate(Instruction *I) {
18206 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
18207 Value *B0 = nullptr, *B1 = nullptr;
18208 bool IsBinop = matchRdxBop(I, B0, B1);
18209 return IsBinop || IsSelect;
18210}
18211
18212bool SLPVectorizerPass::vectorizeHorReduction(
18213 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
18214 TargetTransformInfo *TTI, SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
18215 if (!ShouldVectorizeHor)
18216 return false;
18217 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
18218
18219 if (Root->getParent() != BB || isa<PHINode>(Root))
18220 return false;
18221
18222 // If we can find a secondary reduction root, use that instead.
18223 auto SelectRoot = [&]() {
18224 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
18225 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
18226 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
18227 return NewRoot;
18228 return Root;
18229 };
18230
18231 // Start the analysis from the Root instruction. If a horizontal reduction is
18232 // found, try to vectorize it. If it is not a horizontal reduction or
18233 // vectorization is not possible or not effective, and currently analyzed
18234 // instruction is a binary operation, try to vectorize the operands, using
18235 // pre-order DFS traversal order. If the operands were not vectorized, repeat
18236 // the same procedure considering each operand as a possible root of the
18237 // horizontal reduction.
18238 // Interrupt the process if the Root instruction itself was vectorized or all
18239 // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
18240 // If a horizontal reduction was not matched or vectorized, we collect
18241 // instructions for possible later vectorization attempts.
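// For example (illustrative), for a seed %s = add (mul %a, %b), (mul %c, %d):
// if the outer add does not form a vectorizable reduction, both multiplies
// are queued and later analyzed as potential reduction roots themselves.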
18242 std::queue<std::pair<Instruction *, unsigned>> Stack;
18243 Stack.emplace(SelectRoot(), 0);
18244 SmallPtrSet<Value *, 8> VisitedInstrs;
18245 bool Res = false;
18246 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
18247 if (R.isAnalyzedReductionRoot(Inst))
18248 return nullptr;
18249 if (!isReductionCandidate(Inst))
18250 return nullptr;
18251 HorizontalReduction HorRdx;
18252 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
18253 return nullptr;
18254 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
18255 };
18256 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
18257 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
18258 FutureSeed = getNonPhiOperand(Root, P);
18259 if (!FutureSeed)
18260 return false;
18261 }
18262 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
18263 // analysis is done separately.
18264 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
18265 PostponedInsts.push_back(FutureSeed);
18266 return true;
18267 };
18268
18269 while (!Stack.empty()) {
18270 Instruction *Inst;
18271 unsigned Level;
18272 std::tie(Inst, Level) = Stack.front();
18273 Stack.pop();
18274 // Do not try to analyze instruction that has already been vectorized.
18275 // This may happen when we vectorize instruction operands on a previous
18276 // iteration while stack was populated before that happened.
18277 if (R.isDeleted(Inst))
18278 continue;
18279 if (Value *VectorizedV = TryToReduce(Inst)) {
18280 Res = true;
18281 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
18282 // Try to find another reduction.
18283 Stack.emplace(I, Level);
18284 continue;
18285 }
18286 if (R.isDeleted(Inst))
18287 continue;
18288 } else {
18289 // We could not vectorize `Inst` so try to use it as a future seed.
18290 if (!TryAppendToPostponedInsts(Inst)) {
18291 assert(Stack.empty() && "Expected empty stack");
18292 break;
18293 }
18294 }
18295
18296 // Try to vectorize operands.
18297 // Continue analysis for the instruction from the same basic block only to
18298 // save compile time.
18299 if (++Level < RecursionMaxDepth)
18300 for (auto *Op : Inst->operand_values())
18301 if (VisitedInstrs.insert(Op).second)
18302 if (auto *I = dyn_cast<Instruction>(Op))
18303 // Do not try to vectorize CmpInst operands, this is done
18304 // separately.
18305 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
18306 !R.isDeleted(I) && I->getParent() == BB)
18307 Stack.emplace(I, Level);
18308 }
18309 return Res;
18310}
18311
18312bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
18313 BasicBlock *BB, BoUpSLP &R,
18314 TargetTransformInfo *TTI) {
18315 SmallVector<WeakTrackingVH> PostponedInsts;
18316 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
18317 Res |= tryToVectorize(PostponedInsts, R);
18318 return Res;
18319}
18320
18321bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
18322 BoUpSLP &R) {
18323 bool Res = false;
18324 for (Value *V : Insts)
18325 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
18326 Res |= tryToVectorize(Inst, R);
18327 return Res;
18328}
18329
18330bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18331 BasicBlock *BB, BoUpSLP &R,
18332 bool MaxVFOnly) {
18333 if (!R.canMapToVector(IVI->getType()))
18334 return false;
18335
18336 SmallVector<Value *, 16> BuildVectorOpds;
18337 SmallVector<Value *, 16> BuildVectorInsts;
18338 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
18339 return false;
18340
18341 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
18342 R.getORE()->emit([&]() {
18343 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
18344 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
18345 "trying reduction first.";
18346 });
18347 return false;
18348 }
18349 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
18350 // Aggregate value is unlikely to be processed in vector register.
18351 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
18352}
18353
18354bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
18355 BasicBlock *BB, BoUpSLP &R,
18356 bool MaxVFOnly) {
18357 SmallVector<Value *, 16> BuildVectorInsts;
18358 SmallVector<Value *, 16> BuildVectorOpds;
18359 SmallVector<int> Mask;
18360 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
18361 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18362 isFixedVectorShuffle(BuildVectorOpds, Mask)))
18363 return false;
18364
18365 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
18366 R.getORE()->emit([&]() {
18367 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
18368 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
18369 "trying reduction first.";
18370 });
18371 return false;
18372 }
18373 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18374 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
18375}
18376
18377template <typename T>
18378 static bool tryToVectorizeSequence(
18379 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
18380 function_ref<bool(T *, T *)> AreCompatible,
18381 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
18382 bool MaxVFOnly, BoUpSLP &R) {
18383 bool Changed = false;
18384 // Sort by type, parent, operands.
18385 stable_sort(Incoming, Comparator);
18386
18387 // Try to vectorize elements based on their type.
18388 SmallVector<T *> Candidates;
18389 SmallVector<T *> VL;
18390 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
18391 VL.clear()) {
18392 // Look for the next elements with the same type, parent and operand
18393 // kinds.
18394 auto *I = dyn_cast<Instruction>(*IncIt);
18395 if (!I || R.isDeleted(I)) {
18396 ++IncIt;
18397 continue;
18398 }
18399 auto *SameTypeIt = IncIt;
18400 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
18401 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18402 AreCompatible(*SameTypeIt, *IncIt))) {
18403 auto *I = dyn_cast<Instruction>(*SameTypeIt);
18404 ++SameTypeIt;
18405 if (I && !R.isDeleted(I))
18406 VL.push_back(cast<T>(I));
18407 }
18408
18409 // Try to vectorize them.
18410 unsigned NumElts = VL.size();
18411 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
18412 << NumElts << ")\n");
18413 // The vectorization is a 3-state attempt:
18414 // 1. Try to vectorize instructions with the same/alternate opcodes with the
18415 // size of maximal register at first.
18416 // 2. Try to vectorize remaining instructions with the same type, if
18417 // possible. This may result in better vectorization results than trying
18418 // to vectorize only instructions with the same/alternate opcodes.
18419 // 3. Final attempt to vectorize all instructions with the
18420 // same/alternate ops only; this may result in some extra final
18421 // vectorization.
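// For example (illustrative), a sorted list {add i32, add i32, add i64, add i64}
// is processed as two separate candidate groups split at the type boundary,
// each handed to TryToVectorizeHelper on its own.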
18422 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
18423 // Success. Start over because instructions might have been changed.
18424 Changed = true;
18425 VL.swap(Candidates);
18426 Candidates.clear();
18427 for (T *V : VL) {
18428 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18429 Candidates.push_back(V);
18430 }
18431 } else {
18432 /// \Returns the minimum number of elements that we will attempt to
18433 /// vectorize.
18434 auto GetMinNumElements = [&R](Value *V) {
18435 unsigned EltSize = R.getVectorElementSize(V);
18436 return std::max(2U, R.getMaxVecRegSize() / EltSize);
18437 };
18438 if (NumElts < GetMinNumElements(*IncIt) &&
18439 (Candidates.empty() ||
18440 Candidates.front()->getType() == (*IncIt)->getType())) {
18441 for (T *V : VL) {
18442 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18443 Candidates.push_back(V);
18444 }
18445 }
18446 }
18447 // Final attempt to vectorize instructions with the same types.
18448 if (Candidates.size() > 1 &&
18449 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18450 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
18451 // Success. Start over because instructions might have been changed.
18452 Changed = true;
18453 } else if (MaxVFOnly) {
18454 // Try to vectorize using small vectors.
18455 SmallVector<T *> VL;
18456 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
18457 VL.clear()) {
18458 auto *I = dyn_cast<Instruction>(*It);
18459 if (!I || R.isDeleted(I)) {
18460 ++It;
18461 continue;
18462 }
18463 auto *SameTypeIt = It;
18464 while (SameTypeIt != End &&
18465 (!isa<Instruction>(*SameTypeIt) ||
18466 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18467 AreCompatible(*SameTypeIt, *It))) {
18468 auto *I = dyn_cast<Instruction>(*SameTypeIt);
18469 ++SameTypeIt;
18470 if (I && !R.isDeleted(I))
18471 VL.push_back(cast<T>(I));
18472 }
18473 unsigned NumElts = VL.size();
18474 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
18475 /*MaxVFOnly=*/false))
18476 Changed = true;
18477 It = SameTypeIt;
18478 }
18479 }
18480 Candidates.clear();
18481 }
18482
18483 // Start over at the next instruction of a different type (or the end).
18484 IncIt = SameTypeIt;
18485 }
18486 return Changed;
18487}
18488
18489 /// Compare two cmp instructions. If IsCompatibility is true, the function
18490 /// returns true if the two cmps have the same or swapped predicates and
18491 /// compatible corresponding operands. If IsCompatibility is false, the
18492 /// function implements a strict weak ordering relation between two cmp
18493 /// instructions, returning true if the first instruction is "less" than the
18494 /// second, i.e. its predicate is less than the predicate of the second or
18495 /// its operand IDs are less than those of the second cmp instruction.
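/// For example (illustrative), %c1 = icmp slt i32 %a, %b and
/// %c2 = icmp sgt i32 %b, %a use swapped predicates on swapped operands, so
/// compareCmp<true> treats them as compatible.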
18496template <bool IsCompatibility>
18497static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
18498 const DominatorTree &DT) {
18499 assert(isValidElementType(V->getType()) &&
18500 isValidElementType(V2->getType()) &&
18501 "Expected valid element types only.");
18502 if (V == V2)
18503 return IsCompatibility;
18504 auto *CI1 = cast<CmpInst>(V);
18505 auto *CI2 = cast<CmpInst>(V2);
18506 if (CI1->getOperand(0)->getType()->getTypeID() <
18507 CI2->getOperand(0)->getType()->getTypeID())
18508 return !IsCompatibility;
18509 if (CI1->getOperand(0)->getType()->getTypeID() >
18510 CI2->getOperand(0)->getType()->getTypeID())
18511 return false;
18512 CmpInst::Predicate Pred1 = CI1->getPredicate();
18513 CmpInst::Predicate Pred2 = CI2->getPredicate();
18514 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
18515 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
18516 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
18517 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
18518 if (BasePred1 < BasePred2)
18519 return !IsCompatibility;
18520 if (BasePred1 > BasePred2)
18521 return false;
18522 // Compare operands.
18523 bool CI1Preds = Pred1 == BasePred1;
18524 bool CI2Preds = Pred2 == BasePred1;
18525 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
18526 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
18527 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
18528 if (Op1 == Op2)
18529 continue;
18530 if (Op1->getValueID() < Op2->getValueID())
18531 return !IsCompatibility;
18532 if (Op1->getValueID() > Op2->getValueID())
18533 return false;
18534 if (auto *I1 = dyn_cast<Instruction>(Op1))
18535 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
18536 if (IsCompatibility) {
18537 if (I1->getParent() != I2->getParent())
18538 return false;
18539 } else {
18540 // Try to compare nodes with same parent.
18541 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
18542 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
18543 if (!NodeI1)
18544 return NodeI2 != nullptr;
18545 if (!NodeI2)
18546 return false;
18547 assert((NodeI1 == NodeI2) ==
18548 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18549 "Different nodes should have different DFS numbers");
18550 if (NodeI1 != NodeI2)
18551 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18552 }
18553 InstructionsState S = getSameOpcode({I1, I2}, TLI);
18554 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18555 continue;
18556 if (IsCompatibility)
18557 return false;
18558 if (I1->getOpcode() != I2->getOpcode())
18559 return I1->getOpcode() < I2->getOpcode();
18560 }
18561 }
18562 return IsCompatibility;
18563}
18564
18565template <typename ItT>
18566bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
18567 BasicBlock *BB, BoUpSLP &R) {
18568 bool Changed = false;
18569 // Try to find reductions first.
18570 for (CmpInst *I : CmpInsts) {
18571 if (R.isDeleted(I))
18572 continue;
18573 for (Value *Op : I->operands())
18574 if (auto *RootOp = dyn_cast<Instruction>(Op))
18575 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
18576 }
18577 // Try to vectorize operands as vector bundles.
18578 for (CmpInst *I : CmpInsts) {
18579 if (R.isDeleted(I))
18580 continue;
18581 Changed |= tryToVectorize(I, R);
18582 }
18583 // Try to vectorize list of compares.
18584 // Sort by type, compare predicate, etc.
18585 auto CompareSorter = [&](Value *V, Value *V2) {
18586 if (V == V2)
18587 return false;
18588 return compareCmp<false>(V, V2, *TLI, *DT);
18589 };
18590
18591 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
18592 if (V1 == V2)
18593 return true;
18594 return compareCmp<true>(V1, V2, *TLI, *DT);
18595 };
18596
18597 SmallVector<Value *> Vals;
18598 for (Instruction *V : CmpInsts)
18599 if (!R.isDeleted(V) && isValidElementType(V->getType()))
18600 Vals.push_back(V);
18601 if (Vals.size() <= 1)
18602 return Changed;
18603 Changed |= tryToVectorizeSequence<Value>(
18604 Vals, CompareSorter, AreCompatibleCompares,
18605 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18606 // Exclude possible reductions from other blocks.
18607 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
18608 return any_of(V->users(), [V](User *U) {
18609 auto *Select = dyn_cast<SelectInst>(U);
18610 return Select &&
18611 Select->getParent() != cast<Instruction>(V)->getParent();
18612 });
18613 });
18614 if (ArePossiblyReducedInOtherBlock)
18615 return false;
18616 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18617 },
18618 /*MaxVFOnly=*/true, R);
18619 return Changed;
18620}
18621
18622bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18623 BasicBlock *BB, BoUpSLP &R) {
18624 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18625 "This function only accepts Insert instructions");
18626 bool OpsChanged = false;
18627 SmallVector<WeakTrackingVH> PostponedInsts;
18628 for (auto *I : reverse(Instructions)) {
18629 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
18630 if (R.isDeleted(I) || isa<CmpInst>(I))
18631 continue;
18632 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18633 OpsChanged |=
18634 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
18635 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18636 OpsChanged |=
18637 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
18638 }
18639 // pass2 - try to vectorize reductions only
18640 if (R.isDeleted(I))
18641 continue;
18642 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
18643 if (R.isDeleted(I) || isa<CmpInst>(I))
18644 continue;
18645 // pass3 - try to match and vectorize a buildvector sequence.
18646 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18647 OpsChanged |=
18648 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
18649 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18650 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
18651 /*MaxVFOnly=*/false);
18652 }
18653 }
18654 // Now try to vectorize postponed instructions.
18655 OpsChanged |= tryToVectorize(PostponedInsts, R);
18656
18657 Instructions.clear();
18658 return OpsChanged;
18659}
18660
18661bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
18662 bool Changed = false;
18663 SmallVector<Value *, 4> Incoming;
18664 SmallPtrSet<Value *, 16> VisitedInstrs;
18665 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
18666 // node. This makes it easier to identify the chains that can be vectorized
18667 // in a better way.
18668 DenseMap<Value *, SmallVector<Value *>> PHIToOpcodes;
18669 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
18670 assert(isValidElementType(V1->getType()) &&
18671 isValidElementType(V2->getType()) &&
18672 "Expected vectorizable types only.");
18673 // It is fine to compare type IDs here, since we expect only vectorizable
18674 // types, like ints, floats and pointers; we don't care about other types.
18675 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
18676 return true;
18677 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
18678 return false;
18679 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18680 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18681 if (Opcodes1.size() < Opcodes2.size())
18682 return true;
18683 if (Opcodes1.size() > Opcodes2.size())
18684 return false;
18685 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18686 {
18687 // Instructions come first.
18688 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
18689 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
18690 if (I1 && I2) {
18691 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
18692 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
18693 if (!NodeI1)
18694 return NodeI2 != nullptr;
18695 if (!NodeI2)
18696 return false;
18697 assert((NodeI1 == NodeI2) ==
18698 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18699 "Different nodes should have different DFS numbers");
18700 if (NodeI1 != NodeI2)
18701 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18702 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18703 if (S.getOpcode() && !S.isAltShuffle())
18704 continue;
18705 return I1->getOpcode() < I2->getOpcode();
18706 }
18707 if (I1)
18708 return true;
18709 if (I2)
18710 return false;
18711 }
18712 {
18713 // Non-undef constants come next.
18714 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
18715 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
18716 if (C1 && C2)
18717 continue;
18718 if (C1)
18719 return true;
18720 if (C2)
18721 return false;
18722 }
18723 bool U1 = isa<UndefValue>(Opcodes1[I]);
18724 bool U2 = isa<UndefValue>(Opcodes2[I]);
18725 {
18726 // Non-constant non-instructions come next.
18727 if (!U1 && !U2) {
18728 auto ValID1 = Opcodes1[I]->getValueID();
18729 auto ValID2 = Opcodes2[I]->getValueID();
18730 if (ValID1 == ValID2)
18731 continue;
18732 if (ValID1 < ValID2)
18733 return true;
18734 if (ValID1 > ValID2)
18735 return false;
18736 }
18737 if (!U1)
18738 return true;
18739 if (!U2)
18740 return false;
18741 }
18742 // Undefs come last.
18743 assert(U1 && U2 && "The only thing left should be undef & undef.");
18744 continue;
18745 }
18746 return false;
18747 };
18748 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
18749 if (V1 == V2)
18750 return true;
18751 if (V1->getType() != V2->getType())
18752 return false;
18753 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18754 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18755 if (Opcodes1.size() != Opcodes2.size())
18756 return false;
18757 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18758 // Undefs are compatible with any other value.
18759 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
18760 continue;
18761 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
18762 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
18763 if (R.isDeleted(I1) || R.isDeleted(I2))
18764 return false;
18765 if (I1->getParent() != I2->getParent())
18766 return false;
18767 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18768 if (S.getOpcode())
18769 continue;
18770 return false;
18771 }
18772 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
18773 continue;
18774 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
18775 return false;
18776 }
18777 return true;
18778 };
18779
18780 bool HaveVectorizedPhiNodes = false;
18781 do {
18782 // Collect the incoming values from the PHIs.
18783 Incoming.clear();
18784 for (Instruction &I : *BB) {
18785 auto *P = dyn_cast<PHINode>(&I);
18786 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
18787 break;
18788
18789 // No need to analyze deleted, vectorized and non-vectorizable
18790 // instructions.
18791 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
18792 isValidElementType(P->getType()))
18793 Incoming.push_back(P);
18794 }
18795
18796 if (Incoming.size() <= 1)
18797 break;
18798
18799 // Find the corresponding non-phi nodes for better matching when trying to
18800 // build the tree.
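// For example (illustrative), for %p = phi [ %a, %bb1 ], [ %q, %bb2 ] where %q
// is itself a phi, the walk looks through %q and records the non-phi values
// reachable through it (together with %a) in PHIToOpcodes for %p.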
18801 for (Value *V : Incoming) {
18802 SmallVectorImpl<Value *> &Opcodes =
18803 PHIToOpcodes.try_emplace(V).first->getSecond();
18804 if (!Opcodes.empty())
18805 continue;
18806 SmallVector<Value *, 4> Nodes(1, V);
18807 SmallPtrSet<PHINode *, 4> Visited;
18808 while (!Nodes.empty()) {
18809 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
18810 if (!Visited.insert(PHI).second)
18811 continue;
18812 for (Value *V : PHI->incoming_values()) {
18813 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
18814 Nodes.push_back(PHI1);
18815 continue;
18816 }
18817 Opcodes.emplace_back(V);
18818 }
18819 }
18820 }
18821
18822 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18823 Incoming, PHICompare, AreCompatiblePHIs,
18824 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18825 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18826 },
18827 /*MaxVFOnly=*/true, R);
18828 Changed |= HaveVectorizedPhiNodes;
18829 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
18830 auto *PHI = dyn_cast<PHINode>(P.first);
18831 return !PHI || R.isDeleted(PHI);
18832 }))
18833 PHIToOpcodes.clear();
18834 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
18835 } while (HaveVectorizedPhiNodes);
18836
18837 VisitedInstrs.clear();
18838
18839 InstSetVector PostProcessInserts;
18840 SmallSetVector<CmpInst *, 8> PostProcessCmps;
18841 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
18842 // also vectorizes `PostProcessCmps`.
18843 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
18844 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
18845 if (VectorizeCmps) {
18846 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
18847 PostProcessCmps.clear();
18848 }
18849 PostProcessInserts.clear();
18850 return Changed;
18851 };
18852 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
18853 auto IsInPostProcessInstrs = [&](Instruction *I) {
18854 if (auto *Cmp = dyn_cast<CmpInst>(I))
18855 return PostProcessCmps.contains(Cmp);
18856 return isa<InsertElementInst, InsertValueInst>(I) &&
18857 PostProcessInserts.contains(I);
18858 };
18859 // Returns true if `I` is an instruction without users, such as a terminator,
18860 // a store, or a call with an ignored return value. The check is based on the
18861 // instruction's result type being void, except for CallInst and InvokeInst.
18862 auto HasNoUsers = [](Instruction *I) {
18863 return I->use_empty() &&
18864 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
18865 };
18866 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
18867 // Skip instructions with scalable types. The number of elements is unknown
18868 // at compile time for scalable types.
18869 if (isa<ScalableVectorType>(It->getType()))
18870 continue;
18871
18872 // Skip instructions marked for deletion.
18873 if (R.isDeleted(&*It))
18874 continue;
18875 // We may go through BB multiple times, so skip instructions we have already checked.
18876 if (!VisitedInstrs.insert(&*It).second) {
18877 if (HasNoUsers(&*It) &&
18878 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
18879 // We would like to start over since some instructions are deleted
18880 // and the iterator may become invalid.
18881 Changed = true;
18882 It = BB->begin();
18883 E = BB->end();
18884 }
18885 continue;
18886 }
18887
18888 if (isa<DbgInfoIntrinsic>(It))
18889 continue;
18890
18891 // Try to vectorize reductions that use PHINodes.
18892 if (PHINode *P = dyn_cast<PHINode>(It)) {
18893 // Check that the PHI is a reduction PHI.
18894 if (P->getNumIncomingValues() == 2) {
18895 // Try to match and vectorize a horizontal reduction.
18896 Instruction *Root = getReductionInstr(DT, P, BB, LI);
18897 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
18898 Changed = true;
18899 It = BB->begin();
18900 E = BB->end();
18901 continue;
18902 }
18903 }
18904 // Try to vectorize the incoming values of the PHI, to catch reductions
18905 // that feed into PHIs.
18906 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
18907 // Skip if the incoming block is the current BB for now. Also, bypass
18908 // unreachable IR for efficiency and to avoid crashing.
18909 // TODO: Collect the skipped incoming values and try to vectorize them
18910 // after processing BB.
18911 if (BB == P->getIncomingBlock(I) ||
18912 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
18913 continue;
18914
18915 // Postponed instructions should not be vectorized here, delay their
18916 // vectorization.
18917 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
18918 PI && !IsInPostProcessInstrs(PI)) {
18919 bool Res = vectorizeRootInstruction(nullptr, PI,
18920 P->getIncomingBlock(I), R, TTI);
18921 Changed |= Res;
18922 if (Res && R.isDeleted(P)) {
18923 It = BB->begin();
18924 E = BB->end();
18925 break;
18926 }
18927 }
18928 }
18929 continue;
18930 }
18931
18932 if (HasNoUsers(&*It)) {
18933 bool OpsChanged = false;
18934 auto *SI = dyn_cast<StoreInst>(It);
18935 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
18936 if (SI) {
18937 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
18938 // Try to vectorize chain in store, if this is the only store to the
18939 // address in the block.
18940 // TODO: This is just a temporary solution to save compile time. Need
18941 // to investigate if we can safely turn on slp-vectorize-hor-store
18942 // instead to allow lookup for reduction chains in all non-vectorized
18943 // stores (need to check side effects and compile time).
18944 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
18945 SI->getValueOperand()->hasOneUse();
18946 }
18947 if (TryToVectorizeRoot) {
18948 for (auto *V : It->operand_values()) {
18949 // Postponed instructions should not be vectorized here, delay their
18950 // vectorization.
18951 if (auto *VI = dyn_cast<Instruction>(V);
18952 VI && !IsInPostProcessInstrs(VI))
18953 // Try to match and vectorize a horizontal reduction.
18954 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
18955 }
18956 }
18957 // Start vectorization of post-process list of instructions from the
18958 // top-tree instructions to try to vectorize as many instructions as
18959 // possible.
18960 OpsChanged |=
18961 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
18962 if (OpsChanged) {
18963 // We would like to start over since some instructions are deleted
18964 // and the iterator may become invalid.
18965 Changed = true;
18966 It = BB->begin();
18967 E = BB->end();
18968 continue;
18969 }
18970 }
18971
18972 if (isa<InsertElementInst, InsertValueInst>(It))
18973 PostProcessInserts.insert(&*It);
18974 else if (isa<CmpInst>(It))
18975 PostProcessCmps.insert(cast<CmpInst>(&*It));
18976 }
18977
18978 return Changed;
18979}
18980
18981bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
18982 auto Changed = false;
18983 for (auto &Entry : GEPs) {
18984 // If the getelementptr list has fewer than two elements, there's nothing
18985 // to do.
18986 if (Entry.second.size() < 2)
18987 continue;
18988
18989 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
18990 << Entry.second.size() << ".\n");
18991
18992 // Process the GEP list in chunks suitable for the target's supported
18993 // vector size. If a vector register can't hold 1 element, we are done. We
18994 // are trying to vectorize the index computations, so the maximum number of
18995 // elements is based on the size of the index expression, rather than the
18996 // size of the GEP itself (the target's pointer size).
18997 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
18998 return !R.isDeleted(GEP);
18999 });
19000 if (It == Entry.second.end())
19001 continue;
19002 unsigned MaxVecRegSize = R.getMaxVecRegSize();
19003 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
19004 if (MaxVecRegSize < EltSize)
19005 continue;
19006
19007 unsigned MaxElts = MaxVecRegSize / EltSize;
19008 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
19009 auto Len = std::min<unsigned>(BE - BI, MaxElts);
19010 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
19011
19012 // Initialize a set of candidate getelementptrs. Note that we use a
19013 // SetVector here to preserve program order. If the index computations
19014 // are vectorizable and begin with loads, we want to minimize the chance
19015 // of having to reorder them later.
19016 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
19017
19018 // Some of the candidates may have already been vectorized after we
19019 // initially collected them, or their index was optimized to a constant value.
19020 // If so, they are marked as deleted, so remove them from the set of
19021 // candidates.
19022 Candidates.remove_if([&R](Value *I) {
19023 return R.isDeleted(cast<Instruction>(I)) ||
19024 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
19025 });
19026
19027 // Remove from the set of candidates all pairs of getelementptrs with
19028 // constant differences. Such getelementptrs are likely not good
19029 // candidates for vectorization in a bottom-up phase since one can be
19030 // computed from the other. We also ensure all candidate getelementptr
19031 // indices are unique.
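// For example (illustrative), %g1 = getelementptr i32, ptr %p, i64 %i and
// %g2 = getelementptr i32, ptr %p, i64 %j, where SCEV proves %j == %i + 1,
// differ by a constant, so both are removed from the candidate set.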
19032 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
19033 auto *GEPI = GEPList[I];
19034 if (!Candidates.count(GEPI))
19035 continue;
19036 auto *SCEVI = SE->getSCEV(GEPList[I]);
19037 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
19038 auto *GEPJ = GEPList[J];
19039 auto *SCEVJ = SE->getSCEV(GEPList[J]);
19040 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
19041 Candidates.remove(GEPI);
19042 Candidates.remove(GEPJ);
19043 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
19044 Candidates.remove(GEPJ);
19045 }
19046 }
19047 }
19048
19049 // We break out of the above computation as soon as we know there are
19050 // fewer than two candidates remaining.
19051 if (Candidates.size() < 2)
19052 continue;
19053
19054 // Add the single, non-constant index of each candidate to the bundle. We
19055 // ensured the indices met these constraints when we originally collected
19056 // the getelementptrs.
19057 SmallVector<Value *, 16> Bundle(Candidates.size());
19058 auto BundleIndex = 0u;
19059 for (auto *V : Candidates) {
19060 auto *GEP = cast<GetElementPtrInst>(V);
19061 auto *GEPIdx = GEP->idx_begin()->get();
19062 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
19063 Bundle[BundleIndex++] = GEPIdx;
19064 }
19065
19066 // Try and vectorize the indices. We are currently only interested in
19067 // gather-like cases of the form:
19068 //
19069 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
19070 //
19071 // where the loads of "a", the loads of "b", and the subtractions can be
19072 // performed in parallel. It's likely that detecting this pattern in a
19073 // bottom-up phase will be simpler and less costly than building a
19074 // full-blown top-down phase beginning at the consecutive loads.
19075 Changed |= tryToVectorizeList(Bundle, R);
19076 }
19077 }
19078 return Changed;
19079}
19080
19081bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
19082 bool Changed = false;
19083 // Sort by type, base pointers and value operands. Value operands must be
19084 // compatible (have the same opcode, same parent), otherwise it is
19085 // definitely not profitable to try to vectorize them.
19086 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
19087 if (V->getValueOperand()->getType()->getTypeID() <
19088 V2->getValueOperand()->getType()->getTypeID())
19089 return true;
19090 if (V->getValueOperand()->getType()->getTypeID() >
19091 V2->getValueOperand()->getType()->getTypeID())
19092 return false;
19093 if (V->getPointerOperandType()->getTypeID() <
19094 V2->getPointerOperandType()->getTypeID())
19095 return true;
19096 if (V->getPointerOperandType()->getTypeID() >
19097 V2->getPointerOperandType()->getTypeID())
19098 return false;
19099 // UndefValues are compatible with all other values.
19100 if (isa<UndefValue>(V->getValueOperand()) ||
19101 isa<UndefValue>(V2->getValueOperand()))
19102 return false;
19103 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
19104 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
19105 DomTreeNodeBase<BasicBlock> *NodeI1 =
19106 DT->getNode(I1->getParent());
19107 DomTreeNodeBase<BasicBlock> *NodeI2 =
19108 DT->getNode(I2->getParent());
19109 assert(NodeI1 && "Should only process reachable instructions");
19110 assert(NodeI2 && "Should only process reachable instructions");
19111 assert((NodeI1 == NodeI2) ==
19112 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
19113 "Different nodes should have different DFS numbers");
19114 if (NodeI1 != NodeI2)
19115 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
19116 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
19117 if (S.getOpcode())
19118 return false;
19119 return I1->getOpcode() < I2->getOpcode();
19120 }
19121 if (isa<Constant>(V->getValueOperand()) &&
19122 isa<Constant>(V2->getValueOperand()))
19123 return false;
19124 return V->getValueOperand()->getValueID() <
19125 V2->getValueOperand()->getValueID();
19126 };
19127
19128 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
19129 if (V1 == V2)
19130 return true;
19131 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
19132 return false;
19133 if (V1->getPointerOperandType() != V2->getPointerOperandType())
19134 return false;
19135 // Undefs are compatible with any other value.
19136 if (isa<UndefValue>(V1->getValueOperand()) ||
19137 isa<UndefValue>(V2->getValueOperand()))
19138 return true;
19139 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
19140 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
19141 if (I1->getParent() != I2->getParent())
19142 return false;
19143 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
19144 return S.getOpcode() > 0;
19145 }
19146 if (isa<Constant>(V1->getValueOperand()) &&
19147 isa<Constant>(V2->getValueOperand()))
19148 return true;
19149 return V1->getValueOperand()->getValueID() ==
19150 V2->getValueOperand()->getValueID();
19151 };
19152
19153 // Attempt to sort and vectorize each of the store-groups.
19153 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
19154 Attempted;
19155 for (auto &Pair : Stores) {
19156 if (Pair.second.size() < 2)
19157 continue;
19158
19159 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
19160 << Pair.second.size() << ".\n");
19161
19162 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
19163 continue;
19164
19165 // Reverse stores to do bottom-to-top analysis. This is important if the
19166 // values are stored to the same addresses several times; in this case we
19167 // need to follow the store order (reversed to meet the memory dependencies).
19168 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
19169 Pair.second.rend());
19170 Changed |= tryToVectorizeSequence<StoreInst>(
19171 ReversedStores, StoreSorter, AreCompatibleStores,
19172 [&](ArrayRef<StoreInst *> Candidates, bool) {
19173 return vectorizeStores(Candidates, R, Attempted);
19174 },
19175 /*MaxVFOnly=*/false, R);
19176 }
19177 return Changed;
19178}
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:537
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:947
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
IRTranslator LLVM IR MI
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
Loop::LoopBounds::Direction Direction
Definition: LoopInfo.cpp:231
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(VerifyEach)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and vector costs of vectorizing a set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the argument types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:78
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1310
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:180
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:266
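The APInt helpers above are the usual building blocks for demanded-element masks. A minimal sketch, assuming an 8-lane mask (the function name demandedUpperHalf and the lane choices are illustrative only):

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Build a demanded-lanes mask: lanes 4..7 plus lane 1.
APInt demandedUpperHalf() {
  APInt Demanded = APInt::getZero(8);        // all lanes clear
  Demanded |= APInt::getBitsSetFrom(8, 4);   // set bits 4..7
  Demanded.setBit(1);                        // additionally demand lane 1
  return Demanded;                           // 0b11110010
}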
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:424
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:187
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:174
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:228
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
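ArrayRef slicing is how operand bundles are partitioned without copying. A small sketch, assuming an even-sized bundle (splitBundle is an illustrative name, not an API from this file):

#include "llvm/ADT/ArrayRef.h"
#include <utility>
using namespace llvm;

// Split a bundle into two halves; take_front/drop_front create views, not copies.
std::pair<ArrayRef<int>, ArrayRef<int>> splitBundle(ArrayRef<int> VL) {
  unsigned Half = VL.size() / 2;
  return {VL.take_front(Half), VL.drop_front(Half)};
}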
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:232
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:451
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:438
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:169
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
reverse_iterator rend()
Definition: BasicBlock.h:456
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:167
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2070
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1965
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2207
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2064
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1323
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1401
unsigned arg_size() const
Definition: InstrTypes.h:1408
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2061
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:530
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:747
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1104
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:787
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:781
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:785
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:909
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:871
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:847
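The swapped/inverse predicate queries are what compatibility checks such as isCmpSameOrSwapped rely on. A hedged sketch using the static forms of these helpers (the function name is illustrative):

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Swapping icmp operands swaps the predicate; negating the condition inverts it.
bool predicateRoundTrip() {
  CmpInst::Predicate Swapped = CmpInst::getSwappedPredicate(CmpInst::ICMP_SLT);
  CmpInst::Predicate Inverse = CmpInst::getInversePredicate(CmpInst::ICMP_SLT);
  return Swapped == CmpInst::ICMP_SGT && Inverse == CmpInst::ICMP_SGE;
}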
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2269
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:155
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1450
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1399
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
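Splat constants built from these helpers are typical neutral-value or mask operands for vector code. A minimal sketch (makeSplatOfAllOnes is an illustrative name):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// <4 x i32> with every lane set to -1 (all bits set).
Constant *makeSplatOfAllOnes(LLVMContext &Ctx) {
  Constant *AllOnes = Constant::getAllOnesValue(Type::getInt32Ty(Ctx));
  return ConstantVector::getSplat(ElementCount::getFixed(4), AllOnes);
}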
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:484
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:905
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
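Register-size checks combine a DataLayout type-size query with a target register width. A hedged sketch (fitsInRegister and its parameters are illustrative, not part of this file):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// True if VF scalars of ScalarTy fit into RegWidthInBits bits.
bool fitsInRegister(const DataLayout &DL, Type *ScalarTy, unsigned VF,
                    unsigned RegWidthInBits) {
  return DL.getTypeSizeInBits(ScalarTy).getFixedValue() * VF <= RegWidthInBits;
}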
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:211
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
value_type & FindAndConstruct(const KeyT &Key)
Definition: DenseMap.h:364
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
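try_emplace and lookup are the usual DenseMap idioms for histogram-style bookkeeping. A small sketch (recordWidth is an illustrative name):

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

// try_emplace inserts a zero count only for new keys; lookup returns a
// default-constructed value for missing keys.
unsigned recordWidth(DenseMap<unsigned, unsigned> &Histogram, unsigned Bits) {
  ++Histogram.try_emplace(Bits, 0u).first->second;
  return Histogram.lookup(Bits);
}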
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Type * getReturnType() const
Definition: DerivedTypes.h:124
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:915
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2262
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:922
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:508
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2465
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2270
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1812
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2540
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:308
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:217
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1871
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:845
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1758
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:483
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2371
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2402
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2254
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2499
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:468
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1671
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:166
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2278
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2166
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2201
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1831
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2417
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1592
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1366
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:631
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671
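CreateShuffleVector with a constant mask is how two vectorized halves get recombined. A minimal sketch interleaving two <4 x i32> values, using the ArrayRef<int>-mask overload of CreateShuffleVector (interleave is an illustrative name):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Interleave the lanes of A and B: A0,B0,A1,B1,A2,B2,A3,B3.
Value *interleave(IRBuilder<> &Builder, Value *A, Value *B) {
  int Mask[] = {0, 4, 1, 5, 2, 6, 3, 7};
  return Builder.CreateShuffleVector(A, B, Mask, "interleaved");
}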
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:282
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:754
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:279
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:280
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:174
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:259
Value * getPointerOperand()
Definition: Instructions.h:253
bool isSimple() const
Definition: Instructions.h:245
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:209
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:571
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
Definition: MapVector.h:64
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
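MapVector's insertion-order iteration keeps value-keyed bookkeeping deterministic. A small sketch (orderPreservingCounts is an illustrative name):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
using namespace llvm;

// Count duplicates while preserving first-seen key order; takeVector() then
// hands back the underlying (key, count) vector without copying.
auto orderPreservingCounts(ArrayRef<int> Keys) {
  MapVector<int, unsigned> Counts;
  for (int K : Keys)
    ++Counts.try_emplace(K, 0u).first->second;
  return Counts.takeVector();
}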
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
T & front() const
front - Get the first element.
Definition: ArrayRef.h:363
iterator end() const
Definition: ArrayRef.h:357
iterator begin() const
Definition: ArrayRef.h:356
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:376
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:449
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:155
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1852
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
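Consecutive-access checks ultimately ask SCEV whether two pointers differ by a known constant. A simplified, hedged sketch of that idea; the pass itself goes through higher-level helpers, and pointersDifferBy with its ExpectedBytes parameter is illustrative only:

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;

// True if SCEV proves that PtrB - PtrA is exactly ExpectedBytes.
bool pointersDifferBy(ScalarEvolution &SE, Value *PtrA, Value *PtrB,
                      uint64_t ExpectedBytes) {
  const SCEV *Diff = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (const auto *C = dyn_cast<SCEVConstant>(Diff))
    return C->getAPInt() == ExpectedBytes;
  return false;
}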
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
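These static mask predicates let the cost model classify a mask before choosing a TTI::ShuffleKind. A minimal sketch (classifyMask and the returned strings are illustrative):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Report the cheapest shuffle category the mask falls into.
StringRef classifyMask(ArrayRef<int> Mask, int NumSrcElts) {
  if (ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
    return "identity";
  if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
    return "reverse";
  int Index;
  if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index))
    return "extract-subvector";
  return "generic";
}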
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
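find_first/find_next is the standard way to walk the set bits of a use mask. A small sketch (the 8-bit width and lane choices are illustrative):

#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

// Count the demanded lanes by iterating the set bits.
unsigned countDemandedLanes() {
  SmallBitVector UseMask(8, false);
  UseMask.set(1);
  UseMask.set(5);
  unsigned Demanded = 0;
  for (int I = UseMask.find_first(); I != -1; I = UseMask.find_next(I))
    ++Demanded;
  return Demanded; // 2, the same answer as UseMask.count()
}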
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:323
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:361
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:418
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:981
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:290
Type * getPointerOperandType() const
Definition: Instructions.h:380
Value * getValueOperand()
Definition: Instructions.h:374
Value * getPointerOperand()
Definition: Instructions.h:377
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
OperandValueKind
Additional information about an operand's possible values.
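The cost queries above are what the scalar-versus-vector comparison boils down to. A deliberately simplified sketch that ignores gather/extract overhead (vectorAddLooksProfitable is an illustrative name, not this file's cost model):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Compare VF scalar adds against one vector add of the same element type.
bool vectorAddLooksProfitable(const TargetTransformInfo &TTInfo,
                              Type *ScalarTy, unsigned VF) {
  auto CostKind = TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost ScalarCost =
      TTInfo.getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind) * VF;
  InstructionCost VectorCost = TTInfo.getArithmeticInstrCost(
      Instruction::Add, FixedVectorType::get(ScalarTy, VF), CostKind);
  return VectorCost <= ScalarCost;
}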
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:160
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:287
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:166
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1833
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Definition: User.h:73
op_iterator op_begin()
Definition: User.h:234
Value * getOperand(unsigned i) const
Definition: User.h:169
iterator_range< value_op_iterator > operand_values()
Definition: User.h:266
The Vector Function Database.
Definition: VectorUtils.h:30
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:71
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
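Walking users() is the core of the external-use scan: a vectorized scalar with a user outside the tree later needs an extractelement. A hedged sketch (hasUserOutside and the InTree set are illustrative):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// True if Scalar has at least one user that is not part of the vectorized tree.
bool hasUserOutside(const Value *Scalar,
                    const SmallPtrSetImpl<const User *> &InTree) {
  for (const User *U : Scalar->users())
    if (!InTree.contains(U))
      return true;
  return false;
}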
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter-vectorized load, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
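Taken together, the BoUpSLP entry points listed above are usually driven in a fixed sequence. The following is a hedged sketch, written as if it lived inside SLPVectorizer.cpp; the function name and the Threshold parameter are illustrative assumptions, and the call order only mirrors how the pass appears to use these entry points.

// Hedged driver sketch over the BoUpSLP API above. "Threshold" is a
// hypothetical caller-provided bound, not the pass's actual option.
static bool tryVectorizeList(BoUpSLP &R, ArrayRef<Value *> VL,
                             InstructionCost Threshold) {
  SmallDenseSet<Value *> UserIgnoreList; // nothing to ignore in this sketch
  R.buildTree(VL, UserIgnoreList);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();   // pick a profitable order, root to leaves
  R.reorderBottomToTop();   // and refine it from leaves to root
  R.transformNodes();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  InstructionCost Cost = R.getTreeCost();
  if (Cost >= Threshold)    // not profitable enough
    return false;
  R.vectorizeTree();
  return true;
}

optimizeGatherSequence() is left out here because the pass appears to run it once over the whole function rather than once per candidate tree.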
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:105
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:811
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1513
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
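The PatternMatch helpers above compose structurally. As a minimal illustration (the helper name and the specific pattern are assumptions, not code from this pass), matching a zero-extended load feeding a shift looks like this:

// Illustrative only: returns true for "shl (zext (load %p)), C", binding the
// loaded pointer and the constant shift amount.
#include "llvm/IR/PatternMatch.h"
static bool isShiftedZExtLoad(llvm::Value *V) {
  using namespace llvm;
  using namespace llvm::PatternMatch;
  Value *Ptr;
  const APInt *ShAmt;
  return match(V, m_Shl(m_ZExt(m_Load(m_Value(Ptr))), m_APInt(ShAmt)));
}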
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
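A hedged usage sketch for getPointersDiff: deciding whether two loads touch adjacent elements. The wrapper name is an assumption; only the getPointersDiff call itself follows the signature above, and the returned distance is in units of the element type when the pointers are comparable.

// Sketch: does LB load the element immediately after LA?
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include <optional>
static bool areConsecutiveLoads(llvm::LoadInst *LA, llvm::LoadInst *LB,
                                const llvm::DataLayout &DL,
                                llvm::ScalarEvolution &SE) {
  std::optional<int> Diff = llvm::getPointersDiff(
      LA->getType(), LA->getPointerOperand(), LB->getType(),
      LB->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1; // one element apart => consecutive
}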
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:853
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1715
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:128
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:988
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:540
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
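A small hedged sketch combining getVectorIntrinsicIDForCall with isTriviallyVectorizable (listed further below); the wrapper name is an assumption.

// Sketch: treat a call as a vectorization candidate only if it maps to an
// intrinsic whose vector form is trivially available.
#include "llvm/Analysis/VectorUtils.h"
static bool isVectorizableIntrinsicCall(const llvm::CallInst *CI,
                                        const llvm::TargetLibraryInfo *TLI) {
  using namespace llvm;
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  return ID != Intrinsic::not_intrinsic && isTriviallyVectorizable(ID);
}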
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
Definition: LoopUtils.cpp:1210
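A minimal hedged sketch of driving createSimpleTargetReduction with the signature shown above; the function name is an assumption, and Builder is assumed to be positioned where the scalar result is needed.

// Sketch: reduce an integer vector to a scalar sum.
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
static llvm::Value *emitAddReduction(llvm::IRBuilderBase &Builder,
                                     llvm::Value *Vec) {
  return llvm::createSimpleTargetReduction(Builder, Vec, llvm::RecurKind::Add);
}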
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
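The range helpers above (all_of, enumerate, and friends) replace explicit iterator loops. A small assumed example, where VL stands in for any ArrayRef<Value *>:

// Assumed example: check a property over the whole range with all_of, then
// walk it together with lane indices via enumerate.
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Debug.h"
static void dumpLanes(llvm::ArrayRef<llvm::Value *> VL) {
  using namespace llvm;
  if (!all_of(VL, [](Value *V) { return isa<Instruction>(V); }))
    return;
  for (auto [Lane, V] : enumerate(VL))
    dbgs() << "lane " << Lane << ": " << *V << "\n";
}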
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7128
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1671
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
Definition: MathExtras.h:547
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
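A hedged sketch around propagateMetadata: after a wide load replaces a group of scalar loads, the metadata kinds listed above are intersected onto it. The function name and surrounding setup are assumptions.

// Sketch: emit one wide load for the scalar loads in VL and carry over only
// the metadata common to all of them (TBAA, alias scopes, fpmath, ...).
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IRBuilder.h"
static llvm::Value *emitWideLoad(llvm::IRBuilderBase &Builder,
                                 llvm::FixedVectorType *VecTy, llvm::Value *Ptr,
                                 llvm::Align Alignment,
                                 llvm::ArrayRef<llvm::Value *> VL) {
  llvm::LoadInst *VecLd = Builder.CreateAlignedLoad(VecTy, Ptr, Alignment);
  llvm::propagateMetadata(VecLd, VL); // intersect metadata from the scalars
  return VecLd;
}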
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2059
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:400
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
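inversePermutation carries no brief here; based on how the reordering code uses it, a plausible reading of its contract (an assumption, not the verbatim implementation) is that it builds the shuffle mask undoing a given lane order:

// Assumed contract: Mask[Indices[I]] == I for every lane I, with unset lanes
// marked by PoisonMaskElem (i.e. -1).
// Example: Indices = {2, 0, 1}  ==>  Mask = {1, 2, 0}.
static void inversePermutationSketch(llvm::ArrayRef<unsigned> Indices,
                                     llvm::SmallVectorImpl<int> &Mask) {
  Mask.assign(Indices.size(), llvm::PoisonMaskElem);
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    Mask[Indices[I]] = I;
}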
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1754
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:120
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:419
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
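A hedged sketch around sortPtrAccesses; the wrapper name and the "empty order means already sorted" convention are assumptions.

// Sketch: produce Ptrs re-ordered by increasing address, or fail if the
// relative positions cannot be computed.
#include "llvm/Analysis/LoopAccessAnalysis.h"
static bool orderByAddress(llvm::ArrayRef<llvm::Value *> Ptrs,
                           llvm::Type *ElemTy, const llvm::DataLayout &DL,
                           llvm::ScalarEvolution &SE,
                           llvm::SmallVectorImpl<llvm::Value *> &Sorted) {
  llvm::SmallVector<unsigned> Order;
  if (!llvm::sortPtrAccesses(Ptrs, ElemTy, DL, SE, Order))
    return false;                       // distances could not be computed
  if (Order.empty()) {                  // assumed: already in address order
    Sorted.assign(Ptrs.begin(), Ptrs.end());
    return true;
  }
  for (unsigned Idx : Order)
    Sorted.push_back(Ptrs[Idx]);
  return true;
}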
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1308
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
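An assumed example tying a few of the math helpers above together; the function name and the use case are illustrative only.

// Clamp a requested VF to a power of two with bit_floor, then count how many
// such vectors cover NumElts with divideCeil.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
static unsigned numVectorsNeeded(unsigned NumElts, unsigned VF) {
  assert(VF != 0 && "VF must be nonzero");
  VF = llvm::bit_floor(VF);             // largest power of two <= VF
  return llvm::divideCeil(NumElts, VF); // ceil(NumElts / VF)
}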
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
DWARFExpression::Operation Op
auto max_element(R &&Range)
Definition: STLExtras.h:1986
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
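A hedged sketch of how ComputeNumSignBits feeds minimum-bitwidth reasoning; the helper name is an assumption, and it relies on the standard "width minus redundant sign bits plus one" relation.

// Sketch: the narrowest signed width that still represents V without loss.
#include "llvm/Analysis/ValueTracking.h"
static unsigned minSignedBitWidth(const llvm::Value *V,
                                  const llvm::DataLayout &DL) {
  unsigned Width = V->getType()->getScalarSizeInBits();
  unsigned SignBits = llvm::ComputeNumSignBits(V, DL);
  return Width - SignBits + 1;
}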
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
InstructionCost Cost
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:593
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
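An assumed example of combining hash_combine and hash_combine_range; the function name and the particular key are illustrative.

// Key a group of scalars by opcode plus all operand pointers.
#include "llvm/ADT/Hashing.h"
static llvm::hash_code hashOpWithOperands(unsigned Opcode,
                                          llvm::ArrayRef<llvm::Value *> Ops) {
  return llvm::hash_combine(
      Opcode, llvm::hash_combine_range(Ops.begin(), Ops.end()));
}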
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2228
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
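With the GraphTraits/DOTGraphTraits specializations above, the generic GraphWriter machinery can render the SLP tree. A hedged sketch (the function name is an assumption, and this is only meaningful inside the translation unit where the specializations are visible):

// Sketch: write the current SLP tree of R as a dot graph and open a viewer.
#include "llvm/Support/GraphWriter.h"
static void viewSLPTree(llvm::slpvectorizer::BoUpSLP &R) {
  llvm::ViewGraph(&R, "slp-tree");
}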
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:220
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1459
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.