SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
91#include <algorithm>
92#include <cassert>
93#include <cstdint>
94#include <iterator>
95#include <memory>
96#include <optional>
97#include <set>
98#include <string>
99#include <tuple>
100#include <utility>
101
102using namespace llvm;
103using namespace llvm::PatternMatch;
104using namespace slpvectorizer;
105
106#define SV_NAME "slp-vectorizer"
107#define DEBUG_TYPE "SLP"
108
109STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
110
111static cl::opt<bool>
112 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
113 cl::desc("Run the SLP vectorization passes"));
114
115 static cl::opt<int>
116     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
117                      cl::desc("Only vectorize if you gain more than this "
118                               "number "));
119
120static cl::opt<bool>
121ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
122 cl::desc("Attempt to vectorize horizontal reductions"));
123
124 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
125     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
126 cl::desc(
127 "Attempt to vectorize horizontal reductions feeding into a store"));
128
129// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
130// even if we match a reduction but do not vectorize in the end.
131 static cl::opt<bool> AllowHorRdxIdenityOptimization(
132     "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
133 cl::desc("Allow optimization of original scalar identity operations on "
134 "matched horizontal reductions."));
135
136static cl::opt<int>
137     MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
138     cl::desc("Attempt to vectorize for this register size in bits"));
139
140 static cl::opt<unsigned>
141     MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
142                 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
143
144/// Limits the size of scheduling regions in a block.
145 /// It avoids long compile times for _very_ large blocks where vector
146/// instructions are spread over a wide range.
147/// This limit is way higher than needed by real-world functions.
148static cl::opt<int>
149ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
150 cl::desc("Limit the size of the SLP scheduling region per block"));
151
152 static cl::opt<int> MinVectorRegSizeOption(
153     "slp-min-reg-size", cl::init(128), cl::Hidden,
154 cl::desc("Attempt to vectorize for this register size in bits"));
155
156 static cl::opt<unsigned> RecursionMaxDepth(
157     "slp-recursion-max-depth", cl::init(12), cl::Hidden,
158 cl::desc("Limit the recursion depth when building a vectorizable tree"));
159
160 static cl::opt<unsigned> MinTreeSize(
161     "slp-min-tree-size", cl::init(3), cl::Hidden,
162 cl::desc("Only vectorize small trees if they are fully vectorizable"));
163
164// The maximum depth that the look-ahead score heuristic will explore.
165// The higher this value, the higher the compilation time overhead.
166 static cl::opt<int> LookAheadMaxDepth(
167     "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
168 cl::desc("The maximum look-ahead depth for operand reordering scores"));
169
170 // The maximum depth that the look-ahead score heuristic will explore
171 // when probing among candidates for vectorization tree roots.
172 // The higher this value, the higher the compilation time overhead. Unlike the
173 // similar limit for operand reordering, this one is used less frequently, so
174 // the impact of a higher value is less noticeable.
175 static cl::opt<int> RootLookAheadMaxDepth(
176     "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
177 cl::desc("The maximum look-ahead depth for searching best rooting option"));
178
179 static cl::opt<unsigned> MinProfitableStridedLoads(
180     "slp-min-strided-loads", cl::init(2), cl::Hidden,
181     cl::desc("The minimum number of loads, which should be considered strided, "
182              "if the stride is > 1 or is a runtime value"));
183
184 static cl::opt<unsigned> MaxProfitableLoadStride(
185     "slp-max-stride", cl::init(8), cl::Hidden,
186 cl::desc("The maximum stride, considered to be profitable."));
187
188static cl::opt<bool>
189 ViewSLPTree("view-slp-tree", cl::Hidden,
190 cl::desc("Display the SLP trees with Graphviz"));
191
192// Limit the number of alias checks. The limit is chosen so that
193// it has no negative effect on the llvm benchmarks.
194static const unsigned AliasedCheckLimit = 10;
195
196// Limit of the number of uses for potentially transformed instructions/values,
197 // used in checks to avoid compile-time explosion.
198static constexpr int UsesLimit = 8;
199
200// Another limit for the alias checks: The maximum distance between load/store
201// instructions where alias checks are done.
202// This limit is useful for very large basic blocks.
203static const unsigned MaxMemDepDistance = 160;
204
205/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
206/// regions to be handled.
207static const int MinScheduleRegionSize = 16;
208
209/// Predicate for the element types that the SLP vectorizer supports.
210///
211/// The most important thing to filter here are types which are invalid in LLVM
212/// vectors. We also filter target specific types which have absolutely no
213/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
214/// avoids spending time checking the cost model and realizing that they will
215/// be inevitably scalarized.
216static bool isValidElementType(Type *Ty) {
217 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
218 !Ty->isPPC_FP128Ty();
219}
220
221/// \returns True if the value is a constant (but not globals/constant
222/// expressions).
223static bool isConstant(Value *V) {
224 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
225}
226
227/// Checks if \p V is one of vector-like instructions, i.e. undef,
228/// insertelement/extractelement with constant indices for fixed vector type or
229/// extractvalue instruction.
230 static bool isVectorLikeInstWithConstOps(Value *V) {
231   if (!isa<InsertElementInst, ExtractElementInst>(V) &&
232 !isa<ExtractValueInst, UndefValue>(V))
233 return false;
234 auto *I = dyn_cast<Instruction>(V);
235 if (!I || isa<ExtractValueInst>(I))
236 return true;
237 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
238 return false;
239 if (isa<ExtractElementInst>(I))
240 return isConstant(I->getOperand(1));
241 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
242 return isConstant(I->getOperand(2));
243}
244
245#if !defined(NDEBUG)
246/// Print a short descriptor of the instruction bundle suitable for debug output.
247static std::string shortBundleName(ArrayRef<Value *> VL) {
248 std::string Result;
249 raw_string_ostream OS(Result);
250 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
251 OS.flush();
252 return Result;
253}
254#endif
255
256/// \returns true if all of the instructions in \p VL are in the same block or
257/// false otherwise.
258 static bool allSameBlock(ArrayRef<Value *> VL) {
259   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
260 if (!I0)
261 return false;
262   if (all_of(VL, isVectorLikeInstWithConstOps))
263     return true;
264
265 BasicBlock *BB = I0->getParent();
266 for (int I = 1, E = VL.size(); I < E; I++) {
267 auto *II = dyn_cast<Instruction>(VL[I]);
268 if (!II)
269 return false;
270
271 if (BB != II->getParent())
272 return false;
273 }
274 return true;
275}
276
277/// \returns True if all of the values in \p VL are constants (but not
278/// globals/constant expressions).
279 static bool allConstant(ArrayRef<Value *> VL) {
280   // Constant expressions and globals can't be vectorized like normal integer/FP
281 // constants.
282 return all_of(VL, isConstant);
283}
284
285/// \returns True if all of the values in \p VL are identical or some of them
286/// are UndefValue.
287static bool isSplat(ArrayRef<Value *> VL) {
288 Value *FirstNonUndef = nullptr;
289 for (Value *V : VL) {
290 if (isa<UndefValue>(V))
291 continue;
292 if (!FirstNonUndef) {
293 FirstNonUndef = V;
294 continue;
295 }
296 if (V != FirstNonUndef)
297 return false;
298 }
299 return FirstNonUndef != nullptr;
300}
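// For example, isSplat({%a, undef, %a, %a}) is true (a splat of %a), while an
// all-undef list is not a splat because there is no non-undef value to
// broadcast.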
301
302/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
303 static bool isCommutative(Instruction *I) {
304   if (auto *Cmp = dyn_cast<CmpInst>(I))
305 return Cmp->isCommutative();
306 if (auto *BO = dyn_cast<BinaryOperator>(I))
307 return BO->isCommutative();
308 // TODO: This should check for generic Instruction::isCommutative(), but
309 // we need to confirm that the caller code correctly handles Intrinsics
310 // for example (does not have 2 operands).
311 return false;
312}
313
314/// \returns inserting index of InsertElement or InsertValue instruction,
315/// using Offset as base offset for index.
316static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
317 unsigned Offset = 0) {
318 int Index = Offset;
319 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
320 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
321 if (!VT)
322 return std::nullopt;
323 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
324 if (!CI)
325 return std::nullopt;
326 if (CI->getValue().uge(VT->getNumElements()))
327 return std::nullopt;
328 Index *= VT->getNumElements();
329 Index += CI->getZExtValue();
330 return Index;
331 }
332
333 const auto *IV = cast<InsertValueInst>(InsertInst);
334 Type *CurrentType = IV->getType();
335 for (unsigned I : IV->indices()) {
336 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
337 Index *= ST->getNumElements();
338 CurrentType = ST->getElementType(I);
339 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
340 Index *= AT->getNumElements();
341 CurrentType = AT->getElementType();
342 } else {
343 return std::nullopt;
344 }
345 Index += I;
346 }
347 return Index;
348}
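// A small worked example of the index flattening in getInsertIndex: for
//   insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0
// the walk computes Index = ((0 * 2) + 1) * 2 + 0 = 2, i.e. the third scalar
// slot of the flattened aggregate. For an insertelement with a constant index
// the result is simply Offset * NumElements + that index.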
349
350namespace {
351/// Specifies the way the mask should be analyzed for undefs/poisonous elements
352/// in the shuffle mask.
353enum class UseMask {
354 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
355 ///< check for the mask elements for the first argument (mask
356 ///< indices are in range [0:VF)).
357 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
358 ///< for the mask elements for the second argument (mask indices
359 ///< are in range [VF:2*VF))
360 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
361 ///< future shuffle elements and mark them as ones as being used
362 ///< in future. Non-undef elements are considered as unused since
363 ///< they're already marked as used in the mask.
364};
365} // namespace
366
367/// Prepares a use bitset for the given mask either for the first argument or
368/// for the second.
369 static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
370                                    UseMask MaskArg) {
371 SmallBitVector UseMask(VF, true);
372 for (auto [Idx, Value] : enumerate(Mask)) {
373 if (Value == PoisonMaskElem) {
374 if (MaskArg == UseMask::UndefsAsMask)
375 UseMask.reset(Idx);
376 continue;
377 }
378 if (MaskArg == UseMask::FirstArg && Value < VF)
379 UseMask.reset(Value);
380 else if (MaskArg == UseMask::SecondArg && Value >= VF)
381 UseMask.reset(Value - VF);
382 }
383 return UseMask;
384}
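// For example, buildUseMask(4, {0, 5, poison, 2}, UseMask::FirstArg) resets
// bits 0 and 2 (the lanes of the first vector read by the mask),
// UseMask::SecondArg resets only bit 1 (element 5 - VF), and
// UseMask::UndefsAsMask resets only bit 2 (the poison mask element).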
385
386/// Checks if the given value is actually an undefined constant vector.
387/// Also, if the \p UseMask is not empty, tries to check if the non-masked
388/// elements actually mask the insertelement buildvector, if any.
389template <bool IsPoisonOnly = false>
390 static SmallBitVector isUndefVector(const Value *V,
391                                     const SmallBitVector &UseMask = {}) {
392 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
393 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
394 if (isa<T>(V))
395 return Res;
396 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
397 if (!VecTy)
398 return Res.reset();
399 auto *C = dyn_cast<Constant>(V);
400 if (!C) {
401 if (!UseMask.empty()) {
402 const Value *Base = V;
403 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
404 Base = II->getOperand(0);
405 if (isa<T>(II->getOperand(1)))
406 continue;
407 std::optional<unsigned> Idx = getInsertIndex(II);
408 if (!Idx) {
409 Res.reset();
410 return Res;
411 }
412 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
413 Res.reset(*Idx);
414 }
415 // TODO: Add analysis for shuffles here too.
416 if (V == Base) {
417 Res.reset();
418 } else {
419 SmallBitVector SubMask(UseMask.size(), false);
420 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
421 }
422 } else {
423 Res.reset();
424 }
425 return Res;
426 }
427 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
428 if (Constant *Elem = C->getAggregateElement(I))
429 if (!isa<T>(Elem) &&
430 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
431 Res.reset(I);
432 }
433 return Res;
434}
435
436/// Checks if the vector of instructions can be represented as a shuffle, like:
437/// %x0 = extractelement <4 x i8> %x, i32 0
438/// %x3 = extractelement <4 x i8> %x, i32 3
439/// %y1 = extractelement <4 x i8> %y, i32 1
440/// %y2 = extractelement <4 x i8> %y, i32 2
441/// %x0x0 = mul i8 %x0, %x0
442/// %x3x3 = mul i8 %x3, %x3
443/// %y1y1 = mul i8 %y1, %y1
444/// %y2y2 = mul i8 %y2, %y2
445/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
446/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
447/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
448/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
449/// ret <4 x i8> %ins4
450/// can be transformed into:
451/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
452/// i32 6>
453/// %2 = mul <4 x i8> %1, %1
454/// ret <4 x i8> %2
455/// Mask will return the Shuffle Mask equivalent to the extracted elements.
456/// TODO: Can we split off and reuse the shuffle mask detection from
457/// ShuffleVectorInst/getShuffleCost?
458static std::optional<TargetTransformInfo::ShuffleKind>
459 isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
460   const auto *It =
461 find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
462 if (It == VL.end())
463 return std::nullopt;
464 auto *EI0 = cast<ExtractElementInst>(*It);
465 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
466 return std::nullopt;
467 unsigned Size =
468 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
469 Value *Vec1 = nullptr;
470 Value *Vec2 = nullptr;
471 enum ShuffleMode { Unknown, Select, Permute };
472 ShuffleMode CommonShuffleMode = Unknown;
473 Mask.assign(VL.size(), PoisonMaskElem);
474 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
475 // Undef can be represented as an undef element in a vector.
476 if (isa<UndefValue>(VL[I]))
477 continue;
478 auto *EI = cast<ExtractElementInst>(VL[I]);
479 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
480 return std::nullopt;
481 auto *Vec = EI->getVectorOperand();
482 // We can extractelement from undef or poison vector.
483 if (isUndefVector(Vec).all())
484 continue;
485 // All vector operands must have the same number of vector elements.
486 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
487 return std::nullopt;
488 if (isa<UndefValue>(EI->getIndexOperand()))
489 continue;
490 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
491 if (!Idx)
492 return std::nullopt;
493 // Undefined behavior if Idx is negative or >= Size.
494 if (Idx->getValue().uge(Size))
495 continue;
496 unsigned IntIdx = Idx->getValue().getZExtValue();
497 Mask[I] = IntIdx;
498 // For correct shuffling we have to have at most 2 different vector operands
499 // in all extractelement instructions.
500 if (!Vec1 || Vec1 == Vec) {
501 Vec1 = Vec;
502 } else if (!Vec2 || Vec2 == Vec) {
503 Vec2 = Vec;
504 Mask[I] += Size;
505 } else {
506 return std::nullopt;
507 }
508 if (CommonShuffleMode == Permute)
509 continue;
510 // If the extract index is not the same as the operation number, it is a
511 // permutation.
512 if (IntIdx != I) {
513 CommonShuffleMode = Permute;
514 continue;
515 }
516 CommonShuffleMode = Select;
517 }
518 // If we're not crossing lanes in different vectors, consider it as blending.
519   if (CommonShuffleMode == Select && Vec2)
520     return TargetTransformInfo::SK_Select;
521   // If Vec2 was never used, we have a permutation of a single vector, otherwise
522   // we have permutation of 2 vectors.
523   return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
524               : TargetTransformInfo::SK_PermuteSingleSrc;
525 }
526
527/// \returns True if Extract{Value,Element} instruction extracts element Idx.
528static std::optional<unsigned> getExtractIndex(Instruction *E) {
529 unsigned Opcode = E->getOpcode();
530 assert((Opcode == Instruction::ExtractElement ||
531 Opcode == Instruction::ExtractValue) &&
532 "Expected extractelement or extractvalue instruction.");
533 if (Opcode == Instruction::ExtractElement) {
534 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
535 if (!CI)
536 return std::nullopt;
537 return CI->getZExtValue();
538 }
539 auto *EI = cast<ExtractValueInst>(E);
540 if (EI->getNumIndices() != 1)
541 return std::nullopt;
542 return *EI->idx_begin();
543}
544
545namespace {
546
547/// Main data required for vectorization of instructions.
548struct InstructionsState {
549 /// The very first instruction in the list with the main opcode.
550 Value *OpValue = nullptr;
551
552 /// The main/alternate instruction.
553 Instruction *MainOp = nullptr;
554 Instruction *AltOp = nullptr;
555
556 /// The main/alternate opcodes for the list of instructions.
557 unsigned getOpcode() const {
558 return MainOp ? MainOp->getOpcode() : 0;
559 }
560
561 unsigned getAltOpcode() const {
562 return AltOp ? AltOp->getOpcode() : 0;
563 }
564
565 /// Some of the instructions in the list have alternate opcodes.
566 bool isAltShuffle() const { return AltOp != MainOp; }
567
568 bool isOpcodeOrAlt(Instruction *I) const {
569 unsigned CheckedOpcode = I->getOpcode();
570 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
571 }
572
573 InstructionsState() = delete;
574 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
575 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
576};
577
578} // end anonymous namespace
579
580/// Chooses the correct key for scheduling data. If \p Op has the same (or
581/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
582/// OpValue.
583static Value *isOneOf(const InstructionsState &S, Value *Op) {
584 auto *I = dyn_cast<Instruction>(Op);
585 if (I && S.isOpcodeOrAlt(I))
586 return Op;
587 return S.OpValue;
588}
589
590/// \returns true if \p Opcode is allowed as part of the main/alternate
591/// instruction for SLP vectorization.
592///
593/// Example of unsupported opcode is SDIV that can potentially cause UB if the
594/// "shuffled out" lane would result in division by zero.
595static bool isValidForAlternation(unsigned Opcode) {
596 if (Instruction::isIntDivRem(Opcode))
597 return false;
598
599 return true;
600}
601
602static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
603 const TargetLibraryInfo &TLI,
604 unsigned BaseIndex = 0);
605
606/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
607/// compatible instructions or constants, or just some other regular values.
608static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
609 Value *Op1, const TargetLibraryInfo &TLI) {
610 return (isConstant(BaseOp0) && isConstant(Op0)) ||
611 (isConstant(BaseOp1) && isConstant(Op1)) ||
612 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
613 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
614 BaseOp0 == Op0 || BaseOp1 == Op1 ||
615 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
616 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
617}
618
619/// \returns true if a compare instruction \p CI has similar "look" and
620/// same predicate as \p BaseCI, "as is" or with its operands and predicate
621/// swapped, false otherwise.
622static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
623 const TargetLibraryInfo &TLI) {
624 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
625 "Assessing comparisons of different types?");
626 CmpInst::Predicate BasePred = BaseCI->getPredicate();
627 CmpInst::Predicate Pred = CI->getPredicate();
628   CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
629
630 Value *BaseOp0 = BaseCI->getOperand(0);
631 Value *BaseOp1 = BaseCI->getOperand(1);
632 Value *Op0 = CI->getOperand(0);
633 Value *Op1 = CI->getOperand(1);
634
635 return (BasePred == Pred &&
636 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
637 (BasePred == SwappedPred &&
638 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
639}
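// For example, (icmp slt %a, %b) and (icmp sgt %b, %a) are considered the same
// comparison by isCmpSameOrSwapped: the second matches the swapped predicate
// with its operands swapped.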
640
641/// \returns analysis of the Instructions in \p VL described in
642/// InstructionsState, the Opcode that we suppose the whole list
643/// could be vectorized even if its structure is diverse.
644static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
645 const TargetLibraryInfo &TLI,
646 unsigned BaseIndex) {
647 // Make sure these are all Instructions.
648 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
649 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
650
651 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
652 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
653 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
654 CmpInst::Predicate BasePred =
655 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
656               : CmpInst::BAD_ICMP_PREDICATE;
657   unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
658 unsigned AltOpcode = Opcode;
659 unsigned AltIndex = BaseIndex;
660
661 // Check for one alternate opcode from another BinaryOperator.
662 // TODO - generalize to support all operators (types, calls etc.).
663 auto *IBase = cast<Instruction>(VL[BaseIndex]);
664 Intrinsic::ID BaseID = 0;
665 SmallVector<VFInfo> BaseMappings;
666 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
667     BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
668     BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
669 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
670 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
671 }
672 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
673 auto *I = cast<Instruction>(VL[Cnt]);
674 unsigned InstOpcode = I->getOpcode();
675 if (IsBinOp && isa<BinaryOperator>(I)) {
676 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
677 continue;
678 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
679 isValidForAlternation(Opcode)) {
680 AltOpcode = InstOpcode;
681 AltIndex = Cnt;
682 continue;
683 }
684 } else if (IsCastOp && isa<CastInst>(I)) {
685 Value *Op0 = IBase->getOperand(0);
686 Type *Ty0 = Op0->getType();
687 Value *Op1 = I->getOperand(0);
688 Type *Ty1 = Op1->getType();
689 if (Ty0 == Ty1) {
690 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
691 continue;
692 if (Opcode == AltOpcode) {
693           assert(isValidForAlternation(Opcode) &&
694                  isValidForAlternation(InstOpcode) &&
695 "Cast isn't safe for alternation, logic needs to be updated!");
696 AltOpcode = InstOpcode;
697 AltIndex = Cnt;
698 continue;
699 }
700 }
701 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
702 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
703 Type *Ty0 = BaseInst->getOperand(0)->getType();
704 Type *Ty1 = Inst->getOperand(0)->getType();
705 if (Ty0 == Ty1) {
706 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
707 // Check for compatible operands. If the corresponding operands are not
708 // compatible - need to perform alternate vectorization.
709 CmpInst::Predicate CurrentPred = Inst->getPredicate();
710 CmpInst::Predicate SwappedCurrentPred =
711 CmpInst::getSwappedPredicate(CurrentPred);
712
713 if (E == 2 &&
714 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
715 continue;
716
717 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
718 continue;
719 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
720 if (AltIndex != BaseIndex) {
721 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
722 continue;
723 } else if (BasePred != CurrentPred) {
724 assert(
725 isValidForAlternation(InstOpcode) &&
726 "CmpInst isn't safe for alternation, logic needs to be updated!");
727 AltIndex = Cnt;
728 continue;
729 }
730 CmpInst::Predicate AltPred = AltInst->getPredicate();
731 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
732 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
733 continue;
734 }
735 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
736 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
737 if (Gep->getNumOperands() != 2 ||
738 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
739 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
740 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
741         if (!isVectorLikeInstWithConstOps(EI))
742           return InstructionsState(VL[BaseIndex], nullptr, nullptr);
743 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
744 auto *BaseLI = cast<LoadInst>(IBase);
745 if (!LI->isSimple() || !BaseLI->isSimple())
746 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
747 } else if (auto *Call = dyn_cast<CallInst>(I)) {
748 auto *CallBase = cast<CallInst>(IBase);
749 if (Call->getCalledFunction() != CallBase->getCalledFunction())
750 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
751 if (Call->hasOperandBundles() &&
752 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
753 Call->op_begin() + Call->getBundleOperandsEndIndex(),
754 CallBase->op_begin() +
755                         CallBase->getBundleOperandsStartIndex()))
756           return InstructionsState(VL[BaseIndex], nullptr, nullptr);
757         Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
758         if (ID != BaseID)
759 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
760 if (!ID) {
761 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
762 if (Mappings.size() != BaseMappings.size() ||
763 Mappings.front().ISA != BaseMappings.front().ISA ||
764 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
765 Mappings.front().VectorName != BaseMappings.front().VectorName ||
766 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
767 Mappings.front().Shape.Parameters !=
768 BaseMappings.front().Shape.Parameters)
769 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
770 }
771 }
772 continue;
773 }
774 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
775 }
776
777 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
778 cast<Instruction>(VL[AltIndex]));
779}
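// For example, for VL = {add, sub, add, sub} getSameOpcode returns a state with
// MainOp = the first add and AltOp = the first sub (an alternate-opcode
// bundle), while mixing an add with a load yields a state with a null MainOp,
// i.e. no common opcode.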
780
781/// \returns true if all of the values in \p VL have the same type or false
782/// otherwise.
783 static bool allSameType(ArrayRef<Value *> VL) {
784   Type *Ty = VL.front()->getType();
785 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
786}
787
788/// \returns True if in-tree use also needs extract. This refers to
789/// possible scalar operand in vectorized instruction.
790static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
791 TargetLibraryInfo *TLI) {
792 unsigned Opcode = UserInst->getOpcode();
793 switch (Opcode) {
794 case Instruction::Load: {
795 LoadInst *LI = cast<LoadInst>(UserInst);
796 return (LI->getPointerOperand() == Scalar);
797 }
798 case Instruction::Store: {
799 StoreInst *SI = cast<StoreInst>(UserInst);
800 return (SI->getPointerOperand() == Scalar);
801 }
802 case Instruction::Call: {
803 CallInst *CI = cast<CallInst>(UserInst);
804     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
805     return any_of(enumerate(CI->args()), [&](auto &&Arg) {
806 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
807 Arg.value().get() == Scalar;
808 });
809 }
810 default:
811 return false;
812 }
813}
814
815 /// \returns the AA location that is being accessed by the instruction.
816 static MemoryLocation getLocation(Instruction *I) {
817   if (StoreInst *SI = dyn_cast<StoreInst>(I))
818 return MemoryLocation::get(SI);
819 if (LoadInst *LI = dyn_cast<LoadInst>(I))
820 return MemoryLocation::get(LI);
821 return MemoryLocation();
822}
823
824/// \returns True if the instruction is not a volatile or atomic load/store.
825static bool isSimple(Instruction *I) {
826 if (LoadInst *LI = dyn_cast<LoadInst>(I))
827 return LI->isSimple();
828 if (StoreInst *SI = dyn_cast<StoreInst>(I))
829 return SI->isSimple();
830 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
831 return !MI->isVolatile();
832 return true;
833}
834
835/// Shuffles \p Mask in accordance with the given \p SubMask.
836/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
837/// one but two input vectors.
838static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
839 bool ExtendingManyInputs = false) {
840 if (SubMask.empty())
841 return;
842 assert(
843 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
844 // Check if input scalars were extended to match the size of other node.
845 (SubMask.size() == Mask.size() &&
846 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
847 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
848 "SubMask with many inputs support must be larger than the mask.");
849 if (Mask.empty()) {
850 Mask.append(SubMask.begin(), SubMask.end());
851 return;
852 }
853 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
854 int TermValue = std::min(Mask.size(), SubMask.size());
855 for (int I = 0, E = SubMask.size(); I < E; ++I) {
856 if (SubMask[I] == PoisonMaskElem ||
857 (!ExtendingManyInputs &&
858 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
859 continue;
860 NewMask[I] = Mask[SubMask[I]];
861 }
862 Mask.swap(NewMask);
863}
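// For example, composing Mask = {3, 2, 1, 0} with SubMask = {1, poison, 0, 2}
// produces {2, poison, 3, 1}: each defined SubMask element selects the
// corresponding entry of the existing Mask.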
864
865/// Order may have elements assigned special value (size) which is out of
866/// bounds. Such indices only appear on places which correspond to undef values
867/// (see canReuseExtract for details) and used in order to avoid undef values
868/// have effect on operands ordering.
869/// The first loop below simply finds all unused indices and then the next loop
870/// nest assigns these indices for undef values positions.
871/// As an example below Order has two undef positions and they have assigned
872/// values 3 and 7 respectively:
873/// before: 6 9 5 4 9 2 1 0
874/// after: 6 3 5 4 7 2 1 0
875 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
876   const unsigned Sz = Order.size();
877 SmallBitVector UnusedIndices(Sz, /*t=*/true);
878 SmallBitVector MaskedIndices(Sz);
879 for (unsigned I = 0; I < Sz; ++I) {
880 if (Order[I] < Sz)
881 UnusedIndices.reset(Order[I]);
882 else
883 MaskedIndices.set(I);
884 }
885 if (MaskedIndices.none())
886 return;
887 assert(UnusedIndices.count() == MaskedIndices.count() &&
888 "Non-synced masked/available indices.");
889 int Idx = UnusedIndices.find_first();
890 int MIdx = MaskedIndices.find_first();
891 while (MIdx >= 0) {
892 assert(Idx >= 0 && "Indices must be synced.");
893 Order[MIdx] = Idx;
894 Idx = UnusedIndices.find_next(Idx);
895 MIdx = MaskedIndices.find_next(MIdx);
896 }
897}
898
899namespace llvm {
900
901 static void inversePermutation(ArrayRef<unsigned> Indices,
902                                SmallVectorImpl<int> &Mask) {
903 Mask.clear();
904 const unsigned E = Indices.size();
905 Mask.resize(E, PoisonMaskElem);
906 for (unsigned I = 0; I < E; ++I)
907 Mask[Indices[I]] = I;
908}
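// For example, inversePermutation({2, 0, 1}, Mask) produces Mask = {1, 2, 0}:
// element I of the reordered list comes from position Indices[I], so
// Mask[Indices[I]] = I.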
909
910/// Reorders the list of scalars in accordance with the given \p Mask.
911 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
912                            ArrayRef<int> Mask) {
913 assert(!Mask.empty() && "Expected non-empty mask.");
914 SmallVector<Value *> Prev(Scalars.size(),
915 UndefValue::get(Scalars.front()->getType()));
916 Prev.swap(Scalars);
917 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
918 if (Mask[I] != PoisonMaskElem)
919 Scalars[Mask[I]] = Prev[I];
920}
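// For example, reorderScalars({a, b, c, d}, {2, 0, 1, poison}) produces
// {b, c, a, undef}: element I of the old list moves to position Mask[I].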
921
922/// Checks if the provided value does not require scheduling. It does not
923/// require scheduling if this is not an instruction or it is an instruction
924/// that does not read/write memory and all operands are either not instructions
925/// or phi nodes or instructions from different blocks.
926 static bool areAllOperandsNonInsts(Value *V) {
927   auto *I = dyn_cast<Instruction>(V);
928 if (!I)
929 return true;
930 return !mayHaveNonDefUseDependency(*I) &&
931 all_of(I->operands(), [I](Value *V) {
932 auto *IO = dyn_cast<Instruction>(V);
933 if (!IO)
934 return true;
935 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
936 });
937}
938
939/// Checks if the provided value does not require scheduling. It does not
940/// require scheduling if this is not an instruction or it is an instruction
941/// that does not read/write memory and all users are phi nodes or instructions
942/// from the different blocks.
943static bool isUsedOutsideBlock(Value *V) {
944 auto *I = dyn_cast<Instruction>(V);
945 if (!I)
946 return true;
947 // Limits the number of uses to save compile time.
948 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
949 all_of(I->users(), [I](User *U) {
950 auto *IU = dyn_cast<Instruction>(U);
951 if (!IU)
952 return true;
953 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
954 });
955}
956
957/// Checks if the specified value does not require scheduling. It does not
958/// require scheduling if all operands and all users do not need to be scheduled
959/// in the current basic block.
960 static bool doesNotNeedToBeScheduled(Value *V) {
961   return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
962 }
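// For example, an add whose operands are function arguments and whose only
// user is a phi node in a successor block does not need to be scheduled: it
// has no memory dependencies and no def-use edges inside its own block.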
963
964/// Checks if the specified array of instructions does not require scheduling.
965/// It is so if all either instructions have operands that do not require
966/// scheduling or their users do not require scheduling since they are phis or
967/// in other basic blocks.
968 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
969   return !VL.empty() &&
970          all_of(VL, doesNotNeedToBeScheduled);
971 }
972
973namespace slpvectorizer {
974
975/// Bottom Up SLP Vectorizer.
976class BoUpSLP {
977 struct TreeEntry;
978 struct ScheduleData;
981
982public:
983   using ValueList = SmallVector<Value *, 8>;
984   using InstrList = SmallVector<Instruction *, 16>;
985   using ValueSet = SmallPtrSet<Value *, 16>;
986   using StoreList = SmallVector<StoreInst *, 8>;
987   using ExtraValueToDebugLocsMap =
988       MapVector<Value *, SmallVector<Instruction *, 2>>;
989   using OrdersType = SmallVector<unsigned, 4>;
990
991   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
992           TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
993           DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
994           const DataLayout *DL, OptimizationRemarkEmitter *ORE)
995 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
996 DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
997     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
998     // Use the vector register size specified by the target unless overridden
999 // by a command-line option.
1000 // TODO: It would be better to limit the vectorization factor based on
1001 // data type rather than just register size. For example, x86 AVX has
1002 // 256-bit registers, but it does not support integer operations
1003 // at that width (that requires AVX2).
1004 if (MaxVectorRegSizeOption.getNumOccurrences())
1005 MaxVecRegSize = MaxVectorRegSizeOption;
1006 else
1007 MaxVecRegSize =
1008           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1009               .getFixedValue();
1010
1011 if (MinVectorRegSizeOption.getNumOccurrences())
1012 MinVecRegSize = MinVectorRegSizeOption;
1013 else
1014 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1015 }
1016
1017 /// Vectorize the tree that starts with the elements in \p VL.
1018 /// Returns the vectorized root.
1019   Value *vectorizeTree();
1020
1021 /// Vectorize the tree but with the list of externally used values \p
1022   /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1023   /// generated extractvalue instructions.
1024   /// \param ReplacedExternals contains the list of replaced external values
1025 /// {scalar, replace} after emitting extractelement for external uses.
1026 Value *
1027 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1028 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1029 Instruction *ReductionRoot = nullptr);
1030
1031 /// \returns the cost incurred by unwanted spills and fills, caused by
1032 /// holding live values over call sites.
1033   InstructionCost getSpillCost() const;
1034
1035 /// \returns the vectorization cost of the subtree that starts at \p VL.
1036 /// A negative number means that this is profitable.
1037 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1038
1039 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1040 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1041 void buildTree(ArrayRef<Value *> Roots,
1042 const SmallDenseSet<Value *> &UserIgnoreLst);
1043
1044 /// Construct a vectorizable tree that starts at \p Roots.
1045 void buildTree(ArrayRef<Value *> Roots);
1046
1047 /// Returns whether the root node has in-tree uses.
1048   bool doesRootHaveInTreeUses() const {
1049     return !VectorizableTree.empty() &&
1050 !VectorizableTree.front()->UserTreeIndices.empty();
1051 }
1052
1053 /// Return the scalars of the root node.
1054   ArrayRef<Value *> getRootNodeScalars() const {
1055     assert(!VectorizableTree.empty() && "No graph to get the first node from");
1056 return VectorizableTree.front()->Scalars;
1057 }
1058
1059 /// Builds external uses of the vectorized scalars, i.e. the list of
1060 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1061 /// ExternallyUsedValues contains additional list of external uses to handle
1062 /// vectorization of reductions.
1063 void
1064 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1065
1066 /// Clear the internal data structures that are created by 'buildTree'.
1067 void deleteTree() {
1068 VectorizableTree.clear();
1069 ScalarToTreeEntry.clear();
1070 MultiNodeScalars.clear();
1071 MustGather.clear();
1072 EntryToLastInstruction.clear();
1073 ExternalUses.clear();
1074 for (auto &Iter : BlocksSchedules) {
1075 BlockScheduling *BS = Iter.second.get();
1076 BS->clear();
1077 }
1078 MinBWs.clear();
1079 InstrElementSize.clear();
1080 UserIgnoreList = nullptr;
1081 PostponedGathers.clear();
1082 ValueToGatherNodes.clear();
1083 }
1084
1085 unsigned getTreeSize() const { return VectorizableTree.size(); }
1086
1087 /// Perform LICM and CSE on the newly generated gather sequences.
1088   void optimizeGatherSequence();
1089
1090 /// Checks if the specified gather tree entry \p TE can be represented as a
1091 /// shuffled vector entry + (possibly) permutation with other gathers. It
1092 /// implements the checks only for possibly ordered scalars (Loads,
1093 /// ExtractElement, ExtractValue), which can be part of the graph.
1094 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1095
1096 /// Sort loads into increasing pointers offsets to allow greater clustering.
1097 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1098
1099 /// Gets reordering data for the given tree entry. If the entry is vectorized
1100 /// - just return ReorderIndices, otherwise check if the scalars can be
1101 /// reordered and return the most optimal order.
1102 /// \return std::nullopt if ordering is not important, empty order, if
1103 /// identity order is important, or the actual order.
1104 /// \param TopToBottom If true, include the order of vectorized stores and
1105 /// insertelement nodes, otherwise skip them.
1106 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1107 bool TopToBottom);
1108
1109 /// Reorders the current graph to the most profitable order starting from the
1110 /// root node to the leaf nodes. The best order is chosen only from the nodes
1111 /// of the same size (vectorization factor). Smaller nodes are considered
1112 /// parts of a subgraph with a smaller VF and they are reordered independently. We
1113 /// can do this because we still need to extend smaller nodes to the wider VF
1114 /// and we can merge reordering shuffles with the widening shuffles.
1115 void reorderTopToBottom();
1116
1117 /// Reorders the current graph to the most profitable order starting from
1118 /// leaves to the root. It allows to rotate small subgraphs and reduce the
1119 /// number of reshuffles if the leaf nodes use the same order. In this case we
1120 /// can merge the orders and just shuffle user node instead of shuffling its
1121 /// operands. Plus, even if the leaf nodes have different orders, it allows to
1122 /// sink reordering in the graph closer to the root node and merge it later
1123 /// during analysis.
1124 void reorderBottomToTop(bool IgnoreReorder = false);
1125
1126 /// \return The vector element size in bits to use when vectorizing the
1127 /// expression tree ending at \p V. If V is a store, the size is the width of
1128 /// the stored value. Otherwise, the size is the width of the largest loaded
1129 /// value reaching V. This method is used by the vectorizer to calculate
1130 /// vectorization factors.
1131 unsigned getVectorElementSize(Value *V);
1132
1133 /// Compute the minimum type sizes required to represent the entries in a
1134 /// vectorizable tree.
1135   void computeMinimumValueSizes();
1136
1137 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1138 unsigned getMaxVecRegSize() const {
1139 return MaxVecRegSize;
1140 }
1141
1142 // \returns minimum vector register size as set by cl::opt.
1143 unsigned getMinVecRegSize() const {
1144 return MinVecRegSize;
1145 }
1146
1147 unsigned getMinVF(unsigned Sz) const {
1148 return std::max(2U, getMinVecRegSize() / Sz);
1149 }
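  // For example, with the default MinVecRegSize of 128 bits, getMinVF(32)
  // returns std::max(2U, 128 / 32) = 4.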
1150
1151 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1152 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1153 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1154 return MaxVF ? MaxVF : UINT_MAX;
1155 }
1156
1157 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1158 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1159 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1160 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1161 ///
1162 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1163 unsigned canMapToVector(Type *T) const;
1164
1165 /// \returns True if the VectorizableTree is both tiny and not fully
1166 /// vectorizable. We do not vectorize such trees.
1167 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1168
1169 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1170 /// can be load combined in the backend. Load combining may not be allowed in
1171 /// the IR optimizer, so we do not want to alter the pattern. For example,
1172 /// partially transforming a scalar bswap() pattern into vector code is
1173 /// effectively impossible for the backend to undo.
1174 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1175 /// may not be necessary.
1176 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1177
1178 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1179 /// can be load combined in the backend. Load combining may not be allowed in
1180 /// the IR optimizer, so we do not want to alter the pattern. For example,
1181 /// partially transforming a scalar bswap() pattern into vector code is
1182 /// effectively impossible for the backend to undo.
1183 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1184 /// may not be necessary.
1185 bool isLoadCombineCandidate() const;
1186
1186
1187   OptimizationRemarkEmitter *getORE() { return ORE; }
1188
1189 /// This structure holds any data we need about the edges being traversed
1190 /// during buildTree_rec(). We keep track of:
1191 /// (i) the user TreeEntry index, and
1192 /// (ii) the index of the edge.
1193 struct EdgeInfo {
1194 EdgeInfo() = default;
1195 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1196         : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1197     /// The user TreeEntry.
1198 TreeEntry *UserTE = nullptr;
1199 /// The operand index of the use.
1200 unsigned EdgeIdx = UINT_MAX;
1201#ifndef NDEBUG
1202     friend inline raw_ostream &operator<<(raw_ostream &OS,
1203                                           const BoUpSLP::EdgeInfo &EI) {
1204 EI.dump(OS);
1205 return OS;
1206 }
1207 /// Debug print.
1208 void dump(raw_ostream &OS) const {
1209 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1210 << " EdgeIdx:" << EdgeIdx << "}";
1211 }
1212 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1213#endif
1214 bool operator == (const EdgeInfo &Other) const {
1215 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1216 }
1217 };
1218
1219 /// A helper class used for scoring candidates for two consecutive lanes.
1220   class LookAheadHeuristics {
1221     const TargetLibraryInfo &TLI;
1222 const DataLayout &DL;
1223 ScalarEvolution &SE;
1224 const BoUpSLP &R;
1225 int NumLanes; // Total number of lanes (aka vectorization factor).
1226 int MaxLevel; // The maximum recursion depth for accumulating score.
1227
1228 public:
1229     LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1230                         ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1231 int MaxLevel)
1232 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1233 MaxLevel(MaxLevel) {}
1234
1235 // The hard-coded scores listed here are not very important, though it shall
1236 // be higher for better matches to improve the resulting cost. When
1237 // computing the scores of matching one sub-tree with another, we are
1238 // basically counting the number of values that are matching. So even if all
1239 // scores are set to 1, we would still get a decent matching result.
1240 // However, sometimes we have to break ties. For example we may have to
1241 // choose between matching loads vs matching opcodes. This is what these
1242 // scores are helping us with: they provide the order of preference. Also,
1243 // this is important if the scalar is externally used or used in another
1244 // tree entry node in the different lane.
1245
1246 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1247 static const int ScoreConsecutiveLoads = 4;
1248 /// The same load multiple times. This should have a better score than
1249     /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it with
1250     /// `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for a
1251     /// vector load plus 1.0 for a broadcast.
1252 static const int ScoreSplatLoads = 3;
1253 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1254 static const int ScoreReversedLoads = 3;
1255 /// A load candidate for masked gather.
1256 static const int ScoreMaskedGatherCandidate = 1;
1257 /// ExtractElementInst from same vector and consecutive indexes.
1258 static const int ScoreConsecutiveExtracts = 4;
1259 /// ExtractElementInst from same vector and reversed indices.
1260 static const int ScoreReversedExtracts = 3;
1261 /// Constants.
1262 static const int ScoreConstants = 2;
1263 /// Instructions with the same opcode.
1264 static const int ScoreSameOpcode = 2;
1265 /// Instructions with alt opcodes (e.g, add + sub).
1266 static const int ScoreAltOpcodes = 1;
1267 /// Identical instructions (a.k.a. splat or broadcast).
1268 static const int ScoreSplat = 1;
1269 /// Matching with an undef is preferable to failing.
1270 static const int ScoreUndef = 1;
1271 /// Score for failing to find a decent match.
1272 static const int ScoreFail = 0;
1273 /// Score if all users are vectorized.
1274 static const int ScoreAllUserVectorized = 1;
1275
1276 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1277 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1278 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1279 /// MainAltOps.
1280     int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1281                         ArrayRef<Value *> MainAltOps) const {
1282 if (!isValidElementType(V1->getType()) ||
1283 !isValidElementType(V2->getType()))
1284         return LookAheadHeuristics::ScoreFail;
1285
1286 if (V1 == V2) {
1287 if (isa<LoadInst>(V1)) {
1288         // Returns true if the users of V1 and V2 won't need to be extracted.
1289 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1290 // Bail out if we have too many uses to save compilation time.
1291 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1292 return false;
1293
1294 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1295 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1296 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1297 });
1298 };
1299 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1300 };
1301 // A broadcast of a load can be cheaper on some targets.
1302 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1303 ElementCount::getFixed(NumLanes)) &&
1304 ((int)V1->getNumUses() == NumLanes ||
1305 AllUsersAreInternal(V1, V2)))
1306           return LookAheadHeuristics::ScoreSplatLoads;
1307       }
1308       return LookAheadHeuristics::ScoreSplat;
1309     }
1310
1311 auto *LI1 = dyn_cast<LoadInst>(V1);
1312 auto *LI2 = dyn_cast<LoadInst>(V2);
1313 if (LI1 && LI2) {
1314 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1315 !LI2->isSimple())
1316         return LookAheadHeuristics::ScoreFail;
1317
1318 std::optional<int> Dist = getPointersDiff(
1319 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1320 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1321 if (!Dist || *Dist == 0) {
1322 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1323 getUnderlyingObject(LI2->getPointerOperand()) &&
1324 R.TTI->isLegalMaskedGather(
1325 FixedVectorType::get(LI1->getType(), NumLanes),
1326 LI1->getAlign()))
1327           return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1328         return LookAheadHeuristics::ScoreFail;
1329       }
1330 // The distance is too large - still may be profitable to use masked
1331 // loads/gathers.
1332 if (std::abs(*Dist) > NumLanes / 2)
1333         return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1334       // This still will detect consecutive loads, but we might have "holes"
1335 // in some cases. It is ok for non-power-2 vectorization and may produce
1336 // better results. It should not affect current vectorization.
1337       return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1338                          : LookAheadHeuristics::ScoreReversedLoads;
1339     }
1340
1341 auto *C1 = dyn_cast<Constant>(V1);
1342 auto *C2 = dyn_cast<Constant>(V2);
1343 if (C1 && C2)
1344       return LookAheadHeuristics::ScoreConstants;
1345
1346 // Extracts from consecutive indexes of the same vector better score as
1347 // the extracts could be optimized away.
1348 Value *EV1;
1349 ConstantInt *Ex1Idx;
1350 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1351 // Undefs are always profitable for extractelements.
1352 // Compiler can easily combine poison and extractelement <non-poison> or
1353 // undef and extractelement <poison>. But combining undef +
1354 // extractelement <non-poison-but-may-produce-poison> requires some
1355 // extra operations.
1356 if (isa<UndefValue>(V2))
1357 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1358                      ? LookAheadHeuristics::ScoreConsecutiveExtracts
1359                      : LookAheadHeuristics::ScoreSameOpcode;
1360         Value *EV2 = nullptr;
1361 ConstantInt *Ex2Idx = nullptr;
1362 if (match(V2,
1363                   m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
1364                                                          m_Undef())))) {
1365 // Undefs are always profitable for extractelements.
1366 if (!Ex2Idx)
1367             return LookAheadHeuristics::ScoreConsecutiveExtracts;
1368           if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1369             return LookAheadHeuristics::ScoreConsecutiveExtracts;
1370           if (EV2 == EV1) {
1371 int Idx1 = Ex1Idx->getZExtValue();
1372 int Idx2 = Ex2Idx->getZExtValue();
1373 int Dist = Idx2 - Idx1;
1374 // The distance is too large - still may be profitable to use
1375 // shuffles.
1376             if (std::abs(Dist) == 0)
1377               return LookAheadHeuristics::ScoreSplat;
1378             if (std::abs(Dist) > NumLanes / 2)
1379               return LookAheadHeuristics::ScoreSameOpcode;
1380             return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1381                               : LookAheadHeuristics::ScoreReversedExtracts;
1382           }
1383           return LookAheadHeuristics::ScoreAltOpcodes;
1384         }
1385         return LookAheadHeuristics::ScoreFail;
1386       }
1387
1388 auto *I1 = dyn_cast<Instruction>(V1);
1389 auto *I2 = dyn_cast<Instruction>(V2);
1390 if (I1 && I2) {
1391 if (I1->getParent() != I2->getParent())
1392         return LookAheadHeuristics::ScoreFail;
1393       SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1394 Ops.push_back(I1);
1395 Ops.push_back(I2);
1396 InstructionsState S = getSameOpcode(Ops, TLI);
1397 // Note: Only consider instructions with <= 2 operands to avoid
1398 // complexity explosion.
1399 if (S.getOpcode() &&
1400 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1401 !S.isAltShuffle()) &&
1402 all_of(Ops, [&S](Value *V) {
1403 return cast<Instruction>(V)->getNumOperands() ==
1404 S.MainOp->getNumOperands();
1405 }))
1406 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1407                                   : LookAheadHeuristics::ScoreSameOpcode;
1408       }
1409
1410 if (isa<UndefValue>(V2))
1411         return LookAheadHeuristics::ScoreUndef;
1412
1413       return LookAheadHeuristics::ScoreFail;
1414     }
1415
1416 /// Go through the operands of \p LHS and \p RHS recursively until
1417     /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1418 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1419 /// of \p U1 and \p U2), except at the beginning of the recursion where
1420 /// these are set to nullptr.
1421 ///
1422 /// For example:
1423 /// \verbatim
1424 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1425 /// \ / \ / \ / \ /
1426 /// + + + +
1427 /// G1 G2 G3 G4
1428 /// \endverbatim
1429 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1430 /// each level recursively, accumulating the score. It starts from matching
1431 /// the additions at level 0, then moves on to the loads (level 1). The
1432 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1433 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1434 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1435 /// Please note that the order of the operands does not matter, as we
1436 /// evaluate the score of all profitable combinations of operands. In
1437 /// other words the score of G1 and G4 is the same as G1 and G2. This
1438 /// heuristic is based on ideas described in:
1439 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1440 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1441 /// Luís F. W. Góes
1442     int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1443                            Instruction *U2, int CurrLevel,
1444 ArrayRef<Value *> MainAltOps) const {
1445
1446 // Get the shallow score of V1 and V2.
1447 int ShallowScoreAtThisLevel =
1448 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1449
1450 // If reached MaxLevel,
1451 // or if V1 and V2 are not instructions,
1452 // or if they are SPLAT,
1453 // or if they are not consecutive,
1454 // or if profitable to vectorize loads or extractelements, early return
1455 // the current cost.
1456 auto *I1 = dyn_cast<Instruction>(LHS);
1457 auto *I2 = dyn_cast<Instruction>(RHS);
1458 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1459 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1460 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1461 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1462 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1463 ShallowScoreAtThisLevel))
1464 return ShallowScoreAtThisLevel;
1465 assert(I1 && I2 && "Should have early exited.");
1466
1467 // Contains the I2 operand indexes that got matched with I1 operands.
1468 SmallSet<unsigned, 4> Op2Used;
1469
1470 // Recursion towards the operands of I1 and I2. We are trying all possible
1471 // operand pairs, and keeping track of the best score.
1472 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1473 OpIdx1 != NumOperands1; ++OpIdx1) {
1474 // Try to pair op1I with the best operand of I2.
1475 int MaxTmpScore = 0;
1476 unsigned MaxOpIdx2 = 0;
1477 bool FoundBest = false;
1478 // If I2 is commutative try all combinations.
1479 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1480 unsigned ToIdx = isCommutative(I2)
1481 ? I2->getNumOperands()
1482 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1483 assert(FromIdx <= ToIdx && "Bad index");
1484 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1485 // Skip operands already paired with OpIdx1.
1486 if (Op2Used.count(OpIdx2))
1487 continue;
1488 // Recursively calculate the cost at each level
1489 int TmpScore =
1490 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1491 I1, I2, CurrLevel + 1, std::nullopt);
1492 // Look for the best score.
1493 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1494 TmpScore > MaxTmpScore) {
1495 MaxTmpScore = TmpScore;
1496 MaxOpIdx2 = OpIdx2;
1497 FoundBest = true;
1498 }
1499 }
1500 if (FoundBest) {
1501 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1502 Op2Used.insert(MaxOpIdx2);
1503 ShallowScoreAtThisLevel += MaxTmpScore;
1504 }
1505 }
1506 return ShallowScoreAtThisLevel;
1507 }
1508 };
1509 /// A helper data structure to hold the operands of a vector of instructions.
1510 /// This supports a fixed vector length for all operand vectors.
1511   class VLOperands {
1512     /// For each operand we need (i) the value, and (ii) the opcode that it
1513 /// would be attached to if the expression was in a left-linearized form.
1514 /// This is required to avoid illegal operand reordering.
1515 /// For example:
1516 /// \verbatim
1517 /// 0 Op1
1518 /// |/
1519 /// Op1 Op2 Linearized + Op2
1520 /// \ / ----------> |/
1521 /// - -
1522 ///
1523 /// Op1 - Op2 (0 + Op1) - Op2
1524 /// \endverbatim
1525 ///
1526 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1527 ///
1528 /// Another way to think of this is to track all the operations across the
1529 /// path from the operand all the way to the root of the tree and to
1530 /// calculate the operation that corresponds to this path. For example, the
1531 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1532 /// corresponding operation is a '-' (which matches the one in the
1533 /// linearized tree, as shown above).
1534 ///
1535 /// For lack of a better term, we refer to this operation as Accumulated
1536 /// Path Operation (APO).
1537 struct OperandData {
1538 OperandData() = default;
1539 OperandData(Value *V, bool APO, bool IsUsed)
1540 : V(V), APO(APO), IsUsed(IsUsed) {}
1541 /// The operand value.
1542 Value *V = nullptr;
1543 /// TreeEntries only allow a single opcode, or an alternate sequence of
1544 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
1545 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1546 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1547 /// (e.g., Add/Mul)
1548 bool APO = false;
1549 /// Helper data for the reordering function.
1550 bool IsUsed = false;
1551 };
1552
1553 /// During operand reordering, we are trying to select the operand at lane
1554 /// that matches best with the operand at the neighboring lane. Our
1555 /// selection is based on the type of value we are looking for. For example,
1556 /// if the neighboring lane has a load, we need to look for a load that is
1557 /// accessing a consecutive address. These strategies are summarized in the
1558 /// 'ReorderingMode' enumerator.
1559 enum class ReorderingMode {
1560 Load, ///< Matching loads to consecutive memory addresses
1561 Opcode, ///< Matching instructions based on opcode (same or alternate)
1562 Constant, ///< Matching constants
1563 Splat, ///< Matching the same instruction multiple times (broadcast)
1564 Failed, ///< We failed to create a vectorizable group
1565 };
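// For example (illustrative): when the modes are initialized in reorder() below,
// a first-lane operand that is a LoadInst selects ReorderingMode::Load, an
// instruction operand repeated across the lanes selects Splat, other
// instructions select Opcode, and constants select Constant; Failed is set when
// getBestOperand() cannot find a match for some operand index.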
1566
1567 using OperandDataVec = SmallVector<OperandData, 2>;
1568
1569 /// A vector of operand vectors.
1570 SmallVector<OperandDataVec, 4> OpsVec;
1571
1572 const TargetLibraryInfo &TLI;
1573 const DataLayout &DL;
1574 ScalarEvolution &SE;
1575 const BoUpSLP &R;
1576
1577 /// \returns the operand data at \p OpIdx and \p Lane.
1578 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1579 return OpsVec[OpIdx][Lane];
1580 }
1581
1582 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1583 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1584 return OpsVec[OpIdx][Lane];
1585 }
1586
1587 /// Clears the used flag for all entries.
1588 void clearUsed() {
1589 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1590 OpIdx != NumOperands; ++OpIdx)
1591 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1592 ++Lane)
1593 OpsVec[OpIdx][Lane].IsUsed = false;
1594 }
1595
1596 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1597 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1598 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1599 }
1600
1601 /// \param Lane lane of the operands under analysis.
1602 /// \param OpIdx operand index in lane \p Lane for which we are looking
1603 /// for the best candidate.
1604 /// \param Idx operand index of the current candidate value.
1605 /// \returns The additional score due to possible broadcasting of the
1606 /// elements in the lane. It is more profitable to have a power-of-2 number
1607 /// of unique elements in the lane, since it will be vectorized with higher
1608 /// probability after removing duplicates. Currently the SLP vectorizer
1609 /// supports only vectorization of a power-of-2 number of unique scalars.
1610 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1611 Value *IdxLaneV = getData(Idx, Lane).V;
1612 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1613 return 0;
1614 SmallPtrSet<Value *, 4> Uniques;
1615 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1616 if (Ln == Lane)
1617 continue;
1618 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1619 if (!isa<Instruction>(OpIdxLnV))
1620 return 0;
1621 Uniques.insert(OpIdxLnV);
1622 }
1623 int UniquesCount = Uniques.size();
1624 int UniquesCntWithIdxLaneV =
1625 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1626 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1627 int UniquesCntWithOpIdxLaneV =
1628 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1629 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1630 return 0;
1631 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1632 UniquesCntWithOpIdxLaneV) -
1633 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1634 }
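// Worked example (illustrative): suppose the other lanes contribute
// Uniques = {A, B}. A candidate equal to A keeps the unique count at 2 (a
// power of 2, no padding waste), while keeping a new value C would raise it
// to 3 (padded to 4, waste 1); the score is then (4 - 3) - (2 - 2) = 1 in
// favor of the candidate that preserves a power-of-2 number of unique values.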
1635
1636 /// \param Lane lane of the operands under analysis.
1637 /// \param OpIdx operand index in lane \p Lane for which we are looking
1638 /// for the best candidate.
1639 /// \param Idx operand index of the current candidate value.
1640 /// \returns The additional score for the scalar whose users are all
1641 /// vectorized.
1642 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1643 Value *IdxLaneV = getData(Idx, Lane).V;
1644 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1645 // Do not care about number of uses for vector-like instructions
1646 // (extractelement/extractvalue with constant indices), they are extracts
1647 // themselves and already externally used. Vectorization of such
1648 // instructions does not add extra extractelement instruction, just may
1649 // remove it.
1650 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1651 isVectorLikeInstWithConstOps(OpIdxLaneV))
1652 return LookAheadHeuristics::ScoreAllUserVectorized;
1653 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1654 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1655 return 0;
1656 return R.areAllUsersVectorized(IdxLaneI)
1657 ? LookAheadHeuristics::ScoreAllUserVectorized
1658 : 0;
1659 }
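// For example (illustrative): if the candidate scalar's users have all been
// vectorized already, ScoreAllUserVectorized is returned; getLookAheadScore()
// adds it on top of the scaled score, so it only breaks ties between otherwise
// equally compatible operands.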
1660
1661 /// Score scaling factor for fully compatible instructions but with
1662 /// different number of external uses. Allows better selection of the
1663 /// instructions with less external uses.
1664 static const int ScoreScaleFactor = 10;
1665
1666 /// \Returns the look-ahead score, which tells us how much the sub-trees
1667 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1668 /// score. This helps break ties in an informed way when we cannot decide on
1669 /// the order of the operands by just considering the immediate
1670 /// predecessors.
1671 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1672 int Lane, unsigned OpIdx, unsigned Idx,
1673 bool &IsUsed) {
1674 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1675 LookAheadMaxDepth);
1676 // Keep track of the instruction stack as we recurse into the operands
1677 // during the look-ahead score exploration.
1678 int Score =
1679 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1680 /*CurrLevel=*/1, MainAltOps);
1681 if (Score) {
1682 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1683 if (Score <= -SplatScore) {
1684 // Set the minimum score for splat-like sequence to avoid setting
1685 // failed state.
1686 Score = 1;
1687 } else {
1688 Score += SplatScore;
1689 // Scale score to see the difference between different operands
1690 // and similar operands but all vectorized/not all vectorized
1691 // uses. It does not affect actual selection of the best
1692 // compatible operand in general, just allows to select the
1693 // operand with all vectorized uses.
1694 Score *= ScoreScaleFactor;
1695 Score += getExternalUseScore(Lane, OpIdx, Idx);
1696 IsUsed = true;
1697 }
1698 }
1699 return Score;
1700 }
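// Worked example (illustrative): a raw look-ahead score of 2 with a splat
// score of 1 becomes (2 + 1) * ScoreScaleFactor = 30, and an external-use
// bonus of, say, 1 yields 31; because of the x10 scaling the bonus can only
// break ties and never outweighs a genuinely better-matching operand.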
1701
1702 /// Best defined scores per lanes between the passes. Used to choose the
1703 /// best operand (with the highest score) between the passes.
1704 /// The key - {Operand Index, Lane}.
1705 /// The value - the best score between the passes for the lane and the
1706 /// operand.
1707 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1708 BestScoresPerLanes;
1709
1710 // Search all operands in Ops[*][Lane] for the one that matches best
1711 // Ops[OpIdx][LastLane] and return its operand index.
1712 // If no good match can be found, return std::nullopt.
1713 std::optional<unsigned>
1714 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1715 ArrayRef<ReorderingMode> ReorderingModes,
1716 ArrayRef<Value *> MainAltOps) {
1717 unsigned NumOperands = getNumOperands();
1718
1719 // The operand of the previous lane at OpIdx.
1720 Value *OpLastLane = getData(OpIdx, LastLane).V;
1721
1722 // Our strategy mode for OpIdx.
1723 ReorderingMode RMode = ReorderingModes[OpIdx];
1724 if (RMode == ReorderingMode::Failed)
1725 return std::nullopt;
1726
1727 // The linearized opcode of the operand at OpIdx, Lane.
1728 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1729
1730 // The best operand index and its score.
1731 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1732 // are using the score to differentiate between the two.
1733 struct BestOpData {
1734 std::optional<unsigned> Idx;
1735 unsigned Score = 0;
1736 } BestOp;
1737 BestOp.Score =
1738 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1739 .first->second;
1740
1741 // Track if the operand must be marked as used. If the operand is set to
1742 // Score 1 explicitly (because of non-power-of-2 unique scalars), we may
1743 // want to reestimate the operands again on the following iterations.
1744 bool IsUsed =
1745 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1746 // Iterate through all unused operands and look for the best.
1747 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1748 // Get the operand at Idx and Lane.
1749 OperandData &OpData = getData(Idx, Lane);
1750 Value *Op = OpData.V;
1751 bool OpAPO = OpData.APO;
1752
1753 // Skip already selected operands.
1754 if (OpData.IsUsed)
1755 continue;
1756
1757 // Skip if we are trying to move the operand to a position with a
1758 // different opcode in the linearized tree form. This would break the
1759 // semantics.
1760 if (OpAPO != OpIdxAPO)
1761 continue;
1762
1763 // Look for an operand that matches the current mode.
1764 switch (RMode) {
1765 case ReorderingMode::Load:
1766 case ReorderingMode::Constant:
1767 case ReorderingMode::Opcode: {
1768 bool LeftToRight = Lane > LastLane;
1769 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1770 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1771 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1772 OpIdx, Idx, IsUsed);
1773 if (Score > static_cast<int>(BestOp.Score)) {
1774 BestOp.Idx = Idx;
1775 BestOp.Score = Score;
1776 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1777 }
1778 break;
1779 }
1780 case ReorderingMode::Splat:
1781 if (Op == OpLastLane)
1782 BestOp.Idx = Idx;
1783 break;
1784 case ReorderingMode::Failed:
1785 llvm_unreachable("Not expected Failed reordering mode.");
1786 }
1787 }
1788
1789 if (BestOp.Idx) {
1790 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1791 return BestOp.Idx;
1792 }
1793 // If we could not find a good match return std::nullopt.
1794 return std::nullopt;
1795 }
1796
1797 /// Helper for reorderOperandVecs.
1798 /// \returns the lane that we should start reordering from. This is the one
1799 /// which has the fewest operands that can freely move about, or is less
1800 /// profitable to reorder because it already has the most optimal operands.
1801 unsigned getBestLaneToStartReordering() const {
1802 unsigned Min = UINT_MAX;
1803 unsigned SameOpNumber = 0;
1804 // std::pair<unsigned, unsigned> is used to implement a simple voting
1805 // algorithm and choose the lane with the least number of operands that
1806 // can freely move about, or is less profitable to reorder because it has the
1807 // most optimal set of operands. The first unsigned is a counter for
1808 // voting, the second unsigned is the counter of lanes with instructions
1809 // with same/alternate opcodes and same parent basic block.
1810 SmallDenseMap<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
1811 // Try to be closer to the original results, if we have multiple lanes
1812 // with same cost. If 2 lanes have the same cost, use the one with the
1813 // lowest index.
1814 for (int I = getNumLanes(); I > 0; --I) {
1815 unsigned Lane = I - 1;
1816 OperandsOrderData NumFreeOpsHash =
1817 getMaxNumOperandsThatCanBeReordered(Lane);
1818 // Compare the number of operands that can move and choose the one with
1819 // the least number.
1820 if (NumFreeOpsHash.NumOfAPOs < Min) {
1821 Min = NumFreeOpsHash.NumOfAPOs;
1822 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1823 HashMap.clear();
1824 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1825 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1826 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1827 // Select the most optimal lane in terms of number of operands that
1828 // should be moved around.
1829 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1830 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1831 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1832 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1833 auto *It = HashMap.find(NumFreeOpsHash.Hash);
1834 if (It == HashMap.end())
1835 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1836 else
1837 ++It->second.first;
1838 }
1839 }
1840 // Select the lane with the minimum counter.
1841 unsigned BestLane = 0;
1842 unsigned CntMin = UINT_MAX;
1843 for (const auto &Data : reverse(HashMap)) {
1844 if (Data.second.first < CntMin) {
1845 CntMin = Data.second.first;
1846 BestLane = Data.second.second;
1847 }
1848 }
1849 return BestLane;
1850 }
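// For example (illustrative): in the four-lane example in reorder() below, the
// subtraction lanes have one fixed (APO == true) operand each, so they report
// a smaller NumOfAPOs than the addition lanes, win the vote, and reordering
// starts from one of them.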
1851
1852 /// Data structure that helps to reorder operands.
1853 struct OperandsOrderData {
1854 /// The best number of operands with the same APOs, which can be
1855 /// reordered.
1856 unsigned NumOfAPOs = UINT_MAX;
1857 /// Number of operands with the same/alternate instruction opcode and
1858 /// parent.
1859 unsigned NumOpsWithSameOpcodeParent = 0;
1860 /// Hash for the actual operands ordering.
1861 /// Used to count operands, actually their position id and opcode
1862 /// value. It is used in the voting mechanism to find the lane with the
1863 /// fewest operands that can freely move about, or that is less profitable
1864 /// because it already has the most optimal set of operands. It can be
1865 /// replaced with a SmallVector<unsigned> instead, but a hash code is faster
1866 /// and requires less memory.
1867 unsigned Hash = 0;
1868 };
1869 /// \returns the maximum number of operands that are allowed to be reordered
1870 /// for \p Lane and the number of compatible instructions (with the same
1871 /// parent/opcode). This is used as a heuristic for selecting the first lane
1872 /// to start operand reordering.
1873 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1874 unsigned CntTrue = 0;
1875 unsigned NumOperands = getNumOperands();
1876 // Operands with the same APO can be reordered. We therefore need to count
1877 // how many of them we have for each APO, like this: Cnt[APO] = x.
1878 // Since we only have two APOs, namely true and false, we can avoid using
1879 // a map. Instead we can simply count the number of operands that
1880 // correspond to one of them (in this case the 'true' APO), and calculate
1881 // the other by subtracting it from the total number of operands.
1882 // Operands with the same instruction opcode and parent are more
1883 // profitable since we don't need to move them in many cases, with a high
1884 // probability such lane already can be vectorized effectively.
1885 bool AllUndefs = true;
1886 unsigned NumOpsWithSameOpcodeParent = 0;
1887 Instruction *OpcodeI = nullptr;
1888 BasicBlock *Parent = nullptr;
1889 unsigned Hash = 0;
1890 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1891 const OperandData &OpData = getData(OpIdx, Lane);
1892 if (OpData.APO)
1893 ++CntTrue;
1894 // Use Boyer-Moore majority voting for finding the majority opcode and
1895 // the number of times it occurs.
1896 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1897 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
1898 I->getParent() != Parent) {
1899 if (NumOpsWithSameOpcodeParent == 0) {
1900 NumOpsWithSameOpcodeParent = 1;
1901 OpcodeI = I;
1902 Parent = I->getParent();
1903 } else {
1904 --NumOpsWithSameOpcodeParent;
1905 }
1906 } else {
1907 ++NumOpsWithSameOpcodeParent;
1908 }
1909 }
1910 Hash = hash_combine(
1911 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1912 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1913 }
1914 if (AllUndefs)
1915 return {};
1916 OperandsOrderData Data;
1917 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
1918 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
1919 Data.Hash = Hash;
1920 return Data;
1921 }
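// Worked example (illustrative): a lane whose operands are {add, add, mul}
// leaves the Boyer-Moore vote with 'add' and a surviving count of 1, whereas
// {add, add, add} ends with a count of 3; this count and NumOfAPOs feed the
// per-lane vote in getBestLaneToStartReordering() above.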
1922
1923 /// Go through the instructions in VL and append their operands.
1924 void appendOperandsOfVL(ArrayRef<Value *> VL) {
1925 assert(!VL.empty() && "Bad VL");
1926 assert((empty() || VL.size() == getNumLanes()) &&
1927 "Expected same number of lanes");
1928 assert(isa<Instruction>(VL[0]) && "Expected instruction");
1929 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1930 OpsVec.resize(NumOperands);
1931 unsigned NumLanes = VL.size();
1932 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1933 OpsVec[OpIdx].resize(NumLanes);
1934 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1935 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
1936 // Our tree has just 3 nodes: the root and two operands.
1937 // It is therefore trivial to get the APO. We only need to check the
1938 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
1939 // RHS operand. The LHS operand of both add and sub is never attached
1940 // to an inverse operation in the linearized form, therefore its APO
1941 // is false. The RHS is true only if VL[Lane] is an inverse operation.
1942
1943 // Since operand reordering is performed on groups of commutative
1944 // operations or alternating sequences (e.g., +, -), we can safely
1945 // tell the inverse operations by checking commutativity.
1946 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
1947 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
1948 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
1949 APO, false};
1950 }
1951 }
1952 }
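// For example (illustrative): for VL = {X = A + B, Y = C - D}, operand index 0
// receives {A, C} with APO == false in both lanes, while operand index 1
// receives {B, D} with APO == false for the commutative add and APO == true
// for the non-commutative sub, as described in the comment above.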
1953
1954 /// \returns the number of operands.
1955 unsigned getNumOperands() const { return OpsVec.size(); }
1956
1957 /// \returns the number of lanes.
1958 unsigned getNumLanes() const { return OpsVec[0].size(); }
1959
1960 /// \returns the operand value at \p OpIdx and \p Lane.
1961 Value *getValue(unsigned OpIdx, unsigned Lane) const {
1962 return getData(OpIdx, Lane).V;
1963 }
1964
1965 /// \returns true if the data structure is empty.
1966 bool empty() const { return OpsVec.empty(); }
1967
1968 /// Clears the data.
1969 void clear() { OpsVec.clear(); }
1970
1971 /// \Returns true if there are enough operands identical to \p Op to fill
1972 /// the whole vector.
1973 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
1974 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
1975 bool OpAPO = getData(OpIdx, Lane).APO;
1976 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
1977 if (Ln == Lane)
1978 continue;
1979 // This is set to true if we found a candidate for broadcast at Lane.
1980 bool FoundCandidate = false;
1981 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
1982 OperandData &Data = getData(OpI, Ln);
1983 if (Data.APO != OpAPO || Data.IsUsed)
1984 continue;
1985 if (Data.V == Op) {
1986 FoundCandidate = true;
1987 Data.IsUsed = true;
1988 break;
1989 }
1990 }
1991 if (!FoundCandidate)
1992 return false;
1993 }
1994 return true;
1995 }
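// For example (illustrative): if the value A appears as an operand (with a
// matching APO) in every other lane as well, shouldBroadcast(A, ...) returns
// true and reorder() selects the Splat strategy for that operand index, since
// a single broadcast can feed all lanes.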
1996
1997 public:
1998 /// Initialize with all the operands of the instruction vector \p RootVL.
1999 VLOperands(ArrayRef<Value *> RootVL, const TargetLibraryInfo &TLI,
2000 const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
2001 : TLI(TLI), DL(DL), SE(SE), R(R) {
2002 // Append all the operands of RootVL.
2003 appendOperandsOfVL(RootVL);
2004 }
2005
2006 /// \Returns a value vector with the operands across all lanes for the
2007 /// operand at \p OpIdx.
2008 ValueList getVL(unsigned OpIdx) const {
2009 ValueList OpVL(OpsVec[OpIdx].size());
2010 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2011 "Expected same num of lanes across all operands");
2012 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2013 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2014 return OpVL;
2015 }
2016
2017 // Performs operand reordering for 2 or more operands.
2018 // The original operands are in OrigOps[OpIdx][Lane].
2019 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2020 void reorder() {
2021 unsigned NumOperands = getNumOperands();
2022 unsigned NumLanes = getNumLanes();
2023 // Each operand has its own mode. We are using this mode to help us select
2024 // the instructions for each lane, so that they match best with the ones
2025 // we have selected so far.
2026 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2027
2028 // This is a greedy single-pass algorithm. We are going over each lane
2029 // once and deciding on the best order right away with no back-tracking.
2030 // However, in order to increase its effectiveness, we start with the lane
2031 // that has operands that can move the least. For example, given the
2032 // following lanes:
2033 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2034 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2035 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2036 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2037 // we will start at Lane 1, since the operands of the subtraction cannot
2038 // be reordered. Then we will visit the rest of the lanes in a circular
2039 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2040
2041 // Find the first lane that we will start our search from.
2042 unsigned FirstLane = getBestLaneToStartReordering();
2043
2044 // Initialize the modes.
2045 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2046 Value *OpLane0 = getValue(OpIdx, FirstLane);
2047 // Keep track if we have instructions with all the same opcode on one
2048 // side.
2049 if (isa<LoadInst>(OpLane0))
2050 ReorderingModes[OpIdx] = ReorderingMode::Load;
2051 else if (isa<Instruction>(OpLane0)) {
2052 // Check if OpLane0 should be broadcast.
2053 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2054 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2055 else
2056 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2057 }
2058 else if (isa<Constant>(OpLane0))
2059 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2060 else if (isa<Argument>(OpLane0))
2061 // Our best hope is a Splat. It may save some cost in some cases.
2062 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2063 else
2064 // NOTE: This should be unreachable.
2065 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2066 }
2067
2068 // Check that we don't have same operands. No need to reorder if operands
2069 // are just a perfect diamond or shuffled diamond match; the only exceptions
2070 // (for now) are possible broadcasts and a non-power-of-2 number of
2071 // scalars.
2072 auto &&SkipReordering = [this]() {
2073 SmallPtrSet<Value *, 4> UniqueValues;
2074 ArrayRef<OperandData> Op0 = OpsVec.front();
2075 for (const OperandData &Data : Op0)
2076 UniqueValues.insert(Data.V);
2077 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2078 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2079 return !UniqueValues.contains(Data.V);
2080 }))
2081 return false;
2082 }
2083 // TODO: Check if we can remove a check for non-power-2 number of
2084 // scalars after full support of non-power-2 vectorization.
2085 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2086 };
2087
2088 // If the initial strategy fails for any of the operand indexes, then we
2089 // perform reordering again in a second pass. This helps avoid assigning
2090 // high priority to the failed strategy, and should improve reordering for
2091 // the non-failed operand indexes.
2092 for (int Pass = 0; Pass != 2; ++Pass) {
2093 // Check if there is no need to reorder the operands, since they are a
2094 // perfect or shuffled diamond match.
2095 // Need to do it to avoid extra external use cost counting for
2096 // shuffled matches, which may cause regressions.
2097 if (SkipReordering())
2098 break;
2099 // Skip the second pass if the first pass did not fail.
2100 bool StrategyFailed = false;
2101 // Mark all operand data as free to use.
2102 clearUsed();
2103 // We keep the original operand order for the FirstLane, so reorder the
2104 // rest of the lanes. We are visiting the nodes in a circular fashion,
2105 // using FirstLane as the center point and increasing the radius
2106 // distance.
2107 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2108 for (unsigned I = 0; I < NumOperands; ++I)
2109 MainAltOps[I].push_back(getData(I, FirstLane).V);
2110
2111 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2112 // Visit the lane on the right and then the lane on the left.
2113 for (int Direction : {+1, -1}) {
2114 int Lane = FirstLane + Direction * Distance;
2115 if (Lane < 0 || Lane >= (int)NumLanes)
2116 continue;
2117 int LastLane = Lane - Direction;
2118 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2119 "Out of bounds");
2120 // Look for a good match for each operand.
2121 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2122 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2123 std::optional<unsigned> BestIdx = getBestOperand(
2124 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2125 // By not selecting a value, we allow the operands that follow to
2126 // select a better matching value. We will get a non-null value in
2127 // the next run of getBestOperand().
2128 if (BestIdx) {
2129 // Swap the current operand with the one returned by
2130 // getBestOperand().
2131 swap(OpIdx, *BestIdx, Lane);
2132 } else {
2133 // We failed to find a best operand, set mode to 'Failed'.
2134 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2135 // Enable the second pass.
2136 StrategyFailed = true;
2137 }
2138 // Try to get the alternate opcode and follow it during analysis.
2139 if (MainAltOps[OpIdx].size() != 2) {
2140 OperandData &AltOp = getData(OpIdx, Lane);
2141 InstructionsState OpS =
2142 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2143 if (OpS.getOpcode() && OpS.isAltShuffle())
2144 MainAltOps[OpIdx].push_back(AltOp.V);
2145 }
2146 }
2147 }
2148 }
2149 // Skip second pass if the strategy did not fail.
2150 if (!StrategyFailed)
2151 break;
2152 }
2153 }
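// For example (illustrative): with FirstLane == 1 and four lanes, the Distance
// loop above visits lanes in the order 2, 0, 3 (right neighbor first, then
// left), matching the "Visited" annotations in the comment above; any operand
// index that fails to find a match flips to ReorderingMode::Failed, which
// triggers the second pass.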
2154
2155#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2156 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2157 switch (RMode) {
2158 case ReorderingMode::Load:
2159 return "Load";
2160 case ReorderingMode::Opcode:
2161 return "Opcode";
2162 case ReorderingMode::Constant:
2163 return "Constant";
2164 case ReorderingMode::Splat:
2165 return "Splat";
2166 case ReorderingMode::Failed:
2167 return "Failed";
2168 }
2169 llvm_unreachable("Unimplemented Reordering Type");
2170 }
2171
2172 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2173 raw_ostream &OS) {
2174 return OS << getModeStr(RMode);
2175 }
2176
2177 /// Debug print.
2178 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2179 printMode(RMode, dbgs());
2180 }
2181
2182 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2183 return printMode(RMode, OS);
2184 }
2185
2186 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2187 const unsigned Indent = 2;
2188 unsigned Cnt = 0;
2189 for (const OperandDataVec &OpDataVec : OpsVec) {
2190 OS << "Operand " << Cnt++ << "\n";
2191 for (const OperandData &OpData : OpDataVec) {
2192 OS.indent(Indent) << "{";
2193 if (Value *V = OpData.V)
2194 OS << *V;
2195 else
2196 OS << "null";
2197 OS << ", APO:" << OpData.APO << "}\n";
2198 }
2199 OS << "\n";
2200 }
2201 return OS;
2202 }
2203
2204 /// Debug print.
2205 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2206#endif
2207 };
2208
2209 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2210 /// for the pair that has the highest score and is deemed to have the best
2211 /// chance to form the root of a profitable tree to vectorize. Return
2212 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2213 /// \param Limit The lower limit of the score, considered to be good enough.
2214 std::optional<int>
2215 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2216 int Limit = LookAheadHeuristics::ScoreFail) {
2217 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2218 RootLookAheadMaxDepth);
2219 int BestScore = Limit;
2220 std::optional<int> Index;
2221 for (int I : seq<int>(0, Candidates.size())) {
2222 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2223 Candidates[I].second,
2224 /*U1=*/nullptr, /*U2=*/nullptr,
2225 /*Level=*/1, std::nullopt);
2226 if (Score > BestScore) {
2227 BestScore = Score;
2228 Index = I;
2229 }
2230 }
2231 return Index;
2232 }
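// For example (illustrative): given root candidates {(a0, a1), (b0, b1)}, the
// pair whose operand sub-trees match best under the look-ahead scoring (e.g.,
// loads from consecutive addresses) is returned as the preferred root for
// building the vectorizable tree, provided its score exceeds Limit.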
2233
2234 /// Checks if the instruction is marked for deletion.
2235 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2236
2237 /// Removes an instruction from its block and eventually deletes it.
2238 /// It's like Instruction::eraseFromParent() except that the actual deletion
2239 /// is delayed until BoUpSLP is destructed.
2240 void eraseInstruction(Instruction *I) {
2241 DeletedInstructions.insert(I);
2242 }
2243
2244 /// Checks if the instruction was already analyzed for being possible
2245 /// reduction root.
2246 bool isAnalyzedReductionRoot(Instruction *I) const {
2247 return AnalyzedReductionsRoots.count(I);
2248 }
2249 /// Register given instruction as already analyzed for being possible
2250 /// reduction root.
2251 void analyzedReductionRoot(Instruction *I) {
2252 AnalyzedReductionsRoots.insert(I);
2253 }
2254 /// Checks if the provided list of reduced values was checked already for
2255 /// vectorization.
2256 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2257 return AnalyzedReductionVals.contains(hash_value(VL));
2258 }
2259 /// Adds the list of reduced values to list of already checked values for the
2260 /// vectorization.
2261 void analyzedReductionVals(ArrayRef<Value *> VL) {
2262 AnalyzedReductionVals.insert(hash_value(VL));
2263 }
2264 /// Clear the list of the analyzed reduction root instructions.
2265 void clearReductionData() {
2266 AnalyzedReductionsRoots.clear();
2267 AnalyzedReductionVals.clear();
2268 }
2269 /// Checks if the given value is gathered in one of the nodes.
2270 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2271 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2272 }
2273
2274 /// Check if the value is vectorized in the tree.
2275 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2276
2277 ~BoUpSLP();
2278
2279private:
2280 /// Determine if a vectorized value \p V can be demoted to
2281 /// a smaller type with a truncation. We collect the values that will be
2282 /// demoted in ToDemote and additional roots that require investigating in
2283 /// Roots.
2284 /// \param DemotedConsts list of Instruction/OperandIndex pairs that are
2285 /// constant and to be demoted. Required to correctly identify constant nodes
2286 /// to be demoted.
2287 bool collectValuesToDemote(
2288 Value *V, SmallVectorImpl<Value *> &ToDemote,
2289 DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
2290 SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const;
2291
2292 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2293 /// reordering (i.e. the operands can be reordered because they have only one
2294 /// user and are reorderable).
2295 /// \param ReorderableGathers List of all gather nodes that require reordering
2296 /// (e.g., gather of extractelements or partially vectorizable loads).
2297 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2298 /// reordering, subset of \p NonVectorized.
2299 bool
2300 canReorderOperands(TreeEntry *UserTE,
2301 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2302 ArrayRef<TreeEntry *> ReorderableGathers,
2303 SmallVectorImpl<TreeEntry *> &GatherOps);
2304
2305 /// Checks if the given \p TE is a gather node with clustered reused scalars
2306 /// and reorders it per given \p Mask.
2307 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2308
2309 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2310 /// if any. If it is not vectorized (gather node), returns nullptr.
2311 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2312 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2313 TreeEntry *TE = nullptr;
2314 const auto *It = find_if(VL, [&](Value *V) {
2315 TE = getTreeEntry(V);
2316 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2317 return true;
2318 auto It = MultiNodeScalars.find(V);
2319 if (It != MultiNodeScalars.end()) {
2320 for (TreeEntry *E : It->second) {
2321 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2322 TE = E;
2323 return true;
2324 }
2325 }
2326 }
2327 return false;
2328 });
2329 if (It != VL.end()) {
2330 assert(TE->isSame(VL) && "Expected same scalars.");
2331 return TE;
2332 }
2333 return nullptr;
2334 }
2335
2336 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2337 /// if any. If it is not vectorized (gather node), returns nullptr.
2338 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2339 unsigned OpIdx) const {
2340 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2341 const_cast<TreeEntry *>(UserTE), OpIdx);
2342 }
2343
2344 /// Checks if all users of \p I are the part of the vectorization tree.
2345 bool areAllUsersVectorized(
2346 Instruction *I,
2347 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2348
2349 /// Return information about the vector formed for the specified index
2350 /// of a vector of (the same) instruction.
2351 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2352
2353 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2354 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2355
2356 /// \returns the cost of the vectorizable entry.
2357 InstructionCost getEntryCost(const TreeEntry *E,
2358 ArrayRef<Value *> VectorizedVals,
2359 SmallPtrSetImpl<Value *> &CheckedExtracts);
2360
2361 /// This is the recursive part of buildTree.
2362 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2363 const EdgeInfo &EI);
2364
2365 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2366 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2367 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2368 /// returns false, setting \p CurrentOrder to either an empty vector or a
2369 /// non-identity permutation that allows reusing extract instructions.
2370 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2371 /// extract order.
2372 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2373 SmallVectorImpl<unsigned> &CurrentOrder,
2374 bool ResizeAllowed = false) const;
2375
2376 /// Vectorize a single entry in the tree.
2377 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2378 /// avoid issues with def-use order.
2379 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2380
2381 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2382 /// \p E.
2383 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2384 /// avoid issues with def-use order.
2385 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2386
2387 /// Create a new vector from a list of scalar values. Produces a sequence
2388 /// which exploits values reused across lanes, and arranges the inserts
2389 /// for ease of later optimization.
2390 template <typename BVTy, typename ResTy, typename... Args>
2391 ResTy processBuildVector(const TreeEntry *E, Args &...Params);
2392
2393 /// Create a new vector from a list of scalar values. Produces a sequence
2394 /// which exploits values reused across lanes, and arranges the inserts
2395 /// for ease of later optimization.
2396 Value *createBuildVector(const TreeEntry *E);
2397
2398 /// Returns the instruction in the bundle, which can be used as a base point
2399 /// for scheduling. Usually it is the last instruction in the bundle, except
2400 /// for the case when all operands are external (in this case, it is the first
2401 /// instruction in the list).
2402 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2403
2404 /// Tries to find extractelement instructions with constant indices from fixed
2405 /// vector type and gather such instructions into a bunch, which highly likely
2406 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2407 /// was successful, the matched scalars are replaced by poison values in \p VL
2408 /// for future analysis.
2409 std::optional<TargetTransformInfo::ShuffleKind>
2410 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2411 SmallVectorImpl<int> &Mask) const;
2412
2413 /// Tries to find extractelement instructions with constant indices from fixed
2414 /// vector type and gather such instructions into a bunch, which highly likely
2415 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2416 /// was successful, the matched scalars are replaced by poison values in \p VL
2417 /// for future analysis.
2418 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2419 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2420 SmallVectorImpl<int> &Mask,
2421 unsigned NumParts) const;
2422
2423 /// Checks if the gathered \p VL can be represented as a single register
2424 /// shuffle(s) of previous tree entries.
2425 /// \param TE Tree entry checked for permutation.
2426 /// \param VL List of scalars (a subset of the TE scalar), checked for
2427 /// permutations. Must form single-register vector.
2428 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2429 /// commands to build the mask using the original vector value, without
2430 /// relying on the potential reordering.
2431 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2432 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2433 std::optional<TargetTransformInfo::ShuffleKind>
2434 isGatherShuffledSingleRegisterEntry(
2435 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2436 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2437 bool ForOrder);
2438
2439 /// Checks if the gathered \p VL can be represented as multi-register
2440 /// shuffle(s) of previous tree entries.
2441 /// \param TE Tree entry checked for permutation.
2442 /// \param VL List of scalars (a subset of the TE scalar), checked for
2443 /// permutations.
2444 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2445 /// commands to build the mask using the original vector value, without
2446 /// relying on the potential reordering.
2447 /// \returns per-register series of ShuffleKind, if gathered values can be
2448 /// represented as shuffles of previous tree entries. \p Mask is filled with
2449 /// the shuffle mask (also on per-register base).
2450 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2451 isGatherShuffledEntry(
2452 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2453 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2454 unsigned NumParts, bool ForOrder = false);
2455
2456 /// \returns the scalarization cost for this list of values. Assuming that
2457 /// this subtree gets vectorized, we may need to extract the values from the
2458 /// roots. This method calculates the cost of extracting the values.
2459 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2460 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
2461
2462 /// Set the Builder insert point to one after the last instruction in
2463 /// the bundle
2464 void setInsertPointAfterBundle(const TreeEntry *E);
2465
2466 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2467 /// specified, the starting vector value is poison.
2468 Value *gather(ArrayRef<Value *> VL, Value *Root);
2469
2470 /// \returns whether the VectorizableTree is fully vectorizable and will
2471 /// be beneficial even if the tree height is tiny.
2472 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2473
2474 /// Reorder commutative or alt operands to get better probability of
2475 /// generating vectorized code.
2476 static void reorderInputsAccordingToOpcode(
2477 ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
2478 SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
2479 const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R);
2480
2481 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2482 /// users of \p TE and collects the stores. It returns the map from the store
2483 /// pointers to the collected stores.
2484 DenseMap<Value *, SmallVector<StoreInst *>>
2485 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2486
2487 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2488 /// stores in \p StoresVec can form a vector instruction. If so it returns
2489 /// true and populates \p ReorderIndices with the shuffle indices of the
2490 /// stores when compared to the sorted vector.
2491 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2492 OrdersType &ReorderIndices) const;
2493
2494 /// Iterates through the users of \p TE, looking for scalar stores that can be
2495 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2496 /// their order and builds an order index vector for each store bundle. It
2497 /// returns all these order vectors found.
2498 /// We run this after the tree has formed, otherwise we may come across user
2499 /// instructions that are not yet in the tree.
2500 SmallVector<OrdersType, 1>
2501 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2502
2503 struct TreeEntry {
2504 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2505 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2506
2507 /// \returns Common mask for reorder indices and reused scalars.
2508 SmallVector<int> getCommonMask() const {
2509 SmallVector<int> Mask;
2510 inversePermutation(ReorderIndices, Mask);
2511 ::addMask(Mask, ReuseShuffleIndices);
2512 return Mask;
2513 }
2514
2515 /// \returns true if the scalars in VL are equal to this entry.
2516 bool isSame(ArrayRef<Value *> VL) const {
2517 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2518 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2519 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2520 return VL.size() == Mask.size() &&
2521 std::equal(VL.begin(), VL.end(), Mask.begin(),
2522 [Scalars](Value *V, int Idx) {
2523 return (isa<UndefValue>(V) &&
2524 Idx == PoisonMaskElem) ||
2525 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2526 });
2527 };
2528 if (!ReorderIndices.empty()) {
2529 // TODO: implement matching if the nodes are just reordered, still can
2530 // treat the vector as the same if the list of scalars matches VL
2531 // directly, without reordering.
2532 SmallVector<int> Mask;
2533 inversePermutation(ReorderIndices, Mask);
2534 if (VL.size() == Scalars.size())
2535 return IsSame(Scalars, Mask);
2536 if (VL.size() == ReuseShuffleIndices.size()) {
2537 ::addMask(Mask, ReuseShuffleIndices);
2538 return IsSame(Scalars, Mask);
2539 }
2540 return false;
2541 }
2542 return IsSame(Scalars, ReuseShuffleIndices);
2543 }
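// For example (illustrative): a tree entry with Scalars == {A, B} and
// ReuseShuffleIndices == {0, 1, 0, 1} reports isSame({A, B, A, B}) == true,
// since every value in VL matches Scalars at the corresponding mask position.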
2544
2545 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2546 return State == TreeEntry::NeedToGather &&
2547 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2548 UserTreeIndices.front().UserTE == UserEI.UserTE;
2549 }
2550
2551 /// \returns true if current entry has same operands as \p TE.
2552 bool hasEqualOperands(const TreeEntry &TE) const {
2553 if (TE.getNumOperands() != getNumOperands())
2554 return false;
2555 SmallBitVector Used(getNumOperands());
2556 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2557 unsigned PrevCount = Used.count();
2558 for (unsigned K = 0; K < E; ++K) {
2559 if (Used.test(K))
2560 continue;
2561 if (getOperand(K) == TE.getOperand(I)) {
2562 Used.set(K);
2563 break;
2564 }
2565 }
2566 // Check if we actually found the matching operand.
2567 if (PrevCount == Used.count())
2568 return false;
2569 }
2570 return true;
2571 }
2572
2573 /// \return Final vectorization factor for the node. Defined by the total
2574 /// number of vectorized scalars, including those used several times in the
2575 /// entry and counted in the \a ReuseShuffleIndices, if any.
2576 unsigned getVectorFactor() const {
2577 if (!ReuseShuffleIndices.empty())
2578 return ReuseShuffleIndices.size();
2579 return Scalars.size();
2580 };
2581
2582 /// A vector of scalars.
2583 ValueList Scalars;
2584
2585 /// The Scalars are vectorized into this value. It is initialized to Null.
2586 WeakTrackingVH VectorizedValue = nullptr;
2587
2588 /// New vector phi instructions emitted for the vectorized phi nodes.
2589 PHINode *PHI = nullptr;
2590
2591 /// Do we need to gather this sequence or vectorize it
2592 /// (either with vector instruction or with scatter/gather
2593 /// intrinsics for store/load)?
2594 enum EntryState {
2595 Vectorize,
2596 ScatterVectorize,
2597 StridedVectorize,
2598 NeedToGather
2599 };
2600 EntryState State;
2601
2602 /// Does this sequence require some shuffling?
2603 SmallVector<int, 4> ReuseShuffleIndices;
2604
2605 /// Does this entry require reordering?
2606 SmallVector<unsigned, 4> ReorderIndices;
2607
2608 /// Points back to the VectorizableTree.
2609 ///
2610 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2611 /// to be a pointer and needs to be able to initialize the child iterator.
2612 /// Thus we need a reference back to the container to translate the indices
2613 /// to entries.
2614 VecTreeTy &Container;
2615
2616 /// The TreeEntry index containing the user of this entry. We can actually
2617 /// have multiple users so the data structure is not truly a tree.
2618 SmallVector<EdgeInfo, 1> UserTreeIndices;
2619
2620 /// The index of this treeEntry in VectorizableTree.
2621 int Idx = -1;
2622
2623 private:
2624 /// The operands of each instruction in each lane Operands[op_index][lane].
2625 /// Note: This helps avoid the replication of the code that performs the
2626 /// reordering of operands during buildTree_rec() and vectorizeTree().
2627 SmallVector<ValueList, 2> Operands;
2628
2629 /// The main/alternate instruction.
2630 Instruction *MainOp = nullptr;
2631 Instruction *AltOp = nullptr;
2632
2633 public:
2634 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2635 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2636 if (Operands.size() < OpIdx + 1)
2637 Operands.resize(OpIdx + 1);
2638 assert(Operands[OpIdx].empty() && "Already resized?");
2639 assert(OpVL.size() <= Scalars.size() &&
2640 "Number of operands is greater than the number of scalars.");
2641 Operands[OpIdx].resize(OpVL.size());
2642 copy(OpVL, Operands[OpIdx].begin());
2643 }
2644
2645 /// Set the operands of this bundle in their original order.
2646 void setOperandsInOrder() {
2647 assert(Operands.empty() && "Already initialized?");
2648 auto *I0 = cast<Instruction>(Scalars[0]);
2649 Operands.resize(I0->getNumOperands());
2650 unsigned NumLanes = Scalars.size();
2651 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2652 OpIdx != NumOperands; ++OpIdx) {
2653 Operands[OpIdx].resize(NumLanes);
2654 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2655 auto *I = cast<Instruction>(Scalars[Lane]);
2656 assert(I->getNumOperands() == NumOperands &&
2657 "Expected same number of operands");
2658 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2659 }
2660 }
2661 }
2662
2663 /// Reorders operands of the node to the given mask \p Mask.
2664 void reorderOperands(ArrayRef<int> Mask) {
2665 for (ValueList &Operand : Operands)
2666 reorderScalars(Operand, Mask);
2667 }
2668
2669 /// \returns the \p OpIdx operand of this TreeEntry.
2670 ValueList &getOperand(unsigned OpIdx) {
2671 assert(OpIdx < Operands.size() && "Off bounds");
2672 return Operands[OpIdx];
2673 }
2674
2675 /// \returns the \p OpIdx operand of this TreeEntry.
2676 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2677 assert(OpIdx < Operands.size() && "Off bounds");
2678 return Operands[OpIdx];
2679 }
2680
2681 /// \returns the number of operands.
2682 unsigned getNumOperands() const { return Operands.size(); }
2683
2684 /// \return the single \p OpIdx operand.
2685 Value *getSingleOperand(unsigned OpIdx) const {
2686 assert(OpIdx < Operands.size() && "Off bounds");
2687 assert(!Operands[OpIdx].empty() && "No operand available");
2688 return Operands[OpIdx][0];
2689 }
2690
2691 /// Some of the instructions in the list have alternate opcodes.
2692 bool isAltShuffle() const { return MainOp != AltOp; }
2693
2694 bool isOpcodeOrAlt(Instruction *I) const {
2695 unsigned CheckedOpcode = I->getOpcode();
2696 return (getOpcode() == CheckedOpcode ||
2697 getAltOpcode() == CheckedOpcode);
2698 }
2699
2700 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2701 /// alternate) opcode as the main operation of this entry, the key is \p Op.
2702 /// Otherwise the key is the main operation (MainOp).
2703 Value *isOneOf(Value *Op) const {
2704 auto *I = dyn_cast<Instruction>(Op);
2705 if (I && isOpcodeOrAlt(I))
2706 return Op;
2707 return MainOp;
2708 }
2709
2710 void setOperations(const InstructionsState &S) {
2711 MainOp = S.MainOp;
2712 AltOp = S.AltOp;
2713 }
2714
2715 Instruction *getMainOp() const {
2716 return MainOp;
2717 }
2718
2719 Instruction *getAltOp() const {
2720 return AltOp;
2721 }
2722
2723 /// The main/alternate opcodes for the list of instructions.
2724 unsigned getOpcode() const {
2725 return MainOp ? MainOp->getOpcode() : 0;
2726 }
2727
2728 unsigned getAltOpcode() const {
2729 return AltOp ? AltOp->getOpcode() : 0;
2730 }
2731
2732 /// When ReuseShuffleIndices is empty it just returns the position of \p
2733 /// V within the vector of Scalars. Otherwise, tries to remap via the reuse index.
2734 int findLaneForValue(Value *V) const {
2735 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2736 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2737 if (!ReorderIndices.empty())
2738 FoundLane = ReorderIndices[FoundLane];
2739 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2740 if (!ReuseShuffleIndices.empty()) {
2741 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2742 find(ReuseShuffleIndices, FoundLane));
2743 }
2744 return FoundLane;
2745 }
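// For example (illustrative): with Scalars == {A, B}, ReorderIndices == {1, 0}
// and no reuse mask, findLaneForValue(A) first locates A at position 0 and
// then remaps that position through ReorderIndices, returning lane 1.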
2746
2747 /// Build a shuffle mask for graph entry which represents a merge of main
2748 /// and alternate operations.
2749 void
2750 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2751 SmallVectorImpl<int> &Mask,
2752 SmallVectorImpl<Value *> *OpScalars = nullptr,
2753 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2754
2755#ifndef NDEBUG
2756 /// Debug printer.
2757 LLVM_DUMP_METHOD void dump() const {
2758 dbgs() << Idx << ".\n";
2759 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2760 dbgs() << "Operand " << OpI << ":\n";
2761 for (const Value *V : Operands[OpI])
2762 dbgs().indent(2) << *V << "\n";
2763 }
2764 dbgs() << "Scalars: \n";
2765 for (Value *V : Scalars)
2766 dbgs().indent(2) << *V << "\n";
2767 dbgs() << "State: ";
2768 switch (State) {
2769 case Vectorize:
2770 dbgs() << "Vectorize\n";
2771 break;
2772 case ScatterVectorize:
2773 dbgs() << "ScatterVectorize\n";
2774 break;
2775 case StridedVectorize:
2776 dbgs() << "StridedVectorize\n";
2777 break;
2778 case NeedToGather:
2779 dbgs() << "NeedToGather\n";
2780 break;
2781 }
2782 dbgs() << "MainOp: ";
2783 if (MainOp)
2784 dbgs() << *MainOp << "\n";
2785 else
2786 dbgs() << "NULL\n";
2787 dbgs() << "AltOp: ";
2788 if (AltOp)
2789 dbgs() << *AltOp << "\n";
2790 else
2791 dbgs() << "NULL\n";
2792 dbgs() << "VectorizedValue: ";
2793 if (VectorizedValue)
2794 dbgs() << *VectorizedValue << "\n";
2795 else
2796 dbgs() << "NULL\n";
2797 dbgs() << "ReuseShuffleIndices: ";
2798 if (ReuseShuffleIndices.empty())
2799 dbgs() << "Empty";
2800 else
2801 for (int ReuseIdx : ReuseShuffleIndices)
2802 dbgs() << ReuseIdx << ", ";
2803 dbgs() << "\n";
2804 dbgs() << "ReorderIndices: ";
2805 for (unsigned ReorderIdx : ReorderIndices)
2806 dbgs() << ReorderIdx << ", ";
2807 dbgs() << "\n";
2808 dbgs() << "UserTreeIndices: ";
2809 for (const auto &EInfo : UserTreeIndices)
2810 dbgs() << EInfo << ", ";
2811 dbgs() << "\n";
2812 }
2813#endif
2814 };
2815
2816#ifndef NDEBUG
2817 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2818 InstructionCost VecCost, InstructionCost ScalarCost,
2819 StringRef Banner) const {
2820 dbgs() << "SLP: " << Banner << ":\n";
2821 E->dump();
2822 dbgs() << "SLP: Costs:\n";
2823 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2824 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2825 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2826 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2827 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2828 }
2829#endif
2830
2831 /// Create a new VectorizableTree entry.
2832 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2833 std::optional<ScheduleData *> Bundle,
2834 const InstructionsState &S,
2835 const EdgeInfo &UserTreeIdx,
2836 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2837 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2838 TreeEntry::EntryState EntryState =
2839 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2840 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2841 ReuseShuffleIndices, ReorderIndices);
2842 }
2843
2844 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2845 TreeEntry::EntryState EntryState,
2846 std::optional<ScheduleData *> Bundle,
2847 const InstructionsState &S,
2848 const EdgeInfo &UserTreeIdx,
2849 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2850 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2851 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2852 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2853 "Need to vectorize gather entry?");
2854 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2855 TreeEntry *Last = VectorizableTree.back().get();
2856 Last->Idx = VectorizableTree.size() - 1;
2857 Last->State = EntryState;
2858 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2859 ReuseShuffleIndices.end());
2860 if (ReorderIndices.empty()) {
2861 Last->Scalars.assign(VL.begin(), VL.end());
2862 Last->setOperations(S);
2863 } else {
2864 // Reorder scalars and build final mask.
2865 Last->Scalars.assign(VL.size(), nullptr);
2866 transform(ReorderIndices, Last->Scalars.begin(),
2867 [VL](unsigned Idx) -> Value * {
2868 if (Idx >= VL.size())
2869 return UndefValue::get(VL.front()->getType());
2870 return VL[Idx];
2871 });
2872 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
2873 Last->setOperations(S);
2874 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2875 }
2876 if (Last->State != TreeEntry::NeedToGather) {
2877 for (Value *V : VL) {
2878 const TreeEntry *TE = getTreeEntry(V);
2879 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
2880 "Scalar already in tree!");
2881 if (TE) {
2882 if (TE != Last)
2883 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
2884 continue;
2885 }
2886 ScalarToTreeEntry[V] = Last;
2887 }
2888 // Update the scheduler bundle to point to this TreeEntry.
2889 ScheduleData *BundleMember = *Bundle;
2890 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2891 isVectorLikeInstWithConstOps(S.MainOp) ||
2892 doesNotNeedToSchedule(VL)) &&
2893 "Bundle and VL out of sync");
2894 if (BundleMember) {
2895 for (Value *V : VL) {
2896 if (doesNotNeedToBeScheduled(V))
2897 continue;
2898 if (!BundleMember)
2899 continue;
2900 BundleMember->TE = Last;
2901 BundleMember = BundleMember->NextInBundle;
2902 }
2903 }
2904 assert(!BundleMember && "Bundle and VL out of sync");
2905 } else {
2906 MustGather.insert(VL.begin(), VL.end());
2907 // Build a map for gathered scalars to the nodes where they are used.
2908 for (Value *V : VL)
2909 if (!isConstant(V))
2910 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
2911 }
2912
2913 if (UserTreeIdx.UserTE)
2914 Last->UserTreeIndices.push_back(UserTreeIdx);
2915
2916 return Last;
2917 }
2918
2919 /// -- Vectorization State --
2920 /// Holds all of the tree entries.
2921 TreeEntry::VecTreeTy VectorizableTree;
2922
2923#ifndef NDEBUG
2924 /// Debug printer.
2925 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
2926 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
2927 VectorizableTree[Id]->dump();
2928 dbgs() << "\n";
2929 }
2930 }
2931#endif
2932
2933 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
2934
2935 const TreeEntry *getTreeEntry(Value *V) const {
2936 return ScalarToTreeEntry.lookup(V);
2937 }
2938
2939 /// Checks if the specified list of the instructions/values can be vectorized
2940 /// and fills required data before actual scheduling of the instructions.
2941 TreeEntry::EntryState getScalarsVectorizationState(
2942 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
2943 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
2944
2945 /// Maps a specific scalar to its tree entry.
2946 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
2947
2948 /// List of scalars, used in several vectorize nodes, and the list of the
2949 /// nodes.
2950 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
2951
2952 /// Maps a value to the proposed vectorizable size.
2953 SmallDenseMap<Value *, unsigned> InstrElementSize;
2954
2955 /// A list of scalars that we found that we need to keep as scalars.
2956 ValueSet MustGather;
2957
2958 /// A map between the vectorized entries and the last instructions in the
2959 /// bundles. The bundles are built in use order, not in the def order of the
2960 /// instructions. So, we cannot rely directly on the last instruction in the
2961 /// bundle being the last instruction in program order during the
2962 /// vectorization process, since the basic blocks are affected; we need to
2963 /// pre-gather them beforehand.
2964 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
2965
2966 /// List of gather nodes that depend on other gather/vector nodes and should
2967 /// be emitted after the vector instruction emission process to correctly
2968 /// handle the order of the vector instructions and shuffles.
2969 SetVector<const TreeEntry *> PostponedGathers;
2970
2971 using ValueToGatherNodesMap =
2972 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
2973 ValueToGatherNodesMap ValueToGatherNodes;
2974
2975 /// This POD struct describes one external user in the vectorized tree.
2976 struct ExternalUser {
2977 ExternalUser(Value *S, llvm::User *U, int L)
2978 : Scalar(S), User(U), Lane(L) {}
2979
2980 // Which scalar in our function.
2981 Value *Scalar;
2982
2983 // Which user that uses the scalar.
2984 llvm::User *User;
2985
2986 // Which lane does the scalar belong to.
2987 int Lane;
2988 };
2989 using UserList = SmallVector<ExternalUser, 16>;
2990
2991 /// Checks if two instructions may access the same memory.
2992 ///
2993 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
2994 /// is invariant in the calling loop.
2995 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
2996 Instruction *Inst2) {
2997 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
2998 return true;
2999 // First check if the result is already in the cache.
3000 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3001 auto It = AliasCache.find(Key);
3002 if (It != AliasCache.end())
3003 return It->second;
3004 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3005 // Store the result in the cache.
3006 AliasCache.try_emplace(Key, Aliased);
3007 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3008 return Aliased;
3009 }
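// Illustrative example (the IR values are made up): for Inst1 =
// `store i32 %x, ptr %p` and Inst2 = `%v = load i32, ptr %q`, the first
// isAliased() query runs BatchAA once and the result is cached under both
// (Inst1, Inst2) and (Inst2, Inst1), so the mirrored query is answered
// without re-querying alias analysis.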
3010
3011 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3012
3013 /// Cache for alias results.
3014 /// TODO: consider moving this to the AliasAnalysis itself.
3015 DenseMap<AliasCacheKey, bool> AliasCache;
3016
3017 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3018 // globally through SLP because we don't perform any action which
3019 // invalidates capture results.
3020 BatchAAResults BatchAA;
3021
3022 /// Temporary store for deleted instructions. Instructions will be deleted
3023 /// eventually when the BoUpSLP is destructed. The deferral is required to
3024 /// ensure that there are no incorrect collisions in the AliasCache, which
3025 /// can happen if a new instruction is allocated at the same address as a
3026 /// previously deleted instruction.
3027 DenseSet<Instruction *> DeletedInstructions;
3028
3029 /// Set of the instructions already being analyzed for reductions.
3030 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3031
3032 /// Set of hashes for the list of reduction values already being analyzed.
3033 DenseSet<size_t> AnalyzedReductionVals;
3034
3035 /// A list of values that need to be extracted out of the tree.
3036 /// This list holds pairs of (Internal Scalar : External User). External User
3037 /// can be nullptr, meaning that this Internal Scalar will be used later,
3038 /// after vectorization.
3039 UserList ExternalUses;
3040
3041 /// Values used only by @llvm.assume calls.
3042 SmallPtrSet<const Value *, 32> EphValues;
3043
3044 /// Holds all of the instructions that we gathered, shuffle instructions and
3045 /// extractelements.
3046 SetVector<Instruction *> GatherShuffleExtractSeq;
3047
3048 /// A list of blocks that we are going to CSE.
3049 DenseSet<BasicBlock *> CSEBlocks;
3050
3051 /// Contains all scheduling relevant data for an instruction.
3052 /// A ScheduleData either represents a single instruction or a member of an
3053 /// instruction bundle (= a group of instructions which is combined into a
3054 /// vector instruction).
3055 struct ScheduleData {
3056 // The initial value for the dependency counters. It means that the
3057 // dependencies are not calculated yet.
3058 enum { InvalidDeps = -1 };
3059
3060 ScheduleData() = default;
3061
3062 void init(int BlockSchedulingRegionID, Value *OpVal) {
3063 FirstInBundle = this;
3064 NextInBundle = nullptr;
3065 NextLoadStore = nullptr;
3066 IsScheduled = false;
3067 SchedulingRegionID = BlockSchedulingRegionID;
3068 clearDependencies();
3069 OpValue = OpVal;
3070 TE = nullptr;
3071 }
3072
3073 /// Verify basic self consistency properties
3074 void verify() {
3075 if (hasValidDependencies()) {
3076 assert(UnscheduledDeps <= Dependencies && "invariant");
3077 } else {
3078 assert(UnscheduledDeps == Dependencies && "invariant");
3079 }
3080
3081 if (IsScheduled) {
3082 assert(isSchedulingEntity() &&
3083 "unexpected scheduled state");
3084 for (const ScheduleData *BundleMember = this; BundleMember;
3085 BundleMember = BundleMember->NextInBundle) {
3086 assert(BundleMember->hasValidDependencies() &&
3087 BundleMember->UnscheduledDeps == 0 &&
3088 "unexpected scheduled state");
3089 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3090 "only bundle is marked scheduled");
3091 }
3092 }
3093
3094 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3095 "all bundle members must be in same basic block");
3096 }
3097
3098 /// Returns true if the dependency information has been calculated.
3099 /// Note that dependency validity can vary between instructions within
3100 /// a single bundle.
3101 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3102
3103 /// Returns true for single instructions and for bundle representatives
3104 /// (= the head of a bundle).
3105 bool isSchedulingEntity() const { return FirstInBundle == this; }
3106
3107 /// Returns true if it represents an instruction bundle and not only a
3108 /// single instruction.
3109 bool isPartOfBundle() const {
3110 return NextInBundle != nullptr || FirstInBundle != this || TE;
3111 }
3112
3113 /// Returns true if it is ready for scheduling, i.e. it has no more
3114 /// unscheduled dependent instructions/bundles.
3115 bool isReady() const {
3116 assert(isSchedulingEntity() &&
3117 "can't consider non-scheduling entity for ready list");
3118 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3119 }
3120
3121 /// Modifies the number of unscheduled dependencies for this instruction,
3122 /// and returns the number of remaining dependencies for the containing
3123 /// bundle.
3124 int incrementUnscheduledDeps(int Incr) {
3125 assert(hasValidDependencies() &&
3126 "increment of unscheduled deps would be meaningless");
3127 UnscheduledDeps += Incr;
3128 return FirstInBundle->unscheduledDepsInBundle();
3129 }
3130
3131 /// Sets the number of unscheduled dependencies to the number of
3132 /// dependencies.
3133 void resetUnscheduledDeps() {
3134 UnscheduledDeps = Dependencies;
3135 }
3136
3137 /// Clears all dependency information.
3138 void clearDependencies() {
3139 Dependencies = InvalidDeps;
3140 resetUnscheduledDeps();
3141 MemoryDependencies.clear();
3142 ControlDependencies.clear();
3143 }
3144
3145 int unscheduledDepsInBundle() const {
3146 assert(isSchedulingEntity() && "only meaningful on the bundle");
3147 int Sum = 0;
3148 for (const ScheduleData *BundleMember = this; BundleMember;
3149 BundleMember = BundleMember->NextInBundle) {
3150 if (BundleMember->UnscheduledDeps == InvalidDeps)
3151 return InvalidDeps;
3152 Sum += BundleMember->UnscheduledDeps;
3153 }
3154 return Sum;
3155 }
3156
3157 void dump(raw_ostream &os) const {
3158 if (!isSchedulingEntity()) {
3159 os << "/ " << *Inst;
3160 } else if (NextInBundle) {
3161 os << '[' << *Inst;
3162 ScheduleData *SD = NextInBundle;
3163 while (SD) {
3164 os << ';' << *SD->Inst;
3165 SD = SD->NextInBundle;
3166 }
3167 os << ']';
3168 } else {
3169 os << *Inst;
3170 }
3171 }
3172
3173 Instruction *Inst = nullptr;
3174
3175 /// Opcode of the current instruction in the schedule data.
3176 Value *OpValue = nullptr;
3177
3178 /// The TreeEntry that this instruction corresponds to.
3179 TreeEntry *TE = nullptr;
3180
3181 /// Points to the head in an instruction bundle (and always to this for
3182 /// single instructions).
3183 ScheduleData *FirstInBundle = nullptr;
3184
3185 /// Singly linked list of all instructions in a bundle. Null if it is a
3186 /// single instruction.
3187 ScheduleData *NextInBundle = nullptr;
3188
3189 /// Singly linked list of all memory instructions (e.g. load, store, call)
3190 /// in the block - until the end of the scheduling region.
3191 ScheduleData *NextLoadStore = nullptr;
3192
3193 /// The dependent memory instructions.
3194 /// This list is derived on demand in calculateDependencies().
3195 SmallVector<ScheduleData *, 4> MemoryDependencies;
3196
3197 /// List of instructions which this instruction could be control dependent
3198 /// on. Allowing such nodes to be scheduled below this one could introduce
3199 /// a runtime fault which didn't exist in the original program.
3200 /// ex: this is a load or udiv following a readonly call which inf loops
3201 SmallVector<ScheduleData *, 4> ControlDependencies;
3202
3203 /// This ScheduleData is in the current scheduling region if this matches
3204 /// the current SchedulingRegionID of BlockScheduling.
3205 int SchedulingRegionID = 0;
3206
3207 /// Used for getting a "good" final ordering of instructions.
3208 int SchedulingPriority = 0;
3209
3210 /// The number of dependencies. It consists of the number of users of the
3211 /// instruction plus the number of dependent memory instructions (if any).
3212 /// This value is calculated on demand.
3213 /// If InvalidDeps, the number of dependencies is not calculated yet.
3214 int Dependencies = InvalidDeps;
3215
3216 /// The number of dependencies minus the number of dependencies of scheduled
3217 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3218 /// for scheduling.
3219 /// Note that this is negative as long as Dependencies is not calculated.
3220 int UnscheduledDeps = InvalidDeps;
3221
3222 /// True if this instruction is scheduled (or considered as scheduled in the
3223 /// dry-run).
3224 bool IsScheduled = false;
3225 };
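// Illustrative layout: for a 4-wide bundle {A, B, C, D} built by
// buildBundle(), FirstInBundle of all four members points at A,
// A.NextInBundle == &B, B.NextInBundle == &C, C.NextInBundle == &D and
// D.NextInBundle == nullptr, so only A is the scheduling entity and
// unscheduledDepsInBundle() walks the NextInBundle chain starting at A.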
3226
3227#ifndef NDEBUG
3228 friend inline raw_ostream &operator<<(raw_ostream &os,
3229 const BoUpSLP::ScheduleData &SD) {
3230 SD.dump(os);
3231 return os;
3232 }
3233#endif
3234
3235 friend struct GraphTraits<BoUpSLP *>;
3236 friend struct DOTGraphTraits<BoUpSLP *>;
3237
3238 /// Contains all scheduling data for a basic block.
3239 /// It does not schedule instructions that are not memory read/write
3240 /// instructions and whose operands are either constants, arguments, phis or
3241 /// instructions from other blocks, or whose users are phis or belong to
3242 /// other blocks. The resulting vector instructions can be placed at the
3243 /// beginning of the basic block without scheduling (if the operands do not
3244 /// need to be scheduled) or at the end of the block (if the users are
3245 /// outside of the block). This saves some compile time and memory in the
3246 /// compiler.
3247 /// ScheduleData is assigned to each instruction between the boundaries of
3248 /// the tree entry, even to those that are not part of the graph. It is
3249 /// required to correctly follow the dependencies between the instructions
3250 /// and to schedule them correctly. ScheduleData is not allocated for
3251 /// instructions that do not require scheduling, such as phis, nodes
3252 /// containing only extractelements/insertelements, or nodes whose
3253 /// uses/operands are outside of the block.
3254 struct BlockScheduling {
3255 BlockScheduling(BasicBlock *BB)
3256 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3257
3258 void clear() {
3259 ReadyInsts.clear();
3260 ScheduleStart = nullptr;
3261 ScheduleEnd = nullptr;
3262 FirstLoadStoreInRegion = nullptr;
3263 LastLoadStoreInRegion = nullptr;
3264 RegionHasStackSave = false;
3265
3266 // Reduce the maximum schedule region size by the size of the
3267 // previous scheduling run.
3268 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3269 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3270 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3271 ScheduleRegionSize = 0;
3272
3273 // Make a new scheduling region, i.e. all existing ScheduleData is not
3274 // in the new region yet.
3275 ++SchedulingRegionID;
3276 }
3277
3278 ScheduleData *getScheduleData(Instruction *I) {
3279 if (BB != I->getParent())
3280 // Avoid lookup if can't possibly be in map.
3281 return nullptr;
3282 ScheduleData *SD = ScheduleDataMap.lookup(I);
3283 if (SD && isInSchedulingRegion(SD))
3284 return SD;
3285 return nullptr;
3286 }
3287
3288 ScheduleData *getScheduleData(Value *V) {
3289 if (auto *I = dyn_cast<Instruction>(V))
3290 return getScheduleData(I);
3291 return nullptr;
3292 }
3293
3294 ScheduleData *getScheduleData(Value *V, Value *Key) {
3295 if (V == Key)
3296 return getScheduleData(V);
3297 auto I = ExtraScheduleDataMap.find(V);
3298 if (I != ExtraScheduleDataMap.end()) {
3299 ScheduleData *SD = I->second.lookup(Key);
3300 if (SD && isInSchedulingRegion(SD))
3301 return SD;
3302 }
3303 return nullptr;
3304 }
3305
3306 bool isInSchedulingRegion(ScheduleData *SD) const {
3307 return SD->SchedulingRegionID == SchedulingRegionID;
3308 }
3309
3310 /// Marks an instruction as scheduled and puts all dependent ready
3311 /// instructions into the ready-list.
3312 template <typename ReadyListType>
3313 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3314 SD->IsScheduled = true;
3315 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3316
3317 for (ScheduleData *BundleMember = SD; BundleMember;
3318 BundleMember = BundleMember->NextInBundle) {
3319 if (BundleMember->Inst != BundleMember->OpValue)
3320 continue;
3321
3322 // Handle the def-use chain dependencies.
3323
3324 // Decrement the unscheduled counter and insert to ready list if ready.
3325 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3326 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3327 if (OpDef && OpDef->hasValidDependencies() &&
3328 OpDef->incrementUnscheduledDeps(-1) == 0) {
3329 // There are no more unscheduled dependencies after
3330 // decrementing, so we can put the dependent instruction
3331 // into the ready list.
3332 ScheduleData *DepBundle = OpDef->FirstInBundle;
3333 assert(!DepBundle->IsScheduled &&
3334 "already scheduled bundle gets ready");
3335 ReadyList.insert(DepBundle);
3336 LLVM_DEBUG(dbgs()
3337 << "SLP: gets ready (def): " << *DepBundle << "\n");
3338 }
3339 });
3340 };
3341
3342 // If BundleMember is a vector bundle, its operands may have been
3343 // reordered during buildTree(). We therefore need to get its operands
3344 // through the TreeEntry.
3345 if (TreeEntry *TE = BundleMember->TE) {
3346 // Need to search for the lane since the tree entry can be reordered.
3347 int Lane = std::distance(TE->Scalars.begin(),
3348 find(TE->Scalars, BundleMember->Inst));
3349 assert(Lane >= 0 && "Lane not set");
3350
3351 // Since vectorization tree is being built recursively this assertion
3352 // ensures that the tree entry has all operands set before reaching
3353 // this code. Couple of exceptions known at the moment are extracts
3354 // where their second (immediate) operand is not added. Since
3355 // immediates do not affect scheduler behavior this is considered
3356 // okay.
3357 auto *In = BundleMember->Inst;
3358 assert(In &&
3359 (isa<ExtractValueInst, ExtractElementInst>(In) ||
3360 In->getNumOperands() == TE->getNumOperands()) &&
3361 "Missed TreeEntry operands?");
3362 (void)In; // fake use to avoid build failure when assertions disabled
3363
3364 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3365 OpIdx != NumOperands; ++OpIdx)
3366 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3367 DecrUnsched(I);
3368 } else {
3369 // If BundleMember is a stand-alone instruction, no operand reordering
3370 // has taken place, so we directly access its operands.
3371 for (Use &U : BundleMember->Inst->operands())
3372 if (auto *I = dyn_cast<Instruction>(U.get()))
3373 DecrUnsched(I);
3374 }
3375 // Handle the memory dependencies.
3376 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3377 if (MemoryDepSD->hasValidDependencies() &&
3378 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3379 // There are no more unscheduled dependencies after decrementing,
3380 // so we can put the dependent instruction into the ready list.
3381 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3382 assert(!DepBundle->IsScheduled &&
3383 "already scheduled bundle gets ready");
3384 ReadyList.insert(DepBundle);
3385 LLVM_DEBUG(dbgs()
3386 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3387 }
3388 }
3389 // Handle the control dependencies.
3390 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3391 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3392 // There are no more unscheduled dependencies after decrementing,
3393 // so we can put the dependent instruction into the ready list.
3394 ScheduleData *DepBundle = DepSD->FirstInBundle;
3395 assert(!DepBundle->IsScheduled &&
3396 "already scheduled bundle gets ready");
3397 ReadyList.insert(DepBundle);
3398 LLVM_DEBUG(dbgs()
3399 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3400 }
3401 }
3402 }
3403 }
3404
3405 /// Verify basic self consistency properties of the data structure.
3406 void verify() {
3407 if (!ScheduleStart)
3408 return;
3409
3410 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3411 ScheduleStart->comesBefore(ScheduleEnd) &&
3412 "Not a valid scheduling region?");
3413
3414 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3415 auto *SD = getScheduleData(I);
3416 if (!SD)
3417 continue;
3418 assert(isInSchedulingRegion(SD) &&
3419 "primary schedule data not in window?");
3420 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3421 "entire bundle in window!");
3422 (void)SD;
3423 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3424 }
3425
3426 for (auto *SD : ReadyInsts) {
3427 assert(SD->isSchedulingEntity() && SD->isReady() &&
3428 "item in ready list not ready?");
3429 (void)SD;
3430 }
3431 }
3432
3433 void doForAllOpcodes(Value *V,
3434 function_ref<void(ScheduleData *SD)> Action) {
3435 if (ScheduleData *SD = getScheduleData(V))
3436 Action(SD);
3437 auto I = ExtraScheduleDataMap.find(V);
3438 if (I != ExtraScheduleDataMap.end())
3439 for (auto &P : I->second)
3440 if (isInSchedulingRegion(P.second))
3441 Action(P.second);
3442 }
3443
3444 /// Put all instructions into the ReadyList which are ready for scheduling.
3445 template <typename ReadyListType>
3446 void initialFillReadyList(ReadyListType &ReadyList) {
3447 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3448 doForAllOpcodes(I, [&](ScheduleData *SD) {
3449 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3450 SD->isReady()) {
3451 ReadyList.insert(SD);
3452 LLVM_DEBUG(dbgs()
3453 << "SLP: initially in ready list: " << *SD << "\n");
3454 }
3455 });
3456 }
3457 }
3458
3459 /// Build a bundle from the ScheduleData nodes corresponding to the
3460 /// scalar instruction for each lane.
3461 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3462
3463 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3464 /// cyclic dependencies. This is only a dry-run, no instructions are
3465 /// actually moved at this stage.
3466 /// \returns the scheduling bundle. The returned Optional value is not
3467 /// std::nullopt if \p VL is allowed to be scheduled.
3468 std::optional<ScheduleData *>
3469 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3470 const InstructionsState &S);
3471
3472 /// Un-bundles a group of instructions.
3473 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3474
3475 /// Allocates schedule data chunk.
3476 ScheduleData *allocateScheduleDataChunks();
3477
3478 /// Extends the scheduling region so that V is inside the region.
3479 /// \returns true if the region size is within the limit.
3480 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3481
3482 /// Initialize the ScheduleData structures for new instructions in the
3483 /// scheduling region.
3484 void initScheduleData(Instruction *FromI, Instruction *ToI,
3485 ScheduleData *PrevLoadStore,
3486 ScheduleData *NextLoadStore);
3487
3488 /// Updates the dependency information of a bundle and of all instructions/
3489 /// bundles which depend on the original bundle.
3490 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3491 BoUpSLP *SLP);
3492
3493 /// Sets all instructions in the scheduling region to un-scheduled.
3494 void resetSchedule();
3495
3496 BasicBlock *BB;
3497
3498 /// Simple memory allocation for ScheduleData.
3499 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3500
3501 /// The size of a ScheduleData array in ScheduleDataChunks.
3502 int ChunkSize;
3503
3504 /// The allocator position in the current chunk, which is the last entry
3505 /// of ScheduleDataChunks.
3506 int ChunkPos;
3507
3508 /// Attaches ScheduleData to Instruction.
3509 /// Note that the mapping survives during all vectorization iterations, i.e.
3510 /// ScheduleData structures are recycled.
3511 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3512
3513 /// Attaches ScheduleData to Instruction with the leading key.
3514 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3515 ExtraScheduleDataMap;
3516
3517 /// The ready-list for scheduling (only used for the dry-run).
3518 SetVector<ScheduleData *> ReadyInsts;
3519
3520 /// The first instruction of the scheduling region.
3521 Instruction *ScheduleStart = nullptr;
3522
3523 /// The first instruction _after_ the scheduling region.
3524 Instruction *ScheduleEnd = nullptr;
3525
3526 /// The first memory accessing instruction in the scheduling region
3527 /// (can be null).
3528 ScheduleData *FirstLoadStoreInRegion = nullptr;
3529
3530 /// The last memory accessing instruction in the scheduling region
3531 /// (can be null).
3532 ScheduleData *LastLoadStoreInRegion = nullptr;
3533
3534 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3535 /// region? Used to optimize the dependence calculation for the
3536 /// common case where there isn't.
3537 bool RegionHasStackSave = false;
3538
3539 /// The current size of the scheduling region.
3540 int ScheduleRegionSize = 0;
3541
3542 /// The maximum size allowed for the scheduling region.
3543 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3544
3545 /// The ID of the scheduling region. For a new vectorization iteration this
3546 /// is incremented which "removes" all ScheduleData from the region.
3547 /// Make sure that the initial SchedulingRegionID is greater than the
3548 /// initial SchedulingRegionID in ScheduleData (which is 0).
3549 int SchedulingRegionID = 1;
3550 };
3551
3552 /// Attaches the BlockScheduling structures to basic blocks.
3554 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3555 /// Performs the "real" scheduling. Done before vectorization is actually
3556 /// performed in a basic block.
3557 void scheduleBlock(BlockScheduling *BS);
3558
3559 /// List of users to ignore during scheduling and that don't need extracting.
3560 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3561
3562 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3563 /// sorted SmallVectors of unsigned.
3564 struct OrdersTypeDenseMapInfo {
3565 static OrdersType getEmptyKey() {
3566 OrdersType V;
3567 V.push_back(~1U);
3568 return V;
3569 }
3570
3571 static OrdersType getTombstoneKey() {
3572 OrdersType V;
3573 V.push_back(~2U);
3574 return V;
3575 }
3576
3577 static unsigned getHashValue(const OrdersType &V) {
3578 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3579 }
3580
3581 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3582 return LHS == RHS;
3583 }
3584 };
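// Example (illustrative): this traits struct lets orders be used as
// DenseMap/DenseSet keys, e.g. a
// DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> counting how often
// each candidate order occurs. The {~1U}/{~2U} empty and tombstone keys can
// never collide with real orders, which only hold lane indices.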
3585
3586 // Analysis and block reference.
3587 Function *F;
3588 ScalarEvolution *SE;
3589 TargetTransformInfo *TTI;
3590 TargetLibraryInfo *TLI;
3591 LoopInfo *LI;
3592 DominatorTree *DT;
3593 AssumptionCache *AC;
3594 DemandedBits *DB;
3595 const DataLayout *DL;
3596 OptimizationRemarkEmitter *ORE;
3597
3598 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3599 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3600
3601 /// Instruction builder to construct the vectorized tree.
3602 IRBuilder<> Builder;
3603
3604 /// A map of scalar integer values to the smallest bit width with which they
3605 /// can legally be represented. The values map to (width, signed) pairs,
3606 /// where "width" indicates the minimum bit width and "signed" is True if the
3607 /// value must be signed-extended, rather than zero-extended, back to its
3608 /// original width.
3609 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
3610};
3611
3612} // end namespace slpvectorizer
3613
3614template <> struct GraphTraits<BoUpSLP *> {
3615 using TreeEntry = BoUpSLP::TreeEntry;
3616
3617 /// NodeRef has to be a pointer per the GraphWriter.
3618 using NodeRef = TreeEntry *;
3619
3620 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3621
3622 /// Add the VectorizableTree to the index iterator to be able to return
3623 /// TreeEntry pointers.
3624 struct ChildIteratorType
3625 : public iterator_adaptor_base<
3626 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3627 ContainerTy &VectorizableTree;
3628
3629 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3630 ContainerTy &VT)
3631 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3632
3633 NodeRef operator*() { return I->UserTE; }
3634 };
3635
3636 static NodeRef getEntryNode(BoUpSLP &R) {
3637 return R.VectorizableTree[0].get();
3638 }
3639
3640 static ChildIteratorType child_begin(NodeRef N) {
3641 return {N->UserTreeIndices.begin(), N->Container};
3642 }
3643
3644 static ChildIteratorType child_end(NodeRef N) {
3645 return {N->UserTreeIndices.end(), N->Container};
3646 }
3647
3648 /// For the node iterator we just need to turn the TreeEntry iterator into a
3649 /// TreeEntry* iterator so that it dereferences to NodeRef.
3650 class nodes_iterator {
3651 using ItTy = ContainerTy::iterator;
3652 ItTy It;
3653
3654 public:
3655 nodes_iterator(const ItTy &It2) : It(It2) {}
3656 NodeRef operator*() { return It->get(); }
3657 nodes_iterator operator++() {
3658 ++It;
3659 return *this;
3660 }
3661 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3662 };
3663
3664 static nodes_iterator nodes_begin(BoUpSLP *R) {
3665 return nodes_iterator(R->VectorizableTree.begin());
3666 }
3667
3668 static nodes_iterator nodes_end(BoUpSLP *R) {
3669 return nodes_iterator(R->VectorizableTree.end());
3670 }
3671
3672 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3673};
3674
3675template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3676 using TreeEntry = BoUpSLP::TreeEntry;
3677
3678 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3679
3680 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3681 std::string Str;
3682 raw_string_ostream OS(Str);
3683 OS << Entry->Idx << ".\n";
3684 if (isSplat(Entry->Scalars))
3685 OS << "<splat> ";
3686 for (auto *V : Entry->Scalars) {
3687 OS << *V;
3688 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3689 return EU.Scalar == V;
3690 }))
3691 OS << " <extract>";
3692 OS << "\n";
3693 }
3694 return Str;
3695 }
3696
3697 static std::string getNodeAttributes(const TreeEntry *Entry,
3698 const BoUpSLP *) {
3699 if (Entry->State == TreeEntry::NeedToGather)
3700 return "color=red";
3701 if (Entry->State == TreeEntry::ScatterVectorize ||
3702 Entry->State == TreeEntry::StridedVectorize)
3703 return "color=blue";
3704 return "";
3705 }
3706};
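// Example (illustrative): with the GraphTraits and DOTGraphTraits
// specializations above, the SLP graph can be rendered through the generic
// graph utilities, e.g. ViewGraph(&R, "slp-graph") for a BoUpSLP R; gather
// nodes are drawn in red and scatter/strided nodes in blue per
// getNodeAttributes().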
3707
3708} // end namespace llvm
3709
3710BoUpSLP::~BoUpSLP() {
3711 SmallVector<WeakTrackingVH> DeadInsts;
3712 for (auto *I : DeletedInstructions) {
3713 for (Use &U : I->operands()) {
3714 auto *Op = dyn_cast<Instruction>(U.get());
3715 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3716 wouldInstructionBeTriviallyDead(Op, TLI))
3717 DeadInsts.emplace_back(Op);
3718 }
3719 I->dropAllReferences();
3720 }
3721 for (auto *I : DeletedInstructions) {
3722 assert(I->use_empty() &&
3723 "trying to erase instruction with users.");
3724 I->eraseFromParent();
3725 }
3726
3727 // Cleanup any dead scalar code feeding the vectorized instructions
3728 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
3729
3730#ifdef EXPENSIVE_CHECKS
3731 // If we could guarantee that this call is not extremely slow, we could
3732 // remove the ifdef limitation (see PR47712).
3733 assert(!verifyFunction(*F, &dbgs()));
3734#endif
3735}
3736
3737/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3738/// contains the original mask for the scalars reused in the node. The
3739/// procedure transforms this mask in accordance with the given \p Mask.
3740static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3741 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3742 "Expected non-empty mask.");
3743 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3744 Prev.swap(Reuses);
3745 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3746 if (Mask[I] != PoisonMaskElem)
3747 Reuses[Mask[I]] = Prev[I];
3748}
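// For example, with Reuses = {0, 0, 1, 1} and Mask = {2, 3, 0, 1}, the loop
// above moves each previous element Prev[I] to position Mask[I], producing
// Reuses = {1, 1, 0, 0}.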
3749
3750/// Reorders the given \p Order according to the given \p Mask. \p Order is
3751/// the original order of the scalars. The procedure transforms the provided
3752/// order in accordance with the given \p Mask. If the resulting \p Order is
3753/// just an identity order, \p Order is cleared.
3754static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
3755 bool BottomOrder = false) {
3756 assert(!Mask.empty() && "Expected non-empty mask.");
3757 unsigned Sz = Mask.size();
3758 if (BottomOrder) {
3759 SmallVector<unsigned> PrevOrder;
3760 if (Order.empty()) {
3761 PrevOrder.resize(Sz);
3762 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
3763 } else {
3764 PrevOrder.swap(Order);
3765 }
3766 Order.assign(Sz, Sz);
3767 for (unsigned I = 0; I < Sz; ++I)
3768 if (Mask[I] != PoisonMaskElem)
3769 Order[I] = PrevOrder[Mask[I]];
3770 if (all_of(enumerate(Order), [&](const auto &Data) {
3771 return Data.value() == Sz || Data.index() == Data.value();
3772 })) {
3773 Order.clear();
3774 return;
3775 }
3776 fixupOrderingIndices(Order);
3777 return;
3778 }
3779 SmallVector<int> MaskOrder;
3780 if (Order.empty()) {
3781 MaskOrder.resize(Sz);
3782 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3783 } else {
3784 inversePermutation(Order, MaskOrder);
3785 }
3786 reorderReuses(MaskOrder, Mask);
3787 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
3788 Order.clear();
3789 return;
3790 }
3791 Order.assign(Sz, Sz);
3792 for (unsigned I = 0; I < Sz; ++I)
3793 if (MaskOrder[I] != PoisonMaskElem)
3794 Order[MaskOrder[I]] = I;
3795 fixupOrderingIndices(Order);
3796}
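// For example (with BottomOrder == false), an empty Order of size 4 (treated
// as identity) combined with Mask = {1, 0, 3, 2} yields MaskOrder =
// {1, 0, 3, 2}, which is not an identity mask, so the final Order becomes
// {1, 0, 3, 2}.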
3797
3798std::optional<BoUpSLP::OrdersType>
3799BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3800 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3801 // Try to find subvector extract/insert patterns and reorder only such
3802 // patterns.
3803 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
3804 Type *ScalarTy = GatheredScalars.front()->getType();
3805 int NumScalars = GatheredScalars.size();
3806 if (!isValidElementType(ScalarTy))
3807 return std::nullopt;
3808 auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
3809 int NumParts = TTI->getNumberOfParts(VecTy);
3810 if (NumParts == 0 || NumParts >= NumScalars)
3811 NumParts = 1;
3812 SmallVector<int> ExtractMask;
3813 SmallVector<int> Mask;
3814 SmallVector<SmallVector<const TreeEntry *>> Entries;
3815 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
3816 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
3817 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
3818 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
3819 /*ForOrder=*/true);
3820 // No shuffled operands - ignore.
3821 if (GatherShuffles.empty() && ExtractShuffles.empty())
3822 return std::nullopt;
3823 OrdersType CurrentOrder(NumScalars, NumScalars);
3824 if (GatherShuffles.size() == 1 &&
3825 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
3826 Entries.front().front()->isSame(TE.Scalars)) {
3827 // Perfect match in the graph, will reuse the previously vectorized
3828 // node. Cost is 0.
3829 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
3830 return CurrentOrder;
3831 }
3832 auto IsSplatMask = [](ArrayRef<int> Mask) {
3833 int SingleElt = PoisonMaskElem;
3834 return all_of(Mask, [&](int I) {
3835 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
3836 SingleElt = I;
3837 return I == PoisonMaskElem || I == SingleElt;
3838 });
3839 };
3840 // Exclusive broadcast mask - ignore.
3841 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
3842 (Entries.size() != 1 ||
3843 Entries.front().front()->ReorderIndices.empty())) ||
3844 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
3845 return std::nullopt;
3846 SmallBitVector ShuffledSubMasks(NumParts);
3847 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
3848 ArrayRef<int> Mask, int PartSz, int NumParts,
3849 function_ref<unsigned(unsigned)> GetVF) {
3850 for (int I : seq<int>(0, NumParts)) {
3851 if (ShuffledSubMasks.test(I))
3852 continue;
3853 const int VF = GetVF(I);
3854 if (VF == 0)
3855 continue;
3856 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
3857 // Shuffle of at least 2 vectors - ignore.
3858 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
3859 std::fill(Slice.begin(), Slice.end(), NumScalars);
3860 ShuffledSubMasks.set(I);
3861 continue;
3862 }
3863 // Try to include as many elements from the mask as possible.
3864 int FirstMin = INT_MAX;
3865 int SecondVecFound = false;
3866 for (int K : seq<int>(0, PartSz)) {
3867 int Idx = Mask[I * PartSz + K];
3868 if (Idx == PoisonMaskElem) {
3869 Value *V = GatheredScalars[I * PartSz + K];
3870 if (isConstant(V) && !isa<PoisonValue>(V)) {
3871 SecondVecFound = true;
3872 break;
3873 }
3874 continue;
3875 }
3876 if (Idx < VF) {
3877 if (FirstMin > Idx)
3878 FirstMin = Idx;
3879 } else {
3880 SecondVecFound = true;
3881 break;
3882 }
3883 }
3884 FirstMin = (FirstMin / PartSz) * PartSz;
3885 // Shuffle of at least 2 vectors - ignore.
3886 if (SecondVecFound) {
3887 std::fill(Slice.begin(), Slice.end(), NumScalars);
3888 ShuffledSubMasks.set(I);
3889 continue;
3890 }
3891 for (int K : seq<int>(0, PartSz)) {
3892 int Idx = Mask[I * PartSz + K];
3893 if (Idx == PoisonMaskElem)
3894 continue;
3895 Idx -= FirstMin;
3896 if (Idx >= PartSz) {
3897 SecondVecFound = true;
3898 break;
3899 }
3900 if (CurrentOrder[I * PartSz + Idx] >
3901 static_cast<unsigned>(I * PartSz + K) &&
3902 CurrentOrder[I * PartSz + Idx] !=
3903 static_cast<unsigned>(I * PartSz + Idx))
3904 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
3905 }
3906 // Shuffle of at least 2 vectors - ignore.
3907 if (SecondVecFound) {
3908 std::fill(Slice.begin(), Slice.end(), NumScalars);
3909 ShuffledSubMasks.set(I);
3910 continue;
3911 }
3912 }
3913 };
3914 int PartSz = NumScalars / NumParts;
3915 if (!ExtractShuffles.empty())
3916 TransformMaskToOrder(
3917 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
3918 if (!ExtractShuffles[I])
3919 return 0U;
3920 unsigned VF = 0;
3921 for (unsigned Idx : seq<unsigned>(0, PartSz)) {
3922 int K = I * PartSz + Idx;
3923 if (ExtractMask[K] == PoisonMaskElem)
3924 continue;
3925 if (!TE.ReuseShuffleIndices.empty())
3926 K = TE.ReuseShuffleIndices[K];
3927 if (!TE.ReorderIndices.empty())
3928 K = std::distance(TE.ReorderIndices.begin(),
3929 find(TE.ReorderIndices, K));
3930 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
3931 if (!EI)
3932 continue;
3933 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
3934 ->getElementCount()
3935 .getKnownMinValue());
3936 }
3937 return VF;
3938 });
3939 // Check special corner case - single shuffle of the same entry.
3940 if (GatherShuffles.size() == 1 && NumParts != 1) {
3941 if (ShuffledSubMasks.any())
3942 return std::nullopt;
3943 PartSz = NumScalars;
3944 NumParts = 1;
3945 }
3946 if (!Entries.empty())
3947 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
3948 if (!GatherShuffles[I])
3949 return 0U;
3950 return std::max(Entries[I].front()->getVectorFactor(),
3951 Entries[I].back()->getVectorFactor());
3952 });
3953 int NumUndefs =
3954 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
3955 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
3956 return std::nullopt;
3957 return std::move(CurrentOrder);
3958}
3959
3960namespace {
3961/// Tracks the state we can represent the loads in the given sequence.
3962enum class LoadsState { Gather, Vectorize, ScatterVectorize, StridedVectorize };
3963} // anonymous namespace
3964
3965static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
3966 const TargetLibraryInfo &TLI,
3967 bool CompareOpcodes = true) {
3968 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
3969 return false;
3970 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
3971 if (!GEP1)
3972 return false;
3973 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
3974 if (!GEP2)
3975 return false;
3976 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
3977 ((isConstant(GEP1->getOperand(1)) &&
3978 isConstant(GEP2->getOperand(1))) ||
3979 !CompareOpcodes ||
3980 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
3981 .getOpcode());
3982}
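// For example (illustrative values), %p = getelementptr i32, ptr %base, i64 1
// and %q = getelementptr i32, ptr %base, i64 5 are compatible: both share the
// underlying object %base, both are two-operand GEPs, and both indices are
// constants. Pointers with different underlying objects are rejected up front.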
3983
3984/// Calculates minimal alignment as a common alignment.
3985template <typename T>
3986static Align computeCommonAlignment(ArrayRef<Value *> VL) {
3987 Align CommonAlignment = cast<T>(VL.front())->getAlign();
3988 for (Value *V : VL.drop_front())
3989 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
3990 return CommonAlignment;
3991}
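// For example, for a bundle of three loads with alignments 16, 4 and 8, the
// returned common alignment is 4 (the minimum over the bundle).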
3992
3993/// Check if \p Order represents reverse order.
3994static bool isReverseOrder(ArrayRef<unsigned> Order) {
3995 unsigned Sz = Order.size();
3996 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
3997 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
3998 });
3999}
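// For example, for Sz == 4 the orders {3, 2, 1, 0} and {3, 2, 4, 0} (where 4,
// i.e. Sz, marks an unset element) are treated as reversed, while
// {0, 1, 2, 3} is not.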
4000
4001/// Checks if the given array of loads can be represented as a vectorized
4002/// load, a scatter, or just a simple gather.
4003static LoadsState canVectorizeLoads(const BoUpSLP &R, ArrayRef<Value *> VL,
4004 const Value *VL0,
4005 const TargetTransformInfo &TTI,
4006 const DataLayout &DL, ScalarEvolution &SE,
4007 LoopInfo &LI, const TargetLibraryInfo &TLI,
4008 SmallVectorImpl<unsigned> &Order,
4009 SmallVectorImpl<Value *> &PointerOps,
4010 bool TryRecursiveCheck = true) {
4011 // Check that a vectorized load would load the same memory as a scalar
4012 // load. For example, we don't want to vectorize loads that are smaller
4013 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4014 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4015 // from such a struct, we read/write packed bits disagreeing with the
4016 // unvectorized version.
4017 Type *ScalarTy = VL0->getType();
4018
4019 if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
4020 return LoadsState::Gather;
4021
4022 // Make sure all loads in the bundle are simple - we can't vectorize
4023 // atomic or volatile loads.
4024 PointerOps.clear();
4025 const unsigned Sz = VL.size();
4026 PointerOps.resize(Sz);
4027 auto *POIter = PointerOps.begin();
4028 for (Value *V : VL) {
4029 auto *L = cast<LoadInst>(V);
4030 if (!L->isSimple())
4031 return LoadsState::Gather;
4032 *POIter = L->getPointerOperand();
4033 ++POIter;
4034 }
4035
4036 Order.clear();
4037 auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
4038 // Check the order of pointer operands or that all pointers are the same.
4039 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
4040 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4041 return arePointersCompatible(P, PointerOps.front(), TLI);
4042 })) {
4043 if (IsSorted) {
4044 Value *Ptr0;
4045 Value *PtrN;
4046 if (Order.empty()) {
4047 Ptr0 = PointerOps.front();
4048 PtrN = PointerOps.back();
4049 } else {
4050 Ptr0 = PointerOps[Order.front()];
4051 PtrN = PointerOps[Order.back()];
4052 }
4053 std::optional<int> Diff =
4054 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
4055 // Check that the sorted loads are consecutive.
4056 if (static_cast<unsigned>(*Diff) == Sz - 1)
4057 return LoadsState::Vectorize;
4058 // Simple check if not a strided access - clear order.
4059 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4060 // Try to generate strided load node if:
4061 // 1. Target with strided load support is detected.
4062 // 2. The number of loads is greater than MinProfitableStridedLoads,
4063 // or the potential stride <= MaxProfitableLoadStride and the
4064 // potential stride is power-of-2 (to avoid perf regressions for the very
4065 // small number of loads) and max distance > number of loads, or potential
4066 // stride is -1.
4067 // 3. The loads are ordered, or number of unordered loads <=
4068 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4069 // (this check is to avoid extra costs for very expensive shuffles).
4070 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4071 (static_cast<unsigned>(std::abs(*Diff)) <=
4072 MaxProfitableLoadStride * Sz &&
4073 isPowerOf2_32(std::abs(*Diff)))) &&
4074 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4075 *Diff == -(static_cast<int>(Sz) - 1))) {
4076 int Stride = *Diff / static_cast<int>(Sz - 1);
4077 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4078 Align Alignment =
4079 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4080 ->getAlign();
4081 if (TTI.isLegalStridedLoadStore(VecTy, Alignment)) {
4082 // Iterate through all pointers and check if all distances are
4083 // unique multiples of Dist.
4084 SmallSet<int, 4> Dists;
4085 for (Value *Ptr : PointerOps) {
4086 int Dist = 0;
4087 if (Ptr == PtrN)
4088 Dist = *Diff;
4089 else if (Ptr != Ptr0)
4090 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
4091 // If the strides are not the same or repeated, we can't
4092 // vectorize.
4093 if (((Dist / Stride) * Stride) != Dist ||
4094 !Dists.insert(Dist).second)
4095 break;
4096 }
4097 if (Dists.size() == Sz)
4098 return LoadsState::StridedVectorize;
4099 }
4100 }
4101 }
4102 }
4103 auto CheckForShuffledLoads = [&](Align CommonAlignment) {
4104 unsigned Sz = DL.getTypeSizeInBits(ScalarTy);
4105 unsigned MinVF = R.getMinVF(Sz);
4106 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4107 MaxVF = std::min(R.getMaximumVF(Sz, Instruction::Load), MaxVF);
4108 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4109 unsigned VectorizedCnt = 0;
4110 SmallVector<LoadsState> States;
4111 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4112 Cnt += VF, ++VectorizedCnt) {
4113 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4114 SmallVector<unsigned> Order;
4115 SmallVector<Value *> PointerOps;
4116 LoadsState LS =
4117 canVectorizeLoads(R, Slice, Slice.front(), TTI, DL, SE, LI, TLI,
4118 Order, PointerOps, /*TryRecursiveCheck=*/false);
4119 // Check that the sorted loads are consecutive.
4120 if (LS == LoadsState::Gather)
4121 break;
4122 // If reordering is needed - consider it as a high-cost masked gather for now.
4123 if ((LS == LoadsState::Vectorize ||
4124 LS == LoadsState::StridedVectorize) &&
4125 !Order.empty() && !isReverseOrder(Order))
4126 LS = LoadsState::ScatterVectorize;
4127 States.push_back(LS);
4128 }
4129 // Can be vectorized later as a series of loads/insertelements.
4130 if (VectorizedCnt == VL.size() / VF) {
4131 // Compare the masked gather cost and the cost of loads + insertsubvector.
4132 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4133 InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
4134 Instruction::Load, VecTy,
4135 cast<LoadInst>(VL0)->getPointerOperand(),
4136 /*VariableMask=*/false, CommonAlignment, CostKind);
4137 InstructionCost VecLdCost = 0;
4138 auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
4139 for (auto [I, LS] : enumerate(States)) {
4140 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4141 switch (LS) {
4142 case LoadsState::Vectorize:
4143 VecLdCost += TTI.getMemoryOpCost(
4144 Instruction::Load, SubVecTy, LI0->getAlign(),
4145 LI0->getPointerAddressSpace(), CostKind,
4146 TTI::OperandValueInfo());
4147 break;
4148 case LoadsState::StridedVectorize:
4149 VecLdCost += TTI.getStridedMemoryOpCost(
4150 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4151 /*VariableMask=*/false, CommonAlignment, CostKind);
4152 break;
4153 case LoadsState::ScatterVectorize:
4154 VecLdCost += TTI.getGatherScatterOpCost(
4155 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4156 /*VariableMask=*/false, CommonAlignment, CostKind);
4157 break;
4158 case LoadsState::Gather:
4159 llvm_unreachable(
4160 "Expected only consecutive, strided or masked gather loads.");
4161 }
4162 VecLdCost +=
4163 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
4164 std::nullopt, CostKind, I * VF, SubVecTy);
4165 }
4166 // If masked gather cost is higher - better to vectorize, so
4167 // consider it as a gather node. It will be better estimated
4168 // later.
4169 if (MaskedGatherCost > VecLdCost)
4170 return true;
4171 }
4172 }
4173 return false;
4174 };
4175 // TODO: need to improve analysis of the pointers, if not all of them are
4176 // GEPs or have > 2 operands, we end up with a gather node, which just
4177 // increases the cost.
4178 Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
4179 bool ProfitableGatherPointers =
4180 L && Sz > 2 && count_if(PointerOps, [L](Value *V) {
4181 return L->isLoopInvariant(V);
4182 }) <= Sz / 2;
4183 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4184 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4185 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4186 (GEP && GEP->getNumOperands() == 2 &&
4187 isa<Constant, Instruction>(GEP->getOperand(1)));
4188 })) {
4189 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4190 if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
4191 !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4192 // Check if potential masked gather can be represented as series
4193 // of loads + insertsubvectors.
4194 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4195 // If masked gather cost is higher - better to vectorize, so
4196 // consider it as a gather node. It will be better estimated
4197 // later.
4198 return LoadsState::Gather;
4199 }
4200 return LoadsState::ScatterVectorize;
4201 }
4202 }
4203 }
4204
4205 return LoadsState::Gather;
4206}
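// For example, four simple i32 loads at consecutive element offsets %p+0,
// %p+1, %p+2, %p+3 (illustrative names) are classified as
// LoadsState::Vectorize, because the sorted pointer distance equals Sz - 1.
// Loads off the same base with variable, non-consecutive indices can be
// classified as ScatterVectorize when a masked gather is legal for the vector
// type and not forced to scalarize; otherwise the result is Gather.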
4207
4208static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4209 const DataLayout &DL, ScalarEvolution &SE,
4210 SmallVectorImpl<unsigned> &SortedIndices) {
4211 assert(llvm::all_of(
4212 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4213 "Expected list of pointer operands.");
4214 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4215 // Ptr into, sort and return the sorted indices with values next to one
4216 // another.
4217 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4218 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4219
4220 unsigned Cnt = 1;
4221 for (Value *Ptr : VL.drop_front()) {
4222 bool Found = any_of(Bases, [&](auto &Base) {
4223 std::optional<int> Diff =
4224 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4225 /*StrictCheck=*/true);
4226 if (!Diff)
4227 return false;
4228
4229 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4230 return true;
4231 });
4232
4233 if (!Found) {
4234 // If we haven't found enough to usefully cluster, return early.
4235 if (Bases.size() > VL.size() / 2 - 1)
4236 return false;
4237
4238 // Not found already - add a new Base
4239 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4240 }
4241 }
4242
4243 // For each of the bases sort the pointers by Offset and check if any of
4244 // the bases become consecutively allocated.
4245 bool AnyConsecutive = false;
4246 for (auto &Base : Bases) {
4247 auto &Vec = Base.second;
4248 if (Vec.size() > 1) {
4249 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4250 const std::tuple<Value *, int, unsigned> &Y) {
4251 return std::get<1>(X) < std::get<1>(Y);
4252 });
4253 int InitialOffset = std::get<1>(Vec[0]);
4254 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4255 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4256 });
4257 }
4258 }
4259
4260 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4261 SortedIndices.clear();
4262 if (!AnyConsecutive)
4263 return false;
4264
4265 for (auto &Base : Bases) {
4266 for (auto &T : Base.second)
4267 SortedIndices.push_back(std::get<2>(T));
4268 }
4269
4270 assert(SortedIndices.size() == VL.size() &&
4271 "Expected SortedIndices to be the size of VL");
4272 return true;
4273}
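// For example, for VL = {%a+2, %b+0, %a+0, %a+1, %b+1, %b+2} (illustrative
// pointers at element offsets into two unrelated bases %a and %b), the first
// pointer seen for each base becomes a key in Bases, every later pointer is
// recorded as a (Ptr, Offset, OrigIdx) tuple relative to that key, and after
// sorting by offset both groups are consecutive, so SortedIndices groups the
// accesses per base, e.g. {2, 3, 0, 1, 4, 5}.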
4274
4275std::optional<BoUpSLP::OrdersType>
4276BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4277 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4278 Type *ScalarTy = TE.Scalars[0]->getType();
4279
4280 SmallVector<Value *> Ptrs;
4281 Ptrs.reserve(TE.Scalars.size());
4282 for (Value *V : TE.Scalars) {
4283 auto *L = dyn_cast<LoadInst>(V);
4284 if (!L || !L->isSimple())
4285 return std::nullopt;
4286 Ptrs.push_back(L->getPointerOperand());
4287 }
4288
4289 BoUpSLP::OrdersType Order;
4290 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4291 return std::move(Order);
4292 return std::nullopt;
4293}
4294
4295/// Check if two insertelement instructions are from the same buildvector.
4296static bool areTwoInsertFromSameBuildVector(
4297 InsertElementInst *VU, InsertElementInst *V,
4298 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4299 // Instructions must be from the same basic blocks.
4300 if (VU->getParent() != V->getParent())
4301 return false;
4302 // Checks if 2 insertelements are from the same buildvector.
4303 if (VU->getType() != V->getType())
4304 return false;
4305 // Multiple used inserts are separate nodes.
4306 if (!VU->hasOneUse() && !V->hasOneUse())
4307 return false;
4308 auto *IE1 = VU;
4309 auto *IE2 = V;
4310 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4311 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4312 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4313 return false;
4314 // Go through the vector operand of insertelement instructions trying to find
4315 // either VU as the original vector for IE2 or V as the original vector for
4316 // IE1.
4317 SmallBitVector ReusedIdx(
4318 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4319 bool IsReusedIdx = false;
4320 do {
4321 if (IE2 == VU && !IE1)
4322 return VU->hasOneUse();
4323 if (IE1 == V && !IE2)
4324 return V->hasOneUse();
4325 if (IE1 && IE1 != V) {
4326 unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4327 IsReusedIdx |= ReusedIdx.test(Idx1);
4328 ReusedIdx.set(Idx1);
4329 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4330 IE1 = nullptr;
4331 else
4332 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4333 }
4334 if (IE2 && IE2 != VU) {
4335 unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4336 IsReusedIdx |= ReusedIdx.test(Idx2);
4337 ReusedIdx.set(Idx2);
4338 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4339 IE2 = nullptr;
4340 else
4341 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4342 }
4343 } while (!IsReusedIdx && (IE1 || IE2));
4344 return false;
4345}
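// For example (illustrative IR), for the chain
//   %v0 = insertelement <4 x float> poison, float %a, i32 0
//   %v1 = insertelement <4 x float> %v0, float %b, i32 1
// the walk over the vector operands reaches %v0 from %v1 without reusing an
// insertion index, so the two inserts are recognized as parts of the same
// buildvector (assuming %v0 has no other users and GetBaseOperand returns
// operand 0).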
4346
4347std::optional<BoUpSLP::OrdersType>
4348BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4349 // No need to reorder if we have to shuffle reuses; the node still needs
4350 // to be shuffled anyway.
4351 if (!TE.ReuseShuffleIndices.empty()) {
4352 if (isSplat(TE.Scalars))
4353 return std::nullopt;
4354 // Check if reuse shuffle indices can be improved by reordering.
4355 // For this, check that reuse mask is "clustered", i.e. each scalar values
4356 // is used once in each submask of size <number_of_scalars>.
4357 // Example: 4 scalar values.
4358 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4359 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4360 // element 3 is used twice in the second submask.
4361 unsigned Sz = TE.Scalars.size();
4362 if (TE.State == TreeEntry::NeedToGather) {
4363 if (std::optional<OrdersType> CurrentOrder =
4364 findReusedOrderedScalars(TE)) {
4365 SmallVector<int> Mask;
4366 fixupOrderingIndices(*CurrentOrder);
4367 inversePermutation(*CurrentOrder, Mask);
4368 ::addMask(Mask, TE.ReuseShuffleIndices);
4369 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4370 unsigned Sz = TE.Scalars.size();
4371 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4372 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4373 if (Idx != PoisonMaskElem)
4374 Res[Idx + K * Sz] = I + K * Sz;
4375 }
4376 return std::move(Res);
4377 }
4378 }
4379 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4380 TTI->getNumberOfParts(FixedVectorType::get(
4381 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4382 return std::nullopt;
4383 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4384 Sz)) {
4385 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4386 if (TE.ReorderIndices.empty())
4387 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4388 else
4389 inversePermutation(TE.ReorderIndices, ReorderMask);
4390 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4391 unsigned VF = ReorderMask.size();
4392 OrdersType ResOrder(VF, VF);
4393 unsigned NumParts = VF / Sz;
4394 SmallBitVector UsedVals(NumParts);
4395 for (unsigned I = 0; I < VF; I += Sz) {
4396 int Val = PoisonMaskElem;
4397 unsigned UndefCnt = 0;
4398 if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
4399 [&](int Idx) {
4400 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4401 Val = Idx;
4402 if (Idx == PoisonMaskElem)
4403 ++UndefCnt;
4404 return Idx != PoisonMaskElem && Idx != Val;
4405 }) ||
4406 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4407 UndefCnt > Sz / 2)
4408 return std::nullopt;
4409 UsedVals.set(Val);
4410 for (unsigned K = 0; K < NumParts; ++K)
4411 ResOrder[Val + Sz * K] = I + K;
4412 }
4413 return std::move(ResOrder);
4414 }
4415 unsigned VF = TE.getVectorFactor();
4416 // Try build correct order for extractelement instructions.
4417 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4418 TE.ReuseShuffleIndices.end());
4419 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4420 all_of(TE.Scalars, [Sz](Value *V) {
4421 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4422 return Idx && *Idx < Sz;
4423 })) {
4424 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4425 if (TE.ReorderIndices.empty())
4426 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4427 else
4428 inversePermutation(TE.ReorderIndices, ReorderMask);
4429 for (unsigned I = 0; I < VF; ++I) {
4430 int &Idx = ReusedMask[I];
4431 if (Idx == PoisonMaskElem)
4432 continue;
4433 Value *V = TE.Scalars[ReorderMask[Idx]];
4434 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4435 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4436 }
4437 }
4438 // Build the order of VF size; the reuses shuffles have to be reordered as
4439 // well, since they are always of VF size.
4440 OrdersType ResOrder(VF);
4441 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4442 auto *It = ResOrder.begin();
4443 for (unsigned K = 0; K < VF; K += Sz) {
4444 OrdersType CurrentOrder(TE.ReorderIndices);
4445 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4446 if (SubMask.front() == PoisonMaskElem)
4447 std::iota(SubMask.begin(), SubMask.end(), 0);
4448 reorderOrder(CurrentOrder, SubMask);
4449 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4450 std::advance(It, Sz);
4451 }
4452 if (TE.State == TreeEntry::NeedToGather &&
4453 all_of(enumerate(ResOrder),
4454 [](const auto &Data) { return Data.index() == Data.value(); }))
4455 return std::nullopt; // No need to reorder.
4456 return std::move(ResOrder);
4457 }
4458 if ((TE.State == TreeEntry::Vectorize ||
4459 TE.State == TreeEntry::StridedVectorize) &&
4460 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4461 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4462 !TE.isAltShuffle())
4463 return TE.ReorderIndices;
4464 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4465 auto PHICompare = [&](unsigned I1, unsigned I2) {
4466 Value *V1 = TE.Scalars[I1];
4467 Value *V2 = TE.Scalars[I2];
4468 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4469 return false;
4470 if (V1->getNumUses() < V2->getNumUses())
4471 return true;
4472 if (V1->getNumUses() > V2->getNumUses())
4473 return false;
4474 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4475 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4476 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4477 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4478 if (areTwoInsertFromSameBuildVector(
4479 IE1, IE2,
4480 [](InsertElementInst *II) { return II->getOperand(0); }))
4481 return I1 < I2;
4482 return getInsertIndex(IE1) < getInsertIndex(IE2);
4483 }
4484 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4485 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4486 if (EE1->getOperand(0) != EE2->getOperand(0))
4487 return I1 < I2;
4488 return getInsertIndex(EE1) < getInsertIndex(EE2);
4489 }
4490 return I1 < I2;
4491 };
4492 auto IsIdentityOrder = [](const OrdersType &Order) {
4493 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4494 if (Idx != Order[Idx])
4495 return false;
4496 return true;
4497 };
4498 if (!TE.ReorderIndices.empty())
4499 return TE.ReorderIndices;
4500 DenseMap<unsigned, unsigned> PhiToId;
4501 SmallVector<unsigned> Phis(TE.Scalars.size());
4502 std::iota(Phis.begin(), Phis.end(), 0);
4503 OrdersType ResOrder(TE.Scalars.size());
4504 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4505 PhiToId[Id] = Id;
4506 stable_sort(Phis, PHICompare);
4507 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4508 ResOrder[Id] = PhiToId[Phis[Id]];
4509 if (IsIdentityOrder(ResOrder))
4510 return std::nullopt; // No need to reorder.
4511 return std::move(ResOrder);
4512 }
4513 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4514 allSameType(TE.Scalars)) {
4515 // TODO: add analysis of other gather nodes with extractelement
4516 // instructions and other values/instructions, not only undefs.
4517 if ((TE.getOpcode() == Instruction::ExtractElement ||
4518 (all_of(TE.Scalars,
4519 [](Value *V) {
4520 return isa<UndefValue, ExtractElementInst>(V);
4521 }) &&
4522 any_of(TE.Scalars,
4523 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
4524 all_of(TE.Scalars, [](Value *V) {
4525 auto *EE = dyn_cast<ExtractElementInst>(V);
4526 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4527 })) {
4528 // Check that gather of extractelements can be represented as
4529 // just a shuffle of a single vector.
4530 OrdersType CurrentOrder;
4531 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4532 /*ResizeAllowed=*/true);
4533 if (Reuse || !CurrentOrder.empty())
4534 return std::move(CurrentOrder);
4535 }
4536 // If the gather node is <undef, v, .., poison> and
4537 // insertelement poison, v, 0 [+ permute]
4538 // is cheaper than
4539 // insertelement poison, v, n - try to reorder.
4540 // If rotating the whole graph, exclude the permute cost, since the whole
4541 // graph might be transformed.
4542 int Sz = TE.Scalars.size();
4543 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4544 count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
4545 const auto *It =
4546 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4547 if (It == TE.Scalars.begin())
4548 return OrdersType();
4549 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4550 if (It != TE.Scalars.end()) {
4551 OrdersType Order(Sz, Sz);
4552 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4553 Order[Idx] = 0;
4554 fixupOrderingIndices(Order);
4555 SmallVector<int> Mask;
4556 inversePermutation(Order, Mask);
4557 InstructionCost PermuteCost =
4558 TopToBottom
4559 ? 0
4560 : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
4561 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4562 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4563 PoisonValue::get(Ty), *It);
4564 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4565 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4566 PoisonValue::get(Ty), *It);
4567 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4568 OrdersType Order(Sz, Sz);
4569 Order[Idx] = 0;
4570 return std::move(Order);
4571 }
4572 }
4573 }
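// Illustrative note (not from the original source): on many targets an
// insertelement into lane 0 of a poison vector is cheaper than an insert into
// an arbitrary lane, so for a splat-like gather such as
// <poison, v, poison, poison> (Sz == 4, Idx == 1) the code above compares
// "insert at lane 0 plus, unless the whole graph is rotated, one permute"
// against "insert directly at lane 1". When the first variant wins, the
// returned order is {4, 0, 4, 4}: only the splatted lane is pinned to
// position 0, the remaining slots stay unset.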
4574 if (isSplat(TE.Scalars))
4575 return std::nullopt;
4576 if (TE.Scalars.size() >= 4)
4577 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4578 return Order;
4579 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4580 return CurrentOrder;
4581 }
4582 return std::nullopt;
4583}
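// Illustrative sketch (not part of the original file): throughout this code a
// reorder "order" and a shuffle "mask" are related by an inverse permutation:
// lane Order[I] of the reordered vector reads source element I. A minimal
// standalone version, assuming <vector> is available and using -1 as the
// poison-lane marker; the name is hypothetical:
[[maybe_unused]] static std::vector<int>
invertOrderSketch(const std::vector<unsigned> &Order) {
  std::vector<int> Mask(Order.size(), /*poison lane*/ -1);
  for (unsigned I = 0, E = Order.size(); I < E; ++I)
    Mask[Order[I]] = static_cast<int>(I);
  return Mask;
}
// For Order = {2, 0, 1} this produces Mask = {1, 2, 0}: lane 0 reads element
// 1, lane 1 reads element 2, and lane 2 reads element 0.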
4584
4585/// Checks if the given mask is a "clustered" mask with the same clusters of
4586/// size \p Sz, which are not identity submasks.
4587 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4588 unsigned Sz) {
4589 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4590 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
4591 return false;
4592 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4593 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4594 if (Cluster != FirstCluster)
4595 return false;
4596 }
4597 return true;
4598}
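// Illustrative examples (not from the original source), for Sz == 2:
//   {1, 0, 1, 0, 1, 0} -> true:  every cluster repeats {1, 0}, which is not
//                                the identity submask {0, 1};
//   {0, 1, 0, 1}       -> false: the first cluster is an identity submask;
//   {1, 0, 0, 1}       -> false: the second cluster differs from the first.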
4599
4600void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4601 // Reorder reuses mask.
4602 reorderReuses(TE.ReuseShuffleIndices, Mask);
4603 const unsigned Sz = TE.Scalars.size();
4604 // For vectorized and non-clustered reuses, no need to do anything else.
4605 if (TE.State != TreeEntry::NeedToGather ||
4606 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4607 Sz) ||
4608 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4609 return;
4610 SmallVector<int> NewMask;
4611 inversePermutation(TE.ReorderIndices, NewMask);
4612 addMask(NewMask, TE.ReuseShuffleIndices);
4613 // Clear reorder since it is going to be applied to the new mask.
4614 TE.ReorderIndices.clear();
4615 // Try to improve gathered nodes with clustered reuses, if possible.
4616 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4617 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4618 inversePermutation(NewOrder, NewMask);
4619 reorderScalars(TE.Scalars, NewMask);
4620 // Fill the reuses mask with the identity submasks.
4621 for (auto *It = TE.ReuseShuffleIndices.begin(),
4622 *End = TE.ReuseShuffleIndices.end();
4623 It != End; std::advance(It, Sz))
4624 std::iota(It, std::next(It, Sz), 0);
4625}
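// Illustrative note (not from the original source): for a gathered entry the
// transformation above trades a repeated, non-identity reuse mask for a
// one-time reordering of the gathered scalars. E.g., assuming two scalars
// {a, b} with no separate reorder indices whose (already reordered) reuse
// mask came out as {1, 0, 1, 0}, the scalars are permuted to {b, a} and the
// reuse mask is rewritten to the identity clusters {0, 1, 0, 1}, so every
// repeat of the cluster reads the gathered vector in order and no extra
// per-cluster shuffle is needed.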
4626
4627 static void combineOrders(MutableArrayRef<unsigned> Order,
4628 ArrayRef<unsigned> SecondaryOrder) {
4629 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
4630 "Expected same size of orders");
4631 unsigned Sz = Order.size();
4632 SmallBitVector UsedIndices(Sz);
4633 for (unsigned Idx : seq<unsigned>(0, Sz)) {
4634 if (Order[Idx] != Sz)
4635 UsedIndices.set(Order[Idx]);
4636 }
4637 if (SecondaryOrder.empty()) {
4638 for (unsigned Idx : seq<unsigned>(0, Sz))
4639 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
4640 Order[Idx] = Idx;
4641 } else {
4642 for (unsigned Idx : seq<unsigned>(0, Sz))
4643 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
4644 !UsedIndices.test(SecondaryOrder[Idx]))
4645 Order[Idx] = SecondaryOrder[Idx];
4646 }
4647}
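// Illustrative example (not from the original source), with Sz == 4 and the
// value 4 meaning "unset":
//   combineOrders({4, 2, 4, 1}, {})            -> {0, 2, 4, 1}
//     slot 0 takes the unused index 0; slot 2 stays unset because index 2 is
//     already used by slot 1;
//   combineOrders({4, 2, 4, 1}, {3, 2, 0, 1})  -> {3, 2, 0, 1}
//     unset slots are filled from SecondaryOrder wherever that index is still
//     free.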
4648
4649 void BoUpSLP::reorderTopToBottom() {
4650 // Maps VF to the graph nodes.
4651 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
4652 // ExtractElement gather nodes which can be vectorized and need to handle
4653 // their ordering.
4654 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4655
4656 // Phi nodes can have a preferred ordering based on their result users.
4657 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
4658
4659 // AltShuffles can also have a preferred ordering that leads to fewer
4660 // instructions, e.g., the addsub instruction in x86.
4661 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4662
4663 // Maps a TreeEntry to the reorder indices of external users.
4664 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
4665 ExternalUserReorderMap;
4666 // FIXME: Workaround for syntax error reported by MSVC buildbots.
4667 TargetTransformInfo &TTIRef = *TTI;
4668 // Find all reorderable nodes with the given VF.
4669 // Currently these are vectorized stores, loads, extracts + some gathering
4670 // of extracts.
4671 for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
4672 &GathersToOrders, &ExternalUserReorderMap,
4673 &AltShufflesToOrders, &PhisToOrders](
4674 const std::unique_ptr<TreeEntry> &TE) {
4675 // Look for external users that will probably be vectorized.
4676 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4677 findExternalStoreUsersReorderIndices(TE.get());
4678 if (!ExternalUserReorderIndices.empty()) {
4679 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4680 ExternalUserReorderMap.try_emplace(TE.get(),
4681 std::move(ExternalUserReorderIndices));
4682 }
4683
4684 // Patterns like [fadd,fsub] can be combined into a single instruction in
4685 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4686 // to take into account their order when looking for the most used order.
4687 if (TE->isAltShuffle()) {
4688 VectorType *VecTy =
4689 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
4690 unsigned Opcode0 = TE->getOpcode();
4691 unsigned Opcode1 = TE->getAltOpcode();
4692 // The opcode mask selects between the two opcodes.
4693 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4694 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4695 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4696 OpcodeMask.set(Lane);
4697 // If this pattern is supported by the target then we consider the order.
4698 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4699 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4700 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
4701 }
4702 // TODO: Check the reverse order too.
4703 }
4704
4705 if (std::optional<OrdersType> CurrentOrder =
4706 getReorderingData(*TE, /*TopToBottom=*/true)) {
4707 // Do not include ordering for nodes used in the alt opcode vectorization;
4708 // it is better to reorder them during the bottom-to-top stage. If we follow
4709 // the order here, it causes reordering of the whole graph, though it is
4710 // actually profitable just to reorder the subgraph that starts from the
4711 // alternate opcode vectorization node. Such nodes already end up with a
4712 // shuffle instruction, and it is enough to change this shuffle rather than
4713 // rotate the scalars for the whole graph.
4714 unsigned Cnt = 0;
4715 const TreeEntry *UserTE = TE.get();
4716 while (UserTE && Cnt < RecursionMaxDepth) {
4717 if (UserTE->UserTreeIndices.size() != 1)
4718 break;
4719 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
4720 return EI.UserTE->State == TreeEntry::Vectorize &&
4721 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4722 }))
4723 return;
4724 UserTE = UserTE->UserTreeIndices.back().UserTE;
4725 ++Cnt;
4726 }
4727 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4728 if (!(TE->State == TreeEntry::Vectorize ||
4729 TE->State == TreeEntry::StridedVectorize) ||
4730 !TE->ReuseShuffleIndices.empty())
4731 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4732 if (TE->State == TreeEntry::Vectorize &&
4733 TE->getOpcode() == Instruction::PHI)
4734 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
4735 }
4736 });
4737
4738 // Reorder the graph nodes according to their vectorization factor.
4739 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
4740 VF /= 2) {
4741 auto It = VFToOrderedEntries.find(VF);
4742 if (It == VFToOrderedEntries.end())
4743 continue;
4744 // Try to find the most profitable order. We are just looking for the most
4745 // used order and reorder the scalar elements in the nodes according to
4746 // this most used order.
4747 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
4748 // All operands are reordered and used only in this node - propagate the
4749 // most used order to the user node.
4750 MapVector<OrdersType, unsigned,
4751 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
4752 OrdersUses;
4754 for (const TreeEntry *OpTE : OrderedEntries) {
4755 // No need to reorder these nodes; they still need to be extended and use a
4756 // shuffle, we just need to merge the reordering shuffle and the reuse shuffle.
4757 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
4758 continue;
4759 // Count number of orders uses.
4760 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
4761 &PhisToOrders]() -> const OrdersType & {
4762 if (OpTE->State == TreeEntry::NeedToGather ||
4763 !OpTE->ReuseShuffleIndices.empty()) {
4764 auto It = GathersToOrders.find(OpTE);
4765 if (It != GathersToOrders.end())
4766 return It->second;
4767 }
4768 if (OpTE->isAltShuffle()) {
4769 auto It = AltShufflesToOrders.find(OpTE);
4770 if (It != AltShufflesToOrders.end())
4771 return It->second;
4772 }
4773 if (OpTE->State == TreeEntry::Vectorize &&
4774 OpTE->getOpcode() == Instruction::PHI) {
4775 auto It = PhisToOrders.find(OpTE);
4776 if (It != PhisToOrders.end())
4777 return It->second;
4778 }
4779 return OpTE->ReorderIndices;
4780 }();
4781 // First consider the order of the external scalar users.
4782 auto It = ExternalUserReorderMap.find(OpTE);
4783 if (It != ExternalUserReorderMap.end()) {
4784 const auto &ExternalUserReorderIndices = It->second;
4785 // If the OpTE vector factor != number of scalars, use the natural order;
4786 // this is an attempt to reorder a node with reused scalars but with
4787 // external uses.
4788 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
4789 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
4790 ExternalUserReorderIndices.size();
4791 } else {
4792 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
4793 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
4794 }
4795 // No other useful reorder data in this entry.
4796 if (Order.empty())
4797 continue;
4798 }
4799 // Stores actually store the mask, not the order; we need to invert it.
4800 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4801 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4802 SmallVector<int> Mask;
4803 inversePermutation(Order, Mask);
4804 unsigned E = Order.size();
4805 OrdersType CurrentOrder(E, E);
4806 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
4807 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
4808 });
4809 fixupOrderingIndices(CurrentOrder);
4810 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
4811 } else {
4812 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4813 }
4814 }
4815 if (OrdersUses.empty())
4816 continue;
4817 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
4818 const unsigned Sz = Order.size();
4819 for (unsigned Idx : seq<unsigned>(0, Sz))
4820 if (Idx != Order[Idx] && Order[Idx] != Sz)
4821 return false;
4822 return true;
4823 };
4824 // Choose the most used order.
4825 unsigned IdentityCnt = 0;
4826 unsigned FilledIdentityCnt = 0;
4827 OrdersType IdentityOrder(VF, VF);
4828 for (auto &Pair : OrdersUses) {
4829 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
4830 if (!Pair.first.empty())
4831 FilledIdentityCnt += Pair.second;
4832 IdentityCnt += Pair.second;
4833 combineOrders(IdentityOrder, Pair.first);
4834 }
4835 }
4836 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
4837 unsigned Cnt = IdentityCnt;
4838 for (auto &Pair : OrdersUses) {
4839 // Prefer the identity order. But if a filled identity (non-empty order) was
4840 // found with the same number of uses as the new candidate order, we can
4841 // choose this candidate order instead.
4842 if (Cnt < Pair.second ||
4843 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
4844 Cnt == Pair.second && !BestOrder.empty() &&
4845 IsIdentityOrder(BestOrder))) {
4846 combineOrders(Pair.first, BestOrder);
4847 BestOrder = Pair.first;
4848 Cnt = Pair.second;
4849 } else {
4850 combineOrders(BestOrder, Pair.first);
4851 }
4852 }
4853 // Set order of the user node.
4854 if (IsIdentityOrder(BestOrder))
4855 continue;
4856 fixupOrderingIndices(BestOrder);
4857 SmallVector<int> Mask;
4858 inversePermutation(BestOrder, Mask);
4859 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
4860 unsigned E = BestOrder.size();
4861 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
4862 return I < E ? static_cast<int>(I) : PoisonMaskElem;
4863 });
4864 // Do an actual reordering, if profitable.
4865 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
4866 // Just do the reordering for the nodes with the given VF.
4867 if (TE->Scalars.size() != VF) {
4868 if (TE->ReuseShuffleIndices.size() == VF) {
4869 // Need to reorder the reuses masks of the operands with smaller VF to
4870 // be able to find the match between the graph nodes and scalar
4871 // operands of the given node during vectorization/cost estimation.
4872 assert(all_of(TE->UserTreeIndices,
4873 [VF, &TE](const EdgeInfo &EI) {
4874 return EI.UserTE->Scalars.size() == VF ||
4875 EI.UserTE->Scalars.size() ==
4876 TE->Scalars.size();
4877 }) &&
4878 "All users must be of VF size.");
4879 // Update ordering of the operands with the smaller VF than the given
4880 // one.
4881 reorderNodeWithReuses(*TE, Mask);
4882 }
4883 continue;
4884 }
4885 if ((TE->State == TreeEntry::Vectorize ||
4886 TE->State == TreeEntry::StridedVectorize) &&
4887 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
4888 InsertElementInst>(TE->getMainOp()) &&
4889 !TE->isAltShuffle()) {
4890 // Build correct orders for extract{element,value}, loads and
4891 // stores.
4892 reorderOrder(TE->ReorderIndices, Mask);
4893 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
4894 TE->reorderOperands(Mask);
4895 } else {
4896 // Reorder the node and its operands.
4897 TE->reorderOperands(Mask);
4898 assert(TE->ReorderIndices.empty() &&
4899 "Expected empty reorder sequence.");
4900 reorderScalars(TE->Scalars, Mask);
4901 }
4902 if (!TE->ReuseShuffleIndices.empty()) {
4903 // Apply the reversed order to keep the original ordering of the reused
4904 // elements and avoid extra shuffling of the reorder indices.
4905 OrdersType CurrentOrder;
4906 reorderOrder(CurrentOrder, MaskOrder);
4907 SmallVector<int> NewReuses;
4908 inversePermutation(CurrentOrder, NewReuses);
4909 addMask(NewReuses, TE->ReuseShuffleIndices);
4910 TE->ReuseShuffleIndices.swap(NewReuses);
4911 }
4912 }
4913 }
4914}
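// Illustrative sketch (not part of the original file): the per-VF loop above
// is essentially a vote over candidate orders, with a bias towards keeping
// the identity (i.e. doing nothing). A minimal standalone version of that
// vote, assuming <map> and <vector> are available and using hypothetical
// names, could look like this:
[[maybe_unused]] static std::vector<unsigned>
pickMostUsedOrderSketch(const std::map<std::vector<unsigned>, unsigned> &Uses) {
  auto IsIdentity = [](const std::vector<unsigned> &Order) {
    for (unsigned I = 0, E = Order.size(); I < E; ++I)
      if (Order[I] != I)
        return false;
    return true; // An empty order also counts as "keep things as they are".
  };
  // Sum all votes that effectively ask for the identity order.
  unsigned BestCnt = 0;
  for (const auto &P : Uses)
    if (IsIdentity(P.first))
      BestCnt += P.second;
  // A non-identity order wins only if it is strictly more popular.
  std::vector<unsigned> Best; // Empty result means "do not reorder".
  for (const auto &P : Uses)
    if (!IsIdentity(P.first) && P.second > BestCnt) {
      Best = P.first;
      BestCnt = P.second;
    }
  return Best;
}
// Unlike the real code, this sketch ignores partially filled identity orders
// and the combineOrders merging of compatible candidates; it only captures
// the "prefer identity unless a reordering is clearly more used" rule.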
4915
4916bool BoUpSLP::canReorderOperands(
4917 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
4918 ArrayRef<TreeEntry *> ReorderableGathers,
4919 SmallVectorImpl<TreeEntry *> &GatherOps) {
4920 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
4921 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
4922 return OpData.first == I &&
4923 OpData.second->State == TreeEntry::Vectorize;
4924 }))
4925 continue;
4926 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
4927 // Do not reorder if operand node is used by many user nodes.
4928 if (any_of(TE->UserTreeIndices,
4929 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
4930 return false;
4931 // Add the node to the list of the ordered nodes with the identity
4932 // order.
4933 Edges.emplace_back(I, TE);
4934 // Add ScatterVectorize nodes to the list of operands, where just
4935 // reordering of the scalars is required. Similar to the gathers, so
4936 // simply add to the list of gathered ops.
4937 // If there are reused scalars, process this node as a regular vectorize
4938 // node, just reorder reuses mask.
4939 if (TE->State != TreeEntry::Vectorize &&
4940 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
4941 GatherOps.push_back(TE);
4942 continue;
4943 }
4944 TreeEntry *Gather = nullptr;
4945 if (count_if(ReorderableGathers,
4946 [&Gather, UserTE, I](TreeEntry *TE) {
4947 assert(TE->State != TreeEntry::Vectorize &&
4948 "Only non-vectorized nodes are expected.");
4949 if (any_of(TE->UserTreeIndices,
4950 [UserTE, I](const EdgeInfo &EI) {
4951 return EI.UserTE == UserTE && EI.EdgeIdx == I;
4952 })) {
4953 assert(TE->isSame(UserTE->getOperand(I)) &&
4954 "Operand entry does not match operands.");
4955 Gather = TE;
4956 return true;
4957 }
4958 return false;
4959 }) > 1 &&
4960 !allConstant(UserTE->getOperand(I)))
4961 return false;
4962 if (Gather)
4963 GatherOps.push_back(Gather);
4964 }
4965 return true;
4966}
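// Illustrative note (not from the original source): operand reordering is
// refused when it could not be applied consistently, e.g. when a vectorized
// operand entry is shared with a different user node, or when more than one
// reorderable gather entry matches the same non-constant operand edge of this
// user. Operands that still need gathering and have no reuses or fixed order
// are collected in GatherOps so the caller can reorder their scalars directly.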
4967
4968void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
4969 SetVector<TreeEntry *> OrderedEntries;
4970 DenseSet<const TreeEntry *> GathersToOrders;
4971 // Find all reorderable leaf nodes with the given VF.
4972 // Currently these are vectorized loads and extracts without alternate
4973 // operands + some gathering of extracts.
4974 SmallVector<TreeEntry *> NonVectorized;
4975 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
4976 if (TE->State != TreeEntry::Vectorize &&
4977 TE->State != TreeEntry::StridedVectorize)
4978 NonVectorized.push_back(TE.get());
4979 if (std::optional<OrdersType> CurrentOrder =
4980 getReorderingData(*TE, /*TopToBottom=*/false)) {
4981 OrderedEntries.insert(TE.get());
4982 if (!(TE->State == TreeEntry::Vectorize ||
4983 TE->State == TreeEntry::StridedVectorize) ||
4984 !TE->ReuseShuffleIndices.empty())
4985 GathersToOrders.insert(TE.get());
4986 }
4987 }
4988
4989 // 1. Propagate order to the graph nodes that use only reordered nodes.
4990 // I.e., if the node has operands that are reordered, try to keep at least
4991 // one operand in the natural order and reorder the others + reorder the
4992 // user node itself.
4994 while (!OrderedEntries.empty()) {
4995 // 1. Filter out only reordered nodes.
4996 // 2. If the entry has multiple uses - skip it and jump to the next node.
4998 SmallVector<TreeEntry *> Filtered;
4999 for (TreeEntry *TE : OrderedEntries) {
5000 if (!(TE->State == TreeEntry::Vectorize ||
5001 TE->State == TreeEntry::StridedVectorize ||
5002 (TE-