SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
25#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/Statistic.h"
31#include "llvm/ADT/iterator.h"
50#include "llvm/IR/Attributes.h"
51#include "llvm/IR/BasicBlock.h"
52#include "llvm/IR/Constant.h"
53#include "llvm/IR/Constants.h"
54#include "llvm/IR/DataLayout.h"
56#include "llvm/IR/Dominators.h"
57#include "llvm/IR/Function.h"
58#include "llvm/IR/IRBuilder.h"
59#include "llvm/IR/InstrTypes.h"
60#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Intrinsics.h"
64#include "llvm/IR/Module.h"
65#include "llvm/IR/Operator.h"
67#include "llvm/IR/Type.h"
68#include "llvm/IR/Use.h"
69#include "llvm/IR/User.h"
70#include "llvm/IR/Value.h"
71#include "llvm/IR/ValueHandle.h"
72#ifdef EXPENSIVE_CHECKS
73#include "llvm/IR/Verifier.h"
74#endif
75#include "llvm/Pass.h"
80#include "llvm/Support/Debug.h"
90#include <algorithm>
91#include <cassert>
92#include <cstdint>
93#include <iterator>
94#include <memory>
95#include <optional>
96#include <set>
97#include <string>
98#include <tuple>
99#include <utility>
100
101using namespace llvm;
102using namespace llvm::PatternMatch;
103using namespace slpvectorizer;
104
105#define SV_NAME "slp-vectorizer"
106#define DEBUG_TYPE "SLP"
107
108STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
109
110static cl::opt<bool>
111 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
112 cl::desc("Run the SLP vectorization passes"));
113
114static cl::opt<int>
115 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
116 cl::desc("Only vectorize if you gain more than this "
117 "number "));
118
119static cl::opt<bool>
120ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
121 cl::desc("Attempt to vectorize horizontal reductions"));
122
124 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
125 cl::desc(
126 "Attempt to vectorize horizontal reductions feeding into a store"));
127
128// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
129// even if we match a reduction but do not vectorize in the end.
131 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
132 cl::desc("Allow optimization of original scalar identity operations on "
133 "matched horizontal reductions."));
134
135static cl::opt<int>
136 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
137 cl::desc("Attempt to vectorize for this register size in bits"));
138
139static cl::opt<unsigned>
140 MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
141 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
142
143/// Limits the size of scheduling regions in a block.
144/// It avoids long compile times for _very_ large blocks where vector
145/// instructions are spread over a wide range.
146/// This limit is way higher than needed by real-world functions.
147static cl::opt<int>
148ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
149 cl::desc("Limit the size of the SLP scheduling region per block"));
150
152 "slp-min-reg-size", cl::init(128), cl::Hidden,
153 cl::desc("Attempt to vectorize for this register size in bits"));
154
156 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
157 cl::desc("Limit the recursion depth when building a vectorizable tree"));
158
160 "slp-min-tree-size", cl::init(3), cl::Hidden,
161 cl::desc("Only vectorize small trees if they are fully vectorizable"));
162
163// The maximum depth that the look-ahead score heuristic will explore.
164// The higher this value, the higher the compilation time overhead.
166 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
167 cl::desc("The maximum look-ahead depth for operand reordering scores"));
168
169// The maximum depth that the look-ahead score heuristic will explore
170// when probing among candidates for vectorization tree roots.
171// The higher this value, the higher the compilation time overhead, but unlike
172// the similar limit for operand reordering this is used less frequently, so the
173// impact of a higher value is less noticeable.
174static cl::opt<int> RootLookAheadMaxDepth(
175 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
176 cl::desc("The maximum look-ahead depth for searching best rooting option"));
177
179 "slp-min-strided-loads", cl::init(2), cl::Hidden,
180 cl::desc("The minimum number of loads, which should be considered strided, "
181 "if the stride is > 1 or is runtime value"));
182
184 "slp-max-stride", cl::init(8), cl::Hidden,
185 cl::desc("The maximum stride, considered to be profitable."));
186
187static cl::opt<bool>
188 ViewSLPTree("view-slp-tree", cl::Hidden,
189 cl::desc("Display the SLP trees with Graphviz"));
190
191// Limit the number of alias checks. The limit is chosen so that
192// it has no negative effect on the llvm benchmarks.
193static const unsigned AliasedCheckLimit = 10;
194
195// Another limit for the alias checks: The maximum distance between load/store
196// instructions where alias checks are done.
197// This limit is useful for very large basic blocks.
198static const unsigned MaxMemDepDistance = 160;
199
200/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
201/// regions to be handled.
202static const int MinScheduleRegionSize = 16;
203
204/// Predicate for the element types that the SLP vectorizer supports.
205///
206/// The most important thing to filter here are types which are invalid in LLVM
207/// vectors. We also filter target specific types which have absolutely no
208/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
209/// avoids spending time checking the cost model and realizing that they will
210/// be inevitably scalarized.
211static bool isValidElementType(Type *Ty) {
212 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
213 !Ty->isPPC_FP128Ty();
214}
215
216/// \returns True if the value is a constant (but not globals/constant
217/// expressions).
218static bool isConstant(Value *V) {
219 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
220}
221
222/// Checks if \p V is one of vector-like instructions, i.e. undef,
223/// insertelement/extractelement with constant indices for fixed vector type or
224/// extractvalue instruction.
225static bool isVectorLikeInstWithConstOps(Value *V) {
226 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
227 !isa<ExtractValueInst, UndefValue>(V))
228 return false;
229 auto *I = dyn_cast<Instruction>(V);
230 if (!I || isa<ExtractValueInst>(I))
231 return true;
232 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
233 return false;
234 if (isa<ExtractElementInst>(I))
235 return isConstant(I->getOperand(1));
236 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
237 return isConstant(I->getOperand(2));
238}
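/// Illustrative example (not part of the original file): for a fixed vector
/// type, values such as
///   %e = extractelement <4 x i8> %v, i32 2        ; constant index
///   %i = insertelement <4 x i8> %v, i8 %x, i32 0  ; constant index
///   %a = extractvalue { i32, i32 } %agg, 0
///   undef
/// are treated as vector-like here, while
///   %d = extractelement <4 x i8> %v, i32 %n       ; variable index
/// is not, because its index is not a constant.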
239
240#if !defined(NDEBUG)
241/// Print a short descriptor of the instruction bundle suitable for debug output.
242static std::string shortBundleName(ArrayRef<Value *> VL) {
243 std::string Result;
244 raw_string_ostream OS(Result);
245 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
246 OS.flush();
247 return Result;
248}
249#endif
250
251/// \returns true if all of the instructions in \p VL are in the same block or
252/// false otherwise.
253static bool allSameBlock(ArrayRef<Value *> VL) {
254 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
255 if (!I0)
256 return false;
257 if (all_of(VL, isVectorLikeInstWithConstOps))
258 return true;
259
260 BasicBlock *BB = I0->getParent();
261 for (int I = 1, E = VL.size(); I < E; I++) {
262 auto *II = dyn_cast<Instruction>(VL[I]);
263 if (!II)
264 return false;
265
266 if (BB != II->getParent())
267 return false;
268 }
269 return true;
270}
271
272/// \returns True if all of the values in \p VL are constants (but not
273/// globals/constant expressions).
274static bool allConstant(ArrayRef<Value *> VL) {
275 // Constant expressions and globals can't be vectorized like normal integer/FP
276 // constants.
277 return all_of(VL, isConstant);
278}
279
280/// \returns True if all of the values in \p VL are identical or some of them
281/// are UndefValue.
282static bool isSplat(ArrayRef<Value *> VL) {
283 Value *FirstNonUndef = nullptr;
284 for (Value *V : VL) {
285 if (isa<UndefValue>(V))
286 continue;
287 if (!FirstNonUndef) {
288 FirstNonUndef = V;
289 continue;
290 }
291 if (V != FirstNonUndef)
292 return false;
293 }
294 return FirstNonUndef != nullptr;
295}
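/// Example (added for illustration): {%a, undef, %a} is a splat of %a,
/// {%a, %b, %a} is not (two distinct non-undef values), and {undef, undef}
/// is not a splat either because no non-undef value is present.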
296
297/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
298static bool isCommutative(Instruction *I) {
299 if (auto *Cmp = dyn_cast<CmpInst>(I))
300 return Cmp->isCommutative();
301 if (auto *BO = dyn_cast<BinaryOperator>(I))
302 return BO->isCommutative();
303 // TODO: This should check for generic Instruction::isCommutative(), but
304 // we need to confirm that the caller code correctly handles Intrinsics
305 // for example (does not have 2 operands).
306 return false;
307}
308
309/// \returns inserting index of InsertElement or InsertValue instruction,
310/// using Offset as base offset for index.
311static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
312 unsigned Offset = 0) {
313 int Index = Offset;
314 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
315 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
316 if (!VT)
317 return std::nullopt;
318 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
319 if (!CI)
320 return std::nullopt;
321 if (CI->getValue().uge(VT->getNumElements()))
322 return std::nullopt;
323 Index *= VT->getNumElements();
324 Index += CI->getZExtValue();
325 return Index;
326 }
327
328 const auto *IV = cast<InsertValueInst>(InsertInst);
329 Type *CurrentType = IV->getType();
330 for (unsigned I : IV->indices()) {
331 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
332 Index *= ST->getNumElements();
333 CurrentType = ST->getElementType(I);
334 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
335 Index *= AT->getNumElements();
336 CurrentType = AT->getElementType();
337 } else {
338 return std::nullopt;
339 }
340 Index += I;
341 }
342 return Index;
343}
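/// Worked example (added for illustration): for
///   %r = insertvalue {[2 x i32], [2 x i32]} %agg, i32 %x, 1, 0
/// the loop flattens the nested aggregate, giving
///   Index = ((0 * 2) + 1) * 2 + 0 = 2,
/// i.e. %x occupies the third scalar slot of the flattened aggregate. For
///   insertelement <4 x i32> %v, i32 %x, i32 3
/// the result is simply 3 (with the default Offset of 0).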
344
345namespace {
346/// Specifies the way the mask should be analyzed for undefs/poisonous elements
347/// in the shuffle mask.
348enum class UseMask {
349 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
350 ///< check for the mask elements for the first argument (mask
351 ///< indices are in range [0:VF)).
352 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
353 ///< for the mask elements for the second argument (mask indices
354 ///< are in range [VF:2*VF))
355 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
356 ///< future shuffle elements and mark them as used. Non-undef
357 ///< elements are considered unused since they are already
358 ///< marked as used in the mask.
359};
360} // namespace
361
362/// Prepares a use bitset for the given mask either for the first argument or
363/// for the second.
364static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
365 UseMask MaskArg) {
366 SmallBitVector UseMask(VF, true);
367 for (auto [Idx, Value] : enumerate(Mask)) {
368 if (Value == PoisonMaskElem) {
369 if (MaskArg == UseMask::UndefsAsMask)
370 UseMask.reset(Idx);
371 continue;
372 }
373 if (MaskArg == UseMask::FirstArg && Value < VF)
374 UseMask.reset(Value);
375 else if (MaskArg == UseMask::SecondArg && Value >= VF)
376 UseMask.reset(Value - VF);
377 }
378 return UseMask;
379}
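/// Worked example (added for illustration): with VF = 4 and
/// Mask = {0, 5, 1, PoisonMaskElem}, the bits cleared in the returned bitset
/// are:
///   FirstArg:     bits 0 and 1 (mask values 0 and 1 index the first vector),
///   SecondArg:    bit 1        (mask value 5 indexes lane 5 - VF = 1),
///   UndefsAsMask: bit 3        (the position that holds PoisonMaskElem).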
380
381/// Checks if the given value is actually an undefined constant vector.
382/// Also, if the \p UseMask is not empty, tries to check if the non-masked
383/// elements actually mask the insertelement buildvector, if any.
384template <bool IsPoisonOnly = false>
385static SmallBitVector isUndefVector(const Value *V,
386 const SmallBitVector &UseMask = {}) {
387 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
388 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
389 if (isa<T>(V))
390 return Res;
391 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
392 if (!VecTy)
393 return Res.reset();
394 auto *C = dyn_cast<Constant>(V);
395 if (!C) {
396 if (!UseMask.empty()) {
397 const Value *Base = V;
398 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
399 Base = II->getOperand(0);
400 if (isa<T>(II->getOperand(1)))
401 continue;
402 std::optional<unsigned> Idx = getInsertIndex(II);
403 if (!Idx) {
404 Res.reset();
405 return Res;
406 }
407 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
408 Res.reset(*Idx);
409 }
410 // TODO: Add analysis for shuffles here too.
411 if (V == Base) {
412 Res.reset();
413 } else {
414 SmallBitVector SubMask(UseMask.size(), false);
415 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
416 }
417 } else {
418 Res.reset();
419 }
420 return Res;
421 }
422 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
423 if (Constant *Elem = C->getAggregateElement(I))
424 if (!isa<T>(Elem) &&
425 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
426 Res.reset(I);
427 }
428 return Res;
429}
430
431/// Checks if the vector of instructions can be represented as a shuffle, like:
432/// %x0 = extractelement <4 x i8> %x, i32 0
433/// %x3 = extractelement <4 x i8> %x, i32 3
434/// %y1 = extractelement <4 x i8> %y, i32 1
435/// %y2 = extractelement <4 x i8> %y, i32 2
436/// %x0x0 = mul i8 %x0, %x0
437/// %x3x3 = mul i8 %x3, %x3
438/// %y1y1 = mul i8 %y1, %y1
439/// %y2y2 = mul i8 %y2, %y2
440/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
441/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
442/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
443/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
444/// ret <4 x i8> %ins4
445/// can be transformed into:
446/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
447/// i32 6>
448/// %2 = mul <4 x i8> %1, %1
449/// ret <4 x i8> %2
450/// Mask will return the Shuffle Mask equivalent to the extracted elements.
451/// TODO: Can we split off and reuse the shuffle mask detection from
452/// ShuffleVectorInst/getShuffleCost?
453static std::optional<TargetTransformInfo::ShuffleKind>
454isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
455 const auto *It =
456 find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
457 if (It == VL.end())
458 return std::nullopt;
459 auto *EI0 = cast<ExtractElementInst>(*It);
460 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
461 return std::nullopt;
462 unsigned Size =
463 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
464 Value *Vec1 = nullptr;
465 Value *Vec2 = nullptr;
466 enum ShuffleMode { Unknown, Select, Permute };
467 ShuffleMode CommonShuffleMode = Unknown;
468 Mask.assign(VL.size(), PoisonMaskElem);
469 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
470 // Undef can be represented as an undef element in a vector.
471 if (isa<UndefValue>(VL[I]))
472 continue;
473 auto *EI = cast<ExtractElementInst>(VL[I]);
474 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
475 return std::nullopt;
476 auto *Vec = EI->getVectorOperand();
477 // We can extractelement from undef or poison vector.
478 if (isUndefVector(Vec).all())
479 continue;
480 // All vector operands must have the same number of vector elements.
481 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
482 return std::nullopt;
483 if (isa<UndefValue>(EI->getIndexOperand()))
484 continue;
485 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
486 if (!Idx)
487 return std::nullopt;
488 // Undefined behavior if Idx is negative or >= Size.
489 if (Idx->getValue().uge(Size))
490 continue;
491 unsigned IntIdx = Idx->getValue().getZExtValue();
492 Mask[I] = IntIdx;
493 // For correct shuffling we have to have at most 2 different vector operands
494 // in all extractelement instructions.
495 if (!Vec1 || Vec1 == Vec) {
496 Vec1 = Vec;
497 } else if (!Vec2 || Vec2 == Vec) {
498 Vec2 = Vec;
499 Mask[I] += Size;
500 } else {
501 return std::nullopt;
502 }
503 if (CommonShuffleMode == Permute)
504 continue;
505 // If the extract index is not the same as the operation number, it is a
506 // permutation.
507 if (IntIdx != I) {
508 CommonShuffleMode = Permute;
509 continue;
510 }
511 CommonShuffleMode = Select;
512 }
513 // If we're not crossing lanes in different vectors, consider it as blending.
514 if (CommonShuffleMode == Select && Vec2)
515 return TargetTransformInfo::SK_Select;
516 // If Vec2 was never used, we have a permutation of a single vector; otherwise
517 // we have a permutation of 2 vectors.
518 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
519 : TargetTransformInfo::SK_PermuteSingleSrc;
520}
521
522/// \returns True if Extract{Value,Element} instruction extracts element Idx.
523static std::optional<unsigned> getExtractIndex(Instruction *E) {
524 unsigned Opcode = E->getOpcode();
525 assert((Opcode == Instruction::ExtractElement ||
526 Opcode == Instruction::ExtractValue) &&
527 "Expected extractelement or extractvalue instruction.");
528 if (Opcode == Instruction::ExtractElement) {
529 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
530 if (!CI)
531 return std::nullopt;
532 return CI->getZExtValue();
533 }
534 auto *EI = cast<ExtractValueInst>(E);
535 if (EI->getNumIndices() != 1)
536 return std::nullopt;
537 return *EI->idx_begin();
538}
539
540namespace {
541
542/// Main data required for vectorization of instructions.
543struct InstructionsState {
544 /// The very first instruction in the list with the main opcode.
545 Value *OpValue = nullptr;
546
547 /// The main/alternate instruction.
548 Instruction *MainOp = nullptr;
549 Instruction *AltOp = nullptr;
550
551 /// The main/alternate opcodes for the list of instructions.
552 unsigned getOpcode() const {
553 return MainOp ? MainOp->getOpcode() : 0;
554 }
555
556 unsigned getAltOpcode() const {
557 return AltOp ? AltOp->getOpcode() : 0;
558 }
559
560 /// Some of the instructions in the list have alternate opcodes.
561 bool isAltShuffle() const { return AltOp != MainOp; }
562
563 bool isOpcodeOrAlt(Instruction *I) const {
564 unsigned CheckedOpcode = I->getOpcode();
565 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
566 }
567
568 InstructionsState() = delete;
569 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
570 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
571};
572
573} // end anonymous namespace
574
575/// Chooses the correct key for scheduling data. If \p Op has the same (or
576/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
577/// OpValue.
578static Value *isOneOf(const InstructionsState &S, Value *Op) {
579 auto *I = dyn_cast<Instruction>(Op);
580 if (I && S.isOpcodeOrAlt(I))
581 return Op;
582 return S.OpValue;
583}
584
585/// \returns true if \p Opcode is allowed as part of the main/alternate
586/// instruction for SLP vectorization.
587///
588/// Example of unsupported opcode is SDIV that can potentially cause UB if the
589/// "shuffled out" lane would result in division by zero.
590static bool isValidForAlternation(unsigned Opcode) {
591 if (Instruction::isIntDivRem(Opcode))
592 return false;
593
594 return true;
595}
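/// Example (added for illustration): alternating {add, sub} or {add, mul} is
/// allowed by this predicate, but {add, sdiv} is rejected: the sdiv lanes that
/// end up "shuffled out" of the blended result would still execute in the
/// vectorized sdiv and could divide by zero, introducing UB the scalar code
/// never had.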
596
597static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
598 const TargetLibraryInfo &TLI,
599 unsigned BaseIndex = 0);
600
601/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
602/// compatible instructions or constants, or just some other regular values.
603static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
604 Value *Op1, const TargetLibraryInfo &TLI) {
605 return (isConstant(BaseOp0) && isConstant(Op0)) ||
606 (isConstant(BaseOp1) && isConstant(Op1)) ||
607 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
608 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
609 BaseOp0 == Op0 || BaseOp1 == Op1 ||
610 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
611 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
612}
613
614/// \returns true if a compare instruction \p CI has similar "look" and
615/// same predicate as \p BaseCI, "as is" or with its operands and predicate
616/// swapped, false otherwise.
617static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
618 const TargetLibraryInfo &TLI) {
619 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
620 "Assessing comparisons of different types?");
621 CmpInst::Predicate BasePred = BaseCI->getPredicate();
622 CmpInst::Predicate Pred = CI->getPredicate();
623 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
624
625 Value *BaseOp0 = BaseCI->getOperand(0);
626 Value *BaseOp1 = BaseCI->getOperand(1);
627 Value *Op0 = CI->getOperand(0);
628 Value *Op1 = CI->getOperand(1);
629
630 return (BasePred == Pred &&
631 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
632 (BasePred == SwappedPred &&
633 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
634}
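/// Example (added for illustration): "icmp sgt i32 %a, %b" is matched by
/// "icmp sgt i32 %c, %d" (same predicate) and by "icmp slt i32 %b, %a"
/// (swapped predicate with swapped operands), provided the corresponding
/// operands satisfy areCompatibleCmpOps.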
635
636/// \returns analysis of the Instructions in \p VL described in
637/// InstructionsState, the Opcode that we suppose the whole list
638/// could be vectorized even if its structure is diverse.
639static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
640 const TargetLibraryInfo &TLI,
641 unsigned BaseIndex) {
642 // Make sure these are all Instructions.
643 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
644 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
645
646 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
647 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
648 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
649 CmpInst::Predicate BasePred =
650 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
651 : CmpInst::BAD_ICMP_PREDICATE;
652 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
653 unsigned AltOpcode = Opcode;
654 unsigned AltIndex = BaseIndex;
655
656 // Check for one alternate opcode from another BinaryOperator.
657 // TODO - generalize to support all operators (types, calls etc.).
658 auto *IBase = cast<Instruction>(VL[BaseIndex]);
659 Intrinsic::ID BaseID = 0;
660 SmallVector<VFInfo> BaseMappings;
661 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
662 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
663 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
664 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
665 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
666 }
667 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
668 auto *I = cast<Instruction>(VL[Cnt]);
669 unsigned InstOpcode = I->getOpcode();
670 if (IsBinOp && isa<BinaryOperator>(I)) {
671 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
672 continue;
673 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
674 isValidForAlternation(Opcode)) {
675 AltOpcode = InstOpcode;
676 AltIndex = Cnt;
677 continue;
678 }
679 } else if (IsCastOp && isa<CastInst>(I)) {
680 Value *Op0 = IBase->getOperand(0);
681 Type *Ty0 = Op0->getType();
682 Value *Op1 = I->getOperand(0);
683 Type *Ty1 = Op1->getType();
684 if (Ty0 == Ty1) {
685 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
686 continue;
687 if (Opcode == AltOpcode) {
688 assert(isValidForAlternation(Opcode) &&
689 isValidForAlternation(InstOpcode) &&
690 "Cast isn't safe for alternation, logic needs to be updated!");
691 AltOpcode = InstOpcode;
692 AltIndex = Cnt;
693 continue;
694 }
695 }
696 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
697 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
698 Type *Ty0 = BaseInst->getOperand(0)->getType();
699 Type *Ty1 = Inst->getOperand(0)->getType();
700 if (Ty0 == Ty1) {
701 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
702 // Check for compatible operands. If the corresponding operands are not
703 // compatible - need to perform alternate vectorization.
704 CmpInst::Predicate CurrentPred = Inst->getPredicate();
705 CmpInst::Predicate SwappedCurrentPred =
706 CmpInst::getSwappedPredicate(CurrentPred);
707
708 if (E == 2 &&
709 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
710 continue;
711
712 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
713 continue;
714 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
715 if (AltIndex != BaseIndex) {
716 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
717 continue;
718 } else if (BasePred != CurrentPred) {
719 assert(
720 isValidForAlternation(InstOpcode) &&
721 "CmpInst isn't safe for alternation, logic needs to be updated!");
722 AltIndex = Cnt;
723 continue;
724 }
725 CmpInst::Predicate AltPred = AltInst->getPredicate();
726 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
727 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
728 continue;
729 }
730 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
731 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
732 if (Gep->getNumOperands() != 2 ||
733 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
734 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
735 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
736 if (!isVectorLikeInstWithConstOps(EI))
737 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
738 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
739 auto *BaseLI = cast<LoadInst>(IBase);
740 if (!LI->isSimple() || !BaseLI->isSimple())
741 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
742 } else if (auto *Call = dyn_cast<CallInst>(I)) {
743 auto *CallBase = cast<CallInst>(IBase);
744 if (Call->getCalledFunction() != CallBase->getCalledFunction())
745 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
746 if (Call->hasOperandBundles() &&
747 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
748 Call->op_begin() + Call->getBundleOperandsEndIndex(),
749 CallBase->op_begin() +
750 CallBase->getBundleOperandsStartIndex()))
751 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
752 Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
753 if (ID != BaseID)
754 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
755 if (!ID) {
756 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
757 if (Mappings.size() != BaseMappings.size() ||
758 Mappings.front().ISA != BaseMappings.front().ISA ||
759 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
760 Mappings.front().VectorName != BaseMappings.front().VectorName ||
761 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
762 Mappings.front().Shape.Parameters !=
763 BaseMappings.front().Shape.Parameters)
764 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
765 }
766 }
767 continue;
768 }
769 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
770 }
771
772 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
773 cast<Instruction>(VL[AltIndex]));
774}
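/// Example (added for illustration): for VL = {add %a0, %b0; sub %a1, %b1;
/// add %a2, %b2; sub %a3, %b3} this returns MainOp = the first add and
/// AltOp = the first sub (an "alternate shuffle" node that later becomes a
/// vector add, a vector sub and a blending shufflevector). A list that would
/// need a third opcode, e.g. {add, sub, mul}, yields a state with a null
/// MainOp, i.e. getOpcode() == 0.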
775
776/// \returns true if all of the values in \p VL have the same type or false
777/// otherwise.
778static bool allSameType(ArrayRef<Value *> VL) {
779 Type *Ty = VL.front()->getType();
780 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
781}
782
783/// \returns True if in-tree use also needs extract. This refers to
784/// possible scalar operand in vectorized instruction.
785static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
786 TargetLibraryInfo *TLI) {
787 unsigned Opcode = UserInst->getOpcode();
788 switch (Opcode) {
789 case Instruction::Load: {
790 LoadInst *LI = cast<LoadInst>(UserInst);
791 return (LI->getPointerOperand() == Scalar);
792 }
793 case Instruction::Store: {
794 StoreInst *SI = cast<StoreInst>(UserInst);
795 return (SI->getPointerOperand() == Scalar);
796 }
797 case Instruction::Call: {
798 CallInst *CI = cast<CallInst>(UserInst);
799 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
800 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
801 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
802 Arg.value().get() == Scalar;
803 });
804 }
805 default:
806 return false;
807 }
808}
809
810/// \returns the AA location that is being accessed by the instruction.
811static MemoryLocation getLocation(Instruction *I) {
812 if (StoreInst *SI = dyn_cast<StoreInst>(I))
813 return MemoryLocation::get(SI);
814 if (LoadInst *LI = dyn_cast<LoadInst>(I))
815 return MemoryLocation::get(LI);
816 return MemoryLocation();
817}
818
819/// \returns True if the instruction is not a volatile or atomic load/store.
820static bool isSimple(Instruction *I) {
821 if (LoadInst *LI = dyn_cast<LoadInst>(I))
822 return LI->isSimple();
823 if (StoreInst *SI = dyn_cast<StoreInst>(I))
824 return SI->isSimple();
825 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
826 return !MI->isVolatile();
827 return true;
828}
829
830/// Shuffles \p Mask in accordance with the given \p SubMask.
831/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
832/// one but two input vectors.
833static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
834 bool ExtendingManyInputs = false) {
835 if (SubMask.empty())
836 return;
837 assert(
838 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
839 // Check if input scalars were extended to match the size of other node.
840 (SubMask.size() == Mask.size() &&
841 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
842 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
843 "SubMask with many inputs support must be larger than the mask.");
844 if (Mask.empty()) {
845 Mask.append(SubMask.begin(), SubMask.end());
846 return;
847 }
848 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
849 int TermValue = std::min(Mask.size(), SubMask.size());
850 for (int I = 0, E = SubMask.size(); I < E; ++I) {
851 if (SubMask[I] == PoisonMaskElem ||
852 (!ExtendingManyInputs &&
853 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
854 continue;
855 NewMask[I] = Mask[SubMask[I]];
856 }
857 Mask.swap(NewMask);
858}
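/// Worked example (added for illustration): with an existing Mask = {1, 0, 3, 2}
/// and SubMask = {2, 3, PoisonMaskElem, 0}, the composed mask becomes
/// {Mask[2], Mask[3], PoisonMaskElem, Mask[0]} = {3, 2, PoisonMaskElem, 1};
/// i.e. SubMask selects positions of the existing Mask.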
859
860/// Order may have elements assigned the special value (size), which is out of
861/// bounds. Such indices only appear in places which correspond to undef values
862/// (see canReuseExtract for details) and are used to prevent undef values from
863/// affecting the operand ordering.
864/// The first loop below simply finds all unused indices and then the next loop
865/// nest assigns these indices to the positions of the undef values.
866/// As an example below Order has two undef positions and they have assigned
867/// values 3 and 7 respectively:
868/// before: 6 9 5 4 9 2 1 0
869/// after: 6 3 5 4 7 2 1 0
870static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
871 const unsigned Sz = Order.size();
872 SmallBitVector UnusedIndices(Sz, /*t=*/true);
873 SmallBitVector MaskedIndices(Sz);
874 for (unsigned I = 0; I < Sz; ++I) {
875 if (Order[I] < Sz)
876 UnusedIndices.reset(Order[I]);
877 else
878 MaskedIndices.set(I);
879 }
880 if (MaskedIndices.none())
881 return;
882 assert(UnusedIndices.count() == MaskedIndices.count() &&
883 "Non-synced masked/available indices.");
884 int Idx = UnusedIndices.find_first();
885 int MIdx = MaskedIndices.find_first();
886 while (MIdx >= 0) {
887 assert(Idx >= 0 && "Indices must be synced.");
888 Order[MIdx] = Idx;
889 Idx = UnusedIndices.find_next(Idx);
890 MIdx = MaskedIndices.find_next(MIdx);
891 }
892}
893
894namespace llvm {
895
896static void inversePermutation(ArrayRef<unsigned> Indices,
897 SmallVectorImpl<int> &Mask) {
898 Mask.clear();
899 const unsigned E = Indices.size();
900 Mask.resize(E, PoisonMaskElem);
901 for (unsigned I = 0; I < E; ++I)
902 Mask[Indices[I]] = I;
903}
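/// Worked example (added for illustration): Indices = {2, 0, 1} produces
/// Mask = {1, 2, 0}, since Mask[Indices[I]] = I for every position I.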
904
905/// Reorders the list of scalars in accordance with the given \p Mask.
906static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
907 ArrayRef<int> Mask) {
908 assert(!Mask.empty() && "Expected non-empty mask.");
909 SmallVector<Value *> Prev(Scalars.size(),
910 UndefValue::get(Scalars.front()->getType()));
911 Prev.swap(Scalars);
912 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
913 if (Mask[I] != PoisonMaskElem)
914 Scalars[Mask[I]] = Prev[I];
915}
916
917/// Checks if the provided value does not require scheduling. It does not
918/// require scheduling if this is not an instruction or it is an instruction
919/// that does not read/write memory and all operands are either not instructions
920/// or phi nodes or instructions from different blocks.
921static bool areAllOperandsNonInsts(Value *V) {
922 auto *I = dyn_cast<Instruction>(V);
923 if (!I)
924 return true;
925 return !mayHaveNonDefUseDependency(*I) &&
926 all_of(I->operands(), [I](Value *V) {
927 auto *IO = dyn_cast<Instruction>(V);
928 if (!IO)
929 return true;
930 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
931 });
932}
933
934/// Checks if the provided value does not require scheduling. It does not
935/// require scheduling if this is not an instruction or it is an instruction
936/// that does not read/write memory and all users are phi nodes or instructions
937/// from the different blocks.
938static bool isUsedOutsideBlock(Value *V) {
939 auto *I = dyn_cast<Instruction>(V);
940 if (!I)
941 return true;
942 // Limits the number of uses to save compile time.
943 constexpr int UsesLimit = 8;
944 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
945 all_of(I->users(), [I](User *U) {
946 auto *IU = dyn_cast<Instruction>(U);
947 if (!IU)
948 return true;
949 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
950 });
951}
952
953/// Checks if the specified value does not require scheduling. It does not
954/// require scheduling if all operands and all users do not need to be scheduled
955/// in the current basic block.
956static bool doesNotNeedToBeScheduled(Value *V) {
957 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
958}
959
960/// Checks if the specified array of instructions does not require scheduling.
961/// This is the case if every instruction either has operands that do not require
962/// scheduling or has users that do not require scheduling, since they are phis or
963/// in other basic blocks.
964static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
965 return !VL.empty() &&
966 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
967}
968
969namespace slpvectorizer {
970
971/// Bottom Up SLP Vectorizer.
972class BoUpSLP {
973 struct TreeEntry;
974 struct ScheduleData;
977
978public:
986
987 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
988 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
989 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
990 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
991 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
992 DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
993 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
994 // Use the vector register size specified by the target unless overridden
995 // by a command-line option.
996 // TODO: It would be better to limit the vectorization factor based on
997 // data type rather than just register size. For example, x86 AVX has
998 // 256-bit registers, but it does not support integer operations
999 // at that width (that requires AVX2).
1000 if (MaxVectorRegSizeOption.getNumOccurrences())
1001 MaxVecRegSize = MaxVectorRegSizeOption;
1002 else
1003 MaxVecRegSize =
1004 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1005 .getFixedValue();
1006
1007 if (MinVectorRegSizeOption.getNumOccurrences())
1008 MinVecRegSize = MinVectorRegSizeOption;
1009 else
1010 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1011 }
1012
1013 /// Vectorize the tree that starts with the elements in \p VL.
1014 /// Returns the vectorized root.
1015 Value *vectorizeTree();
1016
1017 /// Vectorize the tree but with the list of externally used values \p
1018 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1019 /// generated extractelement instructions.
1020 /// \param ReplacedExternals contains the list of replaced external values
1021 /// {scalar, replace} after emitting extractelement for external uses.
1022 Value *
1023 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1024 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1025 Instruction *ReductionRoot = nullptr);
1026
1027 /// \returns the cost incurred by unwanted spills and fills, caused by
1028 /// holding live values over call sites.
1029 InstructionCost getSpillCost() const;
1030
1031 /// \returns the vectorization cost of the subtree that starts at \p VL.
1032 /// A negative number means that this is profitable.
1033 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1034
1035 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1036 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1037 void buildTree(ArrayRef<Value *> Roots,
1038 const SmallDenseSet<Value *> &UserIgnoreLst);
1039
1040 /// Construct a vectorizable tree that starts at \p Roots.
1041 void buildTree(ArrayRef<Value *> Roots);
1042
1043 /// Returns whether the root node has in-tree uses.
1045 return !VectorizableTree.empty() &&
1046 !VectorizableTree.front()->UserTreeIndices.empty();
1047 }
1048
1049 /// Return the scalars of the root node.
1051 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1052 return VectorizableTree.front()->Scalars;
1053 }
1054
1055 /// Builds external uses of the vectorized scalars, i.e. the list of
1056 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1057 /// ExternallyUsedValues contains additional list of external uses to handle
1058 /// vectorization of reductions.
1059 void
1060 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1061
1062 /// Clear the internal data structures that are created by 'buildTree'.
1063 void deleteTree() {
1064 VectorizableTree.clear();
1065 ScalarToTreeEntry.clear();
1066 MultiNodeScalars.clear();
1067 MustGather.clear();
1068 EntryToLastInstruction.clear();
1069 ExternalUses.clear();
1070 for (auto &Iter : BlocksSchedules) {
1071 BlockScheduling *BS = Iter.second.get();
1072 BS->clear();
1073 }
1074 MinBWs.clear();
1075 InstrElementSize.clear();
1076 UserIgnoreList = nullptr;
1077 PostponedGathers.clear();
1078 ValueToGatherNodes.clear();
1079 }
1080
1081 unsigned getTreeSize() const { return VectorizableTree.size(); }
1082
1083 /// Perform LICM and CSE on the newly generated gather sequences.
1084 void optimizeGatherSequence();
1085
1086 /// Checks if the specified gather tree entry \p TE can be represented as a
1087 /// shuffled vector entry + (possibly) permutation with other gathers. It
1088 /// implements the checks only for possibly ordered scalars (Loads,
1089 /// ExtractElement, ExtractValue), which can be part of the graph.
1090 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1091
1092 /// Sort loads into increasing pointer offsets to allow greater clustering.
1093 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1094
1095 /// Gets reordering data for the given tree entry. If the entry is vectorized
1096 /// - just return ReorderIndices, otherwise check if the scalars can be
1097 /// reordered and return the most optimal order.
1098 /// \return std::nullopt if ordering is not important, empty order, if
1099 /// identity order is important, or the actual order.
1100 /// \param TopToBottom If true, include the order of vectorized stores and
1101 /// insertelement nodes, otherwise skip them.
1102 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1103 bool TopToBottom);
1104
1105 /// Reorders the current graph to the most profitable order starting from the
1106 /// root node to the leaf nodes. The best order is chosen only from the nodes
1107 /// of the same size (vectorization factor). Smaller nodes are considered
1108 /// parts of subgraph with smaller VF and they are reordered independently. We
1109 /// can make it because we still need to extend smaller nodes to the wider VF
1110 /// and we can merge reordering shuffles with the widening shuffles.
1111 void reorderTopToBottom();
1112
1113 /// Reorders the current graph to the most profitable order starting from
1114 /// leaves to the root. It allows rotating small subgraphs and reducing the
1115 /// number of reshuffles if the leaf nodes use the same order. In this case we
1116 /// can merge the orders and just shuffle the user node instead of shuffling its
1117 /// operands. Moreover, even if the leaf nodes have different orders, it allows
1118 /// sinking reordering in the graph closer to the root node and merging it later
1119 /// during analysis.
1120 void reorderBottomToTop(bool IgnoreReorder = false);
1121
1122 /// \return The vector element size in bits to use when vectorizing the
1123 /// expression tree ending at \p V. If V is a store, the size is the width of
1124 /// the stored value. Otherwise, the size is the width of the largest loaded
1125 /// value reaching V. This method is used by the vectorizer to calculate
1126 /// vectorization factors.
1127 unsigned getVectorElementSize(Value *V);
1128
1129 /// Compute the minimum type sizes required to represent the entries in a
1130 /// vectorizable tree.
1131 void computeMinimumValueSizes();
1132
1133 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1134 unsigned getMaxVecRegSize() const {
1135 return MaxVecRegSize;
1136 }
1137
1138 // \returns minimum vector register size as set by cl::opt.
1139 unsigned getMinVecRegSize() const {
1140 return MinVecRegSize;
1141 }
1142
1143 unsigned getMinVF(unsigned Sz) const {
1144 return std::max(2U, getMinVecRegSize() / Sz);
1145 }
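 /// Example (added for illustration): with a minimum vector register size of
 /// 128 bits, getMinVF(32) returns 128 / 32 = 4 lanes, while getMinVF(128)
 /// is clamped to the lower bound of 2 lanes.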
1146
1147 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1148 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1149 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1150 return MaxVF ? MaxVF : UINT_MAX;
1151 }
1152
1153 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1154 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1155 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1156 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1157 ///
1158 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1159 unsigned canMapToVector(Type *T) const;
1160
1161 /// \returns True if the VectorizableTree is both tiny and not fully
1162 /// vectorizable. We do not vectorize such trees.
1163 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1164
1165 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1166 /// can be load combined in the backend. Load combining may not be allowed in
1167 /// the IR optimizer, so we do not want to alter the pattern. For example,
1168 /// partially transforming a scalar bswap() pattern into vector code is
1169 /// effectively impossible for the backend to undo.
1170 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1171 /// may not be necessary.
1172 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1173
1174 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1175 /// can be load combined in the backend. Load combining may not be allowed in
1176 /// the IR optimizer, so we do not want to alter the pattern. For example,
1177 /// partially transforming a scalar bswap() pattern into vector code is
1178 /// effectively impossible for the backend to undo.
1179 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1180 /// may not be necessary.
1181 bool isLoadCombineCandidate() const;
1182
1184
1185 /// This structure holds any data we need about the edges being traversed
1186 /// during buildTree_rec(). We keep track of:
1187 /// (i) the user TreeEntry index, and
1188 /// (ii) the index of the edge.
1189 struct EdgeInfo {
1190 EdgeInfo() = default;
1191 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1192 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1193 /// The user TreeEntry.
1194 TreeEntry *UserTE = nullptr;
1195 /// The operand index of the use.
1196 unsigned EdgeIdx = UINT_MAX;
1197#ifndef NDEBUG
1198 friend inline raw_ostream &operator<<(raw_ostream &OS,
1199 const BoUpSLP::EdgeInfo &EI) {
1200 EI.dump(OS);
1201 return OS;
1202 }
1203 /// Debug print.
1204 void dump(raw_ostream &OS) const {
1205 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1206 << " EdgeIdx:" << EdgeIdx << "}";
1207 }
1208 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1209#endif
1210 bool operator == (const EdgeInfo &Other) const {
1211 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1212 }
1213 };
1214
1215 /// A helper class used for scoring candidates for two consecutive lanes.
1216 class LookAheadHeuristics {
1217 const TargetLibraryInfo &TLI;
1218 const DataLayout &DL;
1219 ScalarEvolution &SE;
1220 const BoUpSLP &R;
1221 int NumLanes; // Total number of lanes (aka vectorization factor).
1222 int MaxLevel; // The maximum recursion depth for accumulating score.
1223
1224 public:
1225 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1226 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1227 int MaxLevel)
1228 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1229 MaxLevel(MaxLevel) {}
1230
1231 // The hard-coded scores listed here are not very important, though it shall
1232 // be higher for better matches to improve the resulting cost. When
1233 // computing the scores of matching one sub-tree with another, we are
1234 // basically counting the number of values that are matching. So even if all
1235 // scores are set to 1, we would still get a decent matching result.
1236 // However, sometimes we have to break ties. For example we may have to
1237 // choose between matching loads vs matching opcodes. This is what these
1238 // scores are helping us with: they provide the order of preference. Also,
1239 // this is important if the scalar is externally used or used in another
1240 // tree entry node in the different lane.
1241
1242 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1243 static const int ScoreConsecutiveLoads = 4;
1244 /// The same load multiple times. This should have a better score than
1245 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1246 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
1247 /// for a vector load plus 1.0 for a broadcast.
1248 static const int ScoreSplatLoads = 3;
1249 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1250 static const int ScoreReversedLoads = 3;
1251 /// A load candidate for masked gather.
1252 static const int ScoreMaskedGatherCandidate = 1;
1253 /// ExtractElementInst from same vector and consecutive indexes.
1254 static const int ScoreConsecutiveExtracts = 4;
1255 /// ExtractElementInst from same vector and reversed indices.
1256 static const int ScoreReversedExtracts = 3;
1257 /// Constants.
1258 static const int ScoreConstants = 2;
1259 /// Instructions with the same opcode.
1260 static const int ScoreSameOpcode = 2;
1261 /// Instructions with alt opcodes (e.g, add + sub).
1262 static const int ScoreAltOpcodes = 1;
1263 /// Identical instructions (a.k.a. splat or broadcast).
1264 static const int ScoreSplat = 1;
1265 /// Matching with an undef is preferable to failing.
1266 static const int ScoreUndef = 1;
1267 /// Score for failing to find a decent match.
1268 static const int ScoreFail = 0;
1269 /// Score if all users are vectorized.
1270 static const int ScoreAllUserVectorized = 1;
1271
1272 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1273 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1274 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1275 /// MainAltOps.
1276 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1277 ArrayRef<Value *> MainAltOps) const {
1278 if (!isValidElementType(V1->getType()) ||
1279 !isValidElementType(V2->getType()))
1280 return LookAheadHeuristics::ScoreFail;
1281
1282 if (V1 == V2) {
1283 if (isa<LoadInst>(V1)) {
1284 // Returns true if the users of V1 and V2 won't need to be extracted.
1285 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1286 // Bail out if we have too many uses to save compilation time.
1287 static constexpr unsigned Limit = 8;
1288 if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
1289 return false;
1290
1291 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1292 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1293 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1294 });
1295 };
1296 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1297 };
1298 // A broadcast of a load can be cheaper on some targets.
1299 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1300 ElementCount::getFixed(NumLanes)) &&
1301 ((int)V1->getNumUses() == NumLanes ||
1302 AllUsersAreInternal(V1, V2)))
1303 return LookAheadHeuristics::ScoreSplatLoads;
1304 }
1305 return LookAheadHeuristics::ScoreSplat;
1306 }
1307
1308 auto *LI1 = dyn_cast<LoadInst>(V1);
1309 auto *LI2 = dyn_cast<LoadInst>(V2);
1310 if (LI1 && LI2) {
1311 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1312 !LI2->isSimple())
1313 return LookAheadHeuristics::ScoreFail;
1314
1315 std::optional<int> Dist = getPointersDiff(
1316 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1317 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1318 if (!Dist || *Dist == 0) {
1319 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1320 getUnderlyingObject(LI2->getPointerOperand()) &&
1321 R.TTI->isLegalMaskedGather(
1322 FixedVectorType::get(LI1->getType(), NumLanes),
1323 LI1->getAlign()))
1324 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1325 return LookAheadHeuristics::ScoreFail;
1326 }
1327 // The distance is too large - still may be profitable to use masked
1328 // loads/gathers.
1329 if (std::abs(*Dist) > NumLanes / 2)
1330 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1331 // This still will detect consecutive loads, but we might have "holes"
1332 // in some cases. It is ok for non-power-2 vectorization and may produce
1333 // better results. It should not affect current vectorization.
1334 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1335 : LookAheadHeuristics::ScoreReversedLoads;
1336 }
1337
1338 auto *C1 = dyn_cast<Constant>(V1);
1339 auto *C2 = dyn_cast<Constant>(V2);
1340 if (C1 && C2)
1341 return LookAheadHeuristics::ScoreConstants;
1342
1343 // Extracts from consecutive indexes of the same vector better score as
1344 // the extracts could be optimized away.
1345 Value *EV1;
1346 ConstantInt *Ex1Idx;
1347 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1348 // Undefs are always profitable for extractelements.
1349 // Compiler can easily combine poison and extractelement <non-poison> or
1350 // undef and extractelement <poison>. But combining undef +
1351 // extractelement <non-poison-but-may-produce-poison> requires some
1352 // extra operations.
1353 if (isa<UndefValue>(V2))
1354 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1357 Value *EV2 = nullptr;
1358 ConstantInt *Ex2Idx = nullptr;
1359 if (match(V2,
1361 m_Undef())))) {
1362 // Undefs are always profitable for extractelements.
1363 if (!Ex2Idx)
1365 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1367 if (EV2 == EV1) {
1368 int Idx1 = Ex1Idx->getZExtValue();
1369 int Idx2 = Ex2Idx->getZExtValue();
1370 int Dist = Idx2 - Idx1;
1371 // The distance is too large - still may be profitable to use
1372 // shuffles.
1373 if (std::abs(Dist) == 0)
1375 if (std::abs(Dist) > NumLanes / 2)
1379 }
1381 }
1383 }
1384
1385 auto *I1 = dyn_cast<Instruction>(V1);
1386 auto *I2 = dyn_cast<Instruction>(V2);
1387 if (I1 && I2) {
1388 if (I1->getParent() != I2->getParent())
1390 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1391 Ops.push_back(I1);
1392 Ops.push_back(I2);
1393 InstructionsState S = getSameOpcode(Ops, TLI);
1394 // Note: Only consider instructions with <= 2 operands to avoid
1395 // complexity explosion.
1396 if (S.getOpcode() &&
1397 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1398 !S.isAltShuffle()) &&
1399 all_of(Ops, [&S](Value *V) {
1400 return cast<Instruction>(V)->getNumOperands() ==
1401 S.MainOp->getNumOperands();
1402 }))
1403 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1404 : LookAheadHeuristics::ScoreSameOpcode;
1405 }
1406
1407 if (isa<UndefValue>(V2))
1408 return LookAheadHeuristics::ScoreUndef;
1409
1410 return LookAheadHeuristics::ScoreFail;
1411 }
1412
1413 /// Go through the operands of \p LHS and \p RHS recursively until
1414 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1415 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1416 /// of \p U1 and \p U2), except at the beginning of the recursion where
1417 /// these are set to nullptr.
1418 ///
1419 /// For example:
1420 /// \verbatim
1421 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1422 /// \ / \ / \ / \ /
1423 /// + + + +
1424 /// G1 G2 G3 G4
1425 /// \endverbatim
1426 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1427 /// each level recursively, accumulating the score. It starts from matching
1428 /// the additions at level 0, then moves on to the loads (level 1). The
1429 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1430 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1431 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1432 /// Please note that the order of the operands does not matter, as we
1433 /// evaluate the score of all profitable combinations of operands. In
1434 /// other words the score of G1 and G4 is the same as G1 and G2. This
1435 /// heuristic is based on ideas described in:
1436 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1437 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1438 /// Luís F. W. Góes
1439 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1440 Instruction *U2, int CurrLevel,
1441 ArrayRef<Value *> MainAltOps) const {
1442
1443 // Get the shallow score of V1 and V2.
1444 int ShallowScoreAtThisLevel =
1445 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1446
1447 // If reached MaxLevel,
1448 // or if V1 and V2 are not instructions,
1449 // or if they are SPLAT,
1450 // or if they are not consecutive,
1451 // or if profitable to vectorize loads or extractelements, early return
1452 // the current cost.
1453 auto *I1 = dyn_cast<Instruction>(LHS);
1454 auto *I2 = dyn_cast<Instruction>(RHS);
1455 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1456 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1457 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1458 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1459 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1460 ShallowScoreAtThisLevel))
1461 return ShallowScoreAtThisLevel;
1462 assert(I1 && I2 && "Should have early exited.");
1463
1464 // Contains the I2 operand indexes that got matched with I1 operands.
1465 SmallSet<unsigned, 4> Op2Used;
1466
1467 // Recursion towards the operands of I1 and I2. We are trying all possible
1468 // operand pairs, and keeping track of the best score.
1469 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1470 OpIdx1 != NumOperands1; ++OpIdx1) {
1471 // Try to pair op1I with the best operand of I2.
1472 int MaxTmpScore = 0;
1473 unsigned MaxOpIdx2 = 0;
1474 bool FoundBest = false;
1475 // If I2 is commutative try all combinations.
1476 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1477 unsigned ToIdx = isCommutative(I2)
1478 ? I2->getNumOperands()
1479 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1480 assert(FromIdx <= ToIdx && "Bad index");
1481 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1482 // Skip operands already paired with OpIdx1.
1483 if (Op2Used.count(OpIdx2))
1484 continue;
1485 // Recursively calculate the cost at each level
1486 int TmpScore =
1487 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1488 I1, I2, CurrLevel + 1, std::nullopt);
1489 // Look for the best score.
1490 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1491 TmpScore > MaxTmpScore) {
1492 MaxTmpScore = TmpScore;
1493 MaxOpIdx2 = OpIdx2;
1494 FoundBest = true;
1495 }
1496 }
1497 if (FoundBest) {
1498 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1499 Op2Used.insert(MaxOpIdx2);
1500 ShallowScoreAtThisLevel += MaxTmpScore;
1501 }
1502 }
1503 return ShallowScoreAtThisLevel;
1504 }
1505 };
1506 /// A helper data structure to hold the operands of a vector of instructions.
1507 /// This supports a fixed vector length for all operand vectors.
1508 class VLOperands {
1509 /// For each operand we need (i) the value, and (ii) the opcode that it
1510 /// would be attached to if the expression was in a left-linearized form.
1511 /// This is required to avoid illegal operand reordering.
1512 /// For example:
1513 /// \verbatim
1514 /// 0 Op1
1515 /// |/
1516 /// Op1 Op2 Linearized + Op2
1517 /// \ / ----------> |/
1518 /// - -
1519 ///
1520 /// Op1 - Op2 (0 + Op1) - Op2
1521 /// \endverbatim
1522 ///
1523 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1524 ///
1525 /// Another way to think of this is to track all the operations across the
1526 /// path from the operand all the way to the root of the tree and to
1527 /// calculate the operation that corresponds to this path. For example, the
1528 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1529 /// corresponding operation is a '-' (which matches the one in the
1530 /// linearized tree, as shown above).
1531 ///
1532 /// For lack of a better term, we refer to this operation as Accumulated
1533 /// Path Operation (APO).
1534 struct OperandData {
1535 OperandData() = default;
1536 OperandData(Value *V, bool APO, bool IsUsed)
1537 : V(V), APO(APO), IsUsed(IsUsed) {}
1538 /// The operand value.
1539 Value *V = nullptr;
1540 /// TreeEntries only allow a single opcode, or an alternate sequence of
1541 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
1542 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1543 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1544 /// (e.g., Add/Mul)
1545 bool APO = false;
1546 /// Helper data for the reordering function.
1547 bool IsUsed = false;
1548 };
1549
1550 /// During operand reordering, we are trying to select the operand at lane
1551 /// that matches best with the operand at the neighboring lane. Our
1552 /// selection is based on the type of value we are looking for. For example,
1553 /// if the neighboring lane has a load, we need to look for a load that is
1554 /// accessing a consecutive address. These strategies are summarized in the
1555 /// 'ReorderingMode' enumerator.
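/// For illustration (placeholder values): for the two-lane bundle
/// {A[0] = L0 + x, A[1] = L1 + y} where L0/L1 are consecutive loads, the
/// operand column {L0, L1} is matched in Load mode, while {x, y} is matched
/// in Opcode, Constant, or Splat mode, depending on what the first-lane
/// value is.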
1556 enum class ReorderingMode {
1557 Load, ///< Matching loads to consecutive memory addresses
1558 Opcode, ///< Matching instructions based on opcode (same or alternate)
1559 Constant, ///< Matching constants
1560 Splat, ///< Matching the same instruction multiple times (broadcast)
1561 Failed, ///< We failed to create a vectorizable group
1562 };
1563
1565 using OperandDataVec = SmallVector<OperandData, 2>;
1566 /// A vector of operand vectors.
1568 SmallVector<OperandDataVec, 4> OpsVec;
1569 const TargetLibraryInfo &TLI;
1570 const DataLayout &DL;
1571 ScalarEvolution &SE;
1572 const BoUpSLP &R;
1573
1574 /// \returns the operand data at \p OpIdx and \p Lane.
1575 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1576 return OpsVec[OpIdx][Lane];
1577 }
1578
1579 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1580 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1581 return OpsVec[OpIdx][Lane];
1582 }
1583
1584 /// Clears the used flag for all entries.
1585 void clearUsed() {
1586 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1587 OpIdx != NumOperands; ++OpIdx)
1588 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1589 ++Lane)
1590 OpsVec[OpIdx][Lane].IsUsed = false;
1591 }
1592
1593 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1594 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1595 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1596 }
1597
1598 /// \param Lane lane of the operands under analysis.
1599 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1600 /// the best candidate.
1601 /// \param Idx operand index of the current candidate value.
1602 /// \returns The additional score due to possible broadcasting of the
1603 /// elements in the lane. It is more profitable to have power-of-2 unique
1604 /// elements in the lane, since it will be vectorized with higher probability
1605 /// after removing duplicates. Currently the SLP vectorizer supports only
1606 /// vectorization of the power-of-2 number of unique scalars.
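/// For illustration (placeholder values): with 4 lanes, if the other lanes
/// of operand \p OpIdx hold {B, B, C} (2 unique values), keeping the current
/// value A would raise the unique count to 3 (power-of-2 padding waste 1),
/// while a candidate equal to B keeps it at 2 (waste 0), so the candidate is
/// rewarded with a score of (1 - 0) == 1.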
1607 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1608 Value *IdxLaneV = getData(Idx, Lane).V;
1609 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1610 return 0;
SmallPtrSet<Value *, 4> Uniques;
1612 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1613 if (Ln == Lane)
1614 continue;
1615 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1616 if (!isa<Instruction>(OpIdxLnV))
1617 return 0;
1618 Uniques.insert(OpIdxLnV);
1619 }
1620 int UniquesCount = Uniques.size();
1621 int UniquesCntWithIdxLaneV =
1622 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1623 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1624 int UniquesCntWithOpIdxLaneV =
1625 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1626 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1627 return 0;
1628 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1629 UniquesCntWithOpIdxLaneV) -
1630 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1631 }
1632
1633 /// \param Lane lane of the operands under analysis.
1634 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1635 /// the best candidate.
1636 /// \param Idx operand index of the current candidate value.
1637 /// \returns The additional score for the scalar which users are all
1638 /// vectorized.
1639 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1640 Value *IdxLaneV = getData(Idx, Lane).V;
1641 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1642 // Do not care about the number of uses for vector-like instructions
1643 // (extractelement/extractvalue with constant indices), since they are
1644 // extracts themselves and are already externally used. Vectorizing such
1645 // instructions does not add an extra extractelement instruction, it may
1646 // only remove one.
1647 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1648 isVectorLikeInstWithConstOps(OpIdxLaneV))
return LookAheadHeuristics::ScoreAllUserVectorized;
1650 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1651 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1652 return 0;
1653 return R.areAllUsersVectorized(IdxLaneI)
? LookAheadHeuristics::ScoreAllUserVectorized
1655 : 0;
1656 }
1657
1658 /// Score scaling factor for fully compatible instructions but with
1659 /// different number of external uses. Allows better selection of the
1660 /// instructions with less external uses.
1661 static const int ScoreScaleFactor = 10;
1662
1663 /// \Returns the look-ahead score, which tells us how much the sub-trees
1664 /// rooted at \p LHS and \p RHS match; the more they match, the higher the
1665 /// score. This helps break ties in an informed way when we cannot decide on
1666 /// the order of the operands by just considering the immediate
1667 /// predecessors.
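/// For illustration: if the recursive score is 4 and the splat score is 1,
/// the reported value is (4 + 1) * ScoreScaleFactor plus the external-use
/// bonus; if the splat score were -4 or lower, the result would instead be
/// clamped to 1 so that a splat-like sequence is not reported as a failure.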
1668 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1669 int Lane, unsigned OpIdx, unsigned Idx,
1670 bool &IsUsed) {
1671 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
LookAheadMaxDepth);
1673 // Keep track of the instruction stack as we recurse into the operands
1674 // during the look-ahead score exploration.
1675 int Score =
1676 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1677 /*CurrLevel=*/1, MainAltOps);
1678 if (Score) {
1679 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1680 if (Score <= -SplatScore) {
1681 // Set the minimum score for splat-like sequence to avoid setting
1682 // failed state.
1683 Score = 1;
1684 } else {
1685 Score += SplatScore;
1686 // Scale score to see the difference between different operands
1687 // and similar operands but all vectorized/not all vectorized
1688 // uses. It does not affect the actual selection of the best
1689 // compatible operand in general, it just allows selecting the
1690 // operand with all vectorized uses.
1691 Score *= ScoreScaleFactor;
1692 Score += getExternalUseScore(Lane, OpIdx, Idx);
1693 IsUsed = true;
1694 }
1695 }
1696 return Score;
1697 }
1698
1699 /// Best defined scores per lanes between the passes. Used to choose the
1700 /// best operand (with the highest score) between the passes.
1701 /// The key - {Operand Index, Lane}.
1702 /// The value - the best score between the passes for the lane and the
1703 /// operand.
1705 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8> BestScoresPerLanes;
1706
1707 // Search all operands in Ops[*][Lane] for the one that matches best
1708 // Ops[OpIdx][LastLane] and return its operand index.
1709 // If no good match can be found, return std::nullopt.
1710 std::optional<unsigned>
1711 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1712 ArrayRef<ReorderingMode> ReorderingModes,
1713 ArrayRef<Value *> MainAltOps) {
1714 unsigned NumOperands = getNumOperands();
1715
1716 // The operand of the previous lane at OpIdx.
1717 Value *OpLastLane = getData(OpIdx, LastLane).V;
1718
1719 // Our strategy mode for OpIdx.
1720 ReorderingMode RMode = ReorderingModes[OpIdx];
1721 if (RMode == ReorderingMode::Failed)
1722 return std::nullopt;
1723
1724 // The linearized opcode of the operand at OpIdx, Lane.
1725 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1726
1727 // The best operand index and its score.
1728 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1729 // are using the score to differentiate between the two.
1730 struct BestOpData {
1731 std::optional<unsigned> Idx;
1732 unsigned Score = 0;
1733 } BestOp;
1734 BestOp.Score =
1735 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1736 .first->second;
1737
1738 // Track if the operand must be marked as used. If the operand is set to
1739 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
1740 // we may want to re-estimate the operands again on the following iterations.
1741 bool IsUsed =
1742 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1743 // Iterate through all unused operands and look for the best.
1744 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1745 // Get the operand at Idx and Lane.
1746 OperandData &OpData = getData(Idx, Lane);
1747 Value *Op = OpData.V;
1748 bool OpAPO = OpData.APO;
1749
1750 // Skip already selected operands.
1751 if (OpData.IsUsed)
1752 continue;
1753
1754 // Skip if we are trying to move the operand to a position with a
1755 // different opcode in the linearized tree form. This would break the
1756 // semantics.
1757 if (OpAPO != OpIdxAPO)
1758 continue;
1759
1760 // Look for an operand that matches the current mode.
1761 switch (RMode) {
1762 case ReorderingMode::Load:
1763 case ReorderingMode::Constant:
1764 case ReorderingMode::Opcode: {
1765 bool LeftToRight = Lane > LastLane;
1766 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1767 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1768 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1769 OpIdx, Idx, IsUsed);
1770 if (Score > static_cast<int>(BestOp.Score)) {
1771 BestOp.Idx = Idx;
1772 BestOp.Score = Score;
1773 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1774 }
1775 break;
1776 }
1777 case ReorderingMode::Splat:
1778 if (Op == OpLastLane)
1779 BestOp.Idx = Idx;
1780 break;
1781 case ReorderingMode::Failed:
1782 llvm_unreachable("Not expected Failed reordering mode.");
1783 }
1784 }
1785
1786 if (BestOp.Idx) {
1787 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1788 return BestOp.Idx;
1789 }
1790 // If we could not find a good match return std::nullopt.
1791 return std::nullopt;
1792 }
1793
1794 /// Helper for reorderOperandVecs.
1795 /// \returns the lane that we should start reordering from. This is the one
1796 /// which has the least number of operands that can freely move about, or is
1797 /// the least profitable to reorder because it already has the most optimal set of operands.
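/// For illustration: in the add/sub example shown in reorder() below, the
/// subtraction lanes report fewer freely movable operands than the addition
/// lanes, so one of them wins the vote and reordering starts there.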
1798 unsigned getBestLaneToStartReordering() const {
1799 unsigned Min = UINT_MAX;
1800 unsigned SameOpNumber = 0;
1801 // std::pair<unsigned, unsigned> is used to implement a simple voting
1802 // algorithm and choose the lane with the least number of operands that
1803 // can freely move about, or that is least profitable to reorder because it
1804 // already has the most optimal set of operands. The first unsigned is a counter for
1805 // voting, the second unsigned is the counter of lanes with instructions
1806 // with same/alternate opcodes and same parent basic block.
SmallDenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
1808 // Try to be closer to the original results, if we have multiple lanes
1809 // with same cost. If 2 lanes have the same cost, use the one with the
1810 // lowest index.
1811 for (int I = getNumLanes(); I > 0; --I) {
1812 unsigned Lane = I - 1;
1813 OperandsOrderData NumFreeOpsHash =
1814 getMaxNumOperandsThatCanBeReordered(Lane);
1815 // Compare the number of operands that can move and choose the one with
1816 // the least number.
1817 if (NumFreeOpsHash.NumOfAPOs < Min) {
1818 Min = NumFreeOpsHash.NumOfAPOs;
1819 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1820 HashMap.clear();
1821 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1822 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1823 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1824 // Select the most optimal lane in terms of number of operands that
1825 // should be moved around.
1826 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1827 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1828 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1829 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1830 auto *It = HashMap.find(NumFreeOpsHash.Hash);
1831 if (It == HashMap.end())
1832 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1833 else
1834 ++It->second.first;
1835 }
1836 }
1837 // Select the lane with the minimum counter.
1838 unsigned BestLane = 0;
1839 unsigned CntMin = UINT_MAX;
1840 for (const auto &Data : reverse(HashMap)) {
1841 if (Data.second.first < CntMin) {
1842 CntMin = Data.second.first;
1843 BestLane = Data.second.second;
1844 }
1845 }
1846 return BestLane;
1847 }
1848
1849 /// Data structure that helps to reorder operands.
1850 struct OperandsOrderData {
1851 /// The best number of operands with the same APOs, which can be
1852 /// reordered.
1853 unsigned NumOfAPOs = UINT_MAX;
1854 /// Number of operands with the same/alternate instruction opcode and
1855 /// parent.
1856 unsigned NumOpsWithSameOpcodeParent = 0;
1857 /// Hash for the actual operands ordering.
1858 /// Used to count operands, actually their position id and opcode
1859 /// value. It is used in the voting mechanism to find the lane with the
1860 /// least number of operands that can freely move about, or that is least
1861 /// profitable to reorder because it already has the most optimal set of operands. Could be
1862 /// replaced with a SmallVector<unsigned>, but a hash code is faster
1863 /// and requires less memory.
1864 unsigned Hash = 0;
1865 };
1866 /// \returns the maximum number of operands that are allowed to be reordered
1867 /// for \p Lane and the number of compatible instructions (with the same
1868 /// parent/opcode). This is used as a heuristic for selecting the first lane
1869 /// to start operand reordering.
1870 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1871 unsigned CntTrue = 0;
1872 unsigned NumOperands = getNumOperands();
1873 // Operands with the same APO can be reordered. We therefore need to count
1874 // how many of them we have for each APO, like this: Cnt[APO] = x.
1875 // Since we only have two APOs, namely true and false, we can avoid using
1876 // a map. Instead we can simply count the number of operands that
1877 // correspond to one of them (in this case the 'true' APO), and calculate
1878 // the other by subtracting it from the total number of operands.
1879 // Operands with the same instruction opcode and parent are more
1880 // profitable since we don't need to move them in many cases, with a high
1881 // probability such lane already can be vectorized effectively.
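// For illustration: with two operands per lane, a '-' lane has APOs
// {false, true}, so CntTrue == 1 and NumOfAPOs == max(1, 2 - 1) == 1, while
// a '+' lane has APOs {false, false} and NumOfAPOs == 2; the '-' lane thus
// looks less freely reorderable and is preferred as the starting lane.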
1882 bool AllUndefs = true;
1883 unsigned NumOpsWithSameOpcodeParent = 0;
1884 Instruction *OpcodeI = nullptr;
1885 BasicBlock *Parent = nullptr;
1886 unsigned Hash = 0;
1887 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1888 const OperandData &OpData = getData(OpIdx, Lane);
1889 if (OpData.APO)
1890 ++CntTrue;
1891 // Use Boyer-Moore majority voting for finding the majority opcode and
1892 // the number of times it occurs.
1893 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1894 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
1895 I->getParent() != Parent) {
1896 if (NumOpsWithSameOpcodeParent == 0) {
1897 NumOpsWithSameOpcodeParent = 1;
1898 OpcodeI = I;
1899 Parent = I->getParent();
1900 } else {
1901 --NumOpsWithSameOpcodeParent;
1902 }
1903 } else {
1904 ++NumOpsWithSameOpcodeParent;
1905 }
1906 }
1907 Hash = hash_combine(
1908 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1909 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1910 }
1911 if (AllUndefs)
1912 return {};
1913 OperandsOrderData Data;
1914 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
1915 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
1916 Data.Hash = Hash;
1917 return Data;
1918 }
1919
1920 /// Go through the instructions in VL and append their operands.
1921 void appendOperandsOfVL(ArrayRef<Value *> VL) {
1922 assert(!VL.empty() && "Bad VL");
1923 assert((empty() || VL.size() == getNumLanes()) &&
1924 "Expected same number of lanes");
1925 assert(isa<Instruction>(VL[0]) && "Expected instruction");
1926 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1927 OpsVec.resize(NumOperands);
1928 unsigned NumLanes = VL.size();
1929 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1930 OpsVec[OpIdx].resize(NumLanes);
1931 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1932 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
1933 // Our tree has just 3 nodes: the root and two operands.
1934 // It is therefore trivial to get the APO. We only need to check the
1935 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
1936 // RHS operand. The LHS operand of both add and sub is never attached
1937 // to an inverse operation in the linearized form, therefore its APO
1938 // is false. The RHS is true only if VL[Lane] is an inverse operation.
1939
1940 // Since operand reordering is performed on groups of commutative
1941 // operations or alternating sequences (e.g., +, -), we can safely
1942 // tell the inverse operations by checking commutativity.
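// For illustration (placeholder values): for VL = {add a0, b0; sub a1, b1},
// lane 1 is the non-commutative sub, so its operand at OpIdx 0 gets
// APO == false and its operand at OpIdx 1 gets APO == true; both operands
// of the add in lane 0 get APO == false.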
1943 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
1944 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
1945 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
1946 APO, false};
1947 }
1948 }
1949 }
1950
1951 /// \returns the number of operands.
1952 unsigned getNumOperands() const { return OpsVec.size(); }
1953
1954 /// \returns the number of lanes.
1955 unsigned getNumLanes() const { return OpsVec[0].size(); }
1956
1957 /// \returns the operand value at \p OpIdx and \p Lane.
1958 Value *getValue(unsigned OpIdx, unsigned Lane) const {
1959 return getData(OpIdx, Lane).V;
1960 }
1961
1962 /// \returns true if the data structure is empty.
1963 bool empty() const { return OpsVec.empty(); }
1964
1965 /// Clears the data.
1966 void clear() { OpsVec.clear(); }
1967
1968 /// \Returns true if there are enough operands identical to \p Op to fill
1969 /// the whole vector.
1970 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
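/// For illustration (placeholder values): with lanes {x + a, b + x, x + c},
/// shouldBroadcast(x, /*OpIdx=*/0, /*Lane=*/0) returns true because every
/// other lane also has an x (with a matching APO) among its operands.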
1971 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
1972 bool OpAPO = getData(OpIdx, Lane).APO;
1973 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
1974 if (Ln == Lane)
1975 continue;
1976 // This is set to true if we found a candidate for broadcast at Lane.
1977 bool FoundCandidate = false;
1978 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
1979 OperandData &Data = getData(OpI, Ln);
1980 if (Data.APO != OpAPO || Data.IsUsed)
1981 continue;
1982 if (Data.V == Op) {
1983 FoundCandidate = true;
1984 Data.IsUsed = true;
1985 break;
1986 }
1987 }
1988 if (!FoundCandidate)
1989 return false;
1990 }
1991 return true;
1992 }
1993
1994 public:
1995 /// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const TargetLibraryInfo &TLI,
1997 const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
1998 : TLI(TLI), DL(DL), SE(SE), R(R) {
1999 // Append all the operands of RootVL.
2000 appendOperandsOfVL(RootVL);
2001 }
2002
2003 /// \Returns a value vector with the operands across all lanes for the
2004 /// operand at \p OpIdx.
2005 ValueList getVL(unsigned OpIdx) const {
2006 ValueList OpVL(OpsVec[OpIdx].size());
2007 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2008 "Expected same num of lanes across all operands");
2009 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2010 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2011 return OpVL;
2012 }
2013
2014 // Performs operand reordering for 2 or more operands.
2015 // The original operands are in OrigOps[OpIdx][Lane].
2016 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2017 void reorder() {
2018 unsigned NumOperands = getNumOperands();
2019 unsigned NumLanes = getNumLanes();
2020 // Each operand has its own mode. We are using this mode to help us select
2021 // the instructions for each lane, so that they match best with the ones
2022 // we have selected so far.
2023 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2024
2025 // This is a greedy single-pass algorithm. We are going over each lane
2026 // once and deciding on the best order right away with no back-tracking.
2027 // However, in order to increase its effectiveness, we start with the lane
2028 // that has operands that can move the least. For example, given the
2029 // following lanes:
2030 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2031 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2032 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2033 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2034 // we will start at Lane 1, since the operands of the subtraction cannot
2035 // be reordered. Then we will visit the rest of the lanes in a circular
2036 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2037
2038 // Find the first lane that we will start our search from.
2039 unsigned FirstLane = getBestLaneToStartReordering();
2040
2041 // Initialize the modes.
2042 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2043 Value *OpLane0 = getValue(OpIdx, FirstLane);
2044 // Keep track if we have instructions with all the same opcode on one
2045 // side.
2046 if (isa<LoadInst>(OpLane0))
2047 ReorderingModes[OpIdx] = ReorderingMode::Load;
2048 else if (isa<Instruction>(OpLane0)) {
2049 // Check if OpLane0 should be broadcast.
2050 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2051 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2052 else
2053 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2054 }
2055 else if (isa<Constant>(OpLane0))
2056 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2057 else if (isa<Argument>(OpLane0))
2058 // Our best hope is a Splat. It may save some cost in some cases.
2059 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2060 else
2061 // NOTE: This should be unreachable.
2062 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2063 }
2064
2065 // Check that we don't have the same operands. There is no need to reorder
2066 // if the operands are just a perfect or shuffled diamond match. Skip this
2067 // only for possible broadcasts or a non-power-of-2 number of scalars (just
2068 // for now).
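// For illustration (placeholder values): operand columns {a, b, c, d} and
// {b, a, d, c} draw from the same four values, so they form a shuffled
// diamond match and reordering is skipped; with only two unique values (a
// possible broadcast) or a non-power-of-2 count, reordering is still
// attempted.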
2069 auto &&SkipReordering = [this]() {
2070 SmallPtrSet<Value *, 4> UniqueValues;
2071 ArrayRef<OperandData> Op0 = OpsVec.front();
2072 for (const OperandData &Data : Op0)
2073 UniqueValues.insert(Data.V);
2074 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2075 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2076 return !UniqueValues.contains(Data.V);
2077 }))
2078 return false;
2079 }
2080 // TODO: Check if we can remove a check for non-power-2 number of
2081 // scalars after full support of non-power-2 vectorization.
2082 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2083 };
2084
2085 // If the initial strategy fails for any of the operand indexes, then we
2086 // perform reordering again in a second pass. This helps avoid assigning
2087 // high priority to the failed strategy, and should improve reordering for
2088 // the non-failed operand indexes.
2089 for (int Pass = 0; Pass != 2; ++Pass) {
2090 // Check if there is no need to reorder the operands, since they form a
2091 // perfect or shuffled diamond match.
2092 // Need to do it to avoid extra external use cost counting for
2093 // shuffled matches, which may cause regressions.
2094 if (SkipReordering())
2095 break;
2096 // Skip the second pass if the first pass did not fail.
2097 bool StrategyFailed = false;
2098 // Mark all operand data as free to use.
2099 clearUsed();
2100 // We keep the original operand order for the FirstLane, so reorder the
2101 // rest of the lanes. We are visiting the nodes in a circular fashion,
2102 // using FirstLane as the center point and increasing the radius
2103 // distance.
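// For illustration: with FirstLane == 1 and 4 lanes, the Distance/Direction
// loop below visits lane 2, then lane 0, then lane 3; positions that fall
// outside [0, NumLanes) are simply skipped.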
2104 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2105 for (unsigned I = 0; I < NumOperands; ++I)
2106 MainAltOps[I].push_back(getData(I, FirstLane).V);
2107
2108 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2109 // Visit the lane on the right and then the lane on the left.
2110 for (int Direction : {+1, -1}) {
2111 int Lane = FirstLane + Direction * Distance;
2112 if (Lane < 0 || Lane >= (int)NumLanes)
2113 continue;
2114 int LastLane = Lane - Direction;
2115 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2116 "Out of bounds");
2117 // Look for a good match for each operand.
2118 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2119 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2120 std::optional<unsigned> BestIdx = getBestOperand(
2121 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2122 // By not selecting a value, we allow the operands that follow to
2123 // select a better matching value. We will get a non-null value in
2124 // the next run of getBestOperand().
2125 if (BestIdx) {
2126 // Swap the current operand with the one returned by
2127 // getBestOperand().
2128 swap(OpIdx, *BestIdx, Lane);
2129 } else {
2130 // We failed to find a best operand, set mode to 'Failed'.
2131 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2132 // Enable the second pass.
2133 StrategyFailed = true;
2134 }
2135 // Try to get the alternate opcode and follow it during analysis.
2136 if (MainAltOps[OpIdx].size() != 2) {
2137 OperandData &AltOp = getData(OpIdx, Lane);
2138 InstructionsState OpS =
2139 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2140 if (OpS.getOpcode() && OpS.isAltShuffle())
2141 MainAltOps[OpIdx].push_back(AltOp.V);
2142 }
2143 }
2144 }
2145 }
2146 // Skip second pass if the strategy did not fail.
2147 if (!StrategyFailed)
2148 break;
2149 }
2150 }
2151
2152#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2153 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2154 switch (RMode) {
2155 case ReorderingMode::Load:
2156 return "Load";
2157 case ReorderingMode::Opcode:
2158 return "Opcode";
2159 case ReorderingMode::Constant:
2160 return "Constant";
2161 case ReorderingMode::Splat:
2162 return "Splat";
2163 case ReorderingMode::Failed:
2164 return "Failed";
2165 }
2166 llvm_unreachable("Unimplemented Reordering Type");
2167 }
2168
2169 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2170 raw_ostream &OS) {
2171 return OS << getModeStr(RMode);
2172 }
2173
2174 /// Debug print.
2175 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2176 printMode(RMode, dbgs());
2177 }
2178
2179 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2180 return printMode(RMode, OS);
2181 }
2182
LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2184 const unsigned Indent = 2;
2185 unsigned Cnt = 0;
2186 for (const OperandDataVec &OpDataVec : OpsVec) {
2187 OS << "Operand " << Cnt++ << "\n";
2188 for (const OperandData &OpData : OpDataVec) {
2189 OS.indent(Indent) << "{";
2190 if (Value *V = OpData.V)
2191 OS << *V;
2192 else
2193 OS << "null";
2194 OS << ", APO:" << OpData.APO << "}\n";
2195 }
2196 OS << "\n";
2197 }
2198 return OS;
2199 }
2200
2201 /// Debug print.
2202 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2203#endif
2204 };
2205
2206 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2207 /// of the pair which has the highest score and is deemed to have the best
2208 /// chance to form the root of a profitable tree to vectorize. Return
2209 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2210 /// \param Limit Lower limit of the cost, considered to be a good enough score.
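/// For illustration (hypothetical operands): given Candidates =
/// {{a0, a1}, {b0, b1}} where a0/a1 are consecutive loads and b0/b1 are
/// unrelated values, the first pair scores higher, so index 0 is returned,
/// provided its score exceeds \p Limit.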
2211 std::optional<int>
2212 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2213 int Limit = LookAheadHeuristics::ScoreFail) {
2214 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
RootLookAheadMaxDepth);
2216 int BestScore = Limit;
2217 std::optional<int> Index;
2218 for (int I : seq<int>(0, Candidates.size())) {
2219 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2220 Candidates[I].second,
2221 /*U1=*/nullptr, /*U2=*/nullptr,
2222 /*Level=*/1, std::nullopt);
2223 if (Score > BestScore) {
2224 BestScore = Score;
2225 Index = I;
2226 }
2227 }
2228 return Index;
2229 }
2230
2231 /// Checks if the instruction is marked for deletion.
2232 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2233
2234 /// Removes an instruction from its block and eventually deletes it.
2235 /// It's like Instruction::eraseFromParent() except that the actual deletion
2236 /// is delayed until BoUpSLP is destructed.
void eraseInstruction(Instruction *I) {
2238 DeletedInstructions.insert(I);
2239 }
2240
2241 /// Checks if the instruction was already analyzed for being possible
2242 /// reduction root.
bool isAnalyzedReductionRoot(Instruction *I) const {
2244 return AnalyzedReductionsRoots.count(I);
2245 }
2246 /// Register given instruction as already analyzed for being possible
2247 /// reduction root.
void analyzedReductionRoot(Instruction *I) {
2249 AnalyzedReductionsRoots.insert(I);
2250 }
2251 /// Checks if the provided list of reduced values was checked already for
2252 /// vectorization.
bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2254 return AnalyzedReductionVals.contains(hash_value(VL));
2255 }
2256 /// Adds the list of reduced values to list of already checked values for the
2257 /// vectorization.
void analyzedReductionVals(ArrayRef<Value *> VL) {
2259 AnalyzedReductionVals.insert(hash_value(VL));
2260 }
2261 /// Clear the list of the analyzed reduction root instructions.
void clearReductionData() {
2263 AnalyzedReductionsRoots.clear();
2264 AnalyzedReductionVals.clear();
2265 }
2266 /// Checks if the given value is gathered in one of the nodes.
2267 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2268 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2269 }
2270
2271 /// Check if the value is vectorized in the tree.
2272 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2273
2274 ~BoUpSLP();
2275
2276private:
2277 /// Determine if a vectorized value \p V can be demoted to
2278 /// a smaller type with a truncation. We collect the values that will be
2279 /// demoted in ToDemote and additional roots that require investigating in
2280 /// Roots.
2281 /// \param DemotedConsts list of Instruction/OperandIndex pairs that are
2282 /// constant and to be demoted. Required to correctly identify constant nodes
2283 /// to be demoted.
2284 bool collectValuesToDemote(
2285 Value *V, SmallVectorImpl<Value *> &ToDemote,
2286 DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
2287 SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const;
2288
2289 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2290 /// reordering (i.e. the operands can be reordered because they have only one
2291 /// user and are reorderable).
2292 /// \param ReorderableGathers List of all gather nodes that require reordering
2293 /// (e.g., gathers of extractelements or partially vectorizable loads).
2294 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2295 /// reordering, subset of \p NonVectorized.
2296 bool
2297 canReorderOperands(TreeEntry *UserTE,
2298 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2299 ArrayRef<TreeEntry *> ReorderableGathers,
2300 SmallVectorImpl<TreeEntry *> &GatherOps);
2301
2302 /// Checks if the given \p TE is a gather node with clustered reused scalars
2303 /// and reorders it per given \p Mask.
2304 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2305
2306 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2307 /// if any. If it is not vectorized (gather node), returns nullptr.
2308 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2309 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2310 TreeEntry *TE = nullptr;
2311 const auto *It = find_if(VL, [&](Value *V) {
2312 TE = getTreeEntry(V);
2313 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2314 return true;
2315 auto It = MultiNodeScalars.find(V);
2316 if (It != MultiNodeScalars.end()) {
2317 for (TreeEntry *E : It->second) {
2318 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2319 TE = E;
2320 return true;
2321 }
2322 }
2323 }
2324 return false;
2325 });
2326 if (It != VL.end()) {
2327 assert(TE->isSame(VL) && "Expected same scalars.");
2328 return TE;
2329 }
2330 return nullptr;
2331 }
2332
2333 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2334 /// if any. If it is not vectorized (gather node), returns nullptr.
2335 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2336 unsigned OpIdx) const {
2337 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2338 const_cast<TreeEntry *>(UserTE), OpIdx);
2339 }
2340
2341 /// Checks if all users of \p I are the part of the vectorization tree.
2342 bool areAllUsersVectorized(
2343 Instruction *I,
2344 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2345
2346 /// Return information about the vector formed for the specified index
2347 /// of a vector of (the same) instruction.
2349
2350 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2351 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2352
2353 /// \returns the cost of the vectorizable entry.
2354 InstructionCost getEntryCost(const TreeEntry *E,
2355 ArrayRef<Value *> VectorizedVals,
2356 SmallPtrSetImpl<Value *> &CheckedExtracts);
2357
2358 /// This is the recursive part of buildTree.
2359 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2360 const EdgeInfo &EI);
2361
2362 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2363 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2364 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2365 /// returns false, setting \p CurrentOrder to either an empty vector or a
2366 /// non-identity permutation that allows reusing the extract instructions.
2367 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2368 /// extract order.
2369 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2370 SmallVectorImpl<unsigned> &CurrentOrder,
2371 bool ResizeAllowed = false) const;
2372
2373 /// Vectorize a single entry in the tree.
2374 /// \param PostponedPHIs true if we need to postpone emission of phi nodes to
2375 /// avoid issues with def-use order.
2376 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2377
2378 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2379 /// \p E.
2380 /// \param PostponedPHIs true if we need to postpone emission of phi nodes to
2381 /// avoid issues with def-use order.
2382 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2383
2384 /// Create a new vector from a list of scalar values. Produces a sequence
2385 /// which exploits values reused across lanes, and arranges the inserts
2386 /// for ease of later optimization.
2387 template <typename BVTy, typename ResTy, typename... Args>
2388 ResTy processBuildVector(const TreeEntry *E, Args &...Params);
2389
2390 /// Create a new vector from a list of scalar values. Produces a sequence
2391 /// which exploits values reused across lanes, and arranges the inserts
2392 /// for ease of later optimization.
2393 Value *createBuildVector(const TreeEntry *E);
2394
2395 /// Returns the instruction in the bundle, which can be used as a base point
2396 /// for scheduling. Usually it is the last instruction in the bundle, except
2397 /// for the case when all operands are external (in this case, it is the first
2398 /// instruction in the list).
2399 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2400
2401 /// Tries to find extractelement instructions with constant indices from fixed
2402 /// vector type and gather such instructions into a bunch, which highly likely
2403 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2404 /// was successful, the matched scalars are replaced by poison values in \p VL
2405 /// for future analysis.
2406 std::optional<TargetTransformInfo::ShuffleKind>
2407 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2408 SmallVectorImpl<int> &Mask) const;
2409
2410 /// Tries to find extractelement instructions with constant indices from fixed
2411 /// vector type and gather such instructions into a bunch, which highly likely
2412 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2413 /// was successful, the matched scalars are replaced by poison values in \p VL
2414 /// for future analysis.
2416 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2418 unsigned NumParts) const;
2419
2420 /// Checks if the gathered \p VL can be represented as a single register
2421 /// shuffle(s) of previous tree entries.
2422 /// \param TE Tree entry checked for permutation.
2423 /// \param VL List of scalars (a subset of the TE scalars), checked for
2424 /// permutations. Must form single-register vector.
2425 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2426 /// commands to build the mask using the original vector value, without
2427 /// relying on the potential reordering.
2428 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2429 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2430 std::optional<TargetTransformInfo::ShuffleKind>
2431 isGatherShuffledSingleRegisterEntry(
2432 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2433 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2434 bool ForOrder);
2435
2436 /// Checks if the gathered \p VL can be represented as multi-register
2437 /// shuffle(s) of previous tree entries.
2438 /// \param TE Tree entry checked for permutation.
2439 /// \param VL List of scalars (a subset of the TE scalars), checked for
2440 /// permutations.
2441 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2442 /// commands to build the mask using the original vector value, without
2443 /// relying on the potential reordering.
2444 /// \returns per-register series of ShuffleKind, if gathered values can be
2445 /// represented as shuffles of previous tree entries. \p Mask is filled with
2446 /// the shuffle mask (also on per-register base).
2448 isGatherShuffledEntry(
2449 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2451 unsigned NumParts, bool ForOrder = false);
2452
2453 /// \returns the scalarization cost for this list of values. Assuming that
2454 /// this subtree gets vectorized, we may need to extract the values from the
2455 /// roots. This method calculates the cost of extracting the values.
2456 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2457 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
2458
2459 /// Set the Builder insert point to one after the last instruction in
2460 /// the bundle
2461 void setInsertPointAfterBundle(const TreeEntry *E);
2462
2463 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2464 /// specified, the starting vector value is poison.
2465 Value *gather(ArrayRef<Value *> VL, Value *Root);
2466
2467 /// \returns whether the VectorizableTree is fully vectorizable and will
2468 /// be beneficial even if the tree height is tiny.
2469 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2470
2471 /// Reorder commutative or alt operands to get better probability of
2472 /// generating vectorized code.
2473 static void reorderInputsAccordingToOpcode(
2476 const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R);
2477
2478 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2479 /// users of \p TE and collects the stores. It returns the map from the store
2480 /// pointers to the collected stores.
2482 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2483
2484 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2485 /// stores in \p StoresVec can form a vector instruction. If so it returns
2486 /// true and populates \p ReorderIndices with the shuffle indices of the
2487 /// stores when compared to the sorted vector.
2488 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2489 OrdersType &ReorderIndices) const;
2490
2491 /// Iterates through the users of \p TE, looking for scalar stores that can be
2492 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2493 /// their order and builds an order index vector for each store bundle. It
2494 /// returns all these order vectors found.
2495 /// We run this after the tree has formed, otherwise we may come across user
2496 /// instructions that are not yet in the tree.
2498 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2499
2500 struct TreeEntry {
2501 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2502 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2503
2504 /// \returns Common mask for reorder indices and reused scalars.
2505 SmallVector<int> getCommonMask() const {
SmallVector<int> Mask;
2507 inversePermutation(ReorderIndices, Mask);
2508 ::addMask(Mask, ReuseShuffleIndices);
2509 return Mask;
2510 }
2511
2512 /// \returns true if the scalars in VL are equal to this entry.
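/// For illustration (placeholder values): with Scalars == {a, b} and
/// ReorderIndices == {1, 0}, isSame({b, a}) returns true, since applying
/// the inverse permutation maps the reordered scalars back onto \p VL.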
2513 bool isSame(ArrayRef<Value *> VL) const {
2514 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2515 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2516 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2517 return VL.size() == Mask.size() &&
2518 std::equal(VL.begin(), VL.end(), Mask.begin(),
2519 [Scalars](Value *V, int Idx) {
2520 return (isa<UndefValue>(V) &&
2521 Idx == PoisonMaskElem) ||
2522 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2523 });
2524 };
2525 if (!ReorderIndices.empty()) {
2526 // TODO: implement matching if the nodes are just reordered, still can
2527 // treat the vector as the same if the list of scalars matches VL
2528 // directly, without reordering.
SmallVector<int> Mask;
2530 inversePermutation(ReorderIndices, Mask);
2531 if (VL.size() == Scalars.size())
2532 return IsSame(Scalars, Mask);
2533 if (VL.size() == ReuseShuffleIndices.size()) {
2534 ::addMask(Mask, ReuseShuffleIndices);
2535 return IsSame(Scalars, Mask);
2536 }
2537 return false;
2538 }
2539 return IsSame(Scalars, ReuseShuffleIndices);
2540 }
2541
2542 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2543 return State == TreeEntry::NeedToGather &&
2544 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2545 UserTreeIndices.front().UserTE == UserEI.UserTE;
2546 }
2547
2548 /// \returns true if current entry has same operands as \p TE.
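/// For illustration: an entry with operands {X, Y} has equal operands to an
/// entry with operands {Y, X}; each operand of \p TE only needs to match
/// some not-yet-used operand of this entry.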
2549 bool hasEqualOperands(const TreeEntry &TE) const {
2550 if (TE.getNumOperands() != getNumOperands())
2551 return false;
2552 SmallBitVector Used(getNumOperands());
2553 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2554 unsigned PrevCount = Used.count();
2555 for (unsigned K = 0; K < E; ++K) {
2556 if (Used.test(K))
2557 continue;
2558 if (getOperand(K) == TE.getOperand(I)) {
2559 Used.set(K);
2560 break;
2561 }
2562 }
2563 // Check if we actually found the matching operand.
2564 if (PrevCount == Used.count())
2565 return false;
2566 }
2567 return true;
2568 }
2569
2570 /// \return Final vectorization factor for the node. Defined by the total
2571 /// number of vectorized scalars, including those used several times in the
2572 /// entry and counted in \a ReuseShuffleIndices, if any.
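/// For illustration (placeholder values): a node built from {a, b, a, b}
/// keeps Scalars == {a, b} with ReuseShuffleIndices == {0, 1, 0, 1}, so its
/// vector factor is 4.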
2573 unsigned getVectorFactor() const {
2574 if (!ReuseShuffleIndices.empty())
2575 return ReuseShuffleIndices.size();
2576 return Scalars.size();
2577 };
2578
2579 /// A vector of scalars.
2580 ValueList Scalars;
2581
2582 /// The Scalars are vectorized into this value. It is initialized to Null.
2583 WeakTrackingVH VectorizedValue = nullptr;
2584
2585 /// New vector phi instructions emitted for the vectorized phi nodes.
2586 PHINode *PHI = nullptr;
2587
2588 /// Do we need to gather this sequence or vectorize it
2589 /// (either with vector instruction or with scatter/gather
2590 /// intrinsics for store/load)?
2591 enum EntryState {
2592 Vectorize,
2593 ScatterVectorize,
2594 StridedVectorize,
2595 NeedToGather
2596 };
2597 EntryState State;
2598
2599 /// Does this sequence require some shuffling?
2600 SmallVector<int, 4> ReuseShuffleIndices;
2601
2602 /// Does this entry require reordering?
2603 SmallVector<unsigned, 4> ReorderIndices;
2604
2605 /// Points back to the VectorizableTree.
2606 ///
2607 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2608 /// to be a pointer and needs to be able to initialize the child iterator.
2609 /// Thus we need a reference back to the container to translate the indices
2610 /// to entries.
2611 VecTreeTy &Container;
2612
2613 /// The TreeEntry index containing the user of this entry. We can actually
2614 /// have multiple users so the data structure is not truly a tree.
2615 SmallVector<EdgeInfo, 1> UserTreeIndices;
2616
2617 /// The index of this treeEntry in VectorizableTree.
2618 int Idx = -1;
2619
2620 private:
2621 /// The operands of each instruction in each lane Operands[op_index][lane].
2622 /// Note: This helps avoid the replication of the code that performs the
2623 /// reordering of operands during buildTree_rec() and vectorizeTree().
2625
2626 /// The main/alternate instruction.
2627 Instruction *MainOp = nullptr;
2628 Instruction *AltOp = nullptr;
2629
2630 public:
2631 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2632 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2633 if (Operands.size() < OpIdx + 1)
2634 Operands.resize(OpIdx + 1);
2635 assert(Operands[OpIdx].empty() && "Already resized?");
2636 assert(OpVL.size() <= Scalars.size() &&
2637 "Number of operands is greater than the number of scalars.");
2638 Operands[OpIdx].resize(OpVL.size());
2639 copy(OpVL, Operands[OpIdx].begin());
2640 }
2641
2642 /// Set the operands of this bundle in their original order.
2643 void setOperandsInOrder() {
2644 assert(Operands.empty() && "Already initialized?");
2645 auto *I0 = cast<Instruction>(Scalars[0]);
2646 Operands.resize(I0->getNumOperands());
2647 unsigned NumLanes = Scalars.size();
2648 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2649 OpIdx != NumOperands; ++OpIdx) {
2650 Operands[OpIdx].resize(NumLanes);
2651 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2652 auto *I = cast<Instruction>(Scalars[Lane]);
2653 assert(I->getNumOperands() == NumOperands &&
2654 "Expected same number of operands");
2655 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2656 }
2657 }
2658 }
2659
2660 /// Reorders operands of the node to the given mask \p Mask.
2661 void reorderOperands(ArrayRef<int> Mask) {
2662 for (ValueList &Operand : Operands)
2663 reorderScalars(Operand, Mask);
2664 }
2665
2666 /// \returns the \p OpIdx operand of this TreeEntry.
2667 ValueList &getOperand(unsigned OpIdx) {
2668 assert(OpIdx < Operands.size() && "Off bounds");
2669 return Operands[OpIdx];
2670 }
2671
2672 /// \returns the \p OpIdx operand of this TreeEntry.
2673 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2674 assert(OpIdx < Operands.size() && "Off bounds");
2675 return Operands[OpIdx];
2676 }
2677
2678 /// \returns the number of operands.
2679 unsigned getNumOperands() const { return Operands.size(); }
2680
2681 /// \return the single \p OpIdx operand.
2682 Value *getSingleOperand(unsigned OpIdx) const {
2683 assert(OpIdx < Operands.size() && "Off bounds");
2684 assert(!Operands[OpIdx].empty() && "No operand available");
2685 return Operands[OpIdx][0];
2686 }
2687
2688 /// Some of the instructions in the list have alternate opcodes.
2689 bool isAltShuffle() const { return MainOp != AltOp; }
2690
2691 bool isOpcodeOrAlt(Instruction *I) const {
2692 unsigned CheckedOpcode = I->getOpcode();
2693 return (getOpcode() == CheckedOpcode ||
2694 getAltOpcode() == CheckedOpcode);
2695 }
2696
2697 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2698 /// alternate) opcode as the main operation of this entry, the key is \p Op.
2699 /// Otherwise the key is the main operation.
2700 Value *isOneOf(Value *Op) const {
2701 auto *I = dyn_cast<Instruction>(Op);
2702 if (I && isOpcodeOrAlt(I))
2703 return Op;
2704 return MainOp;
2705 }
2706
2707 void setOperations(const InstructionsState &S) {
2708 MainOp = S.MainOp;
2709 AltOp = S.AltOp;
2710 }
2711
2712 Instruction *getMainOp() const {
2713 return MainOp;
2714 }
2715
2716 Instruction *getAltOp() const {
2717 return AltOp;
2718 }
2719
2720 /// The main/alternate opcodes for the list of instructions.
2721 unsigned getOpcode() const {
2722 return MainOp ? MainOp->getOpcode() : 0;
2723 }
2724
2725 unsigned getAltOpcode() const {
2726 return AltOp ? AltOp->getOpcode() : 0;
2727 }
2728
2729 /// When ReuseReorderShuffleIndices is empty it just returns position of \p
2730 /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
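/// For illustration (placeholder values): with Scalars == {a, b},
/// ReorderIndices == {1, 0} and ReuseShuffleIndices == {0, 1, 0, 1},
/// findLaneForValue(a) maps lane 0 through ReorderIndices to 1 and then to
/// the first position holding 1 in ReuseShuffleIndices, i.e. lane 1.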
2731 int findLaneForValue(Value *V) const {
2732 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2733 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2734 if (!ReorderIndices.empty())
2735 FoundLane = ReorderIndices[FoundLane];
2736 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2737 if (!ReuseShuffleIndices.empty()) {
2738 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2739 find(ReuseShuffleIndices, FoundLane));
2740 }
2741 return FoundLane;
2742 }
2743
2744 /// Build a shuffle mask for graph entry which represents a merge of main
2745 /// and alternate operations.
2746 void
2747 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
SmallVectorImpl<int> &Mask,
2749 SmallVectorImpl<Value *> *OpScalars = nullptr,
2750 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2751
2752#ifndef NDEBUG
2753 /// Debug printer.
2754 LLVM_DUMP_METHOD void dump() const {
2755 dbgs() << Idx << ".\n";
2756 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2757 dbgs() << "Operand " << OpI << ":\n";
2758 for (const Value *V : Operands[OpI])
2759 dbgs().indent(2) << *V << "\n";
2760 }
2761 dbgs() << "Scalars: \n";
2762 for (Value *V : Scalars)
2763 dbgs().indent(2) << *V << "\n";
2764 dbgs() << "State: ";
2765 switch (State) {
2766 case Vectorize:
2767 dbgs() << "Vectorize\n";
2768 break;
2769 case ScatterVectorize:
2770 dbgs() << "ScatterVectorize\n";
2771 break;
2772 case StridedVectorize:
2773 dbgs() << "StridedVectorize\n";
2774 break;
2775 case NeedToGather:
2776 dbgs() << "NeedToGather\n";
2777 break;
2778 }
2779 dbgs() << "MainOp: ";
2780 if (MainOp)
2781 dbgs() << *MainOp << "\n";
2782 else
2783 dbgs() << "NULL\n";
2784 dbgs() << "AltOp: ";
2785 if (AltOp)
2786 dbgs() << *AltOp << "\n";
2787 else
2788 dbgs() << "NULL\n";
2789 dbgs() << "VectorizedValue: ";
2790 if (VectorizedValue)
2791 dbgs() << *VectorizedValue << "\n";
2792 else
2793 dbgs() << "NULL\n";
2794 dbgs() << "ReuseShuffleIndices: ";
2795 if (ReuseShuffleIndices.empty())
2796 dbgs() << "Empty";
2797 else
2798 for (int ReuseIdx : ReuseShuffleIndices)
2799 dbgs() << ReuseIdx << ", ";
2800 dbgs() << "\n";
2801 dbgs() << "ReorderIndices: ";
2802 for (unsigned ReorderIdx : ReorderIndices)
2803 dbgs() << ReorderIdx << ", ";
2804 dbgs() << "\n";
2805 dbgs() << "UserTreeIndices: ";
2806 for (const auto &EInfo : UserTreeIndices)
2807 dbgs() << EInfo << ", ";
2808 dbgs() << "\n";
2809 }
2810#endif
2811 };
2812
2813#ifndef NDEBUG
2814 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2815 InstructionCost VecCost, InstructionCost ScalarCost,
2816 StringRef Banner) const {
2817 dbgs() << "SLP: " << Banner << ":\n";
2818 E->dump();
2819 dbgs() << "SLP: Costs:\n";
2820 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2821 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2822 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2823 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2824 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2825 }
2826#endif
2827
2828 /// Create a new VectorizableTree entry.
2829 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2830 std::optional<ScheduleData *> Bundle,
2831 const InstructionsState &S,
2832 const EdgeInfo &UserTreeIdx,
2833 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2834 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2835 TreeEntry::EntryState EntryState =
2836 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2837 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2838 ReuseShuffleIndices, ReorderIndices);
2839 }
2840
2841 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2842 TreeEntry::EntryState EntryState,
2843 std::optional<ScheduleData *> Bundle,
2844 const InstructionsState &S,
2845 const EdgeInfo &UserTreeIdx,
2846 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2847 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2848 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2849 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2850 "Need to vectorize gather entry?");
2851 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2852 TreeEntry *Last = VectorizableTree.back().get();
2853 Last->Idx = VectorizableTree.size() - 1;
2854 Last->State = EntryState;
2855 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2856 ReuseShuffleIndices.end());
2857 if (ReorderIndices.empty()) {
2858 Last->Scalars.assign(VL.begin(), VL.end());
2859 Last->setOperations(S);
2860 } else {
2861 // Reorder scalars and build final mask.
2862 Last->Scalars.assign(VL.size(), nullptr);
2863 transform(ReorderIndices, Last->Scalars.begin(),
2864 [VL](unsigned Idx) -> Value * {
2865 if (Idx >= VL.size())
2866 return UndefValue::get(VL.front()->getType());
2867 return VL[Idx];
2868 });
2869 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
2870 Last->setOperations(S);
2871 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2872 }
2873 if (Last->State != TreeEntry::NeedToGather) {
2874 for (Value *V : VL) {
2875 const TreeEntry *TE = getTreeEntry(V);
2876 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
2877 "Scalar already in tree!");
2878 if (TE) {
2879 if (TE != Last)
2880 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
2881 continue;
2882 }
2883 ScalarToTreeEntry[V] = Last;
2884 }
2885 // Update the scheduler bundle to point to this TreeEntry.
2886 ScheduleData *BundleMember = *Bundle;
2887 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2888 isVectorLikeInstWithConstOps(S.MainOp) ||
2889 doesNotNeedToSchedule(VL)) &&
2890 "Bundle and VL out of sync");
2891 if (BundleMember) {
2892 for (Value *V : VL) {
if (doesNotNeedToBeScheduled(V))
2894 continue;
2895 if (!BundleMember)
2896 continue;
2897 BundleMember->TE = Last;
2898 BundleMember = BundleMember->NextInBundle;
2899 }
2900 }
2901 assert(!BundleMember && "Bundle and VL out of sync");
2902 } else {
2903 MustGather.insert(VL.begin(), VL.end());
2904 // Build a map for gathered scalars to the nodes where they are used.
2905 for (Value *V : VL)
2906 if (!isConstant(V))
2907 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
2908 }
2909
2910 if (UserTreeIdx.UserTE)
2911 Last->UserTreeIndices.push_back(UserTreeIdx);
2912
2913 return Last;
2914 }
2915
2916 /// -- Vectorization State --
2917 /// Holds all of the tree entries.
2918 TreeEntry::VecTreeTy VectorizableTree;
2919
2920#ifndef NDEBUG
2921 /// Debug printer.
2922 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
2923 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
2924 VectorizableTree[Id]->dump();
2925 dbgs() << "\n";
2926 }
2927 }
2928#endif
2929
2930 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
2931
2932 const TreeEntry *getTreeEntry(Value *V) const {
2933 return ScalarToTreeEntry.lookup(V);
2934 }
2935
2936 /// Checks if the specified list of the instructions/values can be vectorized
2937 /// and fills required data before actual scheduling of the instructions.
2938 TreeEntry::EntryState getScalarsVectorizationState(
2939 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
2940 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
2941
2942 /// Maps a specific scalar to its tree entry.
2943 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
2944
2945 /// List of scalars, used in several vectorize nodes, and the list of the
2946 /// nodes.
2948 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
2949 /// Maps a value to the proposed vectorizable size.
2950 SmallDenseMap<Value *, unsigned> InstrElementSize;
2951
2952 /// A list of scalars that we found that we need to keep as scalars.
2953 ValueSet MustGather;
2954
2955 /// A map between the vectorized entries and the last instructions in the
2956 /// bundles. The bundles are built in use order, not in the def order of the
2957 /// instructions. So, we cannot rely directly on the last instruction in the
2958 /// bundle being the last instruction in program order during the
2959 /// vectorization process, since the basic blocks are affected; we need to
2960 /// pre-gather these last instructions before that.
2961 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
2962
2963 /// List of gather nodes, depending on other gather/vector nodes, which should
2964 /// be emitted after the vector instruction emission process to correctly
2965 /// handle order of the vector instructions and shuffles.
2966 SetVector<const TreeEntry *> PostponedGathers;
2967
2968 using ValueToGatherNodesMap =
DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
2970 ValueToGatherNodesMap ValueToGatherNodes;
2971
2972 /// This POD struct describes one external user in the vectorized tree.
2973 struct ExternalUser {
2974 ExternalUser(Value *S, llvm::User *U, int L)
2975 : Scalar(S), User(U), Lane(L) {}
2976
2977 // Which scalar in our function.
2978 Value *Scalar;
2979
2980 // The user that uses the scalar.
2981 llvm::User *User;
2982
2983 // Which lane does the scalar belong to.
2984 int Lane;
2985 };
2986 using UserList = SmallVector<ExternalUser, 16>;
2987
2988 /// Checks if two instructions may access the same memory.
2989 ///
2990 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
2991 /// is invariant in the calling loop.
2992 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
2993 Instruction *Inst2) {
2994 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
2995 return true;
2996 // First check if the result is already in the cache.
2997 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
2998 auto It = AliasCache.find(Key);
2999 if (It != AliasCache.end())
3000 return It->second;
3001 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3002 // Store the result in the cache.
3003 AliasCache.try_emplace(Key, Aliased);
3004 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3005 return Aliased;
3006 }
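 // Illustrative usage sketch (the instructions and locations below are assumed,
 // not taken from this file): because the result is cached under both (I1, I2)
 // and (I2, I1), a later query with the operands swapped is answered without
 // re-running alias analysis, provided both instructions are simple and the
 // locations have valid pointers.
 // \code
 //   bool A = isAliased(MemoryLocation::get(I1), I1, I2); // queries BatchAA
 //   bool B = isAliased(MemoryLocation::get(I2), I2, I1); // hits AliasCache
 //   assert(A == B);
 // \endcode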
3007
3008 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3009
3010 /// Cache for alias results.
3011 /// TODO: consider moving this to the AliasAnalysis itself.
3012 DenseMap<AliasCacheKey, bool> AliasCache;
3013
3014 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3015 // globally through SLP because we don't perform any action which
3016 // invalidates capture results.
3017 BatchAAResults BatchAA;
3018
3019 /// Temporary store for deleted instructions. Instructions will be deleted
3020 /// eventually when the BoUpSLP is destructed. The deferral is required to
3021 /// ensure that there are no incorrect collisions in the AliasCache, which
3022 /// can happen if a new instruction is allocated at the same address as a
3023 /// previously deleted instruction.
3024 DenseSet<Instruction *> DeletedInstructions;
3025
3026 /// Set of instructions already analyzed as roots of reductions.
3027 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3028
3029 /// Set of hashes for the list of reduction values already being analyzed.
3030 DenseSet<size_t> AnalyzedReductionVals;
3031
3032 /// A list of values that need to be extracted out of the tree.
3033 /// This list holds pairs of (Internal Scalar : External User). The External
3034 /// User can be nullptr, which means that this Internal Scalar will be used
3035 /// later, after vectorization.
3036 UserList ExternalUses;
3037
3038 /// Values used only by @llvm.assume calls.
3040
3041 /// Holds all of the instructions that we gathered, shuffle instructions and
3042 /// extractelements.
3043 SetVector<Instruction *> GatherShuffleExtractSeq;
3044
3045 /// A list of blocks that we are going to CSE.
3046 DenseSet<BasicBlock *> CSEBlocks;
3047
3048 /// Contains all scheduling relevant data for an instruction.
3049 /// A ScheduleData either represents a single instruction or a member of an
3050 /// instruction bundle (= a group of instructions which is combined into a
3051 /// vector instruction).
3052 struct ScheduleData {
3053 // The initial value for the dependency counters. It means that the
3054 // dependencies are not calculated yet.
3055 enum { InvalidDeps = -1 };
3056
3057 ScheduleData() = default;
3058
3059 void init(int BlockSchedulingRegionID, Value *OpVal) {
3060 FirstInBundle = this;
3061 NextInBundle = nullptr;
3062 NextLoadStore = nullptr;
3063 IsScheduled = false;
3064 SchedulingRegionID = BlockSchedulingRegionID;
3065 clearDependencies();
3066 OpValue = OpVal;
3067 TE = nullptr;
3068 }
3069
3070 /// Verify basic self consistency properties
3071 void verify() {
3072 if (hasValidDependencies()) {
3073 assert(UnscheduledDeps <= Dependencies && "invariant");
3074 } else {
3075 assert(UnscheduledDeps == Dependencies && "invariant");
3076 }
3077
3078 if (IsScheduled) {
3079 assert(isSchedulingEntity() &&
3080 "unexpected scheduled state");
3081 for (const ScheduleData *BundleMember = this; BundleMember;
3082 BundleMember = BundleMember->NextInBundle) {
3083 assert(BundleMember->hasValidDependencies() &&
3084 BundleMember->UnscheduledDeps == 0 &&
3085 "unexpected scheduled state");
3086 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3087 "only bundle is marked scheduled");
3088 }
3089 }
3090
3091 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3092 "all bundle members must be in same basic block");
3093 }
3094
3095 /// Returns true if the dependency information has been calculated.
3096 /// Note that dependency validity can vary between instructions within
3097 /// a single bundle.
3098 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3099
3100 /// Returns true for single instructions and for bundle representatives
3101 /// (= the head of a bundle).
3102 bool isSchedulingEntity() const { return FirstInBundle == this; }
3103
3104 /// Returns true if it represents an instruction bundle and not only a
3105 /// single instruction.
3106 bool isPartOfBundle() const {
3107 return NextInBundle != nullptr || FirstInBundle != this || TE;
3108 }
3109
3110 /// Returns true if it is ready for scheduling, i.e. it has no more
3111 /// unscheduled dependent instructions/bundles.
3112 bool isReady() const {
3113 assert(isSchedulingEntity() &&
3114 "can't consider non-scheduling entity for ready list");
3115 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3116 }
3117
3118 /// Modifies the number of unscheduled dependencies for this instruction,
3119 /// and returns the number of remaining dependencies for the containing
3120 /// bundle.
3121 int incrementUnscheduledDeps(int Incr) {
3122 assert(hasValidDependencies() &&
3123 "increment of unscheduled deps would be meaningless");
3124 UnscheduledDeps += Incr;
3125 return FirstInBundle->unscheduledDepsInBundle();
3126 }
3127
3128 /// Sets the number of unscheduled dependencies to the number of
3129 /// dependencies.
3130 void resetUnscheduledDeps() {
3131 UnscheduledDeps = Dependencies;
3132 }
3133
3134 /// Clears all dependency information.
3135 void clearDependencies() {
3136 Dependencies = InvalidDeps;
3137 resetUnscheduledDeps();
3138 MemoryDependencies.clear();
3139 ControlDependencies.clear();
3140 }
3141
3142 int unscheduledDepsInBundle() const {
3143 assert(isSchedulingEntity() && "only meaningful on the bundle");
3144 int Sum = 0;
3145 for (const ScheduleData *BundleMember = this; BundleMember;
3146 BundleMember = BundleMember->NextInBundle) {
3147 if (BundleMember->UnscheduledDeps == InvalidDeps)
3148 return InvalidDeps;
3149 Sum += BundleMember->UnscheduledDeps;
3150 }
3151 return Sum;
3152 }
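 // Worked example (illustrative): for a 3-wide bundle whose members have
 // UnscheduledDeps of {2, 0, 1}, unscheduledDepsInBundle() returns 3. If any
 // member still has UnscheduledDeps == InvalidDeps, the bundle as a whole
 // reports InvalidDeps, i.e. its dependencies are not calculated yet.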
3153
3154 void dump(raw_ostream &os) const {
3155 if (!isSchedulingEntity()) {
3156 os << "/ " << *Inst;
3157 } else if (NextInBundle) {
3158 os << '[' << *Inst;
3159 ScheduleData *SD = NextInBundle;
3160 while (SD) {
3161 os << ';' << *SD->Inst;
3162 SD = SD->NextInBundle;
3163 }
3164 os << ']';
3165 } else {
3166 os << *Inst;
3167 }
3168 }
3169
3170 Instruction *Inst = nullptr;
3171
3172 /// Opcode of the current instruction in the schedule data.
3173 Value *OpValue = nullptr;
3174
3175 /// The TreeEntry that this instruction corresponds to.
3176 TreeEntry *TE = nullptr;
3177
3178 /// Points to the head in an instruction bundle (and always to this for
3179 /// single instructions).
3180 ScheduleData *FirstInBundle = nullptr;
3181
3182 /// Singly linked list of all instructions in a bundle. Null if it is a
3183 /// single instruction.
3184 ScheduleData *NextInBundle = nullptr;
3185
3186 /// Singly linked list of all memory instructions (e.g. load, store, call)
3187 /// in the block - until the end of the scheduling region.
3188 ScheduleData *NextLoadStore = nullptr;
3189
3190 /// The dependent memory instructions.
3191 /// This list is derived on demand in calculateDependencies().
3192 SmallVector<ScheduleData *, 4> MemoryDependencies;
3193
3194 /// List of instructions which this instruction could be control dependent
3195 /// on. Allowing such nodes to be scheduled below this one could introduce
3196 /// a runtime fault which didn't exist in the original program.
3197 /// e.g. a load or udiv following a readonly call which loops forever.
3198 SmallVector<ScheduleData *, 4> ControlDependencies;
3199
3200 /// This ScheduleData is in the current scheduling region if this matches
3201 /// the current SchedulingRegionID of BlockScheduling.
3202 int SchedulingRegionID = 0;
3203
3204 /// Used for getting a "good" final ordering of instructions.
3205 int SchedulingPriority = 0;
3206
3207 /// The number of dependencies. This is the number of users of the
3208 /// instruction plus the number of dependent memory instructions (if any).
3209 /// This value is calculated on demand.
3210 /// If InvalidDeps, the number of dependencies is not calculated yet.
3211 int Dependencies = InvalidDeps;
3212
3213 /// The number of dependencies minus the number of dependencies of scheduled
3214 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3215 /// for scheduling.
3216 /// Note that this is negative as long as Dependencies is not calculated.
3217 int UnscheduledDeps = InvalidDeps;
3218
3219 /// True if this instruction is scheduled (or considered as scheduled in the
3220 /// dry-run).
3221 bool IsScheduled = false;
3222 };
3223
3224#ifndef NDEBUG
3225 friend inline raw_ostream &operator<<(raw_ostream &os,
3226 const BoUpSLP::ScheduleData &SD) {
3227 SD.dump(os);
3228 return os;
3229 }
3230#endif
3231
3232 friend struct GraphTraits<BoUpSLP *>;
3233 friend struct DOTGraphTraits<BoUpSLP *>;
3234
3235 /// Contains all scheduling data for a basic block.
3236 /// It does not schedule instructions that are not memory read/write
3237 /// instructions and whose operands are either constants, arguments, phis,
3238 /// or instructions from other blocks, or whose users are phis or live in
3239 /// other blocks. The resulting vector instructions can be placed at the
3240 /// beginning of the basic block without scheduling (if the operands do not
3241 /// need to be scheduled) or at the end of the block (if the users are
3242 /// outside of the block). This saves some compile time and memory used by
3243 /// the compiler.
3244 /// ScheduleData is assigned for each instruction in between the boundaries
3245 /// of the tree entry, even for those which are not part of the graph. It is
3246 /// required to correctly follow the dependencies between the instructions
3247 /// and to schedule them correctly. ScheduleData is not allocated for
3248 /// instructions which do not require scheduling, like phis, nodes with
3249 /// extractelements/insertelements only, or nodes with instructions whose
3250 /// uses/operands are outside of the block.
3251 struct BlockScheduling {
3252 BlockScheduling(BasicBlock *BB)
3253 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3254
3255 void clear() {
3256 ReadyInsts.clear();
3257 ScheduleStart = nullptr;
3258 ScheduleEnd = nullptr;
3259 FirstLoadStoreInRegion = nullptr;
3260 LastLoadStoreInRegion = nullptr;
3261 RegionHasStackSave = false;
3262
3263 // Reduce the maximum schedule region size by the size of the
3264 // previous scheduling run.
3265 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3266 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3267 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3268 ScheduleRegionSize = 0;
3269
3270 // Make a new scheduling region, i.e. all existing ScheduleData is not
3271 // in the new region yet.
3272 ++SchedulingRegionID;
3273 }
3274
3275 ScheduleData *getScheduleData(Instruction *I) {
3276 if (BB != I->getParent())
3277 // Avoid lookup if can't possibly be in map.
3278 return nullptr;
3279 ScheduleData *SD = ScheduleDataMap.lookup(I);
3280 if (SD && isInSchedulingRegion(SD))
3281 return SD;
3282 return nullptr;
3283 }
3284
3285 ScheduleData *getScheduleData(Value *V) {
3286 if (auto *I = dyn_cast<Instruction>(V))
3287 return getScheduleData(I);
3288 return nullptr;
3289 }
3290
3291 ScheduleData *getScheduleData(Value *V, Value *Key) {
3292 if (V == Key)
3293 return getScheduleData(V);
3294 auto I = ExtraScheduleDataMap.find(V);
3295 if (I != ExtraScheduleDataMap.end()) {
3296 ScheduleData *SD = I->second.lookup(Key);
3297 if (SD && isInSchedulingRegion(SD))
3298 return SD;
3299 }
3300 return nullptr;
3301 }
3302
3303 bool isInSchedulingRegion(ScheduleData *SD) const {
3304 return SD->SchedulingRegionID == SchedulingRegionID;
3305 }
3306
3307 /// Marks an instruction as scheduled and puts all dependent ready
3308 /// instructions into the ready-list.
3309 template <typename ReadyListType>
3310 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3311 SD->IsScheduled = true;
3312 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3313
3314 for (ScheduleData *BundleMember = SD; BundleMember;
3315 BundleMember = BundleMember->NextInBundle) {
3316 if (BundleMember->Inst != BundleMember->OpValue)
3317 continue;
3318
3319 // Handle the def-use chain dependencies.
3320
3321 // Decrement the unscheduled counter and insert to ready list if ready.
3322 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3323 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3324 if (OpDef && OpDef->hasValidDependencies() &&
3325 OpDef->incrementUnscheduledDeps(-1) == 0) {
3326 // There are no more unscheduled dependencies after
3327 // decrementing, so we can put the dependent instruction
3328 // into the ready list.
3329 ScheduleData *DepBundle = OpDef->FirstInBundle;
3330 assert(!DepBundle->IsScheduled &&
3331 "already scheduled bundle gets ready");
3332 ReadyList.insert(DepBundle);
3333 LLVM_DEBUG(dbgs()
3334 << "SLP: gets ready (def): " << *DepBundle << "\n");
3335 }
3336 });
3337 };
3338
3339 // If BundleMember is a vector bundle, its operands may have been
3340 // reordered during buildTree(). We therefore need to get its operands
3341 // through the TreeEntry.
3342 if (TreeEntry *TE = BundleMember->TE) {
3343 // Need to search for the lane since the tree entry can be reordered.
3344 int Lane = std::distance(TE->Scalars.begin(),
3345 find(TE->Scalars, BundleMember->Inst));
3346 assert(Lane >= 0 && "Lane not set");
3347
3348 // Since vectorization tree is being built recursively this assertion
3349 // ensures that the tree entry has all operands set before reaching
3350 // this code. Couple of exceptions known at the moment are extracts
3351 // where their second (immediate) operand is not added. Since
3352 // immediates do not affect scheduler behavior this is considered
3353 // okay.
3354 auto *In = BundleMember->Inst;
3355 assert(In &&
3356 (isa<ExtractValueInst, ExtractElementInst>(In) ||
3357 In->getNumOperands() == TE->getNumOperands()) &&
3358 "Missed TreeEntry operands?");
3359 (void)In; // fake use to avoid build failure when assertions disabled
3360
3361 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3362 OpIdx != NumOperands; ++OpIdx)
3363 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3364 DecrUnsched(I);
3365 } else {
3366 // If BundleMember is a stand-alone instruction, no operand reordering
3367 // has taken place, so we directly access its operands.
3368 for (Use &U : BundleMember->Inst->operands())
3369 if (auto *I = dyn_cast<Instruction>(U.get()))
3370 DecrUnsched(I);
3371 }
3372 // Handle the memory dependencies.
3373 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3374 if (MemoryDepSD->hasValidDependencies() &&
3375 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3376 // There are no more unscheduled dependencies after decrementing,
3377 // so we can put the dependent instruction into the ready list.
3378 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3379 assert(!DepBundle->IsScheduled &&
3380 "already scheduled bundle gets ready");
3381 ReadyList.insert(DepBundle);
3383 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3384 }
3385 }
3386 // Handle the control dependencies.
3387 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3388 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3389 // There are no more unscheduled dependencies after decrementing,
3390 // so we can put the dependent instruction into the ready list.
3391 ScheduleData *DepBundle = DepSD->FirstInBundle;
3392 assert(!DepBundle->IsScheduled &&
3393 "already scheduled bundle gets ready");
3394 ReadyList.insert(DepBundle);
3396 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3397 }
3398 }
3399 }
3400 }
3401
3402 /// Verify basic self consistency properties of the data structure.
3403 void verify() {
3404 if (!ScheduleStart)
3405 return;
3406
3407 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3408 ScheduleStart->comesBefore(ScheduleEnd) &&
3409 "Not a valid scheduling region?");
3410
3411 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3412 auto *SD = getScheduleData(I);
3413 if (!SD)
3414 continue;
3415 assert(isInSchedulingRegion(SD) &&
3416 "primary schedule data not in window?");
3417 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3418 "entire bundle in window!");
3419 (void)SD;
3420 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3421 }
3422
3423 for (auto *SD : ReadyInsts) {
3424 assert(SD->isSchedulingEntity() && SD->isReady() &&
3425 "item in ready list not ready?");
3426 (void)SD;
3427 }
3428 }
3429
3430 void doForAllOpcodes(Value *V,
3431 function_ref<void(ScheduleData *SD)> Action) {
3432 if (ScheduleData *SD = getScheduleData(V))
3433 Action(SD);
3434 auto I = ExtraScheduleDataMap.find(V);
3435 if (I != ExtraScheduleDataMap.end())
3436 for (auto &P : I->second)
3437 if (isInSchedulingRegion(P.second))
3438 Action(P.second);
3439 }
3440
3441 /// Put all instructions into the ReadyList which are ready for scheduling.
3442 template <typename ReadyListType>
3443 void initialFillReadyList(ReadyListType &ReadyList) {
3444 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3445 doForAllOpcodes(I, [&](ScheduleData *SD) {
3446 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3447 SD->isReady()) {
3448 ReadyList.insert(SD);
3449 LLVM_DEBUG(dbgs()
3450 << "SLP: initially in ready list: " << *SD << "\n");
3451 }
3452 });
3453 }
3454 }
3455
3456 /// Build a bundle from the ScheduleData nodes corresponding to the
3457 /// scalar instruction for each lane.
3458 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3459
3460 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3461 /// cyclic dependencies. This is only a dry-run, no instructions are
3462 /// actually moved at this stage.
3463 /// \returns the scheduling bundle. The returned Optional value is not
3464 /// std::nullopt if \p VL is allowed to be scheduled.
3465 std::optional<ScheduleData *>
3466 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3467 const InstructionsState &S);
3468
3469 /// Un-bundles a group of instructions.
3470 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3471
3472 /// Allocates schedule data chunk.
3473 ScheduleData *allocateScheduleDataChunks();
3474
3475 /// Extends the scheduling region so that V is inside the region.
3476 /// \returns true if the region size is within the limit.
3477 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3478
3479 /// Initialize the ScheduleData structures for new instructions in the
3480 /// scheduling region.
3481 void initScheduleData(Instruction *FromI, Instruction *ToI,
3482 ScheduleData *PrevLoadStore,
3483 ScheduleData *NextLoadStore);
3484
3485 /// Updates the dependency information of a bundle and of all instructions/
3486 /// bundles which depend on the original bundle.
3487 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3488 BoUpSLP *SLP);
3489
3490 /// Sets all instructions in the scheduling region to un-scheduled.
3491 void resetSchedule();
3492
3493 BasicBlock *BB;
3494
3495 /// Simple memory allocation for ScheduleData.
3497
3498 /// The size of a ScheduleData array in ScheduleDataChunks.
3499 int ChunkSize;
3500
3501 /// The allocator position in the current chunk, which is the last entry
3502 /// of ScheduleDataChunks.
3503 int ChunkPos;
3504
3505 /// Attaches ScheduleData to Instruction.
3506 /// Note that the mapping survives during all vectorization iterations, i.e.
3507 /// ScheduleData structures are recycled.
3508 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3509
3510 /// Attaches ScheduleData to Instruction with the leading key.
3511 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3512 ExtraScheduleDataMap;
3513
3514 /// The ready-list for scheduling (only used for the dry-run).
3515 SetVector<ScheduleData *> ReadyInsts;
3516
3517 /// The first instruction of the scheduling region.
3518 Instruction *ScheduleStart = nullptr;
3519
3520 /// The first instruction _after_ the scheduling region.
3521 Instruction *ScheduleEnd = nullptr;
3522
3523 /// The first memory accessing instruction in the scheduling region
3524 /// (can be null).
3525 ScheduleData *FirstLoadStoreInRegion = nullptr;
3526
3527 /// The last memory accessing instruction in the scheduling region
3528 /// (can be null).
3529 ScheduleData *LastLoadStoreInRegion = nullptr;
3530
3531 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3532 /// region? Used to optimize the dependence calculation for the
3533 /// common case where there isn't.
3534 bool RegionHasStackSave = false;
3535
3536 /// The current size of the scheduling region.
3537 int ScheduleRegionSize = 0;
3538
3539 /// The maximum size allowed for the scheduling region.
3540 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3541
3542 /// The ID of the scheduling region. For a new vectorization iteration this
3543 /// is incremented which "removes" all ScheduleData from the region.
3544 /// Make sure that the initial SchedulingRegionID is greater than the
3545 /// initial SchedulingRegionID in ScheduleData (which is 0).
3546 int SchedulingRegionID = 1;
3547 };
3548
3549 /// Attaches the BlockScheduling structures to basic blocks.
3551
3552 /// Performs the "real" scheduling. Done before vectorization is actually
3553 /// performed in a basic block.
3554 void scheduleBlock(BlockScheduling *BS);
3555
3556 /// List of users to ignore during scheduling and that don't need extracting.
3557 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3558
3559 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3560 /// sorted SmallVectors of unsigned.
3561 struct OrdersTypeDenseMapInfo {
3562 static OrdersType getEmptyKey() {
3563 OrdersType V;
3564 V.push_back(~1U);
3565 return V;
3566 }
3567
3568 static OrdersType getTombstoneKey() {
3569 OrdersType V;
3570 V.push_back(~2U);
3571 return V;
3572 }
3573
3574 static unsigned getHashValue(const OrdersType &V) {
3575 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3576 }
3577
3578 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3579 return LHS == RHS;
3580 }
3581 };
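 // Illustrative use (the map name below is assumed): the sentinel orders {~1U}
 // and {~2U} can never collide with a real permutation, which is what makes
 // OrdersType usable as a DenseMap key, e.g. for counting how often each
 // candidate order is requested:
 // \code
 //   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrdersUses;
 //   ++OrdersUses.try_emplace(OrdersType{1, 0, 3, 2}, 0).first->second;
 // \endcode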
3582
3583 // Analysis and block reference.
3584 Function *F;
3585 ScalarEvolution *SE;
3586 TargetTransformInfo *TTI;
3587 TargetLibraryInfo *TLI;
3588 LoopInfo *LI;
3589 DominatorTree *DT;
3590 AssumptionCache *AC;
3591 DemandedBits *DB;
3592 const DataLayout *DL;
3593 OptimizationRemarkEmitter *ORE;
3594
3595 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3596 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3597
3598 /// Instruction builder to construct the vectorized tree.
3599 IRBuilder<> Builder;
3600
3601 /// A map of scalar integer values to the smallest bit width with which they
3602 /// can legally be represented. The values map to (width, signed) pairs,
3603 /// where "width" indicates the minimum bit width and "signed" is True if the
3604 /// value must be signed-extended, rather than zero-extended, back to its
3605 /// original width.
3607};
3608
3609} // end namespace slpvectorizer
3610
3611template <> struct GraphTraits<BoUpSLP *> {
3612 using TreeEntry = BoUpSLP::TreeEntry;
3613
3614 /// NodeRef has to be a pointer per the GraphWriter.
3615 using NodeRef = TreeEntry *;
3616
3617 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3618
3619 /// Add the VectorizableTree to the index iterator to be able to return
3620 /// TreeEntry pointers.
3621 struct ChildIteratorType
3622 : public iterator_adaptor_base<
3623 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3625
3627 ContainerTy &VT)
3628 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3629
3630 NodeRef operator*() { return I->UserTE; }
3631 };
3632
3633 static NodeRef getEntryNode(BoUpSLP &R) {
3634 return R.VectorizableTree[0].get();
3635 }
3636
3637 static ChildIteratorType child_begin(NodeRef N) {
3638 return {N->UserTreeIndices.begin(), N->Container};
3639 }
3640
3641 static ChildIteratorType child_end(NodeRef N) {
3642 return {N->UserTreeIndices.end(), N->Container};
3643 }
3644
3645 /// For the node iterator we just need to turn the TreeEntry iterator into a
3646 /// TreeEntry* iterator so that it dereferences to NodeRef.
3647 class nodes_iterator {
3649 ItTy It;
3650
3651 public:
3652 nodes_iterator(const ItTy &It2) : It(It2) {}
3653 NodeRef operator*() { return It->get(); }
3654 nodes_iterator operator++() {
3655 ++It;
3656 return *this;
3657 }
3658 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3659 };
3660
3661 static nodes_iterator nodes_begin(BoUpSLP *R) {
3662 return nodes_iterator(R->VectorizableTree.begin());
3663 }
3664
3665 static nodes_iterator nodes_end(BoUpSLP *R) {
3666 return nodes_iterator(R->VectorizableTree.end());
3667 }
3668
3669 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3670};
3671
3672template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3673 using TreeEntry = BoUpSLP::TreeEntry;
3674
3675 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3676
3677 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3678 std::string Str;
3679 raw_string_ostream OS(Str);
3680 OS << Entry->Idx << ".\n";
3681 if (isSplat(Entry->Scalars))
3682 OS << "<splat> ";
3683 for (auto *V : Entry->Scalars) {
3684 OS << *V;
3685 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3686 return EU.Scalar == V;
3687 }))
3688 OS << " <extract>";
3689 OS << "\n";
3690 }
3691 return Str;
3692 }
3693
3694 static std::string getNodeAttributes(const TreeEntry *Entry,
3695 const BoUpSLP *) {
3696 if (Entry->State == TreeEntry::NeedToGather)
3697 return "color=red";
3698 if (Entry->State == TreeEntry::ScatterVectorize ||
3699 Entry->State == TreeEntry::StridedVectorize)
3700 return "color=blue";
3701 return "";
3702 }
3703};
3704
3705} // end namespace llvm
3706
3707BoUpSLP::~BoUpSLP() {
3708 SmallVector<WeakTrackingVH> DeadInsts;
3709 for (auto *I : DeletedInstructions) {
3710 for (Use &U : I->operands()) {
3711 auto *Op = dyn_cast<Instruction>(U.get());
3712 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3713 wouldInstructionBeTriviallyDead(Op, TLI))
3714 DeadInsts.emplace_back(Op);
3715 }
3716 I->dropAllReferences();
3717 }
3718 for (auto *I : DeletedInstructions) {
3719 assert(I->use_empty() &&
3720 "trying to erase instruction with users.");
3721 I->eraseFromParent();
3722 }
3723
3724 // Cleanup any dead scalar code feeding the vectorized instructions
3725 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3726
3727#ifdef EXPENSIVE_CHECKS
3728 // If we could guarantee that this call is not extremely slow, we could
3729 // remove the ifdef limitation (see PR47712).
3730 assert(!verifyFunction(*F, &dbgs()));
3731#endif
3732}
3733
3734/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3735/// contains the original mask for the scalars reused in the node. The
3736/// procedure transforms this mask in accordance with the given \p Mask.
3737static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3738 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3739 "Expected non-empty mask.");
3740 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3741 Prev.swap(Reuses);
3742 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3743 if (Mask[I] != PoisonMaskElem)
3744 Reuses[Mask[I]] = Prev[I];
3745}
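// A minimal illustration of the transform above (assumed example values):
// \code
//   SmallVector<int> Reuses = {0, 1, 2, 3};
//   SmallVector<int> Mask   = {2, 0, 1, 3};
//   reorderReuses(Reuses, Mask); // old element I moves to slot Mask[I],
//                                // so Reuses becomes {1, 2, 0, 3}
// \endcode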
3746
3747/// Reorders the given \p Order according to the given \p Mask. \p Order is
3748/// the original order of the scalars. The procedure transforms the provided
3749/// order in accordance with the given \p Mask. If the resulting \p Order is
3750/// just an identity order, \p Order is cleared.
3751static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
3752 bool BottomOrder = false) {
3753 assert(!Mask.empty() && "Expected non-empty mask.");
3754 unsigned Sz = Mask.size();
3755 if (BottomOrder) {
3756 SmallVector<unsigned> PrevOrder;
3757 if (Order.empty()) {
3758 PrevOrder.resize(Sz);
3759 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
3760 } else {
3761 PrevOrder.swap(Order);
3762 }
3763 Order.assign(Sz, Sz);
3764 for (unsigned I = 0; I < Sz; ++I)
3765 if (Mask[I] != PoisonMaskElem)
3766 Order[I] = PrevOrder[Mask[I]];
3767 if (all_of(enumerate(Order), [&](const auto &Data) {
3768 return Data.value() == Sz || Data.index() == Data.value();
3769 })) {
3770 Order.clear();
3771 return;
3772 }
3773 fixupOrderingIndices(Order);
3774 return;
3775 }
3776 SmallVector<int> MaskOrder;
3777 if (Order.empty()) {
3778 MaskOrder.resize(Sz);
3779 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3780 } else {
3781 inversePermutation(Order, MaskOrder);
3782 }
3783 reorderReuses(MaskOrder, Mask);
3784 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
3785 Order.clear();
3786 return;
3787 }
3788 Order.assign(Sz, Sz);
3789 for (unsigned I = 0; I < Sz; ++I)
3790 if (MaskOrder[I] != PoisonMaskElem)
3791 Order[MaskOrder[I]] = I;
3792 fixupOrderingIndices(Order);
3793}
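// Illustrative behaviour (assumed values): with an empty Order (treated as
// identity) and Mask = {2, 0, 1, 3}, the non-bottom-order path yields
// Order = {2, 0, 1, 3}; if the combined mask ends up being an identity mask,
// Order is cleared instead.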
3794
3795std::optional<BoUpSLP::OrdersType>
3796BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3797 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3798 // Try to find subvector extract/insert patterns and reorder only such
3799 // patterns.
3800 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
3801 Type *ScalarTy = GatheredScalars.front()->getType();
3802 int NumScalars = GatheredScalars.size();
3803 if (!isValidElementType(ScalarTy))
3804 return std::nullopt;
3805 auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
3806 int NumParts = TTI->getNumberOfParts(VecTy);
3807 if (NumParts == 0 || NumParts >= NumScalars)
3808 NumParts = 1;
3809 SmallVector<int> ExtractMask;
3810 SmallVector<int> Mask;
3811 SmallVector<SmallVector<const TreeEntry *>> Entries;
3812 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
3813 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
3814 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
3815 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
3816 /*ForOrder=*/true);
3817 // No shuffled operands - ignore.
3818 if (GatherShuffles.empty() && ExtractShuffles.empty())
3819 return std::nullopt;
3820 OrdersType CurrentOrder(NumScalars, NumScalars);
3821 if (GatherShuffles.size() == 1 &&
3822 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
3823 Entries.front().front()->isSame(TE.Scalars)) {
3824 // Perfect match in the graph, will reuse the previously vectorized
3825 // node. Cost is 0.
3826 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
3827 return CurrentOrder;
3828 }
3829 auto IsSplatMask = [](ArrayRef<int> Mask) {
3830 int SingleElt = PoisonMaskElem;
3831 return all_of(Mask, [&](int I) {
3832 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
3833 SingleElt = I;
3834 return I == PoisonMaskElem || I == SingleElt;
3835 });
3836 };
3837 // Exclusive broadcast mask - ignore.
3838 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
3839 (Entries.size() != 1 ||
3840 Entries.front().front()->ReorderIndices.empty())) ||
3841 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
3842 return std::nullopt;
3843 SmallBitVector ShuffledSubMasks(NumParts);
3844 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
3845 ArrayRef<int> Mask, int PartSz, int NumParts,
3846 function_ref<unsigned(unsigned)> GetVF) {
3847 for (int I : seq<int>(0, NumParts)) {
3848 if (ShuffledSubMasks.test(I))
3849 continue;
3850 const int VF = GetVF(I);
3851 if (VF == 0)
3852 continue;
3853 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
3854 // Shuffle of at least 2 vectors - ignore.
3855 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
3856 std::fill(Slice.begin(), Slice.end(), NumScalars);
3857 ShuffledSubMasks.set(I);
3858 continue;
3859 }
3860 // Try to include as many elements from the mask as possible.
3861 int FirstMin = INT_MAX;
3862 int SecondVecFound = false;
3863 for (int K : seq<int>(0, PartSz)) {
3864 int Idx = Mask[I * PartSz + K];
3865 if (Idx == PoisonMaskElem) {
3866 Value *V = GatheredScalars[I * PartSz + K];
3867 if (isConstant(V) && !isa<PoisonValue>(V)) {
3868 SecondVecFound = true;
3869 break;
3870 }
3871 continue;
3872 }
3873 if (Idx < VF) {
3874 if (FirstMin > Idx)
3875 FirstMin = Idx;
3876 } else {
3877 SecondVecFound = true;
3878 break;
3879 }
3880 }
3881 FirstMin = (FirstMin / PartSz) * PartSz;
3882 // Shuffle of at least 2 vectors - ignore.
3883 if (SecondVecFound) {
3884 std::fill(Slice.begin(), Slice.end(), NumScalars);
3885 ShuffledSubMasks.set(I);
3886 continue;
3887 }
3888 for (int K : seq<int>(0, PartSz)) {
3889 int Idx = Mask[I * PartSz + K];
3890 if (Idx == PoisonMaskElem)
3891 continue;
3892 Idx -= FirstMin;
3893 if (Idx >= PartSz) {
3894 SecondVecFound = true;
3895 break;
3896 }
3897 if (CurrentOrder[I * PartSz + Idx] >
3898 static_cast<unsigned>(I * PartSz + K) &&
3899 CurrentOrder[I * PartSz + Idx] !=
3900 static_cast<unsigned>(I * PartSz + Idx))
3901 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
3902 }
3903 // Shuffle of at least 2 vectors - ignore.
3904 if (SecondVecFound) {
3905 std::fill(Slice.begin(), Slice.end(), NumScalars);
3906 ShuffledSubMasks.set(I);
3907 continue;
3908 }
3909 }
3910 };
3911 int PartSz = NumScalars / NumParts;
3912 if (!ExtractShuffles.empty())
3913 TransformMaskToOrder(
3914 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
3915 if (!ExtractShuffles[I])
3916 return 0U;
3917 unsigned VF = 0;
3918 for (unsigned Idx : seq<unsigned>(0, PartSz)) {
3919 int K = I * PartSz + Idx;
3920 if (ExtractMask[K] == PoisonMaskElem)
3921 continue;
3922 if (!TE.ReuseShuffleIndices.empty())
3923 K = TE.ReuseShuffleIndices[K];
3924 if (!TE.ReorderIndices.empty())
3925 K = std::distance(TE.ReorderIndices.begin(),
3926 find(TE.ReorderIndices, K));
3927 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
3928 if (!EI)
3929 continue;
3930 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
3931 ->getElementCount()
3932 .getKnownMinValue());
3933 }
3934 return VF;
3935 });
3936 // Check special corner case - single shuffle of the same entry.
3937 if (GatherShuffles.size() == 1 && NumParts != 1) {
3938 if (ShuffledSubMasks.any())
3939 return std::nullopt;
3940 PartSz = NumScalars;
3941 NumParts = 1;
3942 }
3943 if (!Entries.empty())
3944 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
3945 if (!GatherShuffles[I])
3946 return 0U;
3947 return std::max(Entries[I].front()->getVectorFactor(),
3948 Entries[I].back()->getVectorFactor());
3949 });
3950 int NumUndefs =
3951 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
3952 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
3953 return std::nullopt;
3954 return std::move(CurrentOrder);
3955}
3956
3957namespace {
3958/// Describes how the loads in the given sequence can be represented: gathered, vectorized, scatter-vectorized, or strided.
3959enum class LoadsState { Gather, Vectorize, ScatterVectorize, StridedVectorize };
3960} // anonymous namespace
3961
3962static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
3963 const TargetLibraryInfo &TLI,
3964 bool CompareOpcodes = true) {
3965 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
3966 return false;
3967 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
3968 if (!GEP1)
3969 return false;
3970 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
3971 if (!GEP2)
3972 return false;
3973 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
3974 ((isConstant(GEP1->getOperand(1)) &&
3975 isConstant(GEP2->getOperand(1))) ||
3976 !CompareOpcodes ||
3977 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
3978 .getOpcode());
3979}
3980
3981/// Calculates the minimal alignment common to all values in \p VL.
3982template <typename T>
3983static Align computeCommonAlignment(ArrayRef<Value *> VL) {
3984 Align CommonAlignment = cast<T>(VL.front())->getAlign();
3985 for (Value *V : VL.drop_front())
3986 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
3987 return CommonAlignment;
3988}
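// Example (illustrative; VL is assumed to hold three LoadInsts with alignments
// of 16, 8 and 4 bytes):
// \code
//   Align CA = computeCommonAlignment<LoadInst>(VL); // CA == Align(4)
// \endcode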
3989
3990/// Check if \p Order represents reverse order.
3991static bool isReverseOrder(ArrayRef<unsigned> Order) {
3992 unsigned Sz = Order.size();
3993 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
3994 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
3995 });
3996}
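// Example (illustrative): {3, 2, 1, 0} is a reverse order; so is {4, 2, 1, 0}
// for Sz == 4, because entries equal to Sz are treated as "don't care" slots.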
3997
3998/// Checks whether the given array of loads can be represented as a wide
3999/// vector load, a scatter/strided load, or only as a simple gather.
4000static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
4001 const TargetTransformInfo &TTI,
4002 const DataLayout &DL, ScalarEvolution &SE,
4003 LoopInfo &LI, const TargetLibraryInfo &TLI,
4004 SmallVectorImpl<unsigned> &Order,
4005 SmallVectorImpl<Value *> &PointerOps) {
4006 // Check that a vectorized load would load the same memory as a scalar
4007 // load. For example, we don't want to vectorize loads that are smaller
4008 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4009 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4010 // from such a struct, we read/write packed bits disagreeing with the
4011 // unvectorized version.
4012 Type *ScalarTy = VL0->getType();
4013
4014 if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
4015 return LoadsState::Gather;
4016
4017 // Make sure all loads in the bundle are simple - we can't vectorize
4018 // atomic or volatile loads.
4019 PointerOps.clear();
4020 const unsigned Sz = VL.size();
4021 PointerOps.resize(Sz);
4022 auto *POIter = PointerOps.begin();
4023 for (Value *V : VL) {
4024 auto *L = cast<LoadInst>(V);
4025 if (!L->isSimple())
4026 return LoadsState::Gather;
4027 *POIter = L->getPointerOperand();
4028 ++POIter;
4029 }
4030
4031 Order.clear();
4032 auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
4033 // Check the order of pointer operands or that all pointers are the same.
4034 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
4035 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4036 return arePointersCompatible(P, PointerOps.front(), TLI);
4037 })) {
4038 if (IsSorted) {
4039 Value *Ptr0;
4040 Value *PtrN;
4041 if (Order.empty()) {
4042 Ptr0 = PointerOps.front();
4043 PtrN = PointerOps.back();
4044 } else {
4045 Ptr0 = PointerOps[Order.front()];
4046 PtrN = PointerOps[Order.back()];
4047 }
4048 std::optional<int> Diff =
4049 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
4050 // Check that the sorted loads are consecutive.
4051 if (static_cast<unsigned>(*Diff) == Sz - 1)
4052 return LoadsState::Vectorize;
4053 // Simple check if not a strided access - clear order.
4054 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4055 // Try to generate strided load node if:
4056 // 1. Target with strided load support is detected.
4057 // 2. The number of loads is greater than MinProfitableStridedLoads,
4058 // or the potential stride <= MaxProfitableLoadStride and the
4059 // potential stride is power-of-2 (to avoid perf regressions for the very
4060 // small number of loads) and max distance > number of loads, or potential
4061 // stride is -1.
4062 // 3. The loads are ordered, or number of unordered loads <=
4063 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4064 // (this check is to avoid extra costs for very expensive shuffles).
4065 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4066 (static_cast<unsigned>(std::abs(*Diff)) <=
4068 isPowerOf2_32(std::abs(*Diff)))) &&
4069 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4070 *Diff == -(static_cast<int>(Sz) - 1))) {
4071 int Stride = *Diff / static_cast<int>(Sz - 1);
4072 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4073 Align Alignment =
4074 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4075 ->getAlign();
4076 if (TTI.isLegalStridedLoadStore(VecTy, Alignment)) {
4077 // Iterate through all pointers and check if all distances are
4078 // unique multiple of Dist.
4079 SmallSet<int, 4> Dists;
4080 for (Value *Ptr : PointerOps) {
4081 int Dist = 0;
4082 if (Ptr == PtrN)
4083 Dist = *Diff;
4084 else if (Ptr != Ptr0)
4085 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
4086 // If the strides are not the same or repeated, we can't
4087 // vectorize.
4088 if (((Dist / Stride) * Stride) != Dist ||
4089 !Dists.insert(Dist).second)
4090 break;
4091 }
4092 if (Dists.size() == Sz)
4093 return LoadsState::StridedVectorize;
4094 }
4095 }
4096 }
4097 }
4098 // TODO: need to improve analysis of the pointers, if not all of them are
4099 // GEPs or have > 2 operands, we end up with a gather node, which just
4100 // increases the cost.
4101 Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
4102 bool ProfitableGatherPointers =
4103 static_cast<unsigned>(count_if(
4104 PointerOps,
4105 [L](Value *V) { return L && L->isLoopInvariant(V); })) <= Sz / 2 &&
4106 Sz > 2;
4107 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4108 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4109 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4110 (GEP && GEP->getNumOperands() == 2);
4111 })) {
4112 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4113 if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
4114 !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
4115 return LoadsState::ScatterVectorize;
4116 }
4117 }
4118
4119 return LoadsState::Gather;
4120}
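// Rough intuition (illustrative only; the final answer also depends on the
// stride heuristics and the target hooks queried above): consecutive pointers
// such as p[0], p[1], p[2], p[3] map to LoadsState::Vectorize; equally spaced
// pointers like p[0], p[3], p[6], p[9] may become LoadsState::StridedVectorize;
// arbitrary pointers fall back to ScatterVectorize (masked gather) or Gather.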
4121
4122static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4123 const DataLayout &DL, ScalarEvolution &SE,
4124 SmallVectorImpl<unsigned> &SortedIndices) {
4125 assert(llvm::all_of(
4126 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4127 "Expected list of pointer operands.");
4128 // Map from bases to a vector of (Ptr, Offset, OrigIdx) triples: we insert
4129 // each pointer under its base, sort each base's vector by offset, and
4130 // return sorted indices that put same-base values next to one another.
4131 DenseMap<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4132 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4133
4134 unsigned Cnt = 1;
4135 for (Value *Ptr : VL.drop_front()) {
4136 bool Found = any_of(Bases, [&](auto &Base) {
4137 std::optional<int> Diff =
4138 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4139 /*StrictCheck=*/true);
4140 if (!Diff)
4141 return false;
4142
4143 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4144 return true;
4145 });
4146
4147 if (!Found) {
4148 // If we haven't found enough to usefully cluster, return early.
4149 if (Bases.size() > VL.size() / 2 - 1)
4150 return false;
4151
4152 // Not found already - add a new Base
4153 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4154 }
4155 }
4156
4157 // For each of the bases, sort the pointers by offset and check if any of
4158 // the bases become consecutively allocated.
4159 bool AnyConsecutive = false;
4160 for (auto &Base : Bases) {
4161 auto &Vec = Base.second;
4162 if (Vec.size() > 1) {
4163 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4164 const std::tuple<Value *, int, unsigned> &Y) {
4165 return std::get<1>(X) < std::get<1>(Y);
4166 });
4167 int InitialOffset = std::get<1>(Vec[0]);
4168 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4169 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4170 });
4171 }
4172 }
4173
4174 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4175 SortedIndices.clear();
4176 if (!AnyConsecutive)
4177 return false;
4178
4179 for (auto &Base : Bases) {
4180 for (auto &T : Base.second)
4181 SortedIndices.push_back(std::get<2>(T));
4182 }
4183
4184 assert(SortedIndices.size() == VL.size() &&
4185 "Expected SortedIndices to be the size of VL");
4186 return true;
4187}
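// Example (illustrative; A and B are assumed to be unrelated base pointers):
// for VL = {A, B, A+1, B+1} the function clusters the pointers per base and
// returns SortedIndices that put same-base accesses next to each other, e.g.
// {0, 2, 1, 3} (the relative order of the two groups is not specified).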
4188
4189std::optional<BoUpSLP::OrdersType>
4190BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4191 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4192 Type *ScalarTy = TE.Scalars[0]->getType();
4193
4195 Ptrs.reserve(TE.Scalars.size());
4196 for (Value *V : TE.Scalars) {
4197 auto *L = dyn_cast<LoadInst>(V);
4198 if (!L || !L->isSimple())
4199 return std::nullopt;
4200 Ptrs.push_back(L->getPointerOperand());
4201 }
4202
4203 BoUpSLP::OrdersType Order;
4204 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4205 return std::move(Order);
4206 return std::nullopt;
4207}
4208
4209/// Check if two insertelement instructions are from the same buildvector.
4210static bool areTwoInsertFromSameBuildVector(
4211 InsertElementInst *VU, InsertElementInst *V,
4212 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4213 // Instructions must be from the same basic blocks.
4214 if (VU->getParent() != V->getParent())
4215 return false;
4216 // Checks if 2 insertelements are from the same buildvector.
4217 if (VU->getType() != V->getType())
4218 return false;
4219 // Multiple used inserts are separate nodes.
4220 if (!VU->hasOneUse() && !V->hasOneUse())
4221 return false;
4222 auto *IE1 = VU;
4223 auto *IE2 = V;
4224 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4225 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4226 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4227 return false;
4228 // Go through the vector operand of insertelement instructions trying to find
4229 // either VU as the original vector for IE2 or V as the original vector for
4230 // IE1.
4231 SmallBitVector ReusedIdx(
4232 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4233 bool IsReusedIdx = false;
4234 do {
4235 if (IE2 == VU && !IE1)
4236 return VU->hasOneUse();
4237 if (IE1 == V && !IE2)
4238 return V->hasOneUse();
4239 if (IE1 && IE1 != V) {
4240 unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4241 IsReusedIdx |= ReusedIdx.test(Idx1);
4242 ReusedIdx.set(Idx1);
4243 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4244 IE1 = nullptr;
4245 else
4246 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4247 }
4248 if (IE2 && IE2 != VU) {
4249 unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4250 IsReusedIdx |= ReusedIdx.test(Idx2);
4251 ReusedIdx.set(Idx2);
4252 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4253 IE2 = nullptr;
4254 else
4255 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4256 }
4257 } while (!IsReusedIdx && (IE1 || IE2));
4258 return false;
4259}
4260
4261std::optional<BoUpSLP::OrdersType>
4262BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4263 // No need to reorder if need to shuffle reuses, still need to shuffle the
4264 // node.
4265 if (!TE.ReuseShuffleIndices.empty()) {
4266 if (isSplat(TE.Scalars))
4267 return std::nullopt;
4268 // Check if reuse shuffle indices can be improved by reordering.
4269 // For this, check that the reuse mask is "clustered", i.e. each scalar value
4270 // is used once in each submask of size <number_of_scalars>.
4271 // Example: 4 scalar values.
4272 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4273 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4274 // element 3 is used twice in the second submask.
4275 unsigned Sz = TE.Scalars.size();
4276 if (TE.State == TreeEntry::NeedToGather) {
4277 if (std::optional<OrdersType> CurrentOrder =
4278 findReusedOrderedScalars(TE)) {
4279 SmallVector<int> Mask;
4280 fixupOrderingIndices(*CurrentOrder);
4281 inversePermutation(*CurrentOrder, Mask);
4282 ::addMask(Mask, TE.ReuseShuffleIndices);
4283 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4284 unsigned Sz = TE.Scalars.size();
4285 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4286 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4287 if (Idx != PoisonMaskElem)
4288 Res[Idx + K * Sz] = I + K * Sz;
4289 }
4290 return std::move(Res);
4291 }
4292 }
4293 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4294 TTI->getNumberOfParts(FixedVectorType::get(
4295 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4296 return std::nullopt;
4297 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4298 Sz)) {
4299 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4300 if (TE.ReorderIndices.empty())
4301 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4302 else
4303 inversePermutation(TE.ReorderIndices, ReorderMask);
4304 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4305 unsigned VF = ReorderMask.size();
4306 OrdersType ResOrder(VF, VF);
4307 unsigned NumParts = VF / Sz;
4308 SmallBitVector UsedVals(NumParts);
4309 for (unsigned I = 0; I < VF; I += Sz) {
4310 int Val = PoisonMaskElem;
4311 unsigned UndefCnt = 0;
4312 if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
4313 [&](int Idx) {
4314 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4315 Val = Idx;
4316 if (Idx == PoisonMaskElem)
4317 ++UndefCnt;
4318 return Idx != PoisonMaskElem && Idx != Val;
4319 }) ||
4320 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4321 UndefCnt > Sz / 2)
4322 return std::nullopt;
4323 UsedVals.set(Val);
4324 for (unsigned K = 0; K < NumParts; ++K)
4325 ResOrder[Val + Sz * K] = I + K;
4326 }
4327 return std::move(ResOrder);
4328 }
4329 unsigned VF = TE.getVectorFactor();
4330 // Try to build the correct order for extractelement instructions.
4331 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4332 TE.ReuseShuffleIndices.end());
4333 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4334 all_of(TE.Scalars, [Sz](Value *V) {
4335 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4336 return Idx && *Idx < Sz;
4337 })) {
4338 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4339 if (TE.ReorderIndices.empty())
4340 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4341 else
4342 inversePermutation(TE.ReorderIndices, ReorderMask);
4343 for (unsigned I = 0; I < VF; ++I) {
4344 int &Idx = ReusedMask[I];
4345 if (Idx == PoisonMaskElem)
4346 continue;
4347 Value *V = TE.Scalars[ReorderMask[Idx]];
4348 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4349 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4350 }
4351 }
4352 // Build the order of VF size; the reuses shuffles need to be reordered as
4353 // they are always of VF size.
4354 OrdersType ResOrder(VF);
4355 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4356 auto *It = ResOrder.begin();
4357 for (unsigned K = 0; K < VF; K += Sz) {
4358 OrdersType CurrentOrder(TE.ReorderIndices);
4359 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4360 if (SubMask.front() == PoisonMaskElem)
4361 std::iota(SubMask.begin(), SubMask.end(), 0);
4362 reorderOrder(CurrentOrder, SubMask);
4363 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4364 std::advance(It, Sz);
4365 }
4366 if (TE.State == TreeEntry::NeedToGather &&
4367 all_of(enumerate(ResOrder),
4368 [](const auto &Data) { return Data.index() == Data.value(); }))
4369 return std::nullopt; // No need to reorder.
4370 return std::move(ResOrder);
4371 }
4372 if ((TE.State == TreeEntry::Vectorize ||
4373 TE.State == TreeEntry::StridedVectorize) &&
4374 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4375 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4376 !TE.isAltShuffle())
4377 return TE.ReorderIndices;
4378 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4379 auto PHICompare = [&](unsigned I1, unsigned I2) {
4380 Value *V1 = TE.Scalars[I1];
4381 Value *V2 = TE.Scalars[I2];
4382 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4383 return false;
4384 if (V1->getNumUses() < V2->getNumUses())
4385 return true;
4386 if (V1->getNumUses() > V2->getNumUses())
4387 return false;
4388 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4389 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4390 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4391 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4392 if (!areTwoInsertFromSameBuildVector(
4393 IE1, IE2,
4394 [](InsertElementInst *II) { return II->getOperand(0); }))
4395 return false;
4396 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4397 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4398 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4399 return false;
4400 return *Idx1 < *Idx2;
4401 }
4402 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4403 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4404 if (EE1->getOperand(0) != EE2->getOperand(0))
4405 return false;
4406 std::optional<unsigned> Idx1 = getExtractIndex(EE1);
4407 std::optional<unsigned> Idx2 = getExtractIndex(EE2);
4408 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4409 return false;
4410 return *Idx1 < *Idx2;
4411 }
4412 return false;
4413 };
4414 auto IsIdentityOrder = [](const OrdersType &Order) {
4415 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4416 if (Idx != Order[Idx])
4417 return false;
4418 return true;
4419 };
4420 if (!TE.ReorderIndices.empty())
4421 return TE.ReorderIndices;
4422 DenseMap<unsigned, unsigned> PhiToId;
4423 SmallVector<unsigned> Phis(TE.Scalars.size());
4424 std::iota(Phis.begin(), Phis.end(), 0);
4425 OrdersType ResOrder(TE.Scalars.size());
4426 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4427 PhiToId[Id] = Id;
4428 stable_sort(Phis, PHICompare);
4429 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4430 ResOrder[Id] = PhiToId[Phis[Id]];
4431 if (IsIdentityOrder(ResOrder))
4432 return std::nullopt; // No need to reorder.
4433 return std::move(ResOrder);
4434 }
4435 if (TE.State == TreeEntry::NeedToGather) {
4436 // TODO: add analysis of other gather nodes with extractelement
4437 // instructions and other values/instructions, not only undefs.
4438 if (((TE.getOpcode() == Instruction::ExtractElement &&
4439 !TE.isAltShuffle()) ||
4440 (all_of(TE.Scalars,
4441 [](Value *V) {
4442 return isa<UndefValue, ExtractElementInst>(V);
4443 }) &&
4444 any_of(TE.Scalars,
4445 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
4446 all_of(TE.Scalars,
4447 [](Value *V) {
4448 auto *EE = dyn_cast<ExtractElementInst>(V);
4449 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4450 }) &&
4451 allSameType(TE.Scalars)) {
4452 // Check that gather of extractelements can be represented as
4453 // just a shuffle of a single vector.
4454 OrdersType CurrentOrder;
4455 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4456 /*ResizeAllowed=*/true);
4457 if (Reuse || !CurrentOrder.empty())
4458 return std::move(CurrentOrder);
4459 }
4460 // If the gather node is <undef, v, .., poison> and
4461 // insertelement poison, v, 0 [+ permute]
4462 // is cheaper than
4463 // insertelement poison, v, n - try to reorder.
4464 // If rotating the whole graph, exclude the permute cost, the whole graph
4465 // might be transformed.
4466 int Sz = TE.Scalars.size();
4467 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4468 count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
4469 const auto *It =
4470 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4471 if (It == TE.Scalars.begin())
4472 return OrdersType();
4473 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4474 if (It != TE.Scalars.end()) {
4475 OrdersType Order(Sz, Sz);
4476 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4477 Order[Idx] = 0;
4478 fixupOrderingIndices(Order);
4479 SmallVector<int> Mask;
4480 inversePermutation(Order, Mask);
4481 InstructionCost PermuteCost =
4482 TopToBottom
4483 ? 0
4484 : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
4485 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4486 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4487 PoisonValue::get(Ty), *It);
4488 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4489 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4490 PoisonValue::get(Ty), *It);
4491 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4492 OrdersType Order(Sz, Sz);
4493 Order[Idx] = 0;
4494 return std::move(Order);
4495 }
4496 }
4497 }
4498 if (isSplat(TE.Scalars))
4499 return std::nullopt;
4500 if (TE.Scalars.size() >= 4)
4501 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4502 return Order;
4503 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4504 return CurrentOrder;
4505 }
4506 return std::nullopt;
4507}
4508
4509/// Checks if the given mask is a "clustered" mask with the same clusters of
4510/// size \p Sz, which are not identity submasks.
4511static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4512 unsigned Sz) {
4513 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4514 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
4515 return false;
4516 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4517 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4518 if (Cluster != FirstCluster)
4519 return false;
4520 }
4521 return true;
4522}
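// Example (illustrative): with Sz == 4, the mask {1, 0, 3, 2, 1, 0, 3, 2} is a
// repeated non-identity cluster, so the function returns true, while
// {0, 1, 2, 3, 0, 1, 2, 3} returns false because its first cluster is an
// identity submask.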
4523
4524void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4525 // Reorder reuses mask.
4526 reorderReuses(TE.ReuseShuffleIndices, Mask);
4527 const unsigned Sz = TE.Scalars.size();
4528 // For vectorized nodes and non-clustered reuses there is nothing else to do.
4529 if (TE.State != TreeEntry::NeedToGather ||
4530 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4531 Sz) ||
4532 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4533 return;
4534 SmallVector<int> NewMask;
4535 inversePermutation(TE.ReorderIndices, NewMask);
4536 addMask(NewMask, TE.ReuseShuffleIndices);
4537 // Clear reorder since it is going to be applied to the new mask.
4538 TE.ReorderIndices.clear();
4539 // Try to improve gathered nodes with clustered reuses, if possible.
4540 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4541 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4542 inversePermutation(NewOrder, NewMask);
4543 reorderScalars(TE.Scalars, NewMask);
4544 // Fill the reuses mask with the identity submasks.
4545 for (auto *It = TE.ReuseShuffleIndices.begin(),
4546 *End = TE.ReuseShuffleIndices.end();
4547 It != End; std::advance(It, Sz))
4548 std::iota(It, std::next(It, Sz), 0);
4549}
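// Sketch of the clustered-reuse cleanup above (an illustration, assuming the
// reorderReuses / inversePermutation / reorderScalars helpers behave as their
// names suggest): a gather node whose already re-ordered reuse mask repeats a
// single non-identity cluster, e.g. {1, 0, 1, 0} for Sz == 2, has its scalars
// permuted by that cluster and its reuse mask rewritten to the identity
// submasks {0, 1, 0, 1}, so the repeated cluster no longer needs an extra
// shuffle.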
4550
4551 static void combineOrders(MutableArrayRef<unsigned> Order,
4552 ArrayRef<unsigned> SecondaryOrder) {
4553 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
4554 "Expected same size of orders");
4555 unsigned Sz = Order.size();
4556 SmallBitVector UsedIndices(Sz);
4557 for (unsigned Idx : seq<unsigned>(0, Sz)) {
4558 if (Order[Idx] != Sz)
4559 UsedIndices.set(Order[Idx]);
4560 }
4561 if (SecondaryOrder.empty()) {
4562 for (unsigned Idx : seq<unsigned>(0, Sz))
4563 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
4564 Order[Idx] = Idx;
4565 } else {
4566 for (unsigned Idx : seq<unsigned>(0, Sz))
4567 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
4568 !UsedIndices.test(SecondaryOrder[Idx]))
4569 Order[Idx] = SecondaryOrder[Idx];
4570 }
4571}
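// Example (illustration only): for Sz == 4 an unset slot is encoded as 4.
// combineOrders({2, 4, 0, 4}, /*SecondaryOrder=*/{}) fills each unset slot
// with its own position when that position is still unused, giving
// {2, 1, 0, 3}. With a non-empty SecondaryOrder, unset slots are instead
// taken from SecondaryOrder, provided the target index is not already used.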
4572
4573 void BoUpSLP::reorderTopToBottom() {
4574 // Maps VF to the graph nodes.
4575 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
4576 // ExtractElement gather nodes which can be vectorized and need to handle
4577 // their ordering.
4578 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4579
4580 // Phi nodes can have preferred ordering based on their result users.
4581 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
4582
4583 // AltShuffles can also have a preferred ordering that leads to fewer
4584 // instructions, e.g., the addsub instruction in x86.
4585 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4586
4587 // Maps a TreeEntry to the reorder indices of external users.
4588 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
4589 ExternalUserReorderMap;
4590 // FIXME: Workaround for syntax error reported by MSVC buildbots.
4591 TargetTransformInfo &TTIRef = *TTI;
4592 // Find all reorderable nodes with the given VF.
4593 // Currently these are vectorized stores, loads, extracts + some gathering of
4594 // extracts.
4595 for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
4596 &GathersToOrders, &ExternalUserReorderMap,
4597 &AltShufflesToOrders, &PhisToOrders](
4598 const std::unique_ptr<TreeEntry> &TE) {
4599 // Look for external users that will probably be vectorized.
4600 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4601 findExternalStoreUsersReorderIndices(TE.get());
4602 if (!ExternalUserReorderIndices.empty()) {
4603 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4604 ExternalUserReorderMap.try_emplace(TE.get(),
4605 std::move(ExternalUserReorderIndices));
4606 }
4607
4608 // Patterns like [fadd,fsub] can be combined into a single instruction in
4609 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4610 // to take into account their order when looking for the most used order.
4611 if (TE->isAltShuffle()) {
4612 VectorType *VecTy =
4613 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
4614 unsigned Opcode0 = TE->getOpcode();
4615 unsigned Opcode1 = TE->getAltOpcode();
4616 // The opcode mask selects between the two opcodes.
4617 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4618 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4619 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4620 OpcodeMask.set(Lane);
4621 // If this pattern is supported by the target then we consider the order.
4622 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4623 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4624 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
4625 }
4626 // TODO: Check the reverse order too.
4627 }
4628
4629 if (std::optional<OrdersType> CurrentOrder =
4630 getReorderingData(*TE, /*TopToBottom=*/true)) {
4631 // Do not include ordering for nodes used in the alt opcode vectorization,
4632 // better to reorder them during the bottom-to-top stage. If we follow the
4633 // order here, it causes reordering of the whole graph, though actually it is
4634 // profitable just to reorder the subgraph that starts from the alternate
4635 // opcode vectorization node. Such nodes already end up with a shuffle
4636 // instruction and it is enough just to change this shuffle rather than
4637 // rotate the scalars for the whole graph.
4638 unsigned Cnt = 0;
4639 const TreeEntry *UserTE = TE.get();
4640 while (UserTE && Cnt < RecursionMaxDepth) {
4641 if (UserTE->UserTreeIndices.size() != 1)
4642 break;
4643 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
4644 return EI.UserTE->State == TreeEntry::Vectorize &&
4645 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4646 }))
4647 return;
4648 UserTE = UserTE->UserTreeIndices.back().UserTE;
4649 ++Cnt;
4650 }
4651 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4652 if (!(TE->State == TreeEntry::Vectorize ||
4653 TE->State == TreeEntry::StridedVectorize) ||
4654 !TE->ReuseShuffleIndices.empty())
4655 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4656 if (TE->State == TreeEntry::Vectorize &&
4657 TE->getOpcode() == Instruction::PHI)
4658 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
4659 }
4660 });
4661
4662 // Reorder the graph nodes according to their vectorization factor.
4663 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
4664 VF /= 2) {
4665 auto It = VFToOrderedEntries.find(VF);
4666 if (It == VFToOrderedEntries.end())
4667 continue;
4668 // Try to find the most profitable order. We are just looking for the most
4669 // used order and reorder the scalar elements in the nodes according to this
4670 // most used order.
4671 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
4672 // All operands are reordered and used only in this node - propagate the
4673 // most used order to the user node.
4674 MapVector<OrdersType, unsigned,
4675 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
4676 OrdersUses;
4677 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
4678 for (const TreeEntry *OpTE : OrderedEntries) {
4679 // No need to reorder these nodes; still need to extend and to use a shuffle,
4680 // just need to merge the reordering shuffle and the reuse shuffle.
4681 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
4682 continue;
4683 // Count number of orders uses.
4684 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
4685 &PhisToOrders]() -> const OrdersType & {
4686 if (OpTE->State == TreeEntry::NeedToGather ||
4687 !OpTE->ReuseShuffleIndices.empty()) {
4688 auto It = GathersToOrders.find(OpTE);
4689 if (It != GathersToOrders.end())
4690 return It->second;
4691 }
4692 if (OpTE->isAltShuffle()) {
4693 auto It = AltShufflesToOrders.find(OpTE);
4694 if (It != AltShufflesToOrders.end())
4695 return It->second;
4696 }
4697 if (OpTE->State == TreeEntry::Vectorize &&
4698 OpTE->getOpcode() == Instruction::PHI) {
4699 auto It = PhisToOrders.find(OpTE);
4700 if (It != PhisToOrders.end())
4701 return It->second;
4702 }
4703 return OpTE->ReorderIndices;
4704 }();
4705 // First consider the order of the external scalar users.
4706 auto It = ExternalUserReorderMap.find(OpTE);
4707 if (It != ExternalUserReorderMap.end()) {
4708 const auto &ExternalUserReorderIndices = It->second;
4709 // If the OpTE vector factor != number of scalars - use the natural order;
4710 // it is an attempt to reorder a node with reused scalars but with
4711 // external uses.
4712 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
4713 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
4714 ExternalUserReorderIndices.size();
4715 } else {
4716 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
4717 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
4718 }
4719 // No other useful reorder data in this entry.
4720 if (Order.empty())
4721 continue;
4722 }
4723 // Stores actually store the mask, not the order; we need to invert it.
4724 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4725 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4726 SmallVector<int> Mask;
4727 inversePermutation(Order, Mask);
4728 unsigned E = Order.size();
4729 OrdersType CurrentOrder(E, E);
4730 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
4731 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
4732 });
4733 fixupOrderingIndices(CurrentOrder);
4734 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
4735 } else {
4736 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4737 }
4738 }
4739 if (OrdersUses.empty())
4740 continue;
4741 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
4742 const unsigned Sz = Order.size();
4743 for (unsigned Idx : seq<unsigned>(0, Sz))
4744 if (Idx != Order[Idx] && Order[Idx] != Sz)
4745 return false;
4746 return true;
4747 };
4748 // Choose the most used order.
4749 unsigned IdentityCnt = 0;
4750 unsigned FilledIdentityCnt = 0;
4751 OrdersType IdentityOrder(VF, VF);
4752 for (auto &Pair : OrdersUses) {
4753 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
4754 if (!Pair.first.empty())
4755 FilledIdentityCnt += Pair.second;
4756 IdentityCnt += Pair.second;
4757 combineOrders(IdentityOrder, Pair.first);
4758 }
4759 }
4760 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
4761 unsigned Cnt = IdentityCnt;
4762 for (auto &Pair : OrdersUses) {
4763 // Prefer the identity order. But if a filled identity order (non-empty)
4764 // was found with the same number of uses as the new candidate order, we
4765 // can choose this candidate order.
4766 if (Cnt < Pair.second ||
4767 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
4768 Cnt == Pair.second && !BestOrder.empty() &&
4769 IsIdentityOrder(BestOrder))) {
4770 combineOrders(Pair.first, BestOrder);
4771 BestOrder = Pair.first;
4772 Cnt = Pair.second;
4773 } else {
4774 combineOrders(BestOrder, Pair.first);
4775 }
4776 }
4777 // Set order of the user node.
4778 if (IsIdentityOrder(BestOrder))
4779 continue;
4780 fixupOrderingIndices(BestOrder);
4781 SmallVector<int> Mask;
4782 inversePermutation(BestOrder, Mask);
4783 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
4784 unsigned E = BestOrder.size();
4785 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
4786 return I < E ? static_cast<int>(I) : PoisonMaskElem;
4787 });
4788 // Do an actual reordering, if profitable.
4789 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
4790 // Just do the reordering for the nodes with the given VF.
4791 if (TE->Scalars.size() != VF) {
4792 if (TE->ReuseShuffleIndices.size() == VF) {
4793 // Need to reorder the reuses masks of the operands with smaller VF to
4794 // be able to find the match between the graph nodes and scalar
4795 // operands of the given node during vectorization/cost estimation.
4796 assert(all_of(TE->UserTreeIndices,
4797 [VF, &TE](const EdgeInfo &EI) {
4798 return EI.UserTE->Scalars.size() == VF ||
4799 EI.UserTE->Scalars.size() ==
4800 TE->Scalars.size();
4801 }) &&
4802 "All users must be of VF size.");
4803 // Update ordering of the operands with the smaller VF than the given
4804 // one.
4805 reorderNodeWithReuses(*TE, Mask);
4806 }
4807 continue;
4808 }
4809 if ((TE->State == TreeEntry::Vectorize ||
4810 TE->State == TreeEntry::StridedVectorize) &&
4811 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
4812 InsertElementInst>(TE->getMainOp()) &&
4813 !TE->isAltShuffle()) {
4814 // Build correct orders for extract{element,value}, loads and
4815 // stores.
4816 reorderOrder(TE->ReorderIndices, Mask);
4817 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
4818 TE->reorderOperands(Mask);
4819 } else {
4820 // Reorder the node and its operands.
4821 TE->reorderOperands(Mask);
4822 assert(TE->ReorderIndices.empty() &&
4823 "Expected empty reorder sequence.");
4824 reorderScalars(TE->Scalars, Mask);
4825 }
4826 if (!TE->ReuseShuffleIndices.empty()) {
4827 // Apply reversed order to keep the original ordering of the reused
4828 // elements to avoid extra reorder indices shuffling.
4829 OrdersType CurrentOrder;
4830 reorderOrder(CurrentOrder, MaskOrder);
4831 SmallVector<int> NewReuses;
4832 inversePermutation(CurrentOrder, NewReuses);
4833 addMask(NewReuses, TE->ReuseShuffleIndices);
4834 TE->ReuseShuffleIndices.swap(NewReuses);
4835 }
4836 }
4837 }
4838}
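// Summary sketch of the pass above (illustration): for every vector factor,
// each reorderable node casts one vote per preferred order (including the
// orders requested by external store users, alt-shuffle nodes and phis), and
// identity votes are pooled together. If a non-identity order wins, all nodes
// of that VF are rewritten with its inverse mask: vectorized
// loads/stores/extracts/inserts update their ReorderIndices, other nodes
// reorder their operands and scalars, and smaller nodes reached only through
// VF-sized reuse masks have their reuse masks updated instead.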
4839
4840bool BoUpSLP::canReorderOperands(
4841 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
4842 ArrayRef<TreeEntry *> ReorderableGathers,
4843 SmallVectorImpl<TreeEntry *> &GatherOps) {
4844 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
4845 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
4846 return OpData.first == I &&
4847 OpData.second->State == TreeEntry::Vectorize;
4848 }))
4849 continue;
4850 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
4851 // Do not reorder if operand node is used by many user nodes.
4852 if (any_of(TE->UserTreeIndices,
4853 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
4854 return false;
4855 // Add the node to the list of the ordered nodes with the identity
4856 // order.
4857 Edges.emplace_back(I, TE);
4858 // Add ScatterVectorize nodes to the list of operands, where just
4859 // reordering of the scalars is required. Similar to the gathers, so
4860 // simply add to the list of gathered ops.
4861 // If there are reused scalars, process this node as a regular vectorize
4862 // node, just reorder reuses mask.
4863 if (TE->State != TreeEntry::Vectorize &&
4864 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
4865 GatherOps.push_back(TE);
4866 continue;
4867 }
4868 TreeEntry *Gather = nullptr;
4869 if (count_if(ReorderableGathers,
4870 [&Gather, UserTE, I](TreeEntry *TE) {
4871 assert(TE->State != TreeEntry::Vectorize &&
4872 "Only non-vectorized nodes are expected.");
4873 if (any_of(TE->UserTreeIndices,
4874 [UserTE, I](const EdgeInfo &EI) {
4875 return EI.UserTE == UserTE && EI.EdgeIdx == I;
4876 })) {
4877 assert(TE->isSame(UserTE->getOperand(I)) &&
4878 "Operand entry does not match operands.");
4879 Gather = TE;
4880 return true;
4881 }
4882 return false;
4883 }) > 1 &&
4884 !allConstant(UserTE->getOperand(I)))
4885 return false;
4886 if (Gather)
4887 GatherOps.push_back(Gather);
4888 }
4889 return true;
4890}
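// In short (illustrative summary of canReorderOperands): a user's operand set
// is reorderable if every vectorized operand entry is used only by this user
// (those are recorded in Edges), non-vectorized operand entries without
// reuses or reordering are collected as GatherOps, and the attempt is
// abandoned if more than one reorderable gather entry matches the same
// non-constant operand.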
4891
4892void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
4893 SetVector<TreeEntry *> OrderedEntries;
4894 DenseSet<const TreeEntry *> GathersToOrders;
4895 // Find all reorderable leaf nodes with the given VF.
4896 // Currently these are vectorized loads, extracts without alternate operands +
4897 // some gathering of extracts.
4898 SmallVector<TreeEntry *> NonVectorized;
4899 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
4900 if (TE->State != TreeEntry::Vectorize &&
4901 TE->State != TreeEntry::StridedVectorize)
4902 NonVectorized.push_back(TE.get());
4903 if (std::optional<OrdersType> CurrentOrder =
4904 getReorderingData(*TE, /*TopToBottom=*/false)) {
4905 OrderedEntries.insert(TE.get());
4906 if (!(TE->State == TreeEntry::Vectorize ||
4907 TE->State == TreeEntry::StridedVectorize) ||
4908 !TE->ReuseShuffleIndices.empty())
4909 GathersToOrders.insert(TE.get());
4910 }
4911 }
4912
4913 // 1. Propagate order to the graph nodes, which use only reordered nodes.
4914 // I.e., if the node has operands that are reordered, try to make at least
4915 // one operand order in the natural order and reorder others + reorder the
4916 // user node itself.
4917 SmallPtrSet<const TreeEntry *, 4> Visited;
4918 while (!OrderedEntries.empty()) {
4919 // 1. Filter out only reordered nodes.
4920 // 2. If the entry has multiple uses - skip it and jump to the next node.
4921 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
4922 SmallVector<TreeEntry *> Filtered;
4923 for (TreeEntry *TE : OrderedEntries) {
4924 if (!(TE->State == TreeEntry::Vectorize ||
4925 TE->State == TreeEntry::StridedVectorize ||
4926 (TE->State == TreeEntry::NeedToGather &&
4927 GathersToOrders.contains(TE))) ||
4928 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4929 !all_of(drop_begin(TE->UserTreeIndices),
4930 [TE](const EdgeInfo &EI) {
4931 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
4932 }) ||
4933 !Visited.insert(TE).second) {
4934 Filtered.push_back(TE);
4935 continue;
4936 }
4937 // Build a map between user nodes and their operand order to speed up the
4938 // search. The graph currently does not provide this dependency directly.
4939 for (EdgeInfo &EI : TE->UserTreeIndices) {
4940 TreeEntry *UserTE = EI.UserTE;
4941 auto It = Users.find(UserTE);
4942 if (It == Users.end())
4943 It = Users.insert({UserTE, {}}).first;
4944 It->second.emplace_back(EI.EdgeIdx, TE);
4945 }
4946 }
4947 // Erase filtered entries.
4948 for (TreeEntry *TE : Filtered)
4949 OrderedEntries.remove(TE);
4950 SmallVector<
4951 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
4952 UsersVec(Users.begin(), Users.end());
4953 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
4954 return Data1.first->Idx > Data2.first->Idx;
4955 });
4956 for (auto &Data : UsersVec) {
4957 // Check that operands are used only in the User node.
4958 SmallVector<TreeEntry *> GatherOps;
4959 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
4960 GatherOps)) {
4961 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
4962 OrderedEntries.remove(Op.second);
4963 continue;
4964 }
4965 // All operands are reordered and used only in this node - propagate the
4966 // most used order to the user node.
4967 MapVector<OrdersType, unsigned,
4968 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
4969 OrdersUses;
4970 // Do the analysis for each tree entry only once, otherwise the order of
4971 // the same node may be considered several times, though it might not be
4972 // profitable.
4973 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
4974 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
4975 for (const auto &Op : Data.second) {
4976 TreeEntry *OpTE = Op.second;
4977 if (!VisitedOps.insert(OpTE).second)
4978 continue;
4979 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
4980 continue;
4981 const auto Order = [&]() -> const OrdersType {
4982 if (OpTE->State == TreeEntry::NeedToGather ||
4983 !OpTE->ReuseShuffleIndices.empty())
4984 return getReorderingData(*OpTE, /*TopToBottom=*/false)
4985 .value_or(OrdersType(1));
4986 return OpTE->ReorderIndices;
4987 }();
4988 // The order is partially ordered, skip it in favor of fully non-ordered
4989 // orders.
4990 if (Order.size() == 1)
4991 continue;
4992 unsigned NumOps = count_if(
4993 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
4994 return P.second == OpTE;
4995 });
4996 // Stores actually store the mask, not the order; we need to invert it.
4997 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4998 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4999 SmallVector<int> Mask;
5000 inversePermutation(Order, Mask);
5001 unsigned E = Order.size();
5002 OrdersType CurrentOrder(E, E);
5003 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5004 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5005 });