LLVM 17.0.0git
SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
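//
// Illustrative sketch (editor's note, hypothetical IR, not from the original
// source): the kind of rewrite this pass aims for. Two isomorphic scalar
// chains feeding consecutive stores,
//   %a0 = fadd double %x0, %y0
//   %a1 = fadd double %x1, %y1
//   store double %a0, ptr %p
//   %q  = getelementptr inbounds double, ptr %p, i64 1
//   store double %a1, ptr %q
// become, roughly, a single vector chain:
//   %av = fadd <2 x double> %xv, %yv
//   store <2 x double> %av, ptr %p
//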
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
24#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
50#include "llvm/IR/Attributes.h"
51#include "llvm/IR/BasicBlock.h"
52#include "llvm/IR/Constant.h"
53#include "llvm/IR/Constants.h"
54#include "llvm/IR/DataLayout.h"
56#include "llvm/IR/Dominators.h"
57#include "llvm/IR/Function.h"
58#include "llvm/IR/IRBuilder.h"
59#include "llvm/IR/InstrTypes.h"
60#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Intrinsics.h"
64#include "llvm/IR/Module.h"
65#include "llvm/IR/Operator.h"
67#include "llvm/IR/Type.h"
68#include "llvm/IR/Use.h"
69#include "llvm/IR/User.h"
70#include "llvm/IR/Value.h"
71#include "llvm/IR/ValueHandle.h"
72#ifdef EXPENSIVE_CHECKS
73#include "llvm/IR/Verifier.h"
74#endif
75#include "llvm/Pass.h"
80#include "llvm/Support/Debug.h"
91#include <algorithm>
92#include <cassert>
93#include <cstdint>
94#include <iterator>
95#include <memory>
96#include <optional>
97#include <set>
98#include <string>
99#include <tuple>
100#include <utility>
101#include <vector>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
113 cl::desc("Run the SLP vectorization passes"));
114
115static cl::opt<int>
116 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
117 cl::desc("Only vectorize if you gain more than this "
118 "number "));
119
120static cl::opt<bool>
121ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
122 cl::desc("Attempt to vectorize horizontal reductions"));
123
125 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
126 cl::desc(
127 "Attempt to vectorize horizontal reductions feeding into a store"));
128
129// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
130// even if we match a reduction but do not vectorize in the end.
132 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
133 cl::desc("Allow optimization of original scalar identity operations on "
134 "matched horizontal reductions."));
135
136static cl::opt<int>
137MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
138 cl::desc("Attempt to vectorize for this register size in bits"));
139
142 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
143
144static cl::opt<int>
145MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
146 cl::desc("Maximum depth of the lookup for consecutive stores."));
147
148/// Limits the size of scheduling regions in a block.
150/// It avoids long compile times for _very_ large blocks where vector
150/// instructions are spread over a wide range.
151/// This limit is way higher than needed by real-world functions.
152static cl::opt<int>
153ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
154 cl::desc("Limit the size of the SLP scheduling region per block"));
155
157 "slp-min-reg-size", cl::init(128), cl::Hidden,
158 cl::desc("Attempt to vectorize for this register size in bits"));
159
161 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
162 cl::desc("Limit the recursion depth when building a vectorizable tree"));
163
165 "slp-min-tree-size", cl::init(3), cl::Hidden,
166 cl::desc("Only vectorize small trees if they are fully vectorizable"));
167
168// The maximum depth that the look-ahead score heuristic will explore.
169// The higher this value, the higher the compilation time overhead.
171 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
172 cl::desc("The maximum look-ahead depth for operand reordering scores"));
173
174// The maximum depth that the look-ahead score heuristic will explore
175// when it is probing among candidates for vectorization tree roots.
176// The higher this value, the higher the compilation time overhead, but unlike
177// the similar limit for operand reordering this one is used less frequently, so
178// the impact of a higher value is less noticeable.
180 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
181 cl::desc("The maximum look-ahead depth for searching best rooting option"));
182
183static cl::opt<bool>
184 ViewSLPTree("view-slp-tree", cl::Hidden,
185 cl::desc("Display the SLP trees with Graphviz"));
186
187// Limit the number of alias checks. The limit is chosen so that
188// it has no negative effect on the llvm benchmarks.
189static const unsigned AliasedCheckLimit = 10;
190
191// Another limit for the alias checks: The maximum distance between load/store
192// instructions where alias checks are done.
193// This limit is useful for very large basic blocks.
194static const unsigned MaxMemDepDistance = 160;
195
196/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
197/// regions to be handled.
198static const int MinScheduleRegionSize = 16;
199
200/// Predicate for the element types that the SLP vectorizer supports.
201///
202/// The most important thing to filter here are types which are invalid in LLVM
203/// vectors. We also filter target specific types which have absolutely no
204/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
205/// avoids spending time checking the cost model and realizing that they will
206/// be inevitably scalarized.
207static bool isValidElementType(Type *Ty) {
208 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
209 !Ty->isPPC_FP128Ty();
210}
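// Illustrative usage (hypothetical, not from this file): scalar integer and FP
// types are accepted, the filtered target-specific types are not, e.g.
//   isValidElementType(Type::getInt32Ty(Ctx));    // true
//   isValidElementType(Type::getX86_FP80Ty(Ctx)); // false, filtered above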
211
212/// \returns True if the value is a constant (but not globals/constant
213/// expressions).
214static bool isConstant(Value *V) {
215 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
216}
217
218/// Checks if \p V is one of vector-like instructions, i.e. undef,
219/// insertelement/extractelement with constant indices for fixed vector type or
220/// extractvalue instruction.
221static bool isVectorLikeInstWithConstOps(Value *V) {
222 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
223 !isa<ExtractValueInst, UndefValue>(V))
224 return false;
225 auto *I = dyn_cast<Instruction>(V);
226 if (!I || isa<ExtractValueInst>(I))
227 return true;
228 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
229 return false;
230 if (isa<ExtractElementInst>(I))
231 return isConstant(I->getOperand(1));
232 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
233 return isConstant(I->getOperand(2));
234}
235
236/// \returns true if all of the instructions in \p VL are in the same block or
237/// false otherwise.
238static bool allSameBlock(ArrayRef<Value *> VL) {
239 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
240 if (!I0)
241 return false;
242 if (all_of(VL, isVectorLikeInstWithConstOps))
243 return true;
244
245 BasicBlock *BB = I0->getParent();
246 for (int I = 1, E = VL.size(); I < E; I++) {
247 auto *II = dyn_cast<Instruction>(VL[I]);
248 if (!II)
249 return false;
250
251 if (BB != II->getParent())
252 return false;
253 }
254 return true;
255}
256
257/// \returns True if all of the values in \p VL are constants (but not
258/// globals/constant expressions).
259static bool allConstant(ArrayRef<Value *> VL) {
260 // Constant expressions and globals can't be vectorized like normal integer/FP
261 // constants.
262 return all_of(VL, isConstant);
263}
264
265/// \returns True if all of the values in \p VL are identical or some of them
266/// are UndefValue.
267static bool isSplat(ArrayRef<Value *> VL) {
268 Value *FirstNonUndef = nullptr;
269 for (Value *V : VL) {
270 if (isa<UndefValue>(V))
271 continue;
272 if (!FirstNonUndef) {
273 FirstNonUndef = V;
274 continue;
275 }
276 if (V != FirstNonUndef)
277 return false;
278 }
279 return FirstNonUndef != nullptr;
280}
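// Illustrative behavior (hypothetical values): undefs are ignored when checking
// for a splat, but a list of only undefs is not a splat:
//   isSplat({%a, undef, %a}) -> true
//   isSplat({undef, undef})  -> false
//   isSplat({%a, %b})        -> false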
281
282/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
283static bool isCommutative(Instruction *I) {
284 if (auto *Cmp = dyn_cast<CmpInst>(I))
285 return Cmp->isCommutative();
286 if (auto *BO = dyn_cast<BinaryOperator>(I))
287 return BO->isCommutative();
288 // TODO: This should check for generic Instruction::isCommutative(), but
289 // we need to confirm that the caller code correctly handles Intrinsics
290 // for example (does not have 2 operands).
291 return false;
292}
293
294/// \returns inserting index of InsertElement or InsertValue instruction,
295/// using Offset as base offset for index.
296static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
297 unsigned Offset = 0) {
298 int Index = Offset;
299 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
300 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
301 if (!VT)
302 return std::nullopt;
303 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
304 if (!CI)
305 return std::nullopt;
306 if (CI->getValue().uge(VT->getNumElements()))
307 return std::nullopt;
308 Index *= VT->getNumElements();
309 Index += CI->getZExtValue();
310 return Index;
311 }
312
313 const auto *IV = cast<InsertValueInst>(InsertInst);
314 Type *CurrentType = IV->getType();
315 for (unsigned I : IV->indices()) {
316 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
317 Index *= ST->getNumElements();
318 CurrentType = ST->getElementType(I);
319 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
320 Index *= AT->getNumElements();
321 CurrentType = AT->getElementType();
322 } else {
323 return std::nullopt;
324 }
325 Index += I;
326 }
327 return Index;
328}
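// Illustrative sketch (hypothetical IR): for
//   %r = insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0
// the aggregate indices are flattened left to right:
//   Index = 0;  Index = Index * 2 + 1 = 1;  Index = Index * 2 + 0 = 2
// so getInsertIndex returns 2. For an insertelement with a constant lane the
// result is simply Offset * NumElements + Lane.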
329
330namespace {
331/// Specifies the way the mask should be analyzed for undefs/poisonous elements
332/// in the shuffle mask.
333enum class UseMask {
334 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
335 ///< check for the mask elements for the first argument (mask
336 ///< indices are in range [0:VF)).
337 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
338 ///< for the mask elements for the second argument (mask indices
339 ///< are in range [VF:2*VF))
340 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
341 ///< future shuffle elements and mark them as used. Non-undef
342 ///< elements are considered unused since they're already marked
343 ///< as used in the mask.
344};
345} // namespace
346
347/// Prepares a use bitset for the given mask either for the first argument or
348/// for the second.
349static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
350 UseMask MaskArg) {
351 SmallBitVector UseMask(VF, true);
352 for (auto [Idx, Value] : enumerate(Mask)) {
353 if (Value == UndefMaskElem) {
354 if (MaskArg == UseMask::UndefsAsMask)
355 UseMask.reset(Idx);
356 continue;
357 }
358 if (MaskArg == UseMask::FirstArg && Value < VF)
359 UseMask.reset(Value);
360 else if (MaskArg == UseMask::SecondArg && Value >= VF)
361 UseMask.reset(Value - VF);
362 }
363 return UseMask;
364}
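// Illustrative sketch (hypothetical mask): with VF = 4 and Mask = {0, 5, -1, 3}
//   UseMask::FirstArg     -> resets bits 0 and 3, the elements the mask reads
//                            from the first vector
//   UseMask::SecondArg    -> resets bit 1 (= 5 - VF) for the second vector
//   UseMask::UndefsAsMask -> resets bit 2, the lane whose mask element is undef
// All other bits stay set.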
365
366/// Checks if the given value is actually an undefined constant vector.
367/// Also, if the \p UseMask is not empty, tries to check if the non-masked
368/// elements actually mask the insertelement buildvector, if any.
369template <bool IsPoisonOnly = false>
370static SmallBitVector isUndefVector(const Value *V,
371 const SmallBitVector &UseMask = {}) {
372 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
373 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
374 if (isa<T>(V))
375 return Res;
376 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
377 if (!VecTy)
378 return Res.reset();
379 auto *C = dyn_cast<Constant>(V);
380 if (!C) {
381 if (!UseMask.empty()) {
382 const Value *Base = V;
383 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
384 Base = II->getOperand(0);
385 if (isa<T>(II->getOperand(1)))
386 continue;
387 std::optional<unsigned> Idx = getInsertIndex(II);
388 if (!Idx)
389 continue;
390 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
391 Res.reset(*Idx);
392 }
393 // TODO: Add analysis for shuffles here too.
394 if (V == Base) {
395 Res.reset();
396 } else {
397 SmallBitVector SubMask(UseMask.size(), false);
398 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
399 }
400 } else {
401 Res.reset();
402 }
403 return Res;
404 }
405 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
406 if (Constant *Elem = C->getAggregateElement(I))
407 if (!isa<T>(Elem) &&
408 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
409 Res.reset(I);
410 }
411 return Res;
412}
413
414/// Checks if the vector of instructions can be represented as a shuffle, like:
415/// %x0 = extractelement <4 x i8> %x, i32 0
416/// %x3 = extractelement <4 x i8> %x, i32 3
417/// %y1 = extractelement <4 x i8> %y, i32 1
418/// %y2 = extractelement <4 x i8> %y, i32 2
419/// %x0x0 = mul i8 %x0, %x0
420/// %x3x3 = mul i8 %x3, %x3
421/// %y1y1 = mul i8 %y1, %y1
422/// %y2y2 = mul i8 %y2, %y2
423/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
424/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
425/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
426/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
427/// ret <4 x i8> %ins4
428/// can be transformed into:
429/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
430/// i32 6>
431/// %2 = mul <4 x i8> %1, %1
432/// ret <4 x i8> %2
433/// We convert this initially to something like:
434/// %x0 = extractelement <4 x i8> %x, i32 0
435/// %x3 = extractelement <4 x i8> %x, i32 3
436/// %y1 = extractelement <4 x i8> %y, i32 1
437/// %y2 = extractelement <4 x i8> %y, i32 2
438/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
439/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
440/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
441/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
442/// %5 = mul <4 x i8> %4, %4
443/// %6 = extractelement <4 x i8> %5, i32 0
444/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
445/// %7 = extractelement <4 x i8> %5, i32 1
446/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
447/// %8 = extractelement <4 x i8> %5, i32 2
448/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
449/// %9 = extractelement <4 x i8> %5, i32 3
450/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
451/// ret <4 x i8> %ins4
452/// InstCombiner transforms this into a shuffle and vector mul
453/// Mask will return the Shuffle Mask equivalent to the extracted elements.
454/// TODO: Can we split off and reuse the shuffle mask detection from
455/// ShuffleVectorInst/getShuffleCost?
456static std::optional<TargetTransformInfo::ShuffleKind>
457isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
458 const auto *It =
459 find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
460 if (It == VL.end())
461 return std::nullopt;
462 auto *EI0 = cast<ExtractElementInst>(*It);
463 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
464 return std::nullopt;
465 unsigned Size =
466 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
467 Value *Vec1 = nullptr;
468 Value *Vec2 = nullptr;
469 enum ShuffleMode { Unknown, Select, Permute };
470 ShuffleMode CommonShuffleMode = Unknown;
471 Mask.assign(VL.size(), UndefMaskElem);
472 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
473 // Undef can be represented as an undef element in a vector.
474 if (isa<UndefValue>(VL[I]))
475 continue;
476 auto *EI = cast<ExtractElementInst>(VL[I]);
477 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
478 return std::nullopt;
479 auto *Vec = EI->getVectorOperand();
480 // We can extractelement from undef or poison vector.
481 if (isUndefVector(Vec).all())
482 continue;
483 // All vector operands must have the same number of vector elements.
484 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
485 return std::nullopt;
486 if (isa<UndefValue>(EI->getIndexOperand()))
487 continue;
488 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
489 if (!Idx)
490 return std::nullopt;
491 // Undefined behavior if Idx is negative or >= Size.
492 if (Idx->getValue().uge(Size))
493 continue;
494 unsigned IntIdx = Idx->getValue().getZExtValue();
495 Mask[I] = IntIdx;
496 // For correct shuffling we have to have at most 2 different vector operands
497 // in all extractelement instructions.
498 if (!Vec1 || Vec1 == Vec) {
499 Vec1 = Vec;
500 } else if (!Vec2 || Vec2 == Vec) {
501 Vec2 = Vec;
502 Mask[I] += Size;
503 } else {
504 return std::nullopt;
505 }
506 if (CommonShuffleMode == Permute)
507 continue;
508 // If the extract index is not the same as the operation number, it is a
509 // permutation.
510 if (IntIdx != I) {
511 CommonShuffleMode = Permute;
512 continue;
513 }
514 CommonShuffleMode = Select;
515 }
516 // If we're not crossing lanes in different vectors, consider it as blending.
517 if (CommonShuffleMode == Select && Vec2)
518 return TargetTransformInfo::SK_Select;
519 // If Vec2 was never used, we have a permutation of a single vector, otherwise
520 // we have permutation of 2 vectors.
521 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
522 : TargetTransformInfo::SK_PermuteSingleSrc;
523}
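// Illustrative classification (hypothetical extracts from <4 x i8> %x and %y):
//   {x[0], y[1], x[2], y[3]} -> SK_Select (every lane keeps its own index)
//   {x[1], x[0], y[3], y[2]} -> SK_PermuteTwoSrc (lanes change index)
//   {x[3], x[2], x[1], x[0]} -> SK_PermuteSingleSrc (one source, reversed)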
524
525/// \returns True if Extract{Value,Element} instruction extracts element Idx.
526static std::optional<unsigned> getExtractIndex(Instruction *E) {
527 unsigned Opcode = E->getOpcode();
528 assert((Opcode == Instruction::ExtractElement ||
529 Opcode == Instruction::ExtractValue) &&
530 "Expected extractelement or extractvalue instruction.");
531 if (Opcode == Instruction::ExtractElement) {
532 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
533 if (!CI)
534 return std::nullopt;
535 return CI->getZExtValue();
536 }
537 auto *EI = cast<ExtractValueInst>(E);
538 if (EI->getNumIndices() != 1)
539 return std::nullopt;
540 return *EI->idx_begin();
541}
542
543/// Tries to find extractelement instructions with constant indices from fixed
544/// vector type and gather such instructions into a group, which can likely
545/// be detected as a shuffle of 1 or 2 input vectors. If this attempt was
546/// successful, the matched scalars are replaced by poison values in \p VL for
547/// future analysis.
548static std::optional<TTI::ShuffleKind>
550 SmallVectorImpl<int> &Mask) {
551 // Scan list of gathered scalars for extractelements that can be represented
552 // as shuffles.
554 SmallVector<int> UndefVectorExtracts;
555 for (int I = 0, E = VL.size(); I < E; ++I) {
556 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
557 if (!EI) {
558 if (isa<UndefValue>(VL[I]))
559 UndefVectorExtracts.push_back(I);
560 continue;
561 }
562 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
563 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
564 continue;
565 std::optional<unsigned> Idx = getExtractIndex(EI);
566 // Undefined index.
567 if (!Idx) {
568 UndefVectorExtracts.push_back(I);
569 continue;
570 }
571 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
572 ExtractMask.reset(*Idx);
573 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
574 UndefVectorExtracts.push_back(I);
575 continue;
576 }
577 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
578 }
579 // Sort the vector operands by the maximum number of uses in extractelements.
581 for (const auto &Data : VectorOpToIdx)
582 VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
583 .push_back(Data.first);
584 for (auto &Data : VFToVector) {
585 stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
586 return VectorOpToIdx.find(V1)->second.size() >
587 VectorOpToIdx.find(V2)->second.size();
588 });
589 }
590 // Find the best pair of the vectors with the same number of elements or a
591 // single vector.
592 const int UndefSz = UndefVectorExtracts.size();
593 unsigned SingleMax = 0;
594 Value *SingleVec = nullptr;
595 unsigned PairMax = 0;
596 std::pair<Value *, Value *> PairVec(nullptr, nullptr);
597 for (auto &Data : VFToVector) {
598 Value *V1 = Data.second.front();
599 if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
600 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
601 SingleVec = V1;
602 }
603 Value *V2 = nullptr;
604 if (Data.second.size() > 1)
605 V2 = *std::next(Data.second.begin());
606 if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
607 UndefSz) {
608 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
609 PairVec = std::make_pair(V1, V2);
610 }
611 }
612 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
613 return std::nullopt;
614 // Check if better to perform a shuffle of 2 vectors or just of a single
615 // vector.
616 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
617 SmallVector<Value *> GatheredExtracts(
618 VL.size(), PoisonValue::get(VL.front()->getType()));
619 if (SingleMax >= PairMax && SingleMax) {
620 for (int Idx : VectorOpToIdx[SingleVec])
621 std::swap(GatheredExtracts[Idx], VL[Idx]);
622 } else {
623 for (Value *V : {PairVec.first, PairVec.second})
624 for (int Idx : VectorOpToIdx[V])
625 std::swap(GatheredExtracts[Idx], VL[Idx]);
626 }
627 // Add extracts from undefs too.
628 for (int Idx : UndefVectorExtracts)
629 std::swap(GatheredExtracts[Idx], VL[Idx]);
630 // Check that gather of extractelements can be represented as just a
631 // shuffle of a single/two vectors the scalars are extracted from.
632 std::optional<TTI::ShuffleKind> Res =
633 isFixedVectorShuffle(GatheredExtracts, Mask);
634 if (!Res) {
635 // TODO: try to check other subsets if possible.
636 // Restore the original VL if attempt was not successful.
637 VL.swap(SavedVL);
638 return std::nullopt;
639 }
640 // Restore unused scalars from mask, if some of the extractelements were not
641 // selected for shuffle.
642 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
643 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
644 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
645 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
646 is_contained(UndefVectorExtracts, I))
647 continue;
648 if (Mask[I] == UndefMaskElem && !isa<PoisonValue>(GatheredExtracts[I]))
649 std::swap(VL[I], GatheredExtracts[I]);
650 }
651 return Res;
652}
653
654namespace {
655
656/// Main data required for vectorization of instructions.
657struct InstructionsState {
658 /// The very first instruction in the list with the main opcode.
659 Value *OpValue = nullptr;
660
661 /// The main/alternate instruction.
662 Instruction *MainOp = nullptr;
663 Instruction *AltOp = nullptr;
664
665 /// The main/alternate opcodes for the list of instructions.
666 unsigned getOpcode() const {
667 return MainOp ? MainOp->getOpcode() : 0;
668 }
669
670 unsigned getAltOpcode() const {
671 return AltOp ? AltOp->getOpcode() : 0;
672 }
673
674 /// Some of the instructions in the list have alternate opcodes.
675 bool isAltShuffle() const { return AltOp != MainOp; }
676
677 bool isOpcodeOrAlt(Instruction *I) const {
678 unsigned CheckedOpcode = I->getOpcode();
679 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
680 }
681
682 InstructionsState() = delete;
683 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
684 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
685};
686
687} // end anonymous namespace
688
689/// Chooses the correct key for scheduling data. If \p Op has the same (or
690/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
691/// OpValue.
692static Value *isOneOf(const InstructionsState &S, Value *Op) {
693 auto *I = dyn_cast<Instruction>(Op);
694 if (I && S.isOpcodeOrAlt(I))
695 return Op;
696 return S.OpValue;
697}
698
699/// \returns true if \p Opcode is allowed as part of the main/alternate
700/// instruction for SLP vectorization.
701///
702/// Example of unsupported opcode is SDIV that can potentially cause UB if the
703/// "shuffled out" lane would result in division by zero.
704static bool isValidForAlternation(unsigned Opcode) {
705 if (Instruction::isIntDivRem(Opcode))
706 return false;
707
708 return true;
709}
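// Illustrative sketch (hypothetical IR): an {add, sub} group can be vectorized
// as both vector opcodes plus a blending shuffle, because executing the
// "shuffled out" lanes is harmless:
//   %v0 = add <2 x i32> %a, %b
//   %v1 = sub <2 x i32> %a, %b
//   %r  = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 3>
// Doing the same with sdiv/srem could divide by zero in a lane the scalar code
// never executed, hence integer div/rem is rejected above.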
710
711static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
712 const TargetLibraryInfo &TLI,
713 unsigned BaseIndex = 0);
714
715/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
716/// compatible instructions or constants, or just some other regular values.
717static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
718 Value *Op1, const TargetLibraryInfo &TLI) {
719 return (isConstant(BaseOp0) && isConstant(Op0)) ||
720 (isConstant(BaseOp1) && isConstant(Op1)) ||
721 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
722 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
723 BaseOp0 == Op0 || BaseOp1 == Op1 ||
724 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
725 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
726}
727
728/// \returns true if a compare instruction \p CI has similar "look" and
729/// same predicate as \p BaseCI, "as is" or with its operands and predicate
730/// swapped, false otherwise.
731static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
732 const TargetLibraryInfo &TLI) {
733 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
734 "Assessing comparisons of different types?");
735 CmpInst::Predicate BasePred = BaseCI->getPredicate();
736 CmpInst::Predicate Pred = CI->getPredicate();
737 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
738
739 Value *BaseOp0 = BaseCI->getOperand(0);
740 Value *BaseOp1 = BaseCI->getOperand(1);
741 Value *Op0 = CI->getOperand(0);
742 Value *Op1 = CI->getOperand(1);
743
744 return (BasePred == Pred &&
745 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
746 (BasePred == SwappedPred &&
747 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
748}
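// Illustrative example (hypothetical IR): the following two compares are
// treated as compatible, since the second is the first with both the operands
// and the predicate swapped:
//   %c0 = icmp slt i32 %a, %b
//   %c1 = icmp sgt i32 %b, %a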
749
750/// \returns analysis of the Instructions in \p VL described in
751/// InstructionsState, i.e. the main/alternate opcode with which we suppose the
752/// whole list could be vectorized even if its structure is diverse.
753static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
754 const TargetLibraryInfo &TLI,
755 unsigned BaseIndex) {
756 // Make sure these are all Instructions.
757 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
758 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
759
760 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
761 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
762 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
763 CmpInst::Predicate BasePred =
764 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
765 : CmpInst::BAD_ICMP_PREDICATE;
766 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
767 unsigned AltOpcode = Opcode;
768 unsigned AltIndex = BaseIndex;
769
770 // Check for one alternate opcode from another BinaryOperator.
771 // TODO - generalize to support all operators (types, calls etc.).
772 auto *IBase = cast<Instruction>(VL[BaseIndex]);
773 Intrinsic::ID BaseID = 0;
774 SmallVector<VFInfo> BaseMappings;
775 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
777 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
778 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
779 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
780 }
781 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
782 auto *I = cast<Instruction>(VL[Cnt]);
783 unsigned InstOpcode = I->getOpcode();
784 if (IsBinOp && isa<BinaryOperator>(I)) {
785 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
786 continue;
787 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
788 isValidForAlternation(Opcode)) {
789 AltOpcode = InstOpcode;
790 AltIndex = Cnt;
791 continue;
792 }
793 } else if (IsCastOp && isa<CastInst>(I)) {
794 Value *Op0 = IBase->getOperand(0);
795 Type *Ty0 = Op0->getType();
796 Value *Op1 = I->getOperand(0);
797 Type *Ty1 = Op1->getType();
798 if (Ty0 == Ty1) {
799 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
800 continue;
801 if (Opcode == AltOpcode) {
802 assert(isValidForAlternation(Opcode) &&
803 isValidForAlternation(InstOpcode) &&
804 "Cast isn't safe for alternation, logic needs to be updated!");
805 AltOpcode = InstOpcode;
806 AltIndex = Cnt;
807 continue;
808 }
809 }
810 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
811 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
812 Type *Ty0 = BaseInst->getOperand(0)->getType();
813 Type *Ty1 = Inst->getOperand(0)->getType();
814 if (Ty0 == Ty1) {
815 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
816 // Check for compatible operands. If the corresponding operands are not
817 // compatible - need to perform alternate vectorization.
818 CmpInst::Predicate CurrentPred = Inst->getPredicate();
819 CmpInst::Predicate SwappedCurrentPred =
820 CmpInst::getSwappedPredicate(CurrentPred);
821
822 if (E == 2 &&
823 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
824 continue;
825
826 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
827 continue;
828 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
829 if (AltIndex != BaseIndex) {
830 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
831 continue;
832 } else if (BasePred != CurrentPred) {
833 assert(
834 isValidForAlternation(InstOpcode) &&
835 "CmpInst isn't safe for alternation, logic needs to be updated!");
836 AltIndex = Cnt;
837 continue;
838 }
839 CmpInst::Predicate AltPred = AltInst->getPredicate();
840 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
841 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
842 continue;
843 }
844 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
845 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
846 if (Gep->getNumOperands() != 2 ||
847 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
848 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
849 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
851 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
852 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
853 auto *BaseLI = cast<LoadInst>(IBase);
854 if (!LI->isSimple() || !BaseLI->isSimple())
855 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
856 } else if (auto *Call = dyn_cast<CallInst>(I)) {
857 auto *CallBase = cast<CallInst>(IBase);
858 if (Call->getCalledFunction() != CallBase->getCalledFunction())
859 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
860 if (Call->hasOperandBundles() &&
861 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
862 Call->op_begin() + Call->getBundleOperandsEndIndex(),
863 CallBase->op_begin() +
865 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
867 if (ID != BaseID)
868 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
869 if (!ID) {
871 if (Mappings.size() != BaseMappings.size() ||
872 Mappings.front().ISA != BaseMappings.front().ISA ||
873 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
874 Mappings.front().VectorName != BaseMappings.front().VectorName ||
875 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
876 Mappings.front().Shape.Parameters !=
877 BaseMappings.front().Shape.Parameters)
878 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
879 }
880 }
881 continue;
882 }
883 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
884 }
885
886 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
887 cast<Instruction>(VL[AltIndex]));
888}
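// Illustrative behavior (hypothetical values): for VL = {add, sub, add, sub}
// of i32 BinaryOperators, the returned state has MainOp = the first add and
// AltOp = the first sub, so isAltShuffle() is true and both opcodes pass
// isOpcodeOrAlt(). A mix like {add, udiv} yields a null state instead, because
// isValidForAlternation() rejects integer div/rem as the alternate opcode.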
889
890/// \returns true if all of the values in \p VL have the same type or false
891/// otherwise.
892static bool allSameType(ArrayRef<Value *> VL) {
893 Type *Ty = VL[0]->getType();
894 for (int i = 1, e = VL.size(); i < e; i++)
895 if (VL[i]->getType() != Ty)
896 return false;
897
898 return true;
899}
900
901/// \returns True if an in-tree use also needs an extract. This refers to a
902/// possible scalar operand in a vectorized instruction.
903static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
904 TargetLibraryInfo *TLI) {
905 unsigned Opcode = UserInst->getOpcode();
906 switch (Opcode) {
907 case Instruction::Load: {
908 LoadInst *LI = cast<LoadInst>(UserInst);
909 return (LI->getPointerOperand() == Scalar);
910 }
911 case Instruction::Store: {
912 StoreInst *SI = cast<StoreInst>(UserInst);
913 return (SI->getPointerOperand() == Scalar);
914 }
915 case Instruction::Call: {
916 CallInst *CI = cast<CallInst>(UserInst);
917 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
918 for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
919 if (isVectorIntrinsicWithScalarOpAtArg(ID, i))
920 return (CI->getArgOperand(i) == Scalar);
921 }
922 [[fallthrough]];
923 }
924 default:
925 return false;
926 }
927}
928
929/// \returns the AA location that is being accessed by the instruction.
930static MemoryLocation getLocation(Instruction *I) {
931 if (StoreInst *SI = dyn_cast<StoreInst>(I))
932 return MemoryLocation::get(SI);
933 if (LoadInst *LI = dyn_cast<LoadInst>(I))
934 return MemoryLocation::get(LI);
935 return MemoryLocation();
936}
937
938/// \returns True if the instruction is not a volatile or atomic load/store.
939static bool isSimple(Instruction *I) {
940 if (LoadInst *LI = dyn_cast<LoadInst>(I))
941 return LI->isSimple();
942 if (StoreInst *SI = dyn_cast<StoreInst>(I))
943 return SI->isSimple();
944 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
945 return !MI->isVolatile();
946 return true;
947}
948
949/// Shuffles \p Mask in accordance with the given \p SubMask.
950static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
951 if (SubMask.empty())
952 return;
953 if (Mask.empty()) {
954 Mask.append(SubMask.begin(), SubMask.end());
955 return;
956 }
957 SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
958 int TermValue = std::min(Mask.size(), SubMask.size());
959 for (int I = 0, E = SubMask.size(); I < E; ++I) {
960 if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
961 Mask[SubMask[I]] >= TermValue)
962 continue;
963 NewMask[I] = Mask[SubMask[I]];
964 }
965 Mask.swap(NewMask);
966}
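// Illustrative composition (hypothetical masks): with Mask = {1, 0, 3, 2} and
// SubMask = {2, 3, 0, 1} the result is NewMask[I] = Mask[SubMask[I]], i.e.
//   {Mask[2], Mask[3], Mask[0], Mask[1]} = {3, 2, 1, 0}
// which applies SubMask on top of the already accumulated Mask.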
967
968/// Order may have elements assigned a special value (the size), which is out of
969/// bounds. Such indices only appear at positions that correspond to undef values
970/// (see canReuseExtract for details) and are used to keep undef values from
971/// affecting the operand ordering.
972/// The first loop below simply finds all unused indices and then the next loop
973/// nest assigns these indices to the undef value positions.
974/// As an example below Order has two undef positions and they have assigned
975/// values 3 and 7 respectively:
976/// before: 6 9 5 4 9 2 1 0
977/// after: 6 3 5 4 7 2 1 0
978static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {
979 const unsigned Sz = Order.size();
980 SmallBitVector UnusedIndices(Sz, /*t=*/true);
981 SmallBitVector MaskedIndices(Sz);
982 for (unsigned I = 0; I < Sz; ++I) {
983 if (Order[I] < Sz)
984 UnusedIndices.reset(Order[I]);
985 else
986 MaskedIndices.set(I);
987 }
988 if (MaskedIndices.none())
989 return;
990 assert(UnusedIndices.count() == MaskedIndices.count() &&
991 "Non-synced masked/available indices.");
992 int Idx = UnusedIndices.find_first();
993 int MIdx = MaskedIndices.find_first();
994 while (MIdx >= 0) {
995 assert(Idx >= 0 && "Indices must be synced.");
996 Order[MIdx] = Idx;
997 Idx = UnusedIndices.find_next(Idx);
998 MIdx = MaskedIndices.find_next(MIdx);
999 }
1000}
1001
1002namespace llvm {
1003
1005 SmallVectorImpl<int> &Mask) {
1006 Mask.clear();
1007 const unsigned E = Indices.size();
1008 Mask.resize(E, UndefMaskElem);
1009 for (unsigned I = 0; I < E; ++I)
1010 Mask[Indices[I]] = I;
1011}
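// Illustrative example (hypothetical order): Indices = {2, 0, 1} gives
// Mask[2] = 0, Mask[0] = 1, Mask[1] = 2, i.e. Mask = {1, 2, 0}, the shuffle
// mask that undoes the reordering described by Indices.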
1012
1013/// Reorders the list of scalars in accordance with the given \p Mask.
1015 ArrayRef<int> Mask) {
1016 assert(!Mask.empty() && "Expected non-empty mask.");
1017 SmallVector<Value *> Prev(Scalars.size(),
1018 UndefValue::get(Scalars.front()->getType()));
1019 Prev.swap(Scalars);
1020 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1021 if (Mask[I] != UndefMaskElem)
1022 Scalars[Mask[I]] = Prev[I];
1023}
1024
1025/// Checks if the provided value does not require scheduling. It does not
1026/// require scheduling if this is not an instruction or it is an instruction
1027/// that does not read/write memory and all operands are either not instructions
1028/// or phi nodes or instructions from different blocks.
1030 auto *I = dyn_cast<Instruction>(V);
1031 if (!I)
1032 return true;
1033 return !mayHaveNonDefUseDependency(*I) &&
1034 all_of(I->operands(), [I](Value *V) {
1035 auto *IO = dyn_cast<Instruction>(V);
1036 if (!IO)
1037 return true;
1038 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1039 });
1040}
1041
1042/// Checks if the provided value does not require scheduling. It does not
1043/// require scheduling if this is not an instruction or it is an instruction
1044/// that does not read/write memory and all users are phi nodes or instructions
1045/// from different blocks.
1046static bool isUsedOutsideBlock(Value *V) {
1047 auto *I = dyn_cast<Instruction>(V);
1048 if (!I)
1049 return true;
1050 // Limits the number of uses to save compile time.
1051 constexpr int UsesLimit = 8;
1052 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1053 all_of(I->users(), [I](User *U) {
1054 auto *IU = dyn_cast<Instruction>(U);
1055 if (!IU)
1056 return true;
1057 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1058 });
1059}
1060
1061/// Checks if the specified value does not require scheduling. It does not
1062/// require scheduling if all operands and all users do not need to be scheduled
1063/// in the current basic block.
1066}
1067
1068/// Checks if the specified array of instructions does not require scheduling.
1069/// This is the case if either all of the instructions have operands that do not
1070/// require scheduling, or all of their users do not require scheduling because
1071/// they are phis or belong to other basic blocks.
1073 return !VL.empty() &&
1075}
1076
1077namespace slpvectorizer {
1078
1079/// Bottom Up SLP Vectorizer.
1080class BoUpSLP {
1081 struct TreeEntry;
1082 struct ScheduleData;
1084
1085public:
1093
1095 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1098 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
1099 DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
1100 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1101 // Use the vector register size specified by the target unless overridden
1102 // by a command-line option.
1103 // TODO: It would be better to limit the vectorization factor based on
1104 // data type rather than just register size. For example, x86 AVX has
1105 // 256-bit registers, but it does not support integer operations
1106 // at that width (that requires AVX2).
1107 if (MaxVectorRegSizeOption.getNumOccurrences())
1108 MaxVecRegSize = MaxVectorRegSizeOption;
1109 else
1110 MaxVecRegSize =
1111 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1112 .getFixedValue();
1113
1114 if (MinVectorRegSizeOption.getNumOccurrences())
1115 MinVecRegSize = MinVectorRegSizeOption;
1116 else
1117 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1118 }
1119
1120 /// Vectorize the tree that starts with the elements in \p VL.
1121 /// Returns the vectorized root.
1122 Value *vectorizeTree();
1123
1124 /// Vectorize the tree but with the list of externally used values \p
1125 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1126 /// generated extractvalue instructions.
1127 Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
1128 Instruction *ReductionRoot = nullptr);
1129
1130 /// \returns the cost incurred by unwanted spills and fills, caused by
1131 /// holding live values over call sites.
1132 InstructionCost getSpillCost() const;
1133
1134 /// \returns the vectorization cost of the subtree that starts at \p VL.
1135 /// A negative number means that this is profitable.
1136 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1137
1138 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1139 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1140 void buildTree(ArrayRef<Value *> Roots,
1141 const SmallDenseSet<Value *> &UserIgnoreLst);
1142
1143 /// Construct a vectorizable tree that starts at \p Roots.
1144 void buildTree(ArrayRef<Value *> Roots);
1145
1146 /// Checks if the very first tree node is going to be vectorized.
1148 return !VectorizableTree.empty() &&
1149 VectorizableTree.front()->State == TreeEntry::Vectorize;
1150 }
1151
1152 /// Returns the main instruction for the very first node.
1154 assert(!VectorizableTree.empty() && "No tree to get the first node from");
1155 return VectorizableTree.front()->getMainOp();
1156 }
1157
1158 /// Returns whether the root node has in-tree uses.
1160 return !VectorizableTree.empty() &&
1161 !VectorizableTree.front()->UserTreeIndices.empty();
1162 }
1163
1164 /// Builds external uses of the vectorized scalars, i.e. the list of
1165 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1166 /// ExternallyUsedValues contains additional list of external uses to handle
1167 /// vectorization of reductions.
1168 void
1169 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1170
1171 /// Clear the internal data structures that are created by 'buildTree'.
1172 void deleteTree() {
1173 VectorizableTree.clear();
1174 ScalarToTreeEntry.clear();
1175 MustGather.clear();
1176 EntryToLastInstruction.clear();
1177 ExternalUses.clear();
1178 for (auto &Iter : BlocksSchedules) {
1179 BlockScheduling *BS = Iter.second.get();
1180 BS->clear();
1181 }
1182 MinBWs.clear();
1183 InstrElementSize.clear();
1184 UserIgnoreList = nullptr;
1185 PostponedGathers.clear();
1186 }
1187
1188 unsigned getTreeSize() const { return VectorizableTree.size(); }
1189
1190 /// Perform LICM and CSE on the newly generated gather sequences.
1191 void optimizeGatherSequence();
1192
1193 /// Checks if the specified gather tree entry \p TE can be represented as a
1194 /// shuffled vector entry + (possibly) permutation with other gathers. It
1195 /// implements the checks only for possibly ordered scalars (Loads,
1196 /// ExtractElement, ExtractValue), which can be part of the graph.
1197 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1198
1199 /// Sort loads into increasing pointers offsets to allow greater clustering.
1200 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1201
1202 /// Gets reordering data for the given tree entry. If the entry is vectorized
1203 /// - just return ReorderIndices, otherwise check if the scalars can be
1204 /// reordered and return the most optimal order.
1205 /// \return std::nullopt if ordering is not important, empty order, if
1206 /// identity order is important, or the actual order.
1207 /// \param TopToBottom If true, include the order of vectorized stores and
1208 /// insertelement nodes, otherwise skip them.
1209 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1210 bool TopToBottom);
1211
1212 /// Reorders the current graph to the most profitable order starting from the
1213 /// root node to the leaf nodes. The best order is chosen only from the nodes
1214 /// of the same size (vectorization factor). Smaller nodes are considered
1215 /// parts of a subgraph with a smaller VF and they are reordered independently. We
1216 /// can do this because we still need to extend smaller nodes to the wider VF
1217 /// and we can merge reordering shuffles with the widening shuffles.
1218 void reorderTopToBottom();
1219
1220 /// Reorders the current graph to the most profitable order starting from
1221 /// the leaves to the root. It allows rotating small subgraphs and reduces the
1222 /// number of reshuffles if the leaf nodes use the same order. In this case we
1223 /// can merge the orders and just shuffle the user node instead of shuffling its
1224 /// operands. Plus, even if the leaf nodes have different orders, it allows
1225 /// sinking the reordering in the graph closer to the root node and merging it later
1226 /// during analysis.
1227 void reorderBottomToTop(bool IgnoreReorder = false);
1228
1229 /// \return The vector element size in bits to use when vectorizing the
1230 /// expression tree ending at \p V. If V is a store, the size is the width of
1231 /// the stored value. Otherwise, the size is the width of the largest loaded
1232 /// value reaching V. This method is used by the vectorizer to calculate
1233 /// vectorization factors.
1234 unsigned getVectorElementSize(Value *V);
1235
1236 /// Compute the minimum type sizes required to represent the entries in a
1237 /// vectorizable tree.
1238 void computeMinimumValueSizes();
1239
1240 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1241 unsigned getMaxVecRegSize() const {
1242 return MaxVecRegSize;
1243 }
1244
1245 // \returns minimum vector register size as set by cl::opt.
1246 unsigned getMinVecRegSize() const {
1247 return MinVecRegSize;
1248 }
1249
1250 unsigned getMinVF(unsigned Sz) const {
1251 return std::max(2U, getMinVecRegSize() / Sz);
1252 }
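  // Illustrative numbers (assuming the default MinVecRegSize of 128 bits):
  // a 32-bit element gives getMinVF(32) = max(2, 128 / 32) = 4, and a 64-bit
  // element gives getMinVF(64) = max(2, 128 / 64) = 2.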
1253
1254 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1255 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1256 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1257 return MaxVF ? MaxVF : UINT_MAX;
1258 }
1259
1260 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1261 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1262 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1263 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1264 ///
1265 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1266 unsigned canMapToVector(Type *T, const DataLayout &DL) const;
1267
1268 /// \returns True if the VectorizableTree is both tiny and not fully
1269 /// vectorizable. We do not vectorize such trees.
1270 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1271
1272 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1273 /// can be load combined in the backend. Load combining may not be allowed in
1274 /// the IR optimizer, so we do not want to alter the pattern. For example,
1275 /// partially transforming a scalar bswap() pattern into vector code is
1276 /// effectively impossible for the backend to undo.
1277 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1278 /// may not be necessary.
1279 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1280
1281 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1282 /// can be load combined in the backend. Load combining may not be allowed in
1283 /// the IR optimizer, so we do not want to alter the pattern. For example,
1284 /// partially transforming a scalar bswap() pattern into vector code is
1285 /// effectively impossible for the backend to undo.
1286 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1287 /// may not be necessary.
1288 bool isLoadCombineCandidate() const;
1289
1291
1292 /// This structure holds any data we need about the edges being traversed
1293 /// during buildTree_rec(). We keep track of:
1294 /// (i) the user TreeEntry index, and
1295 /// (ii) the index of the edge.
1296 struct EdgeInfo {
1297 EdgeInfo() = default;
1298 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1300 /// The user TreeEntry.
1301 TreeEntry *UserTE = nullptr;
1302 /// The operand index of the use.
1303 unsigned EdgeIdx = UINT_MAX;
1304#ifndef NDEBUG
1305 friend inline raw_ostream &operator<<(raw_ostream &OS,
1306 const BoUpSLP::EdgeInfo &EI) {
1307 EI.dump(OS);
1308 return OS;
1309 }
1310 /// Debug print.
1311 void dump(raw_ostream &OS) const {
1312 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1313 << " EdgeIdx:" << EdgeIdx << "}";
1314 }
1315 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1316#endif
1317 };
1318
1319 /// A helper class used for scoring candidates for two consecutive lanes.
1320 class LookAheadHeuristics {
1321 const TargetLibraryInfo &TLI;
1322 const DataLayout &DL;
1323 ScalarEvolution &SE;
1324 const BoUpSLP &R;
1325 int NumLanes; // Total number of lanes (aka vectorization factor).
1326 int MaxLevel; // The maximum recursion depth for accumulating score.
1327
1328 public:
1329 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1330 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1331 int MaxLevel)
1332 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1333 MaxLevel(MaxLevel) {}
1334
1335 // The hard-coded scores listed here are not very important, though it shall
1336 // be higher for better matches to improve the resulting cost. When
1337 // computing the scores of matching one sub-tree with another, we are
1338 // basically counting the number of values that are matching. So even if all
1339 // scores are set to 1, we would still get a decent matching result.
1340 // However, sometimes we have to break ties. For example we may have to
1341 // choose between matching loads vs matching opcodes. This is what these
1342 // scores are helping us with: they provide the order of preference. Also,
1343 // this is important if the scalar is externally used or used in another
1344 // tree entry node in the different lane.
1345
1346 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1347 static const int ScoreConsecutiveLoads = 4;
1348 /// The same load multiple times. This should have a better score than
1349 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1350 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
1351 /// a vector load plus 1.0 for a broadcast.
1352 static const int ScoreSplatLoads = 3;
1353 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1354 static const int ScoreReversedLoads = 3;
1355 /// A load candidate for masked gather.
1356 static const int ScoreMaskedGatherCandidate = 1;
1357 /// ExtractElementInst from same vector and consecutive indexes.
1358 static const int ScoreConsecutiveExtracts = 4;
1359 /// ExtractElementInst from same vector and reversed indices.
1360 static const int ScoreReversedExtracts = 3;
1361 /// Constants.
1362 static const int ScoreConstants = 2;
1363 /// Instructions with the same opcode.
1364 static const int ScoreSameOpcode = 2;
1365 /// Instructions with alt opcodes (e.g, add + sub).
1366 static const int ScoreAltOpcodes = 1;
1367 /// Identical instructions (a.k.a. splat or broadcast).
1368 static const int ScoreSplat = 1;
1369 /// Matching with an undef is preferable to failing.
1370 static const int ScoreUndef = 1;
1371 /// Score for failing to find a decent match.
1372 static const int ScoreFail = 0;
1373 /// Score if all users are vectorized.
1374 static const int ScoreAllUserVectorized = 1;
1375
1376 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1377 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1378 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1379 /// MainAltOps.
1380 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1381 ArrayRef<Value *> MainAltOps) const {
1382 if (!isValidElementType(V1->getType()) ||
1383 !isValidElementType(V2->getType()))
1385
1386 if (V1 == V2) {
1387 if (isa<LoadInst>(V1)) {
1388 // Returns true if the users of V1 and V2 won't need to be extracted.
1389 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1390 // Bail out if we have too many uses to save compilation time.
1391 static constexpr unsigned Limit = 8;
1392 if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
1393 return false;
1394
1395 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1396 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1397 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1398 });
1399 };
1400 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1401 };
1402 // A broadcast of a load can be cheaper on some targets.
1403 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1404 ElementCount::getFixed(NumLanes)) &&
1405 ((int)V1->getNumUses() == NumLanes ||
1406 AllUsersAreInternal(V1, V2)))
1408 }
1410 }
1411
1412 auto *LI1 = dyn_cast<LoadInst>(V1);
1413 auto *LI2 = dyn_cast<LoadInst>(V2);
1414 if (LI1 && LI2) {
1415 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1416 !LI2->isSimple())
1418
1419 std::optional<int> Dist = getPointersDiff(
1420 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1421 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1422 if (!Dist || *Dist == 0) {
1423 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1424 getUnderlyingObject(LI2->getPointerOperand()) &&
1425 R.TTI->isLegalMaskedGather(
1426 FixedVectorType::get(LI1->getType(), NumLanes),
1427 LI1->getAlign()))
1430 }
1431 // The distance is too large - still may be profitable to use masked
1432 // loads/gathers.
1433 if (std::abs(*Dist) > NumLanes / 2)
1435 // This still will detect consecutive loads, but we might have "holes"
1436 // in some cases. It is ok for non-power-2 vectorization and may produce
1437 // better results. It should not affect current vectorization.
1440 }
1441
1442 auto *C1 = dyn_cast<Constant>(V1);
1443 auto *C2 = dyn_cast<Constant>(V2);
1444 if (C1 && C2)
1446
1447 // Extracts from consecutive indexes of the same vector better score as
1448 // the extracts could be optimized away.
1449 Value *EV1;
1450 ConstantInt *Ex1Idx;
1451 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1452 // Undefs are always profitable for extractelements.
1453 // Compiler can easily combine poison and extractelement <non-poison> or
1454 // undef and extractelement <poison>. But combining undef +
1455 // extractelement <non-poison-but-may-produce-poison> requires some
1456 // extra operations.
1457 if (isa<UndefValue>(V2))
1458 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1461 Value *EV2 = nullptr;
1462 ConstantInt *Ex2Idx = nullptr;
1463 if (match(V2,
1465 m_Undef())))) {
1466 // Undefs are always profitable for extractelements.
1467 if (!Ex2Idx)
1469 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1471 if (EV2 == EV1) {
1472 int Idx1 = Ex1Idx->getZExtValue();
1473 int Idx2 = Ex2Idx->getZExtValue();
1474 int Dist = Idx2 - Idx1;
1475 // The distance is too large - still may be profitable to use
1476 // shuffles.
1477 if (std::abs(Dist) == 0)
1479 if (std::abs(Dist) > NumLanes / 2)
1483 }
1485 }
1487 }
1488
1489 auto *I1 = dyn_cast<Instruction>(V1);
1490 auto *I2 = dyn_cast<Instruction>(V2);
1491 if (I1 && I2) {
1492 if (I1->getParent() != I2->getParent())
1494 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1495 Ops.push_back(I1);
1496 Ops.push_back(I2);
1497 InstructionsState S = getSameOpcode(Ops, TLI);
1498 // Note: Only consider instructions with <= 2 operands to avoid
1499 // complexity explosion.
1500 if (S.getOpcode() &&
1501 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1502 !S.isAltShuffle()) &&
1503 all_of(Ops, [&S](Value *V) {
1504 return cast<Instruction>(V)->getNumOperands() ==
1505 S.MainOp->getNumOperands();
1506 }))
1507 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1509 }
1510
1511 if (isa<UndefValue>(V2))
1513
1515 }
1516
1517 /// Go through the operands of \p LHS and \p RHS recursively until
1518 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1519 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1520 /// of \p U1 and \p U2), except at the beginning of the recursion where
1521 /// these are set to nullptr.
1522 ///
1523 /// For example:
1524 /// \verbatim
1525 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1526 /// \ / \ / \ / \ /
1527 /// + + + +
1528 /// G1 G2 G3 G4
1529 /// \endverbatim
1530 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1531 /// each level recursively, accumulating the score. It starts from matching
1532 /// the additions at level 0, then moves on to the loads (level 1). The
1533 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1534 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1535 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1536 /// Please note that the order of the operands does not matter, as we
1537 /// evaluate the score of all profitable combinations of operands. In
1538 /// other words the score of G1 and G4 is the same as G1 and G2. This
1539 /// heuristic is based on ideas described in:
1540 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1541 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1542 /// Luís F. W. Góes
1543 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1544 Instruction *U2, int CurrLevel,
1545 ArrayRef<Value *> MainAltOps) const {
1546
1547 // Get the shallow score of V1 and V2.
1548 int ShallowScoreAtThisLevel =
1549 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1550
1551 // If reached MaxLevel,
1552 // or if V1 and V2 are not instructions,
1553 // or if they are SPLAT,
1554 // or if they are not consecutive,
1555 // or if profitable to vectorize loads or extractelements, early return
1556 // the current cost.
1557 auto *I1 = dyn_cast<Instruction>(LHS);
1558 auto *I2 = dyn_cast<Instruction>(RHS);
1559 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1560 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1561 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1562 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1563 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1564 ShallowScoreAtThisLevel))
1565 return ShallowScoreAtThisLevel;
1566 assert(I1 && I2 && "Should have early exited.");
1567
1568 // Contains the I2 operand indexes that got matched with I1 operands.
1569 SmallSet<unsigned, 4> Op2Used;
1570
1571 // Recursion towards the operands of I1 and I2. We are trying all possible
1572 // operand pairs, and keeping track of the best score.
1573 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1574 OpIdx1 != NumOperands1; ++OpIdx1) {
1575 // Try to pair op1I with the best operand of I2.
1576 int MaxTmpScore = 0;
1577 unsigned MaxOpIdx2 = 0;
1578 bool FoundBest = false;
1579 // If I2 is commutative try all combinations.
1580 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1581 unsigned ToIdx = isCommutative(I2)
1582 ? I2->getNumOperands()
1583 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1584 assert(FromIdx <= ToIdx && "Bad index");
1585 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1586 // Skip operands already paired with OpIdx1.
1587 if (Op2Used.count(OpIdx2))
1588 continue;
1589 // Recursively calculate the cost at each level
1590 int TmpScore =
1591 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1592 I1, I2, CurrLevel + 1, std::nullopt);
1593 // Look for the best score.
1594 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1595 TmpScore > MaxTmpScore) {
1596 MaxTmpScore = TmpScore;
1597 MaxOpIdx2 = OpIdx2;
1598 FoundBest = true;
1599 }
1600 }
1601 if (FoundBest) {
1602 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1603 Op2Used.insert(MaxOpIdx2);
1604 ShallowScoreAtThisLevel += MaxTmpScore;
1605 }
1606 }
1607 return ShallowScoreAtThisLevel;
1608 }
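  // A minimal illustrative trace of the recursion above, using the G1/G2/G3
  // example from the comment (hypothetical IR values; the concrete numbers are
  // whatever the Score* constants evaluate to):
  //
  //   getScoreAtLevelRec(G1 = A[0]+B[0], G2 = A[1]+B[1], CurrLevel=1)
  //     shallow score of (G1, G2)      -> ScoreSameOpcode (both are adds)
  //     pair operand A[0] with A[1]    -> ScoreConsecutiveLoads
  //     pair operand B[0] with B[1]    -> ScoreConsecutiveLoads
  //   total = ScoreSameOpcode + 2 * ScoreConsecutiveLoads.
  //
  //   For (G1, G3 = C[0]+D[0]) the operand pairs all score ScoreFail and are
  //   not accumulated, so the total stays at ScoreSameOpcode, which is why
  //   (G1, G2) is preferred.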
1609 };
1610 /// A helper data structure to hold the operands of a vector of instructions.
1611  /// This supports a fixed vector length for all operand vectors.
1612  class VLOperands {
1613    /// For each operand we need (i) the value, and (ii) the opcode that it
1614 /// would be attached to if the expression was in a left-linearized form.
1615 /// This is required to avoid illegal operand reordering.
1616 /// For example:
1617 /// \verbatim
1618 /// 0 Op1
1619 /// |/
1620 /// Op1 Op2 Linearized + Op2
1621 /// \ / ----------> |/
1622 /// - -
1623 ///
1624 /// Op1 - Op2 (0 + Op1) - Op2
1625 /// \endverbatim
1626 ///
1627 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1628 ///
1629 /// Another way to think of this is to track all the operations across the
1630 /// path from the operand all the way to the root of the tree and to
1631 /// calculate the operation that corresponds to this path. For example, the
1632 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1633 /// corresponding operation is a '-' (which matches the one in the
1634 /// linearized tree, as shown above).
1635 ///
1636 /// For lack of a better term, we refer to this operation as Accumulated
1637 /// Path Operation (APO).
1638 struct OperandData {
1639 OperandData() = default;
1640 OperandData(Value *V, bool APO, bool IsUsed)
1641 : V(V), APO(APO), IsUsed(IsUsed) {}
1642 /// The operand value.
1643 Value *V = nullptr;
1644 /// TreeEntries only allow a single opcode, or an alternate sequence of
1645    /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1646 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1647 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1648 /// (e.g., Add/Mul)
1649 bool APO = false;
1650 /// Helper data for the reordering function.
1651 bool IsUsed = false;
1652 };
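    // A small illustrative example of the APO flag, assuming a hypothetical
    // bundle {add a0, b0; sub a1, b1}: the left-linearized forms are
    // (0 + a0) + b0 and (0 + a1) - b1, so the OperandData entries are
    //   {a0, APO=false}, {b0, APO=false}   // both sides of the add
    //   {a1, APO=false}, {b1, APO=true}    // only the RHS of the sub is
    //                                      // attached to an inverse operation
    // which is exactly what appendOperandsOfVL() below computes.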
1653
1654 /// During operand reordering, we are trying to select the operand at lane
1655 /// that matches best with the operand at the neighboring lane. Our
1656 /// selection is based on the type of value we are looking for. For example,
1657 /// if the neighboring lane has a load, we need to look for a load that is
1658 /// accessing a consecutive address. These strategies are summarized in the
1659 /// 'ReorderingMode' enumerator.
1660 enum class ReorderingMode {
1661 Load, ///< Matching loads to consecutive memory addresses
1662 Opcode, ///< Matching instructions based on opcode (same or alternate)
1663 Constant, ///< Matching constants
1664 Splat, ///< Matching the same instruction multiple times (broadcast)
1665 Failed, ///< We failed to create a vectorizable group
1666 };
1667
1668    using OperandDataVec = SmallVector<OperandData, 2>;
1669
1670    /// A vector of operand vectors.
1671    SmallVector<OperandDataVec, 4> OpsVec;
1672
1673 const TargetLibraryInfo &TLI;
1674 const DataLayout &DL;
1675 ScalarEvolution &SE;
1676 const BoUpSLP &R;
1677
1678 /// \returns the operand data at \p OpIdx and \p Lane.
1679 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1680 return OpsVec[OpIdx][Lane];
1681 }
1682
1683 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1684 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1685 return OpsVec[OpIdx][Lane];
1686 }
1687
1688 /// Clears the used flag for all entries.
1689 void clearUsed() {
1690 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1691 OpIdx != NumOperands; ++OpIdx)
1692 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1693 ++Lane)
1694 OpsVec[OpIdx][Lane].IsUsed = false;
1695 }
1696
1697 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1698 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1699 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1700 }
1701
1702 /// \param Lane lane of the operands under analysis.
1703    /// \param OpIdx operand index in lane \p Lane for which we're looking
1704    /// for the best candidate.
1705 /// \param Idx operand index of the current candidate value.
1706 /// \returns The additional score due to possible broadcasting of the
1707    /// elements in the lane. It is more profitable to have a power-of-2 number of unique
1708    /// elements in the lane, since it will be vectorized with higher probability
1709 /// after removing duplicates. Currently the SLP vectorizer supports only
1710 /// vectorization of the power-of-2 number of unique scalars.
1711 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1712 Value *IdxLaneV = getData(Idx, Lane).V;
1713 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1714        return 0;
1715      SmallPtrSet<Value *, 4> Uniques;
1716      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1717 if (Ln == Lane)
1718 continue;
1719 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1720 if (!isa<Instruction>(OpIdxLnV))
1721 return 0;
1722 Uniques.insert(OpIdxLnV);
1723 }
1724 int UniquesCount = Uniques.size();
1725 int UniquesCntWithIdxLaneV =
1726 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1727 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1728 int UniquesCntWithOpIdxLaneV =
1729 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1730 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1731 return 0;
1732 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1733 UniquesCntWithOpIdxLaneV) -
1734 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1735 }
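    // Worked example for the score above, assuming hypothetical operands: the
    // other lanes of OpIdx contain two unique instructions {X, Y}, the current
    // operand OpIdxLaneV is X and the candidate IdxLaneV is a new value Z:
    //   UniquesCntWithOpIdxLaneV = 2 -> PowerOf2Ceil(2) - 2 = 0
    //   UniquesCntWithIdxLaneV   = 3 -> PowerOf2Ceil(3) - 3 = 1
    //   result = 0 - 1 = -1
    // i.e. the candidate is penalized for pushing the lane away from a
    // power-of-2 number of unique scalars.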
1736
1737 /// \param Lane lane of the operands under analysis.
1738    /// \param OpIdx operand index in lane \p Lane for which we're looking
1739    /// for the best candidate.
1740 /// \param Idx operand index of the current candidate value.
1741 /// \returns The additional score for the scalar which users are all
1742 /// vectorized.
1743 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1744 Value *IdxLaneV = getData(Idx, Lane).V;
1745 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1746 // Do not care about number of uses for vector-like instructions
1747 // (extractelement/extractvalue with constant indices), they are extracts
1748 // themselves and already externally used. Vectorization of such
1749 // instructions does not add extra extractelement instruction, just may
1750 // remove it.
1751 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1752          isVectorLikeInstWithConstOps(OpIdxLaneV))
1753        return LookAheadHeuristics::ScoreAllUserVectorized;
1754 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1755 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1756 return 0;
1757      return R.areAllUsersVectorized(IdxLaneI, std::nullopt)
1758                 ? LookAheadHeuristics::ScoreAllUserVectorized
1759 : 0;
1760 }
1761
1762 /// Score scaling factor for fully compatible instructions but with
1763 /// different number of external uses. Allows better selection of the
1764 /// instructions with less external uses.
1765 static const int ScoreScaleFactor = 10;
1766
1767 /// \Returns the look-ahead score, which tells us how much the sub-trees
1768 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1769 /// score. This helps break ties in an informed way when we cannot decide on
1770 /// the order of the operands by just considering the immediate
1771 /// predecessors.
1772 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1773 int Lane, unsigned OpIdx, unsigned Idx,
1774 bool &IsUsed) {
1775      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1776                                    LookAheadMaxDepth);
1777 // Keep track of the instruction stack as we recurse into the operands
1778 // during the look-ahead score exploration.
1779 int Score =
1780 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1781 /*CurrLevel=*/1, MainAltOps);
1782 if (Score) {
1783 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1784 if (Score <= -SplatScore) {
1785 // Set the minimum score for splat-like sequence to avoid setting
1786 // failed state.
1787 Score = 1;
1788 } else {
1789 Score += SplatScore;
1790 // Scale score to see the difference between different operands
1791 // and similar operands but all vectorized/not all vectorized
1792 // uses. It does not affect actual selection of the best
1793 // compatible operand in general, just allows to select the
1794 // operand with all vectorized uses.
1795 Score *= ScoreScaleFactor;
1796 Score += getExternalUseScore(Lane, OpIdx, Idx);
1797 IsUsed = true;
1798 }
1799 }
1800 return Score;
1801 }
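    // Illustrative arithmetic for the scaling above (hypothetical numbers): a
    // raw look-ahead Score of 4 with a SplatScore of 1 becomes
    //   (4 + 1) * ScoreScaleFactor = 50,
    // and getExternalUseScore() then adds at most a small bonus on top, so the
    // external-use information only breaks ties between operands that are
    // otherwise equally compatible.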
1802
1803 /// Best defined scores per lanes between the passes. Used to choose the
1804 /// best operand (with the highest score) between the passes.
1805 /// The key - {Operand Index, Lane}.
1806 /// The value - the best score between the passes for the lane and the
1807 /// operand.
1808    SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1809        BestScoresPerLanes;
1810
1811 // Search all operands in Ops[*][Lane] for the one that matches best
1812    // Ops[OpIdx][LastLane] and return its operand index.
1813 // If no good match can be found, return std::nullopt.
1814 std::optional<unsigned>
1815 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1816 ArrayRef<ReorderingMode> ReorderingModes,
1817 ArrayRef<Value *> MainAltOps) {
1818 unsigned NumOperands = getNumOperands();
1819
1820 // The operand of the previous lane at OpIdx.
1821 Value *OpLastLane = getData(OpIdx, LastLane).V;
1822
1823 // Our strategy mode for OpIdx.
1824 ReorderingMode RMode = ReorderingModes[OpIdx];
1825 if (RMode == ReorderingMode::Failed)
1826 return std::nullopt;
1827
1828 // The linearized opcode of the operand at OpIdx, Lane.
1829 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1830
1831 // The best operand index and its score.
1832 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1833 // are using the score to differentiate between the two.
1834 struct BestOpData {
1835 std::optional<unsigned> Idx;
1836 unsigned Score = 0;
1837 } BestOp;
1838 BestOp.Score =
1839 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1840 .first->second;
1841
1842 // Track if the operand must be marked as used. If the operand is set to
1843 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
1844 // want to reestimate the operands again on the following iterations).
1845 bool IsUsed =
1846 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1847 // Iterate through all unused operands and look for the best.
1848 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1849 // Get the operand at Idx and Lane.
1850 OperandData &OpData = getData(Idx, Lane);
1851 Value *Op = OpData.V;
1852 bool OpAPO = OpData.APO;
1853
1854 // Skip already selected operands.
1855 if (OpData.IsUsed)
1856 continue;
1857
1858 // Skip if we are trying to move the operand to a position with a
1859 // different opcode in the linearized tree form. This would break the
1860 // semantics.
1861 if (OpAPO != OpIdxAPO)
1862 continue;
1863
1864 // Look for an operand that matches the current mode.
1865 switch (RMode) {
1866 case ReorderingMode::Load:
1867 case ReorderingMode::Constant:
1868 case ReorderingMode::Opcode: {
1869 bool LeftToRight = Lane > LastLane;
1870 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1871 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1872 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1873 OpIdx, Idx, IsUsed);
1874 if (Score > static_cast<int>(BestOp.Score)) {
1875 BestOp.Idx = Idx;
1876 BestOp.Score = Score;
1877 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1878 }
1879 break;
1880 }
1881 case ReorderingMode::Splat:
1882 if (Op == OpLastLane)
1883 BestOp.Idx = Idx;
1884 break;
1885 case ReorderingMode::Failed:
1886 llvm_unreachable("Not expected Failed reordering mode.");
1887 }
1888 }
1889
1890 if (BestOp.Idx) {
1891 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1892 return BestOp.Idx;
1893 }
1894 // If we could not find a good match return std::nullopt.
1895 return std::nullopt;
1896 }
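    // A small illustrative walk-through of the selection above, assuming
    // hypothetical values: with OpLastLane == load A[1] and
    // ReorderingMode::Load, two unused candidates in the current lane, load
    // A[2] and an unrelated value X, are scored via getLookAheadScore(). The
    // consecutive pair (A[1], A[2]) scores highest, so its index becomes
    // BestOp.Idx, the operand may be marked IsUsed, and the caller swaps it
    // into position OpIdx.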
1897
1898 /// Helper for reorderOperandVecs.
1899 /// \returns the lane that we should start reordering from. This is the one
1900    /// which has the least number of operands that can freely move about, or
1901    /// is least profitable to reorder because it already has the best set of operands.
1902 unsigned getBestLaneToStartReordering() const {
1903 unsigned Min = UINT_MAX;
1904 unsigned SameOpNumber = 0;
1905 // std::pair<unsigned, unsigned> is used to implement a simple voting
1906 // algorithm and choose the lane with the least number of operands that
1907 // can freely move about or less profitable because it already has the
1908 // most optimal set of operands. The first unsigned is a counter for
1909 // voting, the second unsigned is the counter of lanes with instructions
1910    // with same/alternate opcodes and same parent basic block.
1911    SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
1912    // Try to be closer to the original results, if we have multiple lanes
1913 // with same cost. If 2 lanes have the same cost, use the one with the
1914 // lowest index.
1915 for (int I = getNumLanes(); I > 0; --I) {
1916 unsigned Lane = I - 1;
1917 OperandsOrderData NumFreeOpsHash =
1918 getMaxNumOperandsThatCanBeReordered(Lane);
1919 // Compare the number of operands that can move and choose the one with
1920 // the least number.
1921 if (NumFreeOpsHash.NumOfAPOs < Min) {
1922 Min = NumFreeOpsHash.NumOfAPOs;
1923 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1924 HashMap.clear();
1925 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1926 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1927 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1928 // Select the most optimal lane in terms of number of operands that
1929 // should be moved around.
1930 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1931 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1932 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1933 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1934 auto It = HashMap.find(NumFreeOpsHash.Hash);
1935 if (It == HashMap.end())
1936 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1937 else
1938 ++It->second.first;
1939 }
1940 }
1941 // Select the lane with the minimum counter.
1942 unsigned BestLane = 0;
1943 unsigned CntMin = UINT_MAX;
1944 for (const auto &Data : reverse(HashMap)) {
1945 if (Data.second.first < CntMin) {
1946 CntMin = Data.second.first;
1947 BestLane = Data.second.second;
1948 }
1949 }
1950 return BestLane;
1951 }
1952
1953 /// Data structure that helps to reorder operands.
1954 struct OperandsOrderData {
1955 /// The best number of operands with the same APOs, which can be
1956 /// reordered.
1957 unsigned NumOfAPOs = UINT_MAX;
1958 /// Number of operands with the same/alternate instruction opcode and
1959 /// parent.
1960 unsigned NumOpsWithSameOpcodeParent = 0;
1961 /// Hash for the actual operands ordering.
1962 /// Used to count operands, actually their position id and opcode
1963 /// value. It is used in the voting mechanism to find the lane with the
1964 /// least number of operands that can freely move about or less profitable
1965 /// because it already has the most optimal set of operands. Can be
1966 /// replaced with SmallVector<unsigned> instead but hash code is faster
1967 /// and requires less memory.
1968 unsigned Hash = 0;
1969 };
1970 /// \returns the maximum number of operands that are allowed to be reordered
1971    /// for \p Lane and the number of compatible instructions (with the same
1972 /// parent/opcode). This is used as a heuristic for selecting the first lane
1973 /// to start operand reordering.
1974 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1975 unsigned CntTrue = 0;
1976 unsigned NumOperands = getNumOperands();
1977 // Operands with the same APO can be reordered. We therefore need to count
1978 // how many of them we have for each APO, like this: Cnt[APO] = x.
1979 // Since we only have two APOs, namely true and false, we can avoid using
1980 // a map. Instead we can simply count the number of operands that
1981 // correspond to one of them (in this case the 'true' APO), and calculate
1982 // the other by subtracting it from the total number of operands.
1983 // Operands with the same instruction opcode and parent are more
1984 // profitable since we don't need to move them in many cases, with a high
1985 // probability such lane already can be vectorized effectively.
1986 bool AllUndefs = true;
1987 unsigned NumOpsWithSameOpcodeParent = 0;
1988 Instruction *OpcodeI = nullptr;
1989 BasicBlock *Parent = nullptr;
1990 unsigned Hash = 0;
1991 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1992 const OperandData &OpData = getData(OpIdx, Lane);
1993 if (OpData.APO)
1994 ++CntTrue;
1995 // Use Boyer-Moore majority voting for finding the majority opcode and
1996 // the number of times it occurs.
1997 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
1998 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
1999 I->getParent() != Parent) {
2000 if (NumOpsWithSameOpcodeParent == 0) {
2001 NumOpsWithSameOpcodeParent = 1;
2002 OpcodeI = I;
2003 Parent = I->getParent();
2004 } else {
2005 --NumOpsWithSameOpcodeParent;
2006 }
2007 } else {
2008 ++NumOpsWithSameOpcodeParent;
2009 }
2010 }
2011 Hash = hash_combine(
2012 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2013 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2014 }
2015 if (AllUndefs)
2016 return {};
2017 OperandsOrderData Data;
2018 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2019 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2020 Data.Hash = Hash;
2021 return Data;
2022 }
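    // Illustrative trace of the majority voting above, assuming a hypothetical
    // lane whose operands are {add, add, load, add}: the counter evolves
    // 1, 2, 1, 2, so NumOpsWithSameOpcodeParent == 2 even though three of the
    // four operands are adds. The counter is only an approximation (majority
    // count minus the rest), which is cheap to compute and sufficient for the
    // lane-selection heuristic.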
2023
2024 /// Go through the instructions in VL and append their operands.
2025 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2026 assert(!VL.empty() && "Bad VL");
2027 assert((empty() || VL.size() == getNumLanes()) &&
2028 "Expected same number of lanes");
2029 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2030 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2031 OpsVec.resize(NumOperands);
2032 unsigned NumLanes = VL.size();
2033 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2034 OpsVec[OpIdx].resize(NumLanes);
2035 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2036 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2037 // Our tree has just 3 nodes: the root and two operands.
2038 // It is therefore trivial to get the APO. We only need to check the
2039 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2040 // RHS operand. The LHS operand of both add and sub is never attached
2041              // to an inverse operation in the linearized form, therefore its APO
2042 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2043
2044 // Since operand reordering is performed on groups of commutative
2045 // operations or alternating sequences (e.g., +, -), we can safely
2046 // tell the inverse operations by checking commutativity.
2047 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2048 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2049 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2050 APO, false};
2051 }
2052 }
2053 }
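    // Resulting layout for a hypothetical two-lane bundle
    // VL = {sub a0, b0; sub a1, b1}:
    //   OpsVec[0] = { {a0, APO=false}, {a1, APO=false} }  // LHS operands
    //   OpsVec[1] = { {b0, APO=true},  {b1, APO=true} }   // RHS of the subs
    // i.e. operands are grouped per operand index first and per lane second,
    // which is what getData(OpIdx, Lane) indexes into.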
2054
2055 /// \returns the number of operands.
2056 unsigned getNumOperands() const { return OpsVec.size(); }
2057
2058 /// \returns the number of lanes.
2059 unsigned getNumLanes() const { return OpsVec[0].size(); }
2060
2061 /// \returns the operand value at \p OpIdx and \p Lane.
2062 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2063 return getData(OpIdx, Lane).V;
2064 }
2065
2066 /// \returns true if the data structure is empty.
2067 bool empty() const { return OpsVec.empty(); }
2068
2069 /// Clears the data.
2070 void clear() { OpsVec.clear(); }
2071
2072 /// \Returns true if there are enough operands identical to \p Op to fill
2073 /// the whole vector.
2074 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2075 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2076 bool OpAPO = getData(OpIdx, Lane).APO;
2077 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2078 if (Ln == Lane)
2079 continue;
2080 // This is set to true if we found a candidate for broadcast at Lane.
2081 bool FoundCandidate = false;
2082 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2083 OperandData &Data = getData(OpI, Ln);
2084 if (Data.APO != OpAPO || Data.IsUsed)
2085 continue;
2086 if (Data.V == Op) {
2087 FoundCandidate = true;
2088 Data.IsUsed = true;
2089 break;
2090 }
2091 }
2092 if (!FoundCandidate)
2093 return false;
2094 }
2095 return true;
2096 }
2097
2098 public:
2099 /// Initialize with all the operands of the instruction vector \p RootVL.
2100    VLOperands(ArrayRef<Value *> RootVL, const TargetLibraryInfo &TLI,
2101               const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
2102 : TLI(TLI), DL(DL), SE(SE), R(R) {
2103 // Append all the operands of RootVL.
2104 appendOperandsOfVL(RootVL);
2105 }
2106
2107 /// \Returns a value vector with the operands across all lanes for the
2108    /// operand at \p OpIdx.
2109 ValueList getVL(unsigned OpIdx) const {
2110 ValueList OpVL(OpsVec[OpIdx].size());
2111 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2112 "Expected same num of lanes across all operands");
2113 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2114 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2115 return OpVL;
2116 }
2117
2118 // Performs operand reordering for 2 or more operands.
2119 // The original operands are in OrigOps[OpIdx][Lane].
2120 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2121 void reorder() {
2122 unsigned NumOperands = getNumOperands();
2123 unsigned NumLanes = getNumLanes();
2124 // Each operand has its own mode. We are using this mode to help us select
2125 // the instructions for each lane, so that they match best with the ones
2126 // we have selected so far.
2127 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2128
2129 // This is a greedy single-pass algorithm. We are going over each lane
2130 // once and deciding on the best order right away with no back-tracking.
2131 // However, in order to increase its effectiveness, we start with the lane
2132 // that has operands that can move the least. For example, given the
2133 // following lanes:
2134 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2135 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2136 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2137 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2138 // we will start at Lane 1, since the operands of the subtraction cannot
2139 // be reordered. Then we will visit the rest of the lanes in a circular
2140 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2141
2142 // Find the first lane that we will start our search from.
2143 unsigned FirstLane = getBestLaneToStartReordering();
2144
2145 // Initialize the modes.
2146 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2147 Value *OpLane0 = getValue(OpIdx, FirstLane);
2148 // Keep track if we have instructions with all the same opcode on one
2149 // side.
2150 if (isa<LoadInst>(OpLane0))
2151 ReorderingModes[OpIdx] = ReorderingMode::Load;
2152 else if (isa<Instruction>(OpLane0)) {
2153 // Check if OpLane0 should be broadcast.
2154 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2155 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2156 else
2157 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2158 }
2159 else if (isa<Constant>(OpLane0))
2160 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2161 else if (isa<Argument>(OpLane0))
2162 // Our best hope is a Splat. It may save some cost in some cases.
2163 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2164 else
2165 // NOTE: This should be unreachable.
2166 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2167 }
2168
2169 // Check that we don't have same operands. No need to reorder if operands
2170 // are just perfect diamond or shuffled diamond match. Do not do it only
2171 // for possible broadcasts or non-power of 2 number of scalars (just for
2172 // now).
2173 auto &&SkipReordering = [this]() {
2174 SmallPtrSet<Value *, 4> UniqueValues;
2175 ArrayRef<OperandData> Op0 = OpsVec.front();
2176 for (const OperandData &Data : Op0)
2177 UniqueValues.insert(Data.V);
2178 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2179 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2180 return !UniqueValues.contains(Data.V);
2181 }))
2182 return false;
2183 }
2184 // TODO: Check if we can remove a check for non-power-2 number of
2185 // scalars after full support of non-power-2 vectorization.
2186 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2187 };
2188
2189 // If the initial strategy fails for any of the operand indexes, then we
2190 // perform reordering again in a second pass. This helps avoid assigning
2191 // high priority to the failed strategy, and should improve reordering for
2192 // the non-failed operand indexes.
2193 for (int Pass = 0; Pass != 2; ++Pass) {
2194      // Check if there is no need to reorder the operands, since they
2195      // already form a perfect or shuffled diamond match.
2196      // Need to do it to avoid extra external use cost counting for
2197 // shuffled matches, which may cause regressions.
2198 if (SkipReordering())
2199 break;
2200 // Skip the second pass if the first pass did not fail.
2201 bool StrategyFailed = false;
2202 // Mark all operand data as free to use.
2203 clearUsed();
2204 // We keep the original operand order for the FirstLane, so reorder the
2205 // rest of the lanes. We are visiting the nodes in a circular fashion,
2206 // using FirstLane as the center point and increasing the radius
2207 // distance.
2208 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2209 for (unsigned I = 0; I < NumOperands; ++I)
2210 MainAltOps[I].push_back(getData(I, FirstLane).V);
2211
2212 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2213 // Visit the lane on the right and then the lane on the left.
2214 for (int Direction : {+1, -1}) {
2215 int Lane = FirstLane + Direction * Distance;
2216 if (Lane < 0 || Lane >= (int)NumLanes)
2217 continue;
2218 int LastLane = Lane - Direction;
2219 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2220 "Out of bounds");
2221 // Look for a good match for each operand.
2222 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2223 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2224 std::optional<unsigned> BestIdx = getBestOperand(
2225 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2226 // By not selecting a value, we allow the operands that follow to
2227 // select a better matching value. We will get a non-null value in
2228 // the next run of getBestOperand().
2229 if (BestIdx) {
2230 // Swap the current operand with the one returned by
2231 // getBestOperand().
2232 swap(OpIdx, *BestIdx, Lane);
2233 } else {
2234 // We failed to find a best operand, set mode to 'Failed'.
2235 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2236 // Enable the second pass.
2237 StrategyFailed = true;
2238 }
2239 // Try to get the alternate opcode and follow it during analysis.
2240 if (MainAltOps[OpIdx].size() != 2) {
2241 OperandData &AltOp = getData(OpIdx, Lane);
2242 InstructionsState OpS =
2243 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2244 if (OpS.getOpcode() && OpS.isAltShuffle())
2245 MainAltOps[OpIdx].push_back(AltOp.V);
2246 }
2247 }
2248 }
2249 }
2250 // Skip second pass if the strategy did not fail.
2251 if (!StrategyFailed)
2252 break;
2253 }
2254 }
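    // Illustrative outcome on the four-lane example from the comment above,
    // assuming FirstLane == 1 and that the B[i] and C[i] values score well as
    // consecutive accesses:
    //   before:  Lane0: B[0], C[0]  Lane1: C[1], B[1]
    //            Lane2: B[2], C[2]  Lane3: C[3], B[3]
    //   after:   operand 0 = {C[0], C[1], C[2], C[3]}
    //            operand 1 = {B[0], B[1], B[2], B[3]}
    // because the commutative adds in lanes 0 and 2 can have their operands
    // swapped to match the fixed operand order of the subtraction in lane 1.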
2255
2256#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2257 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2258 switch (RMode) {
2259 case ReorderingMode::Load:
2260 return "Load";
2261 case ReorderingMode::Opcode:
2262 return "Opcode";
2263 case ReorderingMode::Constant:
2264 return "Constant";
2265 case ReorderingMode::Splat:
2266 return "Splat";
2267 case ReorderingMode::Failed:
2268 return "Failed";
2269 }
2270 llvm_unreachable("Unimplemented Reordering Type");
2271 }
2272
2273 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2274 raw_ostream &OS) {
2275 return OS << getModeStr(RMode);
2276 }
2277
2278 /// Debug print.
2279 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2280 printMode(RMode, dbgs());
2281 }
2282
2283 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2284 return printMode(RMode, OS);
2285 }
2286
2287    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2288      const unsigned Indent = 2;
2289 unsigned Cnt = 0;
2290 for (const OperandDataVec &OpDataVec : OpsVec) {
2291 OS << "Operand " << Cnt++ << "\n";
2292 for (const OperandData &OpData : OpDataVec) {
2293 OS.indent(Indent) << "{";
2294 if (Value *V = OpData.V)
2295 OS << *V;
2296 else
2297 OS << "null";
2298 OS << ", APO:" << OpData.APO << "}\n";
2299 }
2300 OS << "\n";
2301 }
2302 return OS;
2303 }
2304
2305 /// Debug print.
2306 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2307#endif
2308 };
2309
2310 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2311  /// for the pair which has the highest score, deemed to have the best chance to
2312  /// form the root of a profitable tree to vectorize. Return std::nullopt if no
2313  /// candidate scored above LookAheadHeuristics::ScoreFail.
2314  /// \param Limit The lower limit of the score considered to be good enough.
2315 std::optional<int>
2316 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2317 int Limit = LookAheadHeuristics::ScoreFail) {
2318    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2319                                  RootLookAheadMaxDepth);
2320 int BestScore = Limit;
2321 std::optional<int> Index;
2322 for (int I : seq<int>(0, Candidates.size())) {
2323 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2324 Candidates[I].second,
2325 /*U1=*/nullptr, /*U2=*/nullptr,
2326 /*Level=*/1, std::nullopt);
2327 if (Score > BestScore) {
2328 BestScore = Score;
2329 Index = I;
2330 }
2331 }
2332 return Index;
2333 }
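  // Illustrative use of the helper above, assuming hypothetical candidates
  // {{A[0], A[1]}, {A[0], X}} where the A[i] are consecutive loads and X is
  // unrelated: the first pair scores as consecutive loads while the second
  // scores ScoreFail, so index 0 is returned. If every pair scored at or below
  // Limit, std::nullopt would be returned instead.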
2334
2335 /// Checks if the instruction is marked for deletion.
2336 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2337
2338 /// Removes an instruction from its block and eventually deletes it.
2339 /// It's like Instruction::eraseFromParent() except that the actual deletion
2340 /// is delayed until BoUpSLP is destructed.
2341  void eraseInstruction(Instruction *I) {
2342    DeletedInstructions.insert(I);
2343 }
2344
2345 /// Checks if the instruction was already analyzed for being possible
2346 /// reduction root.
2347  bool isAnalyzedReductionRoot(Instruction *I) const {
2348    return AnalyzedReductionsRoots.count(I);
2349 }
2350 /// Register given instruction as already analyzed for being possible
2351 /// reduction root.
2352  void analyzedReductionRoot(Instruction *I) {
2353    AnalyzedReductionsRoots.insert(I);
2354 }
2355 /// Checks if the provided list of reduced values was checked already for
2356 /// vectorization.
2357  bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2358    return AnalyzedReductionVals.contains(hash_value(VL));
2359 }
2360 /// Adds the list of reduced values to list of already checked values for the
2361 /// vectorization.
2362  void analyzedReductionVals(ArrayRef<Value *> VL) {
2363    AnalyzedReductionVals.insert(hash_value(VL));
2364 }
2365 /// Clear the list of the analyzed reduction root instructions.
2366  void clearReductionData() {
2367    AnalyzedReductionsRoots.clear();
2368 AnalyzedReductionVals.clear();
2369 }
2370 /// Checks if the given value is gathered in one of the nodes.
2371 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2372 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2373 }
2374
2375 /// Check if the value is vectorized in the tree.
2376 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2377
2378 ~BoUpSLP();
2379
2380private:
2381 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2382 /// reordering (i.e. the operands can be reordered because they have only one
2383  /// user and are reorderable).
2384  /// \param ReorderableGathers List of all gather nodes that require reordering
2385  /// (e.g., gather of extractelements or partially vectorizable loads).
2386 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2387 /// reordering, subset of \p NonVectorized.
2388 bool
2389 canReorderOperands(TreeEntry *UserTE,
2390 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2391 ArrayRef<TreeEntry *> ReorderableGathers,
2392 SmallVectorImpl<TreeEntry *> &GatherOps);
2393
2394 /// Checks if the given \p TE is a gather node with clustered reused scalars
2395 /// and reorders it per given \p Mask.
2396 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2397
2398 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2399 /// if any. If it is not vectorized (gather node), returns nullptr.
2400 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2401 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2402 TreeEntry *TE = nullptr;
2403 const auto *It = find_if(VL, [this, &TE](Value *V) {
2404 TE = getTreeEntry(V);
2405 return TE;
2406 });
2407 if (It != VL.end() && TE->isSame(VL))
2408 return TE;
2409 return nullptr;
2410 }
2411
2412 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2413 /// if any. If it is not vectorized (gather node), returns nullptr.
2414 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2415 unsigned OpIdx) const {
2416 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2417 const_cast<TreeEntry *>(UserTE), OpIdx);
2418 }
2419
2420 /// Checks if all users of \p I are the part of the vectorization tree.
2421 bool areAllUsersVectorized(Instruction *I,
2422 ArrayRef<Value *> VectorizedVals) const;
2423
2424 /// Return information about the vector formed for the specified index
2425 /// of a vector of (the same) instruction.
2426  TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> VL,
2427                                                       unsigned OpIdx);
2428
2429 /// \returns the cost of the vectorizable entry.
2430 InstructionCost getEntryCost(const TreeEntry *E,
2431 ArrayRef<Value *> VectorizedVals);
2432
2433 /// This is the recursive part of buildTree.
2434 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2435 const EdgeInfo &EI);
2436
2437 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2438 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2439 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2440 /// returns false, setting \p CurrentOrder to either an empty vector or a
2441 /// non-identity permutation that allows to reuse extract instructions.
2442 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2443 SmallVectorImpl<unsigned> &CurrentOrder) const;
2444
2445 /// Vectorize a single entry in the tree.
2446 Value *vectorizeTree(TreeEntry *E);
2447
2448 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2449 /// \p E.
2450 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
2451
2452 /// Create a new vector from a list of scalar values. Produces a sequence
2453 /// which exploits values reused across lanes, and arranges the inserts
2454 /// for ease of later optimization.
2455 Value *createBuildVector(const TreeEntry *E);
2456
2457 /// \returns the scalarization cost for this type. Scalarization in this
2458 /// context means the creation of vectors from a group of scalars. If \p
2459 /// NeedToShuffle is true, need to add a cost of reshuffling some of the
2460 /// vector elements.
2461 InstructionCost getGatherCost(FixedVectorType *Ty,
2462 const APInt &ShuffledIndices,
2463 bool NeedToShuffle) const;
2464
2465 /// Returns the instruction in the bundle, which can be used as a base point
2466 /// for scheduling. Usually it is the last instruction in the bundle, except
2467 /// for the case when all operands are external (in this case, it is the first
2468 /// instruction in the list).
2469 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2470
2471 /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
2472 /// tree entries.
2473 /// \param TE Tree entry checked for permutation.
2474 /// \param VL List of scalars (a subset of the TE scalar), checked for
2475 /// permutations.
2476 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2477 /// previous tree entries. \p Mask is filled with the shuffle mask.
2478 std::optional<TargetTransformInfo::ShuffleKind>
2479 isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
2480                        SmallVectorImpl<int> &Mask,
2481                        SmallVectorImpl<const TreeEntry *> &Entries);
2482
2483 /// \returns the scalarization cost for this list of values. Assuming that
2484 /// this subtree gets vectorized, we may need to extract the values from the
2485 /// roots. This method calculates the cost of extracting the values.
2486 InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
2487
2488 /// Set the Builder insert point to one after the last instruction in
2489 /// the bundle
2490 void setInsertPointAfterBundle(const TreeEntry *E);
2491
2492 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2493 /// specified, the starting vector value is poison.
2494 Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr);
2495
2496 /// \returns whether the VectorizableTree is fully vectorizable and will
2497 /// be beneficial even the tree height is tiny.
2498 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2499
2500 /// Reorder commutative or alt operands to get better probability of
2501 /// generating vectorized code.
2502 static void reorderInputsAccordingToOpcode(
2503        ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
2504        SmallVectorImpl<Value *> &Right, const TargetLibraryInfo &TLI,
2505        const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R);
2506
2507 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2508 /// users of \p TE and collects the stores. It returns the map from the store
2509 /// pointers to the collected stores.
2510  DenseMap<Value *, SmallVector<StoreInst *, 4>>
2511  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2512
2513 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2514  /// stores in \p StoresVec can form a vector instruction. If so, it returns true
2515  /// and populates \p ReorderIndices with the shuffle indices of the stores
2516 /// when compared to the sorted vector.
2517 bool canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
2518 OrdersType &ReorderIndices) const;
2519
2520 /// Iterates through the users of \p TE, looking for scalar stores that can be
2521 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2522 /// their order and builds an order index vector for each store bundle. It
2523 /// returns all these order vectors found.
2524 /// We run this after the tree has formed, otherwise we may come across user
2525 /// instructions that are not yet in the tree.
2526  SmallVector<OrdersType, 1>
2527  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2528
2529 struct TreeEntry {
2530 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2531 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2532
2533 /// \returns true if the scalars in VL are equal to this entry.
2534 bool isSame(ArrayRef<Value *> VL) const {
2535 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2536 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2537 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2538 return VL.size() == Mask.size() &&
2539 std::equal(VL.begin(), VL.end(), Mask.begin(),
2540 [Scalars](Value *V, int Idx) {
2541 return (isa<UndefValue>(V) &&
2542 Idx == UndefMaskElem) ||
2543 (Idx != UndefMaskElem && V == Scalars[Idx]);
2544 });
2545 };
2546 if (!ReorderIndices.empty()) {
2547 // TODO: implement matching if the nodes are just reordered, still can
2548 // treat the vector as the same if the list of scalars matches VL
2549 // directly, without reordering.
2550        SmallVector<int> Mask;
2551        inversePermutation(ReorderIndices, Mask);
2552 if (VL.size() == Scalars.size())
2553 return IsSame(Scalars, Mask);
2554 if (VL.size() == ReuseShuffleIndices.size()) {
2555 ::addMask(Mask, ReuseShuffleIndices);
2556 return IsSame(Scalars, Mask);
2557 }
2558 return false;
2559 }
2560 return IsSame(Scalars, ReuseShuffleIndices);
2561 }
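    // Illustrative example for the mask-based comparison above, assuming
    // hypothetical scalars: with Scalars == {a, b} and
    // ReuseShuffleIndices == {0, 1, 0, 1}, isSame({a, b, a, b}) returns true
    // because every VL element matches Scalars[Mask[I]]; an undef element in
    // VL also matches when the corresponding mask entry is UndefMaskElem.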
2562
2563 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2564 return State == TreeEntry::NeedToGather &&
2565 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2566 UserTreeIndices.front().UserTE == UserEI.UserTE;
2567 }
2568
2569 /// \returns true if current entry has same operands as \p TE.
2570 bool hasEqualOperands(const TreeEntry &TE) const {
2571 if (TE.getNumOperands() != getNumOperands())
2572 return false;
2573 SmallBitVector Used(getNumOperands());
2574 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2575 unsigned PrevCount = Used.count();
2576 for (unsigned K = 0; K < E; ++K) {
2577 if (Used.test(K))
2578 continue;
2579 if (getOperand(K) == TE.getOperand(I)) {
2580 Used.set(K);
2581 break;
2582 }
2583 }
2584 // Check if we actually found the matching operand.
2585 if (PrevCount == Used.count())
2586 return false;
2587 }
2588 return true;
2589 }
2590
2591 /// \return Final vectorization factor for the node. Defined by the total
2592    /// number of vectorized scalars, including those used several times in the
2593 /// entry and counted in the \a ReuseShuffleIndices, if any.
2594 unsigned getVectorFactor() const {
2595 if (!ReuseShuffleIndices.empty())
2596 return ReuseShuffleIndices.size();
2597 return Scalars.size();
2598 };
2599
2600 /// A vector of scalars.
2601 ValueList Scalars;
2602
2603 /// The Scalars are vectorized into this value. It is initialized to Null.
2604 Value *VectorizedValue = nullptr;
2605
2606 /// Do we need to gather this sequence or vectorize it
2607 /// (either with vector instruction or with scatter/gather
2608 /// intrinsics for store/load)?
2609 enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
2610 EntryState State;
2611
2612 /// Does this sequence require some shuffling?
2613 SmallVector<int, 4> ReuseShuffleIndices;
2614
2615 /// Does this entry require reordering?
2616 SmallVector<unsigned, 4> ReorderIndices;
2617
2618 /// Points back to the VectorizableTree.
2619 ///
2620 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2621 /// to be a pointer and needs to be able to initialize the child iterator.
2622 /// Thus we need a reference back to the container to translate the indices
2623 /// to entries.
2624 VecTreeTy &Container;
2625
2626 /// The TreeEntry index containing the user of this entry. We can actually
2627 /// have multiple users so the data structure is not truly a tree.
2628 SmallVector<EdgeInfo, 1> UserTreeIndices;
2629
2630 /// The index of this treeEntry in VectorizableTree.
2631 int Idx = -1;
2632
2633 private:
2634 /// The operands of each instruction in each lane Operands[op_index][lane].
2635 /// Note: This helps avoid the replication of the code that performs the
2636 /// reordering of operands during buildTree_rec() and vectorizeTree().
2637    SmallVector<ValueList, 2> Operands;
2638
2639 /// The main/alternate instruction.
2640 Instruction *MainOp = nullptr;
2641 Instruction *AltOp = nullptr;
2642
2643 public:
2644 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2645 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2646 if (Operands.size() < OpIdx + 1)
2647 Operands.resize(OpIdx + 1);
2648 assert(Operands[OpIdx].empty() && "Already resized?");
2649 assert(OpVL.size() <= Scalars.size() &&
2650 "Number of operands is greater than the number of scalars.");
2651 Operands[OpIdx].resize(OpVL.size());
2652 copy(OpVL, Operands[OpIdx].begin());
2653 }
2654
2655 /// Set the operands of this bundle in their original order.
2656 void setOperandsInOrder() {
2657 assert(Operands.empty() && "Already initialized?");
2658 auto *I0 = cast<Instruction>(Scalars[0]);
2659 Operands.resize(I0->getNumOperands());
2660 unsigned NumLanes = Scalars.size();
2661 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2662 OpIdx != NumOperands; ++OpIdx) {
2663 Operands[OpIdx].resize(NumLanes);
2664 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2665 auto *I = cast<Instruction>(Scalars[Lane]);
2666 assert(I->getNumOperands() == NumOperands &&
2667 "Expected same number of operands");
2668 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2669 }
2670 }
2671 }
2672
2673 /// Reorders operands of the node to the given mask \p Mask.
2674 void reorderOperands(ArrayRef<int> Mask) {
2675 for (ValueList &Operand : Operands)
2676 reorderScalars(Operand, Mask);
2677 }
2678
2679 /// \returns the \p OpIdx operand of this TreeEntry.
2680 ValueList &getOperand(unsigned OpIdx) {
2681 assert(OpIdx < Operands.size() && "Off bounds");
2682 return Operands[OpIdx];
2683 }
2684
2685 /// \returns the \p OpIdx operand of this TreeEntry.
2686 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2687 assert(OpIdx < Operands.size() && "Off bounds");
2688 return Operands[OpIdx];
2689 }
2690
2691 /// \returns the number of operands.
2692 unsigned getNumOperands() const { return Operands.size(); }
2693
2694 /// \return the single \p OpIdx operand.
2695 Value *getSingleOperand(unsigned OpIdx) const {
2696 assert(OpIdx < Operands.size() && "Off bounds");
2697 assert(!Operands[OpIdx].empty() && "No operand available");
2698 return Operands[OpIdx][0];
2699 }
2700
2701 /// Some of the instructions in the list have alternate opcodes.
2702 bool isAltShuffle() const { return MainOp != AltOp; }
2703
2704 bool isOpcodeOrAlt(Instruction *I) const {
2705 unsigned CheckedOpcode = I->getOpcode();
2706 return (getOpcode() == CheckedOpcode ||
2707 getAltOpcode() == CheckedOpcode);
2708 }
2709
2710 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2711 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2712 /// \p OpValue.
2713 Value *isOneOf(Value *Op) const {
2714 auto *I = dyn_cast<Instruction>(Op);
2715 if (I && isOpcodeOrAlt(I))
2716 return Op;
2717 return MainOp;
2718 }
2719
2720 void setOperations(const InstructionsState &S) {
2721 MainOp = S.MainOp;
2722 AltOp = S.AltOp;
2723 }
2724
2725 Instruction *getMainOp() const {
2726 return MainOp;
2727 }
2728
2729 Instruction *getAltOp() const {
2730 return AltOp;
2731 }
2732
2733 /// The main/alternate opcodes for the list of instructions.
2734 unsigned getOpcode() const {
2735 return MainOp ? MainOp->getOpcode() : 0;
2736 }
2737
2738 unsigned getAltOpcode() const {
2739 return AltOp ? AltOp->getOpcode() : 0;
2740 }
2741
2742 /// When ReuseReorderShuffleIndices is empty it just returns position of \p
2743 /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
2744 int findLaneForValue(Value *V) const {
2745 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2746 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2747 if (!ReorderIndices.empty())
2748 FoundLane = ReorderIndices[FoundLane];
2749 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2750 if (!ReuseShuffleIndices.empty()) {
2751 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2752 find(ReuseShuffleIndices, FoundLane));
2753 }
2754 return FoundLane;
2755 }
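    // Worked example for the lane lookup above, assuming a hypothetical entry
    // with Scalars == {a, b, c, d}, ReorderIndices == {3, 2, 1, 0} and
    // ReuseShuffleIndices == {2, 3, 0, 1}: findLaneForValue(c) starts at
    // position 2, is remapped by ReorderIndices to 1, and is finally found at
    // position 3 of ReuseShuffleIndices, which is the lane of c in the emitted
    // vector.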
2756
2757#ifndef NDEBUG
2758 /// Debug printer.
2759 LLVM_DUMP_METHOD void dump() const {
2760 dbgs() << Idx << ".\n";
2761 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2762 dbgs() << "Operand " << OpI << ":\n";
2763 for (const Value *V : Operands[OpI])
2764 dbgs().indent(2) << *V << "\n";
2765 }
2766 dbgs() << "Scalars: \n";
2767 for (Value *V : Scalars)
2768 dbgs().indent(2) << *V << "\n";
2769 dbgs() << "State: ";
2770 switch (State) {
2771 case Vectorize:
2772 dbgs() << "Vectorize\n";
2773 break;
2774 case ScatterVectorize:
2775 dbgs() << "ScatterVectorize\n";
2776 break;
2777 case NeedToGather:
2778 dbgs() << "NeedToGather\n";
2779 break;
2780 }
2781 dbgs() << "MainOp: ";
2782 if (MainOp)
2783 dbgs() << *MainOp << "\n";
2784 else
2785 dbgs() << "NULL\n";
2786 dbgs() << "AltOp: ";
2787 if (AltOp)
2788 dbgs() << *AltOp << "\n";
2789 else
2790 dbgs() << "NULL\n";
2791 dbgs() << "VectorizedValue: ";
2792 if (VectorizedValue)
2793 dbgs() << *VectorizedValue << "\n";
2794 else
2795 dbgs() << "NULL\n";
2796 dbgs() << "ReuseShuffleIndices: ";
2797 if (ReuseShuffleIndices.empty())
2798 dbgs() << "Empty";
2799 else
2800 for (int ReuseIdx : ReuseShuffleIndices)
2801 dbgs() << ReuseIdx << ", ";
2802 dbgs() << "\n";
2803 dbgs() << "ReorderIndices: ";
2804 for (unsigned ReorderIdx : ReorderIndices)
2805 dbgs() << ReorderIdx << ", ";
2806 dbgs() << "\n";
2807 dbgs() << "UserTreeIndices: ";
2808 for (const auto &EInfo : UserTreeIndices)
2809 dbgs() << EInfo << ", ";
2810 dbgs() << "\n";
2811 }
2812#endif
2813 };
2814
2815#ifndef NDEBUG
2816 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2817 InstructionCost VecCost, InstructionCost ScalarCost,
2818 StringRef Banner) const {
2819 dbgs() << "SLP: " << Banner << ":\n";
2820 E->dump();
2821 dbgs() << "SLP: Costs:\n";
2822 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2823 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2824 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2825 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2826 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2827 }
2828#endif
2829
2830 /// Create a new VectorizableTree entry.
2831 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2832 std::optional<ScheduleData *> Bundle,
2833 const InstructionsState &S,
2834 const EdgeInfo &UserTreeIdx,
2835 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2836 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2837 TreeEntry::EntryState EntryState =
2838 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2839 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2840 ReuseShuffleIndices, ReorderIndices);
2841 }
2842
2843 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2844 TreeEntry::EntryState EntryState,
2845 std::optional<ScheduleData *> Bundle,
2846 const InstructionsState &S,
2847 const EdgeInfo &UserTreeIdx,
2848 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2849 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2850 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2851 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2852 "Need to vectorize gather entry?");
2853 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2854 TreeEntry *Last = VectorizableTree.back().get();
2855 Last->Idx = VectorizableTree.size() - 1;
2856 Last->State = EntryState;
2857 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2858 ReuseShuffleIndices.end());
2859 if (ReorderIndices.empty()) {
2860 Last->Scalars.assign(VL.begin(), VL.end());
2861 Last->setOperations(S);
2862 } else {
2863 // Reorder scalars and build final mask.
2864 Last->Scalars.assign(VL.size(), nullptr);
2865 transform(ReorderIndices, Last->Scalars.begin(),
2866 [VL](unsigned Idx) -> Value * {
2867 if (Idx >= VL.size())
2868 return UndefValue::get(VL.front()->getType());
2869 return VL[Idx];
2870 });
2871 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
2872 Last->setOperations(S);
2873 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2874 }
2875 if (Last->State != TreeEntry::NeedToGather) {
2876 for (Value *V : VL) {
2877 assert(!getTreeEntry(V) && "Scalar already in tree!");
2878 ScalarToTreeEntry[V] = Last;
2879 }
2880 // Update the scheduler bundle to point to this TreeEntry.
2881 ScheduleData *BundleMember = *Bundle;
2882 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2883 isVectorLikeInstWithConstOps(S.MainOp) ||
2884 doesNotNeedToSchedule(VL)) &&
2885 "Bundle and VL out of sync");
2886 if (BundleMember) {
2887 for (Value *V : VL) {
2888          if (doesNotNeedToBeScheduled(V))
2889            continue;
2890 assert(BundleMember && "Unexpected end of bundle.");
2891 BundleMember->TE = Last;
2892 BundleMember = BundleMember->NextInBundle;
2893 }
2894 }
2895 assert(!BundleMember && "Bundle and VL out of sync");
2896 } else {
2897 MustGather.insert(VL.begin(), VL.end());
2898 }
2899
2900 if (UserTreeIdx.UserTE)
2901 Last->UserTreeIndices.push_back(UserTreeIdx);
2902
2903 return Last;
2904 }
2905
2906 /// -- Vectorization State --
2907 /// Holds all of the tree entries.
2908 TreeEntry::VecTreeTy VectorizableTree;
2909
2910#ifndef NDEBUG
2911 /// Debug printer.
2912 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
2913 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
2914 VectorizableTree[Id]->dump();
2915 dbgs() << "\n";
2916 }
2917 }
2918#endif
2919
2920 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
2921
2922 const TreeEntry *getTreeEntry(Value *V) const {
2923 return ScalarToTreeEntry.lookup(V);
2924 }
2925
2926 /// Maps a specific scalar to its tree entry.
2927 SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
2928
2929 /// Maps a value to the proposed vectorizable size.
2930 SmallDenseMap<Value *, unsigned> InstrElementSize;
2931
2932 /// A list of scalars that we found that we need to keep as scalars.
2933 ValueSet MustGather;
2934
2935 /// A map between the vectorized entries and the last instructions in the
2936 /// bundles. The bundles are built in use order, not in the def order of the
2937 /// instructions. So, we cannot rely directly on the last instruction in the
2938  /// bundle being the last instruction in program order during the
2939  /// vectorization process, since the basic blocks are affected; we need to
2940  /// pre-gather the last instructions beforehand.
2941 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
2942
2943 /// List of gather nodes, depending on other gather/vector nodes, which should
2944 /// be emitted after the vector instruction emission process to correctly
2945 /// handle order of the vector instructions and shuffles.
2946 SetVector<const TreeEntry *> PostponedGathers;
2947
2948 /// This POD struct describes one external user in the vectorized tree.
2949 struct ExternalUser {
2950 ExternalUser(Value *S, llvm::User *U, int L)
2951 : Scalar(S), User(U), Lane(L) {}
2952
2953 // Which scalar in our function.
2954 Value *Scalar;
2955
2956    // The user that uses the scalar.
2957    llvm::User *User;
2958
2959 // Which lane does the scalar belong to.
2960 int Lane;
2961 };
2962 using UserList = SmallVector<ExternalUser, 16>;
2963
2964 /// Checks if two instructions may access the same memory.
2965 ///
2966 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
2967 /// is invariant in the calling loop.
2968 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
2969 Instruction *Inst2) {
2970 // First check if the result is already in the cache.
2971 AliasCacheKey key = std::make_pair(Inst1, Inst2);
2972 std::optional<bool> &result = AliasCache[key];
2973 if (result) {
2974 return *result;
2975 }
2976 bool aliased = true;
2977 if (Loc1.Ptr && isSimple(Inst1))
2978 aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
2979 // Store the result in the cache.
2980 result = aliased;
2981 return aliased;
2982 }
2983
2984 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
2985
2986 /// Cache for alias results.
2987 /// TODO: consider moving this to the AliasAnalysis itself.
2988  DenseMap<AliasCacheKey, std::optional<bool>> AliasCache;
2989
2990 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
2991 // globally through SLP because we don't perform any action which
2992 // invalidates capture results.
2993 BatchAAResults BatchAA;
2994
2995 /// Temporary store for deleted instructions. Instructions will be deleted
2996 /// eventually when the BoUpSLP is destructed. The deferral is required to
2997 /// ensure that there are no incorrect collisions in the AliasCache, which
2998 /// can happen if a new instruction is allocated at the same address as a
2999 /// previously deleted instruction.
3000 DenseSet<Instruction *> DeletedInstructions;
3001
3002 /// Set of instructions already analyzed for reductions.
3003 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3004
3005 /// Set of hashes for the list of reduction values already being analyzed.
3006 DenseSet<size_t> AnalyzedReductionVals;
3007
3008 /// A list of values that need to be extracted out of the tree.
3009 /// This list holds pairs of (Internal Scalar : External User). External User
3010 /// can be nullptr, which means that this Internal Scalar will be used later,
3011 /// after vectorization.
3012 UserList ExternalUses;
3013
3014 /// Values used only by @llvm.assume calls.
3015 SmallPtrSet<const Value *, 32> EphValues;
3016
3017 /// Holds all of the instructions that we gathered, shuffle instructions and
3018 /// extractelements.
3019 SetVector<Instruction *> GatherShuffleExtractSeq;
3020
3021 /// A list of blocks that we are going to CSE.
3022 SetVector<BasicBlock *> CSEBlocks;
3023
3024 /// Contains all scheduling relevant data for an instruction.
3025 /// A ScheduleData either represents a single instruction or a member of an
3026 /// instruction bundle (= a group of instructions which is combined into a
3027 /// vector instruction).
3028 struct ScheduleData {
3029 // The initial value for the dependency counters. It means that the
3030 // dependencies are not calculated yet.
3031 enum { InvalidDeps = -1 };
3032
3033 ScheduleData() = default;
3034
3035 void init(int BlockSchedulingRegionID, Value *OpVal) {
3036 FirstInBundle = this;
3037 NextInBundle = nullptr;
3038 NextLoadStore = nullptr;
3039 IsScheduled = false;
3040 SchedulingRegionID = BlockSchedulingRegionID;
3041 clearDependencies();
3042 OpValue = OpVal;
3043 TE = nullptr;
3044 }
3045
3046 /// Verify basic self consistency properties
3047 void verify() {
3048 if (hasValidDependencies()) {
3049 assert(UnscheduledDeps <= Dependencies && "invariant");
3050 } else {
3051 assert(UnscheduledDeps == Dependencies && "invariant");
3052 }
3053
3054 if (IsScheduled) {
3055 assert(isSchedulingEntity() &&
3056 "unexpected scheduled state");
3057 for (const ScheduleData *BundleMember = this; BundleMember;
3058 BundleMember = BundleMember->NextInBundle) {
3059 assert(BundleMember->hasValidDependencies() &&
3060 BundleMember->UnscheduledDeps == 0 &&
3061 "unexpected scheduled state");
3062 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3063 "only bundle is marked scheduled");
3064 }
3065 }
3066
3067 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3068 "all bundle members must be in same basic block");
3069 }
3070
3071 /// Returns true if the dependency information has been calculated.
3072 /// Note that dependency validity can vary between instructions within
3073 /// a single bundle.
3074 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3075
3076 /// Returns true for single instructions and for bundle representatives
3077 /// (= the head of a bundle).
3078 bool isSchedulingEntity() const { return FirstInBundle == this; }
3079
3080 /// Returns true if it represents an instruction bundle and not only a
3081 /// single instruction.
3082 bool isPartOfBundle() const {
3083 return NextInBundle != nullptr || FirstInBundle != this || TE;
3084 }
3085
3086 /// Returns true if it is ready for scheduling, i.e. it has no more
3087 /// unscheduled depending instructions/bundles.
3088 bool isReady() const {
3089 assert(isSchedulingEntity() &&
3090 "can't consider non-scheduling entity for ready list");
3091 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3092 }
3093
3094 /// Modifies the number of unscheduled dependencies for this instruction,
3095 /// and returns the number of remaining dependencies for the containing
3096 /// bundle.
3097 int incrementUnscheduledDeps(int Incr) {
3098 assert(hasValidDependencies() &&
3099 "increment of unscheduled deps would be meaningless");
3100 UnscheduledDeps += Incr;
3101 return FirstInBundle->unscheduledDepsInBundle();
3102 }
3103
3104 /// Sets the number of unscheduled dependencies to the number of
3105 /// dependencies.
3106 void resetUnscheduledDeps() {
3107 UnscheduledDeps = Dependencies;
3108 }
3109
3110 /// Clears all dependency information.
3111 void clearDependencies() {
3112 Dependencies = InvalidDeps;
3113 resetUnscheduledDeps();
3114 MemoryDependencies.clear();
3115 ControlDependencies.clear();
3116 }
3117
3118 int unscheduledDepsInBundle() const {
3119 assert(isSchedulingEntity() && "only meaningful on the bundle");
3120 int Sum = 0;
3121 for (const ScheduleData *BundleMember = this; BundleMember;
3122 BundleMember = BundleMember->NextInBundle) {
3123 if (BundleMember->UnscheduledDeps == InvalidDeps)
3124 return InvalidDeps;
3125 Sum += BundleMember->UnscheduledDeps;
3126 }
3127 return Sum;
3128 }
3129
3130 void dump(raw_ostream &os) const {
3131 if (!isSchedulingEntity()) {
3132 os << "/ " << *Inst;
3133 } else if (NextInBundle) {
3134 os << '[' << *Inst;
3135 ScheduleData *SD = NextInBundle;
3136 while (SD) {
3137 os << ';' << *SD->Inst;
3138 SD = SD->NextInBundle;
3139 }
3140 os << ']';
3141 } else {
3142 os << *Inst;
3143 }
3144 }
3145
3146 Instruction *Inst = nullptr;
3147
3148 /// Opcode of the current instruction in the schedule data.
3149 Value *OpValue = nullptr;
3150
3151 /// The TreeEntry that this instruction corresponds to.
3152 TreeEntry *TE = nullptr;
3153
3154 /// Points to the head in an instruction bundle (and always to this for
3155 /// single instructions).
3156 ScheduleData *FirstInBundle = nullptr;
3157
3158 /// Single linked list of all instructions in a bundle. Null if it is a
3159 /// single instruction.
3160 ScheduleData *NextInBundle = nullptr;
3161
3162 /// Single linked list of all memory instructions (e.g. load, store, call)
3163 /// in the block - until the end of the scheduling region.
3164 ScheduleData *NextLoadStore = nullptr;
3165
3166 /// The dependent memory instructions.
3167 /// This list is derived on demand in calculateDependencies().
3168 SmallVector<ScheduleData *, 4> MemoryDependencies;
3169
3170 /// List of instructions which this instruction could be control dependent
3171 /// on. Allowing such nodes to be scheduled below this one could introduce
3172 /// a runtime fault which didn't exist in the original program.
3173 /// ex: this is a load or udiv following a readonly call which inf loops
3174 SmallVector<ScheduleData *, 4> ControlDependencies;
3175
3176 /// This ScheduleData is in the current scheduling region if this matches
3177 /// the current SchedulingRegionID of BlockScheduling.
3178 int SchedulingRegionID = 0;
3179
3180 /// Used for getting a "good" final ordering of instructions.
3181 int SchedulingPriority = 0;
3182
3183 /// The number of dependencies. This is the number of users of the
3184 /// instruction plus the number of dependent memory instructions (if any).
3185 /// This value is calculated on demand.
3186 /// If InvalidDeps, the number of dependencies is not calculated yet.
3187 int Dependencies = InvalidDeps;
3188
3189 /// The number of dependencies minus the number of dependencies of scheduled
3190 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3191 /// for scheduling.
3192 /// Note that this is negative as long as Dependencies is not calculated.
3193 int UnscheduledDeps = InvalidDeps;
3194
3195 /// True if this instruction is scheduled (or considered as scheduled in the
3196 /// dry-run).
3197 bool IsScheduled = false;
3198 };
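// A small illustrative trace of the counters above (hypothetical numbers): an
// instruction with two in-region users and one dependent memory instruction
// gets Dependencies = 3 once calculateDependencies() has run, and
// UnscheduledDeps starts at the same value. Each time one of those users or
// dependent instructions is scheduled, incrementUnscheduledDeps(-1) is invoked
// on this ScheduleData:
//
//   Dependencies    = 3                 // fixed after dependence calculation
//   UnscheduledDeps = 3 -> 2 -> 1 -> 0
//
// When unscheduledDepsInBundle() reaches 0 for the bundle head, isReady()
// becomes true and the bundle can be moved to the ready list.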
3199
3200#ifndef NDEBUG
3201 friend inline raw_ostream &operator<<(raw_ostream &os,
3202 const BoUpSLP::ScheduleData &SD) {
3203 SD.dump(os);
3204 return os;
3205 }
3206#endif
3207
3208 friend struct GraphTraits<BoUpSLP *>;
3209 friend struct DOTGraphTraits<BoUpSLP *>;
3210
3211 /// Contains all scheduling data for a basic block.
3212 /// It does not schedule instructions that are not memory read/write
3213 /// instructions and whose operands are either constants, arguments, phis, or
3214 /// instructions from other blocks, or whose users are phis or live in other
3215 /// blocks. The resulting vector instructions can be placed at the beginning
3216 /// of the basic block without scheduling (if their operands do not need to
3217 /// be scheduled) or at the end of the block (if their users are outside of
3218 /// the block). This saves some compile time and memory used by the
3219 /// compiler.
3220 /// ScheduleData is assigned to each instruction between the boundaries of
3221 /// the tree entry, even to those that are not part of the graph. This is
3222 /// required to correctly follow the dependencies between the instructions
3223 /// and to schedule them correctly. ScheduleData is not allocated for
3224 /// instructions that do not require scheduling, such as phis, nodes with
3225 /// only extractelements/insertelements, or nodes whose instructions have
3226 /// uses/operands outside of the block.
3227 struct BlockScheduling {
3228 BlockScheduling(BasicBlock *BB)
3229 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3230
3231 void clear() {
3232 ReadyInsts.clear();
3233 ScheduleStart = nullptr;
3234 ScheduleEnd = nullptr;
3235 FirstLoadStoreInRegion = nullptr;
3236 LastLoadStoreInRegion = nullptr;
3237 RegionHasStackSave = false;
3238
3239 // Reduce the maximum schedule region size by the size of the
3240 // previous scheduling run.
3241 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3242 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3243 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3244 ScheduleRegionSize = 0;
3245
3246 // Make a new scheduling region, i.e. all existing ScheduleData is not
3247 // in the new region yet.
3248 ++SchedulingRegionID;
3249 }
3250
3251 ScheduleData *getScheduleData(Instruction *I) {
3252 if (BB != I->getParent())
3253 // Avoid lookup if can't possibly be in map.
3254 return nullptr;
3255 ScheduleData *SD = ScheduleDataMap.lookup(I);
3256 if (SD && isInSchedulingRegion(SD))
3257 return SD;
3258 return nullptr;
3259 }
3260
3261 ScheduleData *getScheduleData(Value *V) {
3262 if (auto *I = dyn_cast<Instruction>(V))
3263 return getScheduleData(I);
3264 return nullptr;
3265 }
3266
3267 ScheduleData *getScheduleData(Value *V, Value *Key) {
3268 if (V == Key)
3269 return getScheduleData(V);
3270 auto I = ExtraScheduleDataMap.find(V);
3271 if (I != ExtraScheduleDataMap.end()) {
3272 ScheduleData *SD = I->second.lookup(Key);
3273 if (SD && isInSchedulingRegion(SD))
3274 return SD;
3275 }
3276 return nullptr;
3277 }
3278
3279 bool isInSchedulingRegion(ScheduleData *SD) const {
3280 return SD->SchedulingRegionID == SchedulingRegionID;
3281 }
3282
3283 /// Marks an instruction as scheduled and puts all dependent ready
3284 /// instructions into the ready-list.
3285 template <typename ReadyListType>
3286 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3287 SD->IsScheduled = true;
3288 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3289
3290 for (ScheduleData *BundleMember = SD; BundleMember;
3291 BundleMember = BundleMember->NextInBundle) {
3292 if (BundleMember->Inst != BundleMember->OpValue)
3293 continue;
3294
3295 // Handle the def-use chain dependencies.
3296
3297 // Decrement the unscheduled counter and insert to ready list if ready.
3298 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3299 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3300 if (OpDef && OpDef->hasValidDependencies() &&
3301 OpDef->incrementUnscheduledDeps(-1) == 0) {
3302 // There are no more unscheduled dependencies after
3303 // decrementing, so we can put the dependent instruction
3304 // into the ready list.
3305 ScheduleData *DepBundle = OpDef->FirstInBundle;
3306 assert(!DepBundle->IsScheduled &&
3307 "already scheduled bundle gets ready");
3308 ReadyList.insert(DepBundle);
3309 LLVM_DEBUG(dbgs()
3310 << "SLP: gets ready (def): " << *DepBundle << "\n");
3311 }
3312 });
3313 };
3314
3315 // If BundleMember is a vector bundle, its operands may have been
3316 // reordered during buildTree(). We therefore need to get its operands
3317 // through the TreeEntry.
3318 if (TreeEntry *TE = BundleMember->TE) {
3319 // Need to search for the lane since the tree entry can be reordered.
3320 int Lane = std::distance(TE->Scalars.begin(),
3321 find(TE->Scalars, BundleMember->Inst));
3322 assert(Lane >= 0 && "Lane not set");
3323
3324 // Since vectorization tree is being built recursively this assertion
3325 // ensures that the tree entry has all operands set before reaching
3326 // this code. Couple of exceptions known at the moment are extracts
3327 // where their second (immediate) operand is not added. Since
3328 // immediates do not affect scheduler behavior this is considered
3329 // okay.
3330 auto *In = BundleMember->Inst;
3331 assert(In &&
3332 (isa<ExtractValueInst, ExtractElementInst>(In) ||
3333 In->getNumOperands() == TE->getNumOperands()) &&
3334 "Missed TreeEntry operands?");
3335 (void)In; // fake use to avoid build failure when assertions disabled
3336
3337 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3338 OpIdx != NumOperands; ++OpIdx)
3339 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3340 DecrUnsched(I);
3341 } else {
3342 // If BundleMember is a stand-alone instruction, no operand reordering
3343 // has taken place, so we directly access its operands.
3344 for (Use &U : BundleMember->Inst->operands())
3345 if (auto *I = dyn_cast<Instruction>(U.get()))
3346 DecrUnsched(I);
3347 }
3348 // Handle the memory dependencies.
3349 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3350 if (MemoryDepSD->hasValidDependencies() &&
3351 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3352 // There are no more unscheduled dependencies after decrementing,
3353 // so we can put the dependent instruction into the ready list.
3354 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3355 assert(!DepBundle->IsScheduled &&
3356 "already scheduled bundle gets ready");
3357 ReadyList.insert(DepBundle);
3358 LLVM_DEBUG(dbgs()
3359 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3360 }
3361 }
3362 // Handle the control dependencies.
3363 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3364 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3365 // There are no more unscheduled dependencies after decrementing,
3366 // so we can put the dependent instruction into the ready list.
3367 ScheduleData *DepBundle = DepSD->FirstInBundle;
3368 assert(!DepBundle->IsScheduled &&
3369 "already scheduled bundle gets ready");
3370 ReadyList.insert(DepBundle);
3371 LLVM_DEBUG(dbgs()
3372 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3373 }
3374 }
3375
3376 }
3377 }
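// The ReadyListType template parameter above only needs to provide an
// insert(ScheduleData *) member for the methods shown here (schedule() and
// initialFillReadyList()). An illustrative, hypothetical minimal ready list
// that would satisfy that interface:
//
//   struct SimpleReadyList {
//     SmallVector<ScheduleData *, 16> List;
//     void insert(ScheduleData *SD) { List.push_back(SD); }
//   };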
3378
3379 /// Verify basic self consistency properties of the data structure.
3380 void verify() {
3381 if (!ScheduleStart)
3382 return;
3383
3384 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3385 ScheduleStart->comesBefore(ScheduleEnd) &&
3386 "Not a valid scheduling region?");
3387
3388 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3389 auto *SD = getScheduleData(I);
3390 if (!SD)
3391 continue;
3392 assert(isInSchedulingRegion(SD) &&
3393 "primary schedule data not in window?");
3394 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3395 "entire bundle in window!");
3396 (void)SD;
3397 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3398 }
3399
3400 for (auto *SD : ReadyInsts) {
3401 assert(SD->isSchedulingEntity() && SD->isReady() &&
3402 "item in ready list not ready?");
3403 (void)SD;
3404 }
3405 }
3406
3407 void doForAllOpcodes(Value *V,
3408 function_ref<void(ScheduleData *SD)> Action) {
3409 if (ScheduleData *SD = getScheduleData(V))
3410 Action(SD);
3411 auto I = ExtraScheduleDataMap.find(V);
3412 if (I != ExtraScheduleDataMap.end())
3413 for (auto &P : I->second)
3414 if (isInSchedulingRegion(P.second))
3415 Action(P.second);
3416 }
3417
3418 /// Put all instructions into the ReadyList which are ready for scheduling.
3419 template <typename ReadyListType>
3420 void initialFillReadyList(ReadyListType &ReadyList) {
3421 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3422 doForAllOpcodes(I, [&](ScheduleData *SD) {
3423 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3424 SD->isReady()) {
3425 ReadyList.insert(SD);
3426 LLVM_DEBUG(dbgs()
3427 << "SLP: initially in ready list: " << *SD << "\n");
3428 }
3429 });
3430 }
3431 }
3432
3433 /// Build a bundle from the ScheduleData nodes corresponding to the
3434 /// scalar instruction for each lane.
3435 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3436
3437 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3438 /// cyclic dependencies. This is only a dry-run, no instructions are
3439 /// actually moved at this stage.
3440 /// \returns the scheduling bundle. The returned Optional value is not
3441 /// std::nullopt if \p VL is allowed to be scheduled.
3442 std::optional<ScheduleData *>
3443 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3444 const InstructionsState &S);
3445
3446 /// Un-bundles a group of instructions.
3447 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3448
3449 /// Allocates schedule data chunk.
3450 ScheduleData *allocateScheduleDataChunks();
3451
3452 /// Extends the scheduling region so that V is inside the region.
3453 /// \returns true if the region size is within the limit.
3454 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3455
3456 /// Initialize the ScheduleData structures for new instructions in the
3457 /// scheduling region.
3458 void initScheduleData(Instruction *FromI, Instruction *ToI,
3459 ScheduleData *PrevLoadStore,
3460 ScheduleData *NextLoadStore);
3461
3462 /// Updates the dependency information of a bundle and of all instructions/
3463 /// bundles which depend on the original bundle.
3464 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3465 BoUpSLP *SLP);
3466
3467 /// Sets all instructions in the scheduling region to un-scheduled.
3468 void resetSchedule();
3469
3470 BasicBlock *BB;
3471
3472 /// Simple memory allocation for ScheduleData.
3473 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3474
3475 /// The size of a ScheduleData array in ScheduleDataChunks.
3476 int ChunkSize;
3477
3478 /// The allocator position in the current chunk, which is the last entry
3479 /// of ScheduleDataChunks.
3480 int ChunkPos;
3481
3482 /// Attaches ScheduleData to Instruction.
3483 /// Note that the mapping survives during all vectorization iterations, i.e.
3484 /// ScheduleData structures are recycled.
3485 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3486
3487 /// Attaches ScheduleData to Instruction with the leading key.
3488 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3489 ExtraScheduleDataMap;
3490
3491 /// The ready-list for scheduling (only used for the dry-run).
3492 SetVector<ScheduleData *> ReadyInsts;
3493
3494 /// The first instruction of the scheduling region.
3495 Instruction *ScheduleStart = nullptr;
3496
3497 /// The first instruction _after_ the scheduling region.
3498 Instruction *ScheduleEnd = nullptr;
3499
3500 /// The first memory accessing instruction in the scheduling region
3501 /// (can be null).
3502 ScheduleData *FirstLoadStoreInRegion = nullptr;
3503
3504 /// The last memory accessing instruction in the scheduling region
3505 /// (can be null).
3506 ScheduleData *LastLoadStoreInRegion = nullptr;
3507
3508 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3509 /// region? Used to optimize the dependence calculation for the
3510 /// common case where there isn't.
3511 bool RegionHasStackSave = false;
3512
3513 /// The current size of the scheduling region.
3514 int ScheduleRegionSize = 0;
3515
3516 /// The maximum size allowed for the scheduling region.
3517 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3518
3519 /// The ID of the scheduling region. For a new vectorization iteration this
3520 /// is incremented which "removes" all ScheduleData from the region.
3521 /// Make sure that the initial SchedulingRegionID is greater than the
3522 /// initial SchedulingRegionID in ScheduleData (which is 0).
3523 int SchedulingRegionID = 1;
3524 };
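// A rough sketch of how the pieces above are typically driven (simplified and
// hypothetical; the real call sites live in the tree-building and
// vectorization code outside this excerpt):
//
//   BlockScheduling &BS = ...;                  // per-basic-block state
//   std::optional<ScheduleData *> Bundle =
//       BS.tryScheduleBundle(VL, this, S);      // dry run: dependency check only
//   if (!Bundle)
//     BS.cancelScheduling(VL, MainOpValue);     // MainOpValue: the bundle's key value
//   ...
//   scheduleBlock(&BS);                         // actual reordering before codegen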
3525
3526 /// Attaches the BlockScheduling structures to basic blocks.
3527 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3528
3529 /// Performs the "real" scheduling. Done before vectorization is actually
3530 /// performed in a basic block.
3531 void scheduleBlock(BlockScheduling *BS);
3532
3533 /// List of users to ignore during scheduling and that don't need extracting.
3534 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3535
3536 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3537 /// sorted SmallVectors of unsigned.
3538 struct OrdersTypeDenseMapInfo {
3539 static OrdersType getEmptyKey() {
3540 OrdersType V;
3541 V.push_back(~1U);
3542 return V;
3543 }
3544
3545 static OrdersType getTombstoneKey() {
3546 OrdersType V;
3547 V.push_back(~2U);
3548 return V;
3549 }
3550
3551 static unsigned getHashValue(const OrdersType &V) {
3552 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3553 }
3554
3555 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3556 return LHS == RHS;
3557 }
3558 };
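// Example of how OrdersTypeDenseMapInfo is meant to be used (this mirrors the
// order-counting map in the reordering code further below): it lets an
// OrdersType, a sorted SmallVector of unsigned, act as a hash-map key.
//
//   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrderCount;
//   ++OrderCount[SomeOrder];   // SomeOrder: any OrdersType value to be counted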
3559
3560 // Analysis and block reference.
3561 Function *F;
3562 ScalarEvolution *SE;
3563 TargetTransformInfo *TTI;
3564 TargetLibraryInfo *TLI;
3565 LoopInfo *LI;
3566 DominatorTree *DT;
3567 AssumptionCache *AC;
3568 DemandedBits *DB;
3569 const DataLayout *DL;
3570 OptimizationRemarkEmitter *ORE;
3571
3572 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3573 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3574
3575 /// Instruction builder to construct the vectorized tree.
3576 IRBuilder<> Builder;
3577
3578 /// A map of scalar integer values to the smallest bit width with which they
3579 /// can legally be represented. The values map to (width, signed) pairs,
3580 /// where "width" indicates the minimum bit width and "signed" is True if the
3581 /// value must be signed-extended, rather than zero-extended, back to its
3582 /// original width.
3583 MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
3584};
3585
3586} // end namespace slpvectorizer
3587
3588template <> struct GraphTraits<BoUpSLP *> {
3589 using TreeEntry = BoUpSLP::TreeEntry;
3590
3591 /// NodeRef has to be a pointer per the GraphWriter.
3592 using NodeRef = TreeEntry *;
3593
3594 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3595
3596 /// Add the VectorizableTree to the index iterator to be able to return
3597 /// TreeEntry pointers.
3598 struct ChildIteratorType
3599 : public iterator_adaptor_base<
3600 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3601 ContainerTy &VectorizableTree;
3602
3603 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3604 ContainerTy &VT)
3605 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3606
3607 NodeRef operator*() { return I->UserTE; }
3608 };
3609
3610 static NodeRef getEntryNode(BoUpSLP &R) {
3611 return R.VectorizableTree[0].get();
3612 }
3613
3614 static ChildIteratorType child_begin(NodeRef N) {
3615 return {N->UserTreeIndices.begin(), N->Container};
3616 }
3617
3618 static ChildIteratorType child_end(NodeRef N) {
3619 return {N->UserTreeIndices.end(), N->Container};
3620 }
3621
3622 /// For the node iterator we just need to turn the TreeEntry iterator into a
3623 /// TreeEntry* iterator so that it dereferences to NodeRef.
3624 class nodes_iterator {
3625 using ItTy = ContainerTy::iterator;
3626 ItTy It;
3627
3628 public:
3629 nodes_iterator(const ItTy &It2) : It(It2) {}
3630 NodeRef operator*() { return It->get(); }
3631 nodes_iterator operator++() {
3632 ++It;
3633 return *this;
3634 }
3635 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3636 };
3637
3638 static nodes_iterator nodes_begin(BoUpSLP *R) {
3639 return nodes_iterator(R->VectorizableTree.begin());
3640 }
3641
3642 static nodes_iterator nodes_end(BoUpSLP *R) {
3643 return nodes_iterator(R->VectorizableTree.end());
3644 }
3645
3646 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3647};
3648
3649template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3650 using TreeEntry = BoUpSLP::TreeEntry;
3651
3652 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3653
3654 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3655 std::string Str;
3656 raw_string_ostream OS(Str);
3657 OS << Entry->Idx << ".\n";
3658 if (isSplat(Entry->Scalars))
3659 OS << "<splat> ";
3660 for (auto *V : Entry->Scalars) {
3661 OS << *V;
3662 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3663 return EU.Scalar == V;
3664 }))
3665 OS << " <extract>";
3666 OS << "\n";
3667 }
3668 return Str;
3669 }
3670
3671 static std::string getNodeAttributes(const TreeEntry *Entry,
3672 const BoUpSLP *) {
3673 if (Entry->State == TreeEntry::NeedToGather)
3674 return "color=red";
3675 if (Entry->State == TreeEntry::ScatterVectorize)
3676 return "color=blue";
3677 return "";
3678 }
3679};
3680
3681} // end namespace llvm
3682
3683 BoUpSLP::~BoUpSLP() {
3684 SmallVector<WeakTrackingVH> DeadInsts;
3685 for (auto *I : DeletedInstructions) {
3686 for (Use &U : I->operands()) {
3687 auto *Op = dyn_cast<Instruction>(U.get());
3688 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3689 wouldInstructionBeTriviallyDead(Op, TLI))
3690 DeadInsts.emplace_back(Op);
3691 }
3692 I->dropAllReferences();
3693 }
3694 for (auto *I : DeletedInstructions) {
3695 assert(I->use_empty() &&
3696 "trying to erase instruction with users.");
3697 I->eraseFromParent();
3698 }
3699
3700 // Cleanup any dead scalar code feeding the vectorized instructions
3701 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3702
3703#ifdef EXPENSIVE_CHECKS
3704 // If we could guarantee that this call is not extremely slow, we could
3705 // remove the ifdef limitation (see PR47712).
3706 assert(!verifyFunction(*F, &dbgs()));
3707#endif
3708}
3709
3710/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3711/// contains the original mask for the scalars reused in the node. The
3712/// procedure transforms this mask in accordance with the given \p Mask.
3713 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3714 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3715 "Expected non-empty mask.");
3716 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3717 Prev.swap(Reuses);
3718 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3719 if (Mask[I] != UndefMaskElem)
3720 Reuses[Mask[I]] = Prev[I];
3721}
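// Worked example for reorderReuses() (illustrative values): with
// Reuses = {3, 2, 1, 0} and Mask = {1, 0, 3, 2}, Prev captures {3, 2, 1, 0}
// and each element Prev[I] is moved to position Mask[I]:
//
//   Reuses[1] = Prev[0] = 3
//   Reuses[0] = Prev[1] = 2
//   Reuses[3] = Prev[2] = 1
//   Reuses[2] = Prev[3] = 0
//
// giving Reuses = {2, 3, 0, 1}. If Mask[I] is UndefMaskElem, Prev[I] is simply
// not propagated, and positions that are never written keep their old values.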
3722
3723/// Reorders the given \p Order according to the given \p Mask. \p Order is
3724/// the original order of the scalars. The procedure transforms the provided
3725/// order in accordance with the given \p Mask. If the resulting \p Order is
3726/// just an identity order, \p Order is cleared.
3727 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
3728 assert(!Mask.empty() && "Expected non-empty mask.");
3729 SmallVector<int> MaskOrder;
3730 if (Order.empty()) {
3731 MaskOrder.resize(Mask.size());
3732 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3733 } else {
3734 inversePermutation(Order, MaskOrder);
3735 }
3736 reorderReuses(MaskOrder, Mask);
3737 if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
3738 Order.clear();
3739 return;
3740 }
3741 Order.assign(Mask.size(), Mask.size());
3742 for (unsigned I = 0, E = Mask.size(); I < E; ++I)
3743 if (MaskOrder[I] != UndefMaskElem)
3744 Order[MaskOrder[I]] = I;
3745 fixupOrderingIndices(Order);
3746}
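// Worked example for reorderOrder() (illustrative values): applying
// Mask = {1, 0, 3, 2} to Order = {1, 0, 3, 2} first builds
// MaskOrder = {1, 0, 3, 2} via inversePermutation, and reorderReuses() then
// turns MaskOrder into the identity {0, 1, 2, 3}; since the combined
// permutation is an identity, Order is cleared. Applying the same Mask to an
// empty (identity) Order instead produces Order = {1, 0, 3, 2}.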
3747
3748std::optional<BoUpSLP::OrdersType>
3749BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3750 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3751 unsigned NumScalars = TE.Scalars.size();
3752 OrdersType CurrentOrder(NumScalars, NumScalars);
3753 SmallVector<int> Positions;
3754 SmallBitVector UsedPositions(NumScalars);
3755 const TreeEntry *STE = nullptr;
3756 // Try to find all gathered scalars that are vectorized in another
3757 // vectorized node. Here we can have only one single tree vector node to
3758 // correctly identify the order of the gathered scalars.
3759 for (unsigned I = 0; I < NumScalars; ++I) {
3760 Value *V = TE.Scalars[I];
3761 if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
3762 continue;
3763 if (const auto *LocalSTE = getTreeEntry(V)) {
3764 if (!STE)
3765 STE = LocalSTE;
3766 else if (STE != LocalSTE)
3767 // Take the order only from the single vector node.
3768 return std::nullopt;
3769 unsigned Lane =
3770 std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
3771 if (Lane >= NumScalars)
3772 return std::nullopt;
3773 if (CurrentOrder[Lane] != NumScalars) {
3774 if (Lane != I)
3775 continue;
3776 UsedPositions.reset(CurrentOrder[Lane]);
3777 }
3778 // The partial identity (where only some elements of the gather node are
3779 // in the identity order) is good.
3780 CurrentOrder[Lane] = I;
3781 UsedPositions.set(I);
3782 }
3783 }
3784 // Need to keep the order if we have a vector entry and at least 2 scalars or
3785 // the vectorized entry has just 2 scalars.
3786 if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
3787 auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
3788 for (unsigned I = 0; I < NumScalars; ++I)
3789 if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
3790 return false;
3791 return true;
3792 };
3793 if (IsIdentityOrder(CurrentOrder))
3794 return OrdersType();
3795 auto *It = CurrentOrder.begin();
3796 for (unsigned I = 0; I < NumScalars;) {
3797 if (UsedPositions.test(I)) {
3798 ++I;
3799 continue;
3800 }
3801 if (*It == NumScalars) {
3802 *It = I;
3803 ++I;
3804 }
3805 ++It;
3806 }
3807 return std::move(CurrentOrder);
3808 }
3809 return std::nullopt;
3810}
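// Illustrative example for findReusedOrderedScalars() (hypothetical values):
// for a gather node with Scalars = {a, b, c, d} where a, b and c are also
// lanes 1, 2 and 0 of a single vectorized entry and d is not vectorized at
// all, the loop above produces CurrentOrder = {2, 0, 1, 4} with used
// positions {0, 1, 2}; the tail loop then fills the remaining slot with the
// unused index, yielding the order {2, 0, 1, 3}.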
3811
3812namespace {
3813/// Tracks how the loads in the given sequence can be represented.
3814enum class LoadsState { Gather, Vectorize, ScatterVectorize };
3815} // anonymous namespace
3816
3817static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
3818 const TargetLibraryInfo &TLI,
3819 bool CompareOpcodes = true) {
3820 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
3821 return false;
3822 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
3823 if (!GEP1)
3824 return false;
3825 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
3826 if (!GEP2)
3827 return false;
3828 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
3829 ((isConstant(GEP1->getOperand(1)) &&
3830 isConstant(GEP2->getOperand(1))) ||
3831 !CompareOpcodes ||
3832 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
3833 .getOpcode());
3834}
3835
3836/// Checks if the given array of loads can be represented as a vectorized,
3837/// scatter or just simple gather.
3838static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
3839 const TargetTransformInfo &TTI,
3840 const DataLayout &DL, ScalarEvolution &SE,
3841 LoopInfo &LI, const TargetLibraryInfo &TLI,
3842 SmallVectorImpl<unsigned> &Order,
3843 SmallVectorImpl<Value *> &PointerOps) {
3844 // Check that a vectorized load would load the same memory as a scalar
3845 // load. For example, we don't want to vectorize loads that are smaller
3846 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
3847 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
3848 // from such a struct, we read/write packed bits disagreeing with the
3849 // unvectorized version.
3850 Type *ScalarTy = VL0->getType();
3851
3852 if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
3853 return LoadsState::Gather;
3854
3855 // Make sure all loads in the bundle are simple - we can't vectorize
3856 // atomic or volatile loads.
3857 PointerOps.clear();
3858 PointerOps.resize(VL.size());
3859 auto *POIter = PointerOps.begin();
3860 for (Value *V : VL) {
3861 auto *L = cast<LoadInst>(V);
3862 if (!L->isSimple())
3863 return LoadsState::Gather;
3864 *POIter = L->getPointerOperand();
3865 ++POIter;
3866 }
3867
3868 Order.clear();
3869 // Check the order of pointer operands or that all pointers are the same.
3870 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
3871 if (IsSorted || all_of(PointerOps, [&](Value *P) {
3872 return arePointersCompatible(P, PointerOps.front(), TLI);
3873 })) {
3874 if (IsSorted) {
3875 Value *Ptr0;
3876 Value *PtrN;
3877 if (Order.empty()) {
3878 Ptr0 = PointerOps.front();
3879 PtrN = PointerOps.back();
3880 } else {
3881 Ptr0 = PointerOps[Order.front()];
3882 PtrN = PointerOps[Order.back()];
3883 }
3884 std::optional<int> Diff =
3885 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
3886 // Check that the sorted loads are consecutive.
3887 if (static_cast<unsigned>(*Diff) == VL.size() - 1)
3888 return LoadsState::Vectorize;
3889 }
3890 // TODO: need to improve analysis of the pointers, if not all of them are
3891 // GEPs or have > 2 operands, we end up with a gather node, which just
3892 // increases the cost.
3893 Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
3894 bool ProfitableGatherPointers =
3895 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
3896 return L && L->isLoopInvariant(V);
3897 })) <= VL.size() / 2 && VL.size() > 2;
3898 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
3899 auto *GEP = dyn_cast<GetElementPtrInst>(P);
3900 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
3901 (GEP && GEP->getNumOperands() == 2);
3902 })) {
3903 Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
3904 for (Value *V : VL)
3905 CommonAlignment =
3906 std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
3907 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
3908 if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
3909 !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
3910 return LoadsState::ScatterVectorize;
3911 }
3912 }
3913
3914 return LoadsState::Gather;
3915}
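// Summary of the classification above with illustrative inputs (hypothetical
// pointer layouts): loads of p[0], p[1], p[2], p[3] have sorted pointer
// difference VL.size() - 1 and are classified as LoadsState::Vectorize; loads
// of p[0], p[2], p[4], p[6] are not consecutive, but if the target reports a
// legal (and not force-scalarized) masked gather for the vector type they are
// classified as LoadsState::ScatterVectorize; atomic/volatile loads, or
// scalars whose store size differs from their allocation size, fall back to
// LoadsState::Gather.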
3916
3917 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
3918 const DataLayout &DL, ScalarEvolution &SE,
3919 SmallVectorImpl<unsigned> &SortedIndices) {
3920 assert(llvm::all_of(
3921 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
3922 "Expected list of pointer operands.");
3923 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
3924 // Ptr into, sort and return the sorted indices with values next to one
3925 // another.
3926 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
3927 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
3928
3929 unsigned Cnt = 1;
3930 for (Value *Ptr : VL.drop_front()) {
3931 bool Found = any_of(Bases, [&](auto &Base) {
3932 std::optional<int> Diff =
3933 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
3934 /*StrictCheck=*/true);
3935 if (!Diff)
3936 return false;
3937
3938 Base.second.emplace_back(Ptr, *Diff, Cnt++);
3939 return true;
3940 });
3941
3942 if (!Found) {
3943 // If we haven't found enough to usefully cluster, return early.
3944 if (Bases.size() > VL.size() / 2 - 1)
3945 return false;
3946
3947 // Not found already - add a new Base
3948 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
3949 }
3950 }
3951
3952 // For each of the bases sort the pointers by Offset and check if any of the
3953 // bases become consecutive.
3954 bool AnyConsecutive = false;
3955 for (auto &Base : Bases) {
3956 auto &Vec = Base.second;
3957 if (Vec.size() > 1) {
3958 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
3959 const std::tuple<Value *, int, unsigned> &Y) {
3960 return std::get<1>(X) < std::get<1>(Y);
3961 });
3962 int InitialOffset = std::get<1>(Vec[0]);
3963 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
3964 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
3965 });
3966 }
3967 }
3968
3969 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
3970 SortedIndices.clear();
3971 if (!AnyConsecutive)
3972 return false;
3973
3974 for (auto &Base : Bases) {
3975 for (auto &T : Base.second)
3976 SortedIndices.push_back(std::get<2>(T));
3977 }
3978
3979 assert(SortedIndices.size() == VL.size() &&
3980 "Expected SortedIndices to be the size of VL");
3981 return true;
3982}
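// Worked example for clusterSortPtrAccesses() (hypothetical pointers): for
// VL = {A, B, A+1, B+1}, where A and B are unrelated bases and +N denotes an
// element offset computable by getPointersDiff(), the pointers are grouped as
// Bases[A] = {(A,0,0), (A+1,1,2)} and Bases[B] = {(B,0,1), (B+1,1,3)}. Both
// groups are consecutive after sorting by offset, so SortedIndices becomes
// {0, 2, 1, 3} (each base's pointers placed next to one another, in the map's
// iteration order).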
3983
3984std::optional<BoUpSLP::OrdersType>
3985BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
3986 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3987 Type *ScalarTy = TE.Scalars[0]->getType();
3988
3989 SmallVector<Value *> Ptrs;
3990 Ptrs.reserve(TE.Scalars.size());
3991 for (Value *V : TE.Scalars) {
3992 auto *L = dyn_cast<LoadInst>(V);
3993 if (!L || !L->isSimple())
3994 return std::nullopt;
3995 Ptrs.push_back(L->getPointerOperand());
3996 }
3997
3998 BoUpSLP::OrdersType Order;
3999 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4000 return std::move(Order);
4001 return std::nullopt;
4002}
4003
4004/// Check if two insertelement instructions are from the same buildvector.
4005 static bool areTwoInsertFromSameBuildVector(
4006 InsertElementInst *VU, InsertElementInst *V,
4007 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4008 // Instructions must be from the same basic blocks.
4009 if (VU->getParent() != V->getParent())
4010 return false;
4011 // Checks if 2 insertelements are from the same buildvector.
4012 if (VU->getType() != V->getType())
4013 return false;
4014 // Multiple used inserts are separate nodes.
4015 if (!VU->hasOneUse() && !V->hasOneUse())
4016 return false;
4017 auto *IE1 = VU;
4018 auto *IE2 = V;
4019 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4020 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4021 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4022 return false;
4023 // Go through the vector operand of insertelement instructions trying to find
4024 // either VU as the original vector for IE2 or V as the original vector for
4025 // IE1.
4026 SmallSet<int, 8> ReusedIdx;
4027 bool IsReusedIdx = false;
4028 do {
4029 if (IE2 == VU && !IE1)
4030 return VU->hasOneUse();
4031 if (IE1 == V && !IE2)
4032 return V->hasOneUse();
4033 if (IE1 && IE1 != V) {
4034 IsReusedIdx |=
4035 !ReusedIdx.insert(getInsertIndex(IE1).value_or(*Idx2)).second;
4036 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4037 IE1 = nullptr;
4038 else
4039 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4040 }
4041 if (IE2 && IE2 != VU) {
4042 IsReusedIdx |=
4043 !ReusedIdx.insert(getInsertIndex(IE2).value_or(*Idx1)).second;
4044 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4045 IE2 = nullptr;
4046 else
4047 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4048 }
4049 } while (!IsReusedIdx && (IE1 || IE2));
4050 return false;
4051}
4052
4053std::optional<BoUpSLP::OrdersType>
4054BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4055 // No need to reorder if we need to shuffle reuses; the node still needs to
4056 // be shuffled.
4057 if (!TE.ReuseShuffleIndices.empty()) {
4058 // Check if reuse shuffle indices can be improved by reordering.
4059 // For this, check that the reuse mask is "clustered", i.e. each scalar value
4060 // is used once in each submask of size <number_of_scalars>.
4061 // Example: 4 scalar values.
4062 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4063 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4064 // element 3 is used twice in the second submask.
4065 unsigned Sz = TE.Scalars.size();
4066 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4067 Sz))
4068 return std::nullopt;
4069 unsigned VF = TE.getVectorFactor();
4070 // Try build correct order for extractelement instructions.
4071 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4072 TE.ReuseShuffleIndices.end());
4073 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4074 all_of(TE.Scalars, [Sz](Value *V) {
4075 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4076 return Idx && *Idx < Sz;
4077 })) {
4078 SmallVector<int> ReorderMask(Sz, UndefMaskElem);
4079 if (TE.ReorderIndices.empty())
4080 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4081 else
4082 inversePermutation(TE.ReorderIndices, ReorderMask);
4083 for (unsigned I = 0; I < VF; ++I) {
4084 int &Idx = ReusedMask[I];
4085 if (Idx == UndefMaskElem)
4086 continue;
4087 Value *V = TE.Scalars[ReorderMask[Idx]];
4088 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4089 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4090 }
4091 }
4092 // Build an order of VF size; the reuse shuffles need to be reordered, and
4093 // they are always of VF size.
4094 OrdersType ResOrder(VF);
4095 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4096 auto *It = ResOrder.begin();
4097 for (unsigned K = 0; K < VF; K += Sz) {
4098 OrdersType CurrentOrder(TE.ReorderIndices);
4099 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4100 if (SubMask.front() == UndefMaskElem)
4101 std::iota(SubMask.begin(), SubMask.end(), 0);
4102 reorderOrder(CurrentOrder, SubMask);
4103 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4104 std::advance(It, Sz);
4105 }
4106 if (all_of(enumerate(ResOrder),
4107 [](const auto &Data) { return Data.index() == Data.value(); }))
4108 return std::nullopt; // No need to reorder.
4109 return std::move(ResOrder);
4110 }
4111 if (TE.State == TreeEntry::Vectorize &&
4112 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4113 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4114 !TE.isAltShuffle())
4115 return TE.ReorderIndices;
4116 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4117 auto PHICompare = [](llvm::Value *V1, llvm::Value *V2) {
4118 if (!V1->hasOneUse() || !V2->hasOneUse())
4119 return false;
4120 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4121 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4122 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4123 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4124 if (!areTwoInsertFromSameBuildVector(
4125 IE1, IE2,
4126 [](InsertElementInst *II) { return II->getOperand(0); }))
4127 return false;
4128 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4129 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4130 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4131 return false;
4132 return *Idx1 < *Idx2;
4133 }
4134 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4135 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4136 if (EE1->getOperand(0) != EE2->getOperand(0))
4137 return false;
4138 std::optional<unsigned> Idx1 = getExtractIndex(EE1);
4139 std::optional<unsigned> Idx2 = getExtractIndex(EE2);
4140 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4141 return false;
4142 return *Idx1 < *Idx2;
4143 }
4144 return false;
4145 };
4146 auto IsIdentityOrder = [](const OrdersType &Order) {
4147 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4148 if (Idx != Order[Idx])
4149 return false;
4150 return true;
4151 };
4152 if (!TE.ReorderIndices.empty())
4153 return TE.ReorderIndices;
4154 DenseMap<Value *, unsigned> PhiToId;
4155 SmallVector<Value *> Phis;
4156 OrdersType ResOrder(TE.Scalars.size());
4157 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) {
4158 PhiToId[TE.Scalars[Id]] = Id;
4159 Phis.push_back(TE.Scalars[Id]);
4160 }
4161 llvm::stable_sort(Phis, PHICompare);
4162 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4163 ResOrder[Id] = PhiToId[Phis[Id]];
4164 if (IsIdentityOrder(ResOrder))
4165 return std::nullopt; // No need to reorder.
4166 return std::move(ResOrder);
4167 }
4168 if (TE.State == TreeEntry::NeedToGather) {
4169 // TODO: add analysis of other gather nodes with extractelement
4170 // instructions and other values/instructions, not only undefs.
4171 if (((TE.getOpcode() == Instruction::ExtractElement &&
4172 !TE.isAltShuffle()) ||
4173 (all_of(TE.Scalars,
4174 [](Value *V) {
4175 return isa<UndefValue, ExtractElementInst>(V);
4176 }) &&
4177 any_of(TE.Scalars,
4178 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
4179 all_of(TE.Scalars,
4180 [](Value *V) {
4181 auto *EE = dyn_cast<ExtractElementInst>(V);
4182 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4183 }) &&
4184 allSameType(TE.Scalars)) {
4185 // Check that gather of extractelements can be represented as
4186 // just a shuffle of a single vector.
4187 OrdersType CurrentOrder;
4188 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
4189 if (Reuse || !CurrentOrder.empty()) {
4190 if (!CurrentOrder.empty())
4191 fixupOrderingIndices(CurrentOrder);
4192 return std::move(CurrentOrder);
4193 }
4194 }
4195 // If the gather node is <undef, v, .., poison> and
4196 // insertelement poison, v, 0 [+ permute]
4197 // is cheaper than
4198 // insertelement poison, v, n - try to reorder.
4199 // If rotating the whole graph, exclude the permute cost, the whole graph
4200 // might be transformed.
4201 int Sz = TE.Scalars.size();
4202 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4203 count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
4204 const auto *It =
4205 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4206 if (It == TE.Scalars.begin())
4207 return OrdersType();
4208 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4209 if (It != TE.Scalars.end()) {
4210 OrdersType Order(Sz, Sz);
4211 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4212 Order[Idx] = 0;
4213 fixupOrderingIndices(Order);
4214 SmallVector<int> Mask;
4215 inversePermutation(Order, Mask);
4216 InstructionCost PermuteCost =
4217 TopToBottom
4218 ? 0
4219 : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
4220 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4221 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4222 PoisonValue::get(Ty), *It);
4223 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4224 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4225 PoisonValue::get(Ty), *It);
4226 if (InsertFirstCost + PermuteCost < InsertIdxCost)
4227 return std::move(Order);
4228 }
4229 }
4230 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4231 return CurrentOrder;
4232 if (TE.Scalars.size() >= 4)
4233 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4234 return Order;
4235 }
4236 return std::nullopt;
4237}
4238
4239/// Checks if the given mask is a "clustered" mask with the same clusters of
4240/// size \p Sz, which are not identity submasks.
4241 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4242 unsigned Sz) {
4243 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4244 if (ShuffleVectorInst::isIdentityMask(FirstCluster))
4245 return false;
4246 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4247 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4248 if (Cluster != FirstCluster)
4249 return false;
4250 }
4251 return true;
4252}
4253
4254void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4255 // Reorder reuses mask.
4256 reorderReuses(TE.ReuseShuffleIndices, Mask);
4257 const unsigned Sz = TE.Scalars.size();
4258 // For vectorized and non-clustered reuses no need to do anything else.
4259 if (TE.State != TreeEntry::NeedToGather ||
4260 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4261 Sz) ||
4262 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4263 return;
4264 SmallVector<int> NewMask;
4265 inversePermutation(TE.ReorderIndices, NewMask);
4266 addMask(NewMask, TE.ReuseShuffleIndices);
4267 // Clear reorder since it is going to be applied to the new mask.
4268 TE.ReorderIndices.clear();
4269 // Try to improve gathered nodes with clustered reuses, if possible.
4270 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4271 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4272 inversePermutation(NewOrder, NewMask);
4273 reorderScalars(TE.Scalars, NewMask);
4274 // Fill the reuses mask with the identity submasks.
4275 for (auto *It = TE.ReuseShuffleIndices.begin(),
4276 *End = TE.ReuseShuffleIndices.end();
4277 It != End; std::advance(It, Sz))
4278 std::iota(It, std::next(It, Sz), 0);
4279}
4280
4281 void BoUpSLP::reorderTopToBottom() {
4282 // Maps VF to the graph nodes.
4283 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
4284 // ExtractElement gather nodes which can be vectorized and need to handle
4285 // their ordering.
4286 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4287
4288 // Phi nodes can have preferred ordering based on their result users
4289 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
4290
4291 // AltShuffles can also have a preferred ordering that leads to fewer
4292 // instructions, e.g., the addsub instruction in x86.
4293 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4294
4295 // Maps a TreeEntry to the reorder indices of external users.
4296 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
4297 ExternalUserReorderMap;
4298 // FIXME: Workaround for syntax error reported by MSVC buildbots.
4299 TargetTransformInfo &TTIRef = *TTI;
4300 // Find all reorderable nodes with the given VF.
4301 // Currently these are vectorized stores, loads, extracts, plus some
4302 // gathering of extracts.
4303 for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
4304 &GathersToOrders, &ExternalUserReorderMap,
4305 &AltShufflesToOrders, &PhisToOrders](
4306 const std::unique_ptr<TreeEntry> &TE) {
4307 // Look for external users that will probably be vectorized.
4308 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4309 findExternalStoreUsersReorderIndices(TE.get());
4310 if (!ExternalUserReorderIndices.empty()) {
4311 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4312 ExternalUserReorderMap.try_emplace(TE.get(),
4313 std::move(ExternalUserReorderIndices));
4314 }
4315
4316 // Patterns like [fadd,fsub] can be combined into a single instruction in
4317 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4318 // to take into account their order when looking for the most used order.
4319 if (TE->isAltShuffle()) {
4320 VectorType *VecTy =
4321 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
4322 unsigned Opcode0 = TE->getOpcode();
4323 unsigned Opcode1 = TE->getAltOpcode();
4324 // The opcode mask selects between the two opcodes.
4325 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4326 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4327 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4328 OpcodeMask.set(Lane);
4329 // If this pattern is supported by the target then we consider the order.
4330 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4331 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4332 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
4333 }
4334 // TODO: Check the reverse order too.
4335 }
4336
4337 if (std::optional<OrdersType> CurrentOrder =
4338 getReorderingData(*TE, /*TopToBottom=*/true)) {
4339 // Do not include ordering for nodes used in the alt opcode vectorization;
4340 // it is better to reorder them during the bottom-to-top stage. If we follow
4341 // the order here, it causes reordering of the whole graph, though it is
4342 // actually profitable just to reorder the subgraph that starts from the
4343 // alternate opcode vectorization node. Such nodes already end up with a
4344 // shuffle instruction, and it is enough to change this shuffle rather than
4345 // rotate the scalars of the whole graph.
4346 unsigned Cnt = 0;
4347 const TreeEntry *UserTE = TE.get();
4348 while (UserTE && Cnt < RecursionMaxDepth) {
4349 if (UserTE->UserTreeIndices.size() != 1)
4350 break;
4351 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
4352 return EI.UserTE->State == TreeEntry::Vectorize &&
4353 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4354 }))
4355 return;
4356 UserTE = UserTE->UserTreeIndices.back().UserTE;
4357 ++Cnt;
4358 }
4359 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4360 if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
4361 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4362 if (TE->State == TreeEntry::Vectorize &&
4363 TE->getOpcode() == Instruction::PHI)
4364 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
4365 }
4366 });
4367
4368 // Reorder the graph nodes according to their vectorization factor.
4369 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
4370 VF /= 2) {
4371 auto It = VFToOrderedEntries.find(VF);
4372 if (It == VFToOrderedEntries.end())
4373 continue;
4374 // Try to find the most profitable order. We are simply looking for the most
4375 // used order and reorder the scalar elements in the nodes according to this
4376 // most-used order.
4377 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
4378 // All operands are reordered and used only in this node - propagate the
4379 // most used order to the user node.
4380 MapVector<OrdersType, unsigned,
4381 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
4382 OrdersUses;
4384 for (const TreeEntry *OpTE : OrderedEntries) {
4385 // No need to reorder these nodes; we still need to extend and use a shuffle,
4386 // just merging the reordering shuffle with the reuse shuffle.
4387 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
4388 continue;
4389 // Count number of orders uses.
4390 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
4391 &PhisToOrders]() -> const OrdersType & {
4392 if (OpTE->State == TreeEntry::NeedToGather ||
4393 !OpTE->ReuseShuffleIndices.empty()) {
4394 auto It = GathersToOrders.find(OpTE);
4395 if (It != GathersToOrders.end())
4396 return It->second;
4397 }
4398 if (OpTE->isAltShuffle()) {
4399 auto It = AltShufflesToOrders.find(OpTE);
4400 if (It != AltShufflesToOrders.end())
4401 return It->second;
4402 }
4403 if (OpTE->State == TreeEntry::Vectorize &&
4404 OpTE->getOpcode() == Instruction::PHI) {
4405 auto It = PhisToOrders.find(OpTE);
4406 if (It != PhisToOrders.end())
4407 return It->second;
4408 }
4409 return OpTE->ReorderIndices;
4410 }();
4411 // First consider the order of the external scalar users.
4412 auto It = ExternalUserReorderMap.find(OpTE);
4413 if (It != ExternalUserReorderMap.end()) {
4414 const auto &ExternalUserReorderIndices = It->second;
4415 // If the OpTE vector factor != number of scalars, use the natural order;
4416 // this is an attempt to reorder a node with reused scalars but with
4417 // external uses.
4418 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
4419 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
4420 ExternalUserReorderIndices.size();
4421 } else {
4422 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
4423 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
4424 }
4425 // No other useful reorder data in this entry.
4426 if (Order.empty())
4427 continue;
4428 }
4429 // Stores actually store the mask, not the order; we need to invert it.
4430 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4431 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4432 SmallVector<int> Mask;
4433 inversePermutation(Order, Mask);
4434 unsigned E = Order.size();
4435 OrdersType CurrentOrder(E, E);
4436 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
4437 return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
4438 });
4439 fixupOrderingIndices(CurrentOrder);
4440 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
4441 } else {
4442 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4443 }
4444 }
4445 // Set order of the user node.
4446 if (OrdersUses.empty())
4447 continue;
4448 // Choose the most used order.
4449 ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
4450 unsigned Cnt = OrdersUses.front().second;
4451 for (const auto &Pair : drop_begin(OrdersUses)) {
4452 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
4453 BestOrder = Pair.first;
4454 Cnt = Pair.second;
4455 }
4456 }
4457 // Set order of the user node.
4458 if (BestOrder.empty())
4459 continue;
4460 SmallVector<int> Mask;
4461 inversePermutation(BestOrder, Mask);
4462 SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
4463 unsigned E = BestOrder.size();
4464 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
4465 return I < E ? static_cast<int>(I) : UndefMaskElem;
4466 });
4467 // Do an actual reordering, if profitable.
4468 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
4469 // Just do the reordering for the nodes with the given VF.
4470 if (TE->Scalars.size() != VF) {
4471 if (TE->ReuseShuffleIndices.size() == VF) {
4472 // Need to reorder the reuses masks of the operands with smaller VF to
4473 // be able to find the match between the graph nodes and scalar
4474 // operands of the given node during vectorization/cost estimation.
4475 assert(all_of(TE->UserTreeIndices,
4476 [VF, &TE](const EdgeInfo &EI) {
4477 return EI.UserTE->Scalars.size() == VF ||
4478 EI.UserTE->Scalars.size() ==
4479 TE->Scalars.size();
4480 }) &&
4481 "All users must be of VF size.");
4482 // Update ordering of the operands with the smaller VF than the given
4483 // one.
4484 reorderNodeWithReuses(*TE, Mask);
4485 }
4486 continue;
4487 }
4488 if (TE->State == TreeEntry::Vectorize &&
4489 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
4490 InsertElementInst>(TE->getMainOp()) &&
4491 !TE->isAltShuffle()) {
4492 // Build correct orders for extract{element,value}, loads and
4493 // stores.
4494 reorderOrder(TE->ReorderIndices, Mask);
4495 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
4496 TE->reorderOperands(Mask);
4497 } else {
4498 // Reorder the node and its operands.
4499 TE->reorderOperands(Mask);
4500 assert(TE->ReorderIndices.empty() &&
4501 "Expected empty reorder sequence.");
4502 reorderScalars(TE->Scalars, Mask);
4503 }
4504 if (!TE->ReuseShuffleIndices.empty()) {
4505 // Apply reversed order to keep the original ordering of the reused
4506 // elements to avoid extra reorder indices shuffling.
4507 OrdersType CurrentOrder;
4508 reorderOrder(CurrentOrder, MaskOrder);
4509 SmallVector<int> NewReuses;
4510 inversePermutation(CurrentOrder, NewReuses);
4511 addMask(NewReuses, TE->ReuseShuffleIndices);
4512 TE->ReuseShuffleIndices.swap(NewReuses);
4513 }
4514 }
4515 }
4516}
4517
4518bool BoUpSLP::canReorderOperands(
4519 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
4520 ArrayRef<TreeEntry *> ReorderableGathers,
4521 SmallVectorImpl<TreeEntry *> &GatherOps) {
4522 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
4523 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
4524 return OpData.first == I &&
4525 OpData.second->State == TreeEntry::Vectorize;
4526 }))
4527 continue;
4528 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
4529 // Do not reorder if operand node is used by many user nodes.
4530 if (any_of(TE->UserTreeIndices,
4531 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
4532 return false;
4533 // Add the node to the list of the ordered nodes with the identity
4534 // order.
4535 Edges.emplace_back(I, TE);
4536 // Add ScatterVectorize nodes to the list of operands, where just
4537 // reordering of the scalars is required. Similar to the gathers, so
4538 // simply add to the list of gathered ops.
4539 // If there are reused scalars, process this node as a regular vectorize
4540 // node, just reorder reuses mask.
4541 if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty())
4542 GatherOps.push_back(TE);
4543 continue;
4544 }
4545 TreeEntry *Gather = nullptr;
4546 if (count_if(ReorderableGathers,
4547 [&Gather, UserTE, I](TreeEntry *TE) {
4548 assert(TE->State != TreeEntry::Vectorize &&
4549 "Only non-vectorized nodes are expected.");
4550 if (any_of(TE->UserTreeIndices,
4551 [UserTE, I](const EdgeInfo &EI) {
4552 return EI.UserTE == UserTE && EI.EdgeIdx == I;
4553 })) {
4554 assert(TE->isSame(UserTE->getOperand(I)) &&
4555 "Operand entry does not match operands.");
4556 Gather = TE;
4557 return true;
4558 }
4559 return false;
4560 }) > 1 &&
4561 !allConstant(UserTE->getOperand(I)))
4562 return false;
4563 if (Gather)
4564 GatherOps.push_back(Gather);
4565 }
4566 return true;
4567}
4568
4569void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
4570 SetVector<TreeEntry *> OrderedEntries;
4571 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4572 // Find all reorderable leaf nodes with the given VF.
4573 // Currently these are vectorized loads, extracts without alternate operands +
4574 // some gathering of extracts.
4575 SmallVector<TreeEntry *> NonVectorized;
4576 for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
4577 &NonVectorized](
4578 const std::unique_ptr<TreeEntry> &TE) {
4579 if (TE->State != TreeEntry::Vectorize)
4580 NonVectorized.push_back(TE.get());
4581 if (std::optional<OrdersType> CurrentOrder =
4582 getReorderingData(*TE, /*TopToBottom=*/false)) {
4583 OrderedEntries.insert(TE.get());
4584 if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
4585 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4586 }
4587 });
4588
4589 // 1. Propagate order to the graph nodes, which use only reordered nodes.
4590 // I.e., if the node has operands, that are reordered, try to make at least
4591 // one operand order in the natural order and reorder others + reorder the
4592 // user node itself.
4593 SmallPtrSet<const TreeEntry *, 4> Visited;
4594 while (!OrderedEntries.empty()) {
4595 // 1. Filter out only reordered nodes.
4596 // 2. If the entry has multiple uses - skip it and jump to the next node.
4597 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
4598 SmallVector<TreeEntry *> Filtered;
4599 for (TreeEntry *TE : OrderedEntries) {
4600 if (!(TE->State == TreeEntry::Vectorize ||
4601 (TE->State == TreeEntry::NeedToGather &&
4602 GathersToOrders.count(TE))) ||
4603 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4604 !all_of(drop_begin(TE->UserTreeIndices),
4605 [TE](const EdgeInfo &EI) {
4606 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
4607 }) ||
4608 !Visited.insert(TE).second) {
4609 Filtered.push_back(TE);
4610 continue;
4611 }
4612 // Build a map between user nodes and their operands to speed up the
4613 // search. The graph currently does not provide this dependency directly.
4614 for (EdgeInfo &EI : TE->UserTreeIndices) {
4615 TreeEntry *UserTE = EI.UserTE;
4616 auto It = Users.find(UserTE);
4617 if (It == Users.end())
4618 It = Users.insert({UserTE, {}}).first;
4619 It->second.emplace_back(EI.EdgeIdx, TE);
4620 }
4621 }
4622 // Erase filtered entries.
4623 for_each(Filtered,
4624 [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
4625 SmallVector<
4626 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
4627 UsersVec(Users.begin(), Users.end());
4628 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
4629 return Data1.first->Idx > Data2.first->Idx;
4630 });
4631 for (auto &Data : UsersVec) {
4632 // Check that operands are used only in the User node.
4633 SmallVector<TreeEntry *> GatherOps;
4634 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
4635 GatherOps)) {
4636 for_each(Data.second,
4637 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
4638 OrderedEntries.remove(Op.second);
4639 });
4640 continue;
4641 }
4642 // All operands are reordered and used only in this node - propagate the
4643 // most used order to the user node.
4644 MapVector<OrdersType, unsigned,
4645 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
4646 OrdersUses;
4647 // Do the analysis for each tree entry only once, otherwise the order of
4648 // the same node may be considered several times, though it might not be
4649 // profitable.
4650 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
4651 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
4652 for (const auto &Op : Data.second) {
4653 TreeEntry *OpTE = Op.second;
4654 if (!VisitedOps.insert(OpTE).second)
4655 continue;
4656 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
4657 continue;
4658 const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
4659 if (OpTE->State == TreeEntry::NeedToGather ||
4660 !OpTE->ReuseShuffleIndices.empty())
4661 return GathersToOrders.find(OpTE)->second;
4662 return OpTE->ReorderIndices;
4663 }();
4664 unsigned NumOps = count_if(
4665 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
4666 return P.second == OpTE;
4667 });
4668 // Stores actually store the mask, not the order, need to invert.
4669 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4670 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4671 SmallVector<int> Mask;
4672 inversePermutation(Order, Mask);
4673 unsigned E = Order.size();
4674 OrdersType CurrentOrder(E, E);
4675 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
4676 return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
4677 });
4678 fixupOrderingIndices(CurrentOrder);
4679 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
4680 NumOps;
4681 } else {
4682 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
4683 }
4684 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
4685 const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
4686 const TreeEntry *TE) {
4687 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4688 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
4689 (IgnoreReorder && TE->Idx == 0))
4690 return true;
4691 if (TE->State == TreeEntry::NeedToGather) {
4692 auto It = GathersToOrders.find(TE);
4693 if (It != GathersToOrders.end())
4694 return !It->second.empty();
4695 return true;
4696 }
4697 return false;
4698 };
4699 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
4700 TreeEntry *UserTE = EI.UserTE;
4701 if (!VisitedUsers.insert(UserTE).second)
4702 continue;
4703 // May reorder user node if it requires reordering, has reused
4704 // scalars, is an alternate op vectorize node or its op nodes require
4705 // reordering.
4706 if (AllowsReordering(UserTE))
4707 continue;
4708 // Check if users allow reordering.
4709 // Currently look up just 1 level of operands to avoid increase of
4710 // the compile time.
4711 // Profitable to reorder if definitely more operands allow
4712 // reordering rather than those with natural order.
4713 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
4714 if (static_cast<unsigned>(count_if(
4715 Ops, [UserTE, &AllowsReordering](
4716 const std::pair<unsigned, TreeEntry *> &Op) {
4717 return AllowsReordering(Op.second) &&
4718 all_of(Op.second->UserTreeIndices,
4719 [UserTE](const EdgeInfo &EI) {
4720 return EI.UserTE == UserTE;
4721 });
4722 })) <= Ops.size() / 2)
4723 ++Res.first->second;
4724 }
4725 }
4726 // If no orders - skip current nodes and jump to the next one, if any.
4727 if (OrdersUses.empty()) {
4728 for_each(Data.second,
4729 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
4730 OrderedEntries.remove(Op.second);
4731 });
4732 continue;
4733 }
4734 // Choose the best order.
4735 ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
4736 unsigned Cnt = OrdersUses.front().second;
4737 for (const auto &Pair : drop_begin(OrdersUses)) {
4738 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
4739 BestOrder = Pair.first;
4740 Cnt = Pair.second;
4741 }
4742 }
4743 // Set order of the user node (reordering of operands and user nodes).
4744 if (BestOrder.empty()) {
4745 for_each(Data.second,
4746 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
4747 OrderedEntries.remove(Op.second);
4748 });
4749 continue;
4750 }
4751 // Erase operands from OrderedEntries list and adjust their orders.
4752 VisitedOps.clear();
4753 SmallVector<int> Mask;
4754 inversePermutation(BestOrder, Mask);
4755 SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
4756 unsigned E = BestOrder.size();
4757 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
4758 return I < E ? static_cast<int>(I) : UndefMaskElem;
4759 });
4760 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
4761 TreeEntry *TE = Op.second;
4762 OrderedEntries.remove(TE);
4763 if (!VisitedOps.insert(TE).second)
4764 continue;
4765 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
4766 reorderNodeWithReuses(*TE, Mask);
4767 continue;
4768 }
4769 // Gathers are processed separately.
4770 if (TE->State != TreeEntry::Vectorize)
4771 continue;
4772 assert((BestOrder.size() == TE->ReorderIndices.size() ||
4773 TE->ReorderIndices.empty()) &&
4774 "Non-matching sizes of user/operand entries.");
4775 reorderOrder(TE->ReorderIndices, Mask);
4776 if (IgnoreReorder && TE == VectorizableTree.front().get())
4777 IgnoreReorder = false;
4778 }
4779 // For gathers, just reorder their scalars.
4780 for (TreeEntry *Gather : GatherOps) {
4781 assert(Gather->ReorderIndices.empty() &&
4782 "Unexpected reordering of gathers.");
4783 if (!Gather->ReuseShuffleIndices.empty()) {
4784 // Just reorder reuses indices.
4785 reorderReuses(Gather->ReuseShuffleIndices, Mask);
4786 continue;
4787 }
4788 reorderScalars(Gather->Scalars, Mask);
4789 OrderedEntries.remove(Gather);
4790 }
4791 // Reorder operands of the user node and set the ordering for the user
4792 // node itself.
4793 if (Data.first->State != TreeEntry::Vectorize ||
4794 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
4795 Data.first->getMainOp()) ||
4796 Data.first->isAltShuffle())
4797 Data.first->reorderOperands(Mask);
4798 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
4799 Data.first->isAltShuffle()) {
4800 reorderScalars(Data.first->Scalars, Mask);
4801 reorderOrder(Data.first->ReorderIndices, MaskOrder);
4802 if (Data.first->ReuseShuffleIndices.empty() &&
4803 !Data.first->ReorderIndices.empty() &&
4804 !Data.first->isAltShuffle()) {
4805 // Insert user node to the list to try to sink reordering deeper in
4806 // the graph.
4807 OrderedEntries.insert(Data.first);
4808 }
4809 } else {
4810 reorderOrder(Data.first->ReorderIndices, Mask);
4811 }
4812 }
4813 }
4814 // If the reordering is unnecessary, just remove the reorder.
4815 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
4816 VectorizableTree.front()->ReuseShuffleIndices.empty())
4817 VectorizableTree.front()->ReorderIndices.clear();
4818}
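
// --- Illustrative sketch (not from the LLVM tree) ---------------------------
// Both reorder passes above pick the "most used" order by counting, for every
// candidate order, how many operands/users request it, preferring the
// identity order (modeled as an empty vector) on ties so that no shuffle is
// emitted when it buys nothing. A minimal standalone analogue with STL
// containers; pickBestOrder is a hypothetical name.
#include <map>
#include <vector>

using LaneOrder = std::vector<unsigned>; // empty == identity order

static LaneOrder pickBestOrder(const std::map<LaneOrder, unsigned> &OrdersUses) {
  LaneOrder Best; // identity by default
  unsigned Cnt = 0;
  for (const auto &P : OrdersUses) {
    if (Cnt < P.second || (Cnt == P.second && P.first.empty())) {
      Best = P.first;
      Cnt = P.second;
    }
  }
  return Best;
}

int main() {
  std::map<LaneOrder, unsigned> Uses;
  Uses[LaneOrder{1, 0, 3, 2}] = 2; // two operands vote for this permutation
  Uses[LaneOrder{}] = 2;           // two operands are already in natural order
  // On a tie the identity order wins, so the user node is left unshuffled.
  return pickBestOrder(Uses).empty() ? 0 : 1;
}
// ----------------------------------------------------------------------------
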
4819
4820 void BoUpSLP::buildExternalUses(
4821 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
4822 // Collect the values that we need to extract from the tree.
4823 for (auto &TEPtr : VectorizableTree) {
4824 TreeEntry *Entry = TEPtr.get();
4825
4826 // No need to handle users of gathered values.
4827 if (Entry->State == TreeEntry::NeedToGather)
4828 continue;
4829
4830 // For each lane:
4831 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
4832 Value *Scalar = Entry->Scalars[Lane];
4833 int FoundLane = Entry->findLaneForValue(Scalar);
4834
4835 // Check if the scalar is externally used as an extra arg.
4836 auto ExtI = ExternallyUsedValues.find(Scalar);
4837 if (ExtI != ExternallyUsedValues.end()) {
4838 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
4839 << Lane << " from " << *Scalar << ".\n");
4840 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
4841 }
4842 for (User *U : Scalar->users()) {
4843 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
4844
4845 Instruction *UserInst = dyn_cast<Instruction>(U);
4846 if (!UserInst)
4847 continue;
4848
4849 if (isDeleted(UserInst))
4850 continue;
4851
4852 // Skip in-tree scalars that become vectors
4853 if (TreeEntry *UseEntry = getTreeEntry(U)) {
4854 Value *UseScalar = UseEntry->Scalars[0];
4855 // Some in-tree scalars will remain as scalar in vectorized
4856 // instructions. If that is the case, the one in Lane 0 will
4857 // be used.
4858 if (UseScalar != U ||
4859 UseEntry->State == TreeEntry::ScatterVectorize ||
4860 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
4861 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
4862 << ".\n");
4863 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
4864 continue;
4865 }
4866 }
4867
4868 // Ignore users in the user ignore list.
4869 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
4870 continue;
4871
4872 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
4873 << Lane << " from " << *Scalar << ".\n");
4874 ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
4875 }
4876 }
4877 }
4878}
4879
4880 DenseMap<Value *, SmallVector<StoreInst *, 4>>
4881 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
4882 DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap;
4883 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
4884 Value *V = TE->Scalars[Lane];
4885 // To save compilation time we don't visit if we have too many users.
4886 static constexpr unsigned UsersLimit = 4;
4887 if (V->hasNUsesOrMore(UsersLimit))
4888 break;
4889
4890 // Collect stores per pointer object.
4891 for (User *U : V->users()) {
4892 auto *SI = dyn_cast<StoreInst>(U);
4893 if (SI == nullptr || !SI->isSimple() ||
4894 !isValidElementType(SI->getValueOperand()->getType()))
4895 continue;
4896 // Skip the store if it is already part of the tree.
4897 if (getTreeEntry(U))
4898 continue;
4899
4900 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
4901 auto &StoresVec = PtrToStoresMap[Ptr];
4902 // For now just keep one store per pointer object per lane.
4903 // TODO: Extend this to support multiple stores per pointer per lane
4904 if (StoresVec.size() > Lane)
4905 continue;
4906 // Skip if in different BBs.
4907 if (!StoresVec.empty() &&
4908 SI->getParent() != StoresVec.back()->getParent())
4909 continue;
4910 // Make sure that the stores are of the same type.
4911 if (!StoresVec.empty() &&
4912 SI->getValueOperand()->getType() !=
4913 StoresVec.back()->getValueOperand()->getType())
4914 continue;
4915 StoresVec.push_back(SI);
4916 }
4917 }
4918 return PtrToStoresMap;
4919}
4920
4921bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
4922 OrdersType &ReorderIndices) const {
4923 // We check whether the stores in StoresVec can form a vector by sorting them
4924 // and checking whether they are consecutive.
4925
4926 // To avoid calling getPointersDiff() while sorting we create a vector of
4927 // pairs {store, offset from first} and sort this instead.
4928 SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size());
4929 StoreInst *S0 = StoresVec[0];
4930 StoreOffsetVec[0] = {S0, 0};
4931 Type *S0Ty = S0->getValueOperand()->getType();
4932 Value *S0Ptr = S0->getPointerOperand();
4933 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
4934 StoreInst *SI = StoresVec[Idx];
4935 std::optional<int> Diff =
4936 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
4937 SI->getPointerOperand(), *DL, *SE,
4938 /*StrictCheck=*/true);
4939 // We failed to compare the pointers so just abandon this StoresVec.
4940 if (!Diff)
4941 return false;
4942 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
4943 }
4944
4945 // Sort the vector based on the pointers. We create a copy because we may
4946 // need the original later for calculating the reorder (shuffle) indices.
4947 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
4948 const std::pair<StoreInst *, int> &Pair2) {
4949 int Offset1 = Pair1.second;
4950 int Offset2 = Pair2.second;
4951 return Offset1 < Offset2;
4952 });
4953
4954 // Check if the stores are consecutive by checking if their difference is 1.
4955 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
4956 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1)
4957 return false;
4958
4959 // Calculate the shuffle indices according to their offset against the sorted
4960 // StoreOffsetVec.
4961 ReorderIndices.reserve(StoresVec.size());
4962 for (StoreInst *SI : StoresVec) {
4963 unsigned Idx = find_if(StoreOffsetVec,
4964 [SI](const std::pair<StoreInst *, int> &Pair) {
4965 return Pair.first == SI;
4966 }) -
4967 StoreOffsetVec.begin();
4968 ReorderIndices.push_back(Idx);
4969 }
4970 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
4971 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
4972 // same convention here.
4973 auto IsIdentityOrder = [](const OrdersType &Order) {
4974 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4975 if (Idx != Order[Idx])
4976 return false;
4977 return true;
4978 };
4979 if (IsIdentityOrder(ReorderIndices))
4980 ReorderIndices.clear();
4981
4982 return true;
4983}
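
// --- Illustrative sketch (not from the LLVM tree) ---------------------------
// canFormVector() above boils down to: sort the candidate stores by their
// element offset from the first store, require adjacent sorted offsets to
// differ by exactly 1, and record each store's rank in the sorted sequence as
// its reorder index (the identity order is returned as an empty vector). A
// standalone analogue over raw offsets; canFormVectorFromOffsets is a
// hypothetical name.
#include <algorithm>
#include <cstddef>
#include <optional>
#include <vector>

static std::optional<std::vector<unsigned>>
canFormVectorFromOffsets(const std::vector<int> &Offsets) {
  std::vector<int> Sorted(Offsets);
  std::sort(Sorted.begin(), Sorted.end());
  // Reject gaps and duplicates: the offsets must be consecutive.
  for (std::size_t I = 1; I < Sorted.size(); ++I)
    if (Sorted[I] != Sorted[I - 1] + 1)
      return std::nullopt;
  // A store's reorder index is its position in the sorted sequence, which for
  // consecutive offsets is simply "offset minus the smallest offset".
  std::vector<unsigned> ReorderIndices(Offsets.size());
  bool Identity = true;
  for (std::size_t I = 0; I < Offsets.size(); ++I) {
    ReorderIndices[I] = static_cast<unsigned>(Offsets[I] - Sorted.front());
    Identity &= ReorderIndices[I] == I;
  }
  if (Identity)
    ReorderIndices.clear(); // identity order is modeled as "no reordering"
  return ReorderIndices;
}

int main() {
  // Stores hitting elements 2, 0, 1, 3 of the same base are consecutive once
  // sorted; their reorder indices are {2, 0, 1, 3}.
  auto Res = canFormVectorFromOffsets({2, 0, 1, 3});
  return (Res && *Res == std::vector<unsigned>{2, 0, 1, 3}) ? 0 : 1;
}
// ----------------------------------------------------------------------------
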
4984
4985#ifndef NDEBUG
4986 LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
4987 for (unsigned Idx : Order)
4988 dbgs() << Idx << ", ";
4989 dbgs() << "\n";
4990}
4991#endif
4992
4993 SmallVector<BoUpSLP::OrdersType, 1>
4994 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
4995 unsigned NumLanes = TE->Scalars.size();
4996
4997 DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap =
4998 collectUserStores(TE);
4999
5000 // Holds the reorder indices for each candidate store vector that is a user of
5001 // the current TreeEntry.
5002 SmallVector<OrdersType, 1> ExternalReorderIndices;
5003
5004 // Now inspect the stores collected per pointer and look for vectorization
5005 // candidates. For each candidate calculate the reorder index vector and push
5006 // it into `ExternalReorderIndices`
5007 for (const auto &Pair : PtrToStoresMap) {
5008 auto &StoresVec = Pair.second;
5009 // If we have fewer than NumLanes stores, then we can't form a vector.
5010 if (StoresVec.size() != NumLanes)
5011 continue;
5012
5013 // If the stores are not consecutive then abandon this StoresVec.
5014 OrdersType ReorderIndices;
5015 if (!canFormVector(StoresVec, ReorderIndices))
5016 continue;
5017
5018 // We now know that the scalars in StoresVec can form a vector instruction,
5019 // so set the reorder indices.
5020 ExternalReorderIndices.push_back(ReorderIndices);
5021 }
5022 return ExternalReorderIndices;
5023}
5024
5025 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
5026 const SmallDenseSet<Value *> &UserIgnoreLst) {
5027 deleteTree();
5028 UserIgnoreList = &UserIgnoreLst;
5029 if (!allSameType(Roots))
5030 return;
5031 buildTree_rec(Roots, 0, EdgeInfo());
5032}
5033
5034 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
5035 deleteTree();
5036 if (!allSameType(Roots))
5037 return;
5038 buildTree_rec(Roots, 0, EdgeInfo());
5039}
5040
5041/// \return true if the specified list of values has only one instruction that
5042/// requires scheduling, false otherwise.
5043#ifndef NDEBUG
5044 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
5045 Value *NeedsScheduling = nullptr;
5046 for (Value *V : VL) {
5047 if (doesNotNeedToBeScheduled(V))
5048 continue;
5049 if (!NeedsScheduling) {
5050 NeedsScheduling = V;
5051 continue;
5052 }
5053 return false;
5054 }
5055 return NeedsScheduling;
5056}
5057#endif
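
// --- Illustrative sketch (not from the LLVM tree) ---------------------------
// The debug-only helper above answers "does exactly one value in the bundle
// still need scheduling?" by remembering the first match and giving up as
// soon as a second one is seen. A standalone analogue over an arbitrary
// predicate; hasExactlyOneMatch is a hypothetical name.
#include <cassert>
#include <vector>

template <typename T, typename Pred>
static bool hasExactlyOneMatch(const std::vector<T> &Values, Pred Matches) {
  const T *Found = nullptr;
  for (const T &V : Values) {
    if (!Matches(V))
      continue;
    if (Found)
      return false; // more than one value matches
    Found = &V;
  }
  return Found != nullptr;
}

int main() {
  std::vector<int> Bundle = {0, 0, 7, 0};
  assert(hasExactlyOneMatch(Bundle, [](int V) { return V != 0; }));
  return 0;
}
// ----------------------------------------------------------------------------
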
5058
5059 /// Generates a key/subkey pair for the given value to provide effective sorting
5060 /// of the values and better detection of vectorizable value sequences. The
5061/// keys/subkeys can be used for better sorting of the values themselves (keys)
5062/// and in values subgroups (su