1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
24#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
50#include "llvm/IR/Attributes.h"
51#include "llvm/IR/BasicBlock.h"
52#include "llvm/IR/Constant.h"
53#include "llvm/IR/Constants.h"
54#include "llvm/IR/DataLayout.h"
56#include "llvm/IR/Dominators.h"
57#include "llvm/IR/Function.h"
58#include "llvm/IR/IRBuilder.h"
59#include "llvm/IR/InstrTypes.h"
60#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Intrinsics.h"
64#include "llvm/IR/Module.h"
65#include "llvm/IR/Operator.h"
67#include "llvm/IR/Type.h"
68#include "llvm/IR/Use.h"
69#include "llvm/IR/User.h"
70#include "llvm/IR/Value.h"
71#include "llvm/IR/ValueHandle.h"
72#ifdef EXPENSIVE_CHECKS
73#include "llvm/IR/Verifier.h"
74#endif
75#include "llvm/Pass.h"
80#include "llvm/Support/Debug.h"
90#include <algorithm>
91#include <cassert>
92#include <cstdint>
93#include <iterator>
94#include <memory>
95#include <optional>
96#include <set>
97#include <string>
98#include <tuple>
99#include <utility>
100#include <vector>
101
102using namespace llvm;
103using namespace llvm::PatternMatch;
104using namespace slpvectorizer;
105
106#define SV_NAME "slp-vectorizer"
107#define DEBUG_TYPE "SLP"
108
109STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
110
112 cl::desc("Run the SLP vectorization passes"));
113
114static cl::opt<int>
116 cl::desc("Only vectorize if you gain more than this "
117 "number "));
118
119static cl::opt<bool>
120ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
121 cl::desc("Attempt to vectorize horizontal reductions"));
122
124 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
125 cl::desc(
126 "Attempt to vectorize horizontal reductions feeding into a store"));
127
128// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
129// even if we match a reduction but do not vectorize in the end.
131 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
132 cl::desc("Allow optimization of original scalar identity operations on "
133 "matched horizontal reductions."));
134
135static cl::opt<int>
137 cl::desc("Attempt to vectorize for this register size in bits"));
138
141 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
142
143/// Limits the size of scheduling regions in a block.
144/// It avoids long compile times for _very_ large blocks where vector
145/// instructions are spread over a wide range.
146/// This limit is way higher than needed by real-world functions.
147static cl::opt<int>
148ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
149 cl::desc("Limit the size of the SLP scheduling region per block"));
150
152 "slp-min-reg-size", cl::init(128), cl::Hidden,
153 cl::desc("Attempt to vectorize for this register size in bits"));
154
156 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
157 cl::desc("Limit the recursion depth when building a vectorizable tree"));
158
160 "slp-min-tree-size", cl::init(3), cl::Hidden,
161 cl::desc("Only vectorize small trees if they are fully vectorizable"));
162
163// The maximum depth that the look-ahead score heuristic will explore.
164// The higher this value, the higher the compilation time overhead.
166 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
167 cl::desc("The maximum look-ahead depth for operand reordering scores"));
168
169// The maximum depth that the look-ahead score heuristic will explore
170// when it is probing among candidates for vectorization tree roots.
171// The higher this value, the higher the compilation time overhead, but unlike
172// the similar limit for operand reordering this is used less frequently, so the
173// impact of a higher value is less noticeable.
175 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
176 cl::desc("The maximum look-ahead depth for searching best rooting option"));
177
178static cl::opt<bool>
179 ViewSLPTree("view-slp-tree", cl::Hidden,
180 cl::desc("Display the SLP trees with Graphviz"));
181
182// Limit the number of alias checks. The limit is chosen so that
183// it has no negative effect on the llvm benchmarks.
184static const unsigned AliasedCheckLimit = 10;
185
186// Another limit for the alias checks: The maximum distance between load/store
187// instructions where alias checks are done.
188// This limit is useful for very large basic blocks.
189static const unsigned MaxMemDepDistance = 160;
190
191/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
192/// regions to be handled.
193static const int MinScheduleRegionSize = 16;
194
195/// Predicate for the element types that the SLP vectorizer supports.
196///
197/// The most important thing to filter here are types which are invalid in LLVM
198/// vectors. We also filter target specific types which have absolutely no
199/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
200/// avoids spending time checking the cost model and realizing that they will
201/// be inevitably scalarized.
202static bool isValidElementType(Type *Ty) {
203 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
204 !Ty->isPPC_FP128Ty();
205}
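// For illustration: i32, float and double pass this predicate, while void,
// x86_fp80 and ppc_fp128 do not (either invalid as vector element types or
// with no meaningful vectorization path).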
206
207/// \returns True if the value is a constant (but not globals/constant
208/// expressions).
209static bool isConstant(Value *V) {
210 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
211}
212
213/// Checks if \p V is one of the vector-like instructions, i.e. undef,
214/// insertelement/extractelement with constant indices for fixed vector type or
215/// extractvalue instruction.
217 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
218 !isa<ExtractValueInst, UndefValue>(V))
219 return false;
220 auto *I = dyn_cast<Instruction>(V);
221 if (!I || isa<ExtractValueInst>(I))
222 return true;
223 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
224 return false;
225 if (isa<ExtractElementInst>(I))
226 return isConstant(I->getOperand(1));
227 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
228 return isConstant(I->getOperand(2));
229}
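// For illustration, a minimal IR sketch (the values %v, %x, %n and %agg are
// hypothetical) of what the predicate above accepts and rejects:
//   %e = extractelement <4 x i8> %v, i32 1          ; constant index -> true
//   %i = insertelement <4 x i8> %v, i8 %x, i32 0    ; constant index -> true
//   %d = extractelement <4 x i8> %v, i32 %n         ; variable index -> false
//   %s = extractvalue { i32, i32 } %agg, 0          ; always true
// Plain undef values are accepted as well; extracts/inserts on scalable
// vectors are rejected because the vector operand must be a FixedVectorType.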
230
231#if !defined(NDEBUG)
232/// Print a short descriptor of the instruction bundle suitable for debug output.
233static std::string shortBundleName(ArrayRef<Value *> VL) {
234 std::string Result;
235 raw_string_ostream OS(Result);
236 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
237 OS.flush();
238 return Result;
239}
240#endif
241
242/// \returns true if all of the instructions in \p VL are in the same block or
243/// false otherwise.
245 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
246 if (!I0)
247 return false;
249 return true;
250
251 BasicBlock *BB = I0->getParent();
252 for (int I = 1, E = VL.size(); I < E; I++) {
253 auto *II = dyn_cast<Instruction>(VL[I]);
254 if (!II)
255 return false;
256
257 if (BB != II->getParent())
258 return false;
259 }
260 return true;
261}
262
263/// \returns True if all of the values in \p VL are constants (but not
264/// globals/constant expressions).
266 // Constant expressions and globals can't be vectorized like normal integer/FP
267 // constants.
268 return all_of(VL, isConstant);
269}
270
271/// \returns True if all of the values in \p VL are identical or some of them
272/// are UndefValue.
273static bool isSplat(ArrayRef<Value *> VL) {
274 Value *FirstNonUndef = nullptr;
275 for (Value *V : VL) {
276 if (isa<UndefValue>(V))
277 continue;
278 if (!FirstNonUndef) {
279 FirstNonUndef = V;
280 continue;
281 }
282 if (V != FirstNonUndef)
283 return false;
284 }
285 return FirstNonUndef != nullptr;
286}
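// For illustration (hypothetical values): {%a, undef, %a} is a splat,
// {%a, %b} is not, and {undef, undef} is not because at least one non-undef
// element is required.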
287
288/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
290 if (auto *Cmp = dyn_cast<CmpInst>(I))
291 return Cmp->isCommutative();
292 if (auto *BO = dyn_cast<BinaryOperator>(I))
293 return BO->isCommutative();
294 // TODO: This should check for generic Instruction::isCommutative(), but
295 // we need to confirm that the caller code correctly handles Intrinsics
296// for example intrinsics (which do not necessarily have 2 operands).
297 return false;
298}
299
300/// \returns inserting index of InsertElement or InsertValue instruction,
301/// using Offset as base offset for index.
302static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
303 unsigned Offset = 0) {
304 int Index = Offset;
305 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
306 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
307 if (!VT)
308 return std::nullopt;
309 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
310 if (!CI)
311 return std::nullopt;
312 if (CI->getValue().uge(VT->getNumElements()))
313 return std::nullopt;
314 Index *= VT->getNumElements();
315 Index += CI->getZExtValue();
316 return Index;
317 }
318
319 const auto *IV = cast<InsertValueInst>(InsertInst);
320 Type *CurrentType = IV->getType();
321 for (unsigned I : IV->indices()) {
322 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
323 Index *= ST->getNumElements();
324 CurrentType = ST->getElementType(I);
325 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
326 Index *= AT->getNumElements();
327 CurrentType = AT->getElementType();
328 } else {
329 return std::nullopt;
330 }
331 Index += I;
332 }
333 return Index;
334}
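// A worked example for the insertvalue path above (hypothetical values):
// for "%r = insertvalue [2 x [3 x i32]] %agg, i32 %x, 1, 2" the flattened
// index is (0 * 2 + 1) * 3 + 2 = 5, i.e. row 1, column 2 of a 2x3 aggregate.
// For insertelement the result is simply Offset * NumElements + ConstantIdx.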
335
336namespace {
337/// Specifies the way the mask should be analyzed for undefs/poisonous elements
338/// in the shuffle mask.
339enum class UseMask {
340 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
341 ///< check for the mask elements for the first argument (mask
342 ///< indices are in range [0:VF)).
343 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
344 ///< for the mask elements for the second argument (mask indices
345 ///< are in range [VF:2*VF))
346 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
347 ///< future shuffle elements and mark them as ones as being used
348 ///< in future. Non-undef elements are considered as unused since
349 ///< they're already marked as used in the mask.
350};
351} // namespace
352
353/// Prepares a use bitset for the given mask either for the first argument or
354/// for the second.
356 UseMask MaskArg) {
357 SmallBitVector UseMask(VF, true);
358 for (auto [Idx, Value] : enumerate(Mask)) {
359 if (Value == PoisonMaskElem) {
360 if (MaskArg == UseMask::UndefsAsMask)
361 UseMask.reset(Idx);
362 continue;
363 }
364 if (MaskArg == UseMask::FirstArg && Value < VF)
365 UseMask.reset(Value);
366 else if (MaskArg == UseMask::SecondArg && Value >= VF)
367 UseMask.reset(Value - VF);
368 }
369 return UseMask;
370}
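// A worked example (hypothetical mask): with VF = 4 and
// Mask = {0, 5, PoisonMaskElem, 2}, the returned bitset starts all-set and
//   UseMask::FirstArg     resets bits 0 and 2 (lanes of the first vector used),
//   UseMask::SecondArg    resets bit 1 (element 5 maps to lane 5 - VF = 1),
//   UseMask::UndefsAsMask resets bit 2 (the position of the poison element).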
371
372/// Checks if the given value is actually an undefined constant vector.
373/// Also, if the \p UseMask is not empty, tries to check if the non-masked
374/// elements actually mask the insertelement buildvector, if any.
375template <bool IsPoisonOnly = false>
377 const SmallBitVector &UseMask = {}) {
378 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
379 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
380 if (isa<T>(V))
381 return Res;
382 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
383 if (!VecTy)
384 return Res.reset();
385 auto *C = dyn_cast<Constant>(V);
386 if (!C) {
387 if (!UseMask.empty()) {
388 const Value *Base = V;
389 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
390 Base = II->getOperand(0);
391 if (isa<T>(II->getOperand(1)))
392 continue;
393 std::optional<unsigned> Idx = getInsertIndex(II);
394 if (!Idx)
395 continue;
396 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
397 Res.reset(*Idx);
398 }
399 // TODO: Add analysis for shuffles here too.
400 if (V == Base) {
401 Res.reset();
402 } else {
403 SmallBitVector SubMask(UseMask.size(), false);
404 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
405 }
406 } else {
407 Res.reset();
408 }
409 return Res;
410 }
411 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
412 if (Constant *Elem = C->getAggregateElement(I))
413 if (!isa<T>(Elem) &&
414 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
415 Res.reset(I);
416 }
417 return Res;
418}
419
420/// Checks if the vector of instructions can be represented as a shuffle, like:
421/// %x0 = extractelement <4 x i8> %x, i32 0
422/// %x3 = extractelement <4 x i8> %x, i32 3
423/// %y1 = extractelement <4 x i8> %y, i32 1
424/// %y2 = extractelement <4 x i8> %y, i32 2
425/// %x0x0 = mul i8 %x0, %x0
426/// %x3x3 = mul i8 %x3, %x3
427/// %y1y1 = mul i8 %y1, %y1
428/// %y2y2 = mul i8 %y2, %y2
429/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
430/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
431/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
432/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
433/// ret <4 x i8> %ins4
434/// can be transformed into:
435/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
436/// i32 6>
437/// %2 = mul <4 x i8> %1, %1
438/// ret <4 x i8> %2
439/// We convert this initially to something like:
440/// %x0 = extractelement <4 x i8> %x, i32 0
441/// %x3 = extractelement <4 x i8> %x, i32 3
442/// %y1 = extractelement <4 x i8> %y, i32 1
443/// %y2 = extractelement <4 x i8> %y, i32 2
444/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
445/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
446/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
447/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
448/// %5 = mul <4 x i8> %4, %4
449/// %6 = extractelement <4 x i8> %5, i32 0
450/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
451/// %7 = extractelement <4 x i8> %5, i32 1
452/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
453/// %8 = extractelement <4 x i8> %5, i32 2
454/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
455/// %9 = extractelement <4 x i8> %5, i32 3
456/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
457/// ret <4 x i8> %ins4
458/// InstCombiner transforms this into a shuffle and vector mul
459/// Mask will return the Shuffle Mask equivalent to the extracted elements.
460/// TODO: Can we split off and reuse the shuffle mask detection from
461/// ShuffleVectorInst/getShuffleCost?
462static std::optional<TargetTransformInfo::ShuffleKind>
464 const auto *It =
465 find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
466 if (It == VL.end())
467 return std::nullopt;
468 auto *EI0 = cast<ExtractElementInst>(*It);
469 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
470 return std::nullopt;
471 unsigned Size =
472 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
473 Value *Vec1 = nullptr;
474 Value *Vec2 = nullptr;
475 enum ShuffleMode { Unknown, Select, Permute };
476 ShuffleMode CommonShuffleMode = Unknown;
477 Mask.assign(VL.size(), PoisonMaskElem);
478 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
479 // Undef can be represented as an undef element in a vector.
480 if (isa<UndefValue>(VL[I]))
481 continue;
482 auto *EI = cast<ExtractElementInst>(VL[I]);
483 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
484 return std::nullopt;
485 auto *Vec = EI->getVectorOperand();
486 // We can extractelement from undef or poison vector.
487 if (isUndefVector(Vec).all())
488 continue;
489 // All vector operands must have the same number of vector elements.
490 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
491 return std::nullopt;
492 if (isa<UndefValue>(EI->getIndexOperand()))
493 continue;
494 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
495 if (!Idx)
496 return std::nullopt;
497 // Undefined behavior if Idx is negative or >= Size.
498 if (Idx->getValue().uge(Size))
499 continue;
500 unsigned IntIdx = Idx->getValue().getZExtValue();
501 Mask[I] = IntIdx;
502 // For correct shuffling we have to have at most 2 different vector operands
503 // in all extractelement instructions.
504 if (!Vec1 || Vec1 == Vec) {
505 Vec1 = Vec;
506 } else if (!Vec2 || Vec2 == Vec) {
507 Vec2 = Vec;
508 Mask[I] += Size;
509 } else {
510 return std::nullopt;
511 }
512 if (CommonShuffleMode == Permute)
513 continue;
514 // If the extract index is not the same as the operation number, it is a
515 // permutation.
516 if (IntIdx != I) {
517 CommonShuffleMode = Permute;
518 continue;
519 }
520 CommonShuffleMode = Select;
521 }
522 // If we're not crossing lanes in different vectors, consider it as blending.
523 if (CommonShuffleMode == Select && Vec2)
525 // If Vec2 was never used, we have a permutation of a single vector, otherwise
526 // we have a permutation of 2 vectors.
529}
530
531/// \returns True if Extract{Value,Element} instruction extracts element Idx.
532static std::optional<unsigned> getExtractIndex(Instruction *E) {
533 unsigned Opcode = E->getOpcode();
534 assert((Opcode == Instruction::ExtractElement ||
535 Opcode == Instruction::ExtractValue) &&
536 "Expected extractelement or extractvalue instruction.");
537 if (Opcode == Instruction::ExtractElement) {
538 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
539 if (!CI)
540 return std::nullopt;
541 return CI->getZExtValue();
542 }
543 auto *EI = cast<ExtractValueInst>(E);
544 if (EI->getNumIndices() != 1)
545 return std::nullopt;
546 return *EI->idx_begin();
547}
548
549/// Tries to find extractelement instructions with constant indices from fixed
550/// vector type and gather such instructions into a bunch that is highly likely
551/// to be detected as a shuffle of 1 or 2 input vectors. If this attempt was
552/// successful, the matched scalars are replaced by poison values in \p VL for
553/// future analysis.
554static std::optional<TTI::ShuffleKind>
556 SmallVectorImpl<int> &Mask) {
557 // Scan list of gathered scalars for extractelements that can be represented
558 // as shuffles.
560 SmallVector<int> UndefVectorExtracts;
561 for (int I = 0, E = VL.size(); I < E; ++I) {
562 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
563 if (!EI) {
564 if (isa<UndefValue>(VL[I]))
565 UndefVectorExtracts.push_back(I);
566 continue;
567 }
568 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
569 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
570 continue;
571 std::optional<unsigned> Idx = getExtractIndex(EI);
572 // Undefined index.
573 if (!Idx) {
574 UndefVectorExtracts.push_back(I);
575 continue;
576 }
577 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
578 ExtractMask.reset(*Idx);
579 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
580 UndefVectorExtracts.push_back(I);
581 continue;
582 }
583 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
584 }
585 // Sort the vector operands by the maximum number of uses in extractelements.
587 for (const auto &Data : VectorOpToIdx)
588 VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
589 .push_back(Data.first);
590 for (auto &Data : VFToVector) {
591 stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
592 return VectorOpToIdx.find(V1)->second.size() >
593 VectorOpToIdx.find(V2)->second.size();
594 });
595 }
596 // Find the best pair of the vectors with the same number of elements or a
597 // single vector.
598 const int UndefSz = UndefVectorExtracts.size();
599 unsigned SingleMax = 0;
600 Value *SingleVec = nullptr;
601 unsigned PairMax = 0;
602 std::pair<Value *, Value *> PairVec(nullptr, nullptr);
603 for (auto &Data : VFToVector) {
604 Value *V1 = Data.second.front();
605 if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
606 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
607 SingleVec = V1;
608 }
609 Value *V2 = nullptr;
610 if (Data.second.size() > 1)
611 V2 = *std::next(Data.second.begin());
612 if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
613 UndefSz) {
614 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
615 PairVec = std::make_pair(V1, V2);
616 }
617 }
618 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
619 return std::nullopt;
620 // Check whether it is better to perform a shuffle of 2 vectors or just of a
621 // single vector.
622 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
623 SmallVector<Value *> GatheredExtracts(
624 VL.size(), PoisonValue::get(VL.front()->getType()));
625 if (SingleMax >= PairMax && SingleMax) {
626 for (int Idx : VectorOpToIdx[SingleVec])
627 std::swap(GatheredExtracts[Idx], VL[Idx]);
628 } else {
629 for (Value *V : {PairVec.first, PairVec.second})
630 for (int Idx : VectorOpToIdx[V])
631 std::swap(GatheredExtracts[Idx], VL[Idx]);
632 }
633 // Add extracts from undefs too.
634 for (int Idx : UndefVectorExtracts)
635 std::swap(GatheredExtracts[Idx], VL[Idx]);
636 // Check that gather of extractelements can be represented as just a
637 // shuffle of one or two vectors from which the scalars are extracted.
638 std::optional<TTI::ShuffleKind> Res =
639 isFixedVectorShuffle(GatheredExtracts, Mask);
640 if (!Res) {
641 // TODO: try to check other subsets if possible.
642 // Restore the original VL if attempt was not successful.
643 VL.swap(SavedVL);
644 return std::nullopt;
645 }
646 // Restore unused scalars from mask, if some of the extractelements were not
647 // selected for shuffle.
648 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
649 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
650 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
651 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
652 is_contained(UndefVectorExtracts, I))
653 continue;
654 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]))
655 std::swap(VL[I], GatheredExtracts[I]);
656 }
657 return Res;
658}
659
660namespace {
661
662/// Main data required for vectorization of instructions.
663struct InstructionsState {
664 /// The very first instruction in the list with the main opcode.
665 Value *OpValue = nullptr;
666
667 /// The main/alternate instruction.
668 Instruction *MainOp = nullptr;
669 Instruction *AltOp = nullptr;
670
671 /// The main/alternate opcodes for the list of instructions.
672 unsigned getOpcode() const {
673 return MainOp ? MainOp->getOpcode() : 0;
674 }
675
676 unsigned getAltOpcode() const {
677 return AltOp ? AltOp->getOpcode() : 0;
678 }
679
680 /// Some of the instructions in the list have alternate opcodes.
681 bool isAltShuffle() const { return AltOp != MainOp; }
682
683 bool isOpcodeOrAlt(Instruction *I) const {
684 unsigned CheckedOpcode = I->getOpcode();
685 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
686 }
687
688 InstructionsState() = delete;
689 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
690 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
691};
692
693} // end anonymous namespace
694
695/// Chooses the correct key for scheduling data. If \p Op has the same (or
696/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
697/// OpValue.
698static Value *isOneOf(const InstructionsState &S, Value *Op) {
699 auto *I = dyn_cast<Instruction>(Op);
700 if (I && S.isOpcodeOrAlt(I))
701 return Op;
702 return S.OpValue;
703}
704
705/// \returns true if \p Opcode is allowed as part of the main/alternate
706/// instruction for SLP vectorization.
707///
708/// Example of unsupported opcode is SDIV that can potentially cause UB if the
709/// "shuffled out" lane would result in division by zero.
710static bool isValidForAlternation(unsigned Opcode) {
711 if (Instruction::isIntDivRem(Opcode))
712 return false;
713
714 return true;
715}
716
717static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
718 const TargetLibraryInfo &TLI,
719 unsigned BaseIndex = 0);
720
721/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
722/// compatible instructions or constants, or just some other regular values.
723static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
724 Value *Op1, const TargetLibraryInfo &TLI) {
725 return (isConstant(BaseOp0) && isConstant(Op0)) ||
726 (isConstant(BaseOp1) && isConstant(Op1)) ||
727 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
728 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
729 BaseOp0 == Op0 || BaseOp1 == Op1 ||
730 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
731 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
732}
733
734/// \returns true if a compare instruction \p CI has similar "look" and
735/// same predicate as \p BaseCI, "as is" or with its operands and predicate
736/// swapped, false otherwise.
737static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
738 const TargetLibraryInfo &TLI) {
739 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
740 "Assessing comparisons of different types?");
741 CmpInst::Predicate BasePred = BaseCI->getPredicate();
742 CmpInst::Predicate Pred = CI->getPredicate();
744
745 Value *BaseOp0 = BaseCI->getOperand(0);
746 Value *BaseOp1 = BaseCI->getOperand(1);
747 Value *Op0 = CI->getOperand(0);
748 Value *Op1 = CI->getOperand(1);
749
750 return (BasePred == Pred &&
751 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
752 (BasePred == SwappedPred &&
753 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
754}
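// For illustration (hypothetical operands): with BaseCI = "icmp slt i32 %a, %b",
// both "icmp slt i32 %a, %c" (same predicate, first operands match) and
// "icmp sgt i32 %b, %a" (swapped predicate with swapped operands) are
// considered the same or swapped.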
755
756/// \returns analysis of the Instructions in \p VL described in
757/// InstructionsState, i.e. the opcode with which we suppose the whole list
758/// could be vectorized even if its structure is diverse.
759static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
760 const TargetLibraryInfo &TLI,
761 unsigned BaseIndex) {
762 // Make sure these are all Instructions.
763 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
764 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
765
766 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
767 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
768 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
769 CmpInst::Predicate BasePred =
770 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
772 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
773 unsigned AltOpcode = Opcode;
774 unsigned AltIndex = BaseIndex;
775
776 // Check for one alternate opcode from another BinaryOperator.
777 // TODO - generalize to support all operators (types, calls etc.).
778 auto *IBase = cast<Instruction>(VL[BaseIndex]);
779 Intrinsic::ID BaseID = 0;
780 SmallVector<VFInfo> BaseMappings;
781 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
783 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
784 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
785 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
786 }
787 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
788 auto *I = cast<Instruction>(VL[Cnt]);
789 unsigned InstOpcode = I->getOpcode();
790 if (IsBinOp && isa<BinaryOperator>(I)) {
791 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
792 continue;
793 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
794 isValidForAlternation(Opcode)) {
795 AltOpcode = InstOpcode;
796 AltIndex = Cnt;
797 continue;
798 }
799 } else if (IsCastOp && isa<CastInst>(I)) {
800 Value *Op0 = IBase->getOperand(0);
801 Type *Ty0 = Op0->getType();
802 Value *Op1 = I->getOperand(0);
803 Type *Ty1 = Op1->getType();
804 if (Ty0 == Ty1) {
805 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
806 continue;
807 if (Opcode == AltOpcode) {
809 isValidForAlternation(InstOpcode) &&
810 "Cast isn't safe for alternation, logic needs to be updated!");
811 AltOpcode = InstOpcode;
812 AltIndex = Cnt;
813 continue;
814 }
815 }
816 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
817 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
818 Type *Ty0 = BaseInst->getOperand(0)->getType();
819 Type *Ty1 = Inst->getOperand(0)->getType();
820 if (Ty0 == Ty1) {
821 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
822 // Check for compatible operands. If the corresponding operands are not
823 // compatible - need to perform alternate vectorization.
824 CmpInst::Predicate CurrentPred = Inst->getPredicate();
825 CmpInst::Predicate SwappedCurrentPred =
826 CmpInst::getSwappedPredicate(CurrentPred);
827
828 if (E == 2 &&
829 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
830 continue;
831
832 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
833 continue;
834 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
835 if (AltIndex != BaseIndex) {
836 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
837 continue;
838 } else if (BasePred != CurrentPred) {
839 assert(
840 isValidForAlternation(InstOpcode) &&
841 "CmpInst isn't safe for alternation, logic needs to be updated!");
842 AltIndex = Cnt;
843 continue;
844 }
845 CmpInst::Predicate AltPred = AltInst->getPredicate();
846 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
847 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
848 continue;
849 }
850 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
851 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
852 if (Gep->getNumOperands() != 2 ||
853 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
854 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
855 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
857 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
858 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
859 auto *BaseLI = cast<LoadInst>(IBase);
860 if (!LI->isSimple() || !BaseLI->isSimple())
861 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
862 } else if (auto *Call = dyn_cast<CallInst>(I)) {
863 auto *CallBase = cast<CallInst>(IBase);
864 if (Call->getCalledFunction() != CallBase->getCalledFunction())
865 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
866 if (Call->hasOperandBundles() &&
867 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
868 Call->op_begin() + Call->getBundleOperandsEndIndex(),
869 CallBase->op_begin() +
871 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
873 if (ID != BaseID)
874 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
875 if (!ID) {
876 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
877 if (Mappings.size() != BaseMappings.size() ||
878 Mappings.front().ISA != BaseMappings.front().ISA ||
879 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
880 Mappings.front().VectorName != BaseMappings.front().VectorName ||
881 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
882 Mappings.front().Shape.Parameters !=
883 BaseMappings.front().Shape.Parameters)
884 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
885 }
886 }
887 continue;
888 }
889 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
890 }
891
892 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
893 cast<Instruction>(VL[AltIndex]));
894}
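// For illustration (a hypothetical bundle): for VL = {add, sub, add, sub} of
// matching types, the returned state has MainOp set to the first add and
// AltOp to the first sub, so isAltShuffle() is true. Mixing in an sdiv
// instead yields a state with a null MainOp (getOpcode() == 0), i.e. the list
// is not treated as a single vectorizable opcode.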
895
896/// \returns true if all of the values in \p VL have the same type or false
897/// otherwise.
899 Type *Ty = VL.front()->getType();
900 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
901}
902
903/// \returns True if in-tree use also needs extract. This refers to
904/// possible scalar operand in vectorized instruction.
905static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
906 TargetLibraryInfo *TLI) {
907 unsigned Opcode = UserInst->getOpcode();
908 switch (Opcode) {
909 case Instruction::Load: {
910 LoadInst *LI = cast<LoadInst>(UserInst);
911 return (LI->getPointerOperand() == Scalar);
912 }
913 case Instruction::Store: {
914 StoreInst *SI = cast<StoreInst>(UserInst);
915 return (SI->getPointerOperand() == Scalar);
916 }
917 case Instruction::Call: {
918 CallInst *CI = cast<CallInst>(UserInst);
920 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
921 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
922 Arg.value().get() == Scalar;
923 });
924 }
925 default:
926 return false;
927 }
928}
929
930/// \returns the AA location that is being accessed by the instruction.
932 if (StoreInst *SI = dyn_cast<StoreInst>(I))
933 return MemoryLocation::get(SI);
934 if (LoadInst *LI = dyn_cast<LoadInst>(I))
935 return MemoryLocation::get(LI);
936 return MemoryLocation();
937}
938
939/// \returns True if the instruction is not a volatile or atomic load/store.
940static bool isSimple(Instruction *I) {
941 if (LoadInst *LI = dyn_cast<LoadInst>(I))
942 return LI->isSimple();
943 if (StoreInst *SI = dyn_cast<StoreInst>(I))
944 return SI->isSimple();
945 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
946 return !MI->isVolatile();
947 return true;
948}
949
950/// Shuffles \p Mask in accordance with the given \p SubMask.
951/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
952/// one but two input vectors.
953static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
954 bool ExtendingManyInputs = false) {
955 if (SubMask.empty())
956 return;
957 assert(
958 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
959 // Check if input scalars were extended to match the size of other node.
960 (SubMask.size() == Mask.size() &&
961 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
962 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
963 "SubMask with many inputs support must be larger than the mask.");
964 if (Mask.empty()) {
965 Mask.append(SubMask.begin(), SubMask.end());
966 return;
967 }
968 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
969 int TermValue = std::min(Mask.size(), SubMask.size());
970 for (int I = 0, E = SubMask.size(); I < E; ++I) {
971 if (SubMask[I] == PoisonMaskElem ||
972 (!ExtendingManyInputs &&
973 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
974 continue;
975 NewMask[I] = Mask[SubMask[I]];
976 }
977 Mask.swap(NewMask);
978}
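// A worked example (hypothetical masks of equal size): composing
// Mask = {3, 2, 1, 0} with SubMask = {1, 0, 3, 2} yields
// NewMask[I] = Mask[SubMask[I]] = {2, 3, 0, 1}; SubMask elements equal to
// PoisonMaskElem (or indexing out-of-bounds "undef" positions) stay poison.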
979
980/// Order may have elements assigned the special value (size), which is out of
981/// bounds. Such indices appear only at positions that correspond to undef values
982/// (see canReuseExtract for details) and are used to keep undef values from
983/// affecting operand ordering.
984/// The first loop below simply finds all unused indices and then the next loop
985/// nest assigns these indices for undef values positions.
986/// As an example below Order has two undef positions and they have assigned
987/// values 3 and 7 respectively:
988/// before: 6 9 5 4 9 2 1 0
989/// after: 6 3 5 4 7 2 1 0
991 const unsigned Sz = Order.size();
992 SmallBitVector UnusedIndices(Sz, /*t=*/true);
993 SmallBitVector MaskedIndices(Sz);
994 for (unsigned I = 0; I < Sz; ++I) {
995 if (Order[I] < Sz)
996 UnusedIndices.reset(Order[I]);
997 else
998 MaskedIndices.set(I);
999 }
1000 if (MaskedIndices.none())
1001 return;
1002 assert(UnusedIndices.count() == MaskedIndices.count() &&
1003 "Non-synced masked/available indices.");
1004 int Idx = UnusedIndices.find_first();
1005 int MIdx = MaskedIndices.find_first();
1006 while (MIdx >= 0) {
1007 assert(Idx >= 0 && "Indices must be synced.");
1008 Order[MIdx] = Idx;
1009 Idx = UnusedIndices.find_next(Idx);
1010 MIdx = MaskedIndices.find_next(MIdx);
1011 }
1012}
1013
1014namespace llvm {
1015
1017 SmallVectorImpl<int> &Mask) {
1018 Mask.clear();
1019 const unsigned E = Indices.size();
1020 Mask.resize(E, PoisonMaskElem);
1021 for (unsigned I = 0; I < E; ++I)
1022 Mask[Indices[I]] = I;
1023}
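// A worked example (hypothetical order): Indices = {2, 0, 1} produces
// Mask = {1, 2, 0}, since Mask[Indices[I]] = I, i.e. the mask is the inverse
// permutation of the given order.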
1024
1025/// Reorders the list of scalars in accordance with the given \p Mask.
1027 ArrayRef<int> Mask) {
1028 assert(!Mask.empty() && "Expected non-empty mask.");
1029 SmallVector<Value *> Prev(Scalars.size(),
1030 UndefValue::get(Scalars.front()->getType()));
1031 Prev.swap(Scalars);
1032 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1033 if (Mask[I] != PoisonMaskElem)
1034 Scalars[Mask[I]] = Prev[I];
1035}
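// For illustration (hypothetical scalars): Scalars = {a, b, c} with
// Mask = {2, 0, 1} becomes {b, c, a}, because each original element Prev[I]
// is placed at position Mask[I].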
1036
1037/// Checks if the provided value does not require scheduling. It does not
1038/// require scheduling if this is not an instruction or it is an instruction
1039/// that does not read/write memory and all operands are either not instructions,
1040/// are phi nodes, or are instructions from different blocks.
1042 auto *I = dyn_cast<Instruction>(V);
1043 if (!I)
1044 return true;
1045 return !mayHaveNonDefUseDependency(*I) &&
1046 all_of(I->operands(), [I](Value *V) {
1047 auto *IO = dyn_cast<Instruction>(V);
1048 if (!IO)
1049 return true;
1050 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1051 });
1052}
1053
1054/// Checks if the provided value does not require scheduling. It does not
1055/// require scheduling if this is not an instruction or it is an instruction
1056/// that does not read/write memory and all users are phi nodes or instructions
1057/// from different blocks.
1058static bool isUsedOutsideBlock(Value *V) {
1059 auto *I = dyn_cast<Instruction>(V);
1060 if (!I)
1061 return true;
1062 // Limits the number of uses to save compile time.
1063 constexpr int UsesLimit = 8;
1064 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1065 all_of(I->users(), [I](User *U) {
1066 auto *IU = dyn_cast<Instruction>(U);
1067 if (!IU)
1068 return true;
1069 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1070 });
1071}
1072
1073/// Checks if the specified value does not require scheduling. It does not
1074/// require scheduling if all operands and all users do not need to be scheduled
1075/// in the current basic block.
1078}
1079
1080/// Checks if the specified array of instructions does not require scheduling.
1081/// It is so if all either instructions have operands that do not require
1082/// scheduling or their users do not require scheduling since they are phis or
1083/// in other basic blocks.
1085 return !VL.empty() &&
1087}
1088
1089namespace slpvectorizer {
1090
1091/// Bottom Up SLP Vectorizer.
1092class BoUpSLP {
1093 struct TreeEntry;
1094 struct ScheduleData;
1097
1098public:
1106
1108 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1111 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
1112 DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
1113 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1114 // Use the vector register size specified by the target unless overridden
1115 // by a command-line option.
1116 // TODO: It would be better to limit the vectorization factor based on
1117 // data type rather than just register size. For example, x86 AVX has
1118 // 256-bit registers, but it does not support integer operations
1119 // at that width (that requires AVX2).
1120 if (MaxVectorRegSizeOption.getNumOccurrences())
1121 MaxVecRegSize = MaxVectorRegSizeOption;
1122 else
1123 MaxVecRegSize =
1125 .getFixedValue();
1126
1127 if (MinVectorRegSizeOption.getNumOccurrences())
1128 MinVecRegSize = MinVectorRegSizeOption;
1129 else
1130 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1131 }
1132
1133 /// Vectorize the tree that starts with the elements in \p VL.
1134 /// Returns the vectorized root.
1136
1137 /// Vectorize the tree but with the list of externally used values \p
1138 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1139 /// generated extractelement instructions.
1140 /// \param ReplacedExternals contains the list of replaced external values
1141 /// {scalar, replace} after emitting extractelement for external uses.
1142 Value *
1143 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1144 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1145 Instruction *ReductionRoot = nullptr);
1146
1147 /// \returns the cost incurred by unwanted spills and fills, caused by
1148 /// holding live values over call sites.
1150
1151 /// \returns the vectorization cost of the subtree that starts at \p VL.
1152 /// A negative number means that this is profitable.
1153 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1154
1155 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1156 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1157 void buildTree(ArrayRef<Value *> Roots,
1158 const SmallDenseSet<Value *> &UserIgnoreLst);
1159
1160 /// Construct a vectorizable tree that starts at \p Roots.
1161 void buildTree(ArrayRef<Value *> Roots);
1162
1163 /// Returns whether the root node has in-tree uses.
1165 return !VectorizableTree.empty() &&
1166 !VectorizableTree.front()->UserTreeIndices.empty();
1167 }
1168
1169 /// Return the scalars of the root node.
1171 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1172 return VectorizableTree.front()->Scalars;
1173 }
1174
1175 /// Builds external uses of the vectorized scalars, i.e. the list of
1176 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1177 /// ExternallyUsedValues contains additional list of external uses to handle
1178 /// vectorization of reductions.
1179 void
1180 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1181
1182 /// Clear the internal data structures that are created by 'buildTree'.
1183 void deleteTree() {
1184 VectorizableTree.clear();
1185 ScalarToTreeEntry.clear();
1186 MultiNodeScalars.clear();
1187 MustGather.clear();
1188 EntryToLastInstruction.clear();
1189 ExternalUses.clear();
1190 for (auto &Iter : BlocksSchedules) {
1191 BlockScheduling *BS = Iter.second.get();
1192 BS->clear();
1193 }
1194 MinBWs.clear();
1195 InstrElementSize.clear();
1196 UserIgnoreList = nullptr;
1197 PostponedGathers.clear();
1198 ValueToGatherNodes.clear();
1199 }
1200
1201 unsigned getTreeSize() const { return VectorizableTree.size(); }
1202
1203 /// Perform LICM and CSE on the newly generated gather sequences.
1205
1206 /// Checks if the specified gather tree entry \p TE can be represented as a
1207 /// shuffled vector entry + (possibly) permutation with other gathers. It
1208 /// implements the checks only for possibly ordered scalars (Loads,
1209 /// ExtractElement, ExtractValue), which can be part of the graph.
1210 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1211
1212 /// Sort loads into increasing pointer offsets to allow greater clustering.
1213 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1214
1215 /// Gets reordering data for the given tree entry. If the entry is vectorized
1216 /// - just return ReorderIndices, otherwise check if the scalars can be
1217 /// reordered and return the most optimal order.
1218 /// \return std::nullopt if ordering is not important, empty order, if
1219 /// identity order is important, or the actual order.
1220 /// \param TopToBottom If true, include the order of vectorized stores and
1221 /// insertelement nodes, otherwise skip them.
1222 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1223 bool TopToBottom);
1224
1225 /// Reorders the current graph to the most profitable order starting from the
1226 /// root node to the leaf nodes. The best order is chosen only from the nodes
1227 /// of the same size (vectorization factor). Smaller nodes are considered
1228 /// parts of a subgraph with a smaller VF and they are reordered independently. We
1229 /// can do this because we still need to extend smaller nodes to the wider VF
1230 /// and we can merge reordering shuffles with the widening shuffles.
1231 void reorderTopToBottom();
1232
1233 /// Reorders the current graph to the most profitable order starting from
1234 /// leaves to the root. It allows rotating small subgraphs and reducing the
1235 /// number of reshuffles if the leaf nodes use the same order. In this case we
1236 /// can merge the orders and just shuffle the user node instead of shuffling its
1237 /// operands. Plus, even if the leaf nodes have different orders, it allows
1238 /// reordering to be sunk in the graph closer to the root node and merged later
1239 /// during analysis.
1240 void reorderBottomToTop(bool IgnoreReorder = false);
1241
1242 /// \return The vector element size in bits to use when vectorizing the
1243 /// expression tree ending at \p V. If V is a store, the size is the width of
1244 /// the stored value. Otherwise, the size is the width of the largest loaded
1245 /// value reaching V. This method is used by the vectorizer to calculate
1246 /// vectorization factors.
1247 unsigned getVectorElementSize(Value *V);
1248
1249 /// Compute the minimum type sizes required to represent the entries in a
1250 /// vectorizable tree.
1252
1253 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1254 unsigned getMaxVecRegSize() const {
1255 return MaxVecRegSize;
1256 }
1257
1258 // \returns minimum vector register size as set by cl::opt.
1259 unsigned getMinVecRegSize() const {
1260 return MinVecRegSize;
1261 }
1262
1263 unsigned getMinVF(unsigned Sz) const {
1264 return std::max(2U, getMinVecRegSize() / Sz);
1265 }
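  // For illustration: with the default MinVecRegSize of 128 bits, a 32-bit
  // element type gives a minimum VF of 4 and a 64-bit element type gives 2.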
1266
1267 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1268 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1269 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1270 return MaxVF ? MaxVF : UINT_MAX;
1271 }
1272
1273 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1274 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1275 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1276 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1277 ///
1278 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1279 unsigned canMapToVector(Type *T, const DataLayout &DL) const;
1280
1281 /// \returns True if the VectorizableTree is both tiny and not fully
1282 /// vectorizable. We do not vectorize such trees.
1283 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1284
1285 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1286 /// can be load combined in the backend. Load combining may not be allowed in
1287 /// the IR optimizer, so we do not want to alter the pattern. For example,
1288 /// partially transforming a scalar bswap() pattern into vector code is
1289 /// effectively impossible for the backend to undo.
1290 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1291 /// may not be necessary.
1292 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1293
1294 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1295 /// can be load combined in the backend. Load combining may not be allowed in
1296 /// the IR optimizer, so we do not want to alter the pattern. For example,
1297 /// partially transforming a scalar bswap() pattern into vector code is
1298 /// effectively impossible for the backend to undo.
1299 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1300 /// may not be necessary.
1301 bool isLoadCombineCandidate() const;
1302
1304
1305 /// This structure holds any data we need about the edges being traversed
1306 /// during buildTree_rec(). We keep track of:
1307 /// (i) the user TreeEntry index, and
1308 /// (ii) the index of the edge.
1309 struct EdgeInfo {
1310 EdgeInfo() = default;
1311 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1313 /// The user TreeEntry.
1314 TreeEntry *UserTE = nullptr;
1315 /// The operand index of the use.
1316 unsigned EdgeIdx = UINT_MAX;
1317#ifndef NDEBUG
1319 const BoUpSLP::EdgeInfo &EI) {
1320 EI.dump(OS);
1321 return OS;
1322 }
1323 /// Debug print.
1324 void dump(raw_ostream &OS) const {
1325 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1326 << " EdgeIdx:" << EdgeIdx << "}";
1327 }
1328 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1329#endif
1330 bool operator == (const EdgeInfo &Other) const {
1331 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1332 }
1333 };
1334
1335 /// A helper class used for scoring candidates for two consecutive lanes.
1337 const TargetLibraryInfo &TLI;
1338 const DataLayout &DL;
1339 ScalarEvolution &SE;
1340 const BoUpSLP &R;
1341 int NumLanes; // Total number of lanes (aka vectorization factor).
1342 int MaxLevel; // The maximum recursion depth for accumulating score.
1343
1344 public:
1346 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1347 int MaxLevel)
1348 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1349 MaxLevel(MaxLevel) {}
1350
1351 // The hard-coded scores listed here are not very important, though they
1352 // should be higher for better matches to improve the resulting cost. When
1353 // computing the scores of matching one sub-tree with another, we are
1354 // basically counting the number of values that are matching. So even if all
1355 // scores are set to 1, we would still get a decent matching result.
1356 // However, sometimes we have to break ties. For example we may have to
1357 // choose between matching loads vs matching opcodes. This is what these
1358 // scores are helping us with: they provide the order of preference. Also,
1359 // this is important if the scalar is externally used or used in another
1360 // tree entry node in the different lane.
1361
1362 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1363 static const int ScoreConsecutiveLoads = 4;
1364 /// The same load multiple times. This should have a better score than
1365 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1366 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
1367 /// a vector load plus 1.0 for a broadcast.
1368 static const int ScoreSplatLoads = 3;
1369 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1370 static const int ScoreReversedLoads = 3;
1371 /// A load candidate for masked gather.
1372 static const int ScoreMaskedGatherCandidate = 1;
1373 /// ExtractElementInst from same vector and consecutive indexes.
1374 static const int ScoreConsecutiveExtracts = 4;
1375 /// ExtractElementInst from same vector and reversed indices.
1376 static const int ScoreReversedExtracts = 3;
1377 /// Constants.
1378 static const int ScoreConstants = 2;
1379 /// Instructions with the same opcode.
1380 static const int ScoreSameOpcode = 2;
1381 /// Instructions with alt opcodes (e.g, add + sub).
1382 static const int ScoreAltOpcodes = 1;
1383 /// Identical instructions (a.k.a. splat or broadcast).
1384 static const int ScoreSplat = 1;
1385 /// Matching with an undef is preferable to failing.
1386 static const int ScoreUndef = 1;
1387 /// Score for failing to find a decent match.
1388 static const int ScoreFail = 0;
1389 /// Score if all users are vectorized.
1390 static const int ScoreAllUserVectorized = 1;
1391
1392 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1393 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1394 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1395 /// MainAltOps.
1397 ArrayRef<Value *> MainAltOps) const {
1398 if (!isValidElementType(V1->getType()) ||
1399 !isValidElementType(V2->getType()))
1401
1402 if (V1 == V2) {
1403 if (isa<LoadInst>(V1)) {
1404 // Returns true if the users of V1 and V2 won't need to be extracted.
1405 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1406 // Bail out if we have too many uses to save compilation time.
1407 static constexpr unsigned Limit = 8;
1408 if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
1409 return false;
1410
1411 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1412 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1413 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1414 });
1415 };
1416 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1417 };
1418 // A broadcast of a load can be cheaper on some targets.
1419 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1420 ElementCount::getFixed(NumLanes)) &&
1421 ((int)V1->getNumUses() == NumLanes ||
1422 AllUsersAreInternal(V1, V2)))
1424 }
1426 }
1427
1428 auto *LI1 = dyn_cast<LoadInst>(V1);
1429 auto *LI2 = dyn_cast<LoadInst>(V2);
1430 if (LI1 && LI2) {
1431 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1432 !LI2->isSimple())
1434
1435 std::optional<int> Dist = getPointersDiff(
1436 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1437 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1438 if (!Dist || *Dist == 0) {
1439 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1440 getUnderlyingObject(LI2->getPointerOperand()) &&
1441 R.TTI->isLegalMaskedGather(
1442 FixedVectorType::get(LI1->getType(), NumLanes),
1443 LI1->getAlign()))
1446 }
1447 // The distance is too large - still may be profitable to use masked
1448 // loads/gathers.
1449 if (std::abs(*Dist) > NumLanes / 2)
1451 // This still will detect consecutive loads, but we might have "holes"
1452 // in some cases. It is ok for non-power-2 vectorization and may produce
1453 // better results. It should not affect current vectorization.
1456 }
1457
1458 auto *C1 = dyn_cast<Constant>(V1);
1459 auto *C2 = dyn_cast<Constant>(V2);
1460 if (C1 && C2)
1462
1463 // Extracts from consecutive indexes of the same vector better score as
1464 // the extracts could be optimized away.
1465 Value *EV1;
1466 ConstantInt *Ex1Idx;
1467 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1468 // Undefs are always profitable for extractelements.
1469 // Compiler can easily combine poison and extractelement <non-poison> or
1470 // undef and extractelement <poison>. But combining undef +
1471 // extractelement <non-poison-but-may-produce-poison> requires some
1472 // extra operations.
1473 if (isa<UndefValue>(V2))
1474 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1477 Value *EV2 = nullptr;
1478 ConstantInt *Ex2Idx = nullptr;
1479 if (match(V2,
1481 m_Undef())))) {
1482 // Undefs are always profitable for extractelements.
1483 if (!Ex2Idx)
1485 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1487 if (EV2 == EV1) {
1488 int Idx1 = Ex1Idx->getZExtValue();
1489 int Idx2 = Ex2Idx->getZExtValue();
1490 int Dist = Idx2 - Idx1;
1491 // The distance is too large - still may be profitable to use
1492 // shuffles.
1493 if (std::abs(Dist) == 0)
1495 if (std::abs(Dist) > NumLanes / 2)
1499 }
1501 }
1503 }
1504
1505 auto *I1 = dyn_cast<Instruction>(V1);
1506 auto *I2 = dyn_cast<Instruction>(V2);
1507 if (I1 && I2) {
1508 if (I1->getParent() != I2->getParent())
1510 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1511 Ops.push_back(I1);
1512 Ops.push_back(I2);
1513 InstructionsState S = getSameOpcode(Ops, TLI);
1514 // Note: Only consider instructions with <= 2 operands to avoid
1515 // complexity explosion.
1516 if (S.getOpcode() &&
1517 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1518 !S.isAltShuffle()) &&
1519 all_of(Ops, [&S](Value *V) {
1520 return cast<Instruction>(V)->getNumOperands() ==
1521 S.MainOp->getNumOperands();
1522 }))
1523 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1525 }
1526
1527 if (isa<UndefValue>(V2))
1529
1531 }
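  // For illustration (per the score constants above; values are hypothetical
  // and assumed to be in the same basic block): loads of A[i] and A[i+1]
  // placed in consecutive lanes score ScoreConsecutiveLoads, two identical
  // constants score ScoreConstants, two adds with different operands score
  // ScoreSameOpcode, and an add paired with a sub scores ScoreAltOpcodes.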
1532
1533 /// Go through the operands of \p LHS and \p RHS recursively until
1534 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1535 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1536 /// of \p U1 and \p U2), except at the beginning of the recursion where
1537 /// these are set to nullptr.
1538 ///
1539 /// For example:
1540 /// \verbatim
1541 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1542 /// \ / \ / \ / \ /
1543 /// + + + +
1544 /// G1 G2 G3 G4
1545 /// \endverbatim
1546 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1547 /// each level recursively, accumulating the score. It starts from matching
1548 /// the additions at level 0, then moves on to the loads (level 1). The
1549 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1550 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1551 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1552 /// Please note that the order of the operands does not matter, as we
1553 /// evaluate the score of all profitable combinations of operands. In
1554 /// other words the score of G1 and G4 is the same as G1 and G2. This
1555 /// heuristic is based on ideas described in:
1556 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1557 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1558 /// Luís F. W. Góes
1560 Instruction *U2, int CurrLevel,
1561 ArrayRef<Value *> MainAltOps) const {
1562
1563    // Get the shallow score of LHS and RHS.
1564 int ShallowScoreAtThisLevel =
1565 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1566
1567 // If reached MaxLevel,
1568    // or if LHS and RHS are not instructions,
1569 // or if they are SPLAT,
1570 // or if they are not consecutive,
1571    // or if it is profitable to vectorize loads or extractelements, return
1572    // the current cost early.
1573 auto *I1 = dyn_cast<Instruction>(LHS);
1574 auto *I2 = dyn_cast<Instruction>(RHS);
1575 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1576 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1577 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1578 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1579 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1580 ShallowScoreAtThisLevel))
1581 return ShallowScoreAtThisLevel;
1582 assert(I1 && I2 && "Should have early exited.");
1583
1584 // Contains the I2 operand indexes that got matched with I1 operands.
1585 SmallSet<unsigned, 4> Op2Used;
1586
1587 // Recursion towards the operands of I1 and I2. We are trying all possible
1588 // operand pairs, and keeping track of the best score.
1589 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1590 OpIdx1 != NumOperands1; ++OpIdx1) {
1591      // Try to pair the operand at OpIdx1 with the best operand of I2.
1592 int MaxTmpScore = 0;
1593 unsigned MaxOpIdx2 = 0;
1594 bool FoundBest = false;
1595 // If I2 is commutative try all combinations.
1596 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1597 unsigned ToIdx = isCommutative(I2)
1598 ? I2->getNumOperands()
1599 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1600 assert(FromIdx <= ToIdx && "Bad index");
1601 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1602 // Skip operands already paired with OpIdx1.
1603 if (Op2Used.count(OpIdx2))
1604 continue;
1605 // Recursively calculate the cost at each level
1606 int TmpScore =
1607 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1608 I1, I2, CurrLevel + 1, std::nullopt);
1609 // Look for the best score.
1610 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1611 TmpScore > MaxTmpScore) {
1612 MaxTmpScore = TmpScore;
1613 MaxOpIdx2 = OpIdx2;
1614 FoundBest = true;
1615 }
1616 }
1617 if (FoundBest) {
1618 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1619 Op2Used.insert(MaxOpIdx2);
1620 ShallowScoreAtThisLevel += MaxTmpScore;
1621 }
1622 }
1623 return ShallowScoreAtThisLevel;
1624 }
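    // The pairing loop above is a greedy assignment: each operand of I1 claims
    // the highest-scoring operand of I2 that has not been claimed yet. A
    // minimal standalone sketch of that idea follows, with a caller-supplied
    // Score callback standing in for the recursive scoring; the helper name
    // and the use of std::function/std::set are illustrative only, and for
    // brevity the sketch ignores the commutativity restriction on the
    // candidate range:
    //
    //   #include <cstddef>
    //   #include <functional>
    //   #include <set>
    //
    //   int pairOperandsGreedily(std::size_t NumOps,
    //                            const std::function<int(std::size_t, std::size_t)> &Score,
    //                            int ScoreFail = 0) {
    //     std::set<std::size_t> UsedRHS;
    //     int Total = 0;
    //     for (std::size_t L = 0; L != NumOps; ++L) {
    //       int Best = ScoreFail;
    //       std::size_t BestR = 0;
    //       bool Found = false;
    //       for (std::size_t R = 0; R != NumOps; ++R) {
    //         if (UsedRHS.count(R))
    //           continue; // already paired with an earlier LHS operand
    //         int S = Score(L, R);
    //         if (S > Best) {
    //           Best = S;
    //           BestR = R;
    //           Found = true;
    //         }
    //       }
    //       if (Found) {
    //         UsedRHS.insert(BestR); // never revisit the chosen pair
    //         Total += Best;
    //       }
    //     }
    //     return Total;
    //   }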
1625 };
1626 /// A helper data structure to hold the operands of a vector of instructions.
1627 /// This supports a fixed vector length for all operand vectors.
1629 /// For each operand we need (i) the value, and (ii) the opcode that it
1630 /// would be attached to if the expression was in a left-linearized form.
1631 /// This is required to avoid illegal operand reordering.
1632 /// For example:
1633 /// \verbatim
1634 /// 0 Op1
1635 /// |/
1636 /// Op1 Op2 Linearized + Op2
1637 /// \ / ----------> |/
1638 /// - -
1639 ///
1640 /// Op1 - Op2 (0 + Op1) - Op2
1641 /// \endverbatim
1642 ///
1643 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1644 ///
1645 /// Another way to think of this is to track all the operations across the
1646 /// path from the operand all the way to the root of the tree and to
1647 /// calculate the operation that corresponds to this path. For example, the
1648 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1649 /// corresponding operation is a '-' (which matches the one in the
1650 /// linearized tree, as shown above).
1651 ///
1652 /// For lack of a better term, we refer to this operation as Accumulated
1653 /// Path Operation (APO).
1654 struct OperandData {
1655 OperandData() = default;
1656 OperandData(Value *V, bool APO, bool IsUsed)
1657 : V(V), APO(APO), IsUsed(IsUsed) {}
1658 /// The operand value.
1659 Value *V = nullptr;
1660 /// TreeEntries only allow a single opcode, or an alternate sequence of
1661    /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1662 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1663 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1664 /// (e.g., Add/Mul)
1665 bool APO = false;
1666 /// Helper data for the reordering function.
1667 bool IsUsed = false;
1668 };
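    // A concrete reading of the APO under the left-linearization described
    // above (the expression is illustrative): for (A + B) - C, the paths from
    // A and B to the root cross only the '+' and the LHS of the '-', so both
    // get APO == false, while the path from C crosses the RHS of the '-', so
    // C gets APO == true. During reordering, only operands with the same APO
    // may be swapped with each other.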
1669
1670  /// During operand reordering, we are trying to select the operand at a lane
1671  /// that best matches the operand at the neighboring lane. Our
1672 /// selection is based on the type of value we are looking for. For example,
1673 /// if the neighboring lane has a load, we need to look for a load that is
1674 /// accessing a consecutive address. These strategies are summarized in the
1675 /// 'ReorderingMode' enumerator.
1676 enum class ReorderingMode {
1677 Load, ///< Matching loads to consecutive memory addresses
1678 Opcode, ///< Matching instructions based on opcode (same or alternate)
1679 Constant, ///< Matching constants
1680 Splat, ///< Matching the same instruction multiple times (broadcast)
1681 Failed, ///< We failed to create a vectorizable group
1682 };
1683
1685
1686 /// A vector of operand vectors.
1688
1689 const TargetLibraryInfo &TLI;
1690 const DataLayout &DL;
1691 ScalarEvolution &SE;
1692 const BoUpSLP &R;
1693
1694 /// \returns the operand data at \p OpIdx and \p Lane.
1695 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1696 return OpsVec[OpIdx][Lane];
1697 }
1698
1699 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1700 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1701 return OpsVec[OpIdx][Lane];
1702 }
1703
1704 /// Clears the used flag for all entries.
1705 void clearUsed() {
1706 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1707 OpIdx != NumOperands; ++OpIdx)
1708 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1709 ++Lane)
1710 OpsVec[OpIdx][Lane].IsUsed = false;
1711 }
1712
1713 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1714 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1715 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1716 }
1717
1718 /// \param Lane lane of the operands under analysis.
1719    /// \param OpIdx operand index in lane \p Lane for which we're looking for
1720    /// the best candidate.
1721 /// \param Idx operand index of the current candidate value.
1722 /// \returns The additional score due to possible broadcasting of the
1723    /// elements in the lane. It is more profitable to have a power-of-2 number of
1724    /// unique elements in the lane, since such a lane is more likely to be
1725    /// vectorized after removing duplicates. Currently the SLP vectorizer supports only
1726 /// vectorization of the power-of-2 number of unique scalars.
1727 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1728 Value *IdxLaneV = getData(Idx, Lane).V;
1729 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1730 return 0;
1732 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1733 if (Ln == Lane)
1734 continue;
1735 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1736 if (!isa<Instruction>(OpIdxLnV))
1737 return 0;
1738 Uniques.insert(OpIdxLnV);
1739 }
1740 int UniquesCount = Uniques.size();
1741 int UniquesCntWithIdxLaneV =
1742 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1743 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1744 int UniquesCntWithOpIdxLaneV =
1745 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1746 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1747 return 0;
1748 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1749 UniquesCntWithOpIdxLaneV) -
1750 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1751 }
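    // A worked instance of the formula above (values are illustrative): with
    // four lanes, suppose the other lanes at OpIdx already contain the unique
    // instructions {a, b}. Keeping the current value a gives
    //   UniquesCntWithOpIdxLaneV = 2, waste = PowerOf2Ceil(2) - 2 = 0,
    // while switching to a new candidate c gives
    //   UniquesCntWithIdxLaneV = 3, waste = PowerOf2Ceil(3) - 3 = 1,
    // so the returned score is 0 - 1 = -1: the candidate is penalized because
    // it would leave a non-power-of-2 number of unique scalars. Swapping the
    // roles (candidate already present, current value new) yields +1.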
1752
1753 /// \param Lane lane of the operands under analysis.
1754    /// \param OpIdx operand index in lane \p Lane for which we're looking for
1755    /// the best candidate.
1756 /// \param Idx operand index of the current candidate value.
1757 /// \returns The additional score for the scalar which users are all
1758 /// vectorized.
1759 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1760 Value *IdxLaneV = getData(Idx, Lane).V;
1761 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1762 // Do not care about number of uses for vector-like instructions
1763 // (extractelement/extractvalue with constant indices), they are extracts
1764 // themselves and already externally used. Vectorization of such
1765 // instructions does not add extra extractelement instruction, just may
1766 // remove it.
1767 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1768 isVectorLikeInstWithConstOps(OpIdxLaneV))
1770 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1771 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1772 return 0;
1773 return R.areAllUsersVectorized(IdxLaneI)
1775 : 0;
1776 }
1777
1778 /// Score scaling factor for fully compatible instructions but with
1779 /// different number of external uses. Allows better selection of the
1780 /// instructions with less external uses.
1781 static const int ScoreScaleFactor = 10;
1782
1783 /// \Returns the look-ahead score, which tells us how much the sub-trees
1784 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1785 /// score. This helps break ties in an informed way when we cannot decide on
1786 /// the order of the operands by just considering the immediate
1787 /// predecessors.
1788 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1789 int Lane, unsigned OpIdx, unsigned Idx,
1790 bool &IsUsed) {
1791 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1793 // Keep track of the instruction stack as we recurse into the operands
1794 // during the look-ahead score exploration.
1795 int Score =
1796 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1797 /*CurrLevel=*/1, MainAltOps);
1798 if (Score) {
1799 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1800 if (Score <= -SplatScore) {
1801 // Set the minimum score for splat-like sequence to avoid setting
1802 // failed state.
1803 Score = 1;
1804 } else {
1805 Score += SplatScore;
1806 // Scale score to see the difference between different operands
1807 // and similar operands but all vectorized/not all vectorized
1808 // uses. It does not affect actual selection of the best
1809 // compatible operand in general, just allows to select the
1810 // operand with all vectorized uses.
1811 Score *= ScoreScaleFactor;
1812 Score += getExternalUseScore(Lane, OpIdx, Idx);
1813 IsUsed = true;
1814 }
1815 }
1816 return Score;
1817 }
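    // Two illustrative evaluations of the combination above (values assumed
    // for the example): with a base look-ahead score of 2, a splat score of
    // -1 and an external-use score of 1, the result is
    // (2 + (-1)) * ScoreScaleFactor + 1 = 21 - 10 = 11... more precisely
    // (2 - 1) * 10 + 1 = 11, and the operand is marked as used. With a base
    // score of 1 and a splat score of -1, the test Score <= -SplatScore
    // holds, so the score is clamped to 1 and IsUsed is not set here, which
    // allows the operand to be re-evaluated on a later iteration.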
1818
1819 /// Best defined scores per lanes between the passes. Used to choose the
1820 /// best operand (with the highest score) between the passes.
1821 /// The key - {Operand Index, Lane}.
1822 /// The value - the best score between the passes for the lane and the
1823 /// operand.
1825 BestScoresPerLanes;
1826
1827    // Search all operands in Ops[*][Lane] for the one that best matches
1828    // Ops[OpIdx][LastLane] and return its operand index.
1829 // If no good match can be found, return std::nullopt.
1830 std::optional<unsigned>
1831 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1832 ArrayRef<ReorderingMode> ReorderingModes,
1833 ArrayRef<Value *> MainAltOps) {
1834 unsigned NumOperands = getNumOperands();
1835
1836 // The operand of the previous lane at OpIdx.
1837 Value *OpLastLane = getData(OpIdx, LastLane).V;
1838
1839 // Our strategy mode for OpIdx.
1840 ReorderingMode RMode = ReorderingModes[OpIdx];
1841 if (RMode == ReorderingMode::Failed)
1842 return std::nullopt;
1843
1844 // The linearized opcode of the operand at OpIdx, Lane.
1845 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1846
1847 // The best operand index and its score.
1848 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1849 // are using the score to differentiate between the two.
1850 struct BestOpData {
1851 std::optional<unsigned> Idx;
1852 unsigned Score = 0;
1853 } BestOp;
1854 BestOp.Score =
1855 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1856 .first->second;
1857
1858 // Track if the operand must be marked as used. If the operand is set to
1859      // Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
1860      // we may want to re-estimate the operands again on the following iterations.
1861 bool IsUsed =
1862 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1863 // Iterate through all unused operands and look for the best.
1864 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1865 // Get the operand at Idx and Lane.
1866 OperandData &OpData = getData(Idx, Lane);
1867 Value *Op = OpData.V;
1868 bool OpAPO = OpData.APO;
1869
1870 // Skip already selected operands.
1871 if (OpData.IsUsed)
1872 continue;
1873
1874 // Skip if we are trying to move the operand to a position with a
1875 // different opcode in the linearized tree form. This would break the
1876 // semantics.
1877 if (OpAPO != OpIdxAPO)
1878 continue;
1879
1880 // Look for an operand that matches the current mode.
1881 switch (RMode) {
1882 case ReorderingMode::Load:
1883 case ReorderingMode::Constant:
1884 case ReorderingMode::Opcode: {
1885 bool LeftToRight = Lane > LastLane;
1886 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1887 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1888 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1889 OpIdx, Idx, IsUsed);
1890 if (Score > static_cast<int>(BestOp.Score)) {
1891 BestOp.Idx = Idx;
1892 BestOp.Score = Score;
1893 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1894 }
1895 break;
1896 }
1897 case ReorderingMode::Splat:
1898 if (Op == OpLastLane)
1899 BestOp.Idx = Idx;
1900 break;
1901 case ReorderingMode::Failed:
1902 llvm_unreachable("Not expected Failed reordering mode.");
1903 }
1904 }
1905
1906 if (BestOp.Idx) {
1907 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1908 return BestOp.Idx;
1909 }
1910 // If we could not find a good match return std::nullopt.
1911 return std::nullopt;
1912 }
1913
1914 /// Helper for reorderOperandVecs.
1915 /// \returns the lane that we should start reordering from. This is the one
1916    /// which has the fewest operands that can freely move about, or which is
1917    /// least profitable to reorder because it already has the best set of operands.
1918 unsigned getBestLaneToStartReordering() const {
1919 unsigned Min = UINT_MAX;
1920 unsigned SameOpNumber = 0;
1921 // std::pair<unsigned, unsigned> is used to implement a simple voting
1922      // algorithm and choose the lane with the fewest operands that
1923      // can freely move about, or that is less profitable because it already has
1924      // the best set of operands. The first unsigned is a counter for
1925 // voting, the second unsigned is the counter of lanes with instructions
1926 // with same/alternate opcodes and same parent basic block.
1928 // Try to be closer to the original results, if we have multiple lanes
1929 // with same cost. If 2 lanes have the same cost, use the one with the
1930 // lowest index.
1931 for (int I = getNumLanes(); I > 0; --I) {
1932 unsigned Lane = I - 1;
1933 OperandsOrderData NumFreeOpsHash =
1934 getMaxNumOperandsThatCanBeReordered(Lane);
1935 // Compare the number of operands that can move and choose the one with
1936 // the least number.
1937 if (NumFreeOpsHash.NumOfAPOs < Min) {
1938 Min = NumFreeOpsHash.NumOfAPOs;
1939 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1940 HashMap.clear();
1941 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1942 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1943 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1944 // Select the most optimal lane in terms of number of operands that
1945 // should be moved around.
1946 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1947 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1948 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1949 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1950 auto It = HashMap.find(NumFreeOpsHash.Hash);
1951 if (It == HashMap.end())
1952 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1953 else
1954 ++It->second.first;
1955 }
1956 }
1957 // Select the lane with the minimum counter.
1958 unsigned BestLane = 0;
1959 unsigned CntMin = UINT_MAX;
1960 for (const auto &Data : reverse(HashMap)) {
1961 if (Data.second.first < CntMin) {
1962 CntMin = Data.second.first;
1963 BestLane = Data.second.second;
1964 }
1965 }
1966 return BestLane;
1967 }
1968
1969 /// Data structure that helps to reorder operands.
1970 struct OperandsOrderData {
1971 /// The best number of operands with the same APOs, which can be
1972 /// reordered.
1973 unsigned NumOfAPOs = UINT_MAX;
1974 /// Number of operands with the same/alternate instruction opcode and
1975 /// parent.
1976 unsigned NumOpsWithSameOpcodeParent = 0;
1977 /// Hash for the actual operands ordering.
1978 /// Used to count operands, actually their position id and opcode
1979 /// value. It is used in the voting mechanism to find the lane with the
1980      /// fewest operands that can freely move about, or that is less profitable
1981      /// because it already has the best set of operands. Can be
1982 /// replaced with SmallVector<unsigned> instead but hash code is faster
1983 /// and requires less memory.
1984 unsigned Hash = 0;
1985 };
1986 /// \returns the maximum number of operands that are allowed to be reordered
1987    /// for \p Lane and the number of compatible instructions (with the same
1988 /// parent/opcode). This is used as a heuristic for selecting the first lane
1989 /// to start operand reordering.
1990 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1991 unsigned CntTrue = 0;
1992 unsigned NumOperands = getNumOperands();
1993 // Operands with the same APO can be reordered. We therefore need to count
1994 // how many of them we have for each APO, like this: Cnt[APO] = x.
1995 // Since we only have two APOs, namely true and false, we can avoid using
1996 // a map. Instead we can simply count the number of operands that
1997 // correspond to one of them (in this case the 'true' APO), and calculate
1998 // the other by subtracting it from the total number of operands.
1999 // Operands with the same instruction opcode and parent are more
2000 // profitable since we don't need to move them in many cases, with a high
2001 // probability such lane already can be vectorized effectively.
2002 bool AllUndefs = true;
2003 unsigned NumOpsWithSameOpcodeParent = 0;
2004 Instruction *OpcodeI = nullptr;
2005 BasicBlock *Parent = nullptr;
2006 unsigned Hash = 0;
2007 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2008 const OperandData &OpData = getData(OpIdx, Lane);
2009 if (OpData.APO)
2010 ++CntTrue;
2011 // Use Boyer-Moore majority voting for finding the majority opcode and
2012 // the number of times it occurs.
2013 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2014 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2015 I->getParent() != Parent) {
2016 if (NumOpsWithSameOpcodeParent == 0) {
2017 NumOpsWithSameOpcodeParent = 1;
2018 OpcodeI = I;
2019 Parent = I->getParent();
2020 } else {
2021 --NumOpsWithSameOpcodeParent;
2022 }
2023 } else {
2024 ++NumOpsWithSameOpcodeParent;
2025 }
2026 }
2027 Hash = hash_combine(
2028 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2029 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2030 }
2031 if (AllUndefs)
2032 return {};
2033 OperandsOrderData Data;
2034 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2035 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2036 Data.Hash = Hash;
2037 return Data;
2038 }
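    // The loop above is an instance of the Boyer-Moore majority vote: a single
    // pass with O(1) extra state that keeps track of the would-be majority
    // candidate. A minimal generic sketch of the same algorithm, with an
    // illustrative helper name and std::vector standing in for the operand
    // list:
    //
    //   #include <vector>
    //
    //   template <typename T> T majorityCandidate(const std::vector<T> &Vals) {
    //     T Candidate{};
    //     unsigned Count = 0;
    //     for (const T &V : Vals) {
    //       if (Count == 0) {
    //         Candidate = V;   // adopt a new candidate
    //         Count = 1;
    //       } else if (V == Candidate) {
    //         ++Count;         // vote for the current candidate
    //       } else {
    //         --Count;         // vote against it
    //       }
    //     }
    //     // If a true majority is required, verify Candidate in a second pass.
    //     return Candidate;
    //   }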
2039
2040 /// Go through the instructions in VL and append their operands.
2041 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2042 assert(!VL.empty() && "Bad VL");
2043 assert((empty() || VL.size() == getNumLanes()) &&
2044 "Expected same number of lanes");
2045 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2046 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2047 OpsVec.resize(NumOperands);
2048 unsigned NumLanes = VL.size();
2049 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2050 OpsVec[OpIdx].resize(NumLanes);
2051 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2052 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2053 // Our tree has just 3 nodes: the root and two operands.
2054 // It is therefore trivial to get the APO. We only need to check the
2055 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2056 // RHS operand. The LHS operand of both add and sub is never attached
2057          // to an inverse operation in the linearized form, therefore its APO
2058 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2059
2060 // Since operand reordering is performed on groups of commutative
2061 // operations or alternating sequences (e.g., +, -), we can safely
2062 // tell the inverse operations by checking commutativity.
2063 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2064 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2065 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2066 APO, false};
2067 }
2068 }
2069 }
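    // For example (the lanes are illustrative): for the alternate-opcode
    // bundle { X = A - B, Y = C + D }, operand 0 yields {A, C} with
    // APO == false in both lanes, while operand 1 yields {B, D} with
    // APO == true for B (sub is non-commutative, i.e. an inverse operation)
    // and APO == false for D.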
2070
2071 /// \returns the number of operands.
2072 unsigned getNumOperands() const { return OpsVec.size(); }
2073
2074 /// \returns the number of lanes.
2075 unsigned getNumLanes() const { return OpsVec[0].size(); }
2076
2077 /// \returns the operand value at \p OpIdx and \p Lane.
2078 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2079 return getData(OpIdx, Lane).V;
2080 }
2081
2082 /// \returns true if the data structure is empty.
2083 bool empty() const { return OpsVec.empty(); }
2084
2085 /// Clears the data.
2086 void clear() { OpsVec.clear(); }
2087
2088 /// \Returns true if there are enough operands identical to \p Op to fill
2089 /// the whole vector.
2090    /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
2091 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2092 bool OpAPO = getData(OpIdx, Lane).APO;
2093 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2094 if (Ln == Lane)
2095 continue;
2096 // This is set to true if we found a candidate for broadcast at Lane.
2097 bool FoundCandidate = false;
2098 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2099 OperandData &Data = getData(OpI, Ln);
2100 if (Data.APO != OpAPO || Data.IsUsed)
2101 continue;
2102 if (Data.V == Op) {
2103 FoundCandidate = true;
2104 Data.IsUsed = true;
2105 break;
2106 }
2107 }
2108 if (!FoundCandidate)
2109 return false;
2110 }
2111 return true;
2112 }
2113
2114 public:
2115 /// Initialize with all the operands of the instruction vector \p RootVL.
2117 const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
2118 : TLI(TLI), DL(DL), SE(SE), R(R) {
2119 // Append all the operands of RootVL.
2120 appendOperandsOfVL(RootVL);
2121 }
2122
2123 /// \Returns a value vector with the operands across all lanes for the
2124    /// operand at \p OpIdx.
2125 ValueList getVL(unsigned OpIdx) const {
2126 ValueList OpVL(OpsVec[OpIdx].size());
2127 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2128 "Expected same num of lanes across all operands");
2129 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2130 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2131 return OpVL;
2132 }
2133
2134 // Performs operand reordering for 2 or more operands.
2135 // The original operands are in OrigOps[OpIdx][Lane].
2136 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2137 void reorder() {
2138 unsigned NumOperands = getNumOperands();
2139 unsigned NumLanes = getNumLanes();
2140 // Each operand has its own mode. We are using this mode to help us select
2141 // the instructions for each lane, so that they match best with the ones
2142 // we have selected so far.
2143 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2144
2145 // This is a greedy single-pass algorithm. We are going over each lane
2146 // once and deciding on the best order right away with no back-tracking.
2147 // However, in order to increase its effectiveness, we start with the lane
2148 // that has operands that can move the least. For example, given the
2149 // following lanes:
2150 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2151 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2152 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2153 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2154 // we will start at Lane 1, since the operands of the subtraction cannot
2155 // be reordered. Then we will visit the rest of the lanes in a circular
2156      // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
2157
2158 // Find the first lane that we will start our search from.
2159 unsigned FirstLane = getBestLaneToStartReordering();
2160
2161 // Initialize the modes.
2162 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2163 Value *OpLane0 = getValue(OpIdx, FirstLane);
2164 // Keep track if we have instructions with all the same opcode on one
2165 // side.
2166 if (isa<LoadInst>(OpLane0))
2167 ReorderingModes[OpIdx] = ReorderingMode::Load;
2168 else if (isa<Instruction>(OpLane0)) {
2169 // Check if OpLane0 should be broadcast.
2170 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2171 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2172 else
2173 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2174 }
2175 else if (isa<Constant>(OpLane0))
2176 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2177 else if (isa<Argument>(OpLane0))
2178 // Our best hope is a Splat. It may save some cost in some cases.
2179 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2180 else
2181 // NOTE: This should be unreachable.
2182 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2183 }
2184
2185 // Check that we don't have same operands. No need to reorder if operands
2186 // are just perfect diamond or shuffled diamond match. Do not do it only
2187      // for possible broadcasts or a non-power-of-2 number of scalars (just for
2188 // now).
2189 auto &&SkipReordering = [this]() {
2190 SmallPtrSet<Value *, 4> UniqueValues;
2191 ArrayRef<OperandData> Op0 = OpsVec.front();
2192 for (const OperandData &Data : Op0)
2193 UniqueValues.insert(Data.V);
2194 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2195 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2196 return !UniqueValues.contains(Data.V);
2197 }))
2198 return false;
2199 }
2200 // TODO: Check if we can remove a check for non-power-2 number of
2201 // scalars after full support of non-power-2 vectorization.
2202 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2203 };
2204
2205 // If the initial strategy fails for any of the operand indexes, then we
2206 // perform reordering again in a second pass. This helps avoid assigning
2207 // high priority to the failed strategy, and should improve reordering for
2208 // the non-failed operand indexes.
2209 for (int Pass = 0; Pass != 2; ++Pass) {
2210        // Check if there is no need to reorder operands since they are a perfect or
2211 // shuffled diamond match.
2212 // Need to do it to avoid extra external use cost counting for
2213 // shuffled matches, which may cause regressions.
2214 if (SkipReordering())
2215 break;
2216 // Skip the second pass if the first pass did not fail.
2217 bool StrategyFailed = false;
2218 // Mark all operand data as free to use.
2219 clearUsed();
2220 // We keep the original operand order for the FirstLane, so reorder the
2221 // rest of the lanes. We are visiting the nodes in a circular fashion,
2222 // using FirstLane as the center point and increasing the radius
2223 // distance.
2224 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2225 for (unsigned I = 0; I < NumOperands; ++I)
2226 MainAltOps[I].push_back(getData(I, FirstLane).V);
2227
2228 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2229 // Visit the lane on the right and then the lane on the left.
2230 for (int Direction : {+1, -1}) {
2231 int Lane = FirstLane + Direction * Distance;
2232 if (Lane < 0 || Lane >= (int)NumLanes)
2233 continue;
2234 int LastLane = Lane - Direction;
2235 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2236 "Out of bounds");
2237 // Look for a good match for each operand.
2238 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2239 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2240 std::optional<unsigned> BestIdx = getBestOperand(
2241 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2242 // By not selecting a value, we allow the operands that follow to
2243 // select a better matching value. We will get a non-null value in
2244 // the next run of getBestOperand().
2245 if (BestIdx) {
2246 // Swap the current operand with the one returned by
2247 // getBestOperand().
2248 swap(OpIdx, *BestIdx, Lane);
2249 } else {
2250 // We failed to find a best operand, set mode to 'Failed'.
2251 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2252 // Enable the second pass.
2253 StrategyFailed = true;
2254 }
2255 // Try to get the alternate opcode and follow it during analysis.
2256 if (MainAltOps[OpIdx].size() != 2) {
2257 OperandData &AltOp = getData(OpIdx, Lane);
2258 InstructionsState OpS =
2259 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2260 if (OpS.getOpcode() && OpS.isAltShuffle())
2261 MainAltOps[OpIdx].push_back(AltOp.V);
2262 }
2263 }
2264 }
2265 }
2266 // Skip second pass if the strategy did not fail.
2267 if (!StrategyFailed)
2268 break;
2269 }
2270 }
2271
2272#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2273 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2274 switch (RMode) {
2275 case ReorderingMode::Load:
2276 return "Load";
2277 case ReorderingMode::Opcode:
2278 return "Opcode";
2279 case ReorderingMode::Constant:
2280 return "Constant";
2281 case ReorderingMode::Splat:
2282 return "Splat";
2283 case ReorderingMode::Failed:
2284 return "Failed";
2285 }
2286 llvm_unreachable("Unimplemented Reordering Type");
2287 }
2288
2289 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2290 raw_ostream &OS) {
2291 return OS << getModeStr(RMode);
2292 }
2293
2294 /// Debug print.
2295 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2296 printMode(RMode, dbgs());
2297 }
2298
2299 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2300 return printMode(RMode, OS);
2301 }
2302
2304 const unsigned Indent = 2;
2305 unsigned Cnt = 0;
2306 for (const OperandDataVec &OpDataVec : OpsVec) {
2307 OS << "Operand " << Cnt++ << "\n";
2308 for (const OperandData &OpData : OpDataVec) {
2309 OS.indent(Indent) << "{";
2310 if (Value *V = OpData.V)
2311 OS << *V;
2312 else
2313 OS << "null";
2314 OS << ", APO:" << OpData.APO << "}\n";
2315 }
2316 OS << "\n";
2317 }
2318 return OS;
2319 }
2320
2321 /// Debug print.
2322 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2323#endif
2324 };
2325
2326  /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2327  /// of the pair with the highest score, i.e. the pair deemed to have the best
2328  /// chance to form the root of a profitable tree to vectorize. Return
2329  /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2330  /// \param Limit Lower limit of the score considered to be good enough.
2331 std::optional<int>
2332 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2333 int Limit = LookAheadHeuristics::ScoreFail) {
2334 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2336 int BestScore = Limit;
2337 std::optional<int> Index;
2338 for (int I : seq<int>(0, Candidates.size())) {
2339 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2340 Candidates[I].second,
2341 /*U1=*/nullptr, /*U2=*/nullptr,
2342 /*Level=*/1, std::nullopt);
2343 if (Score > BestScore) {
2344 BestScore = Score;
2345 Index = I;
2346 }
2347 }
2348 return Index;
2349 }
2350
2351 /// Checks if the instruction is marked for deletion.
2352 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2353
2354 /// Removes an instruction from its block and eventually deletes it.
2355 /// It's like Instruction::eraseFromParent() except that the actual deletion
2356 /// is delayed until BoUpSLP is destructed.
2358 DeletedInstructions.insert(I);
2359 }
2360
2361 /// Checks if the instruction was already analyzed for being possible
2362 /// reduction root.
2364 return AnalyzedReductionsRoots.count(I);
2365 }
2366 /// Register given instruction as already analyzed for being possible
2367 /// reduction root.
2369 AnalyzedReductionsRoots.insert(I);
2370 }
2371 /// Checks if the provided list of reduced values was checked already for
2372 /// vectorization.
2374 return AnalyzedReductionVals.contains(hash_value(VL));
2375 }
2376 /// Adds the list of reduced values to list of already checked values for the
2377 /// vectorization.
2379 AnalyzedReductionVals.insert(hash_value(VL));
2380 }
2381 /// Clear the list of the analyzed reduction root instructions.
2383 AnalyzedReductionsRoots.clear();
2384 AnalyzedReductionVals.clear();
2385 }
2386 /// Checks if the given value is gathered in one of the nodes.
2387 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2388 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2389 }
2390
2391 /// Check if the value is vectorized in the tree.
2392 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2393
2394 ~BoUpSLP();
2395
2396private:
2397  /// Check if the operands on the edges \p Edges of the \p UserTE allow
2398  /// reordering (i.e. the operands can be reordered because they have only one
2399  /// user and are reorderable).
2400 /// \param ReorderableGathers List of all gather nodes that require reordering
2401  /// (e.g., gather of extractelements or partially vectorizable loads).
2402 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2403 /// reordering, subset of \p NonVectorized.
2404 bool
2405 canReorderOperands(TreeEntry *UserTE,
2406 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2407 ArrayRef<TreeEntry *> ReorderableGathers,
2408 SmallVectorImpl<TreeEntry *> &GatherOps);
2409
2410 /// Checks if the given \p TE is a gather node with clustered reused scalars
2411 /// and reorders it per given \p Mask.
2412 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2413
2414 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2415 /// if any. If it is not vectorized (gather node), returns nullptr.
2416 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2417 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2418 TreeEntry *TE = nullptr;
2419 const auto *It = find_if(VL, [&](Value *V) {
2420 TE = getTreeEntry(V);
2421 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2422 return true;
2423 auto It = MultiNodeScalars.find(V);
2424 if (It != MultiNodeScalars.end()) {
2425 for (TreeEntry *E : It->second) {
2426 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2427 TE = E;
2428 return true;
2429 }
2430 }
2431 }
2432 return false;
2433 });
2434 if (It != VL.end()) {
2435 assert(TE->isSame(VL) && "Expected same scalars.");
2436 return TE;
2437 }
2438 return nullptr;
2439 }
2440
2441 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2442 /// if any. If it is not vectorized (gather node), returns nullptr.
2443 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2444 unsigned OpIdx) const {
2445 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2446 const_cast<TreeEntry *>(UserTE), OpIdx);
2447 }
2448
2449 /// Checks if all users of \p I are the part of the vectorization tree.
2450 bool areAllUsersVectorized(
2451 Instruction *I,
2452 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2453
2454 /// Return information about the vector formed for the specified index
2455 /// of a vector of (the same) instruction.
2457
2458 /// \returns the cost of the vectorizable entry.
2459 InstructionCost getEntryCost(const TreeEntry *E,
2460 ArrayRef<Value *> VectorizedVals,
2461 SmallPtrSetImpl<Value *> &CheckedExtracts);
2462
2463 /// This is the recursive part of buildTree.
2464 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2465 const EdgeInfo &EI);
2466
2467 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2468 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2469 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2470 /// returns false, setting \p CurrentOrder to either an empty vector or a
2471  /// non-identity permutation that allows reusing extract instructions.
2472 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2473 /// extract order.
2474 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2475 SmallVectorImpl<unsigned> &CurrentOrder,
2476 bool ResizeAllowed = false) const;
2477
2478 /// Vectorize a single entry in the tree.
2479 Value *vectorizeTree(TreeEntry *E);
2480
2481 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2482 /// \p E.
2483 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
2484
2485 /// Create a new vector from a list of scalar values. Produces a sequence
2486 /// which exploits values reused across lanes, and arranges the inserts
2487 /// for ease of later optimization.
2488 template <typename BVTy, typename ResTy, typename... Args>
2489 ResTy processBuildVector(const TreeEntry *E, Args &...Params);
2490
2491 /// Create a new vector from a list of scalar values. Produces a sequence
2492 /// which exploits values reused across lanes, and arranges the inserts
2493 /// for ease of later optimization.
2494 Value *createBuildVector(const TreeEntry *E);
2495
2496 /// Returns the instruction in the bundle, which can be used as a base point
2497 /// for scheduling. Usually it is the last instruction in the bundle, except
2498 /// for the case when all operands are external (in this case, it is the first
2499 /// instruction in the list).
2500 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2501
2502 /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
2503 /// tree entries.
2504 /// \param TE Tree entry checked for permutation.
2505 /// \param VL List of scalars (a subset of the TE scalar), checked for
2506 /// permutations.
2507 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2508 /// previous tree entries. \p Mask is filled with the shuffle mask.
2509 std::optional<TargetTransformInfo::ShuffleKind>
2510 isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
2513
2514 /// \returns the scalarization cost for this list of values. Assuming that
2515 /// this subtree gets vectorized, we may need to extract the values from the
2516 /// roots. This method calculates the cost of extracting the values.
2517 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2518 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
2519
2520 /// Set the Builder insert point to one after the last instruction in
2521 /// the bundle
2522 void setInsertPointAfterBundle(const TreeEntry *E);
2523
2524 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2525 /// specified, the starting vector value is poison.
2526 Value *gather(ArrayRef<Value *> VL, Value *Root);
2527
2528 /// \returns whether the VectorizableTree is fully vectorizable and will
2529  /// be beneficial even if the tree height is tiny.
2530 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2531
2532 /// Reorder commutative or alt operands to get better probability of
2533 /// generating vectorized code.
2534 static void reorderInputsAccordingToOpcode(
2537 const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R);
2538
2539 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2540 /// users of \p TE and collects the stores. It returns the map from the store
2541 /// pointers to the collected stores.
2543 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2544
2545 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2546 /// stores in \p StoresVec can form a vector instruction. If so it returns
2547 /// true and populates \p ReorderIndices with the shuffle indices of the
2548 /// stores when compared to the sorted vector.
2549 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2550 OrdersType &ReorderIndices) const;
2551
2552 /// Iterates through the users of \p TE, looking for scalar stores that can be
2553 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2554 /// their order and builds an order index vector for each store bundle. It
2555 /// returns all these order vectors found.
2556  /// We run this after the tree has formed; otherwise we may come across user
2557 /// instructions that are not yet in the tree.
2559 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2560
2561 struct TreeEntry {
2562 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2563 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2564
2565 /// \returns Common mask for reorder indices and reused scalars.
2566 SmallVector<int> getCommonMask() const {
2568 inversePermutation(ReorderIndices, Mask);
2569 ::addMask(Mask, ReuseShuffleIndices);
2570 return Mask;
2571 }
2572
2573 /// \returns true if the scalars in VL are equal to this entry.
2574 bool isSame(ArrayRef<Value *> VL) const {
2575 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2576 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2577 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2578 return VL.size() == Mask.size() &&
2579 std::equal(VL.begin(), VL.end(), Mask.begin(),
2580 [Scalars](Value *V, int Idx) {
2581 return (isa<UndefValue>(V) &&
2582 Idx == PoisonMaskElem) ||
2583 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2584 });
2585 };
2586 if (!ReorderIndices.empty()) {
2587 // TODO: implement matching if the nodes are just reordered, still can
2588 // treat the vector as the same if the list of scalars matches VL
2589 // directly, without reordering.
2591 inversePermutation(ReorderIndices, Mask);
2592 if (VL.size() == Scalars.size())
2593 return IsSame(Scalars, Mask);
2594 if (VL.size() == ReuseShuffleIndices.size()) {
2595 ::addMask(Mask, ReuseShuffleIndices);
2596 return IsSame(Scalars, Mask);
2597 }
2598 return false;
2599 }
2600 return IsSame(Scalars, ReuseShuffleIndices);
2601 }
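    // For example (values are illustrative): with Scalars == {A, B}, empty
    // ReorderIndices and ReuseShuffleIndices == {0, 1, 0, 1}, the list
    // VL == {A, B, A, B} is considered the same entry, since every VL[I]
    // equals Scalars[ReuseShuffleIndices[I]]; an undef in VL is also accepted
    // where the mask element is PoisonMaskElem.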
2602
2603 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2604 return State == TreeEntry::NeedToGather &&
2605 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2606 UserTreeIndices.front().UserTE == UserEI.UserTE;
2607 }
2608
2609 /// \returns true if current entry has same operands as \p TE.
2610 bool hasEqualOperands(const TreeEntry &TE) const {
2611 if (TE.getNumOperands() != getNumOperands())
2612 return false;
2613 SmallBitVector Used(getNumOperands());
2614 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2615 unsigned PrevCount = Used.count();
2616 for (unsigned K = 0; K < E; ++K) {
2617 if (Used.test(K))
2618 continue;
2619 if (getOperand(K) == TE.getOperand(I)) {
2620 Used.set(K);
2621 break;
2622 }
2623 }
2624 // Check if we actually found the matching operand.
2625 if (PrevCount == Used.count())
2626 return false;
2627 }
2628 return true;
2629 }
2630
2631 /// \return Final vectorization factor for the node. Defined by the total
2632    /// number of vectorized scalars, including those used several times in the
2633 /// entry and counted in the \a ReuseShuffleIndices, if any.
2634 unsigned getVectorFactor() const {
2635 if (!ReuseShuffleIndices.empty())
2636 return ReuseShuffleIndices.size();
2637 return Scalars.size();
2638 };
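    // E.g., an entry with Scalars == {A, B} and ReuseShuffleIndices ==
    // {0, 1, 0, 1} has a vector factor of 4, even though only two distinct
    // scalars are vectorized (values are illustrative).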
2639
2640 /// A vector of scalars.
2641 ValueList Scalars;
2642
2643 /// The Scalars are vectorized into this value. It is initialized to Null.
2644 WeakTrackingVH VectorizedValue = nullptr;
2645
2646 /// Do we need to gather this sequence or vectorize it
2647 /// (either with vector instruction or with scatter/gather
2648 /// intrinsics for store/load)?
2649 enum EntryState {
2650 Vectorize,
2651 ScatterVectorize,
2652 PossibleStridedVectorize,
2653 NeedToGather
2654 };
2655 EntryState State;
2656
2657 /// Does this sequence require some shuffling?
2658 SmallVector<int, 4> ReuseShuffleIndices;
2659
2660 /// Does this entry require reordering?
2661 SmallVector<unsigned, 4> ReorderIndices;
2662
2663 /// Points back to the VectorizableTree.
2664 ///
2665 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2666 /// to be a pointer and needs to be able to initialize the child iterator.
2667 /// Thus we need a reference back to the container to translate the indices
2668 /// to entries.
2669 VecTreeTy &Container;
2670
2671 /// The TreeEntry index containing the user of this entry. We can actually
2672 /// have multiple users so the data structure is not truly a tree.
2673 SmallVector<EdgeInfo, 1> UserTreeIndices;
2674
2675 /// The index of this treeEntry in VectorizableTree.
2676 int Idx = -1;
2677
2678 private:
2679 /// The operands of each instruction in each lane Operands[op_index][lane].
2680 /// Note: This helps avoid the replication of the code that performs the
2681 /// reordering of operands during buildTree_rec() and vectorizeTree().
2683
2684 /// The main/alternate instruction.
2685 Instruction *MainOp = nullptr;
2686 Instruction *AltOp = nullptr;
2687
2688 public:
2689 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2690 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2691 if (Operands.size() < OpIdx + 1)
2692 Operands.resize(OpIdx + 1);
2693 assert(Operands[OpIdx].empty() && "Already resized?");
2694 assert(OpVL.size() <= Scalars.size() &&
2695 "Number of operands is greater than the number of scalars.");
2696 Operands[OpIdx].resize(OpVL.size());
2697 copy(OpVL, Operands[OpIdx].begin());
2698 }
2699
2700 /// Set the operands of this bundle in their original order.
2701 void setOperandsInOrder() {
2702 assert(Operands.empty() && "Already initialized?");
2703 auto *I0 = cast<Instruction>(Scalars[0]);
2704 Operands.resize(I0->getNumOperands());
2705 unsigned NumLanes = Scalars.size();
2706 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2707 OpIdx != NumOperands; ++OpIdx) {
2708 Operands[OpIdx].resize(NumLanes);
2709 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2710 auto *I = cast<Instruction>(Scalars[Lane]);
2711 assert(I->getNumOperands() == NumOperands &&
2712 "Expected same number of operands");
2713 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2714 }
2715 }
2716 }
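    // This is a plain transpose of the bundle (values are illustrative): for
    // Scalars == { add(A, B), add(C, D) } it produces
    //   Operands[0] == {A, C} and Operands[1] == {B, D},
    // i.e. Operands[OpIdx][Lane] == cast<Instruction>(Scalars[Lane])->getOperand(OpIdx).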
2717
2718 /// Reorders operands of the node to the given mask \p Mask.
2719 void reorderOperands(ArrayRef<int> Mask) {
2720 for (ValueList &Operand : Operands)
2721 reorderScalars(Operand, Mask);
2722 }
2723
2724 /// \returns the \p OpIdx operand of this TreeEntry.
2725 ValueList &getOperand(unsigned OpIdx) {
2726 assert(OpIdx < Operands.size() && "Off bounds");
2727 return Operands[OpIdx];
2728 }
2729
2730 /// \returns the \p OpIdx operand of this TreeEntry.
2731 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2732 assert(OpIdx < Operands.size() && "Off bounds");
2733 return Operands[OpIdx];
2734 }
2735
2736 /// \returns the number of operands.
2737 unsigned getNumOperands() const { return Operands.size(); }
2738
2739 /// \return the single \p OpIdx operand.
2740 Value *getSingleOperand(unsigned OpIdx) const {
2741 assert(OpIdx < Operands.size() && "Off bounds");
2742 assert(!Operands[OpIdx].empty() && "No operand available");
2743 return Operands[OpIdx][0];
2744 }
2745
2746 /// Some of the instructions in the list have alternate opcodes.
2747 bool isAltShuffle() const { return MainOp != AltOp; }
2748
2749 bool isOpcodeOrAlt(Instruction *I) const {
2750 unsigned CheckedOpcode = I->getOpcode();
2751 return (getOpcode() == CheckedOpcode ||
2752 getAltOpcode() == CheckedOpcode);
2753 }
2754
2755 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2756 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2757 /// \p OpValue.
2758 Value *isOneOf(Value *Op) const {
2759 auto *I = dyn_cast<Instruction>(Op);
2760 if (I && isOpcodeOrAlt(I))
2761 return Op;
2762 return MainOp;
2763 }
2764
2765 void setOperations(const InstructionsState &S) {
2766 MainOp = S.MainOp;
2767 AltOp = S.AltOp;
2768 }
2769
2770 Instruction *getMainOp() const {
2771 return MainOp;
2772 }
2773
2774 Instruction *getAltOp() const {
2775 return AltOp;
2776 }
2777
2778 /// The main/alternate opcodes for the list of instructions.
2779 unsigned getOpcode() const {
2780 return MainOp ? MainOp->getOpcode() : 0;
2781 }
2782
2783 unsigned getAltOpcode() const {
2784 return AltOp ? AltOp->getOpcode() : 0;
2785 }
2786
2787    /// When ReuseShuffleIndices is empty it just returns the position of \p
2788    /// V within the vector of Scalars. Otherwise, it remaps the lane via the reuse index.
2789 int findLaneForValue(Value *V) const {
2790 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2791 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2792 if (!ReorderIndices.empty())
2793 FoundLane = ReorderIndices[FoundLane];
2794 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2795 if (!ReuseShuffleIndices.empty()) {
2796 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2797 find(ReuseShuffleIndices, FoundLane));
2798 }
2799 return FoundLane;
2800 }
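    // A worked example (values are illustrative): with Scalars == {A, B, C, D},
    // ReorderIndices == {3, 2, 1, 0} and ReuseShuffleIndices == {2, 1, 3, 0},
    // looking up C starts at position 2, is remapped by ReorderIndices to
    // lane 1, and lane 1 is then found at position 1 of ReuseShuffleIndices,
    // so findLaneForValue(C) returns 1.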
2801
2802 /// Build a shuffle mask for graph entry which represents a merge of main
2803 /// and alternate operations.
2804 void
2805 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2807 SmallVectorImpl<Value *> *OpScalars = nullptr,
2808 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2809
2810#ifndef NDEBUG
2811 /// Debug printer.
2812 LLVM_DUMP_METHOD void dump() const {
2813 dbgs() << Idx << ".\n";
2814 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2815 dbgs() << "Operand " << OpI << ":\n";
2816 for (const Value *V : Operands[OpI])
2817 dbgs().indent(2) << *V << "\n";
2818 }
2819 dbgs() << "Scalars: \n";
2820 for (Value *V : Scalars)
2821 dbgs().indent(2) << *V << "\n";
2822 dbgs() << "State: ";
2823 switch (State) {
2824 case Vectorize:
2825 dbgs() << "Vectorize\n";
2826 break;
2827 case ScatterVectorize:
2828 dbgs() << "ScatterVectorize\n";
2829 break;
2830 case PossibleStridedVectorize:
2831 dbgs() << "PossibleStridedVectorize\n";
2832 break;
2833 case NeedToGather:
2834 dbgs() << "NeedToGather\n";
2835 break;
2836 }
2837 dbgs() << "MainOp: ";
2838 if (MainOp)
2839 dbgs() << *MainOp << "\n";
2840 else
2841 dbgs() << "NULL\n";
2842 dbgs() << "AltOp: ";
2843 if (AltOp)
2844 dbgs() << *AltOp << "\n";
2845 else
2846 dbgs() << "NULL\n";
2847 dbgs() << "VectorizedValue: ";
2848 if (VectorizedValue)
2849 dbgs() << *VectorizedValue << "\n";
2850 else
2851 dbgs() << "NULL\n";
2852 dbgs() << "ReuseShuffleIndices: ";
2853 if (ReuseShuffleIndices.empty())
2854 dbgs() << "Empty";
2855 else
2856 for (int ReuseIdx : ReuseShuffleIndices)
2857 dbgs() << ReuseIdx << ", ";
2858 dbgs() << "\n";
2859 dbgs() << "ReorderIndices: ";
2860 for (unsigned ReorderIdx : ReorderIndices)
2861 dbgs() << ReorderIdx << ", ";
2862 dbgs() << "\n";
2863 dbgs() << "UserTreeIndices: ";
2864 for (const auto &EInfo : UserTreeIndices)
2865 dbgs() << EInfo << ", ";
2866 dbgs() << "\n";
2867 }
2868#endif
2869 };
2870
2871#ifndef NDEBUG
2872 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2873 InstructionCost VecCost, InstructionCost ScalarCost,
2874 StringRef Banner) const {
2875 dbgs() << "SLP: " << Banner << ":\n";
2876 E->dump();
2877 dbgs() << "SLP: Costs:\n";
2878 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2879 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2880 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2881 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2882 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2883 }
2884#endif
2885
2886 /// Create a new VectorizableTree entry.
2887 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2888 std::optional<ScheduleData *> Bundle,
2889 const InstructionsState &S,
2890 const EdgeInfo &UserTreeIdx,
2891 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2892 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2893 TreeEntry::EntryState EntryState =
2894 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2895 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2896 ReuseShuffleIndices, ReorderIndices);
2897 }
2898
2899 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2900 TreeEntry::EntryState EntryState,
2901 std::optional<ScheduleData *> Bundle,
2902 const InstructionsState &S,
2903 const EdgeInfo &UserTreeIdx,
2904 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2905 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2906 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2907 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2908 "Need to vectorize gather entry?");
2909 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2910 TreeEntry *Last = VectorizableTree.back().get();
2911 Last->Idx = VectorizableTree.size() - 1;
2912 Last->State = EntryState;
2913 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2914 ReuseShuffleIndices.end());
2915 if (ReorderIndices.empty()) {
2916 Last->Scalars.assign(VL.begin(), VL.end());
2917 Last->setOperations(S);
2918 } else {
2919 // Reorder scalars and build final mask.
2920 Last->Scalars.assign(VL.size(), nullptr);
2921 transform(ReorderIndices, Last->Scalars.begin(),
2922 [VL](unsigned Idx) -> Value * {
2923 if (Idx >= VL.size())
2924 return UndefValue::get(VL.front()->getType());
2925 return VL[Idx];
2926 });
2927 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
2928 Last->setOperations(S);
2929 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2930 }
2931 if (Last->State != TreeEntry::NeedToGather) {
2932 for (Value *V : VL) {
2933 const TreeEntry *TE = getTreeEntry(V);
2934 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
2935 "Scalar already in tree!");
2936 if (TE) {
2937 if (TE != Last)
2938 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
2939 continue;
2940 }
2941 ScalarToTreeEntry[V] = Last;
2942 }
2943 // Update the scheduler bundle to point to this TreeEntry.
2944 ScheduleData *BundleMember = *Bundle;
2945 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2946 isVectorLikeInstWithConstOps(S.MainOp) ||
2947 doesNotNeedToSchedule(VL)) &&
2948 "Bundle and VL out of sync");
2949 if (BundleMember) {
2950 for (Value *V : VL) {
2952 continue;
2953 if (!BundleMember)
2954 continue;
2955 BundleMember->TE = Last;
2956 BundleMember = BundleMember->NextInBundle;
2957 }
2958 }
2959 assert(!BundleMember && "Bundle and VL out of sync");
2960 } else {
2961 MustGather.insert(VL.begin(), VL.end());
2962 }
2963
2964 if (UserTreeIdx.UserTE)
2965 Last->UserTreeIndices.push_back(UserTreeIdx);
2966
2967 return Last;
2968 }
2969
2970 /// -- Vectorization State --
2971 /// Holds all of the tree entries.
2972 TreeEntry::VecTreeTy VectorizableTree;
2973
2974#ifndef NDEBUG
2975 /// Debug printer.
2976 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
2977 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
2978 VectorizableTree[Id]->dump();
2979 dbgs() << "\n";
2980 }
2981 }
2982#endif
2983
2984 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
2985
2986 const TreeEntry *getTreeEntry(Value *V) const {
2987 return ScalarToTreeEntry.lookup(V);
2988 }
2989
2990 /// Checks if the specified list of the instructions/values can be vectorized
2991 /// and fills required data before actual scheduling of the instructions.
2992 TreeEntry::EntryState getScalarsVectorizationState(
2993 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
2994 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
2995
2996 /// Maps a specific scalar to its tree entry.
2997 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
2998
2999  /// Maps scalars that are used in several vectorized nodes to the list of
3000  /// those nodes.
3001  SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3002
3003 /// Maps a value to the proposed vectorizable size.
3004 SmallDenseMap<Value *, unsigned> InstrElementSize;
3005
3006 /// A list of scalars that we found that we need to keep as scalars.
3007 ValueSet MustGather;
3008
3009 /// A map between the vectorized entries and the last instructions in the
3010  /// bundles. The bundles are built in use order, not in the def order of the
3011  /// instructions, so we cannot rely on the last instruction in the bundle also
3012  /// being the last instruction in program order during the vectorization
3013  /// process; since the basic blocks are affected, these instructions need to
3014  /// be pre-gathered beforehand.
3015 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3016
3017  /// List of gather nodes that depend on other gather/vector nodes and should
3018  /// be emitted after the vector instruction emission process, to correctly
3019  /// handle the order of the vector instructions and shuffles.
3020 SetVector<const TreeEntry *> PostponedGathers;
3021
3022 using ValueToGatherNodesMap =
3023      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3024  ValueToGatherNodesMap ValueToGatherNodes;
3025
3026 /// This POD struct describes one external user in the vectorized tree.
3027 struct ExternalUser {
3028 ExternalUser(Value *S, llvm::User *U, int L)
3029 : Scalar(S), User(U), Lane(L) {}
3030
3031 // Which scalar in our function.
3032 Value *Scalar;
3033
3034 // Which user that uses the scalar.
3035    llvm::User *User;
3036
3037 // Which lane does the scalar belong to.
3038 int Lane;
3039 };
3040 using UserList = SmallVector<ExternalUser, 16>;
3041
3042 /// Checks if two instructions may access the same memory.
3043 ///
3044 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3045 /// is invariant in the calling loop.
3046 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3047 Instruction *Inst2) {
3048 // First check if the result is already in the cache.
3049 AliasCacheKey key = std::make_pair(Inst1, Inst2);
3050 std::optional<bool> &result = AliasCache[key];
3051 if (result) {
3052 return *result;
3053 }
3054 bool aliased = true;
3055 if (Loc1.Ptr && isSimple(Inst1))
3056 aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3057 // Store the result in the cache.
3058 result = aliased;
3059 return aliased;
3060 }
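// Illustrative standalone sketch (not from the LLVM sources): the memoization
// pattern used by isAliased above, caching an expensive pairwise query keyed
// on the ordered pair of instructions. MayAlias stands in for the real
// BatchAA.getModRefInfo call; the class name PairwiseCache is invented.
#include <functional>
#include <map>
#include <utility>

template <typename T> class PairwiseCache {
  std::map<std::pair<const T *, const T *>, bool> Cache;

public:
  bool query(const T *A, const T *B,
             const std::function<bool(const T *, const T *)> &MayAlias) {
    auto Key = std::make_pair(A, B);
    auto It = Cache.find(Key);
    if (It != Cache.end())
      return It->second; // Cache hit: skip the expensive analysis.
    bool Result = MayAlias(A, B);
    Cache.emplace(Key, Result);
    return Result;
  }
};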
3061
3062 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3063
3064 /// Cache for alias results.
3065 /// TODO: consider moving this to the AliasAnalysis itself.
3066  DenseMap<AliasCacheKey, std::optional<bool>> AliasCache;
3067
3068 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3069 // globally through SLP because we don't perform any action which
3070 // invalidates capture results.
3071 BatchAAResults BatchAA;
3072
3073 /// Temporary store for deleted instructions. Instructions will be deleted
3074 /// eventually when the BoUpSLP is destructed. The deferral is required to
3075 /// ensure that there are no incorrect collisions in the AliasCache, which
3076 /// can happen if a new instruction is allocated at the same address as a
3077 /// previously deleted instruction.
3078 DenseSet<Instruction *> DeletedInstructions;
3079
3080 /// Set of the instruction, being analyzed already for reductions.
3081 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3082
3083 /// Set of hashes for the list of reduction values already being analyzed.
3084 DenseSet<size_t> AnalyzedReductionVals;
3085
3086  /// A list of values that need to be extracted out of the tree.
3087 /// This list holds pairs of (Internal Scalar : External User). External User
3088 /// can be nullptr, it means that this Internal Scalar will be used later,
3089 /// after vectorization.
3090 UserList ExternalUses;
3091
3092 /// Values used only by @llvm.assume calls.
3093  SmallPtrSet<const Value *, 32> EphValues;
3094
3095 /// Holds all of the instructions that we gathered, shuffle instructions and
3096 /// extractelements.
3097 SetVector<Instruction *> GatherShuffleExtractSeq;
3098
3099 /// A list of blocks that we are going to CSE.
3100 SetVector<BasicBlock *> CSEBlocks;
3101
3102 /// Contains all scheduling relevant data for an instruction.
3103 /// A ScheduleData either represents a single instruction or a member of an
3104 /// instruction bundle (= a group of instructions which is combined into a
3105 /// vector instruction).
3106 struct ScheduleData {
3107 // The initial value for the dependency counters. It means that the
3108 // dependencies are not calculated yet.
3109 enum { InvalidDeps = -1 };
3110
3111 ScheduleData() = default;
3112
3113 void init(int BlockSchedulingRegionID, Value *OpVal) {
3114 FirstInBundle = this;
3115 NextInBundle = nullptr;
3116 NextLoadStore = nullptr;
3117 IsScheduled = false;
3118 SchedulingRegionID = BlockSchedulingRegionID;
3119 clearDependencies();
3120 OpValue = OpVal;
3121 TE = nullptr;
3122 }
3123
3124 /// Verify basic self consistency properties
3125 void verify() {
3126 if (hasValidDependencies()) {
3127 assert(UnscheduledDeps <= Dependencies && "invariant");
3128 } else {
3129 assert(UnscheduledDeps == Dependencies && "invariant");
3130 }
3131
3132 if (IsScheduled) {
3133 assert(isSchedulingEntity() &&
3134 "unexpected scheduled state");
3135 for (const ScheduleData *BundleMember = this; BundleMember;
3136 BundleMember = BundleMember->NextInBundle) {
3137 assert(BundleMember->hasValidDependencies() &&
3138 BundleMember->UnscheduledDeps == 0 &&
3139 "unexpected scheduled state");
3140 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3141 "only bundle is marked scheduled");
3142 }
3143 }
3144
3145 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3146 "all bundle members must be in same basic block");
3147 }
3148
3149 /// Returns true if the dependency information has been calculated.
3150    /// Note that dependency validity can vary between instructions within
3151 /// a single bundle.
3152 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3153
3154 /// Returns true for single instructions and for bundle representatives
3155 /// (= the head of a bundle).
3156 bool isSchedulingEntity() const { return FirstInBundle == this; }
3157
3158 /// Returns true if it represents an instruction bundle and not only a
3159 /// single instruction.
3160 bool isPartOfBundle() const {
3161 return NextInBundle != nullptr || FirstInBundle != this || TE;
3162 }
3163
3164 /// Returns true if it is ready for scheduling, i.e. it has no more
3165 /// unscheduled depending instructions/bundles.
3166 bool isReady() const {
3167 assert(isSchedulingEntity() &&
3168 "can't consider non-scheduling entity for ready list");
3169 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3170 }
3171
3172 /// Modifies the number of unscheduled dependencies for this instruction,
3173 /// and returns the number of remaining dependencies for the containing
3174 /// bundle.
3175 int incrementUnscheduledDeps(int Incr) {
3176 assert(hasValidDependencies() &&
3177 "increment of unscheduled deps would be meaningless");
3178 UnscheduledDeps += Incr;
3179 return FirstInBundle->unscheduledDepsInBundle();
3180 }
3181
3182 /// Sets the number of unscheduled dependencies to the number of
3183 /// dependencies.
3184 void resetUnscheduledDeps() {
3185 UnscheduledDeps = Dependencies;
3186 }
3187
3188 /// Clears all dependency information.
3189 void clearDependencies() {
3190 Dependencies = InvalidDeps;
3191 resetUnscheduledDeps();
3192 MemoryDependencies.clear();
3193 ControlDependencies.clear();
3194 }
3195
3196 int unscheduledDepsInBundle() const {
3197 assert(isSchedulingEntity() && "only meaningful on the bundle");
3198 int Sum = 0;
3199 for (const ScheduleData *BundleMember = this; BundleMember;
3200 BundleMember = BundleMember->NextInBundle) {
3201 if (BundleMember->UnscheduledDeps == InvalidDeps)
3202 return InvalidDeps;
3203 Sum += BundleMember->UnscheduledDeps;
3204 }
3205 return Sum;
3206 }
3207
3208 void dump(raw_ostream &os) const {
3209 if (!isSchedulingEntity()) {
3210 os << "/ " << *Inst;
3211 } else if (NextInBundle) {
3212 os << '[' << *Inst;
3213 ScheduleData *SD = NextInBundle;
3214 while (SD) {
3215 os << ';' << *SD->Inst;
3216 SD = SD->NextInBundle;
3217 }
3218 os << ']';
3219 } else {
3220 os << *Inst;
3221 }
3222 }
3223
3224 Instruction *Inst = nullptr;
3225
3226 /// Opcode of the current instruction in the schedule data.
3227 Value *OpValue = nullptr;
3228
3229 /// The TreeEntry that this instruction corresponds to.
3230 TreeEntry *TE = nullptr;
3231
3232 /// Points to the head in an instruction bundle (and always to this for
3233 /// single instructions).
3234 ScheduleData *FirstInBundle = nullptr;
3235
3236    /// Singly linked list of all instructions in a bundle. Null if it is a
3237 /// single instruction.
3238 ScheduleData *NextInBundle = nullptr;
3239
3240    /// Singly linked list of all memory instructions (e.g. load, store, call)
3241 /// in the block - until the end of the scheduling region.
3242 ScheduleData *NextLoadStore = nullptr;
3243
3244 /// The dependent memory instructions.
3245 /// This list is derived on demand in calculateDependencies().
3246 SmallVector<ScheduleData *, 4> MemoryDependencies;
3247
3248 /// List of instructions which this instruction could be control dependent
3249 /// on. Allowing such nodes to be scheduled below this one could introduce
3250 /// a runtime fault which didn't exist in the original program.
3251 /// ex: this is a load or udiv following a readonly call which inf loops
3252 SmallVector<ScheduleData *, 4> ControlDependencies;
3253
3254 /// This ScheduleData is in the current scheduling region if this matches
3255 /// the current SchedulingRegionID of BlockScheduling.
3256 int SchedulingRegionID = 0;
3257
3258 /// Used for getting a "good" final ordering of instructions.
3259 int SchedulingPriority = 0;
3260
3261    /// The number of dependencies. It consists of the number of users of the
3262 /// instruction plus the number of dependent memory instructions (if any).
3263 /// This value is calculated on demand.
3264 /// If InvalidDeps, the number of dependencies is not calculated yet.
3265 int Dependencies = InvalidDeps;
3266
3267 /// The number of dependencies minus the number of dependencies of scheduled
3268 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3269 /// for scheduling.
3270 /// Note that this is negative as long as Dependencies is not calculated.
3271 int UnscheduledDeps = InvalidDeps;
3272
3273 /// True if this instruction is scheduled (or considered as scheduled in the
3274 /// dry-run).
3275 bool IsScheduled = false;
3276 };
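// Illustrative standalone sketch (not from the LLVM sources): the counter
// scheme behind Dependencies/UnscheduledDeps above. Dependencies is computed
// once; UnscheduledDeps starts equal to it and is decremented as dependencies
// get scheduled, so a node becomes ready exactly when the counter hits zero.
#include <cassert>

struct ToyScheduleNode {
  static constexpr int InvalidDeps = -1;
  int Dependencies = InvalidDeps;
  int UnscheduledDeps = InvalidDeps;

  void setDependencies(int N) { Dependencies = UnscheduledDeps = N; }
  bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
  bool isReady() const { return hasValidDependencies() && UnscheduledDeps == 0; }
  // Returns the remaining count after one dependency has been scheduled.
  int dependencyScheduled() {
    assert(hasValidDependencies() && "counters not initialized");
    return --UnscheduledDeps;
  }
};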
3277
3278#ifndef NDEBUG
3279  friend inline raw_ostream &operator<<(raw_ostream &os,
3280                                        const BoUpSLP::ScheduleData &SD) {
3281 SD.dump(os);
3282 return os;
3283 }
3284#endif
3285
3286 friend struct GraphTraits<BoUpSLP *>;
3287 friend struct DOTGraphTraits<BoUpSLP *>;
3288
3289 /// Contains all scheduling data for a basic block.
3290  /// It does not schedule instructions that are not memory read/write
3291  /// instructions and whose operands are either constants, arguments, phis,
3292  /// or instructions from other blocks, or whose users are phis or live in
3293  /// other blocks. The resulting vector instructions can be placed at the
3294  /// beginning of the basic block without scheduling (if the operands do not
3295  /// need to be scheduled) or at the end of the block (if the users are
3296  /// outside of the block). This saves some compile time and memory used by
3297  /// the compiler.
3298  /// ScheduleData is assigned to each instruction between the boundaries of
3299  /// the tree entry, even to those that are not part of the graph. It is
3300  /// required to correctly follow the dependencies between the instructions
3301  /// and to schedule them correctly. ScheduleData is not allocated for
3302  /// instructions that do not require scheduling, such as phis, nodes
3303  /// containing only extractelements/insertelements, or nodes whose
3304  /// instructions have uses/operands outside of the block.
3305 struct BlockScheduling {
3306 BlockScheduling(BasicBlock *BB)
3307 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3308
3309 void clear() {
3310 ReadyInsts.clear();
3311 ScheduleStart = nullptr;
3312 ScheduleEnd = nullptr;
3313 FirstLoadStoreInRegion = nullptr;
3314 LastLoadStoreInRegion = nullptr;
3315 RegionHasStackSave = false;
3316
3317 // Reduce the maximum schedule region size by the size of the
3318 // previous scheduling run.
3319 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3320 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3321 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3322 ScheduleRegionSize = 0;
3323
3324 // Make a new scheduling region, i.e. all existing ScheduleData is not
3325 // in the new region yet.
3326 ++SchedulingRegionID;
3327 }
3328
3329 ScheduleData *getScheduleData(Instruction *I) {
3330 if (BB != I->getParent())
3331 // Avoid lookup if can't possibly be in map.
3332 return nullptr;
3333 ScheduleData *SD = ScheduleDataMap.lookup(I);
3334 if (SD && isInSchedulingRegion(SD))
3335 return SD;
3336 return nullptr;
3337 }
3338
3339 ScheduleData *getScheduleData(Value *V) {
3340 if (auto *I = dyn_cast<Instruction>(V))
3341 return getScheduleData(I);
3342 return nullptr;
3343 }
3344
3345 ScheduleData *getScheduleData(Value *V, Value *Key) {
3346 if (V == Key)
3347 return getScheduleData(V);
3348 auto I = ExtraScheduleDataMap.find(V);
3349 if (I != ExtraScheduleDataMap.end()) {
3350 ScheduleData *SD = I->second.lookup(Key);
3351 if (SD && isInSchedulingRegion(SD))
3352 return SD;
3353 }
3354 return nullptr;
3355 }
3356
3357 bool isInSchedulingRegion(ScheduleData *SD) const {
3358 return SD->SchedulingRegionID == SchedulingRegionID;
3359 }
3360
3361 /// Marks an instruction as scheduled and puts all dependent ready
3362 /// instructions into the ready-list.
3363 template <typename ReadyListType>
3364 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3365 SD->IsScheduled = true;
3366 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3367
3368 for (ScheduleData *BundleMember = SD; BundleMember;
3369 BundleMember = BundleMember->NextInBundle) {
3370 if (BundleMember->Inst != BundleMember->OpValue)
3371 continue;
3372
3373 // Handle the def-use chain dependencies.
3374
3375 // Decrement the unscheduled counter and insert to ready list if ready.
3376 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3377 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3378 if (OpDef && OpDef->hasValidDependencies() &&
3379 OpDef->incrementUnscheduledDeps(-1) == 0) {
3380 // There are no more unscheduled dependencies after
3381 // decrementing, so we can put the dependent instruction
3382 // into the ready list.
3383 ScheduleData *DepBundle = OpDef->FirstInBundle;
3384 assert(!DepBundle->IsScheduled &&
3385 "already scheduled bundle gets ready");
3386 ReadyList.insert(DepBundle);
3387 LLVM_DEBUG(dbgs()
3388 << "SLP: gets ready (def): " << *DepBundle << "\n");
3389 }
3390 });
3391 };
3392
3393 // If BundleMember is a vector bundle, its operands may have been
3394 // reordered during buildTree(). We therefore need to get its operands
3395 // through the TreeEntry.
3396 if (TreeEntry *TE = BundleMember->TE) {
3397 // Need to search for the lane since the tree entry can be reordered.
3398 int Lane = std::distance(TE->Scalars.begin(),
3399 find(TE->Scalars, BundleMember->Inst));
3400 assert(Lane >= 0 && "Lane not set");
3401
3402 // Since vectorization tree is being built recursively this assertion
3403 // ensures that the tree entry has all operands set before reaching
3404 // this code. Couple of exceptions known at the moment are extracts
3405 // where their second (immediate) operand is not added. Since
3406 // immediates do not affect scheduler behavior this is considered
3407 // okay.
3408 auto *In = BundleMember->Inst;
3409 assert(In &&
3410 (isa<ExtractValueInst, ExtractElementInst>(In) ||
3411 In->getNumOperands() == TE->getNumOperands()) &&
3412 "Missed TreeEntry operands?");
3413 (void)In; // fake use to avoid build failure when assertions disabled
3414
3415 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3416 OpIdx != NumOperands; ++OpIdx)
3417 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3418 DecrUnsched(I);
3419 } else {
3420 // If BundleMember is a stand-alone instruction, no operand reordering
3421 // has taken place, so we directly access its operands.
3422 for (Use &U : BundleMember->Inst->operands())
3423 if (auto *I = dyn_cast<Instruction>(U.get()))
3424 DecrUnsched(I);
3425 }
3426 // Handle the memory dependencies.
3427 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3428 if (MemoryDepSD->hasValidDependencies() &&
3429 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3430 // There are no more unscheduled dependencies after decrementing,
3431 // so we can put the dependent instruction into the ready list.
3432 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3433 assert(!DepBundle->IsScheduled &&
3434 "already scheduled bundle gets ready");
3435 ReadyList.insert(DepBundle);
3436            LLVM_DEBUG(dbgs()
3437                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
3438 }
3439 }
3440 // Handle the control dependencies.
3441 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3442 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3443 // There are no more unscheduled dependencies after decrementing,
3444 // so we can put the dependent instruction into the ready list.
3445 ScheduleData *DepBundle = DepSD->FirstInBundle;
3446 assert(!DepBundle->IsScheduled &&
3447 "already scheduled bundle gets ready");
3448 ReadyList.insert(DepBundle);
3449            LLVM_DEBUG(dbgs()
3450                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3451 }
3452 }
3453 }
3454 }
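// Illustrative standalone sketch (not from the LLVM sources): the
// list-scheduling step performed by schedule() above, reduced to integers.
// Scheduling a node decrements the unscheduled-dependency counter of every
// user and moves users whose counter reaches zero onto the ready list.
#include <vector>

struct ToyDag {
  std::vector<std::vector<int>> Users; // Users[N] = nodes depending on N.
  std::vector<int> UnscheduledDeps;    // Per-node remaining dependencies.

  void scheduleNode(int N, std::vector<int> &ReadyList) {
    for (int User : Users[N])
      if (--UnscheduledDeps[User] == 0)
        ReadyList.push_back(User); // No unscheduled deps left: ready.
  }
};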
3455
3456 /// Verify basic self consistency properties of the data structure.
3457 void verify() {
3458 if (!ScheduleStart)
3459 return;
3460
3461 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3462 ScheduleStart->comesBefore(ScheduleEnd) &&
3463 "Not a valid scheduling region?");
3464
3465 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3466 auto *SD = getScheduleData(I);
3467 if (!SD)
3468 continue;
3469 assert(isInSchedulingRegion(SD) &&
3470 "primary schedule data not in window?");
3471 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3472 "entire bundle in window!");
3473 (void)SD;
3474 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3475 }
3476
3477 for (auto *SD : ReadyInsts) {
3478 assert(SD->isSchedulingEntity() && SD->isReady() &&
3479 "item in ready list not ready?");
3480 (void)SD;
3481 }
3482 }
3483
3484 void doForAllOpcodes(Value *V,
3485 function_ref<void(ScheduleData *SD)> Action) {
3486 if (ScheduleData *SD = getScheduleData(V))
3487 Action(SD);
3488 auto I = ExtraScheduleDataMap.find(V);
3489 if (I != ExtraScheduleDataMap.end())
3490 for (auto &P : I->second)
3491 if (isInSchedulingRegion(P.second))
3492 Action(P.second);
3493 }
3494
3495 /// Put all instructions into the ReadyList which are ready for scheduling.
3496 template <typename ReadyListType>
3497 void initialFillReadyList(ReadyListType &ReadyList) {
3498 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3499 doForAllOpcodes(I, [&](ScheduleData *SD) {
3500 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3501 SD->isReady()) {
3502 ReadyList.insert(SD);
3503 LLVM_DEBUG(dbgs()
3504 << "SLP: initially in ready list: " << *SD << "\n");
3505 }
3506 });
3507 }
3508 }
3509
3510 /// Build a bundle from the ScheduleData nodes corresponding to the
3511 /// scalar instruction for each lane.
3512 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3513
3514 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3515 /// cyclic dependencies. This is only a dry-run, no instructions are
3516 /// actually moved at this stage.
3517 /// \returns the scheduling bundle. The returned Optional value is not
3518 /// std::nullopt if \p VL is allowed to be scheduled.
3519 std::optional<ScheduleData *>
3520 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3521 const InstructionsState &S);
3522
3523 /// Un-bundles a group of instructions.
3524 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3525
3526 /// Allocates schedule data chunk.
3527 ScheduleData *allocateScheduleDataChunks();
3528
3529 /// Extends the scheduling region so that V is inside the region.
3530 /// \returns true if the region size is within the limit.
3531 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3532
3533 /// Initialize the ScheduleData structures for new instructions in the
3534 /// scheduling region.
3535 void initScheduleData(Instruction *FromI, Instruction *ToI,
3536 ScheduleData *PrevLoadStore,
3537 ScheduleData *NextLoadStore);
3538
3539 /// Updates the dependency information of a bundle and of all instructions/
3540 /// bundles which depend on the original bundle.
3541 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3542 BoUpSLP *SLP);
3543
3544    /// Sets all instructions in the scheduling region to un-scheduled.
3545 void resetSchedule();
3546
3547 BasicBlock *BB;
3548
3549 /// Simple memory allocation for ScheduleData.
3550 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3551
3552 /// The size of a ScheduleData array in ScheduleDataChunks.
3553 int ChunkSize;
3554
3555 /// The allocator position in the current chunk, which is the last entry
3556 /// of ScheduleDataChunks.
3557 int ChunkPos;
3558
3559 /// Attaches ScheduleData to Instruction.
3560 /// Note that the mapping survives during all vectorization iterations, i.e.
3561 /// ScheduleData structures are recycled.
3562    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3563
3564 /// Attaches ScheduleData to Instruction with the leading key.
3565    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3566        ExtraScheduleDataMap;
3567
3568 /// The ready-list for scheduling (only used for the dry-run).
3569 SetVector<ScheduleData *> ReadyInsts;
3570
3571 /// The first instruction of the scheduling region.
3572 Instruction *ScheduleStart = nullptr;
3573
3574 /// The first instruction _after_ the scheduling region.
3575 Instruction *ScheduleEnd = nullptr;
3576
3577 /// The first memory accessing instruction in the scheduling region
3578 /// (can be null).
3579 ScheduleData *FirstLoadStoreInRegion = nullptr;
3580
3581 /// The last memory accessing instruction in the scheduling region
3582 /// (can be null).
3583 ScheduleData *LastLoadStoreInRegion = nullptr;
3584
3585 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3586 /// region? Used to optimize the dependence calculation for the
3587 /// common case where there isn't.
3588 bool RegionHasStackSave = false;
3589
3590 /// The current size of the scheduling region.
3591 int ScheduleRegionSize = 0;
3592
3593 /// The maximum size allowed for the scheduling region.
3594 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3595
3596 /// The ID of the scheduling region. For a new vectorization iteration this
3597 /// is incremented which "removes" all ScheduleData from the region.
3598 /// Make sure that the initial SchedulingRegionID is greater than the
3599 /// initial SchedulingRegionID in ScheduleData (which is 0).
3600 int SchedulingRegionID = 1;
3601 };
3602
3603 /// Attaches the BlockScheduling structures to basic blocks.
3604  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3605
3606 /// Performs the "real" scheduling. Done before vectorization is actually
3607 /// performed in a basic block.
3608 void scheduleBlock(BlockScheduling *BS);
3609
3610 /// List of users to ignore during scheduling and that don't need extracting.
3611 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3612
3613 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3614 /// sorted SmallVectors of unsigned.
3615 struct OrdersTypeDenseMapInfo {
3616 static OrdersType getEmptyKey() {
3617 OrdersType V;
3618 V.push_back(~1U);
3619 return V;
3620 }
3621
3622 static OrdersType getTombstoneKey() {
3623 OrdersType V;
3624 V.push_back(~2U);
3625 return V;
3626 }
3627
3628 static unsigned getHashValue(const OrdersType &V) {
3629 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3630 }
3631
3632 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3633 return LHS == RHS;
3634 }
3635 };
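// Illustrative standalone sketch (not from the LLVM sources): what
// OrdersTypeDenseMapInfo enables, namely using an ordering vector as a hash
// map key, shown here with std::unordered_map and a simple hash combine.
#include <cstddef>
#include <functional>
#include <unordered_map>
#include <vector>

struct OrderHash {
  size_t operator()(const std::vector<unsigned> &Order) const {
    size_t H = Order.size();
    for (unsigned V : Order) // Boost-style hash combine over the elements.
      H ^= std::hash<unsigned>()(V) + 0x9e3779b9 + (H << 6) + (H >> 2);
    return H;
  }
};

using OrderUseCounts =
    std::unordered_map<std::vector<unsigned>, unsigned, OrderHash>;
// Usage: OrderUseCounts Counts; ++Counts[{1, 0, 3, 2}];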
3636
3637 // Analysis and block reference.
3638 Function *F;
3639 ScalarEvolution *SE;
3640  TargetTransformInfo *TTI;
3641  TargetLibraryInfo *TLI;
3642 LoopInfo *LI;
3643 DominatorTree *DT;
3644 AssumptionCache *AC;
3645 DemandedBits *DB;
3646 const DataLayout *DL;
3647  OptimizationRemarkEmitter *ORE;
3648
3649 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3650 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3651
3652 /// Instruction builder to construct the vectorized tree.
3653 IRBuilder<> Builder;
3654
3655 /// A map of scalar integer values to the smallest bit width with which they
3656 /// can legally be represented. The values map to (width, signed) pairs,
3657 /// where "width" indicates the minimum bit width and "signed" is True if the
3658 /// value must be signed-extended, rather than zero-extended, back to its
3659 /// original width.
3660  MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
3661};
3662
3663} // end namespace slpvectorizer
3664
3665template <> struct GraphTraits<BoUpSLP *> {
3666 using TreeEntry = BoUpSLP::TreeEntry;
3667
3668 /// NodeRef has to be a pointer per the GraphWriter.
3669  using NodeRef = TreeEntry *;
3670
3671  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3672
3673 /// Add the VectorizableTree to the index iterator to be able to return
3674 /// TreeEntry pointers.
3675 struct ChildIteratorType
3676 : public iterator_adaptor_base<
3677 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3678    ContainerTy &VectorizableTree;
3679
3680    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3681                      ContainerTy &VT)
3682 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3683
3684 NodeRef operator*() { return I->UserTE; }
3685 };
3686
3687  static NodeRef getEntryNode(BoUpSLP &R) {
3688    return R.VectorizableTree[0].get();
3689 }
3690
3691 static ChildIteratorType child_begin(NodeRef N) {
3692 return {N->UserTreeIndices.begin(), N->Container};
3693 }
3694
3695 static ChildIteratorType child_end(NodeRef N) {
3696 return {N->UserTreeIndices.end(), N->Container};
3697 }
3698
3699 /// For the node iterator we just need to turn the TreeEntry iterator into a
3700 /// TreeEntry* iterator so that it dereferences to NodeRef.
3701 class nodes_iterator {
3702    using ItTy = ContainerTy::iterator;
3703    ItTy It;
3704
3705 public:
3706 nodes_iterator(const ItTy &It2) : It(It2) {}
3707 NodeRef operator*() { return It->get(); }
3708 nodes_iterator operator++() {
3709 ++It;
3710 return *this;
3711 }
3712 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3713 };
3714
3715 static nodes_iterator nodes_begin(BoUpSLP *R) {
3716 return nodes_iterator(R->VectorizableTree.begin());
3717 }
3718
3719 static nodes_iterator nodes_end(BoUpSLP *R) {
3720 return nodes_iterator(R->VectorizableTree.end());
3721 }
3722
3723 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3724};
3725
3726template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3727 using TreeEntry = BoUpSLP::TreeEntry;
3728
3729  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3730
3731 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3732 std::string Str;
3733    raw_string_ostream OS(Str);
3734    OS << Entry->Idx << ".\n";
3735 if (isSplat(Entry->Scalars))
3736 OS << "<splat> ";
3737 for (auto *V : Entry->Scalars) {
3738 OS << *V;
3739 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3740 return EU.Scalar == V;
3741 }))
3742 OS << " <extract>";
3743 OS << "\n";
3744 }
3745 return Str;
3746 }
3747
3748 static std::string getNodeAttributes(const TreeEntry *Entry,
3749 const BoUpSLP *) {
3750 if (Entry->State == TreeEntry::NeedToGather)
3751 return "color=red";
3752 if (Entry->State == TreeEntry::ScatterVectorize ||
3753 Entry->State == TreeEntry::PossibleStridedVectorize)
3754 return "color=blue";
3755 return "";
3756 }
3757};
3758
3759} // end namespace llvm
3760
3761BoUpSLP::~BoUpSLP() {
3762  SmallVector<WeakTrackingVH> DeadInsts;
3763  for (auto *I : DeletedInstructions) {
3764 for (Use &U : I->operands()) {
3765 auto *Op = dyn_cast<Instruction>(U.get());
3766 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
3767          wouldInstructionBeTriviallyDead(Op, TLI))
3768        DeadInsts.emplace_back(Op);
3769 }
3770 I->dropAllReferences();
3771 }
3772 for (auto *I : DeletedInstructions) {
3773 assert(I->use_empty() &&
3774 "trying to erase instruction with users.");
3775 I->eraseFromParent();
3776 }
3777
3778 // Cleanup any dead scalar code feeding the vectorized instructions
3779  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3780
3781#ifdef EXPENSIVE_CHECKS
3782 // If we could guarantee that this call is not extremely slow, we could
3783 // remove the ifdef limitation (see PR47712).
3784 assert(!verifyFunction(*F, &dbgs()));
3785#endif
3786}
3787
3788/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3789/// contains the original mask for the scalars reused in the node. The procedure
3790/// transforms this mask in accordance with the given \p Mask.
3791static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3792  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3793 "Expected non-empty mask.");
3794 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3795 Prev.swap(Reuses);
3796 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3797 if (Mask[I] != PoisonMaskElem)
3798 Reuses[Mask[I]] = Prev[I];
3799}
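// Illustrative standalone sketch (not from the LLVM sources): the effect of
// reorderReuses above on plain vectors. Element Prev[I] moves to position
// Mask[I]; negative mask values play the role of PoisonMaskElem and are
// skipped.
#include <vector>

static void reorderReusesToy(std::vector<int> &Reuses,
                             const std::vector<int> &Mask) {
  std::vector<int> Prev(Reuses);
  for (size_t I = 0, E = Prev.size(); I != E; ++I)
    if (Mask[I] >= 0)
      Reuses[Mask[I]] = Prev[I];
}
// Example: Reuses = {0, 1, 2, 3} and Mask = {2, 0, 3, 1} give Reuses = {1, 3, 0, 2}.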
3800
3801/// Reorders the given \p Order according to the given \p Mask. \p Order is
3802/// the original order of the scalars. The procedure transforms the provided order
3803/// in accordance with the given \p Mask. If the resulting \p Order is just an
3804/// identity order, \p Order is cleared.
3805static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
3806  assert(!Mask.empty() && "Expected non-empty mask.");
3807 SmallVector<int> MaskOrder;
3808 if (Order.empty()) {
3809 MaskOrder.resize(Mask.size());
3810 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3811 } else {
3812 inversePermutation(Order, MaskOrder);
3813 }
3814 reorderReuses(MaskOrder, Mask);
3815 if (ShuffleVectorInst::isIdentityMask(MaskOrder, MaskOrder.size())) {
3816 Order.clear();
3817 return;
3818 }
3819 Order.assign(Mask.size(), Mask.size());
3820 for (unsigned I = 0, E = Mask.size(); I < E; ++I)
3821 if (MaskOrder[I] != PoisonMaskElem)
3822 Order[MaskOrder[I]] = I;
3823 fixupOrderingIndices(Order);
3824}
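// Illustrative standalone sketch (not from the LLVM sources): the
// inversePermutation helper that reorderOrder above relies on. For each
// position I, the element taken from source position Order[I] is recorded as
// landing at position I; unassigned slots keep -1 (standing in for
// PoisonMaskElem).
#include <vector>

static std::vector<int> inversePermutationToy(const std::vector<unsigned> &Order) {
  std::vector<int> Mask(Order.size(), -1);
  for (unsigned I = 0, E = Order.size(); I != E; ++I)
    Mask[Order[I]] = static_cast<int>(I);
  return Mask;
}
// Example: Order = {2, 0, 3, 1} yields Mask = {1, 3, 0, 2}.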
3825
3826std::optional<BoUpSLP::OrdersType>
3827BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3828 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3829 unsigned NumScalars = TE.Scalars.size();
3830 OrdersType CurrentOrder(NumScalars, NumScalars);
3831 SmallVector<int> Positions;
3832 SmallBitVector UsedPositions(NumScalars);
3833 const TreeEntry *STE = nullptr;
3834  // Try to find all gathered scalars that get vectorized in another
3835  // vectorized node. Only a single tree vector node is allowed here, so that
3836  // the order of the gathered scalars can be identified correctly.
3837 for (unsigned I = 0; I < NumScalars; ++I) {
3838 Value *V = TE.Scalars[I];
3839 if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
3840 continue;
3841 if (const auto *LocalSTE = getTreeEntry(V)) {
3842 if (!STE)
3843 STE = LocalSTE;
3844 else if (STE != LocalSTE)
3845 // Take the order only from the single vector node.
3846 return std::nullopt;
3847 unsigned Lane =
3848 std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
3849 if (Lane >= NumScalars)
3850 return std::nullopt;
3851 if (CurrentOrder[Lane] != NumScalars) {
3852 if (Lane != I)
3853 continue;
3854 UsedPositions.reset(CurrentOrder[Lane]);
3855 }
3856 // The partial identity (where only some elements of the gather node are
3857 // in the identity order) is good.
3858 CurrentOrder[Lane] = I;
3859 UsedPositions.set(I);
3860 }
3861 }
3862 // Need to keep the order if we have a vector entry and at least 2 scalars or
3863 // the vectorized entry has just 2 scalars.
3864 if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
3865 auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
3866 for (unsigned I = 0; I < NumScalars; ++I)
3867 if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
3868 return false;
3869 return true;
3870 };
3871 if (IsIdentityOrder(CurrentOrder))
3872 return OrdersType();
3873 auto *It = CurrentOrder.begin();
3874 for (unsigned I = 0; I < NumScalars;) {
3875 if (UsedPositions.test(I)) {
3876 ++I;
3877 continue;
3878 }
3879 if (*It == NumScalars) {
3880 *It = I;
3881 ++I;
3882 }
3883 ++It;
3884 }
3885 return std::move(CurrentOrder);
3886 }
3887 return std::nullopt;
3888}
3889
3890namespace {
3891/// Tracks the state we can represent the loads in the given sequence.
3892enum class LoadsState {
3893 Gather,
3894 Vectorize,
3895 ScatterVectorize,
3896 PossibleStridedVectorize
3897};
3898} // anonymous namespace
3899
3900static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
3901 const TargetLibraryInfo &TLI,
3902 bool CompareOpcodes = true) {
3903 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
3904 return false;
3905 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
3906 if (!GEP1)
3907 return false;
3908 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
3909 if (!GEP2)
3910 return false;
3911 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
3912 ((isConstant(GEP1->getOperand(1)) &&
3913 isConstant(GEP2->getOperand(1))) ||
3914 !CompareOpcodes ||
3915 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
3916 .getOpcode());
3917}
3918
3919/// Checks if the given array of loads can be represented as a vectorized,
3920/// scatter or just simple gather.
3921static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
3922 const TargetTransformInfo &TTI,
3923 const DataLayout &DL, ScalarEvolution &SE,
3924 LoopInfo &LI, const TargetLibraryInfo &TLI,
3925                                    SmallVectorImpl<unsigned> &Order,
3926                                    SmallVectorImpl<Value *> &PointerOps) {
3927 // Check that a vectorized load would load the same memory as a scalar
3928 // load. For example, we don't want to vectorize loads that are smaller
3929 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
3930 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
3931 // from such a struct, we read/write packed bits disagreeing with the
3932 // unvectorized version.
3933 Type *ScalarTy = VL0->getType();
3934
3935 if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
3936 return LoadsState::Gather;
3937
3938 // Make sure all loads in the bundle are simple - we can't vectorize
3939 // atomic or volatile loads.
3940 PointerOps.clear();
3941 PointerOps.resize(VL.size());
3942 auto *POIter = PointerOps.begin();
3943 for (Value *V : VL) {
3944 auto *L = cast<LoadInst>(V);
3945 if (!L->isSimple())
3946 return LoadsState::Gather;
3947 *POIter = L->getPointerOperand();
3948 ++POIter;
3949 }
3950
3951 Order.clear();
3952 // Check the order of pointer operands or that all pointers are the same.
3953 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
3954 if (IsSorted || all_of(PointerOps, [&](Value *P) {
3955 return arePointersCompatible(P, PointerOps.front(), TLI);
3956 })) {
3957 bool IsPossibleStrided = false;
3958 if (IsSorted) {
3959 Value *Ptr0;
3960 Value *PtrN;
3961 if (Order.empty()) {
3962 Ptr0 = PointerOps.front();
3963 PtrN = PointerOps.back();
3964 } else {
3965 Ptr0 = PointerOps[Order.front()];
3966 PtrN = PointerOps[Order.back()];
3967 }
3968 std::optional<int> Diff =
3969 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
3970 // Check that the sorted loads are consecutive.
3971 if (static_cast<unsigned>(*Diff) == VL.size() - 1)
3972 return LoadsState::Vectorize;
3973 // Simple check if not a strided access - clear order.
3974 IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
3975 }
3976 // TODO: need to improve analysis of the pointers, if not all of them are
3977 // GEPs or have > 2 operands, we end up with a gather node, which just
3978 // increases the cost.
3979 Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
3980 bool ProfitableGatherPointers =
3981 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
3982 return L && L->isLoopInvariant(V);
3983 })) <= VL.size() / 2 && VL.size() > 2;
3984 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
3985 auto *GEP = dyn_cast<GetElementPtrInst>(P);
3986 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
3987 (GEP && GEP->getNumOperands() == 2);
3988 })) {
3989 Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
3990 for (Value *V : VL)
3991 CommonAlignment =
3992 std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
3993 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
3994 if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
3995 !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
3996 return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
3997 : LoadsState::ScatterVectorize;
3998 }
3999 }
4000
4001 return LoadsState::Gather;
4002}
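// Illustrative standalone sketch (not from the LLVM sources): the core
// consecutiveness test of canVectorizeLoads above, using plain element
// offsets instead of getPointersDiff. A bundle of loads can become a single
// vector load when the sorted first and last offsets differ by exactly
// NumLoads - 1; an evenly dividing difference keeps the strided candidate
// alive, everything else falls back to a gather.
#include <algorithm>
#include <vector>

enum class ToyLoadsState { Gather, Vectorize, PossibleStrided };

static ToyLoadsState classifyLoads(std::vector<long> Offsets) {
  if (Offsets.size() < 2)
    return ToyLoadsState::Gather;
  std::sort(Offsets.begin(), Offsets.end());
  long Diff = Offsets.back() - Offsets.front();
  long NumLoads = static_cast<long>(Offsets.size());
  if (Diff == NumLoads - 1)
    return ToyLoadsState::Vectorize; // Consecutive elements.
  if (Diff % (NumLoads - 1) == 0)
    return ToyLoadsState::PossibleStrided; // Evenly strided candidate.
  return ToyLoadsState::Gather;
}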
4003
4004static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4005                                   const DataLayout &DL, ScalarEvolution &SE,
4006 SmallVectorImpl<unsigned> &SortedIndices) {
4007  assert(llvm::all_of(
4008             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4009 "Expected list of pointer operands.");
4010 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4011 // Ptr into, sort and return the sorted indices with values next to one
4012 // another.
4013  MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4014  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4015
4016 unsigned Cnt = 1;
4017 for (Value *Ptr : VL.drop_front()) {
4018 bool Found = any_of(Bases, [&](auto &Base) {
4019 std::optional<int> Diff =
4020 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4021 /*StrictCheck=*/true);
4022 if (!Diff)
4023 return false;
4024
4025 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4026 return true;
4027 });
4028
4029 if (!Found) {
4030 // If we haven't found enough to usefully cluster, return early.
4031 if (Bases.size() > VL.size() / 2 - 1)
4032 return false;
4033
4034 // Not found already - add a new Base
4035 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4036 }
4037 }
4038
4039  // For each of the bases, sort the pointers by offset and check if any of
4040  // the bases become consecutively allocated.
4041 bool AnyConsecutive = false;
4042 for (auto &Base : Bases) {
4043 auto &Vec = Base.second;
4044 if (Vec.size() > 1) {
4045 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4046 const std::tuple<Value *, int, unsigned> &Y) {
4047 return std::get<1>(X) < std::get<1>(Y);
4048 });
4049 int InitialOffset = std::get<1>(Vec[0]);
4050 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4051 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4052 });
4053 }
4054 }
4055
4056 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4057 SortedIndices.clear();
4058 if (!AnyConsecutive)
4059 return false;
4060
4061 for (auto &Base : Bases) {
4062 for (auto &T : Base.second)
4063 SortedIndices.push_back(std::get<2>(T));
4064 }
4065
4066 assert(SortedIndices.size() == VL.size() &&
4067 "Expected SortedIndices to be the size of VL");
4068 return true;
4069}
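// Illustrative standalone sketch (not from the LLVM sources): the clustering
// idea behind clusterSortPtrAccesses above. Pointers are grouped by base,
// each group is sorted by offset, and the original indices are emitted group
// by group so that accesses which can form consecutive runs end up adjacent.
#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include <vector>

static std::vector<unsigned>
clusterSortToy(const std::vector<std::pair<std::string, long>> &BaseAndOffset) {
  // Base name -> list of (offset, original index).
  std::map<std::string, std::vector<std::pair<long, unsigned>>> Bases;
  for (unsigned I = 0, E = BaseAndOffset.size(); I != E; ++I)
    Bases[BaseAndOffset[I].first].emplace_back(BaseAndOffset[I].second, I);

  std::vector<unsigned> SortedIndices;
  for (auto &Group : Bases) {
    std::stable_sort(Group.second.begin(), Group.second.end());
    for (const auto &OffsetAndIdx : Group.second)
      SortedIndices.push_back(OffsetAndIdx.second);
  }
  return SortedIndices;
}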
4070
4071std::optional<BoUpSLP::OrdersType>
4072BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4073 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4074 Type *ScalarTy = TE.Scalars[0]->getType();
4075
4077 Ptrs.reserve(TE.Scalars.size());
4078 for (Value *V : TE.Scalars) {
4079 auto *L = dyn_cast<LoadInst>(V);
4080 if (!L || !L->isSimple())
4081 return std::nullopt;
4082 Ptrs.push_back(L->getPointerOperand());
4083 }
4084
4085 BoUpSLP::OrdersType Order;
4086 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4087 return std::move(Order);
4088 return std::nullopt;
4089}
4090
4091/// Check if two insertelement instructions are from the same buildvector.
4092static bool areTwoInsertFromSameBuildVector(
4093    InsertElementInst *VU, InsertElementInst *V,
4094    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4095 // Instructions must be from the same basic blocks.
4096 if (VU->getParent() != V->getParent())
4097 return false;
4098 // Checks if 2 insertelements are from the same buildvector.
4099 if (VU->getType() != V->getType())
4100 return false;
4101 // Multiple used inserts are separate nodes.
4102 if (!VU->hasOneUse() && !V->hasOneUse())
4103 return false;
4104 auto *IE1 = VU;
4105 auto *IE2 = V;
4106 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4107 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4108 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4109 return false;
4110 // Go through the vector operand of insertelement instructions trying to find
4111 // either VU as the original vector for IE2 or V as the original vector for
4112 // IE1.
4113 SmallSet<int, 8> ReusedIdx;
4114 bool IsReusedIdx = false;
4115 do {
4116 if (IE2 == VU && !IE1)
4117 return VU->hasOneUse();
4118 if (IE1 == V && !IE2)
4119 return V->hasOneUse();
4120 if (IE1 && IE1 != V) {
4121 IsReusedIdx |=
4122 !ReusedIdx.insert(getInsertIndex(IE1).value_or(*Idx2)).second;
4123 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4124 IE1 = nullptr;
4125 else
4126 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4127 }
4128 if (IE2 && IE2 != VU) {
4129 IsReusedIdx |=
4130 !ReusedIdx.insert(getInsertIndex(IE2).value_or(*Idx1)).second;
4131 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4132 IE2 = nullptr;
4133 else
4134 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4135 }
4136 } while (!IsReusedIdx && (IE1 || IE2));
4137 return false;
4138}
4139
4140std::optional<BoUpSLP::OrdersType>
4141BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4142 // No need to reorder if need to shuffle reuses, still need to shuffle the
4143 // node.
4144 if (!TE.ReuseShuffleIndices.empty()) {
4145 // Check if reuse shuffle indices can be improved by reordering.
4146 // For this, check that reuse mask is "clustered", i.e. each scalar values
4147 // is used once in each submask of size <number_of_scalars>.
4148 // Example: 4 scalar values.
4149 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4150 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4151 // element 3 is used twice in the second submask.
4152 unsigned Sz = TE.Scalars.size();
4153 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4154 Sz))
4155 return std::nullopt;
4156 unsigned VF = TE.getVectorFactor();
4157 // Try build correct order for extractelement instructions.
4158 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4159 TE.ReuseShuffleIndices.end());
4160 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4161 all_of(TE.Scalars, [Sz](Value *V) {
4162 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4163 return Idx && *Idx < Sz;
4164 })) {
4165 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4166 if (TE.ReorderIndices.empty())
4167 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4168 else
4169 inversePermutation(TE.ReorderIndices, ReorderMask);
4170 for (unsigned I = 0; I < VF; ++I) {
4171 int &Idx = ReusedMask[I];
4172 if (Idx == PoisonMaskElem)
4173 continue;
4174 Value *V = TE.Scalars[ReorderMask[Idx]];
4175 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4176 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4177 }
4178 }
4179    // Build an order of VF size; the reuses shuffles need reordering since
4180    // they are always of VF size.
4181 OrdersType ResOrder(VF);
4182 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4183 auto *It = ResOrder.begin();
4184 for (unsigned K = 0; K < VF; K += Sz) {
4185 OrdersType CurrentOrder(TE.ReorderIndices);
4186 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4187 if (SubMask.front() == PoisonMaskElem)
4188 std::iota(SubMask.begin(), SubMask.end(), 0);
4189 reorderOrder(CurrentOrder, SubMask);
4190 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4191 std::advance(It, Sz);
4192 }
4193 if (all_of(enumerate(ResOrder),
4194 [](const auto &Data) { return Data.index() == Data.value(); }))
4195 return std::nullopt; // No need to reorder.
4196 return std::move(ResOrder);
4197 }
4198 if ((TE.State == TreeEntry::Vectorize ||
4199 TE.State == TreeEntry::PossibleStridedVectorize) &&
4200 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4201 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4202 !TE.isAltShuffle())
4203 return TE.ReorderIndices;
4204 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4205 auto PHICompare = [&](unsigned I1, unsigned I2) {
4206 Value *V1 = TE.Scalars[I1];
4207 Value *V2 = TE.Scalars[I2];
4208 if (V1 == V2)
4209 return false;
4210 if (!V1->hasOneUse() || !V2->hasOneUse())
4211 return false;
4212 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4213 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4214 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4215 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4216          if (!areTwoInsertFromSameBuildVector(
4217                  IE1, IE2,
4218 [](InsertElementInst *II) { return II->getOperand(0); }))
4219 return false;
4220 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4221 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4222 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4223 return false;
4224 return *Idx1 < *Idx2;
4225 }
4226 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4227 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4228 if (EE1->getOperand(0) != EE2->getOperand(0))
4229 return false;
4230 std::optional<unsigned> Idx1 = getExtractIndex(EE1);
4231 std::optional<unsigned> Idx2 = getExtractIndex(EE2);
4232 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4233 return false;
4234 return *Idx1 < *Idx2;
4235 }
4236 return false;
4237 };
4238 auto IsIdentityOrder = [](const OrdersType &Order) {
4239 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4240 if (Idx != Order[Idx])
4241 return false;
4242 return true;
4243 };
4244 if (!TE.ReorderIndices.empty())
4245 return TE.ReorderIndices;
4246    DenseMap<unsigned, unsigned> PhiToId;
4247    SmallVector<unsigned> Phis(TE.Scalars.size());
4248 std::iota(Phis.begin(), Phis.end(), 0);
4249 OrdersType ResOrder(TE.Scalars.size());
4250 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4251 PhiToId[Id] = Id;
4252 stable_sort(Phis, PHICompare);
4253 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4254 ResOrder[Id] = PhiToId[Phis[Id]];
4255 if (IsIdentityOrder(ResOrder))
4256 return std::nullopt; // No need to reorder.
4257 return std::move(ResOrder);
4258 }
4259 if (TE.State == TreeEntry::NeedToGather) {
4260 // TODO: add analysis of other gather nodes with extractelement
4261 // instructions and other values/instructions, not only undefs.
4262 if (((TE.getOpcode() == Instruction::ExtractElement &&
4263 !TE.isAltShuffle()) ||
4264 (all_of(TE.Scalars,
4265 [](Value *V) {
4266 return isa<UndefValue, ExtractElementInst>(V);
4267 }) &&
4268 any_of(TE.Scalars,
4269 [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
4270 all_of(TE.Scalars,
4271 [](Value *V) {
4272 auto *EE = dyn_cast<ExtractElementInst>(V);
4273 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4274 }) &&
4275 allSameType(TE.Scalars)) {
4276 // Check that gather of extractelements can be represented as
4277 // just a shuffle of a single vector.
4278 OrdersType CurrentOrder;
4279 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4280 /*ResizeAllowed=*/true);
4281 if (Reuse || !CurrentOrder.empty()) {
4282 if (!CurrentOrder.empty())
4283 fixupOrderingIndices(CurrentOrder);
4284 return std::move(CurrentOrder);
4285 }
4286 }
4287 // If the gather node is <undef, v, .., poison> and
4288 // insertelement poison, v, 0 [+ permute]
4289 // is cheaper than
4290 // insertelement poison, v, n - try to reorder.
4291 // If rotating the whole graph, exclude the permute cost, the whole graph
4292 // might be transformed.
4293 int Sz = TE.Scalars.size();
4294 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4295 count_if(TE.Scalars, UndefValue::classof) == Sz - 1) {
4296 const auto *It =
4297 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4298 if (It == TE.Scalars.begin())
4299 return OrdersType();
4300 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4301 if (It != TE.Scalars.end()) {
4302 OrdersType Order(Sz, Sz);
4303 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4304 Order[Idx] = 0;
4305 fixupOrderingIndices(Order);
4306 SmallVector<int> Mask;
4307 inversePermutation(Order, Mask);
4308 InstructionCost PermuteCost =
4309 TopToBottom
4310 ? 0
4311                : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
4312        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4313 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4314 PoisonValue::get(Ty), *It);
4315 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4316 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4317 PoisonValue::get(Ty), *It);
4318 if (InsertFirstCost + PermuteCost < InsertIdxCost)
4319 return std::move(Order);
4320 }
4321 }
4322 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4323 return CurrentOrder;
4324 if (TE.Scalars.size() >= 4)
4325 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4326 return Order;
4327 }
4328 return std::nullopt;
4329}
4330
4331/// Checks if the given mask is a "clustered" mask with the same clusters of
4332/// size \p Sz, which are not identity submasks.
4333static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4334                                               unsigned Sz) {
4335 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4336 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
4337 return false;
4338 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4339 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4340 if (Cluster != FirstCluster)
4341 return false;
4342 }
4343 return true;
4344}
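// Illustrative standalone sketch (not from the LLVM sources): the
// repeated-cluster test performed by isRepeatedNonIdentityClusteredMask
// above, on plain ints and without the poison-element handling of the real
// ShuffleVectorInst::isIdentityMask helper.
#include <algorithm>
#include <vector>

static bool isRepeatedNonIdentityCluster(const std::vector<int> &Mask,
                                         unsigned Sz) {
  if (Sz == 0 || Mask.size() % Sz != 0)
    return false;
  // The first cluster must not already be the identity submask.
  bool IsIdentity = true;
  for (unsigned I = 0; I < Sz; ++I)
    IsIdentity = IsIdentity && Mask[I] == static_cast<int>(I);
  if (IsIdentity)
    return false;
  // Every following cluster of size Sz must repeat the first one exactly.
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz)
    if (!std::equal(Mask.begin(), Mask.begin() + Sz, Mask.begin() + I))
      return false;
  return true;
}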
4345
4346void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4347 // Reorder reuses mask.
4348 reorderReuses(TE.ReuseShuffleIndices, Mask);
4349 const unsigned Sz = TE.Scalars.size();
4350  // For vectorized nodes and for non-clustered reuses, nothing else needs to
4351  // be done.
4351 if (TE.State != TreeEntry::NeedToGather ||
4352      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4353                                                   Sz) ||
4354 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4355 return;
4356 SmallVector<int> NewMask;
4357 inversePermutation(TE.ReorderIndices, NewMask);
4358 addMask(NewMask, TE.ReuseShuffleIndices);
4359 // Clear reorder since it is going to be applied to the new mask.
4360 TE.ReorderIndices.clear();
4361 // Try to improve gathered nodes with clustered reuses, if possible.
4362 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4363 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4364 inversePermutation(NewOrder, NewMask);
4365 reorderScalars(TE.Scalars, NewMask);
4366 // Fill the reuses mask with the identity submasks.
4367 for (auto *It = TE.ReuseShuffleIndices.begin(),
4368 *End = TE.ReuseShuffleIndices.end();
4369 It != End; std::advance(It, Sz))
4370 std::iota(It, std::next(It, Sz), 0);
4371}
4372
4373void BoUpSLP::reorderTopToBottom() {
4374  // Maps VF to the graph nodes.
4375  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
4376 // ExtractElement gather nodes which can be vectorized and need to handle
4377 // their ordering.
4378  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4379
4380 // Phi nodes can have preferred ordering based on their result users
4381  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
4382
4383 // AltShuffles can also have a preferred ordering that leads to fewer
4384 // instructions, e.g., the addsub instruction in x86.
4385 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4386
4387 // Maps a TreeEntry to the reorder indices of external users.
4388  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
4389      ExternalUserReorderMap;
4390 // FIXME: Workaround for syntax error reported by MSVC buildbots.
4391 TargetTransformInfo &TTIRef = *TTI;
4392 // Find all reorderable nodes with the given VF.
4393  // Currently these are vectorized stores, loads, extracts + some gathering
4394  // of extracts.
4395 for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
4396 &GathersToOrders, &ExternalUserReorderMap,
4397 &AltShufflesToOrders, &PhisToOrders](
4398 const std::unique_ptr<TreeEntry> &TE) {
4399 // Look for external users that will probably be vectorized.
4400 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4401 findExternalStoreUsersReorderIndices(TE.get());
4402 if (!ExternalUserReorderIndices.empty()) {
4403 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4404 ExternalUserReorderMap.try_emplace(TE.get(),
4405 std::move(ExternalUserReorderIndices));
4406 }
4407
4408 // Patterns like [fadd,fsub] can be combined into a single instruction in
4409 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4410 // to take into account their order when looking for the most used order.
4411 if (TE->isAltShuffle()) {
4412 VectorType *VecTy =
4413 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
4414 unsigned Opcode0 = TE->getOpcode();
4415 unsigned Opcode1 = TE->getAltOpcode();
4416 // The opcode mask selects between the two opcodes.
4417 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4418 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4419 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4420 OpcodeMask.set(Lane);
4421 // If this pattern is supported by the target then we consider the order.
4422 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4423 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4424 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
4425 }
4426 // TODO: Check the reverse order too.
4427 }
4428
4429 if (std::optional<OrdersType> CurrentOrder =
4430 getReorderingData(*TE, /*TopToBottom=*/true)) {
4431      // Do not include the ordering for nodes used in the alt opcode
4432      // vectorization; it is better to reorder them during the bottom-to-top
4433      // stage. Following the order here would cause reordering of the whole
4434      // graph, though actually it is profitable just to reorder the subgraph
4435      // that starts from the alternate opcode vectorization node. Such nodes
4436      // already end up with a shuffle instruction, and it is enough to change
4437      // this shuffle rather than rotate the scalars for the whole graph.
4438 unsigned Cnt = 0;
4439 const TreeEntry *UserTE = TE.get();
4440 while (UserTE && Cnt < RecursionMaxDepth) {
4441 if (UserTE->UserTreeIndices.size() != 1)
4442 break;
4443 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
4444 return EI.UserTE->State == TreeEntry::Vectorize &&
4445 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4446 }))
4447 return;
4448 UserTE = UserTE->UserTreeIndices.back().UserTE;
4449 ++Cnt;
4450 }
4451 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
4452 if (!(TE->State == TreeEntry::Vectorize ||
4453 TE->State == TreeEntry::PossibleStridedVectorize) ||
4454 !TE->ReuseShuffleIndices.empty())
4455 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4456 if (TE->State == TreeEntry::Vectorize &&
4457 TE->getOpcode() == Instruction::PHI)
4458 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
4459 }
4460 });
4461
4462 // Reorder the graph nodes according to their vectorization factor.
4463 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
4464 VF /= 2) {
4465 auto It = VFToOrderedEntries.find(VF);
4466 if (It == VFToOrderedEntries.end())
4467 continue;
4468    // Try to find the most profitable order. We are just looking for the most
4469    // used order and reorder scalar elements in the nodes according to this
4470    // most used order.
4471 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
4472 // All operands are reordered and used only in this node - propagate the
4473 // most used order to the user node.
4474    MapVector<OrdersType, unsigned,
4475              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
4476        OrdersUses;
4477 // Last chance orders - scatter vectorize. Try to use their orders if no
4478    // other orders exist or the order is counted already.
4479 SmallVector<OrdersType> StridedVectorizeOrders;
4481 for (const TreeEntry *OpTE : OrderedEntries) {
4482      // No need to reorder these nodes; we still need to extend and use a
4483      // shuffle, just merging the reordering shuffle and the reuse shuffle.
4484 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
4485 continue;
4486 // Count number of orders uses.
4487 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
4488 &PhisToOrders]() -> const OrdersType & {
4489 if (OpTE->State == TreeEntry::NeedToGather ||
4490 !OpTE->ReuseShuffleIndices.empty()) {
4491 auto It = GathersToOrders.find(OpTE);
4492 if (It != GathersToOrders.end())
4493 return It->second;
4494 }
4495 if (OpTE->isAltShuffle()) {
4496 auto It = AltShufflesToOrders.find(OpTE);
4497 if (It != AltShufflesToOrders.end())
4498 return It->second;
4499 }
4500 if (OpTE->State == TreeEntry::Vectorize &&
4501 OpTE->getOpcode() == Instruction::PHI) {
4502 auto It = PhisToOrders.find(OpTE);
4503 if (It != PhisToOrders.end())
4504 return It->second;
4505 }
4506 return OpTE->ReorderIndices;
4507 }();
4508 // First consider the order of the external scalar users.
4509 auto It = ExternalUserReorderMap.find(OpTE);
4510 if (It != ExternalUserReorderMap.end()) {
4511 const auto &ExternalUserReorderIndices = It->second;
4512        // If the OpTE vector factor != number of scalars, use the natural
4513        // order; this is an attempt to reorder a node with reused scalars but
4514        // with external uses.
4515 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
4516 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
4517 ExternalUserReorderIndices.size();
4518 } else {
4519 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
4520 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
4521 }
4522 // No other useful reorder data in this entry.
4523 if (Order.empty())
4524 continue;
4525 }
4526 // Postpone scatter orders.
4527 if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
4528 StridedVectorizeOrders.push_back(Order);
4529 continue;
4530 }
4531       // Stores actually store the mask, not the order; we need to invert it.
4532 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4533 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4534 SmallVector<int> Mask;
4535 inversePermutation(Order, Mask);
4536 unsigned E = Order.size();
4537 OrdersType CurrentOrder(E, E);
4538 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
4539 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
4540 });
4541 fixupOrderingIndices(CurrentOrder);
4542 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
4543 } else {
4544 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4545 }
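        // Worked example (editorial sketch): for a store node whose recorded
        // Order is actually the mask {2, 0, 1, 3}, inversePermutation() yields
        // Mask = {1, 2, 0, 3} (so that Mask[Order[i]] == i); with no poison
        // elements CurrentOrder becomes {1, 2, 0, 3}, and that inverted order
        // is what gets counted in OrdersUses, just like the orders of
        // non-store nodes in the else-branch above.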
4546 }
4547 // Set order of the user node.
4548 if (OrdersUses.empty()) {
4549 if (StridedVectorizeOrders.empty())
4550 continue;
4551 // Add (potentially!) strided vectorize orders.
4552 for (OrdersType &Order : StridedVectorizeOrders)
4553 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
4554 } else {
4555       // Account for (potentially!) strided vectorize orders only if they
4556       // were counted already.
4557 for (OrdersType &Order : StridedVectorizeOrders) {
4558 auto *It = OrdersUses.find(Order);
4559 if (It != OrdersUses.end())
4560 ++It->second;
4561 }
4562 }
4563 // Choose the most used order.
4564 ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
4565 unsigned Cnt = OrdersUses.front().second;
4566 for (const auto &Pair : drop_begin(OrdersUses)) {
4567 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
4568 BestOrder = Pair.first;
4569 Cnt = Pair.second;
4570 }
4571 }
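    // Editorial sketch: if OrdersUses ended up as { {1, 0, 3, 2} -> 2, {} -> 2 },
    // the tie-break above prefers the empty (natural) order, so BestOrder is
    // empty and the check below simply skips reordering for this VF.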
4572 // Set order of the user node.
4573 if (BestOrder.empty())
4574 continue;
4575 SmallVector<int> Mask;
4576 inversePermutation(BestOrder, Mask);
4577 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
4578 unsigned E = BestOrder.size();
4579 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
4580 return I < E ? static_cast<int>(I) : PoisonMaskElem;
4581 });
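    // Editorial sketch: for BestOrder = {1, 2, 0, 3}, inversePermutation()
    // produces Mask = {2, 0, 1, 3} (Mask[BestOrder[i]] == i), while MaskOrder
    // is just BestOrder rewritten as an int mask, with any sentinel equal to
    // BestOrder.size() replaced by PoisonMaskElem.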
4582 // Do an actual reordering, if profitable.
4583 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
4584 // Just do the reordering for the nodes with the given VF.
4585 if (TE->Scalars.size() != VF) {
4586 if (TE->ReuseShuffleIndices.size() == VF) {
4587 // Need to reorder the reuses masks of the operands with smaller VF to
4588 // be able to find the match between the graph nodes and scalar
4589 // operands of the given node during vectorization/cost estimation.
4590 assert(all_of(TE->UserTreeIndices,
4591 [VF, &TE](const EdgeInfo &EI) {
4592 return EI.UserTE->Scalars.size() == VF ||
4593 EI.UserTE->Scalars.size() ==
4594 TE->Scalars.size();
4595 }) &&
4596 "All users must be of VF size.");
4597 // Update ordering of the operands with the smaller VF than the given
4598 // one.
4599 reorderNodeWithReuses(*TE, Mask);
4600 }
4601 continue;
4602 }
4603 if ((TE->State == TreeEntry::Vectorize ||
4604 TE->State == TreeEntry::PossibleStridedVectorize) &&
4605           isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
4606               InsertElementInst>(TE->getMainOp()) &&
4607 !TE->isAltShuffle()) {
4608 // Build correct orders for extract{element,value}, loads and
4609 // stores.
4610 reorderOrder(TE->ReorderIndices, Mask);
4611 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
4612 TE->reorderOperands(Mask);
4613 } else {
4614 // Reorder the node and its operands.
4615 TE->reorderOperands(Mask);
4616 assert(TE->ReorderIndices.empty() &&
4617 "Expected empty reorder sequence.");
4618 reorderScalars(TE->Scalars, Mask);
4619 }
4620 if (!TE->ReuseShuffleIndices.empty()) {
4621 // Apply reversed order to keep the original ordering of the reused
4622 // elements to avoid extra reorder indices shuffling.
4623 OrdersType CurrentOrder;
4624 reorderOrder(CurrentOrder, MaskOrder);
4625 SmallVector<int> NewReuses;
4626 inversePermutation(CurrentOrder, NewReuses);
4627 addMask(NewReuses, TE->ReuseShuffleIndices);
4628 TE->ReuseShuffleIndices.swap(NewReuses);
4629 }
4630 }
4631 }
4632}
4633
4634bool BoUpSLP::canReorderOperands(
4635 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
4636 ArrayRef<TreeEntry *> ReorderableGathers,
4637 SmallVectorImpl<TreeEntry *> &GatherOps) {
4638 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
4639 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
4640 return OpData.first == I &&
4641 OpData.second->State == TreeEntry::Vectorize;
4642 }))
4643 continue;
4644 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
4645 // FIXME: Do not reorder (possible!) strided vectorized nodes, they
4646 // require reordering of the operands, which is not implemented yet.
4647 if (TE->State == TreeEntry::PossibleStridedVectorize)
4648 return false;
4649 // Do not reorder if operand node is used by many user nodes.
4650 if (any_of(TE->UserTreeIndices,
4651 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
4652 return false;
4653 // Add the node to the list of the ordered nodes with the identity
4654 // order.
4655 Edges.emplace_back(I, TE);
4656 // Add ScatterVectorize nodes to the list of operands, where just
4657 // reordering of the scalars is required. Similar to the gathers, so
4658 // simply add to the list of gathered ops.
4659 // If there are reused scalars, process this node as a regular vectorize
4660 // node, just reorder reuses mask.
4661 if (TE->State != TreeEntry::Vectorize &&
4662 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
4663 GatherOps.push_back(TE);
4664 continue;
4665 }
4666 TreeEntry *Gather = nullptr;
4667 if (count_if(ReorderableGathers,
4668 [&Gather, UserTE, I](TreeEntry *TE) {
4669 assert(TE->State != TreeEntry::Vectorize &&
4670 "Only non-vectorized nodes are expected.");
4671 if (any_of(TE->UserTreeIndices,
4672 [UserTE, I](const EdgeInfo &EI) {
4673 return EI.UserTE == UserTE && EI.EdgeIdx == I;
4674 })) {
4675 assert(TE->isSame(UserTE->getOperand(I)) &&
4676 "Operand entry does not match operands.");
4677 Gather = TE;
4678 return true;
4679 }
4680 return false;
4681 }) > 1 &&
4682 !allConstant(UserTE->getOperand(I)))
4683 return false;
4684 if (Gather)
4685 GatherOps.push_back(Gather);
4686 }
4687 return true;
4688}
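// Editorial sketch of the contract above (hypothetical operands, not upstream
// text): for a user node with operand 0 covered by a vectorized entry used
// only by this user and operand 1 covered by exactly one reorderable gather
// entry, canReorderOperands() returns true with Edges == {(0, <vectorized
// entry>)} and GatherOps == {<gather entry>}. It returns false as soon as it
// meets a PossibleStridedVectorize operand, a vectorized operand with other
// users, or a non-constant operand fed by more than one gather entry.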
4689
4690void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
4691   SetVector<TreeEntry *> OrderedEntries;
4692   DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4693 // Find all reorderable leaf nodes with the given VF.
4694   // Currently these are vectorized loads and extracts without alternate
4695   // operands, plus some gathering of extracts.
4696 SmallVector<TreeEntry *> NonVectorized;
4697 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
4698 if (TE->State != TreeEntry::Vectorize &&
4699 TE->State != TreeEntry::PossibleStridedVectorize)
4700 NonVectorized.push_back(TE.get());
4701 if (std::optional<OrdersType> CurrentOrder =
4702 getReorderingData(*TE, /*TopToBottom=*/false)) {
4703 OrderedEntries.insert(TE.get());
4704 if (!(TE->State == TreeEntry::Vectorize ||
4705 TE->State == TreeEntry::PossibleStridedVectorize) ||
4706 !TE->ReuseShuffleIndices.empty())
4707 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
4708 }
4709 }
4710
4711 // 1. Propagate order to the graph nodes, which use only reordered nodes.
4712   // I.e., if the node has operands that are reordered, try to keep at least
4713   // one operand in the natural order and reorder the others, plus reorder
4714   // the user node itself.
4715   SmallPtrSet<const TreeEntry *, 4> Visited;
4716   while (!OrderedEntries.empty()) {
4717 // 1. Filter out only reordered nodes.
4718 // 2. If the entry has multiple uses - skip it and jump to the next node.
4719     DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
4720     SmallVector<TreeEntry *> Filtered;
4721 for (TreeEntry *TE : OrderedEntries) {
4722 if (!(TE->State == TreeEntry::Vectorize ||
4723 TE->State == TreeEntry::PossibleStridedVectorize ||
4724 (TE->State == TreeEntry::NeedToGather &&
4725 GathersToOrders.count(TE))) ||
4726 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4727 !all_of(drop_begin(TE->UserTreeIndices),
4728 [TE](const EdgeInfo &EI) {
4729 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
4730 }) ||
4731 !Visited.insert(TE).second) {
4732 Filtered.push_back(TE);
4733 continue;
4734 }
4735       // Build a map between user nodes and their operands' order to speed up
4736       // the search. The graph currently does not provide this dependency directly.
4737 for (EdgeInfo &EI : TE->UserTreeIndices) {
4738 TreeEntry *UserTE = EI.UserTE;
4739 auto It = Users.find(UserTE);
4740 if (It == Users.end())
4741 It = Users.insert({UserTE, {}}).first;
4742 It->second.emplace_back(EI.EdgeIdx, TE);
4743 }
4744 }
4745 // Erase filtered entries.
4746 for (TreeEntry *TE : Filtered)
4747 OrderedEntries.remove(TE);
4748     SmallVector<
4749         std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
4750 UsersVec(Users.begin(), Users.end());
4751 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
4752 return Data1.first->Idx > Data2.first->Idx;
4753 });
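    // Editorial note (illustrative): sorting by decreasing Idx visits, e.g.,
    // entries with Idx 9, 5, 2 in exactly that sequence. Operand entries are
    // normally created after their users during tree construction and thus
    // have larger Idx values, so operands tend to be handled before the nodes
    // that use them - the bottom-to-top direction in the function name.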
4754 for (auto &Data : UsersVec) {
4755 // Check that operands are used only in the User node.
4756 SmallVector<TreeEntry *> GatherOps;
4757 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
4758 GatherOps)) {
4759 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
4760 OrderedEntries.remove(Op.second);
4761 continue;
4762 }
4763 // All operands are reordered and used only in this node - propagate the
4764 // most used order to the user node.
4765       MapVector<OrdersType, unsigned,
4766                 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
4767           OrdersUses;
4768 // Last chance orders - scatter vectorize. Try to use their orders if no
4769 // other orders or the order is counted already.
4770 SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
4771       // Do the analysis for each tree entry only once; otherwise the order of
4772       // the same node may be counted several times, even though it might not
4773       // be profitable.
4774       SmallPtrSet<const TreeEntry *, 4> VisitedOps;
4775       SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
4776 for (const auto &Op : Data.second) {
4777 TreeEntry *OpTE = Op.second;
4778 if (!VisitedOps.insert(OpTE).second)
4779 continue;
4780 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
4781 continue;
4782 const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
4783 if (OpTE->State == TreeEntry::NeedToGather ||
4784 !OpTE->ReuseShuffleIndices.empty())
4785 return GathersToOrders.find(OpTE)->second;
4786 return OpTE->ReorderIndices;
4787 }();
4788 unsigned NumOps = count_if(
4789 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
4790 return P.second == OpTE;
4791 });
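        // Editorial sketch: if the same operand entry OpTE feeds both operand
        // 0 and operand 1 of the user, NumOps is 2 and OpTE's order below is
        // counted with weight 2, rather than once per edge, while VisitedOps
        // makes sure the analysis itself runs only once per entry.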
4792 // Postpone scatter orders.
4793 if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
4794 StridedVectorizeOrders.emplace_back(Order, NumOps);
4795 continue;
4796 }
4797         // Stores actually store the mask, not the order; we need to invert it.
4798 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4799 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
4800 SmallVector<int> Mask;
4801 inversePermutation(Order, Mask);
4802 unsigned E = Order.size();
4803 OrdersType CurrentOrder(E, E);
4804 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
4805 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
4806 });
4807 fixupOrderingIndices(CurrentOrder);
4808 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
4809 NumOps;
4810 } else {
4811 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
4812 }
4813 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
4814 const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
4815 const TreeEntry *TE) {
4816 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4817 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
4818 (IgnoreReorder && TE->Idx == 0))
4819 return true;
4820 if (TE->State == TreeEntry::NeedToGather) {
4821 auto It = GathersToOrders.find(TE);
4822 if (It != GathersToOrders.end())
4823 return !It->second.empty();
4824 return true;
4825 }
4826 return false;
4827 };
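        // Editorial sketch: AllowsReordering() answers "can this node absorb a
        // new order?" - true for nodes that already carry ReorderIndices or
        // ReuseShuffleIndices, for alternate-opcode vectorize nodes, for the
        // root when IgnoreReorder is set, and for gathers that either have no
        // recorded order or whose recorded order is non-empty; false otherwise.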
4828 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
4829 TreeEntry *UserTE = EI.UserTE;
4830 if (!VisitedUsers.insert(UserTE).second)
4831 continue;
4832           // The user node may be reordered if it requires reordering, has
4833           // reused scalars, is an alternate-opcode vectorize node, or its
4834           // operand nodes require reordering.
4835 if (AllowsReordering(UserTE))
4836 continue;
4837           // Check if the users allow reordering.
4838           // Currently look up just 1 level of operands to avoid an increase
4839           // in compile time.
4840           // It is profitable to reorder if definitely more operands allow
4841           // reordering than require the natural order.
4842           ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
4843           if (static_cast<unsigned>(count_if(
4844 Ops, [UserTE, &AllowsReordering](
4845 const std::pair<unsigned, TreeEntry *> &Op) {
4846 return AllowsReordering(Op.second) &&
4847 all_of(Op.second->UserTreeIndices,
4848 [UserTE](const EdgeInfo &EI) {
4849 return EI.UserTE == UserTE;
4850 });
4851 })) <= Ops.size() / 2)
4852 ++Res.first->second;
4853 }
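          // Editorial sketch: Res is the counter bucket for the empty
          // (natural) order. If this user does not allow reordering and, say,
          // only 1 of its 3 operand slots both allows reordering and is used
          // exclusively by this user, then 1 <= 3 / 2 holds and the natural
          // order receives one extra vote, making an empty BestOrder (and thus
          // no reordering) more likely below.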
4854 }
4855 // If no orders - skip current nodes and jump to the next one, if any.
4856 if (OrdersUses.empty()) {
4857 if (StridedVectorizeOrders.empty() ||
4858 (Data.first->ReorderIndices.empty() &&
4859 Data.first->ReuseShuffleIndices.empty() &&
4860 !(IgnoreReorder &&
4861 Data.first == VectorizableTree.front().get()))) {
4862 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
4863 OrderedEntries.remove(Op.second);
4864 continue;
4865 }
4866 // Add (potentially!) strided vectorize orders.
4867 for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
4868 OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
4869 Pair.second;
4870 } else {
4871           // Account for (potentially!) strided vectorize orders only if they
4872           // were counted already.
4873 for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
4874 auto *It = OrdersUses.find(Pair.first);
4875 if (It != OrdersUses.end())
4876 It->second += Pair.second;
4877 }
4878 }
4879 // Choose the best order.
4880 ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
4881 unsigned Cnt = OrdersUses.front().second;
4882 for (const auto &Pair : drop_begin(OrdersUses)) {
4883 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
4884 BestOrder = Pair.first;
4885 Cnt = Pair.second;
4886 }
4887 }
4888 // Set order of the user node (reordering of operands and user nodes).
4889 if (BestOrder.empty()) {
4890 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
4891 OrderedEntries.remove(Op.second);
4892 continue;
4893 }
4894 // Erase operands from OrderedEntries list and adjust their orders.
4895 VisitedOps.clear();
4896 SmallVector<int> Mask;
4897 inversePermutation(BestOrder, Mask);
4898 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
4899 unsigned E = BestOrder.size();
4900 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
4901 return I < E ? static_cast<int>(I) : PoisonMaskElem;
4902 });
4903 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
4904 TreeEntry *TE = Op.second;
4905 OrderedEntries.remove(TE);
4906 if (!VisitedOps.insert(TE).second)
4907 continue;
4908 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
4909 reorderNodeWithReuses(*TE, Mask);
4910 continue;
4911 }
4912 // Gathers are processed separately.
4913 if (TE->State != TreeEntry::Vectorize &&
4914 TE->State != TreeEntry::PossibleStridedVectorize &&
4915 (TE->State != TreeEntry::ScatterVectorize ||
4916 TE->ReorderIndices.empty()))
4917 continue;
4918 assert((BestOrder.size() == TE->ReorderIndices.size() ||
4919 TE->ReorderIndices.empty()) &&
4920 "Non-matching sizes of user/operand entries.");
4921 reorderOrder(TE->ReorderIndices, Mask);
4922 if (IgnoreReorder && TE == VectorizableTree.front().get())
4923 IgnoreReorder = false;
4924 }
4925         // For gathers we just need to reorder their scalars.
4926 for (TreeEntry *Gather : GatherOps) {
4927 assert(Gather->ReorderIndices.empty() &&
4928 "Unexpected reordering of gathers.");
4929 if (!Gather->ReuseShuffleIndices.empty()) {
4930 // Just reorder reuses indices.
4931 reorderReuses(Gather->ReuseShuffleIndices, Mask);
4932 continue;
4933 }
4934 reorderScalars(Gather->Scalars, Mask);
4935 OrderedEntries.remove(Gather);
4936 }
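        // Editorial sketch: a gather operand whose Scalars are, hypothetically,
        // {%x, %y, %z, %w} is permuted in place with the same Mask as its
        // vectorized siblings, while a gather that already carries
        // ReuseShuffleIndices only has that reuse mask permuted and its stored
        // scalars stay untouched.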
4937 // Reorder operands of the user node and set the ordering for the user
4938 // node itself.
4939 if (Data.first->State != TreeEntry::Vectorize ||
4940 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
4941 Data.first->getMainOp()) ||
4942 Data.first->isAltShuffle())
4943 Data.first->reorderOperands(Mask);
4944 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
4945 Data.first->isAltShuffle() ||
4946 Data.first->State == TreeEntry::PossibleStridedVectorize) {
4947 reorderScalars(Data.first->Scalars, Mask);
4948 reorderOrder(Data.first->ReorderIndices, MaskOrder);
4949 if (Data.first->ReuseShuffleIndices.empty() &&
4950 !Data.first->ReorderIndices.empty() &&
4951 !Data.first->isAltShuffle()) {
4952 // Insert user node to the list to try to sink reordering deeper in
4953 // the graph.
4954 OrderedEntries.insert(Data.first);
4955 }
4956 } else {
4957 reorderOrder(Data.first->ReorderIndices, Mask);
4958 }
4959 }
4960 }
4961 // If the reordering is unnecessary, just remove the reorder.
4962 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
4963 VectorizableTree.front()->ReuseShuffleIndices.empty())
4964 VectorizableTree.front()->ReorderIndices.clear();
4965}
4966
4967 void BoUpSLP::buildExternalUses(
4968     const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
4969 // Collect the values that we need to extract from the tree.
4970 for (auto &TEPtr : VectorizableTree) {
4971 TreeEntry *Entry = TEPtr.get();
4972
4973 // No need to handle users of gathered values.
4974 if (Entry->State == TreeEntry::NeedToGather)
4975 continue;
4976
4977 // For each lane:
4978 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
4979 Value *Scalar = Entry->Scalars[Lane];
4980 if (!isa<Instruction>(Scalar))
4981 continue;
4982 int FoundLane = Entry->findLaneForValue(Scalar);
4983
4984 // Check if the scalar is externally used as an extra arg.
4985 auto ExtI = ExternallyUsedValues.find(Scalar);
4986 if (ExtI != ExternallyUsedValues.end()) {
4987 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
4988 << Lane << " from " << *Scalar << ".\n");
4989 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
4990 }
4991 for (User *U : Scalar->users()) {
4992 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
4993
4994 Instruction *UserInst = dyn_cast<Instruction>(U);
4995 if (!UserInst)
4996 continue;
4997
4998 if (isDeleted(UserInst))
4999 continue;
5000
5001 // Skip in-tree scalars that become vectors
5002 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5003 Value *UseScalar = UseEntry->Scalars[0];
5004 // Some in-tree scalars will remain as scalar in vectorized
5005 // instructions. If that is the case, the one in Lane 0 will
5006 // be used.
5007 if (UseScalar != U ||
5008 UseEntry->State == TreeEntry::ScatterVectorize ||
5009 UseEntry->State == TreeEntry::PossibleStridedVectorize ||
5010 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
5011 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5012 << ".\n");
5013 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
5014 continue;
5015 }
5016 }
5017
5018 // Ignore users in the user ignore list.
5019 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5020 continue;
5021
5022 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
5023 << Lane << " from " << *Scalar << ".\n");
5024 ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
5025 }
5026 }
5027 }
5028}
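// Editorial sketch (the IR values are hypothetical): if the tree vectorizes
// {%a0, %a1, %a2, %a3} and %a2 is also consumed by a scalar instruction
// %m = mul i32 %a2, 5 outside the tree, the loop above records
// ExternalUser(%a2, %m, /*FoundLane=*/2) so that an extractelement from lane 2
// of the vectorized value can be emitted later. Values listed in
// ExternallyUsedValues get a similar entry with a null user.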
5029
5030 DenseMap<Value *, SmallVector<StoreInst *>>
5031 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5032   DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5033 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5034 Value *V = TE->Scalars[Lane];
5035     // To save compilation time we don't visit the users of values that have
5036     // too many users.
5036 static constexpr unsigned UsersLimit = 4;
5037 if (V->hasNUsesOrMore(UsersLimit))
5038 break;
5039
5040 // Collect stores per pointer object.
5041 for (User *U : V->users()) {
5042 auto *SI = dyn_cast<StoreInst>(U);
5043 if (SI == nullptr || !SI->isSimple() ||
5044 !isValidElementType(SI->getValueOperand()->getType()))
5045 continue;
5046       // Skip the entry if it is already part of the tree.
5047 if (getTreeEntry(U))
5048 continue;
5049
5050 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5051 auto &StoresVec = PtrToStoresMap[