LLVM 23.0.0git
SLPVectorizer.cpp
Go to the documentation of this file.
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
32#include "llvm/ADT/Statistic.h"
33#include "llvm/ADT/iterator.h"
42#include "llvm/Analysis/Loads.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
75#ifdef EXPENSIVE_CHECKS
76#include "llvm/IR/Verifier.h"
77#endif
78#include "llvm/Pass.h"
83#include "llvm/Support/Debug.h"
95#include <algorithm>
96#include <cassert>
97#include <cstdint>
98#include <iterator>
99#include <map>
100#include <memory>
101#include <optional>
102#include <set>
103#include <string>
104#include <tuple>
105#include <utility>
106
107using namespace llvm;
108using namespace llvm::PatternMatch;
109using namespace slpvectorizer;
110using namespace std::placeholders;
111
112#define SV_NAME "slp-vectorizer"
113#define DEBUG_TYPE "SLP"
114
// Pass statistics: each STATISTIC pairs a counter name with its description.
115STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
116STATISTIC(NumStridedStoreChains, "Number of vectorized stride stores");
117STATISTIC(NumStoreChains, "Number of vector stores created");
118STATISTIC(NumVectorizedStores, "Number of vectorized stores");
119
// Debug counter registered under the name "slp-vectorized".
120DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
121 "Controls which SLP graphs should be vectorized.");
122
// Hidden boolean option "vectorize-slp"; initialized to true.
123static cl::opt<bool>
124 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
125 cl::desc("Run the SLP vectorization passes"));
126
// Hidden boolean option "slp-revec"; initialized to false. Other code in this
// file consults it when deciding whether vector-typed values are treated as
// elements (see isValidElementType and getValueType).
127static cl::opt<bool>
128 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
129 cl::desc("Enable vectorization for wider vector utilization"));
130
131static cl::opt<int>
133 cl::desc("Only vectorize if you gain more than this "
134 "number "));
135
// Hidden boolean option "slp-vectorize-hor"; initialized to true.
136static cl::opt<bool>
137ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
138 cl::desc("Attempt to vectorize horizontal reductions"));
139
141 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
142 cl::desc(
143 "Attempt to vectorize horizontal reductions feeding into a store"));
144
146 "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
147 cl::desc("Improve the code quality by splitting alternate instructions"));
148
150 "slp-inst-count-check", cl::init(true), cl::Hidden,
151 cl::desc("Reject vectorization if vector instruction count exceeds "
152 "scalar instruction count"));
153
154static cl::opt<int>
156 cl::desc("Attempt to vectorize for this register size in bits"));
157
160 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
161
162/// Limits the size of scheduling regions in a block.
163/// It avoids long compile times for _very_ large blocks where vector
164/// instructions are spread over a wide range.
165/// This limit is way higher than needed by real-world functions.
// Hidden integer option "slp-schedule-budget"; initialized to 100000.
166static cl::opt<int>
167ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
168 cl::desc("Limit the size of the SLP scheduling region per block"));
169
171 "slp-min-reg-size", cl::init(128), cl::Hidden,
172 cl::desc("Attempt to vectorize for this register size in bits"));
173
175 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
176 cl::desc("Limit the recursion depth when building a vectorizable tree"));
177
179 "slp-min-tree-size", cl::init(3), cl::Hidden,
180 cl::desc("Only vectorize small trees if they are fully vectorizable"));
181
183 "slp-phi-vectorization-budget", cl::init(1024), cl::Hidden,
184 cl::desc("Do not vectorize a bundle of PHI nodes if the product of the "
185 "bundle size and the number of incoming values exceeds this "
186 "value, to limit the compile time spent on wide PHIs"));
187
188// The maximum depth that the look-ahead score heuristic will explore.
189// The higher this value, the higher the compilation time overhead.
191 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
192 cl::desc("The maximum look-ahead depth for operand reordering scores"));
193
194// The maximum depth that the look-ahead score heuristic will explore
195// when it is probing among candidates for vectorization tree roots.
196// The higher this value, the higher the compilation time overhead but unlike
197// similar limit for operands ordering this is less frequently used, hence
198// impact of higher value is less noticeable.
200 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
201 cl::desc("The maximum look-ahead depth for searching best rooting option"));
202
204 "slp-min-strided-loads", cl::init(2), cl::Hidden,
205 cl::desc("The minimum number of loads, which should be considered strided, "
206 "if the stride is > 1 or is runtime value"));
207
209 "slp-min-strided-stores", cl::init(2), cl::Hidden,
210 cl::desc(
211 "The minimum number of stores, which should be considered strided, "
212 "if the stride is > 1 or is runtime value"));
213
215 "slp-max-stride", cl::init(8), cl::Hidden,
216 cl::desc("The maximum stride, considered to be profitable."));
217
218static cl::opt<bool>
219 EnableStridedStores("slp-enable-strided-stores", cl::init(false),
221 cl::desc("Enable SLP trees to be built from strided "
222 "store chains."));
223
// Hidden boolean option "slp-disable-tree-reorder"; initialized to false.
// Testing aid, per its description.
224static cl::opt<bool>
225 DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
226 cl::desc("Disable tree reordering even if it is "
227 "profitable. Used for testing only."));
228
// Hidden boolean option "slp-force-strided-loads"; initialized to false.
// Testing aid, per its description.
229static cl::opt<bool>
230 ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
231 cl::desc("Generate strided loads even if they are not "
232 "profitable. Used for testing only."));
233
// Hidden boolean option "view-slp-tree"; no explicit cl::init is given here.
234static cl::opt<bool>
235 ViewSLPTree("view-slp-tree", cl::Hidden,
236 cl::desc("Display the SLP trees with Graphviz"));
237
239 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
240 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
241
243 "slp-postprocess-stores-operands", cl::init(false), cl::Hidden,
244 cl::desc("Force vectorization of non-vectorizable stores operands."));
245
247 "slp-non-vectorizables-as-reductions", cl::init(false), cl::Hidden,
248 cl::desc(
249 "Use non-vectorizable instructions as potential reduction roots."));
250
251/// True when \p slp-vectorize-non-power-of-2 is enabled and \p NumElts is a
252/// supported non-power-of-2 width: \p NumElts + 1 must be a power of two
253/// (e.g. 3 or 7 lanes, i.e. almost a full power-of-2 register).
254static bool isAllowedNonPowerOf2VF(unsigned NumElts) {
255 return VectorizeNonPowerOf2 && has_single_bit(NumElts + 1);
256}
257
258/// Enables vectorization of copyable elements.
260 "slp-copyable-elements", cl::init(true), cl::Hidden,
261 cl::desc("Try to replace values with the idempotent instructions for "
262 "better vectorization."));
263
265 "slp-cost-loop-trip-count", cl::init(2), cl::Hidden,
266 cl::desc("Loop trip count, considered by the cost model during "
267 "modeling (0=loops are ignored and considered flat code)"));
268
269/// Refine the loop-aware cost scaling of gather/buildvector tree entries by
270/// using the per-lane execution scale of the operand that feeds each lane,
271/// instead of a single whole-entry scale. This matches the LICM hoisting
272/// performed by optimizeGatherSequence() at codegen time: lanes whose
273/// operands are loop-invariant in an inner loop contribute the outer loop's
274/// execution scale rather than the inner loop's, which avoids over-costing
275/// buildvectors that bridge values from outer loop nests into an inner loop.
277 "slp-per-lane-gather-scale", cl::init(true), cl::Hidden,
278 cl::desc("Use per-lane execution scale for gather/buildvector tree "
279 "entries to model LICM-hoistable buildvector sequences."));
280
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static constexpr unsigned AliasedCheckLimit = 10;

// Limit on the number of uses of potentially transformed instructions/values,
// used in checks to avoid a compile-time explosion.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: the maximum distance between load/store
// instructions for which alias checks are done.
// This limit is useful for very large basic blocks.
static constexpr unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static constexpr int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static constexpr unsigned MaxPHINumOperands = 128;
300
301/// Predicate for the element types that the SLP vectorizer supports.
302///
303/// The most important thing to filter here are types which are invalid in LLVM
304/// vectors. We also filter target specific types which have absolutely no
305/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
306/// avoids spending time checking the cost model and realizing that they will
307/// be inevitably scalarized.
308static bool isValidElementType(Type *Ty) {
309 // TODO: Support ScalableVectorType.
310 if (SLPReVec && isVectorizedTy(Ty) && !getVectorizedTypeVF(Ty).isScalable())
311 Ty = toScalarizedTy(Ty);
312 return canVectorizeTy(Ty) && !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty() &&
313 !Ty->isVoidTy();
314}
315
316/// Returns the "element type" of the given value/instruction \p V.
317/// For stores, returns the stored value type; for insertelement (when ReVec is
318/// off), the inserted operand type. For compares, the default is to return the
319/// result type (i1); when \p LookThroughCmp is true, returns the type of the
320/// compared operands instead, which is needed for vector width calculations
321/// (the width is determined by the operand type, not the i1 result).
322static Type *getValueType(Value *V, bool LookThroughCmp = false) {
323 if (auto *SI = dyn_cast<StoreInst>(V))
324 return SI->getValueOperand()->getType();
325 if (LookThroughCmp)
326 if (auto *CI = dyn_cast<CmpInst>(V))
327 return CI->getOperand(0)->getType();
328 if (!SLPReVec)
329 if (auto *IE = dyn_cast<InsertElementInst>(V))
330 return IE->getOperand(1)->getType();
331 return V->getType();
332}
333
334/// \returns the number of elements for Ty.
335static unsigned getNumElements(Type *Ty) {
337 "ScalableVectorType is not supported.");
338 if (isVectorizedTy(Ty))
340 return 1;
341}
342
343/// \returns the vector type of ScalarTy based on vectorization factor.
344static Type *getWidenedType(Type *ScalarTy, unsigned VF) {
345 if (VF == 1 && !isVectorizedTy(ScalarTy)) {
346 // Workaround for 1 x vector types: toVectorizedTy returns the type
347 // unchanged when EC is scalar, but BoUpSLP relies on widening to
348 // <1 x ScalarTy> (or struct of <1 x ElTy>) to keep the rest of the
349 // pipeline operating on vector types.
350 if (auto *StructTy = dyn_cast<StructType>(ScalarTy)) {
352 "expected unpacked struct literal");
353 assert(all_of(StructTy->elements(), VectorType::isValidElementType) &&
354 "expected all element types to be valid vector element types");
355 return StructType::get(
356 StructTy->getContext(),
357 map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
358 return FixedVectorType::get(ElTy, 1);
359 }));
360 }
361 return FixedVectorType::get(ScalarTy, 1);
362 }
363 return toVectorizedTy(toScalarizedTy(ScalarTy),
364 ElementCount::getFixed(VF * getNumElements(ScalarTy)));
365}
366
367/// Returns a number of elements of the given type \p Ty, not less than \p Sz,
368/// that forms a vector type which \p TTI splits into whole vector types during
369/// legalization.
371 Type *Ty, unsigned Sz) {
372 if (!isValidElementType(Ty) || isa<StructType>(Ty))
373 return bit_ceil(Sz);
374 // Find the number of elements, which forms full vectors.
375 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
376 if (NumParts == 0 || NumParts >= Sz)
377 return bit_ceil(Sz);
378 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
379}
380
381/// Returns a number of elements of the given type \p Ty, not greater than \p
382/// Sz, that forms a vector type which \p TTI splits into whole vector types
383/// during legalization.
384static unsigned
386 unsigned Sz) {
387 if (!isValidElementType(Ty) || isa<StructType>(Ty))
388 return bit_floor(Sz);
389 // Find the number of elements, which forms full vectors.
390 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
391 if (NumParts == 0 || NumParts >= Sz)
392 return bit_floor(Sz);
393 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
394 if (RegVF > Sz)
395 return bit_floor(Sz);
396 return (Sz / RegVF) * RegVF;
397}
398
399static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
400 SmallVectorImpl<int> &Mask) {
401 // The ShuffleBuilder implementation use shufflevector to splat an "element".
402 // But the element have different meaning for SLP (scalar) and REVEC
403 // (vector). We need to expand Mask into masks which shufflevector can use
404 // directly.
405 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
406 for (unsigned I : seq<unsigned>(Mask.size()))
407 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
408 I * VecTyNumElements, VecTyNumElements)))
409 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
410 : Mask[I] * VecTyNumElements + J;
411 Mask.swap(NewMask);
412}
413
414/// \returns the number of shufflevector groups in the given list.
415/// A group has the following features:
416/// 1. All values in the group are shufflevector instructions.
417/// 2. The mask of every shufflevector satisfies isExtractSubvectorMask.
418/// 3. Together, the masks of the group use all of the elements of the source.
419/// e.g., it is 1 group (%0)
420/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
421/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
422/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
423/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
424/// it is 2 groups (%3 and %4)
425/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
426/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
427/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
428/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
429/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
430/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
431/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
432/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
433/// it is 0 group
434/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
435/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
436/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
437/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
439 if (VL.empty())
440 return 0;
442 return 0;
443 auto *SV = cast<ShuffleVectorInst>(VL.front());
444 unsigned SVNumElements =
445 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
446 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
447 if (SVNumElements % ShuffleMaskSize != 0)
448 return 0;
449 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
450 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
451 return 0;
452 unsigned NumGroup = 0;
453 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
454 auto *SV = cast<ShuffleVectorInst>(VL[I]);
455 Value *Src = SV->getOperand(0);
456 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
457 SmallBitVector ExpectedIndex(GroupSize);
458 if (!all_of(Group, [&](Value *V) {
459 auto *SV = cast<ShuffleVectorInst>(V);
460 // From the same source.
461 if (SV->getOperand(0) != Src)
462 return false;
463 int Index;
464 if (!SV->isExtractSubvectorMask(Index))
465 return false;
466 ExpectedIndex.set(Index / ShuffleMaskSize);
467 return true;
468 }))
469 return 0;
470 if (!ExpectedIndex.all())
471 return 0;
472 ++NumGroup;
473 }
474 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
475 return NumGroup;
476}
477
478/// \returns a shufflevector mask which is used to vectorize shufflevectors
479/// e.g.,
480/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
481/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
482/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
483/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
484/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
485/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
486/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
487/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
488/// the result is
489/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
491 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
492 auto *SV = cast<ShuffleVectorInst>(VL.front());
493 unsigned SVNumElements =
494 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
495 SmallVector<int> Mask;
496 unsigned AccumulateLength = 0;
497 for (Value *V : VL) {
498 auto *SV = cast<ShuffleVectorInst>(V);
499 for (int M : SV->getShuffleMask())
500 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
501 : AccumulateLength + M);
502 AccumulateLength += SVNumElements;
503 }
504 return Mask;
505}
506
507/// \returns True if the value is a constant (but not globals/constant
508/// expressions).
509static bool isConstant(Value *V) {
511}
512
513/// Checks if \p V is one of vector-like instructions, i.e. undef,
514/// insertelement/extractelement with constant indices for fixed vector type or
515/// extractvalue instruction.
519 return false;
520 auto *I = dyn_cast<Instruction>(V);
521 if (!I || isa<ExtractValueInst>(I))
522 return true;
523 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
524 return false;
526 return isConstant(I->getOperand(1));
527 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
528 return isConstant(I->getOperand(2));
529}
530
531/// Returns power-of-2 number of elements in a single register (part), given the
532/// total number of elements \p Size and number of registers (parts) \p
533/// NumParts.
534static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
535 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
536}
537
/// Computes how many elements belong to register (part) number \p Part, given
/// \p Size elements in total and \p PartNumElems elements per full register:
/// either a full \p PartNumElems or, if fewer are left starting at this part,
/// the remainder.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  const unsigned Remaining = Size - Part * PartNumElems;
  return Remaining < PartNumElems ? Remaining : PartNumElems;
}
545
546#if !defined(NDEBUG)
547/// Print a short descriptor of the instruction bundle suitable for debug output.
548static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
549 std::string Result;
550 raw_string_ostream OS(Result);
551 if (Idx >= 0)
552 OS << "Idx: " << Idx << ", ";
553 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
554 return Result;
555}
556#endif
557
558/// \returns true if all of the instructions in \p VL are in the same block or
559/// false otherwise.
561 auto *It = find_if(VL, IsaPred<Instruction>);
562 if (It == VL.end())
563 return false;
566 return true;
567
568 BasicBlock *BB = I0->getParent();
569 for (Value *V : iterator_range(It, VL.end())) {
570 if (isa<PoisonValue>(V))
571 continue;
572 auto *II = dyn_cast<Instruction>(V);
573 if (!II)
574 return false;
575
576 if (BB != II->getParent())
577 return false;
578 }
579 return true;
580}
581
582/// \returns True if all of the values in \p VL are constants (but not
583/// globals/constant expressions).
585 // Constant expressions and globals can't be vectorized like normal integer/FP
586 // constants.
587 return all_of(VL, isConstant);
588}
589
590/// \returns True if all of the values in \p VL are identical or some of them
591/// are UndefValue.
592static bool isSplat(ArrayRef<Value *> VL) {
593 Value *FirstNonUndef = nullptr;
594 for (Value *V : VL) {
595 if (isa<UndefValue>(V))
596 continue;
597 if (!FirstNonUndef) {
598 FirstNonUndef = V;
599 continue;
600 }
601 if (V != FirstNonUndef)
602 return false;
603 }
604 return FirstNonUndef != nullptr;
605}
606
607/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
608/// For BinaryOperator, it also checks if \p InstWithUses is used in specific
609/// patterns that make it effectively commutative (like equality comparisons
610/// with zero).
611/// In most cases, users should not call this function directly (since \p I and
612/// \p InstWithUses are the same). However, when analyzing interchangeable
613/// instructions, we need to use the converted opcode along with the original
614/// uses.
615/// \param I The instruction to check for commutativity
616/// \param ValWithUses The value whose uses are analyzed for special
617/// patterns
618static bool isCommutative(Instruction *I, Value *ValWithUses,
619 bool IsCopyable = false) {
620 if (auto *Cmp = dyn_cast<CmpInst>(I))
621 return Cmp->isCommutative();
622 if (auto *BO = dyn_cast<BinaryOperator>(I))
623 return BO->isCommutative() ||
624 (BO->getOpcode() == Instruction::Sub &&
625 ValWithUses->hasUseList() &&
626 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
627 all_of(
628 ValWithUses->uses(),
629 [&](const Use &U) {
630 // Commutative, if icmp eq/ne sub, 0
631 CmpPredicate Pred;
632 if (match(U.getUser(),
633 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
634 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
635 return true;
636 // Commutative, if abs(sub nsw, true) or abs(sub, false).
637 ConstantInt *Flag;
638 auto *I = dyn_cast<BinaryOperator>(U.get());
639 return match(U.getUser(),
640 m_Intrinsic<Intrinsic::abs>(
641 m_Specific(U.get()), m_ConstantInt(Flag))) &&
642 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
643 Flag->isOne());
644 })) ||
645 (BO->getOpcode() == Instruction::FSub &&
646 ValWithUses->hasUseList() &&
647 !ValWithUses->hasNUsesOrMore(UsesLimit) &&
648 all_of(ValWithUses->uses(), [](const Use &U) {
649 return match(U.getUser(),
650 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
651 }));
652 return I->isCommutative();
653}
654
655/// Checks if the operand is commutative. In commutative operations, not all
656/// operands might commutable, e.g. for fmuladd only 2 first operands are
657/// commutable.
658static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
659 bool IsCopyable = false) {
660 assert(::isCommutative(I, ValWithUses, IsCopyable) &&
661 "The instruction is not commutative.");
662 if (isa<CmpInst>(I))
663 return true;
664 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
665 switch (BO->getOpcode()) {
666 case Instruction::Sub:
667 case Instruction::FSub:
668 return true;
669 default:
670 break;
671 }
672 }
673 return I->isCommutableOperand(Op);
674}
675
676/// This is a helper function to check whether \p I is commutative.
677/// This is a convenience wrapper that calls the two-parameter version of
678/// isCommutative with the same instruction for both parameters. This is
679/// the common case where the instruction being checked for commutativity
680/// is the same as the instruction whose uses are analyzed for special
681/// patterns (see the two-parameter version above for details).
682/// \param I The instruction to check for commutativity
683/// \returns true if the instruction is commutative, false otherwise
684static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
685
686/// \returns number of operands of \p I, considering commutativity. Returns 2
687/// for commutative intrinsics.
688/// \param I The instruction to check for commutativity
691 // IntrinsicInst::isCommutative returns true if swapping the first "two"
692 // arguments to the intrinsic produces the same result.
693 constexpr unsigned IntrinsicNumOperands = 2;
694 return IntrinsicNumOperands;
695 }
696 return I->getNumOperands();
697}
698
699template <typename T>
700static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
701 unsigned Offset) {
702 static_assert(std::is_same_v<T, InsertElementInst> ||
703 std::is_same_v<T, ExtractElementInst>,
704 "unsupported T");
705 int Index = Offset;
706 if (const auto *IE = dyn_cast<T>(Inst)) {
707 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
708 if (!VT)
709 return std::nullopt;
710 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
711 if (!CI)
712 return std::nullopt;
713 if (CI->getValue().uge(VT->getNumElements()))
714 return std::nullopt;
715 Index *= VT->getNumElements();
716 Index += CI->getZExtValue();
717 return Index;
718 }
719 return std::nullopt;
720}
721
722/// \returns inserting or extracting index of InsertElement, ExtractElement or
723/// InsertValue instruction, using Offset as base offset for index.
724/// \returns std::nullopt if the index is not an immediate.
725static std::optional<unsigned> getElementIndex(const Value *Inst,
726 unsigned Offset = 0) {
727 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
728 return Index;
730 return Index;
731
732 int Index = Offset;
733
734 const auto *IV = dyn_cast<InsertValueInst>(Inst);
735 if (!IV)
736 return std::nullopt;
737
738 Type *CurrentType = IV->getType();
739 for (unsigned I : IV->indices()) {
740 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
741 Index *= ST->getNumElements();
742 CurrentType = ST->getElementType(I);
743 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
744 Index *= AT->getNumElements();
745 CurrentType = AT->getElementType();
746 } else {
747 return std::nullopt;
748 }
749 Index += I;
750 }
751 return Index;
752}
753
754/// \returns true if all of the values in \p VL use the same opcode.
755/// For comparison instructions, also checks if predicates match.
756/// PoisonValues are considered matching.
757/// Interchangeable instructions are not considered.
759 auto *It = find_if(VL, IsaPred<Instruction>);
760 if (It == VL.end())
761 return true;
762 Instruction *MainOp = cast<Instruction>(*It);
763 unsigned Opcode = MainOp->getOpcode();
764 bool IsCmpOp = isa<CmpInst>(MainOp);
765 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
767 return std::all_of(It, VL.end(), [&](Value *V) {
768 if (auto *CI = dyn_cast<CmpInst>(V))
769 return BasePred == CI->getPredicate();
770 if (auto *I = dyn_cast<Instruction>(V))
771 return I->getOpcode() == Opcode;
772 return isa<PoisonValue>(V);
773 });
774}
775
namespace {
/// Selects how a shuffle mask is analyzed for undef/poisonous elements.
enum class UseMask {
  /// The mask is expected to be for a permutation of 1-2 vectors; check the
  /// mask elements that refer to the first argument (mask indices are in range
  /// [0:VF)).
  FirstArg,
  /// The mask is expected to be for a permutation of 2 vectors; check the mask
  /// elements that refer to the second argument (mask indices are in range
  /// [VF:2*VF)).
  SecondArg,
  /// Consider undef mask elements (-1) as placeholders for future shuffle
  /// elements and mark them as ones as being used in future. Non-undef
  /// elements are considered as unused since they're already marked as used in
  /// the mask.
  UndefsAsMask
};
} // namespace
792
793/// Prepares a use bitset for the given mask either for the first argument or
794/// for the second.
796 UseMask MaskArg) {
797 SmallBitVector UseMask(VF, true);
798 for (auto [Idx, Value] : enumerate(Mask)) {
799 if (Value == PoisonMaskElem) {
800 if (MaskArg == UseMask::UndefsAsMask)
801 UseMask.reset(Idx);
802 continue;
803 }
804 if (MaskArg == UseMask::FirstArg && Value < VF)
805 UseMask.reset(Value);
806 else if (MaskArg == UseMask::SecondArg && Value >= VF)
807 UseMask.reset(Value - VF);
808 }
809 return UseMask;
810}
811
812/// Checks if the given value is actually an undefined constant vector.
813/// Also, if the \p UseMask is not empty, tries to check if the non-masked
814/// elements actually mask the insertelement buildvector, if any.
815template <bool IsPoisonOnly = false>
817 const SmallBitVector &UseMask = {}) {
818 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
819 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
820 if (isa<T>(V))
821 return Res;
822 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
823 if (!VecTy)
824 return Res.reset();
825 auto *C = dyn_cast<Constant>(V);
826 if (!C) {
827 if (!UseMask.empty()) {
828 const Value *Base = V;
829 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
830 Base = II->getOperand(0);
831 if (isa<T>(II->getOperand(1)))
832 continue;
833 std::optional<unsigned> Idx = getElementIndex(II);
834 if (!Idx) {
835 Res.reset();
836 return Res;
837 }
838 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
839 Res.reset(*Idx);
840 }
841 // TODO: Add analysis for shuffles here too.
842 if (V == Base) {
843 Res.reset();
844 } else {
845 SmallBitVector SubMask(UseMask.size(), false);
846 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
847 }
848 } else {
849 Res.reset();
850 }
851 return Res;
852 }
853 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
854 if (Constant *Elem = C->getAggregateElement(I))
855 if (!isa<T>(Elem) &&
856 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
857 Res.reset(I);
858 }
859 return Res;
860}
861
862/// Checks if the vector of instructions can be represented as a shuffle, like:
863/// %x0 = extractelement <4 x i8> %x, i32 0
864/// %x3 = extractelement <4 x i8> %x, i32 3
865/// %y1 = extractelement <4 x i8> %y, i32 1
866/// %y2 = extractelement <4 x i8> %y, i32 2
867/// %x0x0 = mul i8 %x0, %x0
868/// %x3x3 = mul i8 %x3, %x3
869/// %y1y1 = mul i8 %y1, %y1
870/// %y2y2 = mul i8 %y2, %y2
871/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
872/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
873/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
874/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
875/// ret <4 x i8> %ins4
876/// can be transformed into:
877/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
878/// i32 6>
879/// %2 = mul <4 x i8> %1, %1
880/// ret <4 x i8> %2
881/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// Returns std::nullopt when VL contains no extractelement, when a source
/// vector is scalable, when an extract index is neither undef nor a constant
/// integer, or when more than two distinct source vectors are needed.
882/// TODO: Can we split off and reuse the shuffle mask detection from
883/// ShuffleVectorInst/getShuffleCost?
884static std::optional<TargetTransformInfo::ShuffleKind>
// NOTE(review): listing line 885 (the function name and its leading
// parameters) is absent from this extract; the body below reads `VL`, `Mask`
// and `AC`.
886 AssumptionCache *AC) {
887 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
888 if (It == VL.end())
889 return std::nullopt;
 // Size is the largest element count among the fixed-width source vectors of
 // the extractelement instructions in VL (0 if there are none).
890 unsigned Size = accumulate(VL, 0u, [](unsigned S, Value *V) {
891 auto *EI = dyn_cast<ExtractElementInst>(V);
892 if (!EI)
893 return S;
894 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
895 if (!VTy)
896 return S;
897 return std::max(S, VTy->getNumElements());
898 });
899
900 Value *Vec1 = nullptr;
901 Value *Vec2 = nullptr;
 // True if at least one extract reads from a vector that is not an UndefValue
 // and is guaranteed not to be poison.
902 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
903 auto *EE = dyn_cast<ExtractElementInst>(V);
904 if (!EE)
905 return false;
906 Value *Vec = EE->getVectorOperand();
907 if (isa<UndefValue>(Vec))
908 return false;
909 return isGuaranteedNotToBePoison(Vec, AC);
910 });
911 enum ShuffleMode { Unknown, Select, Permute };
912 ShuffleMode CommonShuffleMode = Unknown;
 // Every lane starts as poison; lanes skipped via `continue` below keep it.
913 Mask.assign(VL.size(), PoisonMaskElem);
914 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
915 // Undef can be represented as an undef element in a vector.
916 if (isa<UndefValue>(VL[I]))
917 continue;
 // Any non-undef entry is assumed to be an extractelement (cast<> asserts).
918 auto *EI = cast<ExtractElementInst>(VL[I]);
919 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
920 return std::nullopt;
921 auto *Vec = EI->getVectorOperand();
922 // We can extractelement from undef or poison vector.
 // NOTE(review): listing line 923 (the condition guarding this `continue`)
 // is absent from this extract.
924 continue;
925 // All vector operands must have the same number of vector elements.
926 if (isa<UndefValue>(Vec)) {
927 Mask[I] = I;
928 } else {
929 if (isa<UndefValue>(EI->getIndexOperand()))
930 continue;
931 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
932 if (!Idx)
933 return std::nullopt;
934 // Undefined behavior if Idx is negative or >= Size.
935 if (Idx->getValue().uge(Size))
936 continue;
937 unsigned IntIdx = Idx->getValue().getZExtValue();
938 Mask[I] = IntIdx;
939 }
 // A fully-undef source does not occupy one of the two vector slots when
 // some other extract reads from a known non-poison vector.
940 if (isUndefVector(Vec).all() && HasNonUndefVec)
941 continue;
942 // For correct shuffling we have to have at most 2 different vector operands
943 // in all extractelement instructions.
944 if (!Vec1 || Vec1 == Vec) {
945 Vec1 = Vec;
946 } else if (!Vec2 || Vec2 == Vec) {
947 Vec2 = Vec;
 // Lanes taken from the second vector are offset by Size in the mask.
948 Mask[I] += Size;
949 } else {
950 return std::nullopt;
951 }
 // Once a permutation has been seen the mode can no longer change.
952 if (CommonShuffleMode == Permute)
953 continue;
954 // If the extract index is not the same as the operation number, it is a
955 // permutation.
956 if (Mask[I] % Size != I) {
957 CommonShuffleMode = Permute;
958 continue;
959 }
960 CommonShuffleMode = Select;
961 }
962 // If we're not crossing lanes in different vectors, consider it as blending.
963 if (CommonShuffleMode == Select && Vec2)
 // NOTE(review): listing line 964 (the statement controlled by the `if`
 // above) is absent from this extract.
965 // If Vec2 was never used, we have a permutation of a single vector, otherwise
966 // we have permutation of 2 vectors.
 // NOTE(review): listing lines 967-968 (the final return) are absent from
 // this extract.
969}
970
971/// \returns True if Extract{Value,Element} instruction extracts element Idx.
972static std::optional<unsigned> getExtractIndex(const Instruction *E) {
973 unsigned Opcode = E->getOpcode();
974 assert((Opcode == Instruction::ExtractElement ||
975 Opcode == Instruction::ExtractValue) &&
976 "Expected extractelement or extractvalue instruction.");
977 if (Opcode == Instruction::ExtractElement) {
978 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
979 if (!CI)
980 return std::nullopt;
981 // Check if the index is out of bound - we can get the source vector from
982 // operand 0
983 unsigned Idx = CI->getZExtValue();
984 auto *EE = cast<ExtractElementInst>(E);
985 const unsigned VF = ::getNumElements(EE->getVectorOperandType());
986 if (Idx >= VF)
987 return std::nullopt;
988 return Idx;
989 }
990 auto *EI = cast<ExtractValueInst>(E);
991 if (EI->getNumIndices() != 1)
992 return std::nullopt;
993 return *EI->idx_begin();
994}
995
996/// Checks if the provided value does not require scheduling. It does not
997/// require scheduling if this is not an instruction or it is an instruction
998/// that does not read/write memory and all operands are either not instructions
999/// or phi nodes or instructions from different blocks.
1000static bool areAllOperandsNonInsts(Value *V);
1001/// Checks if the provided value does not require scheduling. It does not
1002/// require scheduling if this is not an instruction or it is an instruction
1003/// that does not read/write memory and all users are phi nodes or instructions
1004/// from different blocks.
1005static bool isUsedOutsideBlock(Value *V);
1006/// Checks if the specified value does not require scheduling. It does not
1007/// require scheduling if all operands and all users do not need to be scheduled
1008/// in the current basic block.
1009static bool doesNotNeedToBeScheduled(Value *V);
1010
1011/// \returns true if \p Opcode is allowed as part of the main/alternate
1012/// instruction for SLP vectorization.
1013///
1014/// Example of unsupported opcode is SDIV that can potentially cause UB if the
1015/// "shuffled out" lane would result in division by zero.
1016static bool isValidForAlternation(unsigned Opcode) {
1017 return !Instruction::isIntDivRem(Opcode);
1018}
1019
1020namespace {
1021
1022/// Helper class that determines whether VL can use the same opcode.
1023/// Alternate instruction is supported. In addition, it supports interchangeable
1024/// instruction. An interchangeable instruction is an instruction that can be
1025/// converted to another instruction with same semantics. For example, x << 1 is
1026/// equal to x * 2. x * 1 is equal to x | 0.
1027class BinOpSameOpcodeHelper {
1028 using MaskType = std::uint_fast32_t;
1029 /// Sort SupportedOp because it is used by binary_search.
1030 constexpr static std::initializer_list<unsigned> SupportedOp = {
1031 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
1032 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
 // NOTE(review): the message below is joined to the condition with `&&`
 // instead of being passed as the second static_assert argument, so it is not
 // the text the compiler reports on failure.
1033 static_assert(llvm::is_sorted_constexpr(SupportedOp) &&
1034 "SupportedOp is not sorted.");
 /// One bit per supported opcode, plus MainOpBIT which stands for "the
 /// original opcode of the tracked instruction" (see equal()/getOpcode()).
1035 enum : MaskType {
1036 ShlBIT = 1,
1037 AShrBIT = 1 << 1,
1038 MulBIT = 1 << 2,
1039 AddBIT = 1 << 3,
1040 SubBIT = 1 << 4,
1041 AndBIT = 1 << 5,
1042 OrBIT = 1 << 6,
1043 XorBIT = 1 << 7,
1044 MainOpBIT = 1 << 8,
1045 LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
1046 };
1047 /// Return a non-nullptr if either operand of I is a ConstantInt.
1048 /// The second return value represents the operand position. We check the
1049 /// right-hand side first (1). If the right hand side is not a ConstantInt and
1050 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
1051 /// side (0).
1052 static std::pair<ConstantInt *, unsigned>
1053 isBinOpWithConstantInt(const Instruction *I) {
1054 unsigned Opcode = I->getOpcode();
1055 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
 // Keeps SupportedOp referenced when the assert above is compiled out.
1056 (void)SupportedOp;
1057 auto *BinOp = cast<BinaryOperator>(I);
1058 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
1059 return {CI, 1};
1060 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
1061 Opcode == Instruction::AShr)
1062 return {nullptr, 0};
1063 if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
1064 return {CI, 0};
1065 return {nullptr, 0};
1066 }
 /// Tracks, for one reference instruction I, the set of opcodes that every
 /// instruction added so far could be converted to.
1067 struct InterchangeableInfo {
1068 const Instruction *I = nullptr;
1069 /// The bit it sets represents whether MainOp can be converted to.
1070 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
1071 MulBIT | AShrBIT | ShlBIT;
1072 /// We cannot create an interchangeable instruction that does not exist in
1073 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
1074 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
1075 /// 1]. SeenBefore is used to know what operations have been seen before.
1076 MaskType SeenBefore = 0;
1077 InterchangeableInfo(const Instruction *I) : I(I) {}
1078 /// Return false allows BinOpSameOpcodeHelper to find an alternate
1079 /// instruction. Directly setting the mask will destroy the mask state,
1080 /// preventing us from determining which instruction it should convert to.
 /// On success, records \p OpcodeInMaskForm as seen and narrows Mask to its
 /// intersection with \p InterchangeableMask; on failure nothing is changed.
1081 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
1082 if (Mask & InterchangeableMask) {
1083 SeenBefore |= OpcodeInMaskForm;
1084 Mask &= InterchangeableMask;
1085 return true;
1086 }
1087 return false;
1088 }
 /// Accepts \p Opcode only if it is exactly I's opcode; this narrows Mask
 /// to MainOpBIT (no interchange).
1089 bool equal(unsigned Opcode) {
1090 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
1091 }
 /// Returns the chosen opcode among those that are both still allowed (Mask)
 /// and actually seen (SeenBefore). Candidates are tested in a fixed order:
 /// I's own opcode, Shl, AShr, Mul, Add, Sub, And, Or, Xor.
1092 unsigned getOpcode() const {
1093 MaskType Candidate = Mask & SeenBefore;
1094 if (Candidate & MainOpBIT)
1095 return I->getOpcode();
1096 if (Candidate & ShlBIT)
1097 return Instruction::Shl;
1098 if (Candidate & AShrBIT)
1099 return Instruction::AShr;
1100 if (Candidate & MulBIT)
1101 return Instruction::Mul;
1102 if (Candidate & AddBIT)
1103 return Instruction::Add;
1104 if (Candidate & SubBIT)
1105 return Instruction::Sub;
1106 if (Candidate & AndBIT)
1107 return Instruction::And;
1108 if (Candidate & OrBIT)
1109 return Instruction::Or;
1110 if (Candidate & XorBIT)
1111 return Instruction::Xor;
1112 llvm_unreachable("Cannot find interchangeable instruction.");
1113 }
1114
 /// True if at least one opcode is both allowed and seen, i.e. getOpcode()
 /// would not hit llvm_unreachable.
1115 bool hasDefinedOpcode() const { return (Mask & SeenBefore) > 0; }
1116
1117 /// Return true if the instruction can be converted to \p Opcode.
 /// The listed non-interchangeable binary opcodes always yield false; any
 /// other opcode outside the switch is treated as unreachable.
1118 bool hasCandidateOpcode(unsigned Opcode) const {
1119 MaskType Candidate = Mask & SeenBefore;
1120 switch (Opcode) {
1121 case Instruction::Shl:
1122 return Candidate & ShlBIT;
1123 case Instruction::AShr:
1124 return Candidate & AShrBIT;
1125 case Instruction::Mul:
1126 return Candidate & MulBIT;
1127 case Instruction::Add:
1128 return Candidate & AddBIT;
1129 case Instruction::Sub:
1130 return Candidate & SubBIT;
1131 case Instruction::And:
1132 return Candidate & AndBIT;
1133 case Instruction::Or:
1134 return Candidate & OrBIT;
1135 case Instruction::Xor:
1136 return Candidate & XorBIT;
1137 case Instruction::LShr:
1138 case Instruction::FAdd:
1139 case Instruction::FSub:
1140 case Instruction::FMul:
1141 case Instruction::SDiv:
1142 case Instruction::UDiv:
1143 case Instruction::FDiv:
1144 case Instruction::SRem:
1145 case Instruction::URem:
1146 case Instruction::FRem:
1147 return false;
1148 default:
1149 break;
1150 }
1151 llvm_unreachable("Cannot find interchangeable instruction.");
1152 }
1153
 /// Returns the operand list of I rewritten so that it can be used with the
 /// opcode of \p To. If the opcodes already match, I's operands are returned
 /// unchanged; otherwise the constant operand is replaced by the equivalent
 /// constant for the target opcode.
1154 SmallVector<Value *> getOperand(const Instruction *To) const {
1155 unsigned ToOpcode = To->getOpcode();
1156 unsigned FromOpcode = I->getOpcode();
1157 if (FromOpcode == ToOpcode)
1158 return SmallVector<Value *>(I->operands());
1159 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
 // NOTE(review): CI is dereferenced without a null check; presumably callers
 // only request a conversion for instructions that have a ConstantInt
 // operand - confirm.
1160 auto [CI, Pos] = isBinOpWithConstantInt(I);
1161 const APInt &FromCIValue = CI->getValue();
1162 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1163 Type *RHSType = I->getOperand(Pos)->getType();
1164 Constant *RHS;
1165 switch (FromOpcode) {
1166 case Instruction::Shl:
 // x << 1 becomes x + x.
1167 if (ToOpcode == Instruction::Add && FromCIValue.isOne())
1168 return {I->getOperand(0), I->getOperand(0)};
1169 if (ToOpcode == Instruction::Mul) {
 // x << C becomes x * (1 << C).
1170 RHS = ConstantInt::get(
1171 RHSType, APInt::getOneBitSet(FromCIValueBitWidth,
1172 FromCIValue.getZExtValue()));
1173 } else {
1174 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1175 RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
1176 /*AllowRHSConstant=*/true);
1177 }
1178 break;
1179 case Instruction::Mul:
1180 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1181 if (ToOpcode == Instruction::Shl) {
 // x * 2^K becomes x << K.
1182 RHS = ConstantInt::get(
1183 RHSType, APInt(FromCIValueBitWidth, FromCIValue.logBase2()));
1184 } else {
1185 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1186 RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
1187 /*AllowRHSConstant=*/true);
1188 }
1189 break;
1190 case Instruction::Add:
1191 case Instruction::Sub:
1192 if (FromCIValue.isZero()) {
1193 RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
1194 /*AllowRHSConstant=*/true);
1195 } else {
 // Switching between add and sub with a non-zero constant negates the
 // constant.
1196 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1197 "Cannot convert the instruction.");
1198 APInt NegatedVal = APInt(FromCIValue);
1199 NegatedVal.negate();
1200 RHS = ConstantInt::get(RHSType, NegatedVal);
1201 }
1202 break;
1203 case Instruction::And:
1204 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1205 RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
1206 /*AllowRHSConstant=*/true);
1207 break;
1208 default:
1209 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1210 RHS = ConstantExpr::getBinOpIdentity(ToOpcode, RHSType,
1211 /*AllowRHSConstant=*/true);
1212 break;
1213 }
 // The non-constant operand of I.
1214 Value *LHS = I->getOperand(1 - Pos);
1215 // If the target opcode is non-commutative (e.g., shl, sub),
1216 // force the variable to the left and the constant to the right.
1217 if (Pos == 1 || !Instruction::isCommutative(ToOpcode))
1218 return SmallVector<Value *>({LHS, RHS});
1219
1220 return SmallVector<Value *>({RHS, LHS});
1221 }
1222 };
1223 InterchangeableInfo MainOp;
1224 InterchangeableInfo AltOp;
 /// True if neither MainOp's opcode nor I's opcode is rejected by the
 /// file-level ::isValidForAlternation.
1225 bool isValidForAlternation(const Instruction *I) const {
1226 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1227 ::isValidForAlternation(I->getOpcode());
1228 }
 /// Ensures AltOp has a reference instruction, adopting \p I if none is set
 /// yet. Returns false if \p I cannot be adopted.
1229 bool initializeAltOp(const Instruction *I) {
1230 if (AltOp.I)
1231 return true;
 // NOTE(review): listing line 1232 (the condition guarding the `return false`
 // below) is absent from this extract.
1233 return false;
1234 AltOp.I = I;
1235 return true;
1236 }
1237
1238public:
1239 BinOpSameOpcodeHelper(const Instruction *MainOp,
1240 const Instruction *AltOp = nullptr)
1241 : MainOp(MainOp), AltOp(AltOp) {}
 /// Tries to fold \p I into the main opcode set, and failing that into the
 /// alternate opcode set. Returns false if it fits neither.
1242 bool add(const Instruction *I) {
 // NOTE(review): listing line 1243 (the start of the statement whose message
 // string follows) is absent from this extract.
1244 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1245 unsigned Opcode = I->getOpcode();
1246 MaskType OpcodeInMaskForm;
1247 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1248 switch (Opcode) {
1249 case Instruction::Shl:
1250 OpcodeInMaskForm = ShlBIT;
1251 break;
1252 case Instruction::AShr:
1253 OpcodeInMaskForm = AShrBIT;
1254 break;
1255 case Instruction::Mul:
1256 OpcodeInMaskForm = MulBIT;
1257 break;
1258 case Instruction::Add:
1259 OpcodeInMaskForm = AddBIT;
1260 break;
1261 case Instruction::Sub:
1262 OpcodeInMaskForm = SubBIT;
1263 break;
1264 case Instruction::And:
1265 OpcodeInMaskForm = AndBIT;
1266 break;
1267 case Instruction::Or:
1268 OpcodeInMaskForm = OrBIT;
1269 break;
1270 case Instruction::Xor:
1271 OpcodeInMaskForm = XorBIT;
1272 break;
1273 default:
 // Opcodes without an interchange rule must match exactly.
1274 return MainOp.equal(Opcode) ||
1275 (initializeAltOp(I) && AltOp.equal(Opcode));
1276 }
 // Without a constant operand the instruction can only be itself; a suitable
 // constant widens the set of opcodes it can be rewritten to.
1277 MaskType InterchangeableMask = OpcodeInMaskForm;
1278 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1279 if (CI) {
1280 constexpr MaskType CanBeAll =
1281 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1282 const APInt &CIValue = CI->getValue();
1283 switch (Opcode) {
1284 case Instruction::Shl:
 // Only shift amounts below the bit width are considered; x << 0 is a
 // no-op, x << 1 is additionally x + x.
1285 if (CIValue.ult(CIValue.getBitWidth()))
1286 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1287 if (CIValue.isOne())
1288 InterchangeableMask |= AddBIT;
1289 break;
1290 case Instruction::Mul:
1291 if (CIValue.isOne()) {
1292 InterchangeableMask = CanBeAll;
1293 break;
1294 }
1295 if (CIValue.isPowerOf2())
1296 InterchangeableMask = MulBIT | ShlBIT;
1297 break;
1298 case Instruction::Add:
1299 case Instruction::Sub:
1300 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1301 break;
1302 case Instruction::And:
1303 if (CIValue.isAllOnes())
1304 InterchangeableMask = CanBeAll;
1305 break;
1306 case Instruction::Xor:
1307 if (CIValue.isZero())
1308 InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
1309 break;
1310 default:
1311 if (CIValue.isZero())
1312 InterchangeableMask = CanBeAll;
1313 break;
1314 }
1315 }
1316 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1317 (initializeAltOp(I) &&
1318 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1319 }
1320 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1321 bool hasDefinedMainOpcode() const { return MainOp.hasDefinedOpcode(); }
1322 /// Checks if the list of potential opcodes includes \p Opcode.
1323 bool hasCandidateOpcode(unsigned Opcode) const {
1324 return MainOp.hasCandidateOpcode(Opcode);
1325 }
1326 bool hasAltOp() const { return AltOp.I; }
 /// Falls back to the main opcode when no alternate instruction was needed.
1327 unsigned getAltOpcode() const {
1328 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1329 }
1330 bool hasDefinedAltOpcode() const {
1331 return !hasAltOp() || AltOp.hasDefinedOpcode();
1332 }
 /// Forwards to the main InterchangeableInfo: operands of the instruction
 /// this helper was constructed with, rewritten for the opcode of \p I.
1333 SmallVector<Value *> getOperand(const Instruction *I) const {
1334 return MainOp.getOperand(I);
1335 }
1336};
1337
1338/// Main data required for vectorization of instructions.
1339class InstructionsState {
1340 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1341 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1342 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1343 /// isAltShuffle).
1344 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1345 /// from getMainAltOpsNoStateVL.
1346 /// For those InstructionsState that use alternate instructions, the resulting
1347 /// vectorized output ultimately comes from a shufflevector. For example,
1348 /// given a vector list (VL):
1349 /// VL[0] = add i32 a, e
1350 /// VL[1] = sub i32 b, f
1351 /// VL[2] = add i32 c, g
1352 /// VL[3] = sub i32 d, h
1353 /// The vectorized result would be:
1354 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1355 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1356 /// result = shufflevector <4 x i32> intermediated_0,
1357 /// <4 x i32> intermediated_1,
1358 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1359 /// Since shufflevector is used in the final result, when calculating the cost
1360 /// (getEntryCost), we must account for the usage of shufflevector in
1361 /// GetVectorCost.
1362 Instruction *MainOp = nullptr;
1363 Instruction *AltOp = nullptr;
1364 /// Whether the instruction state represents copyable instructions.
1365 bool HasCopyables = false;
1366
1367public:
1368 Instruction *getMainOp() const {
1369 assert(valid() && "InstructionsState is invalid.");
1370 return MainOp;
1371 }
1372
1373 Instruction *getAltOp() const {
1374 assert(valid() && "InstructionsState is invalid.");
1375 return AltOp;
1376 }
1377
1378 /// The main/alternate opcodes for the list of instructions.
1379 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1380
1381 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1382
1383 /// Some of the instructions in the list have alternate opcodes.
1384 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1385
1386 /// Checks if the instruction matches either the main or alternate opcode.
1387 /// \returns
1388 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1389 /// to it
1390 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1391 /// it
1392 /// - nullptr if \param I cannot be matched or converted to either opcode
1393 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1394 assert(MainOp && "MainOp cannot be nullptr.");
1395 if (I->getOpcode() == MainOp->getOpcode())
1396 return MainOp;
 // A zext is accepted as matching a select main op when there is no
 // alternate opcode.
1397 if (MainOp->getOpcode() == Instruction::Select &&
1398 I->getOpcode() == Instruction::ZExt && !isAltShuffle())
1399 return MainOp;
1400 // Prefer AltOp instead of interchangeable instruction of MainOp.
1401 assert(AltOp && "AltOp cannot be nullptr.");
1402 if (I->getOpcode() == AltOp->getOpcode())
1403 return AltOp;
 // Only binary operators can be converted between opcodes.
1404 if (!I->isBinaryOp())
1405 return nullptr;
1406 BinOpSameOpcodeHelper Converter(MainOp);
1407 if (!Converter.add(I) || !Converter.add(MainOp))
1408 return nullptr;
 // If I and MainOp cannot agree on MainOp's own opcode, check whether I can
 // instead be expressed with AltOp's opcode.
1409 if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1410 BinOpSameOpcodeHelper AltConverter(AltOp);
1411 if (AltConverter.add(I) && AltConverter.add(AltOp) &&
1412 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1413 return AltOp;
1414 }
 // The helper needed an alternate opcode but this state has none.
1415 if (Converter.hasAltOp() && !isAltShuffle())
1416 return nullptr;
1417 return Converter.hasAltOp() ? AltOp : MainOp;
1418 }
1419
1420 /// Checks if main/alt instructions are shift operations.
1421 bool isShiftOp() const {
1422 return getMainOp()->isShift() && getAltOp()->isShift();
1423 }
1424
1425 /// Checks if main/alt instructions are bitwise logic operations.
1426 bool isBitwiseLogicOp() const {
1427 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1428 }
1429
1430 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1431 bool isMulDivLikeOp() const {
1432 constexpr std::array<unsigned, 8> MulDiv = {
1433 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1434 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1435 Instruction::URem, Instruction::FRem};
1436 return is_contained(MulDiv, getOpcode()) &&
1437 is_contained(MulDiv, getAltOpcode());
1438 }
1439
1440 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1441 bool isAddSubLikeOp() const {
1442 constexpr std::array<unsigned, 4> AddSub = {
1443 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1444 Instruction::FSub};
1445 return is_contained(AddSub, getOpcode()) &&
1446 is_contained(AddSub, getAltOpcode());
1447 }
1448
1449 /// Checks if main/alt instructions are cmp operations.
 /// Both must have the same opcode, either ICmp or FCmp.
1450 bool isCmpOp() const {
1451 return (getOpcode() == Instruction::ICmp ||
1452 getOpcode() == Instruction::FCmp) &&
1453 getAltOpcode() == getOpcode();
1454 }
1455
1456 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1457 bool valid() const { return MainOp && AltOp; }
1458
1459 explicit operator bool() const { return valid(); }
1460
1461 InstructionsState() = delete;
1462 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1463 bool HasCopyables = false)
1464 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
 /// Creates a state with null MainOp/AltOp, for which valid() is false.
1465 static InstructionsState invalid() { return {nullptr, nullptr}; }
1466
1467 /// Checks if the value is a copyable element.
1468 bool isCopyableElement(Value *V) const {
1469 assert(valid() && "InstructionsState is invalid.");
1470 if (!HasCopyables)
1471 return false;
1472 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1473 return false;
1474 auto *I = dyn_cast<Instruction>(V);
 // Any non-instruction other than poison is copyable.
1475 if (!I)
1476 return !isa<PoisonValue>(V);
1477 if (I->getParent() != MainOp->getParent() &&
 // NOTE(review): listing lines 1478-1479 (the rest of this condition) are
 // absent from this extract.
1480 return true;
1481 if (I->getOpcode() == MainOp->getOpcode())
1482 return false;
1483 if (!I->isBinaryOp())
1484 return true;
 // A binary operator is copyable unless it can be converted to MainOp's
 // opcode without needing an alternate opcode.
1485 BinOpSameOpcodeHelper Converter(MainOp);
1486 return !Converter.add(I) || !Converter.add(MainOp) ||
1487 Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
1488 }
1489
1490 /// Checks if the value \p V is a transformed instruction, compatible either
1491 /// with main or alternate ops.
1492 bool isExpandedBinOp(Value *V) const {
1493 assert(valid() && "InstructionsState is invalid.");
1494 if (isCopyableElement(V))
1495 return false;
1496 auto *ExpandingOp = dyn_cast<Instruction>(V);
1497 if (!ExpandingOp)
1498 return false;
 // The only expansion recognized here is `shl x, 1` standing in for an add.
1499 auto CheckForTransformedOpcode = [](const Instruction *RefOp,
1500 const Instruction *ExpandingOp) {
1501 switch (RefOp->getOpcode()) {
1502 case Instruction::Add:
1503 switch (ExpandingOp->getOpcode()) {
1504 case Instruction::Shl:
1505 return match(ExpandingOp, m_Shl(m_Value(), m_One()));
1506 default:
1507 break;
1508 }
1509 break;
1510 default:
1511 break;
1512 }
1513 return false;
1514 };
 // This local shadows the MainOp member; it is whichever of main/alt op V
 // matches.
1515 Instruction *MainOp = getMatchingMainOpOrAltOp(ExpandingOp);
1516 assert(MainOp &&
1517 "The instruction should be compatible with either main or alt op.");
1518 return CheckForTransformedOpcode(MainOp, ExpandingOp);
1519 }
1520
1521 /// Checks if the operand at index \p Idx of instruction \p I is an expanded
1522 /// operand.
1523 bool isExpandedOperand(Instruction *I, unsigned Idx) const {
1524 assert(isExpandedBinOp(I) && "Expected an expanded binop.");
1525 switch (I->getOpcode()) {
1526 case Instruction::Shl:
1527 assert(match(I, m_Shl(m_Value(), m_One())) && "Expected shl x, 1 only.");
1528 return Idx == 1;
1529 default:
1530 llvm_unreachable("Unexpected opcode for an expanded operand.");
1531 }
1532 }
1533
1534 /// Checks if the value is non-schedulable.
1535 bool isNonSchedulable(Value *V) const {
1536 assert(valid() && "InstructionsState is invalid.");
1537 auto *I = dyn_cast<Instruction>(V);
1538 if (!HasCopyables)
1539 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
 // NOTE(review): listing line 1540 (the end of the return expression above)
 // is absent from this extract.
1541 // MainOp for copyables always schedulable to correctly identify
1542 // non-schedulable copyables.
1543 if (getMainOp() == V)
1544 return false;
1545 if (isCopyableElement(V)) {
1546 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1547 auto *I = dyn_cast<Instruction>(V);
1548 return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
 // NOTE(review): listing line 1549 (the start of the parenthesized
 // sub-expression closed below) is absent from this extract.
1550 // If the copyable instructions comes after MainOp
1551 // (non-schedulable, but used in the block) - cannot vectorize
1552 // it, will possibly generate use before def.
1553 !MainOp->comesBefore(I));
1554 };
1555
1556 return IsNonSchedulableCopyableElement(V);
1557 }
1558 return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
 // NOTE(review): listing line 1559 (the end of the return expression above)
 // is absent from this extract.
1560 }
1561
1562 /// Checks if the state represents copyable instructions.
1563 bool areInstructionsWithCopyableElements() const {
1564 assert(valid() && "InstructionsState is invalid.");
1565 return HasCopyables;
1566 }
1567};
1568
1569std::pair<Instruction *, SmallVector<Value *>>
1570convertTo(Instruction *I, const InstructionsState &S) {
1571 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1572 assert(SelectedOp && "Cannot convert the instruction.");
1573 if (I->isBinaryOp()) {
1574 BinOpSameOpcodeHelper Converter(I);
1575 return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
1576 }
1577 return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
1578}
1579
1580} // end anonymous namespace
1581
1582static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1583 const TargetLibraryInfo &TLI);
1584
1585/// Find an instruction with a specific opcode in VL.
1586/// \param VL Array of values to search through. Must contain only Instructions
1587/// and PoisonValues.
1588/// \param Opcode The instruction opcode to search for
1589/// \returns
1590/// - The first instruction found with matching opcode
1591/// - nullptr if no matching instruction is found
// NOTE(review): listing line 1592 (the return type, function name and first
// parameter) is absent from this extract; the body below reads `VL`.
1593 unsigned Opcode) {
1594 for (Value *V : VL) {
 // Poison entries are skipped; everything else must be an Instruction.
1595 if (isa<PoisonValue>(V))
1596 continue;
1597 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1598 auto *Inst = cast<Instruction>(V);
1599 if (Inst->getOpcode() == Opcode)
1600 return Inst;
1601 }
1602 return nullptr;
1603}
1604
1605/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1606/// compatible instructions or constants, or just some other regular values.
1607static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1608 Value *Op1, const TargetLibraryInfo &TLI) {
1609 return (isConstant(BaseOp0) && isConstant(Op0)) ||
1610 (isConstant(BaseOp1) && isConstant(Op1)) ||
1611 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
1612 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
1613 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1614 getSameOpcode({BaseOp0, Op0}, TLI) ||
1615 getSameOpcode({BaseOp1, Op1}, TLI);
1616}
1617
1618/// \returns true if a compare instruction \p CI has similar "look" and
1619/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1620/// swapped, false otherwise.
1621static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1622 const TargetLibraryInfo &TLI) {
1623 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1624 "Assessing comparisons of different types?");
1625 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1626 CmpInst::Predicate Pred = CI->getPredicate();
1628
1629 Value *BaseOp0 = BaseCI->getOperand(0);
1630 Value *BaseOp1 = BaseCI->getOperand(1);
1631 Value *Op0 = CI->getOperand(0);
1632 Value *Op1 = CI->getOperand(1);
1633
1634 return (BasePred == Pred &&
1635 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1636 (BasePred == SwappedPred &&
1637 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
1638}
1639
1640/// \returns analysis of the Instructions in \p VL described in
1641/// InstructionsState, the Opcode that we suppose the whole list
1642/// could be vectorized even if its structure is diverse.
1643static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1644 const TargetLibraryInfo &TLI) {
1645 // Make sure these are all Instructions.
1647 return InstructionsState::invalid();
1648
1649 auto *It = find_if(VL, IsaPred<Instruction>);
1650 if (It == VL.end())
1651 return InstructionsState::invalid();
1652
1653 Instruction *MainOp = cast<Instruction>(*It);
1654 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
1655 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
1656 (VL.size() == 2 && InstCnt < 2))
1657 return InstructionsState::invalid();
1658
1659 bool IsCastOp = isa<CastInst>(MainOp);
1660 bool IsBinOp = isa<BinaryOperator>(MainOp);
1661 bool IsCmpOp = isa<CmpInst>(MainOp);
1662 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
1664 Instruction *AltOp = MainOp;
1665 unsigned Opcode = MainOp->getOpcode();
1666 unsigned AltOpcode = Opcode;
1667
1668 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1669 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1670 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1671 UniquePreds.insert(BasePred);
1672 UniqueNonSwappedPreds.insert(BasePred);
1673 for (Value *V : VL) {
1674 auto *I = dyn_cast<CmpInst>(V);
1675 if (!I)
1676 return false;
1677 CmpInst::Predicate CurrentPred = I->getPredicate();
1678 CmpInst::Predicate SwappedCurrentPred =
1679 CmpInst::getSwappedPredicate(CurrentPred);
1680 UniqueNonSwappedPreds.insert(CurrentPred);
1681 if (!UniquePreds.contains(CurrentPred) &&
1682 !UniquePreds.contains(SwappedCurrentPred))
1683 UniquePreds.insert(CurrentPred);
1684 }
1685 // Total number of predicates > 2, but if consider swapped predicates
1686 // compatible only 2, consider swappable predicates as compatible opcodes,
1687 // not alternate.
1688 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1689 }();
1690 // Check for one alternate opcode from another BinaryOperator.
1691 // TODO - generalize to support all operators (types, calls etc.).
1692 Intrinsic::ID BaseID = 0;
1693 SmallVector<VFInfo> BaseMappings;
1694 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
1695 BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
1696 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
1697 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
1698 return InstructionsState::invalid();
1699 }
1700 bool AnyPoison = InstCnt != VL.size();
1701 // Check MainOp too to be sure that it matches the requirements for the
1702 // instructions.
1703 for (Value *V : iterator_range(It, VL.end())) {
1704 auto *I = dyn_cast<Instruction>(V);
1705 if (!I)
1706 continue;
1707
1708 // Cannot combine poison and divisions.
1709 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1710 // intrinsics/functions only.
1711 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
1712 return InstructionsState::invalid();
1713 unsigned InstOpcode = I->getOpcode();
1714 if (IsBinOp && isa<BinaryOperator>(I)) {
1715 if (BinOpHelper.add(I))
1716 continue;
1717 } else if (IsCastOp && isa<CastInst>(I)) {
1718 Value *Op0 = MainOp->getOperand(0);
1719 Type *Ty0 = Op0->getType();
1720 Value *Op1 = I->getOperand(0);
1721 Type *Ty1 = Op1->getType();
1722 if (Ty0 == Ty1) {
1723 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1724 continue;
1725 if (Opcode == AltOpcode) {
1726 assert(isValidForAlternation(Opcode) &&
1727 isValidForAlternation(InstOpcode) &&
1728 "Cast isn't safe for alternation, logic needs to be updated!");
1729 AltOpcode = InstOpcode;
1730 AltOp = I;
1731 continue;
1732 }
1733 }
1734 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1735 auto *BaseInst = cast<CmpInst>(MainOp);
1736 Type *Ty0 = BaseInst->getOperand(0)->getType();
1737 Type *Ty1 = Inst->getOperand(0)->getType();
1738 if (Ty0 == Ty1) {
1739 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1740 assert(InstOpcode == AltOpcode &&
1741 "Alternate instructions are only supported by BinaryOperator "
1742 "and CastInst.");
1743 // Check for compatible operands. If the corresponding operands are not
1744 // compatible - need to perform alternate vectorization.
1745 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1746 CmpInst::Predicate SwappedCurrentPred =
1747 CmpInst::getSwappedPredicate(CurrentPred);
1748
1749 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1750 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1751 continue;
1752
1753 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1754 continue;
1755 auto *AltInst = cast<CmpInst>(AltOp);
1756 if (MainOp != AltOp) {
1757 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1758 continue;
1759 } else if (BasePred != CurrentPred) {
1760 assert(
1761 isValidForAlternation(InstOpcode) &&
1762 "CmpInst isn't safe for alternation, logic needs to be updated!");
1763 AltOp = I;
1764 continue;
1765 }
1766 CmpInst::Predicate AltPred = AltInst->getPredicate();
1767 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1768 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1769 continue;
1770 }
1771 } else if (InstOpcode == Opcode) {
1772 assert(InstOpcode == AltOpcode &&
1773 "Alternate instructions are only supported by BinaryOperator and "
1774 "CastInst.");
1775 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1776 if (Gep->getNumOperands() != 2 ||
1777 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1778 return InstructionsState::invalid();
1779 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1781 return InstructionsState::invalid();
1782 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1783 auto *BaseLI = cast<LoadInst>(MainOp);
1784 if (!LI->isSimple() || !BaseLI->isSimple())
1785 return InstructionsState::invalid();
1786 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1787 auto *CallBase = cast<CallInst>(MainOp);
1788 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1789 return InstructionsState::invalid();
1790 if (Call->hasOperandBundles() &&
1792 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1793 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1794 CallBase->op_begin() +
1796 return InstructionsState::invalid();
1798 if (ID != BaseID)
1799 return InstructionsState::invalid();
1800 if (!ID) {
1801 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1802 if (Mappings.size() != BaseMappings.size() ||
1803 Mappings.front().ISA != BaseMappings.front().ISA ||
1804 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1805 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1806 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1807 Mappings.front().Shape.Parameters !=
1808 BaseMappings.front().Shape.Parameters)
1809 return InstructionsState::invalid();
1810 }
1811 }
1812 continue;
1813 }
1814 return InstructionsState::invalid();
1815 }
1816
1817 if (IsBinOp) {
1818 if (!BinOpHelper.hasDefinedMainOpcode() ||
1819 !BinOpHelper.hasDefinedAltOpcode())
1820 return InstructionsState::invalid();
1821 MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
1822 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1823 AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
1824 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1825 }
1826 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1827 "Incorrect implementation of allSameOpcode.");
1828 InstructionsState S(MainOp, AltOp);
1829 assert(all_of(VL,
1830 [&](Value *V) {
1831 return isa<PoisonValue>(V) ||
1832 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1833 }) &&
1834 "Invalid InstructionsState.");
1835 return S;
1836}
1837
1838/// \returns true if all of the values in \p VL have the same type or false
1839/// otherwise.
// The type of the value obtained from consume_front() is the reference type
// that every value visited by all_of() is compared against.
// NOTE(review): presumably consume_front() also drops that value from VL, so
// only the remaining values are iterated - confirm against ArrayRef.
1841 Type *Ty = VL.consume_front()->getType();
1842 return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
1843}
1844
1845/// \returns True if in-tree use also needs extract. This refers to
1846/// possible scalar operand in vectorized instruction.
/// A null \p UserInst never needs an extract. Only loads, stores and calls are
/// inspected; any other opcode yields false.
1847static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1848 TargetLibraryInfo *TLI,
1849 const TargetTransformInfo *TTI) {
1850 if (!UserInst)
1851 return false;
1852 unsigned Opcode = UserInst->getOpcode();
1853 switch (Opcode) {
// Load: an extract is needed only when Scalar is the pointer operand.
1854 case Instruction::Load: {
1855 LoadInst *LI = cast<LoadInst>(UserInst);
1856 return (LI->getPointerOperand() == Scalar);
1857 }
// Store: same rule, only the pointer operand is checked.
1858 case Instruction::Store: {
1859 StoreInst *SI = cast<StoreInst>(UserInst);
1860 return (SI->getPointerOperand() == Scalar);
1861 }
// Call: an extract is needed when Scalar is passed at an argument position
// that isVectorIntrinsicWithScalarOpAtArg() reports for ID.
1862 case Instruction::Call: {
1863 CallInst *CI = cast<CallInst>(UserInst);
1865 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1866 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1867 Arg.value().get() == Scalar;
1868 });
1869 }
1870 default:
1871 return false;
1872 }
1873}
1874
1875/// \returns the AA location that is being accessed by the instruction, or a
/// default-constructed MemoryLocation when none of the checks below match.
1878 return MemoryLocation::get(SI);
1879 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1880 return MemoryLocation::get(LI);
1881 return MemoryLocation();
1882}
1883
1884/// \returns True if the instruction is not a volatile or atomic load/store.
/// Instructions that match none of the checked kinds are treated as simple.
1885static bool isSimple(Instruction *I) {
1886 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1887 return LI->isSimple();
1889 return SI->isSimple();
1891 return !MI->isVolatile();
1892 return true;
1893}
1894
1895/// Shuffles \p Mask in accordance with the given \p SubMask.
1896/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1897/// one but two input vectors.
1898static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1899 bool ExtendingManyInputs = false) {
1900 if (SubMask.empty())
1901 return;
1902 assert(
1903 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1904 // Check if input scalars were extended to match the size of other node.
1905 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1906 "SubMask with many inputs support must be larger than the mask.");
1907 if (Mask.empty()) {
1908 Mask.append(SubMask.begin(), SubMask.end());
1909 return;
1910 }
1911 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1912 int TermValue = std::min(Mask.size(), SubMask.size());
1913 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1914 if (SubMask[I] == PoisonMaskElem ||
1915 (!ExtendingManyInputs &&
1916 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1917 continue;
1918 NewMask[I] = Mask[SubMask[I]];
1919 }
1920 Mask.swap(NewMask);
1921}
1922
1923/// Order may have elements assigned a special value (size) which is out of
1924/// bounds. Such indices only appear at places which correspond to undef values
1925/// (see canReuseExtract for details) and are used to keep undef values from
1926/// affecting operands ordering.
1927/// The first loop below simply finds all unused indices and then the next loop
1928/// assigns these indices to the undef value positions.
1929/// As an example below Order has two undef positions and they have assigned
1930/// values 3 and 7 respectively:
1931/// before: 6 9 5 4 9 2 1 0
1932/// after: 6 3 5 4 7 2 1 0
1934 const size_t Sz = Order.size();
1935 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1936 SmallBitVector MaskedIndices(Sz);
// Clear the bit of every in-range index that is used; remember the positions
// that hold an out-of-range value.
1937 for (unsigned I = 0; I < Sz; ++I) {
1938 if (Order[I] < Sz)
1939 UnusedIndices.reset(Order[I]);
1940 else
1941 MaskedIndices.set(I);
1942 }
// Nothing to fix when every element is already in range.
1943 if (MaskedIndices.none())
1944 return;
1945 assert(UnusedIndices.count() == MaskedIndices.count() &&
1946 "Non-synced masked/available indices.");
// Walk both bit vectors in lockstep, giving each masked position the next
// unused index.
1947 int Idx = UnusedIndices.find_first();
1948 int MIdx = MaskedIndices.find_first();
1949 while (MIdx >= 0) {
1950 assert(Idx >= 0 && "Indices must be synced.");
1951 Order[MIdx] = Idx;
1952 Idx = UnusedIndices.find_next(Idx);
1953 MIdx = MaskedIndices.find_next(MIdx);
1954 }
1955}
1956
1957/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1958/// Opcode1.
/// Each lane of \p VL owns getNumElements(ScalarTy) consecutive bits in the
/// result. Lanes holding a PoisonValue keep their bits cleared.
1960 unsigned Opcode0, unsigned Opcode1) {
1961 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1962 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1963 for (unsigned Lane : seq<unsigned>(VL.size())) {
1964 if (isa<PoisonValue>(VL[Lane]))
1965 continue;
// Set the whole bit range of this lane when its opcode is Opcode1.
1966 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1967 OpcodeMask.set(Lane * ScalarTyNumElements,
1968 Lane * ScalarTyNumElements + ScalarTyNumElements);
1969 }
1970 return OpcodeMask;
1971}
1972
1973/// Replicates the given \p Val \p VF times.
/// Element I of \p Val fills the \p VF consecutive result slots starting at
/// I * VF, so the result has Val.size() * VF elements. All inputs are asserted
/// to be non-vector constants.
1975 unsigned VF) {
1976 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1977 "Expected scalar constants.");
1978 SmallVector<Constant *> NewVal(Val.size() * VF);
1979 for (auto [I, V] : enumerate(Val))
1980 std::fill_n(NewVal.begin() + I * VF, VF, V);
1981 return NewVal;
1982}
1983
// Rebuilds \p Mask from \p Indices so that Mask[Indices[I]] == I for every I.
// \p Mask is cleared first and resized to Indices.size(), pre-filled with
// PoisonMaskElem.
// NOTE(review): assumes every value in Indices is below Indices.size();
// nothing here checks it - confirm against callers.
1985 SmallVectorImpl<int> &Mask) {
1986 Mask.clear();
1987 const unsigned E = Indices.size();
1988 Mask.resize(E, PoisonMaskElem);
1989 for (unsigned I = 0; I < E; ++I)
1990 Mask[Indices[I]] = I;
1991}
1992
1993/// Reorders the list of scalars in accordance with the given \p Mask.
/// The old element at position I moves to position Mask[I]. Positions that no
/// mask element targets are left as poison of the first scalar's type, and
/// elements whose mask entry is PoisonMaskElem are dropped.
1995 ArrayRef<int> Mask) {
1996 assert(!Mask.empty() && "Expected non-empty mask.");
// Prev starts as all-poison; after the swap it holds the old scalars and
// Scalars holds the poison placeholders.
1997 SmallVector<Value *> Prev(Scalars.size(),
1998 PoisonValue::get(Scalars.front()->getType()));
1999 Prev.swap(Scalars);
2000 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
2001 if (Mask[I] != PoisonMaskElem)
2002 Scalars[Mask[I]] = Prev[I];
2003}
2004
2005/// Checks if the provided value does not require scheduling. It does not
2006/// require scheduling if this is not an instruction or it is an instruction
2007/// for which mayHaveNonDefUseDependency() is false and all operands are either
2008/// not instructions, or phi nodes, or instructions from different blocks.
2010 auto *I = dyn_cast<Instruction>(V);
2011 if (!I)
2012 return true;
2013 return !mayHaveNonDefUseDependency(*I) &&
2014 all_of(I->operands(), [I](Value *V) {
2015 auto *IO = dyn_cast<Instruction>(V);
2016 if (!IO)
2017 return true;
2018 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
2019 });
2020}
2021
2022/// Checks if the provided value does not require scheduling. It does not
2023/// require scheduling if this is not an instruction or it is an instruction
2024/// that does not read/write memory and all users are phi nodes or instructions
2025/// from the different blocks.
2026static bool isUsedOutsideBlock(Value *V) {
2027 auto *I = dyn_cast<Instruction>(V);
2028 if (!I)
2029 return true;
2030 // Limits the number of uses to save compile time.
2031 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
2032 all_of(I->users(), [I](User *U) {
2033 auto *IU = dyn_cast<Instruction>(U);
2034 if (!IU)
2035 return true;
2036 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
2037 });
2038}
2039
2040/// Checks if the specified value does not require scheduling. It does not
2041/// require scheduling if neither its operands nor its users need to be
2042/// scheduled in the current basic block.
2045}
2046
2047/// Checks if the specified array of instructions does not require scheduling.
2048/// It is so if either all instructions have operands that do not require
2049/// scheduling, or their users do not require scheduling since they are phis or
2050/// in other basic blocks.
/// An empty \p VL never qualifies.
2052 return !VL.empty() &&
2054}
2055
2056/// Returns true if widened type of \p Ty elements with size \p Sz represents
2057/// full vector type, i.e. adding extra element results in extra parts upon type
2058/// legalization.
/// Sizes of 0 or 1 are rejected. Once the earlier checks pass, any
/// power-of-two \p Sz is accepted directly, and struct types are rejected.
2060 unsigned Sz) {
2061 if (Sz <= 1)
2062 return false;
2064 return false;
2065 if (has_single_bit(Sz))
2066 return true;
2067 if (isa<StructType>(Ty))
2068 return false;
// Non-power-of-two size: accept only when the widened type splits into
// NumParts parts that divide Sz evenly and each hold a power-of-two number of
// elements.
2069 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
2070 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
2071 Sz % NumParts == 0;
2072}
2073
2074/// Returns number of parts, the type \p VecTy will be split at the codegen
2075/// phase. If the type is going to be scalarized or does not use whole
2076/// registers, returns 1.
/// Also returns 1 for struct types and when the part count reported by TTI is
/// zero or not below \p Limit.
2077static unsigned
2079 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
2080 if (isa<StructType>(VecTy))
2081 return 1;
2082 unsigned NumParts = TTI.getNumberOfParts(VecTy);
2083 if (NumParts == 0 || NumParts >= Limit)
2084 return 1;
2085 unsigned Sz = getNumElements(VecTy);
2086 unsigned ScalarSz = getNumElements(ScalarTy);
2087 Type *ElementTy = toScalarizedTy(VecTy);
2088 unsigned PWSz = getFullVectorNumberOfElements(TTI, ElementTy, Sz);
// Give up unless PWSz splits evenly into NumParts parts, each part holds a
// whole number of ScalarTy-sized groups, and each part is itself accepted by
// hasFullVectorsOrPowerOf2().
2089 if (NumParts >= Sz || PWSz % NumParts != 0 ||
2090 (PWSz / NumParts) % ScalarSz != 0 ||
2091 !hasFullVectorsOrPowerOf2(TTI, ElementTy, PWSz / NumParts))
2092 return 1;
// The actual element count must need exactly NumParts parts of NumElts each.
2093 const unsigned NumElts = PWSz / NumParts;
2094 if (divideCeil(Sz, NumElts) != NumParts)
2095 return 1;
2096 return NumParts;
2097}
2098
2099/// Bottom Up SLP Vectorizer.
2101 class TreeEntry;
2102 class ScheduleEntity;
2103 class ScheduleData;
2104 class ScheduleCopyableData;
2105 class ScheduleBundle;
2108
2109public:
2110 /// If we decide to generate strided load / store, this struct contains all
2111 /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
2112 /// and analyzeConstantStrideCandidate. Note that Stride can be given either
2113 /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
2114 /// StrideVal (or value obtained from StrideSCEV) has to be multiplied by the
2115 /// size of element of FixedVectorType.
2117 Value *StrideVal = nullptr;
2118 const SCEV *StrideSCEV = nullptr;
2120 };
2121
2122 /// Tracks the state we can represent the loads in the given sequence.
2130
2137
// Constructor: stores the supplied analyses, creates the IR builder from the
// ScalarEvolution context with a TargetFolder over the data layout, collects
// ephemeral values for F, and picks the maximum/minimum vector register sizes.
2139 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
2141 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
2142 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
2143 AC(AC), DB(DB), DL(DL), ORE(ORE),
2144 Builder(Se->getContext(), TargetFolder(*DL)) {
2145 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
2146 // Use the vector register size specified by the target unless overridden
2147 // by a command-line option.
2148 // TODO: It would be better to limit the vectorization factor based on
2149 // data type rather than just register size. For example, x86 AVX has
2150 // 256-bit registers, but it does not support integer operations
2151 // at that width (that requires AVX2).
2152 if (MaxVectorRegSizeOption.getNumOccurrences())
2153 MaxVecRegSize = MaxVectorRegSizeOption;
2154 else
2155 MaxVecRegSize =
2156 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
2157 .getFixedValue();
2158
     // Same rule for the minimum size: the command-line option wins when it
     // was given, otherwise the target's minimum register width is used.
2159 if (MinVectorRegSizeOption.getNumOccurrences())
2160 MinVecRegSize = MinVectorRegSizeOption;
2161 else
2162 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
2163 }
2164
2165 /// Vectorize the tree that starts with the elements in \p VL.
2166 /// Returns the vectorized root.
2168
2169 /// Vectorize the tree but with the list of externally used values \p
2170 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
2171 /// generated extractvalue instructions.
2172 Value *
2173 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2174 Instruction *ReductionRoot = nullptr,
2175 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
2176 VectorValuesAndScales = {});
2177
2178 /// \returns the cost incurred by unwanted spills and fills, caused by
2179 /// holding live values over call sites.
2181
2182 /// Calculates the cost of the subtrees, trims non-profitable ones and returns
2183 /// final cost.
2186 Instruction *RdxRoot = nullptr);
2187
2188 /// \returns the vectorization cost of the subtree that starts at \p VL.
2189 /// A negative number means that this is profitable.
2191 ArrayRef<Value *> VectorizedVals = {},
2192 InstructionCost ReductionCost = TTI::TCC_Free,
2193 Instruction *RdxRoot = nullptr);
2194
2195 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2196 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2197 void buildTree(ArrayRef<Value *> Roots,
2198 const SmallDenseSet<Value *> &UserIgnoreLst);
2199
2200 /// Construct a vectorizable tree that starts at \p Roots.
2201 void buildTree(ArrayRef<Value *> Roots);
2202
2203 /// Return the scalars of the root node, i.e. of the first entry in
     /// VectorizableTree. Asserts that the tree is not empty.
2205 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2206 return VectorizableTree.front()->Scalars;
2207 }
2208
2209 /// Returns the type/is-signed info for the root node in the graph without
2210 /// casting.
2211 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2212 const TreeEntry &Root = *VectorizableTree.front();
2213 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2214 !Root.Scalars.front()->getType()->isIntegerTy())
2215 return std::nullopt;
2216 auto It = MinBWs.find(&Root);
2217 if (It != MinBWs.end())
2218 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
2219 It->second.first),
2220 It->second.second);
2221 if (Root.getOpcode() == Instruction::ZExt ||
2222 Root.getOpcode() == Instruction::SExt)
2223 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
2224 Root.getOpcode() == Instruction::SExt);
2225 return std::nullopt;
2226 }
2227
2228 /// Checks if the root graph node can be emitted with narrower bitwidth at
2229 /// codegen and returns its signedness, if so.
     /// The value returned is the second member of the MinBWs entry for the
     /// root node.
     /// NOTE(review): MinBWs.at() presumably requires that an entry for the
     /// root exists - confirm callers check this first.
2231 return MinBWs.at(VectorizableTree.front().get()).second;
2232 }
2233
2234 /// Returns reduction type after min-bitwidth analysis.
     /// The root scalar type widened by the root's vector factor is returned
     /// when ReductionBitWidth is 0, when the root scalars are not integers,
     /// or when ReductionBitWidth is not smaller than the root scalar type
     /// size; the remaining path builds the result from ReductionBitWidth.
2236 if (ReductionBitWidth == 0 ||
2237 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2238 ReductionBitWidth >=
2239 DL->getTypeSizeInBits(
2240 VectorizableTree.front()->Scalars.front()->getType()))
2241 return cast<FixedVectorType>(
2242 getWidenedType(VectorizableTree.front()->Scalars.front()->getType(),
2243 VectorizableTree.front()->getVectorFactor()));
2246 VectorizableTree.front()->Scalars.front()->getContext(),
2247 ReductionBitWidth),
2248 VectorizableTree.front()->getVectorFactor()));
2249 }
2250
2251 /// Returns true if the tree results in one of the reduced bitcast variants,
     /// i.e. the root node has a state, is in the Vectorize state, and its
     /// CombinedOp is ReducedBitcast, ReducedBitcastBSwap, ReducedBitcastLoads
     /// or ReducedBitcastBSwapLoads.
2253 return VectorizableTree.front()->hasState() &&
2254 (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
2255 VectorizableTree.front()->CombinedOp ==
2256 TreeEntry::ReducedBitcastBSwap ||
2257 VectorizableTree.front()->CombinedOp ==
2258 TreeEntry::ReducedBitcastLoads ||
2259 VectorizableTree.front()->CombinedOp ==
2260 TreeEntry::ReducedBitcastBSwapLoads) &&
2261 VectorizableTree.front()->State == TreeEntry::Vectorize;
2262 }
2263
2264 /// Returns true if the tree results in the reduced cmp bitcast root, i.e.
     /// the root node has a state, is in the Vectorize state, and its
     /// CombinedOp is ReducedCmpBitcast.
2266 return VectorizableTree.front()->hasState() &&
2267 VectorizableTree.front()->CombinedOp ==
2268 TreeEntry::ReducedCmpBitcast &&
2269 VectorizableTree.front()->State == TreeEntry::Vectorize;
2270 }
2271
2272 /// Returns true if the tree is a reduction tree.
2273 bool isReductionTree() const { return UserIgnoreList != nullptr; }
2274
2275 /// Builds external uses of the vectorized scalars, i.e. the list of
2276 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2277 /// ExternallyUsedValues contains additional list of external uses to handle
2278 /// vectorization of reductions.
2279 void
2280 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2281
2282 /// Transforms graph nodes to target specific representations, if profitable.
2283 void transformNodes();
2284
2285 /// Clear the internal data structures that are created by 'buildTree'.
2286 void deleteTree() {
2287 VectorizableTree.clear();
2288 ScalarToTreeEntries.clear();
2289 DeletedNodes.clear();
2290 TransformedToGatherNodes.clear();
2291 OperandsToTreeEntry.clear();
2292 ScalarsInSplitNodes.clear();
2293 MustGather.clear();
2294 NonScheduledFirst.clear();
2295 EntryToLastInstruction.clear();
2296 LastInstructionToPos.clear();
2297 LoadEntriesToVectorize.clear();
2298 IsGraphTransformMode = false;
2299 GatheredLoadsEntriesFirst.reset();
2300 CompressEntryToData.clear();
2301 ExternalUses.clear();
2302 ExternalUsesAsOriginalScalar.clear();
2303 ExternalUsesWithNonUsers.clear();
2304 for (auto &Iter : BlocksSchedules) {
2305 BlockScheduling *BS = Iter.second.get();
2306 BS->clear();
2307 }
2308 MinBWs.clear();
2309 ReductionBitWidth = 0;
2310 BaseGraphSize = 1;
2311 CastMaxMinBWSizes.reset();
2312 ExtraBitWidthNodes.clear();
2313 InstrElementSize.clear();
2314 UserIgnoreList = nullptr;
2315 PostponedGathers.clear();
2316 ValueToGatherNodes.clear();
2317 TreeEntryToStridedPtrInfoMap.clear();
2318 CurrentLoopNest.clear();
2319 MergedLoopBTCs.clear();
2320 }
2321
2322 unsigned getTreeSize() const { return VectorizableTree.size(); }
2323
2324 /// Returns the base graph size, before any transformations.
2325 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2326
2327 /// Perform LICM and CSE on the newly generated gather sequences.
2329
2330 /// Does this non-empty order represent an identity order? Identity
2331 /// should be represented as an empty order, so this is used to
2332 /// decide if we can canonicalize a computed order. Undef elements
2333 /// (represented as size) are ignored.
     /// \returns true if every element equals its own position or Order.size().
2335 assert(!Order.empty() && "expected non-empty order");
2336 const unsigned Sz = Order.size();
2337 return all_of(enumerate(Order), [&](const auto &P) {
2338 return P.value() == P.index() || P.value() == Sz;
2339 });
2340 }
2341
2342 /// Checks if the specified gather tree entry \p TE can be represented as a
2343 /// shuffled vector entry + (possibly) permutation with other gathers. It
2344 /// implements the checks only for possibly ordered scalars (Loads,
2345 /// ExtractElement, ExtractValue), which can be part of the graph.
2346 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2347 /// sub-tree rotations.
2348 /// \param IgnoreReorder true, if the order of the root node might be ignored.
2349 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2350 bool TopToBottom,
2351 bool IgnoreReorder);
2352
2353 /// Sort loads into increasing pointers offsets to allow greater clustering.
2354 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2355
2356 /// Gets reordering data for the given tree entry. If the entry is vectorized
2357 /// - just return ReorderIndices, otherwise check if the scalars can be
2358 /// reordered and return the most optimal order.
2359 /// \return std::nullopt if ordering is not important, empty order, if
2360 /// identity order is important, or the actual order.
2361 /// \param TopToBottom If true, include the order of vectorized stores and
2362 /// insertelement nodes, otherwise skip them.
2363 /// \param IgnoreReorder true, if the root node order can be ignored.
2364 std::optional<OrdersType>
2365 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2366
2367 /// Checks if it is profitable to reorder the current tree.
2368 /// If the tree does not contain many profitable reorderable nodes, better to
2369 /// skip it to save compile time.
2370 bool isProfitableToReorder() const;
2371
2372 /// Reorders the current graph to the most profitable order starting from the
2373 /// root node to the leaf nodes. The best order is chosen only from the nodes
2374 /// of the same size (vectorization factor). Smaller nodes are considered
2375 /// parts of subgraph with smaller VF and they are reordered independently. We
2376 /// can make it because we still need to extend smaller nodes to the wider VF
2377 /// and we can merge reordering shuffles with the widening shuffles.
2378 void reorderTopToBottom();
2379
2380 /// Reorders the current graph to the most profitable order starting from
2381 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2382 /// number of reshuffles if the leaf nodes use the same order. In this case we
2383 /// can merge the orders and just shuffle user node instead of shuffling its
2384 /// operands. Plus, even the leaf nodes have different orders, it allows to
2385 /// sink reordering in the graph closer to the root node and merge it later
2386 /// during analysis.
2387 void reorderBottomToTop(bool IgnoreReorder = false);
2388
2389 /// \return The vector element size in bits to use when vectorizing the
2390 /// expression tree ending at \p V. If V is a store, the size is the width of
2391 /// the stored value. Otherwise, the size is the width of the largest loaded
2392 /// value reaching V. This method is used by the vectorizer to calculate
2393 /// vectorization factors.
2394 unsigned getVectorElementSize(Value *V);
2395
2396 /// Compute the minimum type sizes required to represent the entries in a
2397 /// vectorizable tree.
2399
2400 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2401 unsigned getMaxVecRegSize() const {
2402 return MaxVecRegSize;
2403 }
2404
2405 // \returns minimum vector register size as set by cl::opt.
2406 unsigned getMinVecRegSize() const {
2407 return MinVecRegSize;
2408 }
2409
2410 unsigned getMinVF(unsigned Sz) const {
2411 return std::max(2U, getMinVecRegSize() / Sz);
2412 }
2413
2414 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2415 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2416 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2417 return MaxVF ? MaxVF : UINT_MAX;
2418 }
2419
2420 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2421 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2422 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2423 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2424 ///
2425 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2426 unsigned canMapToVector(Type *T) const;
2427
2428 /// \returns True if the VectorizableTree is both tiny and not fully
2429 /// vectorizable. We do not vectorize such trees.
2430 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2431
2432 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2433 /// It may happen, if all gather nodes are loads and they cannot be
2434 /// "clusterized". In this case even subgraphs cannot be vectorized more
2435 /// effectively than the base graph.
2436 bool isTreeNotExtendable() const;
2437
2438 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2439 Align Alignment, const int64_t Diff,
2440 const size_t Sz) const;
2441
2442 /// Return true if an array of scalar loads can be replaced with a strided
2443 /// load (with constant stride).
2444 ///
2445 /// It is possible that the load gets "widened". Suppose that originally each
2446 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2447 /// constant):
     /// %b + 0 * %s + 0
     /// %b + 0 * %s + 1
     /// %b + 0 * %s + 2
2448 /// ...
2449 /// %b + 0 * %s + (w - 1)
2450 ///
2451 /// %b + 1 * %s + 0
2452 /// %b + 1 * %s + 1
2453 /// %b + 1 * %s + 2
2454 /// ...
2455 /// %b + 1 * %s + (w - 1)
2456 /// ...
2457 ///
2458 /// %b + (n - 1) * %s + 0
2459 /// %b + (n - 1) * %s + 1
2460 /// %b + (n - 1) * %s + 2
2461 /// ...
2462 /// %b + (n - 1) * %s + (w - 1)
2463 ///
2464 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2465 ///
2466 /// \param PointerOps list of pointer arguments of loads.
2467 /// \param ElemTy original scalar type of loads.
2468 /// \param Alignment alignment of the first load.
2469 /// \param SortedIndices is the order of PointerOps as returned by
2470 /// `sortPtrAccesses`
2471 /// \param Diff Pointer difference between the lowest and the highest pointer
2472 /// in `PointerOps` as returned by `getPointersDiff`.
2473 /// \param Ptr0 first pointer in `PointerOps`.
2474 /// \param PtrN last pointer in `PointerOps`.
2475 /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2476 /// of `SPtrInfo` necessary to generate the strided load later.
2478 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2479 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2480 Value *Ptr0, StridedPtrInfo &SPtrInfo) const;
2481
2482 /// Return true if an array of scalar loads can be replaced with a strided
2483 /// load (with run-time stride).
2484 /// \param PointerOps list of pointer arguments of loads.
2485 /// \param ScalarTy type of loads.
2486 /// \param CommonAlignment common alignment of loads as computed by
2487 /// `computeCommonAlignment<LoadInst>`.
2488 /// \param SortedIndices is a list of indices computed by this function such
2489 /// that the sequence `PointerOps[SortedIndices[0]],
2490 /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2491 /// ordered by the coefficient of the stride. For example, if PointerOps is
2492 /// `%base + %stride, %base, %base + 2 * %stride` the `SortedIndices` will be
2493 /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to be
2494 /// `0, 1, 2, 3, ...` we return empty vector for `SortedIndices`.
2495 /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2496 /// of `SPtrInfo` necessary to generate the strided load later.
2497 /// \param IsLoad Is this a strided load (true) or strided store (false)
2498 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2499 Align CommonAlignment,
2500 SmallVectorImpl<unsigned> &SortedIndices,
2501 StridedPtrInfo &SPtrInfo, bool IsLoad) const;
2502
2503 /// Checks if the given array of loads can be represented as a vectorized,
2504 /// scatter or just simple gather.
2505 /// \param VL list of loads.
2506 /// \param VL0 main load value.
2507 /// \param Order returned order of load instructions.
2508 /// \param PointerOps returned list of pointer operands.
2509 /// \param BestVF return best vector factor, if recursive check found better
2510 /// vectorization sequences rather than masked gather.
2511 /// \param TryRecursiveCheck used to check if long masked gather can be
2512 /// represented as a series of loads/insert subvector, if profitable.
2515 SmallVectorImpl<Value *> &PointerOps,
2516 StridedPtrInfo &SPtrInfo,
2517 unsigned *BestVF = nullptr,
2518 bool TryRecursiveCheck = true) const;
2519
2520 /// Checks whether some existing tree entry has scalars equal to \p VL.
2521 /// \p S is the common opcode of \p VL when one exists; an empty \p S means
2522 /// the values have no common opcode (mixed buildvector/gather candidates).
     /// \returns true as soon as one entry for which isSame(VL) holds is found,
     /// false otherwise.
2523 bool hasSameNode(const InstructionsState &S, ArrayRef<Value *> VL) const {
2524 auto IsSame = [&](const TreeEntry *TE) { return TE->isSame(VL); };
2525 if (S) {
2526 // Any vectorized or gather entry equal to VL must contain S.getMainOp()
2527 // (the representative instruction, which is also the recorded scalar
2528 // for copyable-elements bundles), so probing the MainOp-indexed maps
2529 // is sufficient and avoids scanning the whole tree.
2530 return any_of(getTreeEntries(S.getMainOp()), IsSame) ||
2531 any_of(ValueToGatherNodes.lookup(S.getMainOp()), IsSame);
2532 }
2533 // No common opcode: only gather entries can match. Each non-constant
2534 // value in VL has to be in the gather entry's scalar list and is
2535 // therefore present in ValueToGatherNodes. Probe by VL members instead
2536 // of scanning the whole tree (O(tree) -> O(|VL|)).
     // Visited ensures each candidate entry is compared at most once, even if
     // several values of VL map to it.
2538 for (Value *V : VL) {
2539 // Constants/poisons are not tracked in ValueToGatherNodes.
2540 if (isConstant(V))
2541 continue;
2542 for (const TreeEntry *TE : ValueToGatherNodes.lookup(V)) {
2543 if (!Visited.insert(TE).second)
2544 continue;
2545 if (IsSame(TE))
2546 return true;
2547 }
2548 }
2549 return false;
2550 }
2551
2552 /// Registers non-vectorizable sequence of loads
2553 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2554 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2555 }
2556
2557 /// Checks if the given loads sequence is known as not vectorizable, i.e.
     /// whether the hash of \p VL is in ListOfKnonwnNonVectorizableLoads.
2558 template <typename T>
2560 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
2561 }
2562
2564
2565 /// This structure holds any data we need about the edges being traversed
2566 /// during buildTreeRec(). We keep track of:
2567 /// (i) the user TreeEntry index, and
2568 /// (ii) the index of the edge.
2569 struct EdgeInfo {
2570 EdgeInfo() = default;
2571 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2573 /// The user TreeEntry.
2574 TreeEntry *UserTE = nullptr;
2575 /// The operand index of the use.
2576 unsigned EdgeIdx = UINT_MAX;
2577#ifndef NDEBUG
2579 const BoUpSLP::EdgeInfo &EI) {
2580 EI.dump(OS);
2581 return OS;
2582 }
2583 /// Debug print.
2584 void dump(raw_ostream &OS) const {
2585 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2586 << " EdgeIdx:" << EdgeIdx << "}";
2587 }
2588 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2589#endif
2590 bool operator == (const EdgeInfo &Other) const {
2591 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2592 }
2593
2594 operator bool() const { return UserTE != nullptr; }
2595 };
2596 friend struct DenseMapInfo<EdgeInfo>;
2597
2598 /// A helper class used for scoring candidates for two consecutive lanes.
2600 const TargetLibraryInfo &TLI;
2601 const DataLayout &DL;
2602 ScalarEvolution &SE;
2603 const BoUpSLP &R;
2604 int NumLanes; // Total number of lanes (aka vectorization factor).
2605 int MaxLevel; // The maximum recursion depth for accumulating score.
2606
2607 public:
2609 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2610 int MaxLevel)
2611 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2612 MaxLevel(MaxLevel) {}
2613
2614 // The hard-coded scores listed here are not very important, though it shall
2615 // be higher for better matches to improve the resulting cost. When
2616 // computing the scores of matching one sub-tree with another, we are
2617 // basically counting the number of values that are matching. So even if all
2618 // scores are set to 1, we would still get a decent matching result.
2619 // However, sometimes we have to break ties. For example we may have to
2620 // choose between matching loads vs matching opcodes. This is what these
2621 // scores are helping us with: they provide the order of preference. Also,
2622 // this is important if the scalar is externally used or used in another
2623 // tree entry node in the different lane.
2624
2625 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2626 static const int ScoreConsecutiveLoads = 4;
2627 /// The same load multiple times. This should have a better score than
2628 /// `ScoreSplat` because on x86, for a 2-lane vector, it can be represented
2629 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
2630 /// for a vector load and 1.0 for a broadcast.
2631 static const int ScoreSplatLoads = 3;
2632 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2633 static const int ScoreReversedLoads = 3;
2634 /// A load candidate for masked gather.
2635 static const int ScoreMaskedGatherCandidate = 1;
2636 /// ExtractElementInst from same vector and consecutive indexes.
2637 static const int ScoreConsecutiveExtracts = 4;
2638 /// ExtractElementInst from same vector and reversed indices.
2639 static const int ScoreReversedExtracts = 3;
2640 /// Constants.
2641 static const int ScoreConstants = 2;
2642 /// Instructions with the same opcode.
2643 static const int ScoreSameOpcode = 2;
2644 /// Instructions with alt opcodes (e.g, add + sub).
2645 static const int ScoreAltOpcodes = 1;
2646 /// Identical instructions (a.k.a. splat or broadcast).
2647 static const int ScoreSplat = 1;
2648 /// Matching with an undef is preferable to failing.
2649 static const int ScoreUndef = 1;
2650 /// Score for failing to find a decent match.
2651 static const int ScoreFail = 0;
2652 /// Score if all users are vectorized.
2653 static const int ScoreAllUserVectorized = 1;
2654
2655 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2656 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2657 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2658 /// MainAltOps.
2660 ArrayRef<Value *> MainAltOps) const {
2661 if (!isValidElementType(V1->getType()) ||
2664
2665 if (V1 == V2) {
2666 if (isa<LoadInst>(V1)) {
2667 // Retruns true if the users of V1 and V2 won't need to be extracted.
2668 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2669 // Bail out if we have too many uses to save compilation time.
2670 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
2671 return false;
2672
2673 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2674 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
2675 return U == U1 || U == U2 || R.isVectorized(U);
2676 });
2677 };
2678 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2679 };
2680 // A broadcast of a load can be cheaper on some targets.
2681 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2682 ElementCount::getFixed(NumLanes)) &&
2683 ((int)V1->getNumUses() == NumLanes ||
2684 AllUsersAreInternal(V1, V2)))
2686 }
2688 }
2689
2690 auto CheckSameEntryOrFail = [&]() {
2691 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1); !TEs1.empty()) {
2693 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
2694 !TEs2.empty() &&
2695 any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
2697 }
2699 };
2700
2701 auto *LI1 = dyn_cast<LoadInst>(V1);
2702 auto *LI2 = dyn_cast<LoadInst>(V2);
2703 if (LI1 && LI2) {
2704 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2705 !LI2->isSimple())
2706 return CheckSameEntryOrFail();
2707
2708 std::optional<int64_t> Dist = getPointersDiff(
2709 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2710 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2711 if (!Dist || *Dist == 0) {
2712 if (getUnderlyingObject(LI1->getPointerOperand()) ==
2713 getUnderlyingObject(LI2->getPointerOperand()) &&
2714 R.TTI->isLegalMaskedGather(
2715 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
2717 return CheckSameEntryOrFail();
2718 }
2719 // The distance is too large - still may be profitable to use masked
2720 // loads/gathers.
2721 if (std::abs(*Dist) > NumLanes / 2)
2723 // This still will detect consecutive loads, but we might have "holes"
2724 // in some cases. It is ok for non-power-2 vectorization and may produce
2725 // better results. It should not affect current vectorization.
2728 }
2729
2730 auto *C1 = dyn_cast<Constant>(V1);
2731 auto *C2 = dyn_cast<Constant>(V2);
2732 if (C1 && C2)
2734
2735 // Consider constants and buildvector compatible.
2736 if ((C1 && isa<InsertElementInst>(V2)) ||
2737 (C2 && isa<InsertElementInst>(V1)))
2739
2740 // Extracts from consecutive indexes of the same vector better score as
2741 // the extracts could be optimized away.
2742 Value *EV1;
2743 ConstantInt *Ex1Idx;
2744 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
2745 // Undefs are always profitable for extractelements.
2746 // Compiler can easily combine poison and extractelement <non-poison> or
2747 // undef and extractelement <poison>. But combining undef +
2748 // extractelement <non-poison-but-may-produce-poison> requires some
2749 // extra operations.
2750 if (isa<UndefValue>(V2))
2751 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
2754 Value *EV2 = nullptr;
2755 ConstantInt *Ex2Idx = nullptr;
2756 if (match(V2,
2758 m_Undef())))) {
2759 // Undefs are always profitable for extractelements.
2760 if (!Ex2Idx)
2762 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
2764 if (EV2 == EV1) {
2765 int Idx1 = Ex1Idx->getZExtValue();
2766 int Idx2 = Ex2Idx->getZExtValue();
2767 int Dist = Idx2 - Idx1;
2768 // The distance is too large - still may be profitable to use
2769 // shuffles.
2770 if (std::abs(Dist) == 0)
2772 if (std::abs(Dist) > NumLanes / 2)
2776 }
2778 }
2779 return CheckSameEntryOrFail();
2780 }
2781
2782 auto *I1 = dyn_cast<Instruction>(V1);
2783 auto *I2 = dyn_cast<Instruction>(V2);
2784 if (I1 && I2) {
2785 if (I1->getParent() != I2->getParent())
2786 return CheckSameEntryOrFail();
2787 Value *V;
2788 Value *Cond;
2789 // ZExt i1 to something must be considered same opcode for select i1
2790 // cmp, x, y
2791 // Required to better match the transformation after
2792 // BoUpSLP::matchesInversedZExtSelect analysis.
2793 if ((match(I1, m_ZExt(m_Value(V))) &&
2794 match(I2, m_Select(m_Value(Cond), m_Value(), m_Value())) &&
2795 V->getType() == Cond->getType()) ||
2796 (match(I2, m_ZExt(m_Value(V))) &&
2797 match(I1, m_Select(m_Value(Cond), m_Value(), m_Value())) &&
2798 V->getType() == Cond->getType()))
2800 SmallVector<Value *, 4> Ops(MainAltOps);
2801 Ops.push_back(I1);
2802 Ops.push_back(I2);
2803 InstructionsState S = getSameOpcode(Ops, TLI);
2804 // Note: Only consider instructions with <= 2 operands to avoid
2805 // complexity explosion.
2806 if (S &&
2807 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2808 !S.isAltShuffle()) &&
2809 all_of(Ops, [&S](Value *V) {
2810 return isa<PoisonValue>(V) ||
2811 cast<Instruction>(V)->getNumOperands() ==
2812 S.getMainOp()->getNumOperands();
2813 }))
2814 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2816 }
2817
2818 if (I1 && isa<PoisonValue>(V2))
2820
2821 if (isa<UndefValue>(V2))
2823
2824 return CheckSameEntryOrFail();
2825 }
2826
2827 /// Go through the operands of \p LHS and \p RHS recursively until
2828 /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are
2829 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2830 /// of \p U1 and \p U2), except at the beginning of the recursion where
2831 /// these are set to nullptr.
2832 ///
2833 /// For example:
2834 /// \verbatim
2835 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2836 /// \ / \ / \ / \ /
2837 /// + + + +
2838 /// G1 G2 G3 G4
2839 /// \endverbatim
2840 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2841 /// each level recursively, accumulating the score. It starts from matching
2842 /// the additions at level 0, then moves on to the loads (level 1). The
2843 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2844 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2845 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2846 /// Please note that the order of the operands does not matter, as we
2847 /// evaluate the score of all profitable combinations of operands. In
2848 /// other words the score of G1 and G4 is the same as G1 and G2. This
2849 /// heuristic is based on ideas described in:
2850 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2851 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2852 /// Luís F. W. Góes
2854 Instruction *U2, int CurrLevel,
2855 ArrayRef<Value *> MainAltOps) const {
2856
2857 // Get the shallow score of V1 and V2.
2858 int ShallowScoreAtThisLevel =
2859 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
2860
2861 // If reached MaxLevel,
2862 // or if V1 and V2 are not instructions,
2863 // or if they are SPLAT,
2864 // or if they are not consecutive,
2865 // or if profitable to vectorize loads or extractelements, early return
2866 // the current cost.
2867 auto *I1 = dyn_cast<Instruction>(LHS);
2868 auto *I2 = dyn_cast<Instruction>(RHS);
2869 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2870 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2871 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
2872 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2874 ShallowScoreAtThisLevel))
2875 return ShallowScoreAtThisLevel;
2876 assert(I1 && I2 && "Should have early exited.");
2877
2878 // Contains the I2 operand indexes that got matched with I1 operands.
2879 SmallSet<unsigned, 4> Op2Used;
2880
2881 // Recursion towards the operands of I1 and I2. We are trying all possible
2882 // operand pairs, and keeping track of the best score.
2883 if (I1->getNumOperands() != I2->getNumOperands())
2885 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2886 OpIdx1 != NumOperands1; ++OpIdx1) {
2887 // Try to pair op1I with the best operand of I2.
2888 int MaxTmpScore = 0;
2889 unsigned MaxOpIdx2 = 0;
2890 bool FoundBest = false;
2891 // If I2 is commutative try all combinations.
2892 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
2893 unsigned ToIdx = isCommutative(I2)
2894 ? I2->getNumOperands()
2895 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2896 assert(FromIdx <= ToIdx && "Bad index");
2897 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2898 // Skip operands already paired with OpIdx1.
2899 if (Op2Used.count(OpIdx2))
2900 continue;
2901 // Recursively calculate the cost at each level
2902 int TmpScore =
2903 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
2904 I1, I2, CurrLevel + 1, {});
2905 // Look for the best score.
2906 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2907 TmpScore > MaxTmpScore) {
2908 MaxTmpScore = TmpScore;
2909 MaxOpIdx2 = OpIdx2;
2910 FoundBest = true;
2911 }
2912 }
2913 if (FoundBest) {
2914 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2915 Op2Used.insert(MaxOpIdx2);
2916 ShallowScoreAtThisLevel += MaxTmpScore;
2917 }
2918 }
2919 return ShallowScoreAtThisLevel;
2920 }
2921 };
2922 /// A helper data structure to hold the operands of a vector of instructions.
2923 /// This supports a fixed vector length for all operand vectors.
2925 /// For each operand we need (i) the value, and (ii) the opcode that it
2926 /// would be attached to if the expression was in a left-linearized form.
2927 /// This is required to avoid illegal operand reordering.
2928 /// For example:
2929 /// \verbatim
2930 /// 0 Op1
2931 /// |/
2932 /// Op1 Op2 Linearized + Op2
2933 /// \ / ----------> |/
2934 /// - -
2935 ///
2936 /// Op1 - Op2 (0 + Op1) - Op2
2937 /// \endverbatim
2938 ///
2939 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2940 ///
2941 /// Another way to think of this is to track all the operations across the
2942 /// path from the operand all the way to the root of the tree and to
2943 /// calculate the operation that corresponds to this path. For example, the
2944 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2945 /// corresponding operation is a '-' (which matches the one in the
2946 /// linearized tree, as shown above).
2947 ///
2948 /// For lack of a better term, we refer to this operation as Accumulated
2949 /// Path Operation (APO).
2950 struct OperandData {
2951 OperandData() = default;
2952 OperandData(Value *V, bool APO, bool IsUsed)
2953 : V(V), APO(APO), IsUsed(IsUsed) {}
2954 /// The operand value.
2955 Value *V = nullptr;
2956 /// TreeEntries only allow a single opcode, or an alternate sequence of
2957 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
2958 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2959 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2960 /// (e.g., Add/Mul)
2961 bool APO = false;
2962 /// Helper data for the reordering function.
2963 bool IsUsed = false;
2964 };
2965
/// While reordering operands we pick, for each lane, the operand that best
/// matches the operand in the neighboring lane. What "best" means depends
/// on the kind of value being looked for: if the neighboring lane has a
/// load, for example, we look for a load from a consecutive address. The
/// 'ReorderingMode' enumerator names these strategies.
enum class ReorderingMode {
  /// Matching loads to consecutive memory addresses
  Load,
  /// Matching instructions based on opcode (same or alternate)
  Opcode,
  /// Matching constants
  Constant,
  /// Matching the same instruction multiple times (broadcast)
  Splat,
  /// We failed to create a vectorizable group
  Failed,
};
2979
2980 using OperandDataVec = SmallVector<OperandData, 2>;
2981
2982 /// A vector of operand vectors.
2984 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2985 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2986 unsigned ArgSize = 0;
2987
2988 const TargetLibraryInfo &TLI;
2989 const DataLayout &DL;
2990 ScalarEvolution &SE;
2991 const BoUpSLP &R;
2992 const Loop *L = nullptr;
2993
2994 /// \returns the operand data at \p OpIdx and \p Lane.
2995 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2996 return OpsVec[OpIdx][Lane];
2997 }
2998
2999 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
3000 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
3001 return OpsVec[OpIdx][Lane];
3002 }
3003
3004 /// Clears the used flag for all entries.
3005 void clearUsed() {
3006 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
3007 OpIdx != NumOperands; ++OpIdx)
3008 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
3009 ++Lane)
3010 OpsVec[OpIdx][Lane].IsUsed = false;
3011 }
3012
3013 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
3014 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
3015 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
3016 }
3017
3018 /// \param Lane lane of the operands under analysis.
3019 /// \param OpIdx operand index in \p Lane lane we're looking the best
3020 /// candidate for.
3021 /// \param Idx operand index of the current candidate value.
3022 /// \returns The additional score due to possible broadcasting of the
3023 /// elements in the lane. It is more profitable to have power-of-2 unique
3024 /// elements in the lane, it will be vectorized with higher probability
3025 /// after removing duplicates. Currently the SLP vectorizer supports only
3026 /// vectorization of the power-of-2 number of unique scalars.
3027 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
3028 const SmallBitVector &UsedLanes) const {
3029 Value *IdxLaneV = getData(Idx, Lane).V;
3030 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
3031 isa<ExtractElementInst>(IdxLaneV))
3032 return 0;
3034 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
3035 if (Ln == Lane)
3036 continue;
3037 Value *OpIdxLnV = getData(OpIdx, Ln).V;
3038 if (!isa<Instruction>(OpIdxLnV))
3039 return 0;
3040 Uniques.try_emplace(OpIdxLnV, Ln);
3041 }
3042 unsigned UniquesCount = Uniques.size();
3043 auto IdxIt = Uniques.find(IdxLaneV);
3044 unsigned UniquesCntWithIdxLaneV =
3045 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
3046 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
3047 auto OpIdxIt = Uniques.find(OpIdxLaneV);
3048 unsigned UniquesCntWithOpIdxLaneV =
3049 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
3050 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
3051 return 0;
3052 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
3053 UniquesCntWithOpIdxLaneV,
3054 UniquesCntWithOpIdxLaneV -
3055 bit_floor(UniquesCntWithOpIdxLaneV)) -
3056 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
3057 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
3058 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
3059 }
3060
3061 /// \param Lane lane of the operands under analysis.
3062 /// \param OpIdx operand index in \p Lane lane we're looking the best
3063 /// candidate for.
3064 /// \param Idx operand index of the current candidate value.
3065 /// \returns The additional score for the scalar which users are all
3066 /// vectorized.
3067 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
3068 Value *IdxLaneV = getData(Idx, Lane).V;
3069 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
3070 // Do not care about number of uses for vector-like instructions
3071 // (extractelement/extractvalue with constant indices), they are extracts
3072 // themselves and already externally used. Vectorization of such
3073 // instructions does not add extra extractelement instruction, just may
3074 // remove it.
3075 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
3076 isVectorLikeInstWithConstOps(OpIdxLaneV))
3078 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
3079 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
3080 return 0;
3081 return R.areAllUsersVectorized(IdxLaneI)
3083 : 0;
3084 }
3085
3086 /// Score scaling factor for fully compatible instructions but with
3087 /// different number of external uses. Allows better selection of the
3088 /// instructions with less external uses.
3089 static const int ScoreScaleFactor = 10;
3090
3091 /// \Returns the look-ahead score, which tells us how much the sub-trees
3092 /// rooted at \p LHS and \p RHS match, the more they match the higher the
3093 /// score. This helps break ties in an informed way when we cannot decide on
3094 /// the order of the operands by just considering the immediate
3095 /// predecessors.
3096 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
3097 int Lane, unsigned OpIdx, unsigned Idx,
3098 bool &IsUsed, const SmallBitVector &UsedLanes) {
3099 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
3101 // Keep track of the instruction stack as we recurse into the operands
3102 // during the look-ahead score exploration.
3103 int Score =
3104 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
3105 /*CurrLevel=*/1, MainAltOps);
3106 if (Score) {
3107 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
3108 if (Score <= -SplatScore) {
3109 // Failed score.
3110 Score = 0;
3111 } else {
3112 Score += SplatScore;
3113 // Scale score to see the difference between different operands
3114 // and similar operands but all vectorized/not all vectorized
3115 // uses. It does not affect actual selection of the best
3116 // compatible operand in general, just allows to select the
3117 // operand with all vectorized uses.
3118 Score *= ScoreScaleFactor;
3119 Score += getExternalUseScore(Lane, OpIdx, Idx);
3120 IsUsed = true;
3121 }
3122 }
3123 return Score;
3124 }
3125
3126 /// Best defined scores per lanes between the passes. Used to choose the
3127 /// best operand (with the highest score) between the passes.
3128 /// The key - {Operand Index, Lane}.
3129 /// The value - the best score between the passes for the lane and the
3130 /// operand.
3132 BestScoresPerLanes;
3133
3134 // Search all operands in Ops[*][Lane] for the one that matches best
3135 // Ops[OpIdx][LastLane] and return its opreand index.
3136 // If no good match can be found, return std::nullopt.
3137 std::optional<unsigned>
3138 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
3139 ArrayRef<ReorderingMode> ReorderingModes,
3140 ArrayRef<Value *> MainAltOps,
3141 const SmallBitVector &UsedLanes) {
3142 unsigned NumOperands = getNumOperands();
3143
3144 // The operand of the previous lane at OpIdx.
3145 Value *OpLastLane = getData(OpIdx, LastLane).V;
3146
3147 // Our strategy mode for OpIdx.
3148 ReorderingMode RMode = ReorderingModes[OpIdx];
3149 if (RMode == ReorderingMode::Failed)
3150 return std::nullopt;
3151
3152 // The linearized opcode of the operand at OpIdx, Lane.
3153 bool OpIdxAPO = getData(OpIdx, Lane).APO;
3154
3155 // The best operand index and its score.
3156 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
3157 // are using the score to differentiate between the two.
3158 struct BestOpData {
3159 std::optional<unsigned> Idx;
3160 unsigned Score = 0;
3161 } BestOp;
3162 BestOp.Score =
3163 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
3164 .first->second;
3165
3166 // Track if the operand must be marked as used. If the operand is set to
3167 // Score 1 explicitly (because of non power-of-2 unique scalars, we may
3168 // want to reestimate the operands again on the following iterations).
3169 bool IsUsed = RMode == ReorderingMode::Splat ||
3170 RMode == ReorderingMode::Constant ||
3171 RMode == ReorderingMode::Load;
3172 // Iterate through all unused operands and look for the best.
3173 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
3174 // Get the operand at Idx and Lane.
3175 OperandData &OpData = getData(Idx, Lane);
3176 Value *Op = OpData.V;
3177 bool OpAPO = OpData.APO;
3178
3179 // Skip already selected operands.
3180 if (OpData.IsUsed)
3181 continue;
3182
3183 // Skip if we are trying to move the operand to a position with a
3184 // different opcode in the linearized tree form. This would break the
3185 // semantics.
3186 if (OpAPO != OpIdxAPO)
3187 continue;
3188
3189 // Look for an operand that matches the current mode.
3190 switch (RMode) {
3191 case ReorderingMode::Load:
3192 case ReorderingMode::Opcode: {
3193 bool LeftToRight = Lane > LastLane;
3194 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
3195 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
3196 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
3197 OpIdx, Idx, IsUsed, UsedLanes);
3198 if (Score > static_cast<int>(BestOp.Score) ||
3199 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
3200 Idx == OpIdx)) {
3201 BestOp.Idx = Idx;
3202 BestOp.Score = Score;
3203 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
3204 }
3205 break;
3206 }
3207 case ReorderingMode::Constant:
3208 if (isa<Constant>(Op) ||
3209 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
3210 BestOp.Idx = Idx;
3211 if (isa<Constant>(Op)) {
3213 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3215 }
3217 IsUsed = false;
3218 }
3219 break;
3220 case ReorderingMode::Splat:
3221 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
3222 IsUsed = Op == OpLastLane;
3223 if (Op == OpLastLane) {
3224 BestOp.Score = LookAheadHeuristics::ScoreSplat;
3225 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
3227 }
3228 BestOp.Idx = Idx;
3229 }
3230 break;
3231 case ReorderingMode::Failed:
3232 llvm_unreachable("Not expected Failed reordering mode.");
3233 }
3234 }
3235
3236 if (BestOp.Idx) {
3237 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
3238 return BestOp.Idx;
3239 }
3240 // If we could not find a good match return std::nullopt.
3241 return std::nullopt;
3242 }
3243
3244 /// Helper for reorderOperandVecs.
3245 /// \returns the lane that we should start reordering from. This is the one
3246 /// which has the least number of operands that can freely move about or
3247 /// less profitable because it already has the most optimal set of operands.
3248 unsigned getBestLaneToStartReordering() const {
3249 unsigned Min = UINT_MAX;
3250 unsigned SameOpNumber = 0;
3251 // std::pair<unsigned, unsigned> is used to implement a simple voting
3252 // algorithm and choose the lane with the least number of operands that
3253 // can freely move about or less profitable because it already has the
3254 // most optimal set of operands. The first unsigned is a counter for
3255 // voting, the second unsigned is the counter of lanes with instructions
3256 // with same/alternate opcodes and same parent basic block.
3258 // Try to be closer to the original results, if we have multiple lanes
3259 // with same cost. If 2 lanes have the same cost, use the one with the
3260 // highest index.
3261 for (int I = getNumLanes(); I > 0; --I) {
3262 unsigned Lane = I - 1;
3263 OperandsOrderData NumFreeOpsHash =
3264 getMaxNumOperandsThatCanBeReordered(Lane);
3265 // Compare the number of operands that can move and choose the one with
3266 // the least number.
3267 if (NumFreeOpsHash.NumOfAPOs < Min) {
3268 Min = NumFreeOpsHash.NumOfAPOs;
3269 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3270 HashMap.clear();
3271 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3272 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3273 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3274 // Select the most optimal lane in terms of number of operands that
3275 // should be moved around.
3276 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3277 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3278 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3279 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3280 auto [It, Inserted] =
3281 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3282 if (!Inserted)
3283 ++It->second.first;
3284 }
3285 }
3286 // Select the lane with the minimum counter.
3287 unsigned BestLane = 0;
3288 unsigned CntMin = UINT_MAX;
3289 for (const auto &Data : reverse(HashMap)) {
3290 if (Data.second.first < CntMin) {
3291 CntMin = Data.second.first;
3292 BestLane = Data.second.second;
3293 }
3294 }
3295 return BestLane;
3296 }
3297
/// Per-lane summary used when choosing the lane to start operand
/// reordering from.
struct OperandsOrderData {
  /// The best number of operands that share the same APO and can therefore
  /// be reordered.
  unsigned NumOfAPOs{UINT_MAX};
  /// How many operands have the same/alternate instruction opcode and the
  /// same parent.
  unsigned NumOpsWithSameOpcodeParent{0};
  /// Hash of the actual operands ordering.
  /// It encodes the operands' position ids and opcode values and feeds the
  /// voting mechanism that finds the lane with the least number of operands
  /// that can freely move about or less profitable because it already has
  /// the most optimal set of operands. A SmallVector<unsigned> could be
  /// used instead, but a hash code is faster and requires less memory.
  unsigned Hash{0};
};
3315 /// \returns the maximum number of operands that are allowed to be reordered
3316 /// for \p Lane and the number of compatible instructions(with the same
3317 /// parent/opcode). This is used as a heuristic for selecting the first lane
3318 /// to start operand reordering.
3319 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3320 unsigned CntTrue = 0;
3321 unsigned NumOperands = getNumOperands();
3322 // Operands with the same APO can be reordered. We therefore need to count
3323 // how many of them we have for each APO, like this: Cnt[APO] = x.
3324 // Since we only have two APOs, namely true and false, we can avoid using
3325 // a map. Instead we can simply count the number of operands that
3326 // correspond to one of them (in this case the 'true' APO), and calculate
3327 // the other by subtracting it from the total number of operands.
3328 // Operands with the same instruction opcode and parent are more
3329 // profitable since we don't need to move them in many cases, with a high
3330 // probability such lane already can be vectorized effectively.
3331 bool AllUndefs = true;
3332 unsigned NumOpsWithSameOpcodeParent = 0;
3333 Instruction *OpcodeI = nullptr;
3334 BasicBlock *Parent = nullptr;
3335 unsigned Hash = 0;
3336 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3337 const OperandData &OpData = getData(OpIdx, Lane);
3338 if (OpData.APO)
3339 ++CntTrue;
3340 // Use Boyer-Moore majority voting for finding the majority opcode and
3341 // the number of times it occurs.
3342 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
3343 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
3344 I->getParent() != Parent) {
3345 if (NumOpsWithSameOpcodeParent == 0) {
3346 NumOpsWithSameOpcodeParent = 1;
3347 OpcodeI = I;
3348 Parent = I->getParent();
3349 } else {
3350 --NumOpsWithSameOpcodeParent;
3351 }
3352 } else {
3353 ++NumOpsWithSameOpcodeParent;
3354 }
3355 }
3356 Hash = hash_combine(
3357 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
3358 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
3359 }
3360 if (AllUndefs)
3361 return {};
3362 OperandsOrderData Data;
3363 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3364 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3365 Data.Hash = Hash;
3366 return Data;
3367 }
3368
3369 /// Go through the instructions in VL and append their operands.
     /// \param VL scalars of the bundle, one per lane.
     /// \param Operands operand values, indexed as Operands[OpIdx][Lane].
     /// \param S instructions state for \p VL; asserted to be valid.
3370 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3371 const InstructionsState &S) {
3372 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3373 assert((empty() || all_of(Operands,
3374 [this](const ValueList &VL) {
3375 return VL.size() == getNumLanes();
3376 })) &&
3377 "Expected same number of lanes");
3378 assert(S.valid() && "InstructionsState is invalid.");
3379 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3380 // arguments to the intrinsic produces the same result.
3381 Instruction *MainOp = S.getMainOp();
3382 unsigned NumOperands = MainOp->getNumOperands();
3384 OpsVec.resize(ArgSize);
3385 unsigned NumLanes = VL.size();
3386 for (OperandDataVec &Ops : OpsVec)
3387 Ops.resize(NumLanes);
3388 for (unsigned Lane : seq<unsigned>(NumLanes)) {
3389 // Our tree has just 3 nodes: the root and two operands.
3390 // It is therefore trivial to get the APO. We only need to check the
3391 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3392 // operand. The LHS operand of both add and sub is never attached to an
3393 // inverse operation in the linearized form, therefore its APO is
3394 // false. The RHS is true only if V is an inverse operation.
3395
3396 // Since operand reordering is performed on groups of commutative
3397 // operations or alternating sequences (e.g., +, -), we can safely tell
3398 // the inverse operations by checking commutativity.
3399 auto *I = dyn_cast<Instruction>(VL[Lane]);
3400 if (!I && isa<PoisonValue>(VL[Lane])) {
     // OpsVec holds exactly ArgSize operand vectors (see the resize above), so
     // iterate over ArgSize rather than the raw operand count of MainOp. This
     // keeps the poison-lane path in bounds and consistent with the regular
     // path below.
3401 for (unsigned OpIdx : seq<unsigned>(ArgSize))
3402 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3403 continue;
3404 }
3405 bool IsInverseOperation = false;
3406 if (S.isCopyableElement(VL[Lane])) {
3407 // The value is a copyable element.
3408 IsInverseOperation =
3409 !isCommutative(MainOp, VL[Lane], /*IsCopyable=*/true);
3410 } else {
3411 assert(I && "Expected instruction");
3412 auto [SelectedOp, Ops] = convertTo(I, S);
3413 // We cannot check commutativity by the converted instruction
3414 // (SelectedOp) because isCommutative also examines def-use
3415 // relationships.
3416 IsInverseOperation = !isCommutative(SelectedOp, I);
3417 }
     // Operand 0 always gets APO == false; the remaining operands inherit the
     // "inverse operation" property of the lane.
3418 for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
3419 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3420 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3421 }
3422 }
3423 }
3424
3425 /// \returns the number of operands, i.e. the stored ArgSize.
3426 unsigned getNumOperands() const { return ArgSize; }
3427
3428 /// \returns the number of lanes, taken from the size of the first operand
     /// vector. OpsVec must not be empty when this is called.
3429 unsigned getNumLanes() const { return OpsVec[0].size(); }
3430
3431 /// \returns the operand value at \p OpIdx and \p Lane, i.e. the V member of
     /// the operand data stored for that position.
3432 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3433 return getData(OpIdx, Lane).V;
3434 }
3435
3436 /// \returns true if the data structure is empty, i.e. OpsVec holds no
     /// operand vectors.
3437 bool empty() const { return OpsVec.empty(); }
3438
3439 /// Clears the data by dropping all operand vectors from OpsVec.
3440 void clear() { OpsVec.clear(); }
3441
3442 /// \Returns true if there are enough operands identical to \p Op to fill
3443 /// the whole vector (it is mixed with constants or loop invariant values).
3444 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
     /// \param Op the candidate value; must equal getValue(OpIdx, Lane).
     /// \param OpIdx operand index of \p Op.
     /// \param Lane lane of \p Op; every other lane is searched for a match.
3445 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3446 assert(Op == getValue(OpIdx, Lane) &&
3447 "Op is expected to be getValue(OpIdx, Lane).");
3448 // Small number of loads - try load matching.
3449 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
3450 return false;
3451 bool OpAPO = getData(OpIdx, Lane).APO;
3452 bool IsInvariant = L && L->isLoopInvariant(Op);
     // Number of other lanes in which the very same value \p Op was found.
3453 unsigned Cnt = 0;
3454 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3455 if (Ln == Lane)
3456 continue;
3457 // This is set to true if we found a candidate for broadcast at Lane.
3458 bool FoundCandidate = false;
3459 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3460 OperandData &Data = getData(OpI, Ln);
     // Only unused operands with the same APO as \p Op are considered.
3461 if (Data.APO != OpAPO || Data.IsUsed)
3462 continue;
3463 Value *OpILane = getValue(OpI, Lane);
3464 bool IsConstantOp = isa<Constant>(OpILane);
3465 // Consider the broadcast candidate if:
3466 // 1. Same value is found in one of the operands.
3467 if (Data.V == Op ||
3468 // 2. The operand in the given lane is not constant but there is a
3469 // constant operand in another lane (which can be moved to the
3470 // given lane). In this case we can represent it as a simple
3471 // permutation of constant and broadcast.
3472 (!IsConstantOp &&
3473 ((Lns > 2 && isa<Constant>(Data.V)) ||
3474 // 2.1. If we have only 2 lanes, need to check that value in the
3475 // next lane does not build same opcode sequence.
3476 (Lns == 2 &&
3477 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
3478 isa<Constant>(Data.V)))) ||
3479 // 3. The operand in the current lane is loop invariant (can be
3480 // hoisted out) and another operand is also a loop invariant
3481 // (though not a constant). In this case the whole vector can be
3482 // hoisted out.
3483 // FIXME: need to teach the cost model about this case for better
3484 // estimation.
3485 (IsInvariant && !isa<Constant>(Data.V) &&
3486 !getSameOpcode({Op, Data.V}, TLI) &&
3487 L->isLoopInvariant(Data.V))) {
3488 FoundCandidate = true;
     // Only an exact match is marked as used and counted; constant and
     // loop-invariant candidates satisfy the lane without being consumed.
3489 Data.IsUsed = Data.V == Op;
3490 if (Data.V == Op)
3491 ++Cnt;
3492 break;
3493 }
3494 }
     // A single lane without any candidate rules the broadcast out.
3495 if (!FoundCandidate)
3496 return false;
3497 }
     // With two lanes any candidate suffices; otherwise the same value must
     // have been found in more than one other lane.
3498 return getNumLanes() == 2 || Cnt > 1;
3499 }
3500
3501 /// Checks if there is at least single compatible operand in lanes other
3502 /// than \p Lane, compatible with the operand \p Op.
     /// \param Op the instruction to match; must equal getValue(OpIdx, Lane).
     /// \param OpIdx operand index of \p Op.
     /// \param Lane lane of \p Op; it is excluded from the search.
     /// \returns true as soon as one other lane has an operand accepted by the
     /// predicate below, false if no such lane exists.
3503 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3504 assert(Op == getValue(OpIdx, Lane) &&
3505 "Op is expected to be getValue(OpIdx, Lane).");
3506 bool OpAPO = getData(OpIdx, Lane).APO;
3507 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3508 if (Ln == Lane)
3509 continue;
3510 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
3511 const OperandData &Data = getData(OpI, Ln);
     // An operand with a different APO, or one that is already used, makes
     // the predicate succeed for this lane.
3512 if (Data.APO != OpAPO || Data.IsUsed)
3513 return true;
3514 Value *OpILn = getValue(OpI, Ln);
     // Otherwise accept a loop-invariant operand, or one that has the same
     // opcode as \p Op and lives in the same basic block.
3515 return (L && L->isLoopInvariant(OpILn)) ||
3516 (getSameOpcode({Op, OpILn}, TLI) &&
3517 allSameBlock({Op, OpILn}));
3518 }))
3519 return true;
3520 }
3521 return false;
3522 }
3523
3524 public:
3525 /// Initialize with all the operands of the instruction vector \p RootVL.
     /// TLI, DL and SE are taken from \p R, and L is set to the loop (if any)
     /// that contains the parent block of the main instruction of \p S.
3527 const InstructionsState &S, const BoUpSLP &R)
3528 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3529 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
3530 // Append all the operands of RootVL.
3531 appendOperands(RootVL, Operands, S);
3532 }
3533
3534 /// \Returns a value vector with the operands across all lanes for the
3535 /// operand at \p OpIdx.
     /// The result is a copy: element [Lane] is the V member stored for
     /// (\p OpIdx, Lane).
3536 ValueList getVL(unsigned OpIdx) const {
3537 ValueList OpVL(OpsVec[OpIdx].size());
3538 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3539 "Expected same num of lanes across all operands");
3540 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3541 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3542 return OpVL;
3543 }
3544
3545 // Performs operand reordering for 2 or more operands.
3546 // The original operands are in OrigOps[OpIdx][Lane].
3547 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
     // The reordering is done in place: operands are swapped within a lane via
     // swap(OpIdx, BestIdx, Lane).
3548 void reorder() {
3549 unsigned NumOperands = getNumOperands();
3550 unsigned NumLanes = getNumLanes();
3551 // Each operand has its own mode. We are using this mode to help us select
3552 // the instructions for each lane, so that they match best with the ones
3553 // we have selected so far.
3554 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3555
3556 // This is a greedy single-pass algorithm. We are going over each lane
3557 // once and deciding on the best order right away with no back-tracking.
3558 // However, in order to increase its effectiveness, we start with the lane
3559 // that has operands that can move the least. For example, given the
3560 // following lanes:
3561 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3562 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3563 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3564 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3565 // we will start at Lane 1, since the operands of the subtraction cannot
3566 // be reordered. Then we will visit the rest of the lanes in a circular
3567 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3568
3569 // Find the first lane that we will start our search from.
3570 unsigned FirstLane = getBestLaneToStartReordering();
3571
3572 // Initialize the modes.
3573 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3574 Value *OpLane0 = getValue(OpIdx, FirstLane);
3575 // Keep track if we have instructions with all the same opcode on one
3576 // side.
3577 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
3578 // Check if OpLane0 should be broadcast.
3579 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
3580 !canBeVectorized(OpILane0, OpIdx, FirstLane))
3581 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3582 else if (isa<LoadInst>(OpILane0))
3583 ReorderingModes[OpIdx] = ReorderingMode::Load;
3584 else
3585 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3586 } else if (isa<Constant>(OpLane0)) {
3587 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3588 } else if (isa<Argument>(OpLane0)) {
3589 // Our best hope is a Splat. It may save some cost in some cases.
3590 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3591 } else {
3592 llvm_unreachable("Unexpected value kind.");
3593 }
3594 }
3595
3596 // Check that we don't have same operands. No need to reorder if operands
3597 // are just perfect diamond or shuffled diamond match. Do not do it only
3598 // for possible broadcasts.
3599 auto &&SkipReordering = [this]() {
3600 SmallPtrSet<Value *, 4> UniqueValues;
3601 ArrayRef<OperandData> Op0 = OpsVec.front();
3602 for (const OperandData &Data : Op0)
3603 UniqueValues.insert(Data.V);
3605 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
     // Any value that does not appear in operand 0 means reordering is still
     // needed.
3606 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
3607 return !UniqueValues.contains(Data.V);
3608 }))
3609 return false;
3610 }
     // Exactly two unique values is the possible-broadcast case, for which
     // reordering is not skipped.
3611 return UniqueValues.size() != 2;
3612 };
3613
3614 // If the initial strategy fails for any of the operand indexes, then we
3615 // perform reordering again in a second pass. This helps avoid assigning
3616 // high priority to the failed strategy, and should improve reordering for
3617 // the non-failed operand indexes.
3618 for (int Pass = 0; Pass != 2; ++Pass) {
3619 // Check if no need to reorder operands since they are perfect or
3620 // shuffled diamond match.
3621 // Need to do it to avoid extra external use cost counting for
3622 // shuffled matches, which may cause regressions.
3623 if (SkipReordering())
3624 break;
3625 // Skip the second pass if the first pass did not fail.
3626 bool StrategyFailed = false;
3627 // Mark all operand data as free to use.
3628 clearUsed();
3629 // We keep the original operand order for the FirstLane, so reorder the
3630 // rest of the lanes. We are visiting the nodes in a circular fashion,
3631 // using FirstLane as the center point and increasing the radius
3632 // distance.
3633 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3634 for (unsigned I = 0; I < NumOperands; ++I)
3635 MainAltOps[I].push_back(getData(I, FirstLane).V);
3636
3637 SmallBitVector UsedLanes(NumLanes);
3638 UsedLanes.set(FirstLane);
3639 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3640 // Visit the lane on the right and then the lane on the left.
3641 for (int Direction : {+1, -1}) {
3642 int Lane = FirstLane + Direction * Distance;
3643 if (Lane < 0 || Lane >= (int)NumLanes)
3644 continue;
3645 UsedLanes.set(Lane);
     // The neighbouring lane towards FirstLane, which was already processed.
3646 int LastLane = Lane - Direction;
3647 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3648 "Out of bounds");
3649 // Look for a good match for each operand.
3650 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3651 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3652 std::optional<unsigned> BestIdx =
3653 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3654 MainAltOps[OpIdx], UsedLanes);
3655 // By not selecting a value, we allow the operands that follow to
3656 // select a better matching value. We will get a non-null value in
3657 // the next run of getBestOperand().
3658 if (BestIdx) {
3659 // Swap the current operand with the one returned by
3660 // getBestOperand().
3661 swap(OpIdx, *BestIdx, Lane);
3662 } else {
3663 // Enable the second pass.
3664 StrategyFailed = true;
3665 }
3666 // Try to get the alternate opcode and follow it during analysis.
3667 if (MainAltOps[OpIdx].size() != 2) {
3668 OperandData &AltOp = getData(OpIdx, Lane);
3669 InstructionsState OpS =
3670 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3671 if (OpS && OpS.isAltShuffle())
3672 MainAltOps[OpIdx].push_back(AltOp.V);
3673 }
3674 }
3675 }
3676 }
3677 // Skip second pass if the strategy did not fail.
3678 if (!StrategyFailed)
3679 break;
3680 }
3681 }
3682
3683#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
     /// \returns a printable name for the reordering mode \p RMode.
3684 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3685 switch (RMode) {
3686 case ReorderingMode::Load:
3687 return "Load";
3688 case ReorderingMode::Opcode:
3689 return "Opcode";
3690 case ReorderingMode::Constant:
3691 return "Constant";
3692 case ReorderingMode::Splat:
3693 return "Splat";
3694 case ReorderingMode::Failed:
3695 return "Failed";
3696 }
3697 llvm_unreachable("Unimplemented Reordering Type");
3698 }
3699
     /// Writes the name of \p RMode to \p OS and returns \p OS.
3700 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3701 raw_ostream &OS) {
3702 return OS << getModeStr(RMode);
3703 }
3704
3705 /// Debug print of \p RMode to dbgs().
3706 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3707 printMode(RMode, dbgs());
3708 }
3709
     /// Stream insertion for a reordering mode; forwards to printMode().
3710 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3711 return printMode(RMode, OS);
3712 }
3713
     /// Prints every operand vector as "Operand <index>" followed by one
     /// "{value, APO:<flag>}" line per lane ("null" is printed for a null
     /// value), and returns \p OS.
3715 const unsigned Indent = 2;
3716 unsigned Cnt = 0;
3717 for (const OperandDataVec &OpDataVec : OpsVec) {
3718 OS << "Operand " << Cnt++ << "\n";
3719 for (const OperandData &OpData : OpDataVec) {
3720 OS.indent(Indent) << "{";
3721 if (Value *V = OpData.V)
3722 OS << *V;
3723 else
3724 OS << "null";
3725 OS << ", APO:" << OpData.APO << "}\n";
3726 }
3727 OS << "\n";
3728 }
3729 return OS;
3730 }
3731
3732 /// Debug print of all operands to dbgs().
3733 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3734#endif
3735 };
3736
3737 /// Evaluate each pair in \p Candidates and return index into \p Candidates
3738 /// for a pair which have highest score deemed to have best chance to form
3739 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
3740 /// scored above \p Limit (LookAheadHeuristics::ScoreFail by default).
3741 /// \param Limit Lower limit of the cost, considered to be good enough score.
     /// \returns a pair of the best index (or std::nullopt) and the best score;
     /// the score equals \p Limit when no candidate scored above it.
3742 std::pair<std::optional<int>, int>
3743 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3744 int Limit = LookAheadHeuristics::ScoreFail) const {
3745 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3747 int BestScore = Limit;
3748 std::optional<int> Index;
3749 for (int I : seq<int>(0, Candidates.size())) {
3750 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
3751 Candidates[I].second,
3752 /*U1=*/nullptr, /*U2=*/nullptr,
3753 /*CurrLevel=*/1, {});
     // Strict comparison: among equal scores the first candidate wins.
3754 if (Score > BestScore) {
3755 BestScore = Score;
3756 Index = I;
3757 }
3758 }
3759 return std::make_pair(Index, BestScore);
3760 }
3761
3762 /// Checks if the instruction is marked for deletion, i.e. it is present in
     /// DeletedInstructions.
3763 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
3764
3765 /// Removes an instruction from its block and eventually deletes it.
3766 /// It's like Instruction::eraseFromParent() except that the actual deletion
3767 /// is delayed until BoUpSLP is destructed.
     /// The visible body only records the instruction in DeletedInstructions.
3769 DeletedInstructions.insert(I);
3770 }
3771
3772 /// Remove instructions from the parent function and clear the operands of \p
3773 /// DeadVals instructions, marking for deletion trivially dead operands.
     /// \param DeadVals values to remove; each one is cast to Instruction.
     /// \param VectorValuesAndScales tracked vector values; a vector-typed
     /// operand found in this list is not queued for deletion.
3774 template <typename T>
3776 ArrayRef<T *> DeadVals,
3777 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
3778 VectorValuesAndScales) {
3780 for (T *V : DeadVals) {
3781 auto *I = cast<Instruction>(V);
3783 }
     // Visit each distinct dead value once, collect operands that become dead
     // with it, and drop its references.
3784 DenseSet<Value *> Processed;
3785 for (T *V : DeadVals) {
3786 if (!V || !Processed.insert(V).second)
3787 continue;
3788 auto *I = cast<Instruction>(V);
3790 ArrayRef<TreeEntry *> Entries = getTreeEntries(I);
3791 for (Use &U : I->operands()) {
3792 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
3793 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3795 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3796 return Entry->VectorizedValue == OpI;
3797 })))
3798 DeadInsts.push_back(OpI);
3799 }
3800 I->dropAllReferences();
3801 }
     // Unlink the dead values that are still attached to a block.
3802 for (T *V : DeadVals) {
3803 auto *I = cast<Instruction>(V);
3804 if (!I->getParent())
3805 continue;
3806 assert((I->use_empty() || all_of(I->uses(),
3807 [&](Use &U) {
3808 return isDeleted(
3809 cast<Instruction>(U.getUser()));
3810 })) &&
3811 "trying to erase instruction with users.");
3812 I->removeFromParent();
3813 SE->forgetValue(I);
3814 }
3815 // Process the dead instruction list until empty.
3816 while (!DeadInsts.empty()) {
3817 Value *V = DeadInsts.pop_back_val();
3819 if (!VI || !VI->getParent())
3820 continue;
3822 "Live instruction found in dead worklist!");
3823 assert(VI->use_empty() && "Instructions with uses are not dead.");
3824
3825 // Don't lose the debug info while deleting the instructions.
3826 salvageDebugInfo(*VI);
3827
3828 // Null out all of the instruction's operands to see if any operand
3829 // becomes dead as we go.
3830 for (Use &OpU : VI->operands()) {
3831 Value *OpV = OpU.get();
3832 if (!OpV)
3833 continue;
3834 OpU.set(nullptr);
3835
3836 if (!OpV->use_empty())
3837 continue;
3838
3839 // If the operand is an instruction that became dead as we nulled out
3840 // the operand, and if it is 'trivially' dead, delete it in a future
3841 // loop iteration.
3842 if (auto *OpI = dyn_cast<Instruction>(OpV))
3843 if (!DeletedInstructions.contains(OpI) &&
3844 (!OpI->getType()->isVectorTy() ||
3845 none_of(
3846 VectorValuesAndScales,
3847 [&](const std::tuple<WeakTrackingVH, unsigned, bool, bool>
3848 &V) { return std::get<0>(V) == OpI; })) &&
3850 DeadInsts.push_back(OpI);
3851 }
3852
3853 VI->removeFromParent();
3854 eraseInstruction(VI);
3855 SE->forgetValue(VI);
3856 }
3857 }
3858
3859 /// Checks if the instruction was already analyzed for being possible
3860 /// reduction root, i.e. it is present in AnalyzedReductionsRoots.
3862 return AnalyzedReductionsRoots.count(I);
3863 }
3864 /// Register given instruction as already analyzed for being possible
3865 /// reduction root.
3867 AnalyzedReductionsRoots.insert(I);
3868 }
3869 /// Checks if the provided list of reduced values was checked already for
3870 /// vectorization. Lists are identified by hash_value(VL) only, so two lists
     /// with the same hash are treated as the same list.
3872 return AnalyzedReductionVals.contains(hash_value(VL));
3873 }
3874 /// Adds the list of reduced values to list of already checked values for the
3875 /// vectorization. Only hash_value(VL) is stored.
3877 AnalyzedReductionVals.insert(hash_value(VL));
3878 }
3879 /// Clear the cached reduction analysis data: the analyzed reduction roots,
     /// the analyzed reduced-value lists and AnalyzedMinBWVals.
3881 AnalyzedReductionsRoots.clear();
3882 AnalyzedReductionVals.clear();
3883 AnalyzedMinBWVals.clear();
3884 }
3885 /// Checks if any of the values gathered in the nodes (MustGather) is
     /// contained in \p Vals.
3886 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3887 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
3888 }
3889 /// Checks if the given value is gathered in one of the nodes, i.e. it is
     /// present in MustGather.
3890 bool isGathered(const Value *V) const {
3891 return MustGather.contains(V);
3892 }
3893 /// Checks if the specified value was not scheduled, i.e. it is present in
     /// NonScheduledFirst.
3894 bool isNotScheduled(const Value *V) const {
3895 return NonScheduledFirst.contains(V);
3896 }
3897
3898 /// Check if the value is vectorized in the tree.
     /// \param V value to look up; must not be null.
     /// \returns true if at least one tree entry for \p V is neither in
     /// DeletedNodes nor in TransformedToGatherNodes.
3899 bool isVectorized(const Value *V) const {
3900 assert(V && "V cannot be nullptr.");
3901 ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
3902 return any_of(Entries, [&](const TreeEntry *E) {
3903 return !DeletedNodes.contains(E) && !TransformedToGatherNodes.contains(E);
3904 });
3905 }
3906
3907 /// Checks if it is legal and profitable to build SplitVectorize node for the
3908 /// given \p VL.
3909 /// \param Op1 first homogeneous scalars.
3910 /// \param Op2 second homogeneous scalars.
3911 /// \param ReorderIndices indices to reorder the scalars.
3912 /// \returns true if the node was successfully built.
3914 const InstructionsState &LocalState,
3917 OrdersType &ReorderIndices) const;
3918
3919 ~BoUpSLP();
3920
3921private:
3922 /// Determine if a node \p E can be demoted to a smaller type with a
3923 /// truncation. We collect the entries that will be demoted in ToDemote.
3924 /// \param E Node for analysis
3925 /// \param ToDemote indices of the nodes to be demoted.
3926 bool collectValuesToDemote(
3927 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3929 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3930 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3931
3932 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3933 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3934 /// they have only one user and are reorderable).
3935 /// \param ReorderableGathers List of all gather nodes that require reordering
3936 /// (e.g., gather of extractelements or partially vectorizable loads).
3937 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3938 /// reordering, subset of \p NonVectorized.
3939 void buildReorderableOperands(
3940 TreeEntry *UserTE,
3941 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3942 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3943 SmallVectorImpl<TreeEntry *> &GatherOps);
3944
3945 /// Checks if the given \p TE is a gather node with clustered reused scalars
3946 /// and reorders it per given \p Mask.
3947 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3948
3949 /// Checks if all users of \p I are the part of the vectorization tree.
3950 bool areAllUsersVectorized(
3951 Instruction *I,
3952 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3953
3954 /// Estimates the number of scalar instructions in the tree.
3955 unsigned getNumScalarInsts() const;
3956
3957 /// Estimates the number of vector instructions (including buildvectors,
3958 /// shuffles, and extracts) that the tree will produce.
3959 unsigned getNumVectorInsts() const;
3960
3961 /// Return information about the vector formed for the specified index
3962 /// of a vector of (the same) instruction.
3965
3966 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3967 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3968 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
     // Non-const overload: forwards to the const overload and casts the
     // constness of the result away.
3969 return const_cast<TreeEntry *>(
3970 getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
3971 }
3972
3973 /// Gets the root instruction for the given node. If the node is a strided
3974 /// load/store node with the reverse order, the root instruction is the last
3975 /// one.
3976 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3977
3978 /// \returns Cast context for the given graph node.
3980 getCastContextHint(const TreeEntry &TE) const;
3981
3982 /// \returns the scale of the given tree entry to the loop iteration.
3983 /// \p Scalar is the scalar value from the entry, if using the parent for the
3984 /// external use.
3985 /// \p U is the user of the vectorized value from the entry, if using the
3986 /// parent for the external use.
3987 uint64_t getScaleToLoopIterations(const TreeEntry &TE,
3988 Value *Scalar = nullptr,
3989 Instruction *U = nullptr);
3990
3991 /// \returns the product of trip counts of the loop \p L and all of its
3992 /// enclosing loops. Unlike the state kept by getScaleToLoopIterations(),
3993 /// this helper depends only on the loop structure and is independent of
3994 /// per-entry operand invariance. Returns 1 when loop-aware cost modeling
3995 /// is disabled or \p L is null.
3996 uint64_t getLoopNestScale(const Loop *L);
3997
3998 /// \returns a refined execution scale for a gather/buildvector tree entry
3999 /// \p TE. The scale is computed as the average of per-lane execution
4000 /// scales: each lane's scale is the loop-nest scale of the loop that
4001 /// contains the lane's defining instruction (or 1 if the lane is a
4002 /// constant / loop-invariant non-instruction value). This models the
4003 /// LICM hoisting that optimizeGatherSequence() performs after vectorization
4004 /// for inserts with loop-invariant operands. Falls back to the whole-entry
4005 /// scale when per-lane information is unavailable or the feature is off.
4006 uint64_t getGatherNodeEffectiveScale(const TreeEntry &TE,
4007 Instruction *U = nullptr);
4008
4009 /// Get the loop nest for the given loop \p L.
4010 ArrayRef<const Loop *> getLoopNest(const Loop *L);
4011
4012 /// \returns the cost of the vectorizable entry.
4013 InstructionCost getEntryCost(const TreeEntry *E,
4014 ArrayRef<Value *> VectorizedVals,
4015 SmallPtrSetImpl<Value *> &CheckedExtracts);
4016
4017 /// Estimates spill/reload cost from vector register pressure for \p E at the
4018 /// point of emitting its vector result type \p FinalVecTy. \p ScalarTy is the
4019 /// scalar/slot type used to widen into \p VecTy/\p FinalVecTy and may itself
4020 /// be a FixedVectorType in ReVec mode or an adjusted type due to MinBWs.
4021 InstructionCost getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
4022 Type *VecTy, Type *FinalVecTy,
4024
4025 /// This is the recursive part of buildTree.
4026 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
4027 unsigned InterleaveFactor = 0);
4028
4029 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
4030 /// be vectorized to use the original vector (or aggregate "bitcast" to a
4031 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
4032 /// returns false, setting \p CurrentOrder to either an empty vector or a
4033 /// non-identity permutation that allows to reuse extract instructions.
4034 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
4035 /// extract order.
4036 bool canReuseExtract(ArrayRef<Value *> VL,
4037 SmallVectorImpl<unsigned> &CurrentOrder,
4038 bool ResizeAllowed = false) const;
4039
4040 /// Vectorize a single entry in the tree.
4041 Value *vectorizeTree(TreeEntry *E);
4042
4043 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
4044 /// \p E.
4045 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
4046
4047 /// Create a new vector from a list of scalar values. Produces a sequence
4048 /// which exploits values reused across lanes, and arranges the inserts
4049 /// for ease of later optimization.
4050 template <typename BVTy, typename ResTy, typename... Args>
4051 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
4052
4053 /// Create a new vector from a list of scalar values. Produces a sequence
4054 /// which exploits values reused across lanes, and arranges the inserts
4055 /// for ease of later optimization.
4056 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
4057
4058 /// Returns the instruction in the bundle, which can be used as a base point
4059 /// for scheduling. Usually it is the last instruction in the bundle, except
4060 /// for the case when all operands are external (in this case, it is the first
4061 /// instruction in the list).
4062 Instruction &getLastInstructionInBundle(const TreeEntry *E);
4063
4064 /// Tries to find extractelement instructions with constant indices from fixed
4065 /// vector type and gather such instructions into a bunch, which highly likely
4066 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
4067 /// was successful, the matched scalars are replaced by poison values in \p VL
4068 /// for future analysis.
4069 std::optional<TargetTransformInfo::ShuffleKind>
4070 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
4071 SmallVectorImpl<int> &Mask) const;
4072
4073 /// Tries to find extractelement instructions with constant indices from fixed
4074 /// vector type and gather such instructions into a bunch, which highly likely
4075 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
4076 /// was successful, the matched scalars are replaced by poison values in \p VL
4077 /// for future analysis.
4079 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
4081 unsigned NumParts) const;
4082
4083 /// Checks if the gathered \p VL can be represented as a single register
4084 /// shuffle(s) of previous tree entries.
4085 /// \param TE Tree entry checked for permutation.
4086 /// \param VL List of scalars (a subset of the TE scalar), checked for
4087 /// permutations. Must form single-register vector.
4088 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
4089 /// commands to build the mask using the original vector value, without
4090 /// relying on the potential reordering.
4091 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
4092 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
4093 std::optional<TargetTransformInfo::ShuffleKind>
4094 isGatherShuffledSingleRegisterEntry(
4095 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
4096 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder,
4097 unsigned SliceSize);
4098
4099 /// Checks if the gathered \p VL can be represented as multi-register
4100 /// shuffle(s) of previous tree entries.
4101 /// \param TE Tree entry checked for permutation.
4102 /// \param VL List of scalars (a subset of the TE scalar), checked for
4103 /// permutations.
4104 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
4105 /// commands to build the mask using the original vector value, without
4106 /// relying on the potential reordering.
4107 /// \returns per-register series of ShuffleKind, if gathered values can be
4108 /// represented as shuffles of previous tree entries. \p Mask is filled with
4109 /// the shuffle mask (also on per-register base).
4111 isGatherShuffledEntry(
4112 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
4114 unsigned NumParts, bool ForOrder = false);
4115
4116 /// \returns the cost of gathering (inserting) the values in \p VL into a
4117 /// vector.
4118 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
4119 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
4120 Type *ScalarTy) const;
4121
4122 /// Set the Builder insert point to one after the last instruction in
4123 /// the bundle
4124 void setInsertPointAfterBundle(const TreeEntry *E);
4125
4126 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
4127 /// specified, the starting vector value is poison.
4128 Value *
4129 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
4130 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
4131
4132 /// \returns whether the VectorizableTree is fully vectorizable and will
4133 /// be beneficial even if the tree height is tiny.
4134 bool isFullyVectorizableTinyTree(bool ForReduction) const;
4135
4136 /// Run through the list of all gathered loads in the graph and try to find
4137 /// vector loads/masked gathers instead of regular gathers. Later these loads
4138 /// are reshuffled to build final gathered nodes.
4139 void tryToVectorizeGatheredLoads(
4140 const SmallMapVector<
4141 std::tuple<BasicBlock *, Value *, Type *>,
4142 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
4143 &GatheredLoads);
4144
4145 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
4146 /// users of \p TE and collects the stores. It returns the map from the store
4147 /// pointers to the collected stores.
4149 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
4150
4151 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
4152 /// stores in \p StoresVec can form a vector instruction. If so it returns
4153 /// true and populates \p ReorderIndices with the shuffle indices of the
4154 /// stores when compared to the sorted vector.
4155 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
4156 OrdersType &ReorderIndices) const;
4157
4158 /// Iterates through the users of \p TE, looking for scalar stores that can be
4159 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
4160 /// their order and builds an order index vector for each store bundle. It
4161 /// returns all these order vectors found.
4162 /// We run this after the tree has formed, otherwise we may come across user
4163 /// instructions that are not yet in the tree.
4165 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
4166
4167 /// Tries to reorder the gathering node for better vectorization
4168 /// opportunities.
4169 void reorderGatherNode(TreeEntry &TE);
4170
4171 /// Checks if the tree represents disjoint or reduction of shl(zext, (0, 8,
4172 /// .., 56))-like pattern.
4173 /// If the int shifts are unique and strided, but not ordered, sets \p Order.
4174 /// If the node can be represented as a bitcast + bswap, sets \p IsBSwap.
4175 /// If the root nodes are loads, sets \p ForLoads to true.
4176 bool matchesShlZExt(const TreeEntry &TE, OrdersType &Order, bool &IsBSwap,
4177 bool &ForLoads) const;
4178
4179 /// Checks if the \p SelectTE matches zext+selects, which can be inversed for
4180 /// better codegen in case like zext (icmp ne), select (icmp eq), ....
4181 bool matchesInversedZExtSelect(
4182 const TreeEntry &SelectTE,
4183 SmallVectorImpl<unsigned> &InversedCmpsIndices) const;
4184
4185 /// Checks if the tree is an `or` reduction of bit selects, like select %cmp,
4186 /// <1, 2, 4, 8, ..>, zeroinitializer, which can be reduced to just a bitcast
4187 /// of %cmp to int.
4188 bool matchesSelectOfBits(const TreeEntry &SelectTE) const;
4189
4190 class TreeEntry {
4191 public:
4192 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
4193 TreeEntry(VecTreeTy &Container) : Container(Container) {}
4194
4195 /// \returns Common mask for reorder indices and reused scalars.
4196 SmallVector<int> getCommonMask() const {
4197 if (State == TreeEntry::SplitVectorize)
4198 return {};
4199 SmallVector<int> Mask;
4200 inversePermutation(ReorderIndices, Mask);
4201 ::addMask(Mask, ReuseShuffleIndices);
4202 return Mask;
4203 }
4204
4205 /// \returns The mask for split nodes.
4206 SmallVector<int> getSplitMask() const {
4207 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
4208 "Expected only split vectorize node.");
4209 unsigned CommonVF = std::max<unsigned>(
4210 CombinedEntriesWithIndices.back().second,
4211 Scalars.size() - CombinedEntriesWithIndices.back().second);
4212 const unsigned Scale = getNumElements(Scalars.front()->getType());
4213 CommonVF *= Scale;
4214 SmallVector<int> Mask(getVectorFactor() * Scale, PoisonMaskElem);
4215 for (auto [Idx, I] : enumerate(ReorderIndices)) {
4216 for (unsigned K : seq<unsigned>(Scale)) {
4217 Mask[Scale * I + K] =
4218 Scale * Idx + K +
4219 (Idx >= CombinedEntriesWithIndices.back().second
4220 ? CommonVF - CombinedEntriesWithIndices.back().second * Scale
4221 : 0);
4222 }
4223 }
4224 return Mask;
4225 }
4226
4227 /// Updates (reorders) SplitVectorize node according to the given mask \p
4228 /// Mask and order \p MaskOrder.
4229 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
4230 ArrayRef<int> MaskOrder);
4231
4232 /// \returns true if the scalars in VL are equal to this entry.
4233 bool isSame(ArrayRef<Value *> VL) const {
4234 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
4235 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
4236 return std::equal(VL.begin(), VL.end(), Scalars.begin());
4237 return VL.size() == Mask.size() &&
4238 std::equal(VL.begin(), VL.end(), Mask.begin(),
4239 [Scalars](Value *V, int Idx) {
4240 return (isa<UndefValue>(V) &&
4241 Idx == PoisonMaskElem) ||
4242 (Idx != PoisonMaskElem && V == Scalars[Idx]);
4243 });
4244 };
4245 if (!ReorderIndices.empty()) {
4246 // TODO: implement matching if the nodes are just reordered, still can
4247 // treat the vector as the same if the list of scalars matches VL
4248 // directly, without reordering.
4249 SmallVector<int> Mask;
4250 inversePermutation(ReorderIndices, Mask);
4251 if (VL.size() == Scalars.size())
4252 return IsSame(Scalars, Mask);
4253 if (VL.size() == ReuseShuffleIndices.size()) {
4254 ::addMask(Mask, ReuseShuffleIndices);
4255 return IsSame(Scalars, Mask);
4256 }
4257 return false;
4258 }
4259 return IsSame(Scalars, ReuseShuffleIndices);
4260 }
4261
4262 /// \returns true if current entry has same operands as \p TE.
4263 bool hasEqualOperands(const TreeEntry &TE) const {
4264 if (TE.getNumOperands() != getNumOperands())
4265 return false;
4266 SmallBitVector Used(getNumOperands());
4267 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
4268 unsigned PrevCount = Used.count();
4269 for (unsigned K = 0; K < E; ++K) {
4270 if (Used.test(K))
4271 continue;
4272 if (getOperand(K) == TE.getOperand(I)) {
4273 Used.set(K);
4274 break;
4275 }
4276 }
4277 // Check if we actually found the matching operand.
4278 if (PrevCount == Used.count())
4279 return false;
4280 }
4281 return true;
4282 }
4283
4284 /// \return Final vectorization factor for the node. Defined by the total
4285 /// number of vectorized scalars, including those, used several times in the
4286 /// entry and counted in the \a ReuseShuffleIndices, if any.
4287 unsigned getVectorFactor() const {
4288 if (!ReuseShuffleIndices.empty())
4289 return ReuseShuffleIndices.size();
4290 return Scalars.size();
4291 };
4292
4293 /// Checks if the current node is a gather node.
4294 bool isGather() const { return State == NeedToGather; }
4295
4296 /// A vector of scalars.
4297 ValueList Scalars;
4298
4299 /// The Scalars are vectorized into this value. It is initialized to Null.
4300 WeakTrackingVH VectorizedValue = nullptr;
4301
/// Tells whether the sequence has to be gathered or can be vectorized
/// (either with a vector instruction or with scatter/gather intrinsics for
/// store/load), and in which way.
enum EntryState {
  /// The node is regularly vectorized.
  Vectorize,
  /// Masked scatter/gather node.
  ScatterVectorize,
  /// Strided loads (and stores).
  StridedVectorize,
  /// (Masked) load with compress.
  CompressVectorize,
  /// Gather/buildvector node.
  NeedToGather,
  /// Vectorized node, combined with its user into more complex node like
  /// select/cmp to minmax, mul/add to fma, etc. Must be used for the
  /// following nodes in the pattern, not the very first one.
  CombinedVectorize,
  /// Splits the node into 2 subnodes, vectorizes them independently and then
  /// combines back.
  SplitVectorize,
};
/// The state of this entry.
EntryState State;
4319
4320 /// List of combined opcodes supported by the vectorizer.
4321 enum CombinedOpcode {
4322 NotCombinedOp = -1,
4323 MinMax = Instruction::OtherOpsEnd + 1,
4324 FMulAdd,
4325 ReducedBitcast,
4326 ReducedBitcastBSwap,
4327 ReducedBitcastLoads,
4328 ReducedBitcastBSwapLoads,
4329 ReducedCmpBitcast,
4330 };
4331 CombinedOpcode CombinedOp = NotCombinedOp;
4332
4333 /// Does this sequence require some shuffling?
4334 SmallVector<int, 4> ReuseShuffleIndices;
4335
4336 /// Does this entry require reordering?
4337 SmallVector<unsigned, 4> ReorderIndices;
4338
4339 /// Points back to the VectorizableTree.
4340 ///
4341 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4342 /// to be a pointer and needs to be able to initialize the child iterator.
4343 /// Thus we need a reference back to the container to translate the indices
4344 /// to entries.
4345 VecTreeTy &Container;
4346
4347 /// The TreeEntry index containing the user of this entry.
4348 EdgeInfo UserTreeIndex;
4349
4350 /// The index of this treeEntry in VectorizableTree.
4351 unsigned Idx = 0;
4352
4353 /// For gather/buildvector/alt opcode nodes, which are combined from
4354 /// other nodes as a series of insertvector instructions.
4355 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4356
4357 /// For ExtractValue entries that are vectorized via the struct-call path
4358 /// (checkEVsForVecCalls succeeded during tree building), stores the common
4359 /// field-index path shared by all scalars in the bundle. Empty for all
4360 /// other entry kinds.
4361 SmallVector<unsigned, 1> StructEVIndices;
4362
4363 private:
4364 /// The operands of each instruction in each lane Operands[op_index][lane].
4365 /// Note: This helps avoid the replication of the code that performs the
4366 /// reordering of operands during buildTreeRec() and vectorizeTree().
4367 SmallVector<ValueList, 2> Operands;
4368
4369 /// Copyable elements of the entry node.
4370 SmallPtrSet<const Value *, 4> CopyableElements;
4371
4372 /// MainOp and AltOp are recorded inside. S should be obtained from
4373 /// newTreeEntry.
4374 InstructionsState S = InstructionsState::invalid();
4375
4376 /// Interleaving factor for interleaved loads Vectorize nodes.
4377 unsigned InterleaveFactor = 0;
4378
4379 /// True if the node does not require scheduling.
4380 bool DoesNotNeedToSchedule = false;
4381
4382 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4383 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4384 if (Operands.size() < OpIdx + 1)
4385 Operands.resize(OpIdx + 1);
4386 assert(Operands[OpIdx].empty() && "Already resized?");
4387 assert(OpVL.size() <= Scalars.size() &&
4388 "Number of operands is greater than the number of scalars.");
4389 Operands[OpIdx].resize(OpVL.size());
4390 copy(OpVL, Operands[OpIdx].begin());
4391 }
4392
4393 /// Maps values to their lanes in the node.
4394 mutable SmallDenseMap<Value *, unsigned> ValueToLane;
4395
4396 public:
4397 /// Returns interleave factor for interleave nodes.
4398 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4399 /// Sets interleaving factor for the interleaving nodes.
4400 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4401
4402 /// Marks the node as one that does not require scheduling.
4403 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4404 /// Returns true if the node is marked as one that does not require
4405 /// scheduling.
4406 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4407
4408 /// Set this bundle's operands from \p Operands.
4409 void setOperands(ArrayRef<ValueList> Operands) {
4410 for (unsigned I : seq<unsigned>(Operands.size()))
4411 setOperand(I, Operands[I]);
4412 }
4413
4414 /// Reorders operands of the node to the given mask \p Mask.
4415 void reorderOperands(ArrayRef<int> Mask) {
4416 for (ValueList &Operand : Operands)
4417 reorderScalars(Operand, Mask);
4418 }
4419
4420 /// \returns the \p OpIdx operand of this TreeEntry.
4421 ValueList &getOperand(unsigned OpIdx) {
4422 assert(OpIdx < Operands.size() && "Off bounds");
4423 return Operands[OpIdx];
4424 }
4425
4426 /// \returns the \p OpIdx operand of this TreeEntry.
4427 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4428 assert(OpIdx < Operands.size() && "Off bounds");
4429 return Operands[OpIdx];
4430 }
4431
4432 /// \returns the number of operands.
4433 unsigned getNumOperands() const { return Operands.size(); }
4434
4435 /// \return the single \p OpIdx operand.
4436 Value *getSingleOperand(unsigned OpIdx) const {
4437 assert(OpIdx < Operands.size() && "Off bounds");
4438 assert(!Operands[OpIdx].empty() && "No operand available");
4439 return Operands[OpIdx][0];
4440 }
4441
4442 /// Some of the instructions in the list have alternate opcodes.
4443 bool isAltShuffle() const { return S.isAltShuffle(); }
4444
4445 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4446 return S.getMatchingMainOpOrAltOp(I);
4447 }
4448
4449 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4450 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
4451 /// \p OpValue.
4452 Value *isOneOf(Value *Op) const {
4453 auto *I = dyn_cast<Instruction>(Op);
4454 if (I && getMatchingMainOpOrAltOp(I))
4455 return Op;
4456 return S.getMainOp();
4457 }
4458
4459 void setOperations(const InstructionsState &S) {
4460 assert(S && "InstructionsState is invalid.");
4461 this->S = S;
4462 }
4463
4464 Instruction *getMainOp() const { return S.getMainOp(); }
4465
4466 Instruction *getAltOp() const { return S.getAltOp(); }
4467
4468 /// The main/alternate opcodes for the list of instructions.
4469 unsigned getOpcode() const { return S.getOpcode(); }
4470
4471 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4472
4473 bool hasState() const { return S.valid(); }
4474
4475 /// Add \p V to the list of copyable elements.
4476 void addCopyableElement(Value *V) {
4477 assert(S.isCopyableElement(V) && "Not a copyable element.");
4478 CopyableElements.insert(V);
4479 }
4480
4481 /// Returns true if \p V is a copyable element.
4482 bool isCopyableElement(Value *V) const {
4483 return CopyableElements.contains(V);
4484 }
4485
4486 /// Checks if the value \p V is a transformed instruction, compatible either
4487 /// with main or alternate ops.
4488 bool isExpandedBinOp(Value *V) const {
4489 assert(hasState() && "InstructionsState is invalid.");
4490 if (isCopyableElement(V))
4491 return false;
4492 return S.isExpandedBinOp(V);
4493 }
4494
4495 /// Checks if the operand at index \p Idx of instruction \p I is an expanded
4496 /// operand.
4497 bool isExpandedOperand(Instruction *I, unsigned Idx) const {
4498 assert(hasState() && "InstructionsState is invalid.");
4499 if (isCopyableElement(I))
4500 return false;
4501 if (!isExpandedBinOp(I))
4502 return false;
4503 return S.isExpandedOperand(I, Idx);
4504 }
4505
4506 /// Returns true if any scalar in the list is a copyable element.
4507 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4508
4509 /// Returns the state of the operations.
4510 const InstructionsState &getOperations() const { return S; }
4511
4512 /// When ReuseReorderShuffleIndices is empty it just returns position of \p
4513 /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
4514 unsigned findLaneForValue(Value *V) const {
4515 auto Res = ValueToLane.try_emplace(V, getVectorFactor());
4516 if (!Res.second)
4517 return Res.first->second;
4518 unsigned &FoundLane = Res.first->getSecond();
4519 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
4520 std::advance(It, 1)) {
4521 if (*It != V)
4522 continue;
4523 FoundLane = std::distance(Scalars.begin(), It);
4524 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4525 if (!ReorderIndices.empty())
4526 FoundLane = ReorderIndices[FoundLane];
4527 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4528 if (ReuseShuffleIndices.empty())
4529 break;
4530 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
4531 RIt != ReuseShuffleIndices.end()) {
4532 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4533 break;
4534 }
4535 }
4536 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4537 return FoundLane;
4538 }
4539
4540 /// Build a shuffle mask for graph entry which represents a merge of main
4541 /// and alternate operations.
4542 void
4543 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4544 SmallVectorImpl<int> &Mask,
4545 SmallVectorImpl<Value *> *OpScalars = nullptr,
4546 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4547
4548 /// Return true if this is a non-power-of-2 node.
4549 bool isNonPowOf2Vec() const {
4550 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
4551 return IsNonPowerOf2;
4552 }
4553
4554 Value *getOrdered(unsigned Idx) const {
4555 if (ReorderIndices.empty())
4556 return Scalars[Idx];
4557 SmallVector<int> Mask;
4558 inversePermutation(ReorderIndices, Mask);
4559 return Scalars[Mask[Idx]];
4560 }
4561
4562#ifndef NDEBUG
4563 /// Debug printer.
4564 LLVM_DUMP_METHOD void dump() const {
4565 dbgs() << Idx << ".\n";
4566 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4567 dbgs() << "Operand " << OpI << ":\n";
4568 for (const Value *V : Operands[OpI])
4569 dbgs().indent(2) << *V << "\n";
4570 }
4571 dbgs() << "Scalars: \n";
4572 for (Value *V : Scalars) {
4573 dbgs().indent(2) << *V
4574 << ((S && S.isExpandedBinOp(V)) ? " [[Expanded]]\n"
4575 : "\n");
4576 }
4577 dbgs() << "State: ";
4578 if (S && hasCopyableElements())
4579 dbgs() << "[[Copyable]] ";
4580 switch (State) {
4581 case Vectorize:
4582 if (InterleaveFactor > 0) {
4583 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4584 << "\n";
4585 } else {
4586 dbgs() << "Vectorize\n";
4587 }
4588 break;
4589 case ScatterVectorize:
4590 dbgs() << "ScatterVectorize\n";
4591 break;
4592 case StridedVectorize:
4593 dbgs() << "StridedVectorize\n";
4594 break;
4595 case CompressVectorize:
4596 dbgs() << "CompressVectorize\n";
4597 break;
4598 case NeedToGather:
4599 dbgs() << "NeedToGather\n";
4600 break;
4601 case CombinedVectorize:
4602 dbgs() << "CombinedVectorize\n";
4603 break;
4604 case SplitVectorize:
4605 dbgs() << "SplitVectorize\n";
4606 break;
4607 }
4608 if (S) {
4609 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4610 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4611 } else {
4612 dbgs() << "MainOp: NULL\n";
4613 dbgs() << "AltOp: NULL\n";
4614 }
4615 dbgs() << "VectorizedValue: ";
4616 if (VectorizedValue)
4617 dbgs() << *VectorizedValue << "\n";
4618 else
4619 dbgs() << "NULL\n";
4620 dbgs() << "ReuseShuffleIndices: ";
4621 if (ReuseShuffleIndices.empty())
4622 dbgs() << "Empty";
4623 else
4624 for (int ReuseIdx : ReuseShuffleIndices)
4625 dbgs() << ReuseIdx << ", ";
4626 dbgs() << "\n";
4627 dbgs() << "ReorderIndices: ";
4628 for (unsigned ReorderIdx : ReorderIndices)
4629 dbgs() << ReorderIdx << ", ";
4630 dbgs() << "\n";
4631 dbgs() << "UserTreeIndex: ";
4632 if (UserTreeIndex)
4633 dbgs() << UserTreeIndex;
4634 else
4635 dbgs() << "<invalid>";
4636 dbgs() << "\n";
4637 if (!StructEVIndices.empty()) {
4638 dbgs() << "StructEVIndices: ";
4639 interleaveComma(StructEVIndices, dbgs());
4640 dbgs() << "\n";
4641 }
4642 if (!CombinedEntriesWithIndices.empty()) {
4643 dbgs() << "Combined entries: ";
4644 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4645 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4646 });
4647 dbgs() << "\n";
4648 }
4649 }
4650#endif
4651 };
4652
4653#ifndef NDEBUG
4654 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4655 InstructionCost VecCost, InstructionCost ScalarCost,
4656 StringRef Banner) const {
4657 dbgs() << "SLP: " << Banner << ":\n";
4658 E->dump();
4659 dbgs() << "SLP: Costs:\n";
4660 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4661 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4662 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4663 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4664 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4665 }
4666#endif
4667
4668 /// Create a new gather TreeEntry
4669 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4670 const InstructionsState &S,
4671 const EdgeInfo &UserTreeIdx,
4672 ArrayRef<int> ReuseShuffleIndices = {}) {
4673 auto Invalid = ScheduleBundle::invalid();
4674 return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4675 }
4676
4677 /// Create a new VectorizableTree entry.
4678 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4679 const InstructionsState &S,
4680 const EdgeInfo &UserTreeIdx,
4681 ArrayRef<int> ReuseShuffleIndices = {},
4682 ArrayRef<unsigned> ReorderIndices = {},
4683 unsigned InterleaveFactor = 0) {
4684 TreeEntry::EntryState EntryState =
4685 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4686 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4687 ReuseShuffleIndices, ReorderIndices);
4688 if (E && InterleaveFactor > 0)
4689 E->setInterleave(InterleaveFactor);
4690 return E;
4691 }
4692
4693 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4694 TreeEntry::EntryState EntryState,
4695 ScheduleBundle &Bundle, const InstructionsState &S,
4696 const EdgeInfo &UserTreeIdx,
4697 ArrayRef<int> ReuseShuffleIndices = {},
4698 ArrayRef<unsigned> ReorderIndices = {}) {
4699 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4700 EntryState == TreeEntry::SplitVectorize)) ||
4701 (Bundle && EntryState != TreeEntry::NeedToGather &&
4702 EntryState != TreeEntry::SplitVectorize)) &&
4703 "Need to vectorize gather entry?");
4704 // Gathered loads still gathered? Do not create entry, use the original one.
4705 if (GatheredLoadsEntriesFirst.has_value() &&
4706 EntryState == TreeEntry::NeedToGather && S &&
4707 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4708 !UserTreeIdx.UserTE)
4709 return nullptr;
4710 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4711 TreeEntry *Last = VectorizableTree.back().get();
4712 Last->Idx = VectorizableTree.size() - 1;
4713 Last->State = EntryState;
4714 if (UserTreeIdx.UserTE)
4715 OperandsToTreeEntry.try_emplace(
4716 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4717 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4718 ReuseShuffleIndices.end());
4719 if (ReorderIndices.empty()) {
4720 Last->Scalars.assign(VL.begin(), VL.end());
4721 if (S)
4722 Last->setOperations(S);
4723 } else {
4724 // Reorder scalars and build final mask.
4725 Last->Scalars.assign(VL.size(), nullptr);
4726 transform(ReorderIndices, Last->Scalars.begin(),
4727 [VL](unsigned Idx) -> Value * {
4728 if (Idx >= VL.size())
4729 return UndefValue::get(VL.front()->getType());
4730 return VL[Idx];
4731 });
4732 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
4733 if (S)
4734 Last->setOperations(S);
4735 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
4736 }
4737 if (EntryState == TreeEntry::SplitVectorize) {
4738 assert(S && "Split nodes must have operations.");
4739 Last->setOperations(S);
4740 SmallPtrSet<Value *, 4> Processed;
4741 for (Value *V : VL) {
4742 auto *I = dyn_cast<Instruction>(V);
4743 if (!I)
4744 continue;
4745 auto It = ScalarsInSplitNodes.find(V);
4746 if (It == ScalarsInSplitNodes.end()) {
4747 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
4748 (void)Processed.insert(V);
4749 } else if (Processed.insert(V).second) {
4750 assert(!is_contained(It->getSecond(), Last) &&
4751 "Value already associated with the node.");
4752 It->getSecond().push_back(Last);
4753 }
4754 }
4755 } else if (!Last->isGather()) {
4756 if (isa<PHINode>(S.getMainOp()) ||
4757 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4758 (!S.areInstructionsWithCopyableElements() &&
4759 doesNotNeedToSchedule(VL)) ||
4760 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
4761 Last->setDoesNotNeedToSchedule();
4762 SmallPtrSet<Value *, 4> Processed;
4763 for (Value *V : VL) {
4764 if (isa<PoisonValue>(V))
4765 continue;
4766 if (S.isCopyableElement(V)) {
4767 Last->addCopyableElement(V);
4768 continue;
4769 }
4770 auto It = ScalarToTreeEntries.find(V);
4771 if (It == ScalarToTreeEntries.end()) {
4772 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
4773 (void)Processed.insert(V);
4774 } else if (Processed.insert(V).second) {
4775 assert(!is_contained(It->getSecond(), Last) &&
4776 "Value already associated with the node.");
4777 It->getSecond().push_back(Last);
4778 }
4779 }
4780 // Update the scheduler bundle to point to this TreeEntry.
4781 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4782 "Bundle and VL out of sync");
4783 if (!Bundle.getBundle().empty()) {
4784#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4785 auto *BundleMember = Bundle.getBundle().begin();
4786 SmallPtrSet<Value *, 4> Processed;
4787 for (Value *V : VL) {
4788 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4789 continue;
4790 ++BundleMember;
4791 }
4792 assert(BundleMember == Bundle.getBundle().end() &&
4793 "Bundle and VL out of sync");
4794#endif
4795 Bundle.setTreeEntry(Last);
4796 }
4797 } else {
4798 // Build a map for gathered scalars to the nodes where they are used.
4799 bool AllConstsOrCasts = true;
4800 for (Value *V : VL) {
4801 if (S && S.areInstructionsWithCopyableElements() &&
4802 S.isCopyableElement(V))
4803 Last->addCopyableElement(V);
4804 if (!isConstant(V)) {
4805 auto *I = dyn_cast<CastInst>(V);
4806 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4807 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4808 !UserTreeIdx.UserTE->isGather())
4809 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
4810 }
4811 }
4812 if (AllConstsOrCasts)
4813 CastMaxMinBWSizes =
4814 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4815 MustGather.insert_range(VL);
4816 }
4817
4818 if (UserTreeIdx.UserTE)
4819 Last->UserTreeIndex = UserTreeIdx;
4820 return Last;
4821 }
4822
4823 /// -- Vectorization State --
4824 /// Holds all of the tree entries.
4825 TreeEntry::VecTreeTy VectorizableTree;
4826
4827#ifndef NDEBUG
4828 /// Debug printer.
4829 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4830 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4831 VectorizableTree[Id]->dump();
4832 if (TransformedToGatherNodes.contains(VectorizableTree[Id].get()))
4833 dbgs() << "[[TRANSFORMED TO GATHER]]";
4834 else if (DeletedNodes.contains(VectorizableTree[Id].get()))
4835 dbgs() << "[[DELETED NODE]]";
4836 dbgs() << "\n";
4837 }
4838 }
4839#endif
4840
4841 /// Get list of vector entries, associated with the value \p V.
4842 ArrayRef<TreeEntry *> getTreeEntries(const Value *V) const {
4843 assert(V && "V cannot be nullptr.");
4844 auto It = ScalarToTreeEntries.find(V);
4845 if (It == ScalarToTreeEntries.end())
4846 return {};
4847 return It->getSecond();
4848 }
4849
4850 /// Get list of split vector entries, associated with the value \p V.
4851 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4852 assert(V && "V cannot be nullptr.");
4853 auto It = ScalarsInSplitNodes.find(V);
4854 if (It == ScalarsInSplitNodes.end())
4855 return {};
4856 return It->getSecond();
4857 }
4858
4859 /// Returns first vector node for value \p V, matching values \p VL.
4860 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4861 bool SameVF = false) const {
4862 assert(V && "V cannot be nullptr.");
4863 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4864 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4865 return TE;
4866 return nullptr;
4867 }
4868
4869 /// Contains all the outputs of legality analysis for a list of values to
4870 /// vectorize.
4871 class ScalarsVectorizationLegality {
4872 InstructionsState S;
4873 bool IsLegal;
4874 bool TryToFindDuplicates;
4875 bool TrySplitVectorize;
4876
4877 public:
4878 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4879 bool TryToFindDuplicates = true,
4880 bool TrySplitVectorize = false)
4881 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4882 TrySplitVectorize(TrySplitVectorize) {
4883 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4884 "Inconsistent state");
4885 }
4886 const InstructionsState &getInstructionsState() const { return S; };
4887 bool isLegal() const { return IsLegal; }
4888 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4889 bool trySplitVectorize() const { return TrySplitVectorize; }
4890 };
4891
4892 /// Checks if the specified list of the instructions/values can be vectorized
4893 /// in general.
4894 ScalarsVectorizationLegality
4895 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4896 const EdgeInfo &UserTreeIdx) const;
4897
4898 /// Checks if the specified list of the instructions/values can be vectorized
4899 /// and fills required data before actual scheduling of the instructions.
4900 TreeEntry::EntryState getScalarsVectorizationState(
4901 const InstructionsState &S, ArrayRef<Value *> VL,
4902 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4903 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4904
4905 /// Maps a specific scalar to its tree entry(ies).
4906 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4907
4908 /// List of deleted non-profitable nodes.
4909 SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
4910
4911 /// List of nodes, transformed to gathered, with their conservative
4912 /// gather/buildvector cost estimation.
4913 SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
4914
4915 /// Maps the operand index and entry to the corresponding tree entry.
4916 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4917 OperandsToTreeEntry;
4918
4919 /// Scalars, used in split vectorize nodes.
4920 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4921
4922 /// Maps a value to the proposed vectorizable size.
4923 SmallDenseMap<Value *, unsigned> InstrElementSize;
4924
4925 /// A list of scalars that we found that we need to keep as scalars.
4926 ValueSet MustGather;
4927
4928 /// A set of first non-schedulable values.
4929 ValueSet NonScheduledFirst;
4930
4931 /// A map between the vectorized entries and the last instructions in the
4932 /// bundles. The bundles are built in use order, not in the def order of the
4933 /// instructions. So, we cannot rely directly on the last instruction in the
4934 /// bundle being the last instruction in the program order during
4935 /// vectorization process since the basic blocks are affected, need to
4936 /// pre-gather them before.
SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;

/// Keeps the mapping between the last instructions and their insertion
/// points, which is an instruction-after-the-last-instruction.
SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;

/// List of gather nodes, depending on other gather/vector nodes, which should
/// be emitted after the vector instruction emission process to correctly
/// handle order of the vector instructions and shuffles.
SetVector<const TreeEntry *> PostponedGathers;

/// Maps a value to a small ordered set of tree entries.
/// NOTE(review): judging by the names, these are the gather nodes that
/// contain the value - confirm against the code that fills the map.
using ValueToGatherNodesMap =
    DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
ValueToGatherNodesMap ValueToGatherNodes;

/// Maps a tree entry to the StridedPtrInfo recorded for it.
SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;

/// A list of the load entries (node indices), which can be vectorized using
/// strided or masked gather approach, but attempted to be represented as
/// contiguous loads.
SetVector<unsigned> LoadEntriesToVectorize;

/// True if the graph nodes transforming mode is on.
bool IsGraphTransformMode = false;

/// The index of the first gathered load entry in the VectorizeTree.
/// Empty (std::nullopt) until it is set.
std::optional<unsigned> GatheredLoadsEntriesFirst;

/// Maps compress entries to their mask data for the final codegen.
SmallDenseMap<const TreeEntry *,
              std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
    CompressEntryToData;

/// The loop nest, used to check if only a single loop nest is vectorized, not
/// multiple, to avoid side-effects from the loop-aware cost model.
SmallVector<const Loop *> CurrentLoopNest;

/// Per-depth SCEV trip counts at every loop level where the tree builder has
/// joined diverging sibling loops.
SmallVector<const SCEV *> MergedLoopBTCs;

/// Maps the loops to their loop nests.
SmallDenseMap<const Loop *, SmallVector<const Loop *>> LoopToLoopNest;

/// Per-loop cache of nest scale factors: the product of trip counts of the
/// loop and all of its ancestors. Shared by getLoopNestScale() and (via it)
/// by getScaleToLoopIterations() and getGatherNodeEffectiveScale().
SmallDenseMap<const Loop *, uint64_t> LoopNestScaleCache;
4985
/// Describes one external user in the vectorized tree: the scalar being
/// used, its user, the tree entry that contains the scalar and the lane of the
/// scalar. Note that this is not a POD type: it has a user-provided
/// constructor and keeps the tree entry by reference.
struct ExternalUser {
  /// \param S The scalar value.
  /// \param U The user of \p S.
  /// \param E The tree entry \p S is part of (stored by reference).
  /// \param L The lane of \p S.
  ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
      : Scalar(S), User(U), E(E), Lane(L) {}

  /// Which scalar in our function.
  Value *Scalar = nullptr;

  /// Which user that uses the scalar.
  llvm::User *User = nullptr;

  /// Vector node, the value is part of.
  const TreeEntry &E;

  /// Which lane does the scalar belong to.
  unsigned Lane;
};
/// A list of external users.
using UserList = SmallVector<ExternalUser, 16>;
5004
5005 /// Checks if two instructions may access the same memory.
5006 ///
5007 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
5008 /// is invariant in the calling loop.
5009 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
5010 Instruction *Inst2) {
5011 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
5012 // First check if the result is already in the cache.
5013 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
5014 auto Res = AliasCache.try_emplace(Key);
5015 if (!Res.second)
5016 return Res.first->second;
5017 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
5018 // Store the result in the cache.
5019 Res.first->getSecond() = Aliased;
5020 return Aliased;
5021 }
5022
/// Key of the alias cache: a pair of instructions.
using AliasCacheKey = std::pair<Instruction *, Instruction *>;

/// Cache for alias results.
/// TODO: consider moving this to the AliasAnalysis itself.
SmallDenseMap<AliasCacheKey, bool> AliasCache;

// Cache for pointerMayBeCaptured calls inside AA. This is preserved
// globally through SLP because we don't perform any action which
// invalidates capture results.
BatchAAResults BatchAA;

/// Temporary store for deleted instructions. Instructions will be deleted
/// eventually when the BoUpSLP is destructed. The deferral is required to
/// ensure that there are no incorrect collisions in the AliasCache, which
/// can happen if a new instruction is allocated at the same address as a
/// previously deleted instruction.
DenseSet<Instruction *> DeletedInstructions;

/// Set of the instructions that have already been analyzed for reductions.
SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;

/// Set of hashes for the list of reduction values already being analyzed.
DenseSet<size_t> AnalyzedReductionVals;

/// Values that have already been analyzed for minimal bitwidth and found to
/// be non-profitable.
DenseSet<Value *> AnalyzedMinBWVals;

/// A list of values that need to be extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User). External User
/// can be nullptr, it means that this Internal Scalar will be used later,
/// after vectorization.
UserList ExternalUses;

/// A list of GEPs which can be replaced by scalar GEPs instead of
/// extractelement instructions.
SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;

/// A list of scalars to be extracted without a specific user because of too
/// many uses.
SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;

/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;

/// Holds all of the instructions that we gathered, shuffle instructions and
/// extractelements.
SetVector<Instruction *> GatherShuffleExtractSeq;

/// A list of blocks that we are going to CSE.
DenseSet<BasicBlock *> CSEBlocks;

/// List of hashes of vectors of loads, which are known to be non
/// vectorizable.
DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
5077
5078 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
5079 /// or ScheduleBundle. ScheduleData used to gather dependecies for a single
5080 /// instructions, while ScheduleBundle represents a batch of instructions,
5081 /// going to be groupped together. ScheduleCopyableData models extra user for
5082 /// "copyable" instructions.
5083 class ScheduleEntity {
5084 friend class ScheduleBundle;
5085 friend class ScheduleData;
5086 friend class ScheduleCopyableData;
5087
5088 protected:
5089 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
5090 Kind getKind() const { return K; }
5091 ScheduleEntity(Kind K) : K(K) {}
5092
5093 private:
5094 /// Used for getting a "good" final ordering of instructions.
5095 int SchedulingPriority = 0;
5096 /// True if this instruction (or bundle) is scheduled (or considered as
5097 /// scheduled in the dry-run).
5098 bool IsScheduled = false;
5099 /// The kind of the ScheduleEntity.
5100 const Kind K = Kind::ScheduleData;
5101
5102 public:
5103 ScheduleEntity() = delete;
5104 /// Gets/sets the scheduling priority.
5105 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
5106 int getSchedulingPriority() const { return SchedulingPriority; }
5107 bool isReady() const {
5108 if (const auto *SD = dyn_cast<ScheduleData>(this))
5109 return SD->isReady();
5110 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
5111 return CD->isReady();
5112 return cast<ScheduleBundle>(this)->isReady();
5113 }
5114 /// Returns true if the dependency information has been calculated.
5115 /// Note that depenendency validity can vary between instructions within
5116 /// a single bundle.
5117 bool hasValidDependencies() const {
5118 if (const auto *SD = dyn_cast<ScheduleData>(this))
5119 return SD->hasValidDependencies();
5120 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
5121 return CD->hasValidDependencies();
5122 return cast<ScheduleBundle>(this)->hasValidDependencies();
5123 }
5124 /// Gets the number of unscheduled dependencies.
5125 int getUnscheduledDeps() const {
5126 if (const auto *SD = dyn_cast<ScheduleData>(this))
5127 return SD->getUnscheduledDeps();
5128 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
5129 return CD->getUnscheduledDeps();
5130 return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
5131 }
5132 /// Increments the number of unscheduled dependencies.
5133 int incrementUnscheduledDeps(int Incr) {
5134 if (auto *SD = dyn_cast<ScheduleData>(this))
5135 return SD->incrementUnscheduledDeps(Incr);
5136 return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
5137 }
5138 /// Gets the number of dependencies.
5139 int getDependencies() const {
5140 if (const auto *SD = dyn_cast<ScheduleData>(this))
5141 return SD->getDependencies();
5142 return cast<ScheduleCopyableData>(this)->getDependencies();
5143 }
5144 /// Gets the instruction.
5145 Instruction *getInst() const {
5146 if (const auto *SD = dyn_cast<ScheduleData>(this))
5147 return SD->getInst();
5148 return cast<ScheduleCopyableData>(this)->getInst();
5149 }
5150
5151 /// Gets/sets if the bundle is scheduled.
5152 bool isScheduled() const { return IsScheduled; }
5153 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
5154
5155 static bool classof(const ScheduleEntity *) { return true; }
5156
5157#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5158 void dump(raw_ostream &OS) const {
5159 if (const auto *SD = dyn_cast<ScheduleData>(this))
5160 return SD->dump(OS);
5161 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
5162 return CD->dump(OS);
5163 return cast<ScheduleBundle>(this)->dump(OS);
5164 }
5165
5166 LLVM_DUMP_METHOD void dump() const {
5167 dump(dbgs());
5168 dbgs() << '\n';
5169 }
5170#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5171 };
5172
5173#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5175 const BoUpSLP::ScheduleEntity &SE) {
5176 SE.dump(OS);
5177 return OS;
5178 }
5179#endif
5180
/// Contains all scheduling relevant data for an instruction.
/// A ScheduleData either represents a single instruction or a member of an
/// instruction bundle (= a group of instructions which is combined into a
/// vector instruction).
class ScheduleData final : public ScheduleEntity {
public:
  // The initial value for the dependency counters. It means that the
  // dependencies are not calculated yet.
  enum { InvalidDeps = -1 };

  ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
  /// Support for isa<>/cast<>/dyn_cast<>.
  static bool classof(const ScheduleEntity *Entity) {
    return Entity->getKind() == Kind::ScheduleData;
  }

  /// (Re)initializes the data for the instruction \p I in the scheduling
  /// region \p BlockSchedulingRegionID: drops the next load/store link, the
  /// scheduled flag and all dependency information.
  void init(int BlockSchedulingRegionID, Instruction *I) {
    NextLoadStore = nullptr;
    IsScheduled = false;
    SchedulingRegionID = BlockSchedulingRegionID;
    clearDependencies();
    Inst = I;
  }

  /// Verify basic self consistency properties
  void verify() {
    if (hasValidDependencies()) {
      assert(UnscheduledDeps <= Dependencies && "invariant");
    } else {
      assert(UnscheduledDeps == Dependencies && "invariant");
    }

    if (IsScheduled) {
      assert(hasValidDependencies() && UnscheduledDeps == 0 &&
             "unexpected scheduled state");
    }
  }

  /// Returns true if the dependency information has been calculated.
  /// Note that dependency validity can vary between instructions within
  /// a single bundle.
  bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

  /// Returns true if it is ready for scheduling, i.e. it has no more
  /// unscheduled depending instructions/bundles.
  bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

  /// Modifies the number of unscheduled dependencies for this instruction by
  /// \p Incr and returns the updated number of unscheduled dependencies of
  /// this instruction.
  int incrementUnscheduledDeps(int Incr) {
    assert(hasValidDependencies() &&
           "increment of unscheduled deps would be meaningless");
    UnscheduledDeps += Incr;
    assert(UnscheduledDeps >= 0 &&
           "Expected valid number of unscheduled deps");
    return UnscheduledDeps;
  }

  /// Sets the number of unscheduled dependencies to the number of
  /// dependencies.
  void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

  /// Clears all dependency information.
  void clearDependencies() {
    clearDirectDependencies();
    MemoryDependencies.clear();
    ControlDependencies.clear();
  }

  /// Clears all direct dependencies only, except for control and memory
  /// dependencies.
  /// Required for copyable elements to correctly handle control/memory deps
  /// and avoid extra recalculation of such deps.
  void clearDirectDependencies() {
    Dependencies = InvalidDeps;
    resetUnscheduledDeps();
    IsScheduled = false;
  }

  /// Gets the number of unscheduled dependencies.
  int getUnscheduledDeps() const { return UnscheduledDeps; }
  /// Gets the number of dependencies.
  int getDependencies() const { return Dependencies; }
  /// Initializes the number of dependencies.
  void initDependencies() { Dependencies = 0; }
  /// Increments the number of dependencies.
  void incDependencies() { Dependencies++; }

  /// Gets scheduling region ID.
  int getSchedulingRegionID() const { return SchedulingRegionID; }

  /// Gets the instruction.
  Instruction *getInst() const { return Inst; }

  /// Gets the list of memory dependencies.
  ArrayRef<ScheduleData *> getMemoryDependencies() const {
    return MemoryDependencies;
  }
  /// Adds a memory dependency.
  void addMemoryDependency(ScheduleData *Dep) {
    MemoryDependencies.push_back(Dep);
  }
  /// Gets the list of control dependencies.
  ArrayRef<ScheduleData *> getControlDependencies() const {
    return ControlDependencies;
  }
  /// Adds a control dependency.
  void addControlDependency(ScheduleData *Dep) {
    ControlDependencies.push_back(Dep);
  }
  /// Gets/sets the next load/store instruction in the block.
  ScheduleData *getNextLoadStore() const { return NextLoadStore; }
  void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }

  /// Prints the instruction to \p OS.
  void dump(raw_ostream &OS) const { OS << *Inst; }

  /// Prints the instruction followed by a newline to the debug stream.
  LLVM_DUMP_METHOD void dump() const {
    dump(dbgs());
    dbgs() << '\n';
  }

private:
  /// The instruction this data belongs to.
  Instruction *Inst = nullptr;

  /// Single linked list of all memory instructions (e.g. load, store, call)
  /// in the block - until the end of the scheduling region.
  ScheduleData *NextLoadStore = nullptr;

  /// The dependent memory instructions.
  /// This list is derived on demand in calculateDependencies().
  SmallVector<ScheduleData *> MemoryDependencies;

  /// List of instructions which this instruction could be control dependent
  /// on. Allowing such nodes to be scheduled below this one could introduce
  /// a runtime fault which didn't exist in the original program.
  /// ex: this is a load or udiv following a readonly call which inf loops
  SmallVector<ScheduleData *> ControlDependencies;

  /// This ScheduleData is in the current scheduling region if this matches
  /// the current SchedulingRegionID of BlockScheduling.
  int SchedulingRegionID = 0;

  /// The number of dependencies. Consists of the number of users of the
  /// instruction plus the number of dependent memory instructions (if any).
  /// This value is calculated on demand.
  /// If InvalidDeps, the number of dependencies is not calculated yet.
  int Dependencies = InvalidDeps;

  /// The number of dependencies minus the number of dependencies of scheduled
  /// instructions. As soon as this is zero, the instruction/bundle gets ready
  /// for scheduling.
  /// Note that this is negative as long as Dependencies is not calculated.
  int UnscheduledDeps = InvalidDeps;
};
5335
5336#ifndef NDEBUG
5338 const BoUpSLP::ScheduleData &SD) {
5339 SD.dump(OS);
5340 return OS;
5341 }
5342#endif
5343
5344 class ScheduleBundle final : public ScheduleEntity {
5345 /// The schedule data for the instructions in the bundle.
5347 /// True if this bundle is valid.
5348 bool IsValid = true;
5349 /// The TreeEntry that this instruction corresponds to.
5350 TreeEntry *TE = nullptr;
5351 ScheduleBundle(bool IsValid)
5352 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5353
5354 public:
5355 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
5356 static bool classof(const ScheduleEntity *Entity) {
5357 return Entity->getKind() == Kind::ScheduleBundle;
5358 }
5359
5360 /// Verify basic self consistency properties
5361 void verify() const {
5362 for (const ScheduleEntity *SD : Bundle) {
5363 if (SD->hasValidDependencies()) {
5364 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5365 "invariant");
5366 } else {
5367 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5368 "invariant");
5369 }
5370
5371 if (isScheduled()) {
5372 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5373 "unexpected scheduled state");
5374 }
5375 }
5376 }
5377
5378 /// Returns the number of unscheduled dependencies in the bundle.
5379 int unscheduledDepsInBundle() const {
5380 assert(*this && "bundle must not be empty");
5381 int Sum = 0;
5382 for (const ScheduleEntity *BundleMember : Bundle) {
5383 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5384 return ScheduleData::InvalidDeps;
5385 Sum += BundleMember->getUnscheduledDeps();
5386 }
5387 return Sum;
5388 }
5389
5390 /// Returns true if the dependency information has been calculated.
5391 /// Note that depenendency validity can vary between instructions within
5392 /// a single bundle.
5393 bool hasValidDependencies() const {
5394 return all_of(Bundle, [](const ScheduleEntity *SD) {
5395 return SD->hasValidDependencies();
5396 });
5397 }
5398
5399 /// Returns true if it is ready for scheduling, i.e. it has no more
5400 /// unscheduled depending instructions/bundles.
5401 bool isReady() const {
5402 assert(*this && "bundle must not be empty");
5403 return unscheduledDepsInBundle() == 0 && !isScheduled();
5404 }
5405
5406 /// Returns the bundle of scheduling data, associated with the current
5407 /// instruction.
5408 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5409 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5410 /// Adds an instruction to the bundle.
5411 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5412
5413 /// Gets/sets the associated tree entry.
5414 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5415 TreeEntry *getTreeEntry() const { return TE; }
5416
5417 static ScheduleBundle invalid() { return {false}; }
5418
5419 operator bool() const { return IsValid; }
5420
5421#ifndef NDEBUG
5422 void dump(raw_ostream &OS) const {
5423 if (!*this) {
5424 OS << "[]";
5425 return;
5426 }
5427 OS << '[';
5428 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5430 OS << "<Copyable>";
5431 OS << *SD->getInst();
5432 });
5433 OS << ']';
5434 }
5435
5436 LLVM_DUMP_METHOD void dump() const {
5437 dump(dbgs());
5438 dbgs() << '\n';
5439 }
5440#endif // NDEBUG
5441 };
5442
5443#ifndef NDEBUG
5445 const BoUpSLP::ScheduleBundle &Bundle) {
5446 Bundle.dump(OS);
5447 return OS;
5448 }
5449#endif
5450
/// Contains all scheduling relevant data for the copyable instruction.
/// It models the virtual instructions, supposed to replace the original
/// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
/// %1], where %1 = add, then the ScheduleCopyableData models virtual
/// instruction %virt = add %0, 0.
class ScheduleCopyableData final : public ScheduleEntity {
  /// The source instruction.
  Instruction *Inst = nullptr;
  /// The edge information for the instruction.
  const EdgeInfo EI;
  /// This ScheduleData is in the current scheduling region if this matches
  /// the current SchedulingRegionID of BlockScheduling.
  int SchedulingRegionID = 0;
  /// Bundle, this data is part of.
  ScheduleBundle &Bundle;

public:
  /// \param BlockSchedulingRegionID The scheduling region ID to record.
  /// \param I The source instruction.
  /// \param EI The edge information (copied).
  /// \param Bundle The bundle this data is part of (stored by reference).
  ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
                       const EdgeInfo &EI, ScheduleBundle &Bundle)
      : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
        SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
  /// Support for isa<>/cast<>/dyn_cast<>.
  static bool classof(const ScheduleEntity *Entity) {
    return Entity->getKind() == Kind::ScheduleCopyableData;
  }

  /// Verify basic self consistency properties
  void verify() {
    if (hasValidDependencies()) {
      assert(UnscheduledDeps <= Dependencies && "invariant");
    } else {
      assert(UnscheduledDeps == Dependencies && "invariant");
    }

    if (IsScheduled) {
      assert(hasValidDependencies() && UnscheduledDeps == 0 &&
             "unexpected scheduled state");
    }
  }

  /// Returns true if the dependency information has been calculated.
  /// Note that dependency validity can vary between instructions within
  /// a single bundle.
  bool hasValidDependencies() const {
    return Dependencies != ScheduleData::InvalidDeps;
  }

  /// Returns true if it is ready for scheduling, i.e. it has no more
  /// unscheduled depending instructions/bundles.
  bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

  /// Modifies the number of unscheduled dependencies for this instruction by
  /// \p Incr and returns the updated number of unscheduled dependencies of
  /// this instruction.
  int incrementUnscheduledDeps(int Incr) {
    assert(hasValidDependencies() &&
           "increment of unscheduled deps would be meaningless");
    UnscheduledDeps += Incr;
    assert(UnscheduledDeps >= 0 && "invariant");
    return UnscheduledDeps;
  }

  /// Sets the number of unscheduled dependencies to the number of
  /// dependencies.
  void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

  /// Gets the number of unscheduled dependencies.
  int getUnscheduledDeps() const { return UnscheduledDeps; }
  /// Gets the number of dependencies.
  int getDependencies() const { return Dependencies; }
  /// Initializes the number of dependencies.
  void initDependencies() { Dependencies = 0; }
  /// Increments the number of dependencies.
  void incDependencies() { Dependencies++; }

  /// Gets scheduling region ID.
  int getSchedulingRegionID() const { return SchedulingRegionID; }

  /// Gets the instruction.
  Instruction *getInst() const { return Inst; }

  /// Clears all dependency information.
  void clearDependencies() {
    Dependencies = ScheduleData::InvalidDeps;
    UnscheduledDeps = ScheduleData::InvalidDeps;
    IsScheduled = false;
  }

  /// Gets the edge information.
  const EdgeInfo &getEdgeInfo() const { return EI; }

  /// Gets the bundle.
  ScheduleBundle &getBundle() { return Bundle; }
  const ScheduleBundle &getBundle() const { return Bundle; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Prints "[Copyable]" followed by the instruction to \p OS.
  void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }

  /// Prints the data followed by a newline to the debug stream.
  LLVM_DUMP_METHOD void dump() const {
    dump(dbgs());
    dbgs() << '\n';
  }
#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

private:
  /// The number of dependencies, or ScheduleData::InvalidDeps if the
  /// dependencies are not calculated yet.
  /// NOTE(review): these nodes are expected to have only a single dependency;
  /// this is not enforced by the class itself - confirm against the callers
  /// of incDependencies().
  int Dependencies = ScheduleData::InvalidDeps;

  /// The number of dependencies minus the number of dependencies of scheduled
  /// instructions. As soon as this is zero, the instruction/bundle gets ready
  /// for scheduling.
  /// Note that this is negative as long as Dependencies is not calculated.
  int UnscheduledDeps = ScheduleData::InvalidDeps;
};
5565
#ifndef NDEBUG
/// Streams \p SD to \p OS using ScheduleCopyableData::dump().
friend inline raw_ostream &
operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
  SD.dump(OS);
  return OS;
}
#endif
5573
5574 friend struct GraphTraits<BoUpSLP *>;
5575 friend struct DOTGraphTraits<BoUpSLP *>;
5576
/// Contains all scheduling data for a basic block.
/// It does not schedule instructions which are not memory read/write
/// instructions and whose operands are either constants, or arguments, or
/// phis, or instructions from other blocks, or whose users are phis or are
/// from other blocks. The resulting vector instructions can be placed at the
/// beginning of the basic block without scheduling (if the operands do not
/// need to be scheduled) or at the end of the block (if the users are outside
/// of the block). This saves some compile time and memory used by the
/// compiler.
/// ScheduleData is assigned for each instruction in between the boundaries of
/// the tree entry, even for those which are not part of the graph. It is
/// required to correctly follow the dependencies between the instructions and
/// to schedule them correctly. The ScheduleData is not allocated for the
/// instructions which do not require scheduling, like phis, nodes with
/// extractelements/insertelements only, or nodes with instructions with
/// uses/operands outside of the block.
5593 struct BlockScheduling {
/// Creates the scheduling state for the basic block \p BB. ChunkSize is
/// initialized with the size of \p BB and ChunkPos with ChunkSize.
/// NOTE(review): ChunkPos(ChunkSize) reads ChunkSize, so ChunkSize is
/// presumably declared before ChunkPos - confirm against the member order.
BlockScheduling(BasicBlock *BB)
    : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5596
/// Resets the per-region scheduling state and starts a new scheduling
/// region: empties the bundle/copyable-data containers and the ready list,
/// resets the region boundaries, shrinks the region size limit by the size of
/// the finished region and bumps SchedulingRegionID.
void clear() {
  ScheduledBundles.clear();
  ScheduledBundlesList.clear();
  ScheduleCopyableDataMap.clear();
  ScheduleCopyableDataMapByInst.clear();
  ScheduleCopyableDataMapByInstUser.clear();
  ScheduleCopyableDataMapByUsers.clear();
  ReadyInsts.clear();
  RecalcCopyableOperandDeps.clear();
  ScheduleStart = nullptr;
  ScheduleEnd = nullptr;
  FirstLoadStoreInRegion = nullptr;
  LastLoadStoreInRegion = nullptr;
  RegionHasStackSave = false;

  // Reduce the maximum schedule region size by the size of the
  // previous scheduling run. The limit never drops below
  // MinScheduleRegionSize.
  ScheduleRegionSizeLimit -= ScheduleRegionSize;
  if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
    ScheduleRegionSizeLimit = MinScheduleRegionSize;
  ScheduleRegionSize = 0;

  // Make a new scheduling region, i.e. all existing ScheduleData is not
  // in the new region yet.
  ++SchedulingRegionID;
}
5623
5624 ScheduleData *getScheduleData(Instruction *I) {
5625 if (!I)
5626 return nullptr;
5627 if (BB != I->getParent())
5628 // Avoid lookup if can't possibly be in map.
5629 return nullptr;
5630 ScheduleData *SD = ScheduleDataMap.lookup(I);
5631 if (SD && isInSchedulingRegion(*SD))
5632 return SD;
5633 return nullptr;
5634 }
5635
/// Returns the ScheduleData of \p V if \p V is an instruction that has
/// ScheduleData in the current scheduling region, nullptr otherwise.
/// NOTE(review): \p V is passed to dyn_cast<>, which does not accept null
/// pointers - callers are presumably expected to pass a non-null value.
ScheduleData *getScheduleData(Value *V) {
  return getScheduleData(dyn_cast<Instruction>(V));
}
5639
5640 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5641 /// operand number) and value.
5642 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5643 const Value *V) const {
5644 if (ScheduleCopyableDataMap.empty())
5645 return nullptr;
5646 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5647 if (It == ScheduleCopyableDataMap.end())
5648 return nullptr;
5649 ScheduleCopyableData *SD = It->getSecond().get();
5650 if (!isInSchedulingRegion(*SD))
5651 return nullptr;
5652 return SD;
5653 }
5654
5655 /// Returns the ScheduleCopyableData for the given user \p User, operand
5656 /// number and operand \p V.
5658 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5659 const Value *V) {
5660 if (ScheduleCopyableDataMapByInstUser.empty())
5661 return {};
5662 const auto It = ScheduleCopyableDataMapByInstUser.find(
5663 std::make_pair(std::make_pair(User, OperandIdx), V));
5664 if (It == ScheduleCopyableDataMapByInstUser.end())
5665 return {};
5667 for (ScheduleCopyableData *SD : It->getSecond()) {
5668 if (isInSchedulingRegion(*SD))
5669 Res.push_back(SD);
5670 }
5671 return Res;
5672 }
5673
5674 /// Returns true if all operands of the given instruction \p User are
5675 /// replaced by copyable data.
5676 /// \param User The user instruction.
5677 /// \param Op The operand, which might be replaced by the copyable data.
5678 /// \param SLP The SLP tree.
5679 /// \param NumOps The number of operands used. If the instruction uses the
5680 /// same operand several times, check for the first use, then the second,
5681 /// etc.
5682 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5683 Instruction *Op, BoUpSLP &SLP,
5684 unsigned NumOps) const {
5685 assert(NumOps > 0 && "No operands");
5686 if (ScheduleCopyableDataMap.empty())
5687 return false;
5688 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5689 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
5690 if (Entries.empty())
5691 return false;
5692 unsigned CurNumOps = 0;
5693 for (const Use &U : User->operands()) {
5694 if (U.get() != Op)
5695 continue;
5696 ++CurNumOps;
5697 // Check all tree entries, if they have operands replaced by copyable
5698 // data.
5699 for (TreeEntry *TE : Entries) {
5700 unsigned Inc = 0;
5701 bool IsNonSchedulableWithParentPhiNode =
5702 TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
5703 TE->UserTreeIndex.UserTE->hasState() &&
5704 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5705 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5706 // Count the number of unique phi nodes, which are the parent for
5707 // parent entry, and exit, if all the unique phis are processed.
5708 if (IsNonSchedulableWithParentPhiNode) {
5709 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5710 const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
5711 for (Value *V : ParentTE->Scalars) {
5712 auto *PHI = dyn_cast<PHINode>(V);
5713 if (!PHI)
5714 continue;
5715 if (ParentsUniqueUsers.insert(PHI).second &&
5716 is_contained(PHI->incoming_values(), User))
5717 ++Inc;
5718 }
5719 } else {
5720 Inc = count(TE->Scalars, User);
5721 }
5722
5723 // Check if the user is commutative.
5724 // The commutatives are handled later, as their operands can be
5725 // reordered.
5726 // Same applies even for non-commutative cmps, because we can invert
5727 // their predicate potentially and, thus, reorder the operands.
5728 bool IsCommutativeUser =
5729 ::isCommutative(User) &&
5730 ::isCommutableOperand(User, User, U.getOperandNo());
5731 if (!IsCommutativeUser) {
5732 Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User);
5733 IsCommutativeUser =
5734 ::isCommutative(MainOp, User) &&
5735 ::isCommutableOperand(MainOp, User, U.getOperandNo());
5736 }
5737 // The commutative user with the same operands can be safely
5738 // considered as non-commutative, operands reordering does not change
5739 // the semantics.
5740 assert(
5741 (!IsCommutativeUser ||
5742 (((::isCommutative(User) &&
5743 ::isCommutableOperand(User, User, 0) &&
5744 ::isCommutableOperand(User, User, 1)) ||
5745 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
5746 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5747 User, 0) &&
5748 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5749 User, 1))))) &&
5750 "Expected commutative user with 2 first commutable operands");
5751 bool IsCommutativeWithSameOps =
5752 IsCommutativeUser && User->getOperand(0) == User->getOperand(1);
5753 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5754 !isa<CmpInst>(User)) {
5755 EdgeInfo EI(TE, U.getOperandNo());
5756 if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
5757 continue;
5758 return false;
5759 }
5760 PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5761 .first->getSecond() += Inc;
5762 }
5763 }
5764 if (PotentiallyReorderedEntriesCount.empty())
5765 return true;
5766 // Check the commutative/cmp entries.
5767 for (auto &P : PotentiallyReorderedEntriesCount) {
5768 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5769 bool IsNonSchedulableWithParentPhiNode =
5770 P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
5771 P.first->UserTreeIndex.UserTE->hasState() &&
5772 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5773 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5774 auto *It = find(P.first->Scalars, User);
5775 do {
5776 assert(It != P.first->Scalars.end() &&
5777 "User is not in the tree entry");
5778 int Lane = std::distance(P.first->Scalars.begin(), It);
5779 assert(Lane >= 0 && "Lane is not found");
5780 if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
5781 Lane = P.first->ReorderIndices[Lane];
5782 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5783 "Couldn't find extract lane");
5784 // Count the number of unique phi nodes, which are the parent for
5785 // parent entry, and exit, if all the unique phis are processed.
5786 if (IsNonSchedulableWithParentPhiNode) {
5787 const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
5788 Value *User = ParentTE->Scalars[Lane];
5789 if (!ParentsUniqueUsers.insert(User).second) {
5790 It =
5791 find(make_range(std::next(It), P.first->Scalars.end()), User);
5792 continue;
5793 }
5794 }
5795 for (unsigned OpIdx :
5797 P.first->getMainOp()))) {
5798 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5799 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5800 --P.getSecond();
5801 }
5802 // If parent node is schedulable, it will be handled correctly.
5803 It = find(make_range(std::next(It), P.first->Scalars.end()), User);
5804 } while (It != P.first->Scalars.end());
5805 }
5806 return all_of(PotentiallyReorderedEntriesCount,
5807 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5808 return P.second == NumOps - 1;
5809 });
5810 }
5811
5813 getScheduleCopyableData(const Instruction *I) const {
// Collects the ScheduleCopyableData entries recorded for \p I in
// ScheduleCopyableDataMapByInst, keeping only those that belong to the
// current scheduling region. An empty result is returned when nothing is
// registered for \p I.
// NOTE(review): the return-type line and the declaration of Res are not
// visible in this listing - confirm against the full source.
// Fast path: no copyable data has been registered at all.
5814 if (ScheduleCopyableDataMapByInst.empty())
5815 return {};
5816 const auto It = ScheduleCopyableDataMapByInst.find(I);
5817 if (It == ScheduleCopyableDataMapByInst.end())
5818 return {};
// Drop entries that are not part of the current scheduling region.
5820 for (ScheduleCopyableData *SD : It->getSecond()) {
5821 if (isInSchedulingRegion(*SD))
5822 Res.push_back(SD);
5823 }
5824 return Res;
5825 }
5826
5828 getScheduleCopyableDataUsers(const Instruction *User) const {
// Collects the ScheduleCopyableData entries recorded for \p User in
// ScheduleCopyableDataMapByUsers, keeping only those that belong to the
// current scheduling region. An empty result is returned when nothing is
// registered for \p User.
// NOTE(review): the return-type line and the declaration of Res are not
// visible in this listing - confirm against the full source.
// Fast path: the by-users map is empty.
5829 if (ScheduleCopyableDataMapByUsers.empty())
5830 return {};
5831 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5832 if (It == ScheduleCopyableDataMapByUsers.end())
5833 return {};
// Drop entries that are not part of the current scheduling region.
5835 for (ScheduleCopyableData *SD : It->getSecond()) {
5836 if (isInSchedulingRegion(*SD))
5837 Res.push_back(SD);
5838 }
5839 return Res;
5840 }
5841
5842 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5843 Instruction *I,
5844 int SchedulingRegionID,
5845 ScheduleBundle &Bundle) {
5846 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5847 ScheduleCopyableData *CD =
5848 ScheduleCopyableDataMap
5849 .try_emplace(std::make_pair(EI, I),
5850 std::make_unique<ScheduleCopyableData>(
5851 SchedulingRegionID, I, EI, Bundle))
5852 .first->getSecond()
5853 .get();
5854 ScheduleCopyableDataMapByInst[I].push_back(CD);
5855 if (EI.UserTE) {
5856 ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
5857 const auto *It = find(Op, I);
5858 assert(It != Op.end() && "Lane not set");
5859 SmallPtrSet<Instruction *, 4> Visited;
5860 do {
5861 int Lane = std::distance(Op.begin(), It);
5862 assert(Lane >= 0 && "Lane not set");
5863 if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
5864 !EI.UserTE->ReorderIndices.empty())
5865 Lane = EI.UserTE->ReorderIndices[Lane];
5866 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5867 "Couldn't find extract lane");
5868 auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
5869 if (!Visited.insert(In).second) {
5870 It = find(make_range(std::next(It), Op.end()), I);
5871 continue;
5872 }
5873 ScheduleCopyableDataMapByInstUser
5874 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5875 .first->getSecond()
5876 .push_back(CD);
5877 ScheduleCopyableDataMapByUsers.try_emplace(I)
5878 .first->getSecond()
5879 .insert(CD);
5880 // Remove extra deps for users, becoming non-immediate users of the
5881 // instruction. It may happen, if the chain of same copyable elements
5882 // appears in the tree.
5883 if (In == I) {
5884 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5885 if (ScheduleCopyableData *UserCD =
5886 getScheduleCopyableData(UserEI, In))
5887 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5888 }
5889 It = find(make_range(std::next(It), Op.end()), I);
5890 } while (It != Op.end());
5891 } else {
5892 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5893 CD);
5894 }
5895 return *CD;
5896 }
5897
5898 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5899 auto *I = dyn_cast<Instruction>(V);
5900 if (!I)
5901 return {};
5902 auto It = ScheduledBundles.find(I);
5903 if (It == ScheduledBundles.end())
5904 return {};
5905 return It->getSecond();
5906 }
5907
5908 /// Returns true if the entity is in the scheduling region.
5909 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5910 if (const auto *Data = dyn_cast<ScheduleData>(&SD))
5911 return Data->getSchedulingRegionID() == SchedulingRegionID;
5912 if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
5913 return CD->getSchedulingRegionID() == SchedulingRegionID;
5914 return all_of(cast<ScheduleBundle>(SD).getBundle(),
5915 [&](const ScheduleEntity *BundleMember) {
5916 return isInSchedulingRegion(*BundleMember);
5917 });
5918 }
5919
5920 /// Marks an instruction as scheduled and puts all dependent ready
5921 /// instructions into the ready-list.
/// \param R Queried (getTreeEntries) for the tree entries that contain a
/// scheduled instruction.
/// \param S Consulted via isExpandedBinOp() for bundles that have no tree
/// entry attached.
/// \param Data The entity to schedule: either a single ScheduleData or a
/// ScheduleBundle.
/// \param ReadyList Receives every entity/bundle whose unscheduled dependency
/// counter reaches zero as a result of this call.
/// NOTE(review): several lines of this function are not visible in this
/// listing (e.g. the second parameter of ProcessBundleMember and some
/// declarations); the added comments only describe what is visible.
5922 template <typename ReadyListType>
5923 void schedule(const BoUpSLP &R, const InstructionsState &S,
5924 const EdgeInfo &EI, ScheduleEntity *Data,
5925 ReadyListType &ReadyList) {
5926 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5928 // Handle the def-use chain dependencies.
5929
5930 // Decrement the unscheduled counter and insert to ready list if ready.
5931 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5932 if ((IsControl || Data->hasValidDependencies()) &&
5933 Data->incrementUnscheduledDeps(-1) == 0) {
5934 // There are no more unscheduled dependencies after
5935 // decrementing, so we can put the dependent instruction
5936 // into the ready list.
5937 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5939 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5940 CopyableBundle.push_back(&CD->getBundle());
5941 Bundles = CopyableBundle;
5942 } else {
5943 Bundles = getScheduleBundles(Data->getInst());
5944 }
// If the entity belongs to bundles, only whole bundles with no unscheduled
// dependencies left are inserted; the entity itself is not.
5945 if (!Bundles.empty()) {
5946 for (ScheduleBundle *Bundle : Bundles) {
5947 if (Bundle->unscheduledDepsInBundle() == 0) {
5948 assert(!Bundle->isScheduled() &&
5949 "already scheduled bundle gets ready");
5950 ReadyList.insert(Bundle);
5952 << "SLP: gets ready: " << *Bundle << "\n");
5953 }
5954 }
5955 return;
5956 }
5957 assert(!Data->isScheduled() &&
5958 "already scheduled bundle gets ready");
5960 "Expected non-copyable data");
5961 ReadyList.insert(Data);
5962 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5963 }
5964 };
5965
// Variant used for stand-alone instructions (see the else-branch below):
// decrements the copyable data found for (User, OpIdx, I) if there is any,
// otherwise the plain ScheduleData of I.
5966 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5967 Instruction *I) {
5968 if (!ScheduleCopyableDataMap.empty()) {
5970 getScheduleCopyableData(User, OpIdx, I);
5971 for (ScheduleCopyableData *CD : CopyableData)
5972 DecrUnsched(CD, /*IsControl=*/false);
5973 if (!CopyableData.empty())
5974 return;
5975 }
5976 if (ScheduleData *OpSD = getScheduleData(I))
5977 DecrUnsched(OpSD, /*IsControl=*/false);
5978 };
5979
5980 // If BundleMember is a vector bundle, its operands may have been
5981 // reordered during buildTree(). We therefore need to get its operands
5982 // through the TreeEntry.
5983 if (!Bundles.empty()) {
5984 auto *In = BundleMember->getInst();
5985 // Count uses of each instruction operand.
5986 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5987 unsigned TotalOpCount = 0;
5988 if (isa<ScheduleCopyableData>(BundleMember)) {
5989 // Copyable data is used only once (uses itself).
5990 TotalOpCount = OperandsUses[In] = 1;
5991 } else {
5992 for (const Use &U : In->operands()) {
5993 if (auto *I = dyn_cast<Instruction>(U.get())) {
5994 auto Res = OperandsUses.try_emplace(I, 0);
5995 unsigned ExtraDeps = 1;
5996 // Count all expanded operands in the binops.
5997 for (ScheduleBundle *Bundle : Bundles) {
5998 if (const TreeEntry *TE = Bundle->getTreeEntry()) {
5999 if (TE->isExpandedBinOp(In))
6000 ++ExtraDeps;
6001 } else if (S.isExpandedBinOp(In)) {
6002 ++ExtraDeps;
6003 }
6004 }
6005 Res.first->getSecond() += ExtraDeps;
6006 TotalOpCount += ExtraDeps;
6007 }
6008 }
6009 }
6010 // Decrement the unscheduled counter and insert to ready list if
6011 // ready.
// This lambda shadows the DecrUnschedForInst declared above for the rest of
// this branch. Checked holds (entity, operand index) pairs that were already
// handled, so the same pair is not decremented twice.
6012 auto DecrUnschedForInst =
6013 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
6014 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
6015 &Checked,
6016 bool IsExpandedOperand = false) {
6017 if (!ScheduleCopyableDataMap.empty()) {
6018 const EdgeInfo EI = {UserTE, OpIdx};
6019 if (ScheduleCopyableData *CD =
6020 getScheduleCopyableData(EI, I)) {
6021 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
6022 return;
6023 DecrUnsched(CD, /*IsControl=*/false);
6024 return;
6025 }
6026 }
6027 auto It = OperandsUses.find(I);
6028 assert(It != OperandsUses.end() && "Operand not found");
6029 if (It->second > 0) {
6030 if (ScheduleData *OpSD = getScheduleData(I)) {
6031 if (!IsExpandedOperand &&
6032 !Checked.insert(std::make_pair(OpSD, OpIdx)).second)
6033 return;
6034 --It->getSecond();
6035 assert(TotalOpCount > 0 && "No more operands to decrement");
6036 --TotalOpCount;
6037 DecrUnsched(OpSD, /*IsControl=*/false);
6038 } else {
6039 --It->getSecond();
6040 assert(TotalOpCount > 0 && "No more operands to decrement");
6041 --TotalOpCount;
6042 }
6043 }
6044 };
6045
6046 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
6047 for (ScheduleBundle *Bundle : Bundles) {
6048 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
6049 break;
6050 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
6051 // Need to search for the lane since the tree entry can be
6052 // reordered.
6053 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
6054 bool IsNonSchedulableWithParentPhiNode =
6055 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
6056 Bundle->getTreeEntry()->UserTreeIndex &&
6057 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
6058 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
6059 TreeEntry::SplitVectorize &&
6060 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
6061 Instruction::PHI;
// Visit every lane of the tree entry that holds In.
6062 do {
6063 int Lane =
6064 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
6065 assert(Lane >= 0 && "Lane not set");
6066 if (isa<StoreInst>(In) &&
6067 !Bundle->getTreeEntry()->ReorderIndices.empty())
6068 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
6069 assert(Lane < static_cast<int>(
6070 Bundle->getTreeEntry()->Scalars.size()) &&
6071 "Couldn't find extract lane");
6072
6073 // Since vectorization tree is being built recursively this
6074 // assertion ensures that the tree entry has all operands set
6075 // before reaching this code. Couple of exceptions known at the
6076 // moment are extracts where their second (immediate) operand is
6077 // not added. Since immediates do not affect scheduler behavior
6078 // this is considered okay.
6079 assert(
6080 In &&
6082 In->getNumOperands() ==
6083 Bundle->getTreeEntry()->getNumOperands() ||
6084 (isa<ZExtInst>(In) && Bundle->getTreeEntry()->getOpcode() ==
6085 Instruction::Select) ||
6086 Bundle->getTreeEntry()->isCopyableElement(In)) &&
6087 "Missed TreeEntry operands?");
6088
6089 // Count the number of unique phi nodes, which are the parent for
6090 // parent entry, and exit, if all the unique phis are processed.
6091 if (IsNonSchedulableWithParentPhiNode) {
6092 const TreeEntry *ParentTE =
6093 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
6094 Value *User = ParentTE->Scalars[Lane];
6095 if (!ParentsUniqueUsers.insert(User).second) {
6096 It = std::find(std::next(It),
6097 Bundle->getTreeEntry()->Scalars.end(), In);
6098 continue;
6099 }
6100 }
6101
6102 for (unsigned OpIdx :
6103 seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
6104 if (auto *I = dyn_cast<Instruction>(
6105 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
6106 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
6107 << *I << "\n");
6108 DecrUnschedForInst(
6109 I, Bundle->getTreeEntry(), OpIdx, Checked,
6110 Bundle->getTreeEntry()->isExpandedOperand(In, OpIdx));
6111 }
6112 // If parent node is schedulable, it will be handled correctly.
6113 if (Bundle->getTreeEntry()->isCopyableElement(In))
6114 break;
6115 It = std::find(std::next(It),
6116 Bundle->getTreeEntry()->Scalars.end(), In);
6117 } while (It != Bundle->getTreeEntry()->Scalars.end());
6118 }
6119 } else {
6120 // If BundleMember is a stand-alone instruction, no operand reordering
6121 // has taken place, so we directly access its operands.
6122 for (Use &U : BundleMember->getInst()->operands()) {
6123 if (auto *I = dyn_cast<Instruction>(U.get())) {
6125 << "SLP: check for readiness (def): " << *I << "\n");
6126 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
6127 }
6128 }
6129 }
6130 // Handle the memory dependencies.
// Only ScheduleData carries memory/control dependencies; other entity kinds
// are done at this point.
6131 auto *SD = dyn_cast<ScheduleData>(BundleMember);
6132 if (!SD)
6133 return;
6134 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
6135 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
6136 if (!VisitedMemory.insert(MemoryDep).second)
6137 continue;
6138 // There are no more unscheduled dependencies after decrementing,
6139 // so we can put the dependent instruction into the ready list.
6140 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
6141 << *MemoryDep << "\n");
6142 DecrUnsched(MemoryDep);
6143 }
6144 // Handle the control dependencies.
6145 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
6146 for (ScheduleData *Dep : SD->getControlDependencies()) {
6147 if (!VisitedControl.insert(Dep).second)
6148 continue;
6149 // There are no more unscheduled dependencies after decrementing,
6150 // so we can put the dependent instruction into the ready list.
6152 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
6153 DecrUnsched(Dep, /*IsControl=*/true);
6154 }
6155 };
// Case 1: Data is a single ScheduleData. It is marked scheduled and a
// pseudo-bundle is created for each tree entry of its instruction that passes
// the filter below, so that operands are looked up through the tree entry.
6156 if (auto *SD = dyn_cast<ScheduleData>(Data)) {
6157 SD->setScheduled(/*Scheduled=*/true);
6158 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
6161 Instruction *In = SD->getInst();
6162 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(In);
6163 if (!Entries.empty()) {
6164 for (TreeEntry *TE : Entries) {
6166 In->getNumOperands() != TE->getNumOperands())
6167 continue;
6168 auto &BundlePtr =
6169 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
6170 BundlePtr->setTreeEntry(TE);
6171 BundlePtr->add(SD);
6172 Bundles.push_back(BundlePtr.get());
6173 }
6174 }
6175 ProcessBundleMember(SD, Bundles);
6176 } else {
// Case 2: Data is a ScheduleBundle. The bundle is marked scheduled and each
// member is processed once AreAllBundlesScheduled() holds for it.
6177 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
6178 Bundle.setScheduled(/*Scheduled=*/true);
6179 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
6180 auto AreAllBundlesScheduled =
6181 [&](const ScheduleEntity *SD,
6182 ArrayRef<ScheduleBundle *> SDBundles) {
6184 return true;
6185 return !SDBundles.empty() &&
6186 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
6187 return SDBundle->isScheduled();
6188 });
6189 };
6190 for (ScheduleEntity *SD : Bundle.getBundle()) {
6193 SDBundles = getScheduleBundles(SD->getInst());
6194 if (!AreAllBundlesScheduled(SD, SDBundles))
6195 continue;
6196 SD->setScheduled(/*Scheduled=*/true);
6197 if (isa<ScheduleCopyableData>(SD) ||
6198 ScheduleCopyableDataMap.empty()) {
6199 ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle
6200 : SDBundles);
6201 continue;
6202 }
6203 // The instruction may also belong to tree entries that do not need
6204 // scheduling (e.g. all their values are used outside the block), so
6205 // no schedule bundle is registered for them. Such an entry can still
6206 // model one of this instruction's operands as a copyable element,
6207 // registered on that non-scheduled parent edge. That copyable would
6208 // never be decremented when the instruction is scheduled through a
6209 // different bundle, leaving the copyable's bundle permanently
6210 // unscheduled and tripping the unscheduled-deps assertion. Add
6211 // pseudo-bundles for these missing tree entries, so their copyable
6212 // operand dependencies are decremented here as well. Real operand
6213 // dependencies are protected against double counting by the
6214 // per-operand use counter in ProcessBundleMember.
6215 Instruction *In = SD->getInst();
6217 SmallVector<ScheduleBundle *> AllBundles(SDBundles.begin(),
6218 SDBundles.end());
6219 for (TreeEntry *TE : R.getTreeEntries(In)) {
6220 if (TE->isCopyableElement(In))
6221 continue;
6223 In->getNumOperands() != TE->getNumOperands())
6224 continue;
6225 if (any_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
6226 return SDBundle->getTreeEntry() == TE;
6227 }))
6228 continue;
6229 ScheduleBundle &PseudoBundle =
6230 *PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
6231 PseudoBundle.setTreeEntry(TE);
6232 PseudoBundle.add(SD);
6233 AllBundles.push_back(&PseudoBundle);
6234 }
6235 ProcessBundleMember(SD, AllBundles);
6236 }
6237 }
6238 }
6239
6240 /// Verify basic self consistency properties of the data structure.
6241 void verify() {
6242 if (!ScheduleStart)
6243 return;
6244
6245 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
6246 ScheduleStart->comesBefore(ScheduleEnd) &&
6247 "Not a valid scheduling region?");
6248
6249 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
6250 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
6251 if (!Bundles.empty()) {
6252 for (ScheduleBundle *Bundle : Bundles) {
6253 assert(isInSchedulingRegion(*Bundle) &&
6254 "primary schedule data not in window?");
6255 Bundle->verify();
6256 }
6257 continue;
6258 }
6259 auto *SD = getScheduleData(I);
6260 if (!SD)
6261 continue;
6262 assert(isInSchedulingRegion(*SD) &&
6263 "primary schedule data not in window?");
6264 SD->verify();
6265 }
6266
6267 assert(all_of(ReadyInsts,
6268 [](const ScheduleEntity *Bundle) {
6269 return Bundle->isReady();
6270 }) &&
6271 "item in ready list not ready?");
6272 }
6273
6274 /// Put all instructions into the ReadyList which are ready for scheduling.
/// Walks the instructions from ScheduleStart up to (not including)
/// ScheduleEnd. For an instruction whose ScheduleData is ready, its bundles
/// are inserted if they are ready; if it has no bundles, the ScheduleData
/// itself is inserted.
6275 template <typename ReadyListType>
6276 void initialFillReadyList(ReadyListType &ReadyList) {
// Bundles already examined, so each bundle is considered only once.
6277 SmallPtrSet<ScheduleBundle *, 16> Visited;
6278 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
6279 ScheduleData *SD = getScheduleData(I);
6280 if (SD && SD->hasValidDependencies() && SD->isReady()) {
6281 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
6282 !Bundles.empty()) {
6283 for (ScheduleBundle *Bundle : Bundles) {
6284 if (!Visited.insert(Bundle).second)
6285 continue;
6286 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
6287 ReadyList.insert(Bundle);
6288 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
6289 << *Bundle << "\n");
6290 }
6291 }
// An instruction that has bundles is never inserted on its own.
6292 continue;
6293 }
6294 ReadyList.insert(SD);
6296 << "SLP: initially in ready list: " << *SD << "\n");
6297 }
6298 }
6299 }
6300
6301 /// Build a bundle from the ScheduleData nodes corresponding to the
6302 /// scalar instruction for each lane.
6303 /// \param VL The list of scalar instructions.
6304 /// \param S The state of the instructions.
6305 /// \param EI The edge in the SLP graph or the user node/operand number.
6306 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
6307 const InstructionsState &S, const EdgeInfo &EI);
6308
6309 /// Checks if a bundle of instructions can be scheduled, i.e. has no
6310 /// cyclic dependencies. This is only a dry-run, no instructions are
6311 /// actually moved at this stage.
6312 /// \returns the scheduling bundle. The returned Optional value is not
6313 /// std::nullopt if \p VL is allowed to be scheduled.
6314 std::optional<ScheduleBundle *>
6315 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
6316 const InstructionsState &S, const EdgeInfo &EI);
6317
6318 /// Allocates schedule data chunk.
6319 ScheduleData *allocateScheduleDataChunks();
6320
6321 /// Extends the scheduling region so that V is inside the region.
6322 /// \returns true if the region size is within the limit.
6323 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
6324
6325 /// Initialize the ScheduleData structures for new instructions in the
6326 /// scheduling region.
6327 void initScheduleData(Instruction *FromI, Instruction *ToI,
6328 ScheduleData *PrevLoadStore,
6329 ScheduleData *NextLoadStore);
6330
6331 /// Updates the dependency information of a bundle and of all instructions/
6332 /// bundles which depend on the original bundle.
6333 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
6334 BoUpSLP *SLP,
6335 const SmallPtrSetImpl<Value *> &ExpandedOps,
6336 ArrayRef<ScheduleData *> ControlDeps = {});
6337
6338 /// Sets all instructions in the scheduling region to un-scheduled.
6339 void resetSchedule();
6340
6341 BasicBlock *BB;
6342
6343 /// Simple memory allocation for ScheduleData.
6345
6346 /// The size of a ScheduleData array in ScheduleDataChunks.
6347 int ChunkSize;
6348
6349 /// The allocator position in the current chunk, which is the last entry
6350 /// of ScheduleDataChunks.
6351 int ChunkPos;
6352
6353 /// Attaches ScheduleData to Instruction.
6354 /// Note that the mapping survives during all vectorization iterations, i.e.
6355 /// ScheduleData structures are recycled.
6356 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
6357
6358 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
6359 /// number) and the operand instruction, represented as copyable element.
6360 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
6361 std::unique_ptr<ScheduleCopyableData>>
6362 ScheduleCopyableDataMap;
6363
6364 /// Represents mapping between instruction and all related
6365 /// ScheduleCopyableData (for all uses in the tree, represented as copyable
6366 /// element). The SLP tree may contain several representations of the same
6367 /// instruction.
6368 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
6369 ScheduleCopyableDataMapByInst;
6370
6371 /// Represents mapping between user value and operand number, the operand
6372 /// value and all related ScheduleCopyableData. The relation is 1:n, because
6373 /// the same user may reference the same operand in different tree entries
6374 /// and the operand may be modelled by the different copyable data element.
6375 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
6377 ScheduleCopyableDataMapByInstUser;
6378
6379 /// Represents mapping between instruction and all related
6380 /// ScheduleCopyableData. It represents the mapping between the actual
6381 /// instruction and the last copyable data element in the chain. E.g., if
6382 /// the graph models the following instructions:
6383 /// %0 = non-add instruction ...
6384 /// ...
6385 /// %4 = add %3, 1
6386 /// %5 = add %4, 1
6387 /// %6 = insertelement poison, %0, 0
6388 /// %7 = insertelement %6, %5, 1
6389 /// And the graph is modeled as:
6390 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
6391 /// -> [1, 0] -> [%1, 0]
6392 ///
6393 /// this map will map %0 only to the copyable element <1>, which is the last
6394 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
6395 /// keep the map to <0>, not the %0.
6396 SmallDenseMap<const Instruction *,
6397 SmallSetVector<ScheduleCopyableData *, 4>>
6398 ScheduleCopyableDataMapByUsers;
6399
6400 /// Attaches ScheduleBundle to Instruction.
6401 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6402 ScheduledBundles;
6403 /// The list of ScheduleBundles.
6404 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
6405
6406 /// The ready-list for scheduling (only used for the dry-run).
6407 SetVector<ScheduleEntity *> ReadyInsts;
6408
6409 /// The first instruction of the scheduling region.
6410 Instruction *ScheduleStart = nullptr;
6411
6412 /// The first instruction _after_ the scheduling region.
6413 Instruction *ScheduleEnd = nullptr;
6414
6415 /// The first memory accessing instruction in the scheduling region
6416 /// (can be null).
6417 ScheduleData *FirstLoadStoreInRegion = nullptr;
6418
6419 /// The last memory accessing instruction in the scheduling region
6420 /// (can be null).
6421 ScheduleData *LastLoadStoreInRegion = nullptr;
6422
6423 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
6424 /// region? Used to optimize the dependence calculation for the
6425 /// common case where there isn't.
6426 bool RegionHasStackSave = false;
6427
6428 /// The current size of the scheduling region.
6429 int ScheduleRegionSize = 0;
6430
6431 /// The maximum size allowed for the scheduling region.
6432 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
6433
6434 /// Operands that are modeled as copyable elements in a previously built
6435 /// vectorized node and that are used directly by a duplicate node with the
6436 /// same schedulable instructions. Their direct dependencies must be
6437 /// recomputed at the next bundle scheduling, when the duplicate node is
6438 /// already registered in the tree, so that the direct use is accounted for.
6439 /// If the duplicate node is the last scheduled bundle and no further
6440 /// scheduling consumes this list, the leftover entries are dropped on the
6441 /// next region reset and the dependencies are recomputed against the full
6442 /// tree in scheduleBlock instead. A set is used to avoid recomputing the
6443 /// same operand more than once.
6444 SmallSetVector<ScheduleData *, 8> RecalcCopyableOperandDeps;
6445
6446 /// The ID of the scheduling region. For a new vectorization iteration this
6447 /// is incremented which "removes" all ScheduleData from the region.
6448 /// Make sure that the initial SchedulingRegionID is greater than the
6449 /// initial SchedulingRegionID in ScheduleData (which is 0).
6450 int SchedulingRegionID = 1;
6451 };
6452
6453 /// Attaches the BlockScheduling structures to basic blocks.
6454 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6455
6456 /// Performs the "real" scheduling. Done before vectorization is actually
6457 /// performed in a basic block.
6458 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6459
6460 /// List of users to ignore during scheduling and that don't need extracting.
6461 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6462
6463 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6464 /// sorted SmallVectors of unsigned.
6465 struct OrdersTypeDenseMapInfo {
6466 static OrdersType getEmptyKey() {
6467 OrdersType V;
6468 V.push_back(~1U);
6469 return V;
6470 }
6471
6472 static unsigned getHashValue(const OrdersType &V) {
6473 return static_cast<unsigned>(hash_combine_range(V));
6474 }
6475
6476 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6477 return LHS == RHS;
6478 }
6479 };
6480
6481 // Analysis and block reference.
6482 Function *F;
6483 ScalarEvolution *SE;
6484 TargetTransformInfo *TTI;
6485 TargetLibraryInfo *TLI;
6486 LoopInfo *LI;
6487 DominatorTree *DT;
6488 AssumptionCache *AC;
6489 DemandedBits *DB;
6490 const DataLayout *DL;
6491 OptimizationRemarkEmitter *ORE;
6492
6493 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6494 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6495
6496 /// Instruction builder to construct the vectorized tree.
6497 IRBuilder<TargetFolder> Builder;
6498
6499 /// A map of scalar integer values to the smallest bit width with which they
6500 /// can legally be represented. The values map to (width, signed) pairs,
6501 /// where "width" indicates the minimum bit width and "signed" is True if the
6502 /// value must be signed-extended, rather than zero-extended, back to its
6503 /// original width.
6504 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6505
6506 /// Final size of the reduced vector, if the current graph represents the
6507 /// input for the reduction and it was possible to narrow the size of the
6508 /// reduction.
6509 unsigned ReductionBitWidth = 0;
6510
6511 /// Canonical graph size before the transformations.
6512 unsigned BaseGraphSize = 1;
6513
6514 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6515 /// type sizes, used in the tree.
6516 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6517
6518 /// Indices of the vectorized nodes, which are supposed to be the roots of the new
6519 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
6520 DenseSet<unsigned> ExtraBitWidthNodes;
6521};
6522
// DenseMapInfo specialization that lets BoUpSLP::EdgeInfo be used as a
// DenseMap/DenseSet key. Hashing and the empty key are delegated to the
// FirstInfo/SecondInfo helpers for the UserTE and EdgeIdx members.
// NOTE(review): the FirstInfo/SecondInfo aliases and the getEmptyKey()
// signature are not visible in this listing - confirm against the full source.
6523template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
6527 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6528 SecondInfo::getEmptyKey());
6529 }
6530
// Combines the hash of the user tree entry with the hash of the edge index.
6531 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6532 return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
6533 SecondInfo::getHashValue(Val.EdgeIdx));
6534 }
6535
// Equality is delegated to EdgeInfo's operator==.
6536 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6537 const BoUpSLP::EdgeInfo &RHS) {
6538 return LHS == RHS;
6539 }
6540};
6541
// GraphTraits specialization exposing the SLP vectorizable tree as a graph
// whose nodes are TreeEntry pointers.
// NOTE(review): several declaration lines (NodeRef alias, iterator class
// heads, function signatures) are not visible in this listing; the added
// comments only describe the visible bodies.
6542template <> struct llvm::GraphTraits<BoUpSLP *> {
6543 using TreeEntry = BoUpSLP::TreeEntry;
6544
6545 /// NodeRef has to be a pointer per the GraphWriter.
6547
6548 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6549
6550 /// Add the VectorizableTree to the index iterator to be able to return
6551 /// TreeEntry pointers.
6553 : public iterator_adaptor_base<
6554 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6556
6560
// Dereferencing yields the user tree entry of the current edge.
6561 NodeRef operator*() { return I->UserTE; }
6562 };
6563
// The entry node is the first element of VectorizableTree.
6565 return R.VectorizableTree[0].get();
6566 }
6567
// The child range built below covers exactly one edge: the node's own
// UserTreeIndex (begin points at it, end points one past it).
6569 return {&N->UserTreeIndex, N->Container};
6570 }
6571
6573 return {&N->UserTreeIndex + 1, N->Container};
6574 }
6575
6576 /// For the node iterator we just need to turn the TreeEntry iterator into a
6577 /// TreeEntry* iterator so that it dereferences to NodeRef.
6579 using ItTy = ContainerTy::iterator;
6580 ItTy It;
6581
6582 public:
6583 nodes_iterator(const ItTy &It2) : It(It2) {}
6584 NodeRef operator*() { return It->get(); }
6586 ++It;
6587 return *this;
6588 }
6589 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6590 };
6591
// Node iteration spans the whole VectorizableTree container.
6593 return nodes_iterator(R->VectorizableTree.begin());
6594 }
6595
6597 return nodes_iterator(R->VectorizableTree.end());
6598 }
6599
// Number of nodes equals the number of tree entries.
6600 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6601};
6602
6603template <>
6605 using TreeEntry = BoUpSLP::TreeEntry;
6606
// Forwards \p IsSimple to the DefaultDOTGraphTraits base.
6607 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6608
6609 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6610 std::string Str;
6611 raw_string_ostream OS(Str);
6612 OS << Entry->Idx << ".\n";
6613 if (isSplat(Entry->Scalars))
6614 OS << "<splat> ";
6615 for (auto *V : Entry->Scalars) {
6616 OS << *V;
6617 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6618 return EU.Scalar == V;
6619 }))
6620 OS << " <extract>";
6621 OS << "\n";
6622 }
6623 return Str;
6624 }
6625
6626 static std::string getNodeAttributes(const TreeEntry *Entry,
6627 const BoUpSLP *) {
6628 if (Entry->isGather())
6629 return "color=red";
6630 if (Entry->State == TreeEntry::ScatterVectorize ||
6631 Entry->State == TreeEntry::StridedVectorize ||
6632 Entry->State == TreeEntry::CompressVectorize)
6633 return "color=blue";
6634 return "";
6635 }
6636};
6637
// NOTE(review): the function signature, the declaration of DeadInsts and two
// interior lines are not visible in this listing; the comments below only
// describe the visible statements.
// First pass: prepare every instruction in DeletedInstructions for erasure.
6640 for (auto *I : DeletedInstructions) {
6641 if (!I->getParent()) {
6642 // Temporarily insert instruction back to erase them from parent and
6643 // memory later.
6644 if (isa<PHINode>(I))
6645 // Phi nodes must be the very first instructions in the block.
6646 I->insertBefore(F->getEntryBlock(),
6647 F->getEntryBlock().getFirstNonPHIIt());
6648 else
6649 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6650 continue;
6651 }
// For instructions still in a block: remember operands that are not
// themselves scheduled for deletion and have a single user (plus a further
// condition on a line not visible here), then drop all operand references.
6652 for (Use &U : I->operands()) {
6653 auto *Op = dyn_cast<Instruction>(U.get());
6654 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6656 DeadInsts.emplace_back(Op);
6657 }
6658 I->dropAllReferences();
6659 }
// Second pass: every instruction must be unused by now and is erased.
6660 for (auto *I : DeletedInstructions) {
6661 assert(I->use_empty() &&
6662 "trying to erase instruction with users.");
6663 I->eraseFromParent();
6664 }
6665
6666 // Cleanup any dead scalar code feeding the vectorized instructions
6668
6669#ifdef EXPENSIVE_CHECKS
6670 // If we could guarantee that this call is not extremely slow, we could
6671 // remove the ifdef limitation (see PR47712).
6672 assert(!verifyFunction(*F, &dbgs()));
6673#endif
6674}
6675
6676 /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6677 /// contains the original mask for the scalars reused in the node. The procedure
6678 /// transforms this mask in accordance with the given \p Mask.
/// For every index I with a non-poison Mask[I], the element previously stored
/// at Reuses[I] is written to Reuses[Mask[I]]; positions that are not written
/// keep their previous value. \p Reuses and \p Mask must have the same,
/// non-zero size.
/// NOTE(review): the signature line is not visible in this listing.
6680 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6681 "Expected non-empty mask.");
// Prev is a copy of Reuses; the two hold equal contents when swapped.
6682 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6683 Prev.swap(Reuses);
6684 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6685 if (Mask[I] != PoisonMaskElem)
6686 Reuses[Mask[I]] = Prev[I];
6687}
6688
6689 /// Reorders the given \p Order according to the given \p Mask. \p Order - is
6690 /// the original order of the scalars. Procedure transforms the provided order
6691 /// in accordance with the given \p Mask. If the resulting \p Order is just an
6692 /// identity order, \p Order is cleared.
/// An empty \p Order on input is treated as the identity order. The value
/// Mask.size() is used inside \p Order as the marker for an unset position.
/// NOTE(review): the first line of the signature is not visible in this
/// listing.
6694 bool BottomOrder = false) {
6695 assert(!Mask.empty() && "Expected non-empty mask.");
6696 unsigned Sz = Mask.size();
6697 if (BottomOrder) {
// Bottom order: the new order is the previous order indexed through Mask,
// i.e. Order[I] = PrevOrder[Mask[I]] for every non-poison Mask[I].
6698 SmallVector<unsigned> PrevOrder;
6699 if (Order.empty()) {
6700 PrevOrder.resize(Sz);
6701 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6702 } else {
6703 PrevOrder.swap(Order);
6704 }
6705 Order.assign(Sz, Sz);
6706 for (unsigned I = 0; I < Sz; ++I)
6707 if (Mask[I] != PoisonMaskElem)
6708 Order[I] = PrevOrder[Mask[I]];
// Every position is either unset or maps to itself: this is an identity.
6709 if (all_of(enumerate(Order), [&](const auto &Data) {
6710 return Data.value() == Sz || Data.index() == Data.value();
6711 })) {
6712 Order.clear();
6713 return;
6714 }
6715 fixupOrderingIndices(Order);
6716 return;
6717 }
// Default path: turn Order into a shuffle mask, reorder that mask with
// reorderReuses(), then convert the result back into an order.
6718 SmallVector<int> MaskOrder;
6719 if (Order.empty()) {
6720 MaskOrder.resize(Sz);
6721 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6722 } else {
6723 inversePermutation(Order, MaskOrder);
6724 }
6725 reorderReuses(MaskOrder, Mask);
6726 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
6727 Order.clear();
6728 return;
6729 }
// Invert the reordered mask; positions that are never written stay at Sz.
6730 Order.assign(Sz, Sz);
6731 for (unsigned I = 0; I < Sz; ++I)
6732 if (MaskOrder[I] != PoisonMaskElem)
6733 Order[MaskOrder[I]] = I;
6734 fixupOrderingIndices(Order);
6735}
6736
/// Tries to derive an ordering for the scalars of the gather node \p TE from
/// the shuffles that could produce it: shuffles of extractelement sources
/// (tryToGatherExtractElements) and shuffles of other tree entries
/// (isGatherShuffledEntry). Returns std::nullopt when no useful order is
/// found; otherwise returns an order in which unset positions hold
/// TE.Scalars.size().
// NOTE(review): the declarations of `ExtractShuffles`, `Entries` and
// `GatherShuffles` are on lines missing from this listing.
6737std::optional<BoUpSLP::OrdersType>
6738BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6739 bool TopToBottom, bool IgnoreReorder) {
6740 assert(TE.isGather() && "Expected gather node only.");
6741 // Try to find subvector extract/insert patterns and reorder only such
6742 // patterns.
6743 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6744 Type *ScalarTy = GatheredScalars.front()->getType();
6745 size_t NumScalars = GatheredScalars.size();
6746 if (!isValidElementType(ScalarTy))
6747 return std::nullopt;
6748 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
6749 unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, ScalarTy, NumScalars);
6750 SmallVector<int> ExtractMask;
6751 SmallVector<int> Mask;
6754 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6756 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6757 /*ForOrder=*/true);
6758 // No shuffled operands - ignore.
6759 if (GatherShuffles.empty() && ExtractShuffles.empty())
6760 return std::nullopt;
// NumScalars marks a position of the order as "not set".
6761 OrdersType CurrentOrder(NumScalars, NumScalars);
6762 if (GatherShuffles.size() == 1 &&
6763 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6764 Entries.front().front()->isSame(TE.Scalars)) {
6765 // If the full matched node in whole tree rotation - no need to consider the
6766 // matching order, rotating the whole tree.
6767 if (TopToBottom)
6768 return std::nullopt;
6769 // No need to keep the order for the same user node.
6770 if (Entries.front().front()->UserTreeIndex.UserTE ==
6771 TE.UserTreeIndex.UserTE)
6772 return std::nullopt;
6773 // No need to keep the order for the matched root node, if it can be freely
6774 // reordered.
6775 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6776 return std::nullopt;
6777 // If shuffling 2 elements only and the matching node has reverse reuses -
6778 // no need to count order, both work fine.
6779 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6780 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6781 any_of(enumerate(Entries.front().front()->ReuseShuffleIndices),
6782 [](const auto &P) {
6783 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6784 }))
6785 return std::nullopt;
6786
6787 // Perfect match in the graph, will reuse the previously vectorized
6788 // node. Cost is 0.
6789 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6790 return CurrentOrder;
6791 }
// True if every non-poison element of the mask equals the first non-poison
// element (an all-poison mask also qualifies).
6792 auto IsSplatMask = [](ArrayRef<int> Mask) {
6793 int SingleElt = PoisonMaskElem;
6794 return all_of(Mask, [&](int I) {
6795 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6796 SingleElt = I;
6797 return I == PoisonMaskElem || I == SingleElt;
6798 });
6799 };
6800 // Exclusive broadcast mask - ignore.
6801 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6802 (Entries.size() != 1 ||
6803 Entries.front().front()->ReorderIndices.empty())) ||
6804 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6805 return std::nullopt;
// One bit per part; a set bit means the part was rejected and its slice of the
// order was reset to "not set".
6806 SmallBitVector ShuffledSubMasks(NumParts);
// Translates a shuffle mask into entries of CurrentOrder, one part of PartSz
// elements at a time. GetVF(I) supplies the vector factor of the source for
// part I (0 means "nothing to do for this part"). A part that would need a
// second source vector is reset and recorded in ShuffledSubMasks.
6807 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6808 ArrayRef<int> Mask, int PartSz, int NumParts,
6809 function_ref<unsigned(unsigned)> GetVF) {
6810 for (int I : seq<int>(NumParts)) {
6811 if (ShuffledSubMasks.test(I))
6812 continue;
6813 const int VF = GetVF(I);
6814 if (VF == 0)
6815 continue;
6816 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
6817 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
6818 // Shuffle of at least 2 vectors - ignore.
6819 if (any_of(Slice, not_equal_to(NumScalars))) {
6820 llvm::fill(Slice, NumScalars);
6821 ShuffledSubMasks.set(I);
6822 continue;
6823 }
6824 // Try to include as much elements from the mask as possible.
6825 int FirstMin = INT_MAX;
6826 int SecondVecFound = false;
6827 for (int K : seq<int>(Limit)) {
6828 int Idx = Mask[I * PartSz + K];
6829 if (Idx == PoisonMaskElem) {
// A non-poison constant at an unmatched position is treated like a second
// source vector.
6830 Value *V = GatheredScalars[I * PartSz + K];
6831 if (isConstant(V) && !isa<PoisonValue>(V)) {
6832 SecondVecFound = true;
6833 break;
6834 }
6835 continue;
6836 }
6837 if (Idx < VF) {
6838 if (FirstMin > Idx)
6839 FirstMin = Idx;
6840 } else {
6841 SecondVecFound = true;
6842 break;
6843 }
6844 }
// Round the smallest referenced index down to a multiple of PartSz.
6845 FirstMin = (FirstMin / PartSz) * PartSz;
6846 // Shuffle of at least 2 vectors - ignore.
6847 if (SecondVecFound) {
6848 llvm::fill(Slice, NumScalars);
6849 ShuffledSubMasks.set(I);
6850 continue;
6851 }
6852 for (int K : seq<int>(Limit)) {
6853 int Idx = Mask[I * PartSz + K];
6854 if (Idx == PoisonMaskElem)
6855 continue;
6856 Idx -= FirstMin;
6857 if (Idx >= PartSz) {
6858 // Cross-part / second-vector reference: this slice cannot be
6859 // ordered as a single first-vector permutation, give up.
6860 SecondVecFound = true;
6861 break;
6862 }
6863 // For the last partial slice, Limit < PartSz and Idx in [Limit,
6864 // PartSz) addresses the unused padded tail (no scalar at that
6865 // position). Skip the write but keep ordering the remaining K's.
6866 if (static_cast<unsigned>(I * PartSz + Idx) >= CurrentOrder.size())
6867 continue;
// Record the smallest K that maps to this position, unless the position
// already holds its own index.
6868 if (CurrentOrder[I * PartSz + Idx] >
6869 static_cast<unsigned>(I * PartSz + K) &&
6870 CurrentOrder[I * PartSz + Idx] !=
6871 static_cast<unsigned>(I * PartSz + Idx))
6872 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6873 }
6874 // Shuffle of at least 2 vectors - ignore.
6875 if (SecondVecFound) {
6876 llvm::fill(Slice, NumScalars);
6877 ShuffledSubMasks.set(I);
6878 continue;
6879 }
6880 }
6881 };
6882 int PartSz = getPartNumElems(NumScalars, NumParts);
// For extract shuffles the vector factor of a part is the largest (known
// minimum) element count among the vector operands of the extractelement
// scalars that the part references.
6883 if (!ExtractShuffles.empty())
6884 TransformMaskToOrder(
6885 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6886 if (I >= ExtractShuffles.size() || !ExtractShuffles[I])
6887 return 0U;
6888 unsigned VF = 0;
6889 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6890 for (unsigned Idx : seq<unsigned>(Sz)) {
6891 int K = I * PartSz + Idx;
6892 if (static_cast<unsigned>(K) >= ExtractMask.size())
6893 break;
6894 if (ExtractMask[K] == PoisonMaskElem)
6895 continue;
// Map the position back through the node's reuse and reorder indices to find
// the scalar it refers to.
6896 if (!TE.ReuseShuffleIndices.empty())
6897 K = TE.ReuseShuffleIndices[K];
6898 if (K == PoisonMaskElem)
6899 continue;
6900 if (!TE.ReorderIndices.empty())
6901 K = std::distance(TE.ReorderIndices.begin(),
6902 find(TE.ReorderIndices, K));
6903 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
6904 if (!EI)
6905 continue;
6906 VF = std::max(VF, EI->getVectorOperandType()
6907 ->getElementCount()
6908 .getKnownMinValue());
6909 }
6910 return VF;
6911 });
6912 // Check special corner case - single shuffle of the same entry.
6913 if (GatherShuffles.size() == 1 && NumParts != 1) {
6914 if (ShuffledSubMasks.any())
6915 return std::nullopt;
6916 PartSz = NumScalars;
6917 NumParts = 1;
6918 }
// For entry shuffles the vector factor of a part is the larger vector factor
// of the first and the last entry recorded for that part.
6919 if (!Entries.empty())
6920 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6921 if (I >= GatherShuffles.size() || !GatherShuffles[I])
6922 return 0U;
6923 return std::max(Entries[I].front()->getVectorFactor(),
6924 Entries[I].back()->getVectorFactor());
6925 });
// Give up when every part was rejected or when, for more than two scalars, at
// least half of the positions are still unset.
6926 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6927 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6928 return std::nullopt;
6929 return std::move(CurrentOrder);
6930}
6931
/// Returns true if \p Ptr1 and \p Ptr2 are considered compatible. Each
/// pointer that is a GEP must have exactly two operands, and in addition one
/// of the following must hold: every GEP index among the two is a constant,
/// \p CompareOpcodes is false, or both pointers are GEPs whose index operands
/// are accepted by getSameOpcode().
// NOTE(review): the condition guarding the first `return false` is on lines
// missing from this listing.
6932static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6933 const TargetLibraryInfo &TLI,
6934 bool CompareOpcodes = true) {
6937 return false;
6938 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
6939 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
6940 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6941 (!GEP2 || GEP2->getNumOperands() == 2) &&
6942 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6943 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6944 !CompareOpcodes ||
6945 (GEP1 && GEP2 &&
6946 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6947}
6948
6949 /// Calculates minimal alignment as a common alignment.
/// Every value in \p VL is cast to \p T and queried with getAlign(); the
/// smallest alignment found is returned. \p VL is expected to be non-empty,
/// since its first element is read unconditionally.
// NOTE(review): the signature line is missing from this listing. Presumably
// consume_front() returns the first element and removes it from VL, so the
// loop below visits only the remaining elements - confirm.
6950template <typename T>
6952 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6953 for (Value *V : VL)
6954 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6955 return CommonAlignment;
6956}
6957
6958 /// Check if \p Order represents reverse order.
/// An element at index I matches if it equals Order.size() - I - 1; elements
/// equal to Order.size() are accepted at any position. \p Order must not be
/// empty.
// NOTE(review): the signature line is missing from this listing.
6960 assert(!Order.empty() &&
6961 "Order is empty. Please check it before using isReverseOrder.");
6962 unsigned Sz = Order.size();
6963 return all_of(enumerate(Order), [&](const auto &Pair) {
6964 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6965 });
6966}
6967
6968 /// Checks if the provided list of pointers \p Pointers represents the strided
6969 /// pointers for type ElemTy. If they are not, nullptr is returned.
6970 /// Otherwise, SCEV* of the stride value is returned.
6971 /// If `PointerOps` can be rearranged into the following sequence:
6972 /// ```
6973 /// %x + c_0 * stride,
6974 /// %x + c_1 * stride,
6975 /// %x + c_2 * stride
6976 /// ...
6977 /// ```
6978 /// where each `c_i` is constant. The `Coeffs` will contain `c_0, c_1, c_2, ..`
6979 /// and the SCEV of the `stride` will be returned.
/// nullptr is also returned when the computed stride is a SCEV constant.
/// \p Coeffs must already have one slot per pointer. \p SortedIndices is
/// cleared, and filled only when the pointers are not already in increasing
/// offset order.
6980static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6981 const DataLayout &DL, ScalarEvolution &SE,
6982 SmallVectorImpl<unsigned> &SortedIndices,
6983 SmallVectorImpl<int64_t> &Coeffs) {
6984 assert(Coeffs.size() == PointerOps.size() &&
6985 "Coeffs vector needs to be of correct size");
// NOTE(review): the declaration of `SCEVs` (the per-pointer SCEV list used
// below) is on a line missing from this listing.
6987 const SCEV *PtrSCEVLowest = nullptr;
6988 const SCEV *PtrSCEVHighest = nullptr;
6989 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6990 // addresses).
6991 for (Value *Ptr : PointerOps) {
6992 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
6993 if (!PtrSCEV)
6994 return nullptr;
6995 SCEVs.push_back(PtrSCEV);
6996 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6997 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6998 continue;
6999 }
7000 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
7001 if (isa<SCEVCouldNotCompute>(Diff))
7002 return nullptr;
7003 if (Diff->isNonConstantNegative()) {
7004 PtrSCEVLowest = PtrSCEV;
7005 continue;
7006 }
7007 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
7008 if (isa<SCEVCouldNotCompute>(Diff1))
7009 return nullptr;
7010 if (Diff1->isNonConstantNegative()) {
7011 PtrSCEVHighest = PtrSCEV;
7012 continue;
7013 }
7014 }
7015 // Dist = PtrSCEVHighest - PtrSCEVLowest;
7016 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
7017 if (isa<SCEVCouldNotCompute>(Dist))
7018 return nullptr;
7019 int Size = DL.getTypeStoreSize(ElemTy);
// Returns Dist / Multiplier as a SCEV. For a multiplication only its first
// two operands are inspected: if one of them is Multiplier the other one is
// returned, otherwise nullptr. If Dist is Multiplier itself the result is the
// constant 1; any other expression goes through an exact unsigned division.
7020 auto TryGetStride = [&](const SCEV *Dist,
7021 const SCEV *Multiplier) -> const SCEV * {
7022 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
7023 if (M->getOperand(0) == Multiplier)
7024 return M->getOperand(1);
7025 if (M->getOperand(1) == Multiplier)
7026 return M->getOperand(0);
7027 return nullptr;
7028 }
7029 if (Multiplier == Dist)
7030 return SE.getConstant(Dist->getType(), 1);
7031 return SE.getUDivExactExpr(Dist, Multiplier);
7032 };
7033 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
7034 const SCEV *Stride = nullptr;
7035 if (Size != 1 || SCEVs.size() > 1) {
7036 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
7037 Stride = TryGetStride(Dist, Sz);
7038 if (!Stride)
7039 return nullptr;
7040 }
// Only a non-constant (run-time) stride is reported by this function.
7041 if (!Stride || isa<SCEVConstant>(Stride))
7042 return nullptr;
7043 // Iterate through all pointers and check if all distances are
7044 // unique multiple of Stride.
7045 using DistOrdPair = std::pair<int64_t, int>;
7046 auto Compare = llvm::less_first();
7047 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
7048 int Cnt = 0;
7049 bool IsConsecutive = true;
7050 for (const auto [Idx, PtrSCEV] : enumerate(SCEVs)) {
// NOTE(review): Dist is `unsigned` but is assigned from getZExtValue() and
// mixed with the signed `Size` below; presumably coefficients are always
// small enough for this not to truncate - confirm.
7051 unsigned Dist = 0;
7052 if (PtrSCEV != PtrSCEVLowest) {
7053 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
7054 const SCEV *Coeff = TryGetStride(Diff, Stride);
7055 if (!Coeff)
7056 return nullptr;
// NOTE(review): SC is already a SCEVConstant here, so the
// isa<SCEVCouldNotCompute> test looks redundant - confirm.
7057 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
7058 if (!SC || isa<SCEVCouldNotCompute>(SC))
7059 return nullptr;
7060 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
// Verify that PtrSCEV == PtrSCEVLowest + Stride * SC exactly.
7061 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
7062 SE.getMulExpr(Stride, SC)))
7063 ->isZero())
7064 return nullptr;
7065 Dist = SC->getAPInt().getZExtValue();
7066 } else {
7067 Coeffs[Idx] = 0;
7068 }
7069 // If the strides are not the same or repeated, we can't vectorize.
7070 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
7071 return nullptr;
// Offsets is keyed on the distance only, so a repeated distance fails to
// insert.
7072 auto Res = Offsets.emplace(Dist, Cnt);
7073 if (!Res.second)
7074 return nullptr;
7075 // Consecutive order if the inserted element is the last one.
7076 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
7077 ++Cnt;
7078 }
7079 if (Offsets.size() != SCEVs.size())
7080 return nullptr;
7081 SortedIndices.clear();
7082 if (!IsConsecutive) {
7083 // Fill SortedIndices array only if it is non-consecutive.
7084 SortedIndices.resize(PointerOps.size());
7085 Cnt = 0;
7086 for (const std::pair<int64_t, int> &Pair : Offsets) {
7087 SortedIndices[Cnt] = Pair.second;
7088 ++Cnt;
7089 }
7090 }
7091 return Stride;
7092}
7093
// Forward declaration of a static helper returning a pair of instruction
// costs.
// NOTE(review): the line carrying the function name and the leading parameters
// is missing from this listing; presumably this declares `getGEPCosts`, which
// is called later with a matching argument list - confirm.
7094static std::pair<InstructionCost, InstructionCost>
7096 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
7097 Type *ScalarTy, VectorType *VecTy);
7098
7099 /// Returns the cost of the shuffle instructions with the given \p Kind, vector
7100 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
7101 /// subvector pattern.
/// When \p Mask is non-empty the destination type has Mask.size() elements of
/// Tp's scalar type; otherwise it is \p Tp. Only TTI::SK_PermuteTwoSrc shuffles
/// are examined for the insert-subvector pattern; every other kind is
/// forwarded to TTI unchanged.
// NOTE(review): several signature lines and the first half of the
// insert-subvector `if` condition are missing from this listing.
7102static InstructionCost
7104 VectorType *Tp, ArrayRef<int> Mask = {},
7106 int Index = 0, VectorType *SubTp = nullptr,
7108 VectorType *DstTy = Tp;
7109 if (!Mask.empty())
7110 DstTy = FixedVectorType::get(Tp->getScalarType(), Mask.size());
7111
7112 if (Kind != TTI::SK_PermuteTwoSrc)
7113 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
7114 Args);
7115 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7116 int NumSubElts;
7118 Mask, NumSrcElts, NumSubElts, Index)) {
// The inserted subvector runs past the end of the source vector but the whole
// source still fits into the mask: cost it as an insert of Tp at Index.
// NOTE(review): this call passes TTI::TCK_RecipThroughput rather than the
// CostKind used by the other two calls - confirm this is intentional.
7119 if (Index + NumSubElts > NumSrcElts &&
7120 Index + NumSrcElts <= static_cast<int>(Mask.size()))
7121 return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
7122 TTI::TCK_RecipThroughput, Index, Tp);
7123 }
7124 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
7125 Args);
7126}
7127
7128 /// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
7129 /// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
7130 /// instead of a scalar.
/// \p DemandedElts has one bit per \p ScalarTy-sized element of \p Ty. For a
/// scalar \p ScalarTy the request is forwarded to TTI unchanged.
// NOTE(review): the first signature line, the last parameter line(s), the
// start of the first assert and the cost-accumulation statements inside the
// loop are missing from this listing.
7132 const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty,
7133 const APInt &DemandedElts, bool Insert, bool Extract,
7134 TTI::TargetCostKind CostKind, bool ForPoisonSrc = true,
7135 ArrayRef<Value *> VL = {},
7138 "ScalableVectorType is not supported.");
7139 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
7140 getNumElements(Ty) &&
7141 "Incorrect usage.");
7142 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
7143 assert(SLPReVec && "Only supported by REVEC.");
7144 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
7145 // of CreateInsertElement.
7146 unsigned ScalarTyNumElements = VecTy->getNumElements();
7147 InstructionCost Cost = 0;
// Each demanded element I corresponds to a subvector of type VecTy starting
// at element I * ScalarTyNumElements of Ty.
7148 for (unsigned I : seq(DemandedElts.getBitWidth())) {
7149 if (!DemandedElts[I])
7150 continue;
7151 if (Insert)
7153 I * ScalarTyNumElements, VecTy);
7154 if (Extract)
7156 I * ScalarTyNumElements, VecTy);
7157 }
7158 return Cost;
7159 }
7160 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
7161 CostKind, ForPoisonSrc, VL, VIC);
7162}
7163
7164 /// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
7165 /// is a FixedVectorType, a vector will be extracted instead of a scalar.
/// The special handling applies only to Instruction::ExtractElement; the
/// subvector is taken at element offset Index * VecTy->getNumElements() of
/// \p Val. Everything else is forwarded to TTI.getVectorInstrCost().
// NOTE(review): the first signature line and the start of the return
// statement inside the REVEC branch are missing from this listing.
7167 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
7168 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
7169 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
7170 if (Opcode == Instruction::ExtractElement) {
7171 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
7172 assert(SLPReVec && "Only supported by REVEC.");
7173 assert(isa<VectorType>(Val) && "Val must be a vector type.");
7175 cast<VectorType>(Val), {}, CostKind,
7176 Index * VecTy->getNumElements(), VecTy);
7177 }
7178 }
7179 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
7180 ScalarUserAndIdx);
7181}
7182
7183 /// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
7184 /// is a FixedVectorType, a vector will be extracted instead of a scalar.
/// In the vector case the result is the sum of a subvector-extract cost at
/// element offset Index * getNumElements(Dst) and the cost of casting the
/// extracted subvector type to \p Dst with \p Opcode.
// NOTE(review): the first signature line, the CostKind parameter line, the
// construction of SubTp and the start of the return statement are missing
// from this listing.
7186 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
7187 VectorType *VecTy, unsigned Index,
7189 if (isVectorizedTy(Dst)) {
7190 assert(SLPReVec && "Only supported by REVEC.");
7191 auto *SubTp = cast<FixedVectorType>(
7194 Index * getNumElements(Dst), SubTp) +
7195 TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
7196 CostKind);
7197 }
7198 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
7199}
7200
7201 /// Creates subvector insert. Generates shuffle using \p Generator or
7202 /// using default shuffle.
/// Inserts \p V into \p Vec starting at element \p Index. If both values are
/// poison, \p Vec is returned unchanged. If only \p Vec is poison, a
/// single-source shuffle of \p V is emitted and \p Generator is not consulted.
// NOTE(review): the first signature line and the declaration of `Mask` are
// missing from this listing.
7204 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
7205 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
7206 if (isa<PoisonValue>(Vec) && isa<PoisonValue>(V))
7207 return Vec;
7208 const unsigned SubVecVF = getNumElements(V->getType());
7209 // Create shuffle, insertvector requires that index is multiple of
7210 // the subvector length.
7211 const unsigned VecVF = getNumElements(Vec->getType());
7213 if (isa<PoisonValue>(Vec)) {
// Positions [Index, Index + SubVecVF) select elements 0.. of V.
7214 auto *Begin = std::next(Mask.begin(), Index);
7215 std::iota(Begin, std::next(Begin, SubVecVF), 0);
7216 Vec = Builder.CreateShuffleVector(V, Mask);
7217 return Vec;
7218 }
// Identity over Vec, except that positions [Index, Index + SubVecVF) select
// from the second shuffle operand (indices VecVF, VecVF + 1, ...).
7219 std::iota(Mask.begin(), Mask.end(), 0);
7220 std::iota(std::next(Mask.begin(), Index),
7221 std::next(Mask.begin(), Index + SubVecVF), VecVF);
7222 if (Generator)
7223 return Generator(Vec, V, Mask);
7224 // 1. Resize V to the size of Vec.
7225 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
7226 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
7227 V = Builder.CreateShuffleVector(V, ResizeMask);
7228 // 2. Insert V into Vec.
7229 return Builder.CreateShuffleVector(Vec, V, Mask);
7230}
7231
7232 /// Generates subvector extract using \p Generator or using default shuffle.
/// The result is a single-source shuffle of \p Vec selecting the \p SubVecVF
/// consecutive elements that start at \p Index.
// NOTE(review): the first signature line is missing from this listing. The
// visible code never uses a Generator, so the mention above may be stale -
// confirm against the full signature.
7234 unsigned SubVecVF, unsigned Index) {
7235 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
7236 std::iota(Mask.begin(), Mask.end(), Index);
7237 return Builder.CreateShuffleVector(Vec, Mask);
7238}
7239
7240 /// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
7241 /// with \p Order.
7242 /// \return true if the mask represents strided access, false - otherwise.
/// CompressMask[I] receives the offset of the I-th pointer (taken in \p Order
/// when it is non-empty) from the first one, as computed by getPointersDiff()
/// for \p ScalarTy. If an offset cannot be computed or exceeds the range of
/// `unsigned`, false is returned immediately and the remaining mask elements
/// stay poison. \p PointerOps is expected to be non-empty, since element 0 of
/// the mask is written unconditionally.
// NOTE(review): the first signature line is missing from this listing.
7244 ArrayRef<unsigned> Order, Type *ScalarTy,
7245 const DataLayout &DL, ScalarEvolution &SE,
7246 SmallVectorImpl<int> &CompressMask) {
7247 const unsigned Sz = PointerOps.size();
7248 CompressMask.assign(Sz, PoisonMaskElem);
7249 // The first element always set.
7250 CompressMask[0] = 0;
7251 // Check if the mask represents strided access.
// Stride == 0 means no stride has been recorded yet; an empty optional means
// the offsets were found not to be uniformly strided.
7252 std::optional<unsigned> Stride = 0;
7253 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
7254 for (unsigned I : seq<unsigned>(1, Sz)) {
7255 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
7256 std::optional<int64_t> OptPos =
7257 getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
7258 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
7259 return false;
7260 unsigned Pos = static_cast<unsigned>(*OptPos);
7261 CompressMask[I] = Pos;
// Keep filling the mask even after the strided property has been disproved.
7262 if (!Stride)
7263 continue;
7264 if (*Stride == 0) {
7265 *Stride = Pos;
7266 continue;
7267 }
7268 if (Pos != *Stride * I)
7269 Stride.reset();
7270 }
7271 return Stride.has_value();
7272}
7273
7274 /// Checks if the \p VL can be transformed to a (masked)load + compress or
7275 /// (masked) interleaved load.
/// Outputs:
///  - \p IsMasked: set when the wide load is not known to be safe to execute
///    unconditionally.
///  - \p InterleaveFactor: reset to 0 on entry; set to CompressMask[1] when an
///    interleaved load is selected.
///  - \p CompressMask: filled by buildCompressMask(), then remapped through
///    the inverse of the order when an order is given.
///  - \p LoadVecTy: type of the wide load (Diff + 1 elements, or the widened
///    type chosen for the interleaved form).
/// \return true if the vector form is estimated to be cheaper than gathering
/// the scalar loads.
// NOTE(review): the first signature line, the parameter line(s) declaring
// `Order` and `TTI`, the declaration of `CostKind`, and the starts of the
// `MaxRegSize` and `ScalarLoadsCost` initializers are missing from this
// listing.
7277 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
7280 const DominatorTree &DT, const TargetLibraryInfo &TLI,
7281 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
7282 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
7283 VectorType *&LoadVecTy) {
7284 InterleaveFactor = 0;
7285 Type *ScalarTy = VL.front()->getType();
7286 const size_t Sz = VL.size();
7287 auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, Sz));
7289 SmallVector<int> Mask;
7290 if (!Order.empty())
7291 inversePermutation(Order, Mask);
7292 // Check external uses.
// Bail out as soon as a scalar with non-vectorized users has an extract cost
// that does not exceed the cost of the scalar instruction itself.
7293 for (const auto [I, V] : enumerate(VL)) {
7294 if (AreAllUsersVectorized(V))
7295 continue;
7296 InstructionCost ExtractCost =
7297 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
7298 Mask.empty() ? I : Mask[I]);
7299 InstructionCost ScalarCost =
7300 TTI.getInstructionCost(cast<Instruction>(V), CostKind);
7301 if (ExtractCost <= ScalarCost)
7302 return false;
7303 }
// Ptr0 / PtrN are the first and the last pointer in the (possibly reordered)
// sequence.
7304 Value *Ptr0;
7305 Value *PtrN;
7306 if (Order.empty()) {
7307 Ptr0 = PointerOps.front();
7308 PtrN = PointerOps.back();
7309 } else {
7310 Ptr0 = PointerOps[Order.front()];
7311 PtrN = PointerOps[Order.back()];
7312 }
7313 std::optional<int64_t> Diff =
7314 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
7315 if (!Diff)
7316 return false;
7317 const size_t MaxRegSize =
7319 .getFixedValue();
7320 // Check for very large distances between elements.
7321 if (*Diff / Sz >= MaxRegSize / 8)
7322 return false;
// The wide load covers every element from Ptr0 up to and including PtrN.
7323 LoadVecTy = cast<FixedVectorType>(getWidenedType(ScalarTy, *Diff + 1));
7324 auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
7325 Align CommonAlignment = LI->getAlign();
7326 IsMasked = !isSafeToLoadUnconditionally(
7327 Ptr0, LoadVecTy, CommonAlignment, DL,
7328 cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
7329 &TLI);
7330 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
7331 LI->getPointerAddressSpace()))
7332 return false;
7333 // TODO: perform the analysis of each scalar load for better
7334 // safe-load-unconditionally analysis.
7335 bool IsStrided =
7336 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
7337 assert(CompressMask.size() >= 2 && "At least two elements are required");
7338 SmallVector<Value *> OrderedPointerOps(PointerOps);
7339 if (!Order.empty())
7340 reorderScalars(OrderedPointerOps, Mask);
7341 auto [ScalarGEPCost, VectorGEPCost] =
7342 getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
7343 Instruction::Load, CostKind, ScalarTy, LoadVecTy);
7344 // The cost of scalar loads.
7345 InstructionCost ScalarLoadsCost =
7347 [&](InstructionCost C, Value *V) {
7348 return C + TTI.getInstructionCost(cast<Instruction>(V),
7349 CostKind);
7350 }) +
7351 ScalarGEPCost;
// Baseline: scalar loads plus inserting every element into a vector.
7352 APInt DemandedElts = APInt::getAllOnes(Sz);
7353 InstructionCost GatherCost =
7354 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7355 /*Insert=*/true,
7356 /*Extract=*/false, CostKind) +
7357 ScalarLoadsCost;
7358 InstructionCost LoadCost = 0;
7359 if (IsMasked) {
7360 LoadCost = TTI.getMemIntrinsicInstrCost(
7361 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
7362 CommonAlignment,
7363 LI->getPointerAddressSpace()),
7364 CostKind);
7365 } else {
7366 LoadCost =
7367 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
7368 LI->getPointerAddressSpace(), CostKind);
7369 }
7370 if (IsStrided && !IsMasked && Order.empty()) {
7371 // Check for potential segmented(interleaved) loads.
7372 VectorType *AlignedLoadVecTy = cast<VectorType>(getWidenedType(
7373 ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1)));
// Fall back to the exact-size type if the widened load is not known to be
// safe.
7374 if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
7375 DL, cast<LoadInst>(VL.back()), &AC, &DT,
7376 &TLI))
7377 AlignedLoadVecTy = LoadVecTy;
// CompressMask[1] is the offset of the second element, i.e. the stride, which
// is used as the interleave factor.
7378 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
7379 CommonAlignment,
7380 LI->getPointerAddressSpace())) {
7381 InstructionCost InterleavedCost =
7382 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
7383 Instruction::Load, AlignedLoadVecTy,
7384 CompressMask[1], {}, CommonAlignment,
7385 LI->getPointerAddressSpace(), CostKind, IsMasked);
7386 if (InterleavedCost < GatherCost) {
7387 InterleaveFactor = CompressMask[1];
7388 LoadVecTy = AlignedLoadVecTy;
7389 return true;
7390 }
7391 }
7392 }
7393 InstructionCost CompressCost = ::getShuffleCost(
7394 TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
// The shuffle cost above is taken on the mask in sorted order; afterwards the
// mask is remapped so that element I refers to the original position of VL[I].
7395 if (!Order.empty()) {
7396 SmallVector<int> NewMask(Sz, PoisonMaskElem);
7397 for (unsigned I : seq<unsigned>(Sz)) {
7398 NewMask[I] = CompressMask[Mask[I]];
7399 }
7400 CompressMask.swap(NewMask);
7401 }
7402 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7403 return TotalVecCost < GatherCost;
7404}
7405
7406 /// Checks if the \p VL can be transformed to a (masked)load + compress or
7407 /// (masked) interleaved load.
/// Convenience wrapper for callers that only need the yes/no answer: it
/// forwards to the full isMaskedLoadCompress() and discards the IsMasked,
/// InterleaveFactor, CompressMask and LoadVecTy results.
// NOTE(review): the signature lines carrying the function name and the
// leading parameters (VL, PointerOps, Order, TTI) are missing from this
// listing.
7408static bool
7411 const DataLayout &DL, ScalarEvolution &SE,
7412 AssumptionCache &AC, const DominatorTree &DT,
7413 const TargetLibraryInfo &TLI,
7414 const function_ref<bool(Value *)> AreAllUsersVectorized) {
7415 bool IsMasked;
7416 unsigned InterleaveFactor;
7417 SmallVector<int> CompressMask;
7418 VectorType *LoadVecTy;
7419 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
7420 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7421 CompressMask, LoadVecTy);
7422}
7423
7424/// Checks if strided loads can be generated out of \p VL loads with pointers \p
7425/// PointerOps:
7426/// 1. Target with strided load support is detected.
7427/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
7428/// potential stride <= MaxProfitableStride and the potential stride is
7429/// power-of-2 (to avoid perf regressions for the very small number of loads)
7430/// and max distance > number of loads, or potential stride is -1.
7431/// 3. The loads are ordered, or number of unordered loads <=
7432/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
7433/// to avoid extra costs for very expensive shuffles).
7434/// 4. Any pointer operand is an instruction with the users outside of the
7435/// current graph (for masked gathers extra extractelement instructions
7436/// might be required).
7438 Align Alignment, const int64_t Diff,
7439 const size_t Sz) const {
7440 if (Diff % (Sz - 1) != 0)
7441 return false;
7442
7443 // Try to generate strided load node.
7444 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
7445 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
7446 return !isVectorized(U) && !MustGather.contains(U);
7447 });
7448 });
7449
7450 const uint64_t AbsoluteDiff = std::abs(Diff);
7451 auto *VecTy = getWidenedType(ScalarTy, Sz);
7452 if (IsAnyPointerUsedOutGraph ||
7453 (AbsoluteDiff > Sz &&
7455 (AbsoluteDiff <= MaxProfitableStride * Sz && AbsoluteDiff % Sz == 0 &&
7456 has_single_bit(AbsoluteDiff / Sz)))) ||
7457 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7458 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7459 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7460 return false;
7461 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7462 return false;
7463 return true;
7464 }
7465 return false;
7466}
7467
/// Checks whether the pointers in \p PointerOps, visited in \p SortedIndices
/// order, form either a single constant-stride sequence or equally sized
/// groups of consecutive elements whose starts are a constant stride apart.
/// On success fills \p SPtrInfo with the stride constant and the vector type
/// of the strided access and returns true. \p PointerOps is expected to hold
/// at least two pointers, since the first two sorted offsets are read
/// unconditionally.
// NOTE(review): the first signature line (return type and function name) is
// missing from this listing.
7469 const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
7470 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
7471 Value *Ptr0, StridedPtrInfo &SPtrInfo) const {
7472 const size_t Sz = PointerOps.size();
7473 SmallVector<int64_t> SortedOffsetsFromBase(Sz);
7474 // Go through `PointerOps` in sorted order and record offsets from
7475 // PointerOps[0]. We use PointerOps[0] rather than Ptr0 because
7476 // sortPtrAccesses only validates getPointersDiff for pairs relative to
7477 // PointerOps[0]. This is safe since only offset differences are used below.
7478 for (unsigned I : seq<unsigned>(Sz)) {
7479 Value *Ptr =
7480 SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
7481 std::optional<int64_t> Offset =
7482 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, Ptr, *DL, *SE);
7483 assert(Offset && "sortPtrAccesses should have validated this pointer");
7484 SortedOffsetsFromBase[I] = *Offset;
7485 }
7486
7487 // The code below checks that `SortedOffsetsFromBase` looks as follows:
7488 // ```
7489 // [
7490 // (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
7491 // (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
7492 // ...
7493 // (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
7494 // GroupSize - 1}), // last group
7495 // ]
7496 // ```
7497 // The distance between consecutive elements within each group should all be
7498 // the same `StrideWithinGroup`. The distance between the first elements of
7499 // consecutive groups should all be the same `StrideBetweenGroups`.
7500
7501 int64_t StrideWithinGroup =
7502 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7503 // Determine size of the first group. Later we will check that all other
7504 // groups have the same size.
// True if the step into element Idx differs from the in-group stride, i.e.
// Idx is the first element of the next group.
7505 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
7506 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7507 StrideWithinGroup;
7508 };
7509 auto Indices = seq<unsigned>(1, Sz);
7510 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7511 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7512
7513 unsigned VecSz = Sz;
7514 Type *NewScalarTy = ScalarTy;
7515
7516 // Quick detour: at this point we can say what the type of strided load would
7517 // be if all the checks pass. Check if this type is legal for the target.
7518 bool NeedsWidening = Sz != GroupSize;
7519 const uint64_t UnitBitWidth = DL->getTypeSizeInBits(ScalarTy).getFixedValue();
7520 if (NeedsWidening) {
7521 if (Sz % GroupSize != 0)
7522 return false;
7523
// Only groups of consecutive elements are merged into one wider element.
7524 if (StrideWithinGroup != 1)
7525 return false;
7526 VecSz = Sz / GroupSize;
// Each group becomes one integer element as wide as the whole group.
7527 NewScalarTy = Type::getIntNTy(SE->getContext(), UnitBitWidth * GroupSize);
7528 } else if (ScalarTy->isVectorTy()) {
7529 NewScalarTy = Type::getIntNTy(SE->getContext(), UnitBitWidth);
7530 }
7531
7532 if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7533 return false;
7534
7535 int64_t StrideIntVal = StrideWithinGroup;
7536 if (NeedsWidening) {
7537 // Continue with checking the "shape" of `SortedOffsetsFromBase`.
7538 // Check that the strides between groups are all the same.
7539 unsigned CurrentGroupStartIdx = GroupSize;
7540 int64_t StrideBetweenGroups =
7541 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7542 StrideIntVal = StrideBetweenGroups;
7543 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7544 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7545 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7546 StrideBetweenGroups)
7547 return false;
7548 }
7549
// True if the group starting at StartIdx has exactly GroupSize elements.
7550 auto CheckGroup = [=](const unsigned StartIdx) -> bool {
7551 auto Indices = seq<unsigned>(StartIdx + 1, Sz);
7552 auto FoundIt = llvm::find_if(Indices, IsEndOfGroupIndex);
7553 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7554 return GroupEndIdx - StartIdx == GroupSize;
7555 };
7556 for (unsigned I = 0; I < Sz; I += GroupSize) {
7557 if (!CheckGroup(I))
7558 return false;
7559 }
7560 }
7561
// The stride is materialized as a signed constant of the index type of Ptr0's
// pointer type.
7562 Type *StrideTy = DL->getIndexType(Ptr0->getType());
7563 SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal);
7564 SPtrInfo.Ty = cast<FixedVectorType>(getWidenedType(NewScalarTy, VecSz));
7565 return true;
7566}
7567
7569 Type *BaseTy, Align CommonAlignment,
7570 SmallVectorImpl<unsigned> &SortedIndices,
7571 StridedPtrInfo &SPtrInfo,
7572 bool IsLoad) const {
// Checks whether `PointerOps` can be covered by one strided access whose
// stride is a SCEV expression. The pointers are grouped by the constant
// operand of their SCEV sum; every group must have the same size, the group
// offsets must be one `BaseTy` store size apart, and calculateRtStride must
// yield the same stride and the same sorted coefficients for every group.
// On success returns true and sets `SortedIndices`, `SPtrInfo.StrideSCEV`
// and `SPtrInfo.Ty`; every failure path returns false before any of the
// three is written.
// NOTE(review): the opening line of this signature and two lines of the
// body are absent from this listing. `IsLoad` is not referenced on any
// visible line - presumably it is used on one of the missing lines; confirm
// against the full file.
7573 // If each value in `PointerOps` is of the form `%x + Offset` where `Offset`
7574 // is constant, we partition `PointerOps` sequence into subsequences of
7575 // pointers with the same offset. For each offset we record values from
7576 // `PointerOps` and their indices in `PointerOps`.
7578 OffsetToPointerOpIdxMap;
7579 for (auto [Idx, Ptr] : enumerate(PointerOps)) {
7580 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7581 if (!PtrSCEV)
7582 return false;
7583
7584 const auto *Add = dyn_cast<SCEVAddExpr>(PtrSCEV);
7585 int64_t Offset = 0;
7586 if (Add) {
7587 // Take the first usable constant operand of the sum as `Offset`.
7588 for (int I : seq<int>(Add->getNumOperands())) {
7589 const auto *SC = dyn_cast<SCEVConstant>(Add->getOperand(I));
7590 if (!SC)
7591 continue;
7592 Offset = SC->getAPInt().getSExtValue();
// Constants at or above INT64_MAX - 1 are discarded and the scan continues.
7593 if (Offset >= std::numeric_limits<int64_t>::max() - 1) {
7594 Offset = 0;
7595 continue;
7596 }
7597 break;
7598 }
7599 }
// Pointers that are not a SCEV sum, or have no usable constant operand, are
// all filed under offset 0.
7600 OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr);
7601 OffsetToPointerOpIdxMap[Offset].second.push_back(Idx);
7602 }
7603 unsigned NumOffsets = OffsetToPointerOpIdxMap.size();
7604
7605 // Quick detour: at this point we can say what the type of strided load would
7606 // be if all the checks pass. Check if this type is legal for the target.
7607 const unsigned Sz = PointerOps.size();
7608 unsigned VecSz = Sz;
7609 Type *NewScalarTy = BaseTy;
7610 if (NumOffsets > 1) {
7611 if (Sz % NumOffsets != 0)
7612 return false;
7613 VecSz = Sz / NumOffsets;
7614 }
// With several offsets (or a vector `BaseTy`) each strided element is an
// integer wide enough to hold `NumOffsets` values of `BaseTy`.
7615 if (NumOffsets > 1 || BaseTy->isVectorTy())
7616 NewScalarTy = Type::getIntNTy(
7617 SE->getContext(),
7618 DL->getTypeSizeInBits(BaseTy).getFixedValue() * NumOffsets);
7619 auto *StridedLoadTy =
7620 cast<FixedVectorType>(getWidenedType(NewScalarTy, VecSz));
7621 unsigned MinProfitableStridedOps =
7623 const unsigned BaseTyNumElts = getNumElements(BaseTy);
7624 if (Sz * BaseTyNumElts < MinProfitableStridedOps ||
7625 !TTI->isTypeLegal(StridedLoadTy) ||
7626 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7627 return false;
7628
7629 // Check if the offsets are contiguous and that each group has the required
7630 // size.
7631 SmallVector<int64_t> SortedOffsetsV(NumOffsets);
7632 for (auto [Idx, MapPair] : enumerate(OffsetToPointerOpIdxMap)) {
// Every offset group must contain exactly `VecSz` pointers.
7633 if (MapPair.second.first.size() != VecSz)
7634 return false;
7635 SortedOffsetsV[Idx] = MapPair.first;
7636 }
7637 sort(SortedOffsetsV);
7638
7639 if (NumOffsets > 1) {
// Neighbouring offsets must differ by exactly the store size of `BaseTy`.
7640 int64_t BaseBytes = DL->getTypeStoreSize(BaseTy);
7641 for (int I : seq<int>(1, SortedOffsetsV.size())) {
7642 if (SortedOffsetsV[I] - SortedOffsetsV[I - 1] != BaseBytes)
7643 return false;
7644 }
7645 }
7646
7647 // Introduce some notation for the explanations below. Let `PointerOps_j`
7648 // denote the subsequence of `PointerOps` with offsets equal to
7649 // `SortedOffsetsV[j]`. Let `SortedIndices_j` be such that the sequence
7650 // ```
7651 // PointerOps_j[SortedIndices_j[0]],
7652 // PointerOps_j[SortedIndices_j[1]],
7653 // PointerOps_j[SortedIndices_j[2]],
7654 // ...
7655 // ```
7656 // is sorted. Also, let `IndicesInAllPointerOps_j` be the vector
7657 // of indices of the subsequence `PointerOps_j` in all of `PointerOps`,
7658 // i.e. `PointerOps_j[i] = PointerOps[IndicesInAllPointerOps_j[i]]`.
7659 // The entire sorted `PointerOps` looks like this:
7660 // ```
7661 // PointerOps_0[SortedIndices_0[0]] = PointerOps[IndicesInAllPointerOps_0[0]],
7662 // PointerOps_1[SortedIndices_1[0]] = PointerOps[IndicesInAllPointerOps_1[0]],
7663 // PointerOps_2[SortedIndices_2[0]] = PointerOps[IndicesInAllPointerOps_2[0]],
7664 // ...
7665 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[0]] =
7666 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[0]],
7667 //
7668 // PointerOps_0[SortedIndices_0[1]] = PointerOps[IndicesInAllPointerOps_0[1]],
7669 // PointerOps_1[SortedIndices_1[1]] = PointerOps[IndicesInAllPointerOps_1[1]],
7670 // PointerOps_2[SortedIndices_2[1]] = PointerOps[IndicesInAllPointerOps_2[1]],
7671 // ...
7672 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[1]] =
7673 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[1]],
7674 //
7675 // PointerOps_0[SortedIndices_0[2]] = PointerOps[IndicesInAllPointerOps_0[2]],
7676 // PointerOps_1[SortedIndices_1[2]] = PointerOps[IndicesInAllPointerOps_1[2]],
7677 // PointerOps_2[SortedIndices_2[2]] = PointerOps[IndicesInAllPointerOps_2[2]],
7678 // ...
7679 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[2]] =
7680 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[2]],
7681 // ...
7682 // ...
7683 // ...
7684 // PointerOps_0[SortedIndices_0[VecSz - 1]] =
7685 // PointerOps[IndicesInAllPointerOps_0[VecSz - 1]],
7686 // PointerOps_1[SortedIndices_1[VecSz - 1]] =
7687 // PointerOps[IndicesInAllPointerOps_1[VecSz - 1]],
7688 // PointerOps_2[SortedIndices_2[VecSz - 1]] =
7689 // PointerOps[IndicesInAllPointerOps_2[VecSz - 1]],
7690 // ...
7691 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[VecSz - 1]] =
7692 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[VecSz - 1]],
7693 // ```
7694 // In order to be able to generate a strided load, we need the following
7695 // checks to pass:
7696 //
7697 // (1) for each `PointerOps_j` check that the distances
7698 // between adjacent pointers are all equal to the same value (stride).
7699 // (2) for each `PointerOps_j` check that coefficients calculated by
7700 // `calculateRtStride` are all the same.
7701 //
7702 // As we do that, also calculate SortedIndices. Since we should not modify
7703 // `SortedIndices` unless we know that all the checks succeed, record the
7704 // indices into `SortedIndicesDraft`.
7705 SmallVector<unsigned> SortedIndicesDraft(Sz);
7706
7707 // Given sorted indices for a particular offset (as calculated by
7708 // calculateRtStride), update the `SortedIndicesDraft` for all of PointerOps.
7709 // Let `Offset` be `SortedOffsetsV[OffsetNum]`.
7710 // \param `OffsetNum` the index of `Offset` in `SortedOffsetsV`.
7711 // \param `IndicesInAllPointerOps` vector of indices of the
7712 // subsequence `PointerOps_OffsetNum` in `PointerOps`, i.e. using the above
7713 // notation `IndicesInAllPointerOps = IndicesInAllPointerOps_OffsetNum`.
7714 // \param `SortedIndicesForOffset = SortedIndices_OffsetNum`
7715 auto UpdateSortedIndices =
7716 [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
7717 ArrayRef<unsigned> IndicesInAllPointerOps, const int64_t OffsetNum) {
// An empty `SortedIndicesForOffset` is treated as the identity order.
7718 if (SortedIndicesForOffset.empty()) {
7719 SortedIndicesForOffset.resize(IndicesInAllPointerOps.size());
7720 std::iota(SortedIndicesForOffset.begin(),
7721 SortedIndicesForOffset.end(), 0);
7722 }
// The `Num`-th pointer of this group lands at slot
// `Num * NumOffsets + OffsetNum`, which interleaves the groups.
7723 for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) {
7724 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7725 IndicesInAllPointerOps[Idx];
7726 }
7727 };
7728
// The group with the lowest offset supplies the reference stride and
// coefficients that all other groups are compared against.
7729 int64_t LowestOffset = SortedOffsetsV[0];
7730 ArrayRef<Value *> PointerOps0 = OffsetToPointerOpIdxMap[LowestOffset].first;
7731
7732 SmallVector<int64_t> Coeffs0(VecSz);
7733 SmallVector<unsigned> SortedIndicesForOffset0;
7734 const SCEV *Stride0 = calculateRtStride(PointerOps0, BaseTy, *DL, *SE,
7735 SortedIndicesForOffset0, Coeffs0);
7736 if (!Stride0)
7737 return false;
7738 unsigned NumCoeffs0 = Coeffs0.size();
7739 if (NumCoeffs0 * NumOffsets != Sz)
7740 return false;
7741 sort(Coeffs0);
7742
7743 ArrayRef<unsigned> IndicesInAllPointerOps0 =
7744 OffsetToPointerOpIdxMap[LowestOffset].second;
7745 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7746
7747 // Now that we know what the common stride and coefficients have to be, check
7748 // the remaining `PointerOps_j`.
7749 SmallVector<int64_t> Coeffs;
7750 SmallVector<unsigned> SortedIndicesForOffset;
7751 for (int J : seq<int>(1, NumOffsets)) {
7752 Coeffs.clear();
7753 Coeffs.resize(VecSz);
7754 SortedIndicesForOffset.clear();
7755
7756 int64_t Offset = SortedOffsetsV[J];
7757 ArrayRef<Value *> PointerOpsForOffset =
7758 OffsetToPointerOpIdxMap[Offset].first;
7759 ArrayRef<unsigned> IndicesInAllPointerOps =
7760 OffsetToPointerOpIdxMap[Offset].second;
7761 const SCEV *StrideWithinGroup = calculateRtStride(
7762 PointerOpsForOffset, BaseTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
7763
// The strides are compared as `const SCEV *` pointers.
7764 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7765 return false;
7766 if (Coeffs.size() != NumCoeffs0)
7767 return false;
// Coefficients are compared after sorting, i.e. as multisets.
7768 sort(Coeffs);
7769 if (Coeffs != Coeffs0)
7770 return false;
7771
7772 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7773 }
7774
// All checks passed: publish the results to the caller.
7775 SortedIndices.clear();
7776 SortedIndices = std::move(SortedIndicesDraft);
7777 SPtrInfo.StrideSCEV = Stride0;
7778 SPtrInfo.Ty = StridedLoadTy;
7779 return true;
7780}
7781
7783 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7784 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7785 unsigned *BestVF, bool TryRecursiveCheck) const {
// Classifies the bundle of loads `VL` and reports the result as a
// LoadsState. Side effects visible here: `*BestVF` (when non-null) is reset
// to 0, `PointerOps` is refilled with the pointer operand of each load, and
// `Order` is cleared and then passed to sortPtrAccesses and to the stride
// analyses.
// NOTE(review): the opening line of this signature and a number of body
// lines (including most returns other than Gather/Vectorize) are absent
// from this listing; the comments below describe only the visible code.
7786 // Check that a vectorized load would load the same memory as a scalar
7787 // load. For example, we don't want to vectorize loads that are smaller
7788 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7789 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7790 // from such a struct, we read/write packed bits disagreeing with the
7791 // unvectorized version.
7792 if (BestVF)
7793 *BestVF = 0;
7795 return LoadsState::Gather;
7796 Type *ScalarTy = VL0->getType();
7797
7798 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7799 return LoadsState::Gather;
7800
7801 // Make sure all loads in the bundle are simple - we can't vectorize
7802 // atomic or volatile loads.
7803 PointerOps.clear();
7804 const size_t Sz = VL.size();
7805 PointerOps.resize(Sz);
7806 auto *POIter = PointerOps.begin();
7807 for (Value *V : VL) {
7808 auto *L = dyn_cast<LoadInst>(V);
7809 if (!L || !L->isSimple())
7810 return LoadsState::Gather;
7811 *POIter = L->getPointerOperand();
7812 ++POIter;
7813 }
7814
7815 Order.clear();
7816 // Check the order of pointer operands or that all pointers are the same.
7817 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7818
7819 auto *VecTy = dyn_cast<VectorType>(getWidenedType(ScalarTy, Sz));
7820 if (!VecTy)
7821 return LoadsState::Gather;
7822 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7823 // Cache masked gather legality - both the !IsSorted path below and the
7824 // post-branch check use the same VecTy/CommonAlignment, and the underlying
7825 // TTI calls are virtual.
7826 std::optional<bool> MaskedGatherLegal;
7827 auto IsMaskedGatherLegal = [&] {
7828 if (!MaskedGatherLegal)
7829 MaskedGatherLegal =
7830 TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
7831 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment);
7832 return *MaskedGatherLegal;
7833 };
7834 if (!IsSorted) {
// Unsorted pointers: first try a strided access with a run-time stride.
7835 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
7836 SPtrInfo, /*isLoad=*/true))
7838
7839 if (!IsMaskedGatherLegal())
7840 return LoadsState::Gather;
7841
7842 if (!all_of(PointerOps, [&](Value *P) {
7843 return arePointersCompatible(P, PointerOps.front(), *TLI);
7844 }))
7845 return LoadsState::Gather;
7846
7847 } else {
// Sorted pointers: `Ptr0`/`PtrN` are the first and last pointers in sorted
// order (an empty `Order` means the bundle is already in order).
7848 Value *Ptr0;
7849 Value *PtrN;
7850 if (Order.empty()) {
7851 Ptr0 = PointerOps.front();
7852 PtrN = PointerOps.back();
7853 } else {
7854 Ptr0 = PointerOps[Order.front()];
7855 PtrN = PointerOps[Order.back()];
7856 }
7857 // sortPtrAccesses validates getPointersDiff for all pointers relative to
7858 // PointerOps[0], so compute the span using PointerOps[0] as intermediate:
7859 // Diff = offset(PtrN) - offset(Ptr0) relative to PointerOps[0]
7860 std::optional<int64_t> Diff0 =
7861 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, Ptr0, *DL, *SE);
7862 std::optional<int64_t> DiffN =
7863 getPointersDiff(ScalarTy, PointerOps[0], ScalarTy, PtrN, *DL, *SE);
7864 assert(Diff0 && DiffN &&
7865 "sortPtrAccesses should have validated these pointers");
7866 int64_t Diff = *DiffN - *Diff0;
7867 // Check that the sorted loads are consecutive.
7868 if (static_cast<uint64_t>(Diff) == Sz - 1)
7869 return LoadsState::Vectorize;
7870 if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
7871 *TLI, [&](Value *V) {
7872 return areAllUsersVectorized(
7873 cast<Instruction>(V), UserIgnoreList);
7874 }))
// The alignment used for the constant-stride check is that of the first load
// in sorted order.
7876 Align Alignment =
7877 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
7878 ->getAlign();
7879 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order,
7880 Diff, Ptr0, SPtrInfo))
7882 }
7883 if (!IsMaskedGatherLegal())
7884 return LoadsState::Gather;
7885 // Compare the cost of vectorized loads + shuffles against the cost of
7886 // strided/masked gather loads. Returns true if vectorized + shuffles
7887 // representation is better than just gather.
7888 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7889 unsigned *BestVF,
7890 bool ProfitableGatherPointers) {
7891 if (BestVF)
7892 *BestVF = 0;
7893 // Compare masked gather cost and loads + insert subvector costs.
7895 auto [ScalarGEPCost, VectorGEPCost] =
7896 getGEPCosts(TTI, PointerOps, PointerOps.front(), Instruction::Load,
7897 CostKind, ScalarTy, VecTy);
7898 // Estimate the cost of masked gather GEP. If not a splat, roughly
7899 // estimate as a buildvector, otherwise estimate as splat.
7900 APInt DemandedElts = APInt::getAllOnes(Sz);
7901 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7902 auto *PtrVecTy = cast<VectorType>(getWidenedType(PtrScalarTy, Sz));
7903 // Cache the underlying object of PointerOps.front() - it is invariant
7904 // across the per-V comparisons below and getUnderlyingObject walks
7905 // GEP/cast chains.
7906 const Value *FrontUO = getUnderlyingObject(PointerOps.front());
7907 if (static_cast<unsigned>(count_if(
7908 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7909 any_of(PointerOps,
7910 [&](Value *V) { return getUnderlyingObject(V) != FrontUO; }))
7911 VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
7912 DemandedElts, /*Insert=*/true,
7913 /*Extract=*/false, CostKind);
7914 else
7915 VectorGEPCost +=
7917 TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
7918 /*Insert=*/true, /*Extract=*/false, CostKind) +
7919 ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
7920 // The cost of scalar loads.
7921 InstructionCost ScalarLoadsCost =
7923 [&](InstructionCost C, Value *V) {
7924 return C + TTI.getInstructionCost(cast<Instruction>(V),
7925 CostKind);
7926 }) +
7927 ScalarGEPCost;
7928 // The cost of masked gather.
7929 InstructionCost MaskedGatherCost =
7930 TTI.getMemIntrinsicInstrCost(
7931 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
7933 /*VariableMask=*/false, CommonAlignment),
7934 CostKind) +
7935 (ProfitableGatherPointers ? 0 : VectorGEPCost);
// The cost of building the vector from scalar loads (a plain gather node).
7936 InstructionCost GatherCost =
7937 getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7938 /*Insert=*/true,
7939 /*Extract=*/false, CostKind) +
7940 ScalarLoadsCost;
7941 // The list of loads is small or a partial check was already performed -
7942 // directly compare masked gather cost and gather cost.
7943 constexpr unsigned ListLimit = 4;
7944 if (!TryRecursiveCheck || VL.size() < ListLimit)
7945 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7946
// NOTE: this `Sz` is the scalar size in bits and shadows the outer `Sz`
// (the number of loads in `VL`) for the rest of the lambda.
7947 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7948 unsigned MinVF = getMinVF(2 * Sz);
7949 DemandedElts.clearAllBits();
7950 // Iterate through possible vectorization factors and check if vectorized +
7951 // shuffles is better than just gather.
7952 for (unsigned VF =
7953 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
7954 VF >= MinVF;
7955 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
7957 for (unsigned Cnt = 0, End = VL.size(); Cnt < End; Cnt += VF) {
// The last slice may be shorter than `VF`.
7958 const unsigned SliceVF = std::min(VF, End - Cnt);
7959 ArrayRef<Value *> Slice = VL.slice(Cnt, SliceVF);
// This local `PointerOps` shadows the outer one for the recursive query.
7961 SmallVector<Value *> PointerOps;
7962 LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order,
7963 PointerOps, SPtrInfo, BestVF,
7964 /*TryRecursiveCheck=*/false);
7965 // Check that the sorted loads are consecutive.
7966 if (LS == LoadsState::Gather) {
// When the caller asked for a best VF, one gathered slice rejects this VF
// entirely; otherwise only the slice's lanes are marked as gathered.
7967 if (BestVF) {
7968 DemandedElts.setAllBits();
7969 break;
7970 }
7971 DemandedElts.setBits(Cnt, Cnt + SliceVF);
7972 continue;
7973 }
7974 // If reordering is needed - consider as high-cost masked gather for now.
7975 if ((LS == LoadsState::Vectorize ||
7978 !Order.empty() && !isReverseOrder(Order))
7980 States.emplace_back(Cnt, LS);
7981 }
7982 if (DemandedElts.isAllOnes())
7983 // All loads gathered - try smaller VF.
7984 continue;
7985 // Can be vectorized later as a series of loads/insertelements.
7986 InstructionCost VecLdCost = 0;
7987 if (!DemandedElts.isZero()) {
// Account for the lanes that stay scalar: insertion overhead plus the
// scalar loads themselves.
7988 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
7989 /*Insert=*/true,
7990 /*Extract=*/false, CostKind) +
7991 ScalarGEPCost;
7992 for (unsigned Idx : seq<unsigned>(VL.size()))
7993 if (DemandedElts[Idx])
7994 VecLdCost +=
7995 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
7996 }
// Add the cost of every slice that was not gathered, according to its state.
7997 for (const auto &[SliceStart, LS] : States) {
7998 const unsigned SliceVF = std::min<unsigned>(VF, VL.size() - SliceStart);
7999 auto *SubVecTy = cast<VectorType>(getWidenedType(ScalarTy, SliceVF));
8000 auto *LI0 = cast<LoadInst>(VL[SliceStart]);
8001 InstructionCost VectorGEPCost =
8002 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
8003 ? 0
8004 : getGEPCosts(TTI,
8005 ArrayRef(PointerOps).slice(SliceStart, SliceVF),
8006 LI0->getPointerOperand(), Instruction::Load,
8007 CostKind, ScalarTy, SubVecTy)
8008 .second;
8009 if (LS == LoadsState::ScatterVectorize) {
8010 if (static_cast<unsigned>(
8011 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
8012 PointerOps.size() - 1 ||
8013 any_of(PointerOps, [&](Value *V) {
8014 return getUnderlyingObject(V) != FrontUO;
8015 }))
8016 VectorGEPCost += getScalarizationOverhead(
8017 TTI, ScalarTy, SubVecTy, APInt::getAllOnes(SliceVF),
8018 /*Insert=*/true, /*Extract=*/false, CostKind);
8019 else
8020 VectorGEPCost +=
8022 TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(SliceVF, 0),
8023 /*Insert=*/true, /*Extract=*/false, CostKind) +
8024 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
8025 CostKind);
8026 }
8027 switch (LS) {
8029 VecLdCost +=
8030 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
8031 LI0->getPointerAddressSpace(), CostKind,
8033 VectorGEPCost;
8034 break;
8036 VecLdCost += TTI.getMemIntrinsicInstrCost(
8038 Intrinsic::experimental_vp_strided_load,
8039 SubVecTy, LI0->getPointerOperand(),
8040 /*VariableMask=*/false, CommonAlignment),
8041 CostKind) +
8042 VectorGEPCost;
8043 break;
8045 VecLdCost += TTI.getMemIntrinsicInstrCost(
8047 Intrinsic::masked_load, SubVecTy,
8048 CommonAlignment, LI0->getPointerAddressSpace()),
8049 CostKind) +
8051 {}, CostKind);
8052 break;
8054 VecLdCost += TTI.getMemIntrinsicInstrCost(
8056 Intrinsic::masked_gather, SubVecTy,
8057 LI0->getPointerOperand(),
8058 /*VariableMask=*/false, CommonAlignment),
8059 CostKind) +
8060 VectorGEPCost;
8061 break;
8062 case LoadsState::Gather:
8063 llvm_unreachable("Gathers are not added to States");
8064 }
// Mask for inserting this slice into the full vector: lanes of the current
// slice select from the second source (indices offset by VL.size()); all
// other lanes keep their own index.
8065 SmallVector<int> ShuffleMask(VL.size());
8066 const unsigned SliceIdx = SliceStart / VF;
8067 for (int Idx : seq<int>(VL.size()))
8068 ShuffleMask[Idx] = Idx / VF == SliceIdx ? VL.size() + Idx % VF : Idx;
// The insert-subvector shuffle is only charged for slices after the first.
8069 if (SliceStart > 0)
8070 VecLdCost +=
8071 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
8072 CostKind, SliceStart, SubVecTy);
8073 }
8074 // If masked gather cost is higher - better to vectorize, so
8075 // consider it as a gather node. It will be better estimated
8076 // later.
8077 if (MaskedGatherCost >= VecLdCost &&
8078 VecLdCost - GatherCost < -SLPCostThreshold) {
8079 if (BestVF)
8080 *BestVF = VF;
8081 return true;
8082 }
8083 }
8084 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
8085 };
8086 // TODO: need to improve analysis of the pointers, if not all of them are
8087 // GEPs or have > 2 operands, we end up with a gather node, which just
8088 // increases the cost.
8089 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
// Gathering pointers is treated as profitable when the loads are in a loop,
// there are more than two of them, and at most half of the pointers are
// loop-invariant.
8090 bool ProfitableGatherPointers =
8091 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
8092 return L->isLoopInvariant(V);
8093 })) <= Sz / 2;
8094 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
8096 return (!GEP && doesNotNeedToBeScheduled(P)) ||
8097 (GEP && GEP->getNumOperands() == 2 &&
8098 isa<Constant, Instruction>(GEP->getOperand(1)));
8099 })) {
8100 // Check if potential masked gather can be represented as series
8101 // of loads + insertsubvectors.
8102 // If masked gather cost is higher - better to vectorize, so
8103 // consider it as a gather node. It will be better estimated
8104 // later.
8105 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
8106 ProfitableGatherPointers))
8108 }
8109
8110 return LoadsState::Gather;
8111}
8112
8114 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
8115 const DataLayout &DL, ScalarEvolution &SE,
8116 SmallVectorImpl<unsigned> &SortedIndices) {
// Groups the pointers in `VL` into clusters whose members have a known
// constant distance (getPointersDiff with StrictCheck) from the cluster's
// first pointer. Clusters are kept per (BasicBlock *, Value *) key. On
// success returns true and fills `SortedIndices` with the original indices
// of `VL` in clustered order; on failure returns false with `SortedIndices`
// left empty.
// NOTE(review): the opening line of this signature and a few body lines are
// absent from this listing; in particular the second component of the map
// key is computed on lines that are not visible here.
8117 assert(
8118 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
8119 "Expected list of pointer operands.");
8120 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
8121 // Ptr into, sort and return the sorted indices with values next to one
8122 // another.
8124 std::pair<BasicBlock *, Value *>,
8126 Bases;
// Seed the map with the first pointer at offset 0 and original index 0.
8127 Bases
8128 .try_emplace(std::make_pair(
8130 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
8131
8132 SortedIndices.clear();
// `Cnt` enumerates VL.drop_front(), so the original index is `Cnt + 1`.
8133 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
8134 auto Key = std::make_pair(BBs[Cnt + 1],
// Try to attach `Ptr` to the first existing cluster under `Key` from whose
// first pointer it has a known distance.
8136 bool Found = any_of(Bases.try_emplace(Key).first->second,
8137 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
8138 std::optional<int64_t> Diff =
8139 getPointersDiff(ElemTy, std::get<0>(Base.front()),
8140 ElemTy, Ptr, DL, SE,
8141 /*StrictCheck=*/true);
8142 if (!Diff)
8143 return false;
8144
8145 Base.emplace_back(Ptr, *Diff, Cnt + 1);
8146 return true;
8147 });
8148
8149 if (!Found) {
8150 // If we haven't found enough to usefully cluster, return early.
8151 if (Bases.size() > VL.size() / 2 - 1)
8152 return false;
8153
8154 // Not found already - add a new Base.
8155 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
8156 }
8157 }
8158
// One key per pointer means nothing was clustered.
8159 if (Bases.size() == VL.size())
8160 return false;
8161
// A single key holding either one cluster or one cluster per pointer is
// also rejected.
8162 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
8163 Bases.front().second.size() == VL.size()))
8164 return false;
8165
8166 // For each of the bases sort the pointers by Offset and check if any of the
8167 // bases become consecutively allocated.
// ComparePointers walks both pointers up one getUnderlyingObject step at a
// time, recording what each walk has visited. It returns true only when the
// walk from `Ptr2` reaches a value already visited from `Ptr1` and the
// reverse is not also the case; it returns false if the walks meet on the
// same value or the depth limit is exceeded.
8168 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
8169 SmallPtrSet<Value *, 13> FirstPointers;
8170 SmallPtrSet<Value *, 13> SecondPointers;
8171 Value *P1 = Ptr1;
8172 Value *P2 = Ptr2;
8173 unsigned Depth = 0;
8174 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
8175 if (P1 == P2 || Depth > RecursionMaxDepth)
8176 return false;
8177 FirstPointers.insert(P1);
8178 SecondPointers.insert(P2);
8179 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
8180 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
8181 ++Depth;
8182 }
8183 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
8184 "Unable to find matching root.");
8185 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
8186 };
8187 for (auto &Base : Bases) {
8188 for (auto &Vec : Base.second) {
8189 if (Vec.size() > 1) {
// Despite its name, `AnyConsecutive` is computed with all_of: it is true
// only when every element's offset equals `InitialOffset` plus its
// position in `Vec`.
8191 int64_t InitialOffset = std::get<1>(Vec[0]);
8192 bool AnyConsecutive =
8193 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
8194 return std::get<1>(P.value()) ==
8195 int64_t(P.index()) + InitialOffset;
8196 });
8197 // Fill SortedIndices array only if it looks worth-while to sort the
8198 // ptrs.
8199 if (!AnyConsecutive)
8200 return false;
8201 }
8202 }
// Order the clusters under this key by comparing their first pointers.
8203 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
8204 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
8205 });
8206 }
8207
// Emit the original indices key by key, cluster by cluster.
8208 for (auto &T : Bases)
8209 for (const auto &Vec : T.second)
8210 for (const auto &P : Vec)
8211 SortedIndices.push_back(std::get<2>(P));
8212
8213 assert(SortedIndices.size() == VL.size() &&
8214 "Expected SortedIndices to be the size of VL");
8215 return true;
8216}
8217
/// Tries to find an order that clusters the loads of the gather node \p TE.
/// \returns the order produced by clusterSortPtrAccesses, or std::nullopt if
/// any scalar of \p TE is not a simple load, if \p TE.Idx is present in
/// LoadEntriesToVectorize, or if clusterSortPtrAccesses fails.
8218std::optional<BoUpSLP::OrdersType>
8219BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
8220 assert(TE.isGather() && "Expected gather node only.");
8221 Type *ScalarTy = TE.Scalars[0]->getType();
8222
// NOTE(review): the declarations of `Ptrs` and `BBs` are absent from this
// listing.
8224 Ptrs.reserve(TE.Scalars.size());
8226 BBs.reserve(TE.Scalars.size());
// Collect the pointer operand and parent block of every load; bail out on
// anything that is not a simple load.
8227 for (Value *V : TE.Scalars) {
8228 auto *L = dyn_cast<LoadInst>(V);
8229 if (!L || !L->isSimple())
8230 return std::nullopt;
8231 Ptrs.push_back(L->getPointerOperand());
8232 BBs.push_back(L->getParent());
8233 }
8234
8235 BoUpSLP::OrdersType Order;
8236 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
8237 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
8238 return std::move(Order);
8239 return std::nullopt;
8240}
8241
8242/// Check if two insertelement instructions are from the same buildvector.
/// Returns false right away if \p VU and \p V are in different basic blocks,
/// have different types, both have more than one use, or either has no
/// known element index. Otherwise the vector-operand chains (obtained via
/// \p GetBaseOperand) are walked from both instructions until one reaches
/// the other, an element index repeats, or both chains end.
/// NOTE(review): the first lines of this signature are absent from this
/// listing.
8245 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
8246 // Instructions must be from the same basic blocks.
8247 if (VU->getParent() != V->getParent())
8248 return false;
8249 // Checks if 2 insertelements are from the same buildvector.
8250 if (VU->getType() != V->getType())
8251 return false;
8252 // Multiple used inserts are separate nodes.
8253 if (!VU->hasOneUse() && !V->hasOneUse())
8254 return false;
8255 auto *IE1 = VU;
8256 auto *IE2 = V;
8257 std::optional<unsigned> Idx1 = getElementIndex(IE1);
8258 std::optional<unsigned> Idx2 = getElementIndex(IE2);
8259 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
8260 return false;
8261 // Go through the vector operand of insertelement instructions trying to find
8262 // either VU as the original vector for IE2 or V as the original vector for
8263 // IE1.
// NOTE(review): the declaration of `ReusedIdx` is absent from this listing.
8265 bool IsReusedIdx = false;
8266 do {
// The walk from V reached VU after the walk from VU had already stopped.
8267 if (IE2 == VU && !IE1)
8268 return VU->hasOneUse();
// The walk from VU reached V after the walk from V had already stopped.
8269 if (IE1 == V && !IE2)
8270 return V->hasOneUse();
8271 if (IE1 && IE1 != V) {
// This inner `Idx1` shadows the outer optional of the same name.
8272 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
8273 IsReusedIdx |= ReusedIdx.test(Idx1);
8274 ReusedIdx.set(Idx1);
// Stop this walk at a multi-use insert (other than VU itself) or once an
// index has been seen twice; otherwise step to the base vector operand.
8275 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
8276 IE1 = nullptr;
8277 else
8278 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
8279 }
8280 if (IE2 && IE2 != VU) {
// This inner `Idx2` shadows the outer optional of the same name.
8281 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
8282 IsReusedIdx |= ReusedIdx.test(Idx2);
8283 ReusedIdx.set(Idx2);
8284 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
8285 IE2 = nullptr;
8286 else
8287 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
8288 }
8289 } while (!IsReusedIdx && (IE1 || IE2));
8290 return false;
8291}
8292
8293/// Checks if the specified instruction \p I is an alternate operation for
8294/// the given \p MainOp and \p AltOp instructions.
8295static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
8296 Instruction *AltOp,
8297 const TargetLibraryInfo &TLI);
8298
8299std::optional<BoUpSLP::OrdersType>
8300BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
8301 bool IgnoreReorder) {
8302 // No need to reorder if need to shuffle reuses, still need to shuffle the
8303 // node.
8304 if (!TE.ReuseShuffleIndices.empty()) {
8305 if (isSplat(TE.Scalars))
8306 return std::nullopt;
8307 // Check if reuse shuffle indices can be improved by reordering.
8308 // For this, check that reuse mask is "clustered", i.e. each scalar values
8309 // is used once in each submask of size <number_of_scalars>.
8310 // Example: 4 scalar values.
8311 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
8312 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
8313 // element 3 is used twice in the second submask.
8314 unsigned Sz = TE.Scalars.size();
8315 if (TE.isGather()) {
8316 if (std::optional<OrdersType> CurrentOrder =
8317 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
8318 SmallVector<int> Mask;
8319 fixupOrderingIndices(*CurrentOrder);
8320 inversePermutation(*CurrentOrder, Mask);
8321 ::addMask(Mask, TE.ReuseShuffleIndices);
8322 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
8323 unsigned Sz = TE.Scalars.size();
8324 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
8325 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
8326 if (Idx != PoisonMaskElem)
8327 Res[Idx + K * Sz] = I + K * Sz;
8328 }
8329 return std::move(Res);
8330 }
8331 }
8332 if (Sz == 2 && TE.getVectorFactor() == 4 &&
8333 ::getNumberOfParts(*TTI,
8334 getWidenedType(getValueType(TE.Scalars.front()),
8335 2 * TE.getVectorFactor()),
8336 getValueType(TE.Scalars.front())) == 1)
8337 return std::nullopt;
8338 if (TE.ReuseShuffleIndices.size() % Sz != 0)
8339 return std::nullopt;
8340 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
8341 Sz)) {
8342 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
8343 if (TE.ReorderIndices.empty())
8344 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
8345 else
8346 inversePermutation(TE.ReorderIndices, ReorderMask);
8347 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
8348 unsigned VF = ReorderMask.size();
8349 OrdersType ResOrder(VF, VF);
8350 unsigned NumParts = divideCeil(VF, Sz);
8351 SmallBitVector UsedVals(NumParts);
8352 for (unsigned I = 0; I < VF; I += Sz) {
8353 int Val = PoisonMaskElem;
8354 unsigned UndefCnt = 0;
8355 unsigned Limit = std::min(Sz, VF - I);
8356 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
8357 [&](int Idx) {
8358 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
8359 Val = Idx;
8360 if (Idx == PoisonMaskElem)
8361 ++UndefCnt;
8362 return Idx != PoisonMaskElem && Idx != Val;
8363 }) ||
8364 Val >= static_cast<int>(NumParts) || Val == PoisonMaskElem ||
8365 UsedVals.test(Val) || UndefCnt > Sz / 2)
8366 return std::nullopt;
8367 UsedVals.set(Val);
8368 for (unsigned K = 0; K < NumParts; ++K) {
8369 unsigned Idx = Val + Sz * K;
8370 if (Idx < VF && I + K < VF)
8371 ResOrder[Idx] = I + K;
8372 }
8373 }
8374 return std::move(ResOrder);
8375 }
8376 unsigned VF = TE.getVectorFactor();
8377 // Try build correct order for extractelement instructions.
8378 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
8379 TE.ReuseShuffleIndices.end());
8380 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
8381 all_of(TE.Scalars, [Sz](Value *V) {
8382 if (isa<PoisonValue>(V))
8383 return true;
8384 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
8385 return Idx && *Idx < Sz;
8386 })) {
8387 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
8388 "by BinaryOperator and CastInst.");
8389 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
8390 if (TE.ReorderIndices.empty())
8391 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
8392 else
8393 inversePermutation(TE.ReorderIndices, ReorderMask);
8394 for (unsigned I = 0; I < VF; ++I) {
8395 int &Idx = ReusedMask[I];
8396 if (Idx == PoisonMaskElem)
8397 continue;
8398 Value *V = TE.Scalars[ReorderMask[Idx]];
8399 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
8400 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
8401 }
8402 }
8403 // Build the order of the VF size, need to reorder reuses shuffles, they are
8404 // always of VF size.
8405 OrdersType ResOrder(VF);
8406 std::iota(ResOrder.begin(), ResOrder.end(), 0);
8407 auto *It = ResOrder.begin();
8408 for (unsigned K = 0; K < VF; K += Sz) {
8409 OrdersType CurrentOrder(TE.ReorderIndices);
8410 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
8411 if (SubMask.front() == PoisonMaskElem)
8412 std::iota(SubMask.begin(), SubMask.end(), 0);
8413 reorderOrder(CurrentOrder, SubMask);
8414 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
8415 std::advance(It, Sz);
8416 }
8417 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
8418 return Data.index() == Data.value();
8419 }))
8420 return std::nullopt; // No need to reorder.
8421 return std::move(ResOrder);
8422 }
8423 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8424 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8425 !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
8426 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
8427 return std::nullopt;
8428 if (TE.State == TreeEntry::SplitVectorize ||
8429 ((TE.State == TreeEntry::Vectorize ||
8430 TE.State == TreeEntry::StridedVectorize ||
8431 TE.State == TreeEntry::CompressVectorize) &&
8433 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
8434 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8435 "Alternate instructions are only supported by "
8436 "BinaryOperator and CastInst.");
8437 return TE.ReorderIndices;
8438 }
8439 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8440 TE.isAltShuffle()) {
8441 assert(TE.ReuseShuffleIndices.empty() &&
8442 "ReuseShuffleIndices should be "
8443 "empty for alternate instructions.");
8444 SmallVector<int> Mask;
8445 TE.buildAltOpShuffleMask(
8446 [&](Instruction *I) {
8447 assert(TE.getMatchingMainOpOrAltOp(I) &&
8448 "Unexpected main/alternate opcode");
8449 return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(), *TLI);
8450 },
8451 Mask);
8452 const int VF = TE.getVectorFactor();
8453 OrdersType ResOrder(VF, VF);
8454 for (unsigned I : seq<unsigned>(VF)) {
8455 if (Mask[I] == PoisonMaskElem)
8456 continue;
8457 ResOrder[Mask[I] % VF] = I;
8458 }
8459 return std::move(ResOrder);
8460 }
8461 if (!TE.ReorderIndices.empty())
8462 return TE.ReorderIndices;
8463 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8464 if (!TE.ReorderIndices.empty())
8465 return TE.ReorderIndices;
8466
8467 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
8468 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
8469 if (isa<Constant>(V) || !V->hasNUsesOrMore(1))
8470 continue;
8471 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
8472 if (!II)
8473 continue;
8474 Instruction *BVHead = nullptr;
8475 BasicBlock *BB = II->getParent();
8476 while (II && II->hasOneUse() && II->getParent() == BB) {
8477 BVHead = II;
8478 II = dyn_cast<InsertElementInst>(II->getOperand(0));
8479 }
8480 I = BVHead;
8481 }
8482
8483 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
8484 assert(BB1 != BB2 && "Expected different basic blocks.");
8485 if (!DT->isReachableFromEntry(BB1))
8486 return false;
8487 if (!DT->isReachableFromEntry(BB2))
8488 return true;
8489 auto *NodeA = DT->getNode(BB1);
8490 auto *NodeB = DT->getNode(BB2);
8491 assert(NodeA && "Should only process reachable instructions");
8492 assert(NodeB && "Should only process reachable instructions");
8493 assert((NodeA == NodeB) ==
8494 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8495 "Different nodes should have different DFS numbers");
8496 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8497 };
8498 auto PHICompare = [&](unsigned I1, unsigned I2) {
8499 Value *V1 = TE.Scalars[I1];
8500 Value *V2 = TE.Scalars[I2];
8501 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
8502 return false;
8503 if (isa<PoisonValue>(V1))
8504 return true;
8505 if (isa<PoisonValue>(V2))
8506 return false;
8507 if (V1->getNumUses() < V2->getNumUses())
8508 return true;
8509 if (V1->getNumUses() > V2->getNumUses())
8510 return false;
8511 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
8512 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
8513 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8514 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8515 FirstUserOfPhi2->getParent());
8516 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
8517 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
8518 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
8519 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
8520 if (IE1 && !IE2)
8521 return true;
8522 if (!IE1 && IE2)
8523 return false;
8524 if (IE1 && IE2) {
8525 if (UserBVHead[I1] && !UserBVHead[I2])
8526 return true;
8527 if (!UserBVHead[I1])
8528 return false;
8529 if (UserBVHead[I1] == UserBVHead[I2])
8530 return getElementIndex(IE1) < getElementIndex(IE2);
8531 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
8532 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
8533 UserBVHead[I2]->getParent());
8534 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8535 }
8536 if (EE1 && !EE2)
8537 return true;
8538 if (!EE1 && EE2)
8539 return false;
8540 if (EE1 && EE2) {
8541 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
8542 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
8543 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
8544 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
8545 if (!Inst2 && !P2)
8546 return Inst1 || P1;
8547 if (EE1->getOperand(0) == EE2->getOperand(0))
8548 return getElementIndex(EE1) < getElementIndex(EE2);
8549 if (!Inst1 && Inst2)
8550 return false;
8551 if (Inst1 && Inst2) {
8552 if (Inst1->getParent() != Inst2->getParent())
8553 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
8554 return Inst1->comesBefore(Inst2);
8555 }
8556 if (!P1 && P2)
8557 return false;
8558 assert(P1 && P2 &&
8559 "Expected either instructions or arguments vector operands.");
8560 return P1->getArgNo() < P2->getArgNo();
8561 }
8562 return false;
8563 };
8564 OrdersType Phis(TE.Scalars.size());
8565 std::iota(Phis.begin(), Phis.end(), 0);
8566 stable_sort(Phis, PHICompare);
8567 if (isIdentityOrder(Phis))
8568 return std::nullopt; // No need to reorder.
8569 return std::move(Phis);
8570 }
8571 if (TE.isGather() &&
8572 (!TE.hasState() || !TE.isAltShuffle() ||
8573 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8574 allSameType(TE.Scalars)) {
8575 // TODO: add analysis of other gather nodes with extractelement
8576 // instructions and other values/instructions, not only undefs.
8577 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8579 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
8580 all_of(TE.Scalars, [](Value *V) {
8581 auto *EE = dyn_cast<ExtractElementInst>(V);
8582 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8583 })) {
8584 // Check that gather of extractelements can be represented as
8585 // just a shuffle of a single vector.
8586 OrdersType CurrentOrder;
8587 bool Reuse =
8588 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
8589 if (Reuse || !CurrentOrder.empty())
8590 return std::move(CurrentOrder);
8591 }
8592 // If the gather node is <undef, v, .., poison> and
8593 // insertelement poison, v, 0 [+ permute]
8594 // is cheaper than
8595 // insertelement poison, v, n - try to reorder.
8596 // If rotating the whole graph, exclude the permute cost, the whole graph
8597 // might be transformed.
8598 int Sz = TE.Scalars.size();
8599 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
8600 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
8601 const auto *It = find_if_not(TE.Scalars, isConstant);
8602 if (It == TE.Scalars.begin())
8603 return OrdersType();
8604 auto *Ty =
8605 cast<VectorType>(getWidenedType(TE.Scalars.front()->getType(), Sz));
8606 if (It != TE.Scalars.end()) {
8607 OrdersType Order(Sz, Sz);
8608 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8609 Order[Idx] = 0;
8610 fixupOrderingIndices(Order);
8611 SmallVector<int> Mask;
8612 inversePermutation(Order, Mask);
8613 InstructionCost PermuteCost =
8614 TopToBottom
8615 ? 0
8616 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
8617 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
8618 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
8619 PoisonValue::get(Ty), *It);
8620 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
8621 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
8622 PoisonValue::get(Ty), *It);
8623 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8624 OrdersType Order(Sz, Sz);
8625 Order[Idx] = 0;
8626 return std::move(Order);
8627 }
8628 }
8629 }
8630 if (isSplat(TE.Scalars))
8631 return std::nullopt;
8632 if (TE.Scalars.size() >= 3)
8633 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
8634 return Order;
8635 // Check if can include the order of vectorized loads. For masked gathers do
8636 // extra analysis later, so include such nodes into a special list.
8637 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8638 SmallVector<Value *> PointerOps;
8639 StridedPtrInfo SPtrInfo;
8640 OrdersType CurrentOrder;
8641 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
8642 CurrentOrder, PointerOps, SPtrInfo);
8645 return std::move(CurrentOrder);
8646 }
8647 if (std::optional<OrdersType> CurrentOrder =
8648 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
8649 return CurrentOrder;
8650 }
8651 return std::nullopt;
8652}
8653
8654/// Checks if the given mask is a "clustered" mask with the same clusters of
8655/// size \p Sz, which are not identity submasks.
8657 unsigned Sz) {
8658 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
8659 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
8660 return false;
8661 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
8662 ArrayRef<int> Cluster = Mask.slice(I, Sz);
8663 if (Cluster != FirstCluster)
8664 return false;
8665 }
8666 return true;
8667}
8668
8669void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
8670 // Reorder reuses mask.
8671 reorderReuses(TE.ReuseShuffleIndices, Mask);
8672 const unsigned Sz = TE.Scalars.size();
8673 // For vectorized and non-clustered reused no need to do anything else.
8674 if (!TE.isGather() ||
8676 Sz) ||
8677 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
8678 return;
8679 SmallVector<int> NewMask;
8680 inversePermutation(TE.ReorderIndices, NewMask);
8681 addMask(NewMask, TE.ReuseShuffleIndices);
8682 // Clear reorder since it is going to be applied to the new mask.
8683 TE.ReorderIndices.clear();
8684 // Try to improve gathered nodes with clustered reuses, if possible.
8685 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
8686 SmallVector<unsigned> NewOrder(Slice);
8687 inversePermutation(NewOrder, NewMask);
8688 reorderScalars(TE.Scalars, NewMask);
8689 // Fill the reuses mask with the identity submasks.
8690 for (auto *It = TE.ReuseShuffleIndices.begin(),
8691 *End = TE.ReuseShuffleIndices.end();
8692 It != End; std::advance(It, Sz))
8693 std::iota(It, std::next(It, Sz), 0);
8694}
8695
8697 ArrayRef<unsigned> SecondaryOrder) {
8698 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8699 "Expected same size of orders");
8700 size_t Sz = Order.size();
8701 SmallBitVector UsedIndices(Sz);
8702 for (unsigned Idx : seq<unsigned>(0, Sz)) {
8703 if (Order[Idx] != Sz)
8704 UsedIndices.set(Order[Idx]);
8705 }
8706 if (SecondaryOrder.empty()) {
8707 for (unsigned Idx : seq<unsigned>(0, Sz))
8708 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8709 Order[Idx] = Idx;
8710 } else {
8711 for (unsigned Idx : seq<unsigned>(0, Sz))
8712 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8713 !UsedIndices.test(SecondaryOrder[Idx]))
8714 Order[Idx] = SecondaryOrder[Idx];
8715 }
8716}
8717
// NOTE(review): the function header and the condition guarding the first
// early return are not visible in this extract - confirm against the full
// source before relying on the summary below.
// Visible logic: decides whether reordering the current VectorizableTree is
// worth attempting. Trees with at most TinyTree entries are always accepted.
// For larger trees whose root is a store, a PHI, or a TinyVF-wide
// ptrtoint/icmp node without reorder indices, the remaining entries are
// scanned and the answer depends on which load, gather and PHI-like nodes are
// found. Any other root is accepted.
8720 return false;
8721
// Thresholds used below: root vector factor, tree size, operand count of a
// node and number of gathered load nodes.
8722 constexpr unsigned TinyVF = 2;
8723 constexpr unsigned TinyTree = 10;
8724 constexpr unsigned PhiOpsLimit = 12;
8725 constexpr unsigned GatherLoadsLimit = 2;
8726 if (VectorizableTree.size() <= TinyTree)
8727 return true;
8728 if (VectorizableTree.front()->hasState() &&
8729 !VectorizableTree.front()->isGather() &&
8730 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8731 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8732 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8733 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8734 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8735 VectorizableTree.front()->ReorderIndices.empty()) {
8736 // Check if the tree has only a single store and a single (unordered) load
8737 // node, other nodes are phis or geps/binops, combined with phis, and/or a
8738 // single gather load node.
// A TinyVF-wide PHI root with more than PhiOpsLimit operands is rejected.
8739 if (VectorizableTree.front()->hasState() &&
8740 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8741 VectorizableTree.front()->Scalars.size() == TinyVF &&
8742 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8743 return false;
8744 // Single node, which requires reorder - skip.
8745 if (VectorizableTree.front()->hasState() &&
8746 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8747 VectorizableTree.front()->ReorderIndices.empty()) {
// Count split nodes that carry a reordering and feed a commutative
// vectorized user.
8748 const unsigned ReorderedSplitsCnt =
8749 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8750 return TE->State == TreeEntry::SplitVectorize &&
8751 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8752 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8753 ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp());
8754 });
// Reject when at most one such split exists and the number of entries
// matching the predicate below reaches the tree size minus that count.
8755 if (ReorderedSplitsCnt <= 1 &&
8756 static_cast<unsigned>(count_if(
8757 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8758 return ((!TE->isGather() &&
8759 (TE->ReorderIndices.empty() ||
8760 (TE->UserTreeIndex.UserTE &&
8761 TE->UserTreeIndex.UserTE->State ==
8762 TreeEntry::Vectorize &&
8763 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8764 .empty()))) ||
8765 (TE->isGather() && TE->ReorderIndices.empty() &&
8766 (!TE->hasState() || TE->isAltShuffle() ||
8767 TE->getOpcode() == Instruction::Load ||
8768 TE->getOpcode() == Instruction::ZExt ||
8769 TE->getOpcode() == Instruction::SExt))) &&
8770 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8771 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
8772 return !isConstant(V) && isVectorized(V);
8773 }));
8774 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8775 return false;
8776 }
// Scan every entry except the root. HasLoad stays true until a non-gather
// load node without reorder indices is seen.
8777 bool HasPhis = false;
8778 bool HasLoad = true;
8779 unsigned GatherLoads = 0;
8780 for (const std::unique_ptr<TreeEntry> &TE :
8781 ArrayRef(VectorizableTree).drop_front()) {
8782 if (TE->State == TreeEntry::SplitVectorize)
8783 continue;
8784 if (!TE->hasState()) {
// NOTE(review): the second operand of each of the two conditions below is
// not visible in this extract.
8785 if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
8787 continue;
8788 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8790 continue;
8791 return true;
8792 }
8793 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8794 if (!TE->isGather()) {
8795 HasLoad = false;
8796 continue;
8797 }
// A gathered load accepts the tree while HasLoad is still true, or once
// GatherLoadsLimit gathered loads have been counted.
8798 if (HasLoad)
8799 return true;
8800 ++GatherLoads;
8801 if (GatherLoads >= GatherLoadsLimit)
8802 return true;
8803 }
8804 if (TE->getOpcode() == Instruction::GetElementPtr ||
8805 Instruction::isBinaryOp(TE->getOpcode()))
8806 continue;
// Anything that is neither a PHI node nor a copyable-elements node with at
// least half of its scalars being PHIs accepts the tree.
8807 if (TE->getOpcode() != Instruction::PHI &&
8808 (!TE->hasCopyableElements() ||
8809 static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
8810 TE->Scalars.size() / 2))
8811 return true;
8812 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8813 TE->getNumOperands() > PhiOpsLimit)
8814 return false;
8815 HasPhis = true;
8816 }
8817 return !HasPhis;
8818 }
8819 return true;
8820}
8821
8822void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8823 ArrayRef<int> MaskOrder) {
8824 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8825 SmallVector<int> NewMask(getVectorFactor());
8826 SmallVector<int> NewMaskOrder(getVectorFactor());
8827 std::iota(NewMask.begin(), NewMask.end(), 0);
8828 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8829 if (Idx == 0) {
8830 copy(Mask, NewMask.begin());
8831 copy(MaskOrder, NewMaskOrder.begin());
8832 } else {
8833 assert(Idx == 1 && "Expected either 0 or 1 index.");
8834 unsigned Offset = CombinedEntriesWithIndices.back().second;
8835 for (unsigned I : seq<unsigned>(Mask.size())) {
8836 NewMask[I + Offset] = Mask[I] + Offset;
8837 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8838 }
8839 }
8840 reorderScalars(Scalars, NewMask);
8841 reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
8842 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
8843 ReorderIndices.clear();
8844}
8845
// NOTE(review): the function header and several declarations
// (VFToOrderedEntries, GathersToOrders, PhisToOrders, the type of
// ExternalUserReorderMap and of OrdersUses, and the first line of one isa<>
// check) are not visible in this extract - confirm against the full source.
// Visible logic: (1) collect, per vector factor, the tree entries that carry
// ordering information; (2) for each VF from the root's VF down to 2, count
// how often each order is requested and pick the most used one, preferring
// the identity order; (3) apply that order to every entry with VF scalars and
// to the reuse masks of entries whose reuse mask has VF elements.
8847 // Maps VF to the graph nodes.
8849 // ExtractElement gather nodes which can be vectorized and need to handle
8850 // their ordering.
8852
8853 // Phi nodes can have preferred ordering based on their result users.
8855
8856 // AltShuffles can also have a preferred ordering that leads to fewer
8857 // instructions, e.g., the addsub instruction in x86.
8858 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8859
8860 // Maps a TreeEntry to the reorder indices of external users.
8862 ExternalUserReorderMap;
8863 // TODO: Reordering of struct types is not supported.
8864 if (any_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
8865 return TE->State == TreeEntry::Vectorize &&
8866 isa<StructType>(getValueType(TE->Scalars.front()));
8867 }))
8868 return;
8869 // Compute IgnoreReorder once - it depends only on UserIgnoreList and
8870 // VectorizableTree.front(), which do not change during this loop.
8871 const bool IgnoreReorder =
8872 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8873 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8874 VectorizableTree.front()->getOpcode() == Instruction::Store);
8875 // Find all reorderable nodes with the given VF.
8876 // Currently these are vectorized stores, loads, extracts + some gathering
8877 // of extracts.
8878 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8879 const std::unique_ptr<TreeEntry> &TE) {
8880 // Look for external users that will probably be vectorized.
8881 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8882 findExternalStoreUsersReorderIndices(TE.get());
8883 if (!ExternalUserReorderIndices.empty()) {
8884 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8885 ExternalUserReorderMap.try_emplace(TE.get(),
8886 std::move(ExternalUserReorderIndices));
8887 }
8888
8889 // Patterns like [fadd,fsub] can be combined into a single instruction in
8890 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8891 // to take into account their order when looking for the most used order.
8892 if (TE->hasState() && TE->isAltShuffle() &&
8893 TE->State != TreeEntry::SplitVectorize) {
8894 Type *ScalarTy = TE->Scalars[0]->getType();
8895 auto *VecTy =
8896 cast<VectorType>(getWidenedType(ScalarTy, TE->Scalars.size()));
8897 unsigned Opcode0 = TE->getOpcode();
8898 unsigned Opcode1 = TE->getAltOpcode();
8899 SmallBitVector OpcodeMask(
8900 getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
8901 // If this pattern is supported by the target then we consider the order.
8902 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8903 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8904 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
8905 }
8906 // TODO: Check the reverse order too.
8907 }
8908
8909 if (std::optional<OrdersType> CurrentOrder =
8910 getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
8911 // Do not include ordering for nodes used in the alt opcode vectorization,
8912 // better to reorder them during bottom-to-top stage. If we follow the
8913 // order here, it causes reordering of the whole graph though actually it
8914 // is profitable just to reorder the subgraph that starts from the
8915 // alternate opcode vectorization node. Such nodes already end up with the
8916 // shuffle instruction and it is just enough to change this shuffle rather
8917 // than rotate the scalars for the whole graph.
// Walk up the user chain, at most RecursionMaxDepth steps.
8918 unsigned Cnt = 0;
8919 const TreeEntry *UserTE = TE.get();
8920 while (UserTE && Cnt < RecursionMaxDepth) {
8921 if (!UserTE->UserTreeIndex)
8922 break;
8923 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8924 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8925 UserTE->UserTreeIndex.UserTE->Idx != 0)
// Returning from the lambda only skips the current tree entry.
8926 return;
8927 UserTE = UserTE->UserTreeIndex.UserTE;
8928 ++Cnt;
8929 }
8930 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8931 if (!(TE->State == TreeEntry::Vectorize ||
8932 TE->State == TreeEntry::StridedVectorize ||
8933 TE->State == TreeEntry::SplitVectorize ||
8934 TE->State == TreeEntry::CompressVectorize) ||
8935 !TE->ReuseShuffleIndices.empty())
8936 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8937 if (TE->State == TreeEntry::Vectorize &&
8938 TE->getOpcode() == Instruction::PHI)
8939 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8940 }
8941 });
8942
8943 // Reorder the graph nodes according to their vectorization factor.
8944 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8945 !VFToOrderedEntries.empty() && VF > 1; --VF) {
8946 auto It = VFToOrderedEntries.find(VF);
8947 if (It == VFToOrderedEntries.end())
8948 continue;
8949 // Try to find the most profitable order. We are just looking for the most
8950 // used order and reorder scalar elements in the nodes according to this
8951 // mostly used order.
8952 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8953 // Delete VF entry upon exit.
8954 llvm::scope_exit Cleanup([&]() { VFToOrderedEntries.erase(It); });
8955
8956 // All operands are reordered and used only in this node - propagate the
8957 // most used order to the user node.
8960 OrdersUses;
8961 for (const TreeEntry *OpTE : OrderedEntries) {
8962 // No need to reorder these nodes, still need to extend and to use shuffle,
8963 // just need to merge reordering shuffle and the reuse shuffle.
8964 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8965 OpTE->State != TreeEntry::SplitVectorize)
8966 continue;
8967 // Count number of orders uses.
// The order is looked up in the gather, alt-shuffle and PHI maps, in that
// sequence, before falling back to the entry's own ReorderIndices.
8968 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8969 &PhisToOrders]() -> const OrdersType & {
8970 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8971 auto It = GathersToOrders.find(OpTE);
8972 if (It != GathersToOrders.end())
8973 return It->second;
8974 }
8975 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8976 auto It = AltShufflesToOrders.find(OpTE);
8977 if (It != AltShufflesToOrders.end())
8978 return It->second;
8979 }
8980 if (OpTE->State == TreeEntry::Vectorize &&
8981 OpTE->getOpcode() == Instruction::PHI) {
8982 auto It = PhisToOrders.find(OpTE);
8983 if (It != PhisToOrders.end())
8984 return It->second;
8985 }
8986 return OpTE->ReorderIndices;
8987 }();
8988 // First consider the order of the external scalar users.
8989 auto It = ExternalUserReorderMap.find(OpTE);
8990 if (It != ExternalUserReorderMap.end()) {
8991 const auto &ExternalUserReorderIndices = It->second;
8992 // If the OpTE vector factor != number of scalars - use natural order,
8993 // it is an attempt to reorder node with reused scalars but with
8994 // external uses.
8995 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8996 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8997 ExternalUserReorderIndices.size();
8998 } else {
8999 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
9000 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
9001 }
9002 // No other useful reorder data in this entry.
9003 if (Order.empty())
9004 continue;
9005 }
9006 // Stores actually store the mask, not the order, need to invert.
9007 if (OpTE->State == TreeEntry::Vectorize &&
9008 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
9009 assert(!OpTE->isAltShuffle() &&
9010 "Alternate instructions are only supported by BinaryOperator "
9011 "and CastInst.");
9012 SmallVector<int> Mask;
9013 inversePermutation(Order, Mask);
9014 unsigned E = Order.size();
9015 OrdersType CurrentOrder(E, E);
9016 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
9017 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9018 });
9019 fixupOrderingIndices(CurrentOrder);
9020 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
9021 } else {
9022 ++OrdersUses.try_emplace(Order, 0).first->second;
9023 }
9024 }
9025 if (OrdersUses.empty())
9026 continue;
9027 // Choose the most used order.
// Empty and identity orders are counted together; FilledIdentityCnt counts
// only the non-empty identity orders.
9028 unsigned IdentityCnt = 0;
9029 unsigned FilledIdentityCnt = 0;
9030 OrdersType IdentityOrder(VF, VF);
9031 for (auto &Pair : OrdersUses) {
9032 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
9033 if (!Pair.first.empty())
9034 FilledIdentityCnt += Pair.second;
9035 IdentityCnt += Pair.second;
9036 combineOrders(IdentityOrder, Pair.first);
9037 }
9038 }
9039 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
9040 unsigned Cnt = IdentityCnt;
9041 for (auto &Pair : OrdersUses) {
9042 // Prefer identity order. But, if filled identity found (non-empty order)
9043 // with same number of uses, as the new candidate order, we can choose
9044 // this candidate order.
9045 if (Cnt < Pair.second ||
9046 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
9047 Cnt == Pair.second && !BestOrder.empty() &&
9048 isIdentityOrder(BestOrder))) {
9049 combineOrders(Pair.first, BestOrder);
9050 BestOrder = Pair.first;
9051 Cnt = Pair.second;
9052 } else {
9053 combineOrders(BestOrder, Pair.first);
9054 }
9055 }
9056 // Set order of the user node.
9057 if (isIdentityOrder(BestOrder))
9058 continue;
9059 fixupOrderingIndices(BestOrder);
// Mask is the inverse permutation of the chosen order; MaskOrder is the
// order itself with out-of-range entries turned into PoisonMaskElem.
9060 SmallVector<int> Mask;
9061 inversePermutation(BestOrder, Mask);
9062 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
9063 unsigned E = BestOrder.size();
9064 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
9065 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9066 });
9067 // Do an actual reordering, if profitable.
9068 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9069 // Just do the reordering for the nodes with the given VF.
9070 if (TE->Scalars.size() != VF) {
9071 if (TE->ReuseShuffleIndices.size() == VF) {
9072 assert(TE->State != TreeEntry::SplitVectorize &&
9073 "Split vectorized not expected.");
9074 // Need to reorder the reuses masks of the operands with smaller VF to
9075 // be able to find the match between the graph nodes and scalar
9076 // operands of the given node during vectorization/cost estimation.
9077 assert(
9078 (!TE->UserTreeIndex ||
9079 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
9080 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
9081 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
9082 "All users must be of VF size.");
9083 if (SLPReVec) {
9084 assert(SLPReVec && "Only supported by REVEC.");
9085 // ShuffleVectorInst does not do reorderOperands (and it should not
9086 // because ShuffleVectorInst supports only a limited set of
9087 // patterns). Only do reorderNodeWithReuses if the user is not
9088 // ShuffleVectorInst.
9089 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
9090 isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
9091 continue;
9092 }
9093 // Update ordering of the operands with the smaller VF than the given
9094 // one.
9095 reorderNodeWithReuses(*TE, Mask);
9096 // Update orders in user split vectorize nodes.
9097 if (TE->UserTreeIndex &&
9098 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
9099 TE->UserTreeIndex.UserTE->reorderSplitNode(
9100 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
9101 }
9102 continue;
9103 }
// NOTE(review): the first line of the isa<> type list in the condition
// below is not visible in this extract.
9104 if ((TE->State == TreeEntry::SplitVectorize &&
9105 TE->ReuseShuffleIndices.empty()) ||
9106 ((TE->State == TreeEntry::Vectorize ||
9107 TE->State == TreeEntry::StridedVectorize ||
9108 TE->State == TreeEntry::CompressVectorize) &&
9110 InsertElementInst>(TE->getMainOp()) ||
9111 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
9112 assert(
9113 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
9114 TE->ReuseShuffleIndices.empty())) &&
9115 "Alternate instructions are only supported by BinaryOperator "
9116 "and CastInst.");
9117 // Build correct orders for extract{element,value}, loads,
9118 // stores and alternate (split) nodes.
9119 reorderOrder(TE->ReorderIndices, Mask);
9120 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
9121 TE->reorderOperands(Mask);
9122 } else {
9123 // Reorder the node and its operands.
9124 TE->reorderOperands(Mask);
9125 assert(TE->ReorderIndices.empty() &&
9126 "Expected empty reorder sequence.");
9127 reorderScalars(TE->Scalars, Mask);
9128 }
9129 if (!TE->ReuseShuffleIndices.empty()) {
9130 // Apply reversed order to keep the original ordering of the reused
9131 // elements to avoid extra reorder indices shuffling.
9132 OrdersType CurrentOrder;
9133 reorderOrder(CurrentOrder, MaskOrder);
9134 SmallVector<int> NewReuses;
9135 inversePermutation(CurrentOrder, NewReuses);
9136 addMask(NewReuses, TE->ReuseShuffleIndices);
9137 TE->ReuseShuffleIndices.swap(NewReuses);
9138 } else if (TE->UserTreeIndex &&
9139 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
9140 // Update orders in user split vectorize nodes.
9141 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
9142 Mask, MaskOrder);
9143 }
9144 }
9145}
9146
9147void BoUpSLP::buildReorderableOperands(
9148 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
9149 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
9150 SmallVectorImpl<TreeEntry *> &GatherOps) {
9151 for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
9152 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
9153 return OpData.first == I &&
9154 (OpData.second->State == TreeEntry::Vectorize ||
9155 OpData.second->State == TreeEntry::StridedVectorize ||
9156 OpData.second->State == TreeEntry::CompressVectorize ||
9157 OpData.second->State == TreeEntry::SplitVectorize);
9158 }))
9159 continue;
9160 // Do not request operands, if they do not exist.
9161 if (UserTE->hasState()) {
9162 if (UserTE->getOpcode() == Instruction::ExtractElement ||
9163 UserTE->getOpcode() == Instruction::ExtractValue)
9164 continue;
9165 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
9166 continue;
9167 if (UserTE->getOpcode() == Instruction::Store && I == 1 &&
9168 (UserTE->State == TreeEntry::Vectorize ||
9169 UserTE->State == TreeEntry::StridedVectorize))
9170 continue;
9171 if (UserTE->getOpcode() == Instruction::Load &&
9172 (UserTE->State == TreeEntry::Vectorize ||
9173 UserTE->State == TreeEntry::StridedVectorize ||
9174 UserTE->State == TreeEntry::CompressVectorize))
9175 continue;
9176 }
9177 TreeEntry *TE = getOperandEntry(UserTE, I);
9178 assert(TE && "Expected operand entry.");
9179 if (!TE->isGather()) {
9180 // Add the node to the list of the ordered nodes with the identity
9181 // order.
9182 Edges.emplace_back(I, TE);
9183 // Add ScatterVectorize nodes to the list of operands, where just
9184 // reordering of the scalars is required. Similar to the gathers, so
9185 // simply add to the list of gathered ops.
9186 // If there are reused scalars, process this node as a regular vectorize
9187 // node, just reorder reuses mask.
9188 if (TE->State == TreeEntry::ScatterVectorize &&
9189 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
9190 GatherOps.push_back(TE);
9191 continue;
9192 }
9193 if (ReorderableGathers.contains(TE))
9194 GatherOps.push_back(TE);
9195 }
9196}
9197
/// Bottom-to-top reordering over the vectorizable tree.
///
/// Every tree entry for which getReorderingData() yields an order is pushed
/// into a queue. Queued entries that share the same user node are processed
/// together: each operand contributes its order to a usage count, the most
/// used order is selected and then applied to the operands, to the collected
/// gather operands and to the user node itself. The user node may be pushed
/// back into the queue so that the reordering can be processed further.
///
/// \param IgnoreReorder forwarded to getReorderingData(). If it is still set
/// when the queue is drained and the root node has ReorderIndices but no
/// ReuseShuffleIndices, the root's ReorderIndices are cleared.
///
/// NOTE(review): several lines of this definition (mostly local declarations)
/// are absent from this listing; each gap is marked inline below.
9198void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
9199 struct TreeEntryCompare {
 // Orders entries by the index of their user node when both have a user,
 // otherwise by their own index.
9200 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
9201 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
9202 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
9203 return LHS->Idx < RHS->Idx;
9204 }
9205 };
 // NOTE(review): embedded line 9206 is absent from this listing; presumably
 // it declares `Queue` (used below via push/top/pop/empty) - confirm.
9207 DenseSet<const TreeEntry *> GathersToOrders;
9208 // Find all reorderable leaf nodes with the given VF.
9209 // Currently they are vectorized loads, extracts without alternate operands +
9210 // some gathering of extracts.
 // NOTE(review): embedded line 9211 is absent from this listing; presumably
 // it declares `NonVectorized` (filled below) - confirm.
9212 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9213 if (TE->State != TreeEntry::Vectorize &&
9214 TE->State != TreeEntry::StridedVectorize &&
9215 TE->State != TreeEntry::CompressVectorize &&
9216 TE->State != TreeEntry::SplitVectorize)
9217 NonVectorized.insert(TE.get());
9218 if (std::optional<OrdersType> CurrentOrder =
9219 getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
9220 Queue.push(TE.get());
 // Remember entries whose order has to be recomputed on demand: anything
 // that is not in one of the four vectorized states, or that has reused
 // scalars.
9221 if (!(TE->State == TreeEntry::Vectorize ||
9222 TE->State == TreeEntry::StridedVectorize ||
9223 TE->State == TreeEntry::CompressVectorize ||
9224 TE->State == TreeEntry::SplitVectorize) ||
9225 !TE->ReuseShuffleIndices.empty())
9226 GathersToOrders.insert(TE.get());
9227 }
9228 }
9229
9230 // 1. Propagate order to the graph nodes, which use only reordered nodes.
9231 // I.e., if the node has operands, that are reordered, try to make at least
9232 // one operand order in the natural order and reorder others + reorder the
9233 // user node itself.
9234 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
9235 while (!Queue.empty()) {
9236 // 1. Filter out only reordered nodes.
9237 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
9238 TreeEntry *TE = Queue.top();
9239 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
9240 Queue.pop();
 // Pull every following queue entry that has the same (non-null) user node,
 // so that all operands of one user are handled in a single step.
9241 SmallVector<TreeEntry *> OrderedOps(1, TE);
9242 while (!Queue.empty()) {
9243 TE = Queue.top();
9244 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
9245 break;
9246 Queue.pop();
9247 OrderedOps.push_back(TE);
9248 }
9249 for (TreeEntry *TE : OrderedOps) {
9250 if (!(TE->State == TreeEntry::Vectorize ||
9251 TE->State == TreeEntry::StridedVectorize ||
9252 TE->State == TreeEntry::CompressVectorize ||
9253 TE->State == TreeEntry::SplitVectorize ||
9254 (TE->isGather() && GathersToOrders.contains(TE))) ||
9255 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
9256 !Visited.insert(TE).second)
9257 continue;
9258 // Build a map between user nodes and their operands order to speedup
9259 // search. The graph currently does not provide this dependency directly.
9260 Users.first = TE->UserTreeIndex.UserTE;
9261 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
9262 }
9263 if (Users.first) {
9264 auto &Data = Users;
9265 // TODO: Reordering of struct types is not supported.
9266 if (Data.first->State == TreeEntry::Vectorize &&
9267 isa<StructType>(getValueType(Data.first->Scalars.front())))
9268 continue;
 // Split vectorize user nodes are handled separately: the order of each
 // combined entry is pushed into the split node via reorderSplitNode().
9269 if (Data.first->State == TreeEntry::SplitVectorize) {
9270 assert(
9271 Data.second.size() <= 2 &&
9272 "Expected not greater than 2 operands for split vectorize node.");
9273 if (any_of(Data.second,
9274 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
9275 continue;
9276 // Update orders in user split vectorize nodes.
9277 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
9278 "Expected exactly 2 entries.");
9279 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
9280 TreeEntry &OpTE = *VectorizableTree[P.first];
9281 OrdersType Order = OpTE.ReorderIndices;
 // No stored order (or reused scalars): recompute the order, but only for
 // gathers and nodes with reused scalars.
9282 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
9283 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
9284 continue;
9285 const auto BestOrder =
9286 getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
9287 if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
9288 continue;
9289 Order = *BestOrder;
9290 }
9291 fixupOrderingIndices(Order);
9292 SmallVector<int> Mask;
9293 inversePermutation(Order, Mask);
9294 const unsigned E = Order.size();
 // Same order expressed as a shuffle mask; out-of-range indices become
 // poison elements.
9295 SmallVector<int> MaskOrder(E, PoisonMaskElem);
9296 transform(Order, MaskOrder.begin(), [E](unsigned I) {
9297 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9298 });
9299 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
9300 // Clear ordering of the operand.
9301 if (!OpTE.ReorderIndices.empty()) {
9302 OpTE.ReorderIndices.clear();
9303 } else if (!OpTE.ReuseShuffleIndices.empty()) {
9304 reorderReuses(OpTE.ReuseShuffleIndices, Mask);
9305 } else {
9306 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
9307 reorderScalars(OpTE.Scalars, Mask);
9308 }
9309 }
9310 if (Data.first->ReuseShuffleIndices.empty() &&
9311 !Data.first->ReorderIndices.empty()) {
9312 // Insert user node to the list to try to sink reordering deeper in
9313 // the graph.
9314 Queue.push(Data.first);
9315 }
9316 continue;
9317 }
9318 // Check that operands are used only in the User node.
9319 SmallVector<TreeEntry *> GatherOps;
9320 buildReorderableOperands(Data.first, Data.second, NonVectorized,
9321 GatherOps);
9322 // All operands are reordered and used only in this node - propagate the
9323 // most used order to the user node.
 // NOTE(review): embedded lines 9324-9325 are absent from this listing; the
 // next line is the tail of the declaration of `OrdersUses`, which is used
 // below as a map from an order to its number of uses.
9326 OrdersUses;
9327 // Do the analysis for each tree entry only once, otherwise the order of
9328 // the same node may be considered several times, though might be not
9329 // profitable.
 // NOTE(review): embedded lines 9330-9331 are absent from this listing;
 // presumably they declare `VisitedOps` and `VisitedUsers` - confirm.
9332 for (const auto &Op : Data.second) {
9333 TreeEntry *OpTE = Op.second;
9334 if (!VisitedOps.insert(OpTE).second)
9335 continue;
9336 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
9337 continue;
 // Gathers and nodes with reused scalars get their order recomputed; a
 // one-element order is used as the "no reordering data" marker.
9338 const auto Order = [&]() -> const OrdersType {
9339 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
9340 return getReorderingData(*OpTE, /*TopToBottom=*/false,
9341 IgnoreReorder)
9342 .value_or(OrdersType(1));
9343 return OpTE->ReorderIndices;
9344 }();
9345 // The order is partially ordered, skip it in favor of fully non-ordered
9346 // orders.
9347 if (Order.size() == 1)
9348 continue;
9349
9350 // Check that the reordering does not increase number of shuffles, i.e.
9351 // same-values-nodes have same parents or their parents have same parents.
9352 if (!Order.empty() && !isIdentityOrder(Order)) {
9353 Value *Root = OpTE->hasState()
9354 ? OpTE->getMainOp()
9355 : *find_if_not(OpTE->Scalars, isConstant);
 // Collects the user nodes of other entries (gather and non-gather) that
 // hold the same scalars as OpTE with the same vector factor.
9356 auto GetSameNodesUsers = [&](Value *Root) {
 // NOTE(review): embedded line 9357 is absent from this listing; presumably
 // it declares `Res` (insert/takeVector are called on it) - confirm.
9358 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
9359 if (TE != OpTE && TE->UserTreeIndex &&
9360 TE->getVectorFactor() == OpTE->getVectorFactor() &&
9361 TE->Scalars.size() == OpTE->Scalars.size() &&
9362 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
9363 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
9364 Res.insert(TE->UserTreeIndex.UserTE);
9365 }
9366 for (const TreeEntry *TE : getTreeEntries(Root)) {
9367 if (TE != OpTE && TE->UserTreeIndex &&
9368 TE->getVectorFactor() == OpTE->getVectorFactor() &&
9369 TE->Scalars.size() == OpTE->Scalars.size() &&
9370 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
9371 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
9372 Res.insert(TE->UserTreeIndex.UserTE);
9373 }
9374 return Res.takeVector();
9375 };
 // Number of operands to inspect: the call argument count when the main
 // operation of a non-split node is a call, the operand count otherwise.
9376 auto GetNumOperands = [](const TreeEntry *TE) {
9377 if (TE->State == TreeEntry::SplitVectorize)
9378 return TE->getNumOperands();
9379 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
9380 return CI->arg_size();
9381 return TE->getNumOperands();
9382 };
 // Returns false as soon as one inspected operand has neither reorder nor
 // reuse indices, true otherwise.
9383 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
9384 const TreeEntry *TE) {
 // NOTE(review): embedded lines 9385, 9387 and 9389-9390 are absent from
 // this listing, so the statement guarded by the `if` below and the
 // condition guarding the `continue` are not visible here.
9386 if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()); CI)
9388 for (unsigned Idx : seq<unsigned>(GetNumOperands(TE))) {
9391 continue;
9392 const TreeEntry *Op = getOperandEntry(TE, Idx);
9393 if (Op->isGather() && Op->hasState()) {
9394 const TreeEntry *VecOp =
9395 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
9396 if (VecOp)
9397 Op = VecOp;
9398 }
9399 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
9400 return false;
9401 }
9402 return true;
9403 };
9404 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
9405 if (!Users.empty() && !all_of(Users, [&](TreeEntry *UTE) {
9406 if (!RevisitedOps.insert(UTE).second)
9407 return false;
9408 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
9409 !UTE->ReuseShuffleIndices.empty() ||
9410 (UTE->UserTreeIndex &&
9411 UTE->UserTreeIndex.UserTE == Data.first) ||
9412 (Data.first->UserTreeIndex &&
9413 Data.first->UserTreeIndex.UserTE == UTE) ||
9414 (IgnoreReorder && UTE->UserTreeIndex &&
9415 UTE->UserTreeIndex.UserTE->Idx == 0) ||
9416 NodeShouldBeReorderedWithOperands(UTE);
9417 }))
9418 continue;
 // Re-queue the operands of the other users so they are analyzed again.
9419 for (TreeEntry *UTE : Users) {
 // NOTE(review): embedded lines 9420, 9422 and 9424-9425 are absent from
 // this listing, so the statement guarded by the `if` below and the
 // condition guarding the `continue` are not visible here.
9421 if (auto *CI = dyn_cast<CallInst>(UTE->getMainOp()); CI)
9423 for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
9426 continue;
9427 const TreeEntry *Op = getOperandEntry(UTE, Idx);
9428 Visited.erase(Op);
9429 Queue.push(const_cast<TreeEntry *>(Op));
9430 }
9431 }
9432 }
 // The same entry may be used as several operands of the user node; each
 // use counts as one vote.
9433 unsigned NumOps = count_if(
9434 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
9435 return P.second == OpTE;
9436 });
9437 // Stores actually store the mask, not the order, need to invert.
9438 if (OpTE->State == TreeEntry::Vectorize &&
9439 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
9440 assert(!OpTE->isAltShuffle() &&
9441 "Alternate instructions are only supported by BinaryOperator "
9442 "and CastInst.");
9443 SmallVector<int> Mask;
9444 inversePermutation(Order, Mask);
9445 unsigned E = Order.size();
9446 OrdersType CurrentOrder(E, E);
9447 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
9448 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9449 });
9450 fixupOrderingIndices(CurrentOrder);
9451 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
9452 } else {
9453 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
9454 }
 // `Res` refers to the counter of the empty (identity) order.
9455 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
9456 const auto AllowsReordering = [&](const TreeEntry *TE) {
9457 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9458 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9459 (IgnoreReorder && TE->Idx == 0))
9460 return true;
9461 if (TE->isGather()) {
9462 if (GathersToOrders.contains(TE))
9463 return !getReorderingData(*TE, /*TopToBottom=*/false,
9464 IgnoreReorder)
9465 .value_or(OrdersType(1))
9466 .empty();
9467 return true;
9468 }
9469 return false;
9470 };
9471 if (OpTE->UserTreeIndex) {
9472 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9473 if (!VisitedUsers.insert(UserTE).second)
9474 continue;
9475 // May reorder user node if it requires reordering, has reused
9476 // scalars, is an alternate op vectorize node or its op nodes require
9477 // reordering.
9478 if (AllowsReordering(UserTE))
9479 continue;
9480 // Check if users allow reordering.
9481 // Currently look up just 1 level of operands to avoid increase of
9482 // the compile time.
9483 // Profitable to reorder if definitely more operands allow
9484 // reordering rather than those with natural order.
 // NOTE(review): embedded line 9485 is absent from this listing; presumably
 // it declares `Ops` (a list of operand index/entry pairs) - confirm.
9486 if (static_cast<unsigned>(count_if(
9487 Ops, [UserTE, &AllowsReordering](
9488 const std::pair<unsigned, TreeEntry *> &Op) {
9489 return AllowsReordering(Op.second) &&
9490 Op.second->UserTreeIndex.UserTE == UserTE;
9491 })) <= Ops.size() / 2)
9492 ++Res.first->second;
9493 }
9494 }
9495 if (OrdersUses.empty()) {
9496 Visited.insert_range(llvm::make_second_range(Data.second));
9497 continue;
9498 }
9499 // Choose the most used order.
9500 unsigned IdentityCnt = 0;
9501 unsigned VF = Data.second.front().second->getVectorFactor();
9502 OrdersType IdentityOrder(VF, VF);
 // First accumulate the votes of all empty/identity orders.
9503 for (auto &Pair : OrdersUses) {
9504 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
9505 IdentityCnt += Pair.second;
9506 combineOrders(IdentityOrder, Pair.first);
9507 }
9508 }
9509 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
9510 unsigned Cnt = IdentityCnt;
9511 for (auto &Pair : OrdersUses) {
9512 // Prefer identity order. But, if filled identity found (non-empty
9513 // order) with same number of uses, as the new candidate order, we can
9514 // choose this candidate order.
9515 if (Cnt < Pair.second) {
9516 combineOrders(Pair.first, BestOrder);
9517 BestOrder = Pair.first;
9518 Cnt = Pair.second;
9519 } else {
9520 combineOrders(BestOrder, Pair.first);
9521 }
9522 }
9523 // Set order of the user node.
9524 if (isIdentityOrder(BestOrder)) {
9525 Visited.insert_range(llvm::make_second_range(Data.second));
9526 continue;
9527 }
9528 fixupOrderingIndices(BestOrder);
9529 // Erase operands from OrderedEntries list and adjust their orders.
9530 VisitedOps.clear();
9531 SmallVector<int> Mask;
9532 inversePermutation(BestOrder, Mask);
9533 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
9534 unsigned E = BestOrder.size();
9535 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
9536 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9537 });
9538 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
9539 TreeEntry *TE = Op.second;
9540 if (!VisitedOps.insert(TE).second)
9541 continue;
9542 // TODO: Reordering of struct types is not supported.
9543 if (TE->State == TreeEntry::Vectorize &&
9544 isa<StructType>(getValueType(TE->Scalars.front())))
9545 continue;
9546 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
9547 reorderNodeWithReuses(*TE, Mask);
9548 continue;
9549 }
9550 // Gathers are processed separately.
9551 if (TE->State != TreeEntry::Vectorize &&
9552 TE->State != TreeEntry::StridedVectorize &&
9553 TE->State != TreeEntry::CompressVectorize &&
9554 TE->State != TreeEntry::SplitVectorize &&
9555 (TE->State != TreeEntry::ScatterVectorize ||
9556 TE->ReorderIndices.empty()))
9557 continue;
9558 assert((BestOrder.size() == TE->ReorderIndices.size() ||
9559 TE->ReorderIndices.empty()) &&
9560 "Non-matching sizes of user/operand entries.");
9561 reorderOrder(TE->ReorderIndices, Mask);
 // Once the root node itself has been reordered here, its order must no
 // longer be dropped at the end of the function.
9562 if (IgnoreReorder && TE == VectorizableTree.front().get())
9563 IgnoreReorder = false;
9564 }
9565 // For gathers just need to reorder its scalars.
9566 for (TreeEntry *Gather : GatherOps) {
9567 assert(Gather->ReorderIndices.empty() &&
9568 "Unexpected reordering of gathers.");
9569 if (!Gather->ReuseShuffleIndices.empty()) {
9570 // Just reorder reuses indices.
9571 reorderReuses(Gather->ReuseShuffleIndices, Mask);
9572 continue;
9573 }
9574 reorderScalars(Gather->Scalars, Mask);
9575 Visited.insert(Gather);
9576 }
9577 // Reorder operands of the user node and set the ordering for the user
9578 // node itself.
9579 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
9580 return TE.isAltShuffle() &&
9581 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9582 TE.ReorderIndices.empty());
9583 };
9584 if (Data.first->State != TreeEntry::Vectorize ||
 // NOTE(review): embedded line 9585 is absent from this listing, so the
 // start of the sub-expression ending on the next line is not visible.
9586 Data.first->getMainOp()) ||
9587 IsNotProfitableAltCodeNode(*Data.first))
9588 Data.first->reorderOperands(Mask);
9589 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
9590 IsNotProfitableAltCodeNode(*Data.first) ||
9591 Data.first->State == TreeEntry::CompressVectorize) {
9592 reorderScalars(Data.first->Scalars, Mask);
9593 reorderOrder(Data.first->ReorderIndices, MaskOrder,
9594 /*BottomOrder=*/true);
9595 if (Data.first->ReuseShuffleIndices.empty() &&
9596 !Data.first->ReorderIndices.empty() &&
9597 !IsNotProfitableAltCodeNode(*Data.first)) {
9598 // Insert user node to the list to try to sink reordering deeper in
9599 // the graph.
9600 Queue.push(Data.first);
9601 }
9602 } else {
9603 reorderOrder(Data.first->ReorderIndices, Mask);
9604 }
9605 }
9606 }
9607 // If the reordering is unnecessary, just remove the reorder.
9608 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
9609 VectorizableTree.front()->ReuseShuffleIndices.empty())
9610 VectorizableTree.front()->ReorderIndices.clear();
9611}
9612
9613Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
9614 if (Entry.hasState() &&
9615 (Entry.getOpcode() == Instruction::Store ||
9616 Entry.getOpcode() == Instruction::Load) &&
9617 Entry.State == TreeEntry::StridedVectorize &&
9618 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
9619 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
9620 return dyn_cast<Instruction>(Entry.Scalars.front());
9621}
9622
// NOTE(review): the first line of this definition (embedded line 9623, with
// the return type and the function name) is absent from this listing.
//
// Fills ExternalUses with the scalars of the tree that have to be extracted
// from their vectorized form. Gather entries, split vectorize entries and
// entries found in DeletedNodes/TransformedToGatherNodes are skipped. For
// every remaining instruction scalar a record (Scalar, User, Entry, Lane) is
// added when the scalar has very many uses, is listed in
// ExternallyUsedValues, or has a user that is not handled inside the tree.
// A null User in a record means that the scalar is extracted once for all of
// its users.
9624 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
 // Use-count threshold above which a scalar is extracted once for all users
 // instead of inspecting each user.
9625 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
 // Maps a scalar to the index of its record in ExternalUses.
9626 DenseMap<Value *, unsigned> ScalarToExtUses;
9627 // Collect the values that we need to extract from the tree.
9628 for (auto &TEPtr : VectorizableTree) {
9629 TreeEntry *Entry = TEPtr.get();
9630
9631 // No need to handle users of gathered values.
9632 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9633 DeletedNodes.contains(Entry) ||
9634 TransformedToGatherNodes.contains(Entry))
9635 continue;
9636
9637 // For each lane:
9638 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9639 Value *Scalar = Entry->Scalars[Lane];
9640 if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
9641 continue;
9642 bool IsStructScalar = isa<StructType>(Scalar->getType());
9643
9644 // All uses must be replaced already? No need to do it again.
9645 auto It = ScalarToExtUses.find(Scalar);
9646 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
9647 continue;
9648
9649 if (!IsStructScalar && Scalar->hasNUsesOrMore(NumVectScalars)) {
9650 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9651 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
9652 << " from " << *Scalar << "for many users.\n");
9653 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9654 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9655 ExternalUsesWithNonUsers.insert(Scalar);
9656 continue;
9657 }
9658
9659 // Check if the scalar is externally used as an extra arg.
9660 const auto ExtI = ExternallyUsedValues.find(Scalar);
9661 if (ExtI != ExternallyUsedValues.end()) {
9662 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9663 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
9664 << FoundLane << " from " << *Scalar << ".\n");
9665 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
9666 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9667 continue;
9668 }
9669 for (User *U : Scalar->users()) {
9670 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
9671
9672 Instruction *UserInst = dyn_cast<Instruction>(U);
9673 if (!UserInst || isDeleted(UserInst))
9674 continue;
9675
9676 // Ignore users in the user ignore list.
9677 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
9678 continue;
9679
9680 // Skip in-tree scalars that become vectors
9681 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
9682 any_of(UseEntries, [this](const TreeEntry *UseEntry) {
9683 return !DeletedNodes.contains(UseEntry) &&
9684 !TransformedToGatherNodes.contains(UseEntry);
9685 })) {
9686 // Some in-tree scalars will remain as scalar in vectorized
9687 // instructions. If that is the case, the one in FoundLane will
9688 // be used.
9689 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9690 isa<LoadInst, StoreInst>(UserInst)) ||
9691 isa<CallInst>(UserInst)) ||
9692 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9693 if (DeletedNodes.contains(UseEntry) ||
9694 TransformedToGatherNodes.contains(UseEntry))
9695 return true;
9696 return UseEntry->State == TreeEntry::ScatterVectorize ||
 // NOTE(review): embedded line 9697 is absent from this listing, so the
 // callee receiving the arguments on the next two lines is not visible.
9698 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9699 TTI);
9700 })) {
9701 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
9702 << ".\n");
9703 assert(none_of(UseEntries,
9704 [](TreeEntry *UseEntry) {
9705 return UseEntry->isGather();
9706 }) &&
9707 "Bad state");
9708 continue;
9709 }
 // The in-tree user keeps using the scalar: extract it once for all users
 // (null user). If a record already exists, just reset its user.
9710 if (!IsStructScalar) {
9711 U = nullptr;
9712 if (It != ScalarToExtUses.end()) {
9713 ExternalUses[It->second].User = nullptr;
9714 break;
9715 }
9716 }
9717 }
9718
 // Too many uses to track individually: record a single extract with a null
 // user. `UsesLimit` is defined outside this block.
9719 if (U && !IsStructScalar && Scalar->hasNUsesOrMore(UsesLimit))
9720 U = nullptr;
9721 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9722 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
9723 << " from lane " << FoundLane << " from " << *Scalar
9724 << ".\n");
9725 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9726 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9727 ExternalUsesWithNonUsers.insert(Scalar);
 // A null user covers all remaining users of this scalar.
9728 if (!U)
9729 break;
9730 }
9731 }
9732 }
9733}
9734
// NOTE(review): the return-type line of this definition (embedded line 9735)
// is absent from this listing; the function returns `Res`, a
// SmallVector<SmallVector<StoreInst *>>.
//
// Collects, per lane of TE, the simple stores in the current function that
// use the lane's scalar. Stores are grouped by (parent block, stored value
// type, underlying pointer object); each group keeps at most one store per
// lane, and a store only joins a non-empty group if its pointer distance to
// the group's first store can be computed.
9736BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
 // NOTE(review): embedded lines 9737-9738 are absent from this listing; the
 // next line is the tail of the declaration of `PtrToStoresMap`.
9739 PtrToStoresMap;
9740 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
9741 Value *V = TE->Scalars[Lane];
9742 // Don't iterate over the users of constant data.
9743 if (!isa<Instruction>(V))
9744 continue;
9745 // To save compilation time we don't visit if we have too many users.
 // Note that this stops the scan for all remaining lanes as well.
9746 if (V->hasNUsesOrMore(UsesLimit))
9747 break;
9748
9749 // Collect stores per pointer object.
9750 for (User *U : V->users()) {
9751 auto *SI = dyn_cast<StoreInst>(U);
9752 // Test whether we can handle the store. V might be a global, which could
9753 // be used in a different function.
9754 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
9755 !isValidElementType(SI->getValueOperand()->getType()))
9756 continue;
9757 // Skip the store if it is already vectorized.
9758 if (isVectorized(U))
9759 continue;
9760
9761 Value *Ptr =
9762 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
9763 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
9764 SI->getValueOperand()->getType(), Ptr}];
9765 // For now just keep one store per pointer object per lane.
9766 // TODO: Extend this to support multiple stores per pointer per lane
9767 if (StoresVec.size() > Lane)
9768 continue;
9769 if (!StoresVec.empty()) {
9770 std::optional<int64_t> Diff = getPointersDiff(
9771 SI->getValueOperand()->getType(), SI->getPointerOperand(),
9772 SI->getValueOperand()->getType(),
9773 StoresVec.front()->getPointerOperand(), *DL, *SE,
9774 /*StrictCheck=*/true);
9775 // We failed to compare the pointers so just abandon this store.
9776 if (!Diff)
9777 continue;
9778 }
9779 StoresVec.push_back(SI);
9780 }
9781 }
 // Move the collected groups out of the map into the result vector.
9782 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
9783 unsigned I = 0;
9784 for (auto &P : PtrToStoresMap) {
9785 Res[I].swap(P.second);
9786 ++I;
9787 }
9788 return Res;
9789}
9790
/// Checks whether the stores in \p StoresVec access consecutive elements and
/// can therefore form a single vector store.
///
/// \param StoresVec candidate stores; the first element is used as the base
/// for all offsets, so the array must not be empty.
/// \param ReorderIndices on success, set to the position of every store in
/// the offset-sorted sequence, or cleared if that sequence is the identity.
/// \returns true if the sorted offsets are consecutive (each exactly one
/// larger than the previous one), false otherwise.
9791bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9792 OrdersType &ReorderIndices) const {
9793 // We check whether the stores in StoreVec can form a vector by sorting them
9794 // and checking whether they are consecutive.
9795
9796 // To avoid calling getPointersDiff() while sorting we create a vector of
9797 // pairs {store, offset from first} and sort this instead.
 // NOTE(review): embedded line 9798 is absent from this listing; presumably
 // it declares `StoreOffsetVec`. The pairs are emplaced below as
 // (offset, index in StoresVec) - confirm against the declaration.
9799 StoreInst *S0 = StoresVec[0];
9800 StoreOffsetVec.emplace_back(0, 0);
9801 Type *S0Ty = S0->getValueOperand()->getType();
9802 Value *S0Ptr = S0->getPointerOperand();
9803 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
9804 StoreInst *SI = StoresVec[Idx];
9805 std::optional<int64_t> Diff =
9806 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
9807 SI->getPointerOperand(), *DL, *SE,
9808 /*StrictCheck=*/true);
 // NOTE(review): Diff is dereferenced without a check. This relies on the
 // caller passing only stores whose distance to the first store is
 // computable (collectUserStores() drops stores for which getPointersDiff()
 // against the group's first store fails) - confirm for any new caller.
9809 StoreOffsetVec.emplace_back(*Diff, Idx);
9810 }
9811
9812 // Check if the stores are consecutive by checking if their difference is 1.
9813 if (StoreOffsetVec.size() != StoresVec.size())
9814 return false;
9815 sort(StoreOffsetVec, llvm::less_first());
9816 unsigned Idx = 0;
9817 int64_t PrevDist = 0;
9818 for (const auto &P : StoreOffsetVec) {
9819 if (Idx > 0 && P.first != PrevDist + 1)
9820 return false;
9821 PrevDist = P.first;
9822 ++Idx;
9823 }
9824
9825 // Calculate the shuffle indices according to their offset against the sorted
9826 // StoreOffsetVec.
9827 ReorderIndices.assign(StoresVec.size(), 0);
9828 bool IsIdentity = true;
9829 for (auto [I, P] : enumerate(StoreOffsetVec)) {
9830 ReorderIndices[P.second] = I;
9831 IsIdentity &= P.second == I;
9832 }
9833 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9834 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9835 // same convention here.
9836 if (IsIdentity)
9837 ReorderIndices.clear();
9838
9839 return true;
9840}
9841
// Debug-only helper (compiled only when NDEBUG is not defined) that prints
// every index of `Order` to dbgs(), separated by ", ", followed by a newline.
9842#ifndef NDEBUG
// NOTE(review): the signature line of this helper (embedded line 9843) is
// absent from this listing, so its name and the exact type of `Order` are
// not visible here.
9844 for (unsigned Idx : Order)
9845 dbgs() << Idx << ", ";
9846 dbgs() << "\n";
9847}
9848#endif
9849
// NOTE(review): the return-type line of this definition (embedded line 9850)
// is absent from this listing; the function returns `ExternalReorderIndices`,
// a SmallVector<OrdersType, 1>.
//
// For every group of user stores of TE (see collectUserStores()) that has
// exactly one store per lane and can form a consecutive vector store,
// computes the reorder indices of that group. Returns the collected orders;
// an empty order in the result stands for the identity order.
9851BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9852 unsigned NumLanes = TE->Scalars.size();
9853
9854 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9855
9856 // Holds the reorder indices for each candidate store vector that is a user of
9857 // the current TreeEntry.
9858 SmallVector<OrdersType, 1> ExternalReorderIndices;
9859
9860 // Now inspect the stores collected per pointer and look for vectorization
9861 // candidates. For each candidate calculate the reorder index vector and push
9862 // it into `ExternalReorderIndices`
9863 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9864 // If the number of stores differs from NumLanes, then we can't form a
9865 // vector.
9865 if (StoresVec.size() != NumLanes)
9866 continue;
9867
9868 // If the stores are not consecutive then abandon this StoresVec.
9869 OrdersType ReorderIndices;
9870 if (!canFormVector(StoresVec, ReorderIndices))
9871 continue;
9872
9873 // We now know that the scalars in StoresVec can form a vector instruction,
9874 // so set the reorder indices.
9875 ExternalReorderIndices.push_back(ReorderIndices);
9876 }
9877 return ExternalReorderIndices;
9878}
9879
// NOTE(review): the first line of this definition (embedded line 9880, with
// the function name and presumably the `Roots` parameter) is absent from
// this listing.
//
// Discards the current tree, records UserIgnoreLst as the active ignore list
// and builds a new tree starting from Roots. Nothing is built if the roots do
// not all have the same type.
9881 const SmallDenseSet<Value *> &UserIgnoreLst) {
9882 deleteTree();
9883 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9884 "TreeEntryToStridedPtrInfoMap is not cleared");
 // Only the address is stored; UserIgnoreLst must outlive its use through
 // UserIgnoreList.
9885 UserIgnoreList = &UserIgnoreLst;
9886 if (!allSameType(Roots))
9887 return;
 // Depth 0 with a default-constructed EdgeInfo, i.e. no user edge.
9888 buildTreeRec(Roots, 0, EdgeInfo());
9889}
9890
// NOTE(review): the signature line of this definition (embedded line 9891,
// with the function name and presumably the `Roots` parameter) is absent from
// this listing.
//
// Discards the current tree and builds a new one starting from Roots. Unlike
// the variant above, this one does not assign UserIgnoreList. Nothing is
// built if the roots do not all have the same type.
9892 deleteTree();
9893 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9894 "TreeEntryToStridedPtrInfoMap is not cleared");
9895 if (!allSameType(Roots))
9896 return;
 // Depth 0 with a default-constructed EdgeInfo, i.e. no user edge.
9897 buildTreeRec(Roots, 0, EdgeInfo());
9898}
9899
9900/// Tries to find subvector of loads and builds new vector of only loads if can
9901/// be profitable.
///
/// The simple, not yet vectorized and not deleted loads of \p VL are first
/// clustered by computable pointer distance to the first load of a cluster.
/// Each cluster is then merged into matching groups of \p GatheredLoads, and
/// the loads that were neither added nor found repeated are appended as new
/// groups.
///
/// NOTE(review): several lines of this definition, including the line with
/// the function name (embedded line 9902) and a parameter line (9904), are
/// absent from this listing; the gaps are marked inline below.
9903 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9905 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9906 bool AddNew = true) {
9907 if (VL.empty())
9908 return;
9909 Type *ScalarTy = getValueType(VL.front());
9910 if (!isValidElementType(ScalarTy))
9911 return;
 // NOTE(review): embedded line 9912 is absent from this listing; presumably
 // it declares `ClusteredLoads` (clusters of (load, distance) pairs, kept in
 // step with ClusteredDistToLoad) - confirm.
9913 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9914 for (Value *V : VL) {
9915 auto *LI = dyn_cast<LoadInst>(V);
9916 if (!LI)
9917 continue;
9918 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9919 continue;
9920 bool IsFound = false;
 // Try to place the load into an existing cluster: its distance to the
 // cluster's first load must be computable and not already taken by a
 // different load.
9921 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9922 assert(LI->getParent() == Data.front().first->getParent() &&
9923 LI->getType() == Data.front().first->getType() &&
9924 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9925 getUnderlyingObject(Data.front().first->getPointerOperand(),
 // NOTE(review): embedded line 9926 is absent from this listing (the
 // remaining argument of the call above).
9927 "Expected loads with the same type, same parent and same "
9928 "underlying pointer.");
9929 std::optional<int64_t> Dist = getPointersDiff(
9930 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9931 Data.front().first->getPointerOperand(), DL, SE,
9932 /*StrictCheck=*/true);
9933 if (!Dist)
9934 continue;
9935 auto It = Map.find(*Dist);
9936 if (It != Map.end() && It->second != LI)
9937 continue;
9938 if (It == Map.end()) {
9939 Data.emplace_back(LI, *Dist);
9940 Map.try_emplace(*Dist, LI);
9941 }
9942 IsFound = true;
9943 break;
9944 }
 // No suitable cluster: start a new one with this load at distance 0.
9945 if (!IsFound) {
9946 ClusteredLoads.emplace_back().emplace_back(LI, 0);
9947 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
9948 }
9949 }
 // Searches GatheredLoads, starting at index Start, for a group the given
 // loads can be merged into. On success returns an iterator to the group,
 // sets Offset to the distance between the two first loads, advances Start
 // past the group and leaves the indices of the loads to add in ToAdd;
 // otherwise returns GatheredLoads.end().
9950 auto FindMatchingLoads =
 // NOTE(review): embedded lines 9951-9952 and 9954 are absent from this
 // listing, so most of the lambda's parameter list (`Loads`, `ToAdd`,
 // `Repeated`) is not visible.
9953 &GatheredLoads,
9955 int64_t &Offset, unsigned &Start) {
9956 if (Loads.empty())
9957 return GatheredLoads.end();
9958 LoadInst *LI = Loads.front().first;
9959 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
9960 if (Idx < Start)
9961 continue;
9962 ToAdd.clear();
9963 if (LI->getParent() != Data.front().first->getParent() ||
9964 LI->getType() != Data.front().first->getType())
9965 continue;
9966 std::optional<int64_t> Dist =
 // NOTE(review): embedded line 9967 is absent from this listing (the start
 // of the call whose arguments continue on the next line).
9968 Data.front().first->getType(),
9969 Data.front().first->getPointerOperand(), DL, SE,
9970 /*StrictCheck=*/true);
9971 if (!Dist)
9972 continue;
9973 SmallSet<int64_t, 4> DataDists;
 // NOTE(review): embedded line 9974 is absent from this listing; presumably
 // it declares `DataLoads` - confirm.
9975 for (std::pair<LoadInst *, int64_t> P : Data) {
9976 DataDists.insert(P.second);
9977 DataLoads.insert(P.first);
9978 }
9979 // Found matching gathered loads - check if all loads are unique or
9980 // can be effectively vectorized.
9981 unsigned NumUniques = 0;
9982 for (auto [Cnt, Pair] : enumerate(Loads)) {
9983 bool Used = DataLoads.contains(Pair.first);
9984 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9985 ++NumUniques;
9986 ToAdd.insert(Cnt);
9987 } else if (Used) {
9988 Repeated.insert(Cnt);
9989 }
9990 }
9991 if (NumUniques > 0 &&
9992 (Loads.size() == NumUniques ||
9993 (Loads.size() - NumUniques >= 2 &&
9994 Loads.size() - NumUniques >= Loads.size() / 2 &&
9995 (has_single_bit(Data.size() + NumUniques) ||
9996 bit_ceil(Data.size()) <
9997 bit_ceil(Data.size() + NumUniques))))) {
9998 Offset = *Dist;
9999 Start = Idx + 1;
10000 return std::next(GatheredLoads.begin(), Idx);
10001 }
10002 }
10003 ToAdd.clear();
10004 return GatheredLoads.end();
10005 };
10006 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
10007 unsigned Start = 0;
10008 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
10009 int64_t Offset = 0;
10010 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
10011 Offset, Start);
 // Merge the cluster into every matching group, rebasing the distances by
 // the offset between the group's and the cluster's first load.
10012 while (It != GatheredLoads.end()) {
10013 assert(!LocalToAdd.empty() && "Expected some elements to add.");
10014 for (unsigned Idx : LocalToAdd)
10015 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
10016 ToAdd.insert_range(LocalToAdd);
10017 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
10018 Start);
10019 }
 // Some loads of the cluster were neither added to a group nor found
 // repeated in one.
10020 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
10021 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
10022 })) {
 // Appends the not yet placed loads of the cluster to `Loads`.
10023 auto AddNewLoads =
 // NOTE(review): embedded line 10024 is absent from this listing (the
 // lambda's capture list and its `Loads` parameter).
10025 for (unsigned Idx : seq<unsigned>(Data.size())) {
10026 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
10027 continue;
10028 Loads.push_back(Data[Idx]);
10029 }
10030 };
10031 if (!AddNew) {
 // Add the remaining loads to every existing group with the same parent
 // block and type.
10032 LoadInst *LI = Data.front().first;
10033 It = find_if(
10034 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
10035 return PD.front().first->getParent() == LI->getParent() &&
10036 PD.front().first->getType() == LI->getType();
10037 });
10038 while (It != GatheredLoads.end()) {
10039 AddNewLoads(*It);
10040 It = std::find_if(
10041 std::next(It), GatheredLoads.end(),
10042 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
10043 return PD.front().first->getParent() == LI->getParent() &&
10044 PD.front().first->getType() == LI->getType();
10045 });
10046 }
10047 }
 // Append the whole cluster as a new group, followed by a second new group
 // holding only the loads that were not placed above.
10048 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
10049 AddNewLoads(GatheredLoads.emplace_back());
10050 }
10051 }
10052}
10053
/// Attempts to build vector tree entries for loads that were previously left
/// as gathers. \p GatheredLoads maps a (BasicBlock, Value, Type) key to groups
/// of (load, distance) pairs. The tree size on entry is remembered in
/// GatheredLoadsEntriesFirst and is reset at the end if no new entries were
/// added to VectorizableTree.
// NOTE(review): the embedded line numbering of this listing skips 10086, 10114,
// 10207, 10230, 10239, 10243, 10276, 10282, 10288, 10375, 10405, 10408 and
// 10482, so the statements around those points are incomplete here (for
// example, no declaration of `Results`, `Loads` or `LocalLoadsDists` is
// visible) - confirm against the upstream file before editing.
10054void BoUpSLP::tryToVectorizeGatheredLoads(
10055 const SmallMapVector<
10056 std::tuple<BasicBlock *, Value *, Type *>,
10057 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
10058 &GatheredLoads) {
10059 GatheredLoadsEntriesFirst = VectorizableTree.size();
10060
// One set per postponed load entry, filled with that entry's scalars; used
// below for membership and subset checks against candidate slices.
10061 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
10062 LoadEntriesToVectorize.size());
10063 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
10064 Set.insert_range(VectorizableTree[Idx]->Scalars);
10065
10066 // Sort loads by distance.
// The comparator orders pairs by descending distance (larger .second first).
10067 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
10068 const std::pair<LoadInst *, int64_t> &L2) {
10069 return L1.second > L2.second;
10070 };
10071
// Returns true if TTI reports a masked gather of the widened type of \p Loads,
// at their common alignment, as legal and not forced to be scalarized.
10072 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
10073 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
10074 Loads.size());
10075 Align Alignment = computeCommonAlignment<LoadInst>(Values);
10076 auto *Ty = cast<VectorType>(
10077 getWidenedType(Loads.front()->getType(), Loads.size()));
10078 return TTI->isLegalMaskedGather(Ty, Alignment) &&
10079 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
10080 };
10081
// Walks \p Loads with each candidate vector factor (derived from \p MaxVF) and
// records the slices accepted by canVectorizeLoads() in Results and in
// \p VectorizedLoads. Loads that end up in no accepted slice are appended to
// \p NonVectorized. When \p Final is set, only the smallest candidate factor
// (CandidateVFs.back()) is tried.
10082 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
10083 BoUpSLP::ValueSet &VectorizedLoads,
10084 SmallVectorImpl<LoadInst *> &NonVectorized,
10085 bool Final, unsigned MaxVF) {
10087 unsigned StartIdx = 0;
10088 SmallVector<int> CandidateVFs;
10089 if (isAllowedNonPowerOf2VF(MaxVF))
10090 CandidateVFs.push_back(MaxVF);
10091 for (int NumElts = getFloorFullVectorNumberOfElements(
10092 *TTI, Loads.front()->getType(), MaxVF);
10093 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
10094 *TTI, Loads.front()->getType(), NumElts - 1)) {
10095 CandidateVFs.push_back(NumElts);
10096 if (VectorizeNonPowerOf2 && NumElts > 2)
10097 CandidateVFs.push_back(NumElts - 1);
10098 }
10099
10100 if (Final && CandidateVFs.empty())
10101 return Results;
10102
10103 unsigned BestVF = Final ? CandidateVFs.back() : 0;
10104 for (unsigned NumElts : CandidateVFs) {
10105 if (Final && NumElts > BestVF)
10106 continue;
10107 SmallVector<unsigned> MaskedGatherVectorized;
10108 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
10109 ++Cnt) {
10110 ArrayRef<LoadInst *> Slice =
10111 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
10112 if (VectorizedLoads.count(Slice.front()) ||
10113 VectorizedLoads.count(Slice.back()) ||
10115 continue;
10116 // Check if it is profitable to try vectorizing gathered loads. It is
10117 // profitable if we have more than 3 consecutive loads or if we have
10118 // less but all users are vectorized or deleted.
10119 bool AllowToVectorize = false;
10120 // Check if it is profitable to vectorize 2-elements loads.
10121 if (NumElts == 2) {
10122 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
10123 Slice.front()->getType(), ElementCount::getFixed(NumElts));
10124 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
10125 for (LoadInst *LI : Slice) {
10126 // If single use/user - allow to vectorize.
10127 if (LI->hasOneUse())
10128 continue;
10129 // 1. Check if number of uses equals number of users.
10130 // 2. All users are deleted.
10131 // 3. The load broadcasts are not allowed or the load is not
10132 // broadcasted.
10133 if (static_cast<unsigned int>(std::distance(
10134 LI->user_begin(), LI->user_end())) != LI->getNumUses())
10135 return false;
10136 if (!IsLegalBroadcastLoad)
10137 continue;
10138 if (LI->hasNUsesOrMore(UsesLimit))
10139 return false;
10140 for (User *U : LI->users()) {
10141 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
10142 continue;
10143 for (const TreeEntry *UTE : getTreeEntries(U)) {
10144 for (int I : seq<int>(UTE->getNumOperands())) {
10145 if (all_of(UTE->getOperand(I), [LI](Value *V) {
10146 return V == LI || isa<PoisonValue>(V);
10147 }))
10148 // Found legal broadcast - do not vectorize.
10149 return false;
10150 }
10151 }
10152 }
10153 }
10154 return true;
10155 };
10156 AllowToVectorize = CheckIfAllowed(Slice);
10157 } else {
10158 AllowToVectorize =
10159 NumElts >= 3 ||
10160 any_of(ValueToGatherNodes.at(Slice.front()),
10161 [=](const TreeEntry *TE) {
10162 return TE->Scalars.size() == 2 &&
10163 ((TE->Scalars.front() == Slice.front() &&
10164 TE->Scalars.back() == Slice.back()) ||
10165 (TE->Scalars.front() == Slice.back() &&
10166 TE->Scalars.back() == Slice.front()));
10167 });
10168 }
10169 if (AllowToVectorize) {
10170 SmallVector<Value *> PointerOps;
10171 OrdersType CurrentOrder;
10172 // Try to build vector load.
10173 ArrayRef<Value *> Values(
10174 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
10175 StridedPtrInfo SPtrInfo;
10176 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
10177 PointerOps, SPtrInfo, &BestVF);
10178 if (LS != LoadsState::Gather ||
10179 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
// Scatter candidates are only remembered here; they are committed after the
// scan of this vector factor finishes (see the loop over
// MaskedGatherVectorized below).
10180 if (LS == LoadsState::ScatterVectorize) {
10181 if (MaskedGatherVectorized.empty() ||
10182 Cnt >= MaskedGatherVectorized.back() + NumElts)
10183 MaskedGatherVectorized.push_back(Cnt);
10184 continue;
10185 }
10186 if (LS != LoadsState::Gather) {
10187 Results.emplace_back(Values, LS);
10188 VectorizedLoads.insert_range(Slice);
10189 // If we vectorized initial block, no need to try to vectorize it
10190 // again.
10191 if (Cnt == StartIdx)
10192 StartIdx += NumElts;
10193 }
10194 // Check if the whole array was vectorized already - exit.
10195 if (StartIdx >= Loads.size())
10196 break;
10197 // Erase last masked gather candidate, if another candidate within
10198 // the range is found to be better.
10199 if (!MaskedGatherVectorized.empty() &&
10200 Cnt < MaskedGatherVectorized.back() + NumElts)
10201 MaskedGatherVectorized.pop_back();
10202 Cnt += NumElts - 1;
10203 continue;
10204 }
10205 }
10206 if (!AllowToVectorize || BestVF == 0)
10208 }
10209 // Mark masked gathers candidates as vectorized, if any.
10210 for (unsigned Cnt : MaskedGatherVectorized) {
10211 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
10212 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
10213 ArrayRef<Value *> Values(
10214 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
10215 Results.emplace_back(Values, LoadsState::ScatterVectorize);
10216 VectorizedLoads.insert_range(Slice);
10217 // If we vectorized initial block, no need to try to vectorize it again.
10218 if (Cnt == StartIdx)
10219 StartIdx += NumElts;
10220 }
10221 }
10222 for (LoadInst *LI : Loads) {
10223 if (!VectorizedLoads.contains(LI))
10224 NonVectorized.push_back(LI);
10225 }
10226 return Results;
10227 };
// For every group of (load, distance) pairs: sorts a local copy by distance,
// collects loads into `Loads` while tracking the longest consecutive run, asks
// GetVectorizedRanges for candidate slices and calls buildTreeRec() on
// sub-slices of them. Returns the loads that were left non-vectorized.
10228 auto ProcessGatheredLoads =
10229 [&, &TTI = *TTI](
10231 bool Final = false) {
10232 SmallVector<LoadInst *> NonVectorized;
10233 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
10234 GatheredLoads) {
10235 if (LoadsDists.size() <= 1) {
10236 NonVectorized.push_back(LoadsDists.back().first);
10237 continue;
10238 }
10240 LoadsDists);
10241 SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
10242 stable_sort(LocalLoadsDists, LoadSorter);
10244 unsigned MaxConsecutiveDistance = 0;
10245 unsigned CurrentConsecutiveDist = 1;
10246 int64_t LastDist = LocalLoadsDists.front().second;
10247 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
10248 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
10249 if (isVectorized(L.first))
10250 continue;
10251 assert(LastDist >= L.second &&
10252 "Expected first distance always not less than second");
10253 if (static_cast<uint64_t>(LastDist - L.second) ==
10254 CurrentConsecutiveDist) {
10255 ++CurrentConsecutiveDist;
10256 MaxConsecutiveDistance =
10257 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
10258 Loads.push_back(L.first);
10259 continue;
10260 }
// The run is broken: without masked gather support, a run consisting of a
// single load is dropped before a new run is started at L.
10261 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
10262 !Loads.empty())
10263 Loads.pop_back();
10264 CurrentConsecutiveDist = 1;
10265 LastDist = L.second;
10266 Loads.push_back(L.first);
10267 }
10268 if (Loads.size() <= 1)
10269 continue;
10270 if (AllowMaskedGather)
10271 MaxConsecutiveDistance = Loads.size();
10272 else if (MaxConsecutiveDistance < 2)
10273 continue;
10274 BoUpSLP::ValueSet VectorizedLoads;
10275 SmallVector<LoadInst *> SortedNonVectorized;
10277 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
10278 Final, MaxConsecutiveDistance);
10279 if (!Results.empty() && !SortedNonVectorized.empty() &&
10280 OriginalLoads.size() == Loads.size() &&
10281 MaxConsecutiveDistance == Loads.size() &&
10283 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
10284 return P.second == LoadsState::ScatterVectorize;
10285 })) {
// Retry on the loads in their original (unsorted) order and keep that result
// if it leaves no more loads non-vectorized than the sorted attempt did.
10286 VectorizedLoads.clear();
10287 SmallVector<LoadInst *> UnsortedNonVectorized;
10289 UnsortedResults =
10290 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
10291 UnsortedNonVectorized, Final,
10292 OriginalLoads.size());
10293 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
10294 SortedNonVectorized.swap(UnsortedNonVectorized);
10295 Results.swap(UnsortedResults);
10296 }
10297 }
10298 for (auto [Slice, _] : Results) {
10299 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
10300 << Slice.size() << ")\n");
10301 if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
10302 for (Value *L : Slice)
10303 if (!isVectorized(L))
10304 SortedNonVectorized.push_back(cast<LoadInst>(L));
10305 continue;
10306 }
10307
10308 // Select maximum VF as a maximum of user gathered nodes and
10309 // distance between scalar loads in these nodes.
10310 unsigned MaxVF = Slice.size();
10311 unsigned UserMaxVF = 0;
10312 unsigned InterleaveFactor = 0;
10313 if (MaxVF == 2) {
10314 UserMaxVF = MaxVF;
10315 } else {
10316 // Found distance between segments of the interleaved loads.
10317 std::optional<unsigned> InterleavedLoadsDistance = 0;
10318 unsigned Order = 0;
10319 std::optional<unsigned> CommonVF = 0;
10320 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
10321 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
10322 for (auto [Idx, V] : enumerate(Slice)) {
10323 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
10324 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
10325 unsigned Pos =
10326 EntryToPosition.try_emplace(E, Idx).first->second;
10327 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
10328 if (CommonVF) {
10329 if (*CommonVF == 0) {
10330 CommonVF = E->Scalars.size();
10331 continue;
10332 }
10333 if (*CommonVF != E->Scalars.size())
10334 CommonVF.reset();
10335 }
10336 // Check if the load is the part of the interleaved load.
10337 if (Pos != Idx && InterleavedLoadsDistance) {
10338 if (!DeinterleavedNodes.contains(E) &&
10339 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
10340 if (isa<Constant>(V))
10341 return false;
10342 if (isVectorized(V))
10343 return true;
10344 const auto &Nodes = ValueToGatherNodes.at(V);
10345 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
10346 !is_contained(Slice, V);
10347 })) {
10348 InterleavedLoadsDistance.reset();
10349 continue;
10350 }
10351 DeinterleavedNodes.insert(E);
10352 if (*InterleavedLoadsDistance == 0) {
10353 InterleavedLoadsDistance = Idx - Pos;
10354 continue;
10355 }
10356 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
10357 (Idx - Pos) / *InterleavedLoadsDistance < Order)
10358 InterleavedLoadsDistance.reset();
10359 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
10360 }
10361 }
10362 }
10363 DeinterleavedNodes.clear();
10364 // Check if the large load represents interleaved load operation.
10365 if (InterleavedLoadsDistance.value_or(0) > 1 &&
10366 CommonVF.value_or(0) != 0) {
10367 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
10368 unsigned VF = *CommonVF;
10369 OrdersType Order;
10370 SmallVector<Value *> PointerOps;
10371 StridedPtrInfo SPtrInfo;
10372 // Segmented load detected - vectorize at maximum vector factor.
10373 if (InterleaveFactor <= Slice.size() &&
10374 TTI.isLegalInterleavedAccessType(
10376 getWidenedType(Slice.front()->getType(), VF)),
10377 InterleaveFactor,
10378 cast<LoadInst>(Slice.front())->getAlign(),
10379 cast<LoadInst>(Slice.front())
10380 ->getPointerAddressSpace()) &&
10381 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
10382 SPtrInfo) == LoadsState::Vectorize) {
10383 UserMaxVF = InterleaveFactor * VF;
10384 } else {
10385 InterleaveFactor = 0;
10386 }
10387 }
10388 // Cannot represent the loads as consecutive vectorizable nodes -
10389 // just exit.
10390 unsigned ConsecutiveNodesSize = 0;
10391 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
10392 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10393 [&, Slice = Slice](const auto &P) {
10394 const auto *It = find_if(Slice, [&](Value *V) {
10395 return std::get<1>(P).contains(V);
10396 });
10397 if (It == Slice.end())
10398 return false;
10399 const TreeEntry &TE =
10400 *VectorizableTree[std::get<0>(P)];
10401 ArrayRef<Value *> VL = TE.Scalars;
10402 OrdersType Order;
10403 SmallVector<Value *> PointerOps;
10404 StridedPtrInfo SPtrInfo;
10406 VL, VL.front(), Order, PointerOps, SPtrInfo);
10407 if (State == LoadsState::ScatterVectorize ||
10409 return false;
10410 ConsecutiveNodesSize += VL.size();
10411 size_t Start = std::distance(Slice.begin(), It);
10412 size_t Sz = Slice.size() - Start;
10413 return Sz < VL.size() ||
10414 Slice.slice(Start, VL.size()) != VL;
10415 }))
10416 continue;
10417 // Try to build long masked gather loads.
10418 UserMaxVF = bit_ceil(UserMaxVF);
10419 if (InterleaveFactor == 0 &&
10420 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
10421 [&, Slice = Slice](unsigned Idx) {
10422 OrdersType Order;
10423 SmallVector<Value *> PointerOps;
10424 StridedPtrInfo SPtrInfo;
10425 return canVectorizeLoads(
10426 Slice.slice(Idx * UserMaxVF, UserMaxVF),
10427 Slice[Idx * UserMaxVF], Order, PointerOps,
10428 SPtrInfo) == LoadsState::ScatterVectorize;
10429 }))
10430 UserMaxVF = MaxVF;
10431 if (Slice.size() != ConsecutiveNodesSize)
10432 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
10433 }
// Try building tree entries at VF = MaxVF, halving VF after every pass in
// which some sub-slice did not produce a new tree entry.
10434 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10435 bool IsVectorized = true;
10436 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
10437 ArrayRef<Value *> SubSlice =
10438 Slice.slice(I, std::min(VF, E - I));
10439 if (isVectorized(SubSlice.front()))
10440 continue;
10441 // Check if the subslice is to be-vectorized entry, which is not
10442 // equal to entry.
10443 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10444 [&](const auto &P) {
10445 return !SubSlice.equals(
10446 VectorizableTree[std::get<0>(P)]
10447 ->Scalars) &&
10448 set_is_subset(SubSlice, std::get<1>(P));
10449 }))
10450 continue;
10451 unsigned Sz = VectorizableTree.size();
10452 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
10453 if (Sz == VectorizableTree.size()) {
10454 IsVectorized = false;
10455 // Try non-interleaved vectorization with smaller vector
10456 // factor.
10457 if (InterleaveFactor > 0) {
10458 VF = 2 * (MaxVF / InterleaveFactor);
10459 InterleaveFactor = 0;
10460 }
10461 continue;
10462 }
10463 }
10464 if (IsVectorized)
10465 break;
10466 }
10467 }
10468 NonVectorized.append(SortedNonVectorized);
10469 }
10470 return NonVectorized;
10471 };
// Process each keyed group. If only part of a group's loads were left
// non-vectorized and a masked gather is supported for those leftovers, regroup
// them via gatherPossiblyVectorizableLoads() and make one final attempt.
10472 for (const auto &GLs : GatheredLoads) {
10473 const auto &Ref = GLs.second;
10474 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
10475 if (!Ref.empty() && !NonVectorized.empty() &&
10476 accumulate(
10477 Ref, 0u,
10478 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10479 -> unsigned { return S + LoadsDists.size(); }) !=
10480 NonVectorized.size() &&
10481 IsMaskedGatherSupported(NonVectorized)) {
10483 FinalGatheredLoads;
10484 for (LoadInst *LI : NonVectorized) {
10485 // Reinsert non-vectorized loads to other list of loads with the same
10486 // base pointers.
10487 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
10488 FinalGatheredLoads,
10489 /*AddNew=*/false);
10490 }
10491 // Final attempt to vectorize non-vectorized loads.
10492 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
10493 }
10494 }
10495 // Try to vectorize postponed load entries, previously marked as gathered.
10496 for (unsigned Idx : LoadEntriesToVectorize) {
10497 const TreeEntry &E = *VectorizableTree[Idx];
10498 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
10499 // Avoid reordering, if possible.
10500 if (!E.ReorderIndices.empty()) {
10501 // Build a mask out of the reorder indices and reorder scalars per this
10502 // mask.
10503 SmallVector<int> ReorderMask;
10504 inversePermutation(E.ReorderIndices, ReorderMask);
10505 reorderScalars(GatheredScalars, ReorderMask);
10506 }
10507 buildTreeRec(GatheredScalars, 0, EdgeInfo());
10508 }
10509 // If no new entries created, consider it as no gathered loads entries must be
10510 // handled.
10511 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10512 VectorizableTree.size())
10513 GatheredLoadsEntriesFirst.reset();
10514}
10515
10516/// Generates key/subkey pair for the given value to provide effective sorting
10517/// of the values and better detection of the vectorizable values sequences. The
10518/// keys/subkeys can be used for better sorting of the values themselves (keys)
10519/// and in values subgroups (subkeys).
/// \p LoadsSubkeyGenerator is called for simple loads to produce their subkey.
/// \p AllowAlternate chooses, in the alternation-capable branch, between a key
/// built only from "is a BinaryOperator" and a key that mixes in the opcode.
/// \returns the pair {Key, SubKey}.
// NOTE(review): the embedded line numbering of this listing skips 10535, 10545,
// 10553, 10568, 10573, 10574 and 10580, so some conditions and declarations
// (e.g. `SwapPred` and `ID`) are not visible here - confirm against the
// upstream file before editing.
10520static std::pair<size_t, size_t> generateKeySubkey(
10521 Value *V, const TargetLibraryInfo *TLI,
10522 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
10523 bool AllowAlternate) {
10524 hash_code Key = hash_value(V->getValueID() + 2);
10525 hash_code SubKey = hash_value(0);
10526 // Sort the loads by the distance between the pointers.
10527 if (auto *LI = dyn_cast<LoadInst>(V)) {
10528 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
// Simple loads take their subkey from the callback; any other load is keyed
// and subkeyed by the load instruction itself.
10529 if (LI->isSimple())
10530 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
10531 else
10532 Key = SubKey = hash_value(LI);
10533 } else if (isVectorLikeInstWithConstOps(V)) {
10534 // Sort extracts by the vector operands.
10536 Key = hash_value(Value::UndefValueVal + 1);
10537 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
10538 if (!isUndefVector(EI->getVectorOperand()).all() &&
10539 !isa<UndefValue>(EI->getIndexOperand()))
10540 SubKey = hash_value(EI->getVectorOperand());
10541 }
10542 } else if (auto *I = dyn_cast<Instruction>(V)) {
10543 // Sort other instructions just by the opcodes except for CMPInst.
10544 // For CMP also sort by the predicate kind.
10546 isValidForAlternation(I->getOpcode())) {
10547 if (AllowAlternate)
10548 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
10549 else
10550 Key = hash_combine(hash_value(I->getOpcode()), Key);
10551 SubKey = hash_combine(
10552 hash_value(I->getOpcode()), hash_value(I->getType()),
10554 ? I->getType()
10555 : cast<CastInst>(I)->getOperand(0)->getType()));
10556 // For casts, look through the only operand to improve compile time.
10557 if (isa<CastInst>(I)) {
10558 std::pair<size_t, size_t> OpVals =
10559 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
10560 /*AllowAlternate=*/true);
10561 Key = hash_combine(OpVals.first, Key);
10562 SubKey = hash_combine(OpVals.first, SubKey);
10563 }
10564 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
10565 CmpInst::Predicate Pred = CI->getPredicate();
10566 if (CI->isCommutative())
10567 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
10569 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
10570 hash_value(SwapPred),
10571 hash_value(CI->getOperand(0)->getType()));
10572 } else if (auto *Call = dyn_cast<CallInst>(I)) {
10575 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
10576 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
10577 SubKey = hash_combine(hash_value(I->getOpcode()),
10578 hash_value(Call->getCalledFunction()));
10579 } else {
10581 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
10582 }
// Operand bundles take part in the subkey of every call.
10583 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
10584 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
10585 hash_value(Op.Tag), SubKey);
10586 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
10587 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
10588 SubKey = hash_value(Gep->getPointerOperand());
10589 else
10590 SubKey = hash_value(Gep);
10591 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
10592 !isa<ConstantInt>(I->getOperand(1))) {
10593 // Do not try to vectorize instructions with potentially high cost.
10594 SubKey = hash_value(I);
10595 } else {
10596 SubKey = hash_value(I->getOpcode());
10597 }
// Every instruction handled by this branch also mixes the number of its parent
// block into the key.
10598 Key = hash_combine(hash_value(I->getParent()->getNumber()), Key);
10599 }
10600 return std::make_pair(Key, SubKey);
10601}
10602
10603/// Checks if the specified instruction \p I is a main operation for the given
10604/// \p MainOp and \p AltOp instructions.
// Forward declaration only; the definition is not part of this section.
10605static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10606 Instruction *AltOp, const TargetLibraryInfo &TLI);
10607
10608/// Builds the arguments types vector for the given call instruction with the
10609/// given \p ID for the specified vector factor.
/// One type is pushed per call argument. In the visible code that is either the
/// argument's own type, an integer type of \p MinBW bits widened to \p VF
/// (when \p MinBW > 0), or the argument's type widened to \p VF.
/// \returns the collected argument types.
// NOTE(review): the embedded line numbering of this listing skips 10610-10611
// (the start of the signature: return type, function name and leading
// parameters) and 10616-10617 (the two conditions that open the nested blocks
// closed at 10620 and 10626). Presumably those conditions test \p ID and
// whether the argument stays scalar for the intrinsic - confirm against the
// upstream file.
10612 const unsigned VF, unsigned MinBW,
10613 const TargetTransformInfo *TTI) {
10614 SmallVector<Type *> ArgTys;
10615 for (auto [Idx, Arg] : enumerate(CI->args())) {
10618 ArgTys.push_back(Arg->getType());
10619 continue;
10620 }
10621 if (MinBW > 0) {
10622 ArgTys.push_back(
10623 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
10624 continue;
10625 }
10626 }
// Default case: widen the argument's own type to VF lanes.
10627 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
10628 }
10629 return ArgTys;
10630}
10631
10632/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
10633/// function (if possible) calls. Returns invalid cost for the corresponding
10634/// calls, if they cannot be vectorized/will be scalarized.
/// \returns the pair {IntrinsicCost, LibCost}. LibCost stays invalid unless the
/// call is not marked nobuiltin and VFDatabase provides a vectorized function
/// for the computed shape.
// NOTE(review): the embedded line numbering of this listing skips 10636 (start
// of the parameter list, including the function name), 10639 (second argument
// of VFShape::get), 10649 (presumably the declaration of `ID`, which is used
// below without a visible definition) and 10662 (the statement guarded by the
// final `if`) - confirm against the upstream file.
10635static std::pair<InstructionCost, InstructionCost>
10637 TargetLibraryInfo *TLI, ArrayRef<Type *> ArgTys) {
10638 auto Shape = VFShape::get(CI->getFunctionType(),
10640 false /*HasGlobalPred*/);
10641 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10642 auto LibCost = InstructionCost::getInvalid();
10643 if (!CI->isNoBuiltin() && VecFunc) {
10644 // Calculate the cost of the vector library call.
10645 // If the corresponding vector call is cheaper, return its cost.
10646 LibCost =
10647 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
10648 }
10650
10651 // Calculate the cost of the vector intrinsic call.
10652 FastMathFlags FMF;
10653 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
10654 FMF = FPCI->getFastMathFlags();
// ScalarLimit stands in for LibCost, both in the cost attributes and in the
// comparison below, whenever no valid library cost is available.
10655 const InstructionCost ScalarLimit = 10000;
10656 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10657 LibCost.isValid() ? LibCost : ScalarLimit);
10658 auto IntrinsicCost =
10659 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
10660 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10661 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10663
10664 return {IntrinsicCost, LibCost};
10665}
10666
10667/// Find the innermost loop starting from \p L, for which at least a single
10668/// value in \p VL is not invariant.
/// Walks outwards from \p L through getParentLoop() while every value in \p VL
/// is a constant, a non-instruction, or loop-invariant in the current loop.
/// \returns the first loop for which that does not hold, or null if the walk
/// runs past the outermost loop.
// NOTE(review): the first line of the signature (embedded line 10669: return
// type, function name and the \p L parameter) is missing from this listing -
// confirm against the upstream file.
10670 ArrayRef<Value *> VL) {
10671 assert(L && "Expected valid loop");
10672 auto IsLoopInvariant = [&](const Loop *L, ArrayRef<Value *> VL) {
10673 return all_of(VL, [&](Value *V) {
10674 return isa<Constant>(V) || !isa<Instruction>(V) || L->isLoopInvariant(V);
10675 });
10676 };
10677 while (L && IsLoopInvariant(L, VL))
10678 L = L->getParentLoop();
10679 return L;
10680}
10681
10682/// Get the loop nest for the given loop.
10683ArrayRef<const Loop *> BoUpSLP::getLoopNest(const Loop *L) {
10684 assert(L && "Expected valid loop");
10685 if (LoopAwareTripCount == 0)
10686 return {};
10687 SmallVector<const Loop *> &Res =
10688 LoopToLoopNest.try_emplace(L).first->getSecond();
10689 if (!Res.empty())
10690 return Res;
10691 SmallVector<const Loop *> LoopNest;
10692 while (L) {
10693 LoopNest.push_back(L);
10694 L = L->getParentLoop();
10695 }
10696 Res.assign(LoopNest.rbegin(), LoopNest.rend());
10697 return Res;
10698}
10699
10700/// Detects an extractvalue bundle that can be widened by vectorizing the
10701/// underlying struct-returning calls.
10702///
10703/// \p VL is a bundle whose state \p S is Instruction::ExtractValue. The
10704/// bundle is acceptable for widening into one struct-of-vectors call only
10705/// when:
10706/// - every element of \p VL is an ExtractValueInst,
10707/// - every ExtractValueInst extracts the same struct field (its
10708/// getIndices() matches the main op's indices),
10709/// - the aggregate operands form a uniform set of CallInsts (per
10710/// getSameOpcode) that is not an alt-shuffle and whose return type is
10711/// a literal struct, and
10712/// - every user of every such call is itself an ExtractValueInst, so the
10713/// external-use extraction code can rebuild scalars via extractvalue +
10714/// extractelement without needing an insertvalue chain.
10715///
10716/// On success returns true and fills \p Indices with the common field
10717/// index path and \p Calls with the per-lane aggregate calls (in VL order),
10718/// for the caller to feed as the operand of the new tree entry. Otherwise
10719/// returns false and leaves the output parameters untouched.
// NOTE(review): the embedded line numbering of this listing skips 10720 (start
// of the signature, including the function name), 10723 (the \p Indices
// parameter), 10727 (the condition of the first early return), 10731
// (presumably the declaration of `Aggregates`) and 10748 (the last clause of
// the acceptance condition) - confirm against the upstream file.
10721 const InstructionsState &S,
10722 const TargetLibraryInfo &TLI,
10724 SmallVectorImpl<Value *> &Calls) {
10725 assert(S && S.getOpcode() == Instruction::ExtractValue &&
10726 "Expected extractvalue instruction state.");
10728 return false;
10729 auto *VL0 = cast<ExtractValueInst>(S.getMainOp());
10730 ArrayRef<unsigned> VL0Indices = VL0->getIndices();
// Collect the aggregate operand of every lane; reject lanes whose index path
// differs from the main op's or whose type is a scalable vector.
10732 for (Value *V : VL) {
10733 if (V == VL0) {
10734 Aggregates.push_back(VL0->getAggregateOperand());
10735 continue;
10736 }
10737 auto *IV = cast<ExtractValueInst>(V);
10738 if (IV->getIndices() != VL0Indices ||
10739 isa<ScalableVectorType>(IV->getType()))
10740 return false;
10741 Value *Agg = IV->getAggregateOperand();
10742 Aggregates.push_back(Agg);
10743 }
10744 const InstructionsState AggState = getSameOpcode(Aggregates, TLI);
10745 if (AggState && AggState.getOpcode() == Instruction::Call &&
10746 !AggState.isAltShuffle() &&
10747 isa<StructType>(AggState.getMainOp()->getType()) &&
10749 // The struct-returning call may have non-bundle users too. The external
10750 // extraction code rebuilds scalars by extractvalue + extractelement,
10751 // which only works when every user of the call is an ExtractValueInst.
10752 // Bail out if any aggregate has a different kind of user.
10753 for (Value *Agg : Aggregates) {
10754 if (!all_of(Agg->users(), IsaPred<ExtractValueInst>))
10755 return false;
10756 }
// Outputs are written only on this success path.
10757 Indices.assign(VL0Indices.begin(), VL0Indices.end());
10758 Calls.swap(Aggregates);
10759 return true;
10760 }
10761 return false;
10762}
10763
// NOTE(review): the signature of this function (embedded line 10764) is missing
// from this listing, so its name, parameter list and return type are not
// visible; the body reads a range named VL - confirm against the upstream file.
// Returns true when every value in VL either does not have a struct type or is
// used only by ExtractValueInst instructions.
10765 return all_of(VL, [](Value *V) {
10766 return !isa<StructType>(V->getType()) ||
10767 all_of(V->users(), IsaPred<ExtractValueInst>);
10768 });
10769}
10770
10771BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10772 const InstructionsState &S, ArrayRef<Value *> VL,
10773 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
10774 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10775 assert(S.getMainOp() &&
10776 "Expected instructions with same/alternate opcodes only.");
10777
10778 unsigned ShuffleOrOp =
10779 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10780 Instruction *VL0 = S.getMainOp();
10781 switch (ShuffleOrOp) {
10782 case Instruction::PHI: {
10783 // Too many operands - gather, most probably won't be vectorized.
10784 if (VL0->getNumOperands() > MaxPHINumOperands)
10785 return TreeEntry::NeedToGather;
10786 // Check for terminator values (e.g. invoke).
10787 for (Value *V : VL) {
10788 auto *PHI = dyn_cast<PHINode>(V);
10789 if (!PHI)
10790 continue;
10791 for (Value *Incoming : PHI->incoming_values()) {
10793 if (Term && Term->isTerminator()) {
10795 << "SLP: Need to swizzle PHINodes (terminator use).\n");
10796 return TreeEntry::NeedToGather;
10797 }
10798 }
10799 }
10800
10801 return TreeEntry::Vectorize;
10802 }
10803 case Instruction::ExtractElement:
10804 if (any_of(VL, [&](Value *V) {
10805 auto *EI = dyn_cast<ExtractElementInst>(V);
10806 if (!EI)
10807 return true;
10808 Value *Op = EI->getOperand(0);
10809 if (isVectorized(Op))
10810 return true;
10811 auto *OpI = dyn_cast<Instruction>(Op);
10812 return OpI && OpI->isTerminator();
10813 }))
10814 return TreeEntry::NeedToGather;
10815 [[fallthrough]];
10816 case Instruction::ExtractValue: {
10817 bool Reuse = canReuseExtract(VL, CurrentOrder);
10818 if (Reuse || !CurrentOrder.empty())
10819 return TreeEntry::Vectorize;
10820 SmallVector<unsigned> Indices;
10822 if (ShuffleOrOp == Instruction::ExtractValue &&
10823 checkEVsForVecCalls(VL, S, *TLI, Indices, Calls))
10824 return TreeEntry::Vectorize;
10825 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10826 return TreeEntry::NeedToGather;
10827 }
10828 case Instruction::InsertElement: {
10829 // Check that we have a buildvector and not a shuffle of 2 or more
10830 // different vectors.
10831 ValueSet SourceVectors;
10832 for (Value *V : VL) {
10833 if (isa<PoisonValue>(V)) {
10834 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10835 return TreeEntry::NeedToGather;
10836 }
10837 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
10838 assert(getElementIndex(V) != std::nullopt &&
10839 "Non-constant or undef index?");
10840 }
10841
10842 if (count_if(VL, [&SourceVectors](Value *V) {
10843 return !SourceVectors.contains(V);
10844 }) >= 2) {
10845 // Found 2nd source vector - cancel.
10846 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10847 "different source vectors.\n");
10848 return TreeEntry::NeedToGather;
10849 }
10850
10851 if (any_of(VL, [&SourceVectors](Value *V) {
10852 // The last InsertElement can have multiple uses.
10853 return SourceVectors.contains(V) && !V->hasOneUse();
10854 })) {
10855 assert(SLPReVec && "Only supported by REVEC.");
10856 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10857 "multiple uses.\n");
10858 return TreeEntry::NeedToGather;
10859 }
10860
10861 return TreeEntry::Vectorize;
10862 }
10863 case Instruction::Load: {
10864 // Check that a vectorized load would load the same memory as a scalar
10865 // load. For example, we don't want to vectorize loads that are smaller
10866 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10867 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10868 // from such a struct, we read/write packed bits disagreeing with the
10869 // unvectorized version.
10870 auto IsGatheredNode = [&]() {
10871 if (!GatheredLoadsEntriesFirst)
10872 return false;
10873 return all_of(VL, [&](Value *V) {
10874 if (isa<PoisonValue>(V))
10875 return true;
10876 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10877 return TE->Idx >= *GatheredLoadsEntriesFirst;
10878 });
10879 });
10880 };
10881 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
10883 return TreeEntry::Vectorize;
10885 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10886 // Delay slow vectorized nodes for better vectorization attempts.
10887 LoadEntriesToVectorize.insert(VectorizableTree.size());
10888 return TreeEntry::NeedToGather;
10889 }
10890 return IsGatheredNode() ? TreeEntry::NeedToGather
10891 : TreeEntry::CompressVectorize;
10893 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10894 // Delay slow vectorized nodes for better vectorization attempts.
10895 LoadEntriesToVectorize.insert(VectorizableTree.size());
10896 return TreeEntry::NeedToGather;
10897 }
10898 return IsGatheredNode() ? TreeEntry::NeedToGather
10899 : TreeEntry::ScatterVectorize;
10901 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10902 // Delay slow vectorized nodes for better vectorization attempts.
10903 LoadEntriesToVectorize.insert(VectorizableTree.size());
10904 return TreeEntry::NeedToGather;
10905 }
10906 return IsGatheredNode() ? TreeEntry::NeedToGather
10907 : TreeEntry::StridedVectorize;
10908 case LoadsState::Gather:
10909#ifndef NDEBUG
10910 Type *ScalarTy = VL0->getType();
10911 if (DL->getTypeSizeInBits(ScalarTy) !=
10912 DL->getTypeAllocSizeInBits(ScalarTy))
10913 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10914 else if (any_of(VL, [](Value *V) {
10915 auto *LI = dyn_cast<LoadInst>(V);
10916 return !LI || !LI->isSimple();
10917 }))
10918 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10919 else
10920 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10921#endif // NDEBUG
10923 return TreeEntry::NeedToGather;
10924 }
10925 llvm_unreachable("Unexpected state of loads");
10926 }
10927 case Instruction::ZExt:
10928 case Instruction::SExt:
10929 case Instruction::FPToUI:
10930 case Instruction::FPToSI:
10931 case Instruction::FPExt:
10932 case Instruction::PtrToInt:
10933 case Instruction::IntToPtr:
10934 case Instruction::SIToFP:
10935 case Instruction::UIToFP:
10936 case Instruction::Trunc:
10937 case Instruction::FPTrunc:
10938 case Instruction::BitCast: {
10939 Type *SrcTy = VL0->getOperand(0)->getType();
10940 for (Value *V : VL) {
10941 if (isa<PoisonValue>(V))
10942 continue;
10943 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
10944 if (Ty != SrcTy || !isValidElementType(Ty)) {
10945 LLVM_DEBUG(
10946 dbgs() << "SLP: Gathering casts with different src types.\n");
10947 return TreeEntry::NeedToGather;
10948 }
10949 }
10950 return TreeEntry::Vectorize;
10951 }
10952 case Instruction::ICmp:
10953 case Instruction::FCmp: {
10954 // Check that all of the compares have the same predicate.
10955 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
10957 Type *ComparedTy = VL0->getOperand(0)->getType();
10958 for (Value *V : VL) {
10959 if (isa<PoisonValue>(V))
10960 continue;
10961 auto *Cmp = cast<CmpInst>(V);
10962 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10963 Cmp->getOperand(0)->getType() != ComparedTy) {
10964 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10965 return TreeEntry::NeedToGather;
10966 }
10967 }
10968 return TreeEntry::Vectorize;
10969 }
10970 case Instruction::Select:
10971 if (SLPReVec) {
10972 SmallPtrSet<Type *, 4> CondTypes;
10973 for (Value *V : VL) {
10974 Value *Cond;
10975 if (!match(V, m_Select(m_Value(Cond), m_Value(), m_Value())) &&
10976 !match(V, m_ZExt(m_Value(Cond))))
10977 continue;
10978 CondTypes.insert(Cond->getType());
10979 }
10980 if (CondTypes.size() > 1) {
10981 LLVM_DEBUG(
10982 dbgs()
10983 << "SLP: Gathering select with different condition types.\n");
10984 return TreeEntry::NeedToGather;
10985 }
10986 }
10987 [[fallthrough]];
10988 case Instruction::FNeg:
10989 case Instruction::Add:
10990 case Instruction::FAdd:
10991 case Instruction::Sub:
10992 case Instruction::FSub:
10993 case Instruction::Mul:
10994 case Instruction::FMul:
10995 case Instruction::UDiv:
10996 case Instruction::SDiv:
10997 case Instruction::FDiv:
10998 case Instruction::URem:
10999 case Instruction::SRem:
11000 case Instruction::FRem:
11001 case Instruction::Shl:
11002 case Instruction::LShr:
11003 case Instruction::AShr:
11004 case Instruction::And:
11005 case Instruction::Or:
11006 case Instruction::Xor:
11007 case Instruction::Freeze:
11008 if (S.getMainOp()->getType()->isFloatingPointTy() &&
11009 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
11010 auto *I = dyn_cast<Instruction>(V);
11011 return I && I->isBinaryOp() && !I->isFast();
11012 }))
11013 return TreeEntry::NeedToGather;
11014 return TreeEntry::Vectorize;
11015 case Instruction::GetElementPtr: {
11016 // We don't combine GEPs with complicated (nested) indexing.
11017 for (Value *V : VL) {
11018 auto *I = dyn_cast<GetElementPtrInst>(V);
11019 if (!I)
11020 continue;
11021 if (I->getNumOperands() != 2) {
11022 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
11023 return TreeEntry::NeedToGather;
11024 }
11025 }
11026
11027 // We can't combine several GEPs into one vector if they operate on
11028 // different types.
11029 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
11030 for (Value *V : VL) {
11031 auto *GEP = dyn_cast<GEPOperator>(V);
11032 if (!GEP)
11033 continue;
11034 Type *CurTy = GEP->getSourceElementType();
11035 if (Ty0 != CurTy) {
11036 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
11037 return TreeEntry::NeedToGather;
11038 }
11039 }
11040
11041 // We don't combine GEPs with non-constant indexes.
11042 Type *Ty1 = VL0->getOperand(1)->getType();
11043 for (Value *V : VL) {
11044 auto *I = dyn_cast<GetElementPtrInst>(V);
11045 if (!I)
11046 continue;
11047 auto *Op = I->getOperand(1);
11048 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
11049 (Op->getType() != Ty1 &&
11050 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
11051 Op->getType()->getScalarSizeInBits() >
11052 DL->getIndexSizeInBits(
11053 V->getType()->getPointerAddressSpace())))) {
11054 LLVM_DEBUG(
11055 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
11056 return TreeEntry::NeedToGather;
11057 }
11058 }
11059
11060 return TreeEntry::Vectorize;
11061 }
11062 case Instruction::Store: {
11063 // Check if the stores are consecutive or if we need to swizzle them.
11064 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
11065 // Avoid types that are padded when being allocated as scalars, while
11066 // being packed together in a vector (such as i1).
11067 if (DL->getTypeSizeInBits(ScalarTy) !=
11068 DL->getTypeAllocSizeInBits(ScalarTy)) {
11069 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
11070 return TreeEntry::NeedToGather;
11071 }
11072 // Make sure all stores in the bundle are simple - we can't vectorize
11073 // atomic or volatile stores.
11074 for (Value *V : VL) {
11075 auto *SI = cast<StoreInst>(V);
11076 if (!SI->isSimple()) {
11077 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
11078 return TreeEntry::NeedToGather;
11079 }
11080 PointerOps.push_back(SI->getPointerOperand());
11081 }
11082
11083 // Check the order of pointer operands.
11084 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
11085 Value *Ptr0;
11086 Value *PtrN;
11087 if (CurrentOrder.empty()) {
11088 Ptr0 = PointerOps.front();
11089 PtrN = PointerOps.back();
11090 } else {
11091 Ptr0 = PointerOps[CurrentOrder.front()];
11092 PtrN = PointerOps[CurrentOrder.back()];
11093 }
11094 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL);
11095 std::optional<int64_t> Dist =
11096 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
11097 // Check that the sorted pointer operands are consecutive.
11098 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
11099 return TreeEntry::Vectorize;
11100 if (EnableStridedStores &&
11101 analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
11102 CurrentOrder, *Dist, Ptr0, SPtrInfo))
11103 return TreeEntry::StridedVectorize;
11104 }
11105
11106 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
11107 return TreeEntry::NeedToGather;
11108 }
11109 case Instruction::Call: {
11110 if (S.getMainOp()->getType()->isFloatingPointTy() &&
11111 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
11112 auto *I = dyn_cast<Instruction>(V);
11113 return I && !I->isFast();
11114 }))
11115 return TreeEntry::NeedToGather;
11116 // Check if the calls are all to the same vectorizable intrinsic or
11117 // library function.
11118 CallInst *CI = cast<CallInst>(VL0);
11120
11121 VFShape Shape = VFShape::get(
11122 CI->getFunctionType(),
11123 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
11124 false /*HasGlobalPred*/);
11125 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
11126
11127 if (!VecFunc && !isTriviallyVectorizable(ID)) {
11128 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
11129 return TreeEntry::NeedToGather;
11130 }
11131 if (isa<StructType>(CI->getType()) &&
11133 LLVM_DEBUG(dbgs() << "SLP: Struct-returning calls have non-extractvalue "
11134 "users.\n");
11135 return TreeEntry::NeedToGather;
11136 }
11137 Function *F = CI->getCalledFunction();
11138 unsigned NumArgs = CI->arg_size();
11139 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
11140 for (unsigned J = 0; J != NumArgs; ++J)
11142 ScalarArgs[J] = CI->getArgOperand(J);
11143 for (Value *V : VL) {
11144 CallInst *CI2 = dyn_cast<CallInst>(V);
11145 if (!CI2 || CI2->getCalledFunction() != F ||
11146 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
11147 (VecFunc &&
11148 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
11150 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
11151 << "\n");
11152 return TreeEntry::NeedToGather;
11153 }
11154 // Some intrinsics have scalar arguments and should be same in order for
11155 // them to be vectorized.
11156 for (unsigned J = 0; J != NumArgs; ++J) {
11158 Value *A1J = CI2->getArgOperand(J);
11159 if (ScalarArgs[J] != A1J) {
11161 << "SLP: mismatched arguments in call:" << *CI
11162 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
11163 return TreeEntry::NeedToGather;
11164 }
11165 }
11166 }
11167 // Verify that the bundle operands are identical between the two calls.
11168 if (CI->hasOperandBundles() &&
11169 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
11170 CI->op_begin() + CI->getBundleOperandsEndIndex(),
11171 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
11172 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
11173 << "!=" << *V << '\n');
11174 return TreeEntry::NeedToGather;
11175 }
11176 }
11177 SmallVector<Type *> ArgTys =
11178 buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
11179 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
11180 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
11181 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
11182 return TreeEntry::NeedToGather;
11183
11184 return TreeEntry::Vectorize;
11185 }
11186 case Instruction::ShuffleVector: {
11187 if (!S.isAltShuffle()) {
11188 // REVEC can support non alternate shuffle.
11190 return TreeEntry::Vectorize;
11191 // If this is not an alternate sequence of opcode like add-sub
11192 // then do not vectorize this instruction.
11193 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
11194 return TreeEntry::NeedToGather;
11195 }
11196
11197 return TreeEntry::Vectorize;
11198 }
11199 default:
11200 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
11201 return TreeEntry::NeedToGather;
11202 }
11203}
11204
11205namespace {
11206/// Builds, for a bundle of PHI nodes, one operand list per incoming edge,
11207/// ordered by the incoming basic blocks/values of the \p Main PHINode.
11208class PHIHandler {
11209 DominatorTree &DT;
11210 PHINode *Main = nullptr;
 // NOTE(review): the declarations of the `Phis` and `Operands` members used
 // below are not visible in this listing - confirm against the real file.
11213
11214public:
11215 PHIHandler() = delete;
 // Operands is created with one list per incoming value of Main; every list
 // has Phis.size() slots, all initially nullptr.
11216 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
11217 : DT(DT), Main(Main), Phis(Phis),
11218 Operands(Main->getNumIncomingValues(),
11219 SmallVector<Value *>(Phis.size(), nullptr)) {}
 // Fills Operands[I][Idx] with the value that Phis[Idx] receives along the
 // I-th incoming edge of Main.
11220 void buildOperands() {
11221 constexpr unsigned FastLimit = 4;
 // Fast path: with at most FastLimit incoming values every slot is resolved
 // directly, one incoming index at a time.
11222 if (Main->getNumIncomingValues() <= FastLimit) {
11223 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
11224 BasicBlock *InBB = Main->getIncomingBlock(I);
 // Edges from blocks unreachable from the entry get poison operands of
 // Main's type for every element of the bundle.
11225 if (!DT.isReachableFromEntry(InBB)) {
11226 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
11227 continue;
11228 }
11229 // Prepare the operand vector.
11230 for (auto [Idx, V] : enumerate(Phis)) {
11231 auto *P = dyn_cast<PHINode>(V);
 // Non-PHI bundle elements are forwarded unchanged as their own operand.
11232 if (!P) {
11234 "Expected isa instruction or poison value.");
11235 Operands[I][Idx] = V;
11236 continue;
11237 }
 // Use the same incoming index when P lists InBB at position I; otherwise
 // look the value up by block.
11238 if (P->getIncomingBlock(I) == InBB)
11239 Operands[I][Idx] = P->getIncomingValue(I);
11240 else
11241 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
11242 }
11243 }
11244 return;
11245 }
 // General path: map every reachable incoming block of Main to the list of
 // incoming indices at which it appears.
11246 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
11247 Blocks;
11248 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
11249 BasicBlock *InBB = Main->getIncomingBlock(I);
11250 if (!DT.isReachableFromEntry(InBB)) {
11251 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
11252 continue;
11253 }
11254 Blocks.try_emplace(InBB).first->second.push_back(I);
11255 }
11256 for (auto [Idx, V] : enumerate(Phis)) {
 // A poison bundle element is its own operand on every incoming edge.
11257 if (isa<PoisonValue>(V)) {
11258 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
11259 Operands[I][Idx] = V;
11260 continue;
11261 }
11262 auto *P = cast<PHINode>(V);
11263 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
11264 BasicBlock *InBB = P->getIncomingBlock(I);
 // Same block at the same index as Main: store directly, but keep a
 // poison value that is already stored in this slot.
11265 if (InBB == Main->getIncomingBlock(I)) {
11266 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
11267 continue;
11268 Operands[I][Idx] = P->getIncomingValue(I);
11269 continue;
11270 }
 // Different order than Main: store at the first index Main uses for this
 // block; blocks not recorded in Blocks are skipped.
11271 auto *It = Blocks.find(InBB);
11272 if (It == Blocks.end())
11273 continue;
11274 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
11275 }
11276 }
 // For blocks listed more than once among Main's incoming edges, copy the
 // operand list built for the first index to all remaining indices.
11277 for (const auto &P : Blocks) {
11278 ArrayRef<unsigned> IncomingValues = P.second;
11279 if (IncomingValues.size() <= 1)
11280 continue;
11281 unsigned BasicI = IncomingValues.consume_front();
11282 for (unsigned I : IncomingValues) {
11283 assert(all_of(enumerate(Operands[I]),
11284 [&](const auto &Data) {
11285 return !Data.value() ||
11286 Data.value() == Operands[BasicI][Data.index()];
11287 }) &&
11288 "Expected empty operands list.");
11289 Operands[I] = Operands[BasicI];
11290 }
11291 }
11292 }
 // Returns the operand list built for the I-th incoming value of Main.
11293 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
11294};
11295} // namespace
11296
11297/// Returns main/alternate instructions for the given \p VL. Unlike
11298/// getSameOpcode, it supports non-compatible instructions for better
11299/// SplitVectorize node support.
11300/// \returns the first main/alt instructions, if \p VL consists only of poisons
11301/// and instructions with exactly 2 opcodes. Returns pair of nullptr otherwise.
11302static std::pair<Instruction *, Instruction *>
 // NOTE(review): the line carrying the function name and parameter list is not
 // visible in this listing - presumably getMainAltOpsNoStateVL(VL); confirm.
11304 Instruction *MainOp = nullptr;
11305 Instruction *AltOp = nullptr;
11306 for (Value *V : VL) {
 // Poison elements are ignored; any other non-instruction rejects the list.
11307 if (isa<PoisonValue>(V))
11308 continue;
11309 auto *I = dyn_cast<Instruction>(V);
11310 if (!I)
11311 return {};
 // The first instruction seen becomes the main operation.
11312 if (!MainOp) {
11313 MainOp = I;
11314 continue;
11315 }
 // Instructions sharing the main opcode must be in the main op's block.
11316 if (MainOp->getOpcode() == I->getOpcode()) {
11317 if (I->getParent() != MainOp->getParent())
11318 return {};
11319 continue;
11320 }
 // The first instruction with a different opcode becomes the alternate.
11321 if (!AltOp) {
11322 AltOp = I;
11323 continue;
11324 }
 // Instructions sharing the alternate opcode must be in the alt op's block.
11325 if (AltOp->getOpcode() == I->getOpcode()) {
11326 if (I->getParent() != AltOp->getParent())
11327 return {};
11328 continue;
11329 }
 // A third distinct opcode: give up.
11330 return {};
11331 }
 // No second opcode was found (this also covers a list without instructions).
11332 if (!AltOp)
11333 return {};
11334 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
11335 "Expected different main and alt instructions.");
11336 return std::make_pair(MainOp, AltOp);
11337}
11338
11339/// Checks that every instruction appears once in the list and if not, packs
11340/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
11341/// unique scalars is extended by poison values to the whole register size.
11342///
11343/// \returns false if \p VL could not be uniquified, in which case \p VL is
11344/// unchanged and \p ReuseShuffleIndices is empty.
11346 SmallVectorImpl<int> &ReuseShuffleIndices,
11347 const TargetTransformInfo &TTI,
11348 const TargetLibraryInfo &TLI,
11349 const InstructionsState &S,
11350 const BoUpSLP::EdgeInfo &UserTreeIdx,
11351 const BoUpSLP &R, bool BuildGatherOnly = true) {
 // NOTE(review): the first line of the signature (function name and the VL
 // parameter) is not visible in this listing, and a few expression lines in
 // the body are missing as well - confirm against the real file.
11352 // TODO: Reordering of struct types is not supported.
11353 if (isa<StructType>(getValueType(VL.front()))) {
11354 LLVM_DEBUG(dbgs() << "SLP: struct type in bundle.\n");
11355 ReuseShuffleIndices.clear();
11356 return true;
11357 }
11358 // Check that every instruction appears once in this bundle.
 // After this loop ReuseShuffleIndices holds, for each element of VL, its
 // position in UniqueValues (PoisonMaskElem for poison), and UniquePositions
 // maps each non-constant value to that position.
11359 SmallVector<Value *> UniqueValues;
11360 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
11361 for (Value *V : VL) {
11362 if (isConstant(V)) {
11363 // Constants are always considered distinct, even if the same constant
11364 // appears multiple times in VL.
11365 ReuseShuffleIndices.emplace_back(
11366 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
11367 UniqueValues.emplace_back(V);
11368 continue;
11369 }
 // try_emplace only inserts on first sight; a repeated value reuses the
 // position recorded earlier.
11370 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
11371 ReuseShuffleIndices.emplace_back(Res.first->second);
11372 if (Res.second)
11373 UniqueValues.emplace_back(V);
11374 }
11375
11376 // Check if we need to schedule the scalars. If no, can keep original scalars
11377 // and avoid extra shuffles.
11378 bool RequireScheduling = S && S.getOpcode() != Instruction::PHI &&
11379 !isVectorLikeInstWithConstOps(S.getMainOp()) &&
11380 (S.areInstructionsWithCopyableElements() ||
11381 !doesNotNeedToSchedule(UniqueValues));
11382 // Compute this flag BEFORE the tail-poison erase below - it must reflect
11383 // the state of the original VL (for the InsertsCost call), not the
11384 // potentially-shrunk UniqueValues.
11385 bool AreAllValuesNonConst = UniquePositions.size() == UniqueValues.size();
11386 // Drop tail poisons, if the values can be vectorized.
11387 if (RequireScheduling) {
11388 const auto EndIt =
11389 find_if_not(make_range(UniqueValues.rbegin(), UniqueValues.rend()),
11391 assert(EndIt != UniqueValues.rend() && "Expected at least one non-poison.");
11392 UniqueValues.erase(EndIt.base(), UniqueValues.end());
11393 }
 // Same element count as VL: nothing was packed, so no reuse mask is needed.
11394 unsigned NumUniqueScalarValues = UniqueValues.size();
11395 if (NumUniqueScalarValues == VL.size()) {
11396 ReuseShuffleIndices.clear();
11397 return true;
11398 }
11399
11400 // For VL=4 with 3 unique values: keep originals. A <3 x T> vector is
11401 // always widened to <4 x T> on hardware, so the packing just adds an
11402 // extra expand shuffle. Does not apply to loads (a <3 x T> load is a
11403 // single memory access) or PHIs (benefit from compact packing in loops).
11404 constexpr unsigned SmallVecWidth = 4;
11405 constexpr unsigned SmallVecUniqueThreshold = 3;
11406 if (VL.size() == SmallVecWidth &&
11407 NumUniqueScalarValues == SmallVecUniqueThreshold && !BuildGatherOnly &&
11408 !(S && (S.getOpcode() == Instruction::Load ||
11409 S.getOpcode() == Instruction::PHI))) {
11410 // Keep originals with identity reuse - no packing, no extra shuffle.
11411 ReuseShuffleIndices.clear();
11412 return true;
11413 }
11414
11415 // Checks if unique inserts + shuffle is more profitable than just inserts or
11416 // vectorized values.
 // Returns the pair {PackProfitable, UseOriginal} consumed by the structured
 // binding after the lambda: the first flag selects between returning true and
 // falling back to a plain gather (return false); the second, when set, keeps
 // the original VL and drops the reuse mask instead of packing the uniques.
11417 auto EstimatePackPlusShuffleVsInserts = [&]() {
11418 // Single instruction/argument insert - no shuffle.
11419 if (UniquePositions.size() == 1 &&
11420 (NumUniqueScalarValues == 1 ||
11422 return std::make_pair(false, false);
11423 // For large gathers with power-of-2 VL where packing would produce
11424 // non-power-of-2, reject if most scalars are constants - the packing
11425 // overhead (non-power-of-2 split + shuffles) outweighs the benefit.
11426 constexpr unsigned MinVLForConstGatherCheck = 4;
11427 if (BuildGatherOnly && VL.size() > MinVLForConstGatherCheck &&
11428 has_single_bit(static_cast<unsigned>(VL.size())) &&
11429 !has_single_bit(NumUniqueScalarValues) &&
11430 UniquePositions.size() * 2 < NumUniqueScalarValues)
11431 return std::make_pair(false, false);
 // Helper: asks R.canVectorizeLoads how the given loads would be handled.
 // NOTE(review): part of its return expression is not visible in this listing.
11432 auto CheckLoads = [&](ArrayRef<Value *> Loads, bool IncludeGather) {
11433 assert(S && S.getOpcode() == Instruction::Load && "Expected load.");
11434 BoUpSLP::OrdersType Order;
11435 SmallVector<Value *> PointerOps;
11436 BoUpSLP::StridedPtrInfo SPtrInfo;
11437 BoUpSLP::LoadsState Res = R.canVectorizeLoads(Loads, S.getMainOp(), Order,
11438 PointerOps, SPtrInfo);
11439 return (IncludeGather && Res == BoUpSLP::LoadsState::Gather) ||
11443 };
11444 // Operand of the root tree entry on the vectorize path: always pack the
11445 // scalars (PackProfitable=true). Choose between keeping the original VL
11446 // and packing the unique values:
11447 // - For loads, prefer the originals only when both the deduplicated and
11448 // the full sequence can be vectorized non-Gather, or when the reuse
11449 // mask is the identity (the shuffle is free).
11450 // - For everything else (including !S, where RequireScheduling is forced
11451 // to false above), keep originals iff no scheduling is required.
11452 bool IsRootOperand =
11453 UserTreeIdx.UserTE && UserTreeIdx.UserTE->Idx == 0 && !BuildGatherOnly;
11454 if (IsRootOperand) {
11455 if (S && S.getOpcode() == Instruction::Load) {
11456 bool UseOrig = (CheckLoads(UniqueValues, /*IncludeGather=*/true) &&
11457 CheckLoads(VL, /*IncludeGather=*/false)) ||
11459 ReuseShuffleIndices, ReuseShuffleIndices.size());
11460 return std::make_pair(true, UseOrig);
11461 }
11462 return std::make_pair(true, !RequireScheduling);
11463 }
 // Demand every lane of VL whose reuse index refers to a value recorded in
 // UniquePositions, i.e. one that did not take the isConstant path above.
11464 APInt DemandedElts = APInt::getZero(VL.size());
11465 for (auto [Idx, Val] : enumerate(ReuseShuffleIndices))
11466 if (Val != PoisonMaskElem && UniquePositions.contains(UniqueValues[Val]))
11467 DemandedElts.setBit(Idx);
 // Vector types and register-part counts for the full-width bundle and for
 // the packed bundle of unique values.
11468 Type *ScalarTy = ::getValueType(UniqueValues.front());
11469 auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, VL.size()));
11470 auto *UniquesVecTy =
11471 cast<VectorType>(getWidenedType(ScalarTy, NumUniqueScalarValues));
11472 const unsigned NumParts = ::getNumberOfParts(TTI, VecTy, ScalarTy);
11473 const unsigned UniquesNumParts =
11474 ::getNumberOfParts(TTI, UniquesVecTy, ScalarTy);
11475 // No need to schedule scalars and only single register used? Use original
11476 // scalars, do not pack.
11477 if (!RequireScheduling) {
11478 if (VL.size() / NumUniqueScalarValues == 1 &&
11479 (NumParts <= 1 || UniquesNumParts >= NumParts))
11480 return std::make_pair(true, true);
11481 // For PHI operands, prefer packing with reuse shuffle - the PHI
11482 // carries the vector through the loop cheaply.
11483 if (S && S.getOpcode() == Instruction::PHI && NumUniqueScalarValues > 1 &&
11484 UniquesNumParts <= NumParts)
11485 return std::make_pair(true, false);
11486 }
 // Cost of the shuffle that expands the packed uniques back to VL.size() lanes.
 // NOTE(review): the definition of CostKind and the shuffle-kind argument are
 // on lines not visible in this listing.
11488 InstructionCost ReusesCost = ::getShuffleCost(
11490 NumUniqueScalarValues > VL.size() / 2 ? ArrayRef<int>()
11491 : ArrayRef(ReuseShuffleIndices),
11492 CostKind, /*Index=*/0, UniquesVecTy);
11493 // For vectorizable (non-gather) nodes with low duplication, prefer keeping
11494 // the original values over packing uniques + reshuffling:
11495 // - A single duplicate (non-load) adds negligible overhead.
11496 // - When most values are already unique (>50%), or exactly half are unique
11497 // for some ops (GEPs, non-alt-shuffle casts), the reshuffle cost may
11498 // exceed the savings from a smaller packed vector - check against a
11499 // per-register-part threshold (stricter for wider vectors).
11500 if (S && !BuildGatherOnly) {
11501 bool HasOneDup = S.getOpcode() != Instruction::Load &&
11502 NumUniqueScalarValues + 1 == VL.size();
11503 bool MostlyUnique = NumUniqueScalarValues * 2 > VL.size();
11504 bool IsHalfUniqueValues =
11505 NumUniqueScalarValues * 2 == VL.size() &&
11506 (S.getOpcode() == Instruction::GetElementPtr ||
11507 (isa<CastInst>(S.getMainOp()) && !S.isAltShuffle()));
11509 NumParts * (VL.size() > SmallVecWidth ? 1 : 2);
11510 if (HasOneDup ||
11511 ((MostlyUnique || IsHalfUniqueValues) && ReusesCost > CostThreshold))
11512 return std::make_pair(true, true);
11513 }
11514 // For loads, check if either the deduplicated or the full (with
11515 // duplicates) set can be scatter/compress-vectorized. Prefer the unique
11516 // loads (pack + reshuffle) when possible, otherwise use the originals.
11517 if (S && S.getOpcode() == Instruction::Load) {
11518 bool UniquesVectorized =
11519 CheckLoads(UniqueValues, /*IncludeGather=*/false);
11520 if (UniquesVectorized || CheckLoads(VL, /*IncludeGather=*/false))
11521 return std::make_pair(true, !UniquesVectorized);
11522 }
 // Compare building the full-width vector (InsertsCost) against building the
 // packed vector plus the reuse shuffle (UniquesCost).
 // NOTE(review): the value used when CanSkipBVCost is set is on lines not
 // visible in this listing.
11523 bool CanSkipBVCost =
11524 (!BuildGatherOnly && !RequireScheduling) || R.hasSameNode(S, VL);
11525 InstructionCost InsertsCost =
11526 CanSkipBVCost
11528 : ::getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
11529 /*Insert=*/true, /*Extract=*/false,
11530 CostKind, AreAllValuesNonConst, VL);
 // Constants among the unique values are excluded from the demanded lanes.
11531 APInt UniquesDemandedElts = APInt::getAllOnes(NumUniqueScalarValues);
11532 for (const auto [Idx, V] : enumerate(UniqueValues))
11533 if (isConstant(V))
11534 UniquesDemandedElts.clearBit(Idx);
11535 InstructionCost UniquesCost =
11536 CanSkipBVCost
11538 : ::getScalarizationOverhead(TTI, ScalarTy, UniquesVecTy,
11539 UniquesDemandedElts, /*Insert=*/true,
11540 /*Extract=*/false, CostKind,
11541 AreAllValuesNonConst, UniqueValues);
11542 UniquesCost += ReusesCost;
 // Packing is no more expensive than plain inserts: pack.
11543 if (UniquesCost <= InsertsCost)
11544 return std::make_pair(true, false);
 // Packing costs more, but the difference is below TTI::TCC_Expensive (or
 // equal to it for an empty reduction tree): packing is reported as profitable
 // only under the state/alt-shuffle condition in the returned pair.
11545 InstructionCost CostDiff = UniquesCost - InsertsCost;
11546 if (CostDiff < TTI::TCC_Expensive ||
11547 (R.getTreeSize() == 0 && R.isReductionTree() &&
11548 CostDiff == TTI::TCC_Expensive))
11549 return std::make_pair(S && (!S.isAltShuffle() || !BuildGatherOnly),
11550 false);
11551 // Otherwise, use original values, if values do not require scheduling and
11552 // pass still try to vectorize them.
11553 bool KeepOriginal = !BuildGatherOnly && !RequireScheduling;
11554 return std::make_pair(KeepOriginal, KeepOriginal);
11555 };
11556
11557 const auto [PackProfitable, UseOriginal] = EstimatePackPlusShuffleVsInserts();
11558
11559 if (PackProfitable) {
11560 if (UseOriginal) {
11561 // Prefer original scalars - avoid shuffling.
11562 ReuseShuffleIndices.clear();
11563 } else {
11564 // Better to use uniques + reshuffle.
11565 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
11566 VL = std::move(UniqueValues);
11567 }
11568 return true;
11569 }
11570
11571 // Buildvector/gather of the original scalars.
11572 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
11573 ReuseShuffleIndices.clear();
11574 return false;
11575}
11576
11578 const InstructionsState &LocalState,
11581 OrdersType &ReorderIndices) const {
 // Decides whether the bundle VL should be split into two operand lists.
 // Fills Op1 and Op2 with the two groups and ReorderIndices with the lane
 // order "Op1 lanes first, then Op2 lanes"; returns false when the split is
 // rejected.
 // NOTE(review): the head of the signature (presumably
 // BoUpSLP::canBuildSplitNode with the VL, Op1 and Op2 parameters) and a few
 // later lines are not visible in this listing - confirm against the real file.
11582 constexpr unsigned SmallNodeSize = 4;
11583 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
11585 return false;
11586
11587 // Check if this is a duplicate of another split entry.
11588 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
11589 << ".\n");
11590 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
11591 if (E->isSame(VL)) {
11592 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
11593 << *LocalState.getMainOp() << ".\n");
11594 return false;
11595 }
 // Also reject when every non-poison element of VL is already a scalar of
 // the existing split entry.
11596 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11597 if (all_of(VL, [&](Value *V) {
11598 return isa<PoisonValue>(V) || Values.contains(V);
11599 })) {
11600 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11601 return false;
11602 }
11603 }
11604
 // Partition VL: non-instructions and instructions classified as "main" go to
 // Op1 (their lanes are recorded in Op1Indices); the rest go to Op2.
11605 ReorderIndices.assign(VL.size(), VL.size());
11606 SmallBitVector Op1Indices(VL.size());
11607 for (auto [Idx, V] : enumerate(VL)) {
11608 auto *I = dyn_cast<Instruction>(V);
11609 if (!I) {
11610 Op1.push_back(V);
11611 Op1Indices.set(Idx);
11612 continue;
11613 }
 // With distinct main/alt opcodes use isMainInstruction; when both opcodes
 // are equal, anything that is not an alternate instruction counts as main.
11614 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11615 isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
11616 *TLI)) ||
11617 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11618 !isAlternateInstruction(I, LocalState.getMainOp(),
11619 LocalState.getAltOp(), *TLI))) {
11620 Op1.push_back(V);
11621 Op1Indices.set(Idx);
11622 continue;
11623 }
11624 Op2.push_back(V);
11625 }
11626 Type *ScalarTy = getValueType(VL.front());
11627 auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, VL.size()));
11628 unsigned Opcode0 = LocalState.getOpcode();
11629 unsigned Opcode1 = LocalState.getAltOpcode();
11630 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
11631 // Enable split node, only if all nodes do not form legal alternate
11632 // instruction (like X86 addsub).
 // NOTE(review): UOp1 and UOp2 are defined on lines not visible in this
 // listing - presumably the sets of unique values of Op1 and Op2; confirm.
11635 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
11636 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
11637 return false;
 // Build the lane order: Op1 lanes occupy positions [0, Op1.size()) and Op2
 // lanes follow, each group keeping its original relative order.
11638 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
11639 for (unsigned Idx : seq<unsigned>(VL.size())) {
11640 if (Op1Indices.test(Idx)) {
11641 ReorderIndices[Op1Cnt] = Idx;
11642 ++Op1Cnt;
11643 } else {
11644 ReorderIndices[Op2Cnt] = Idx;
11645 ++Op2Cnt;
11646 }
11647 }
 // An identity order is represented by an empty ReorderIndices.
11648 if (isIdentityOrder(ReorderIndices))
11649 ReorderIndices.clear();
11650 // When VL fills a power-of-2 register but the split halves do not, the
11651 // reorder shuffle makes the split unprofitable - reject.
11652 else if (hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), VL.size()) &&
11653 (!hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(),
11654 Op1.size()) ||
11655 !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(),
11656 Op2.size())))
11657 return false;
 // Mask stays empty when no reordering is needed.
11658 SmallVector<int> Mask;
11659 if (!ReorderIndices.empty())
11660 inversePermutation(ReorderIndices, Mask);
11661 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11662 auto *Op1VecTy = cast<VectorType>(getWidenedType(ScalarTy, Op1.size()));
11663 auto *Op2VecTy = cast<VectorType>(getWidenedType(ScalarTy, Op2.size()));
11664 // Check non-profitable single register ops, which better to be represented
11665 // as alternate ops.
11666 if (NumParts >= VL.size())
11667 return false;
 // InsertCost: cost of inserting the Op2 subvector at offset Op1.size() into
 // the full vector. NewShuffleCost: cost of a two-source permute with Mask on
 // the larger of the two sub-vector types.
 // NOTE(review): the definition of Kind is on a line not visible in this
 // listing.
11669 InstructionCost InsertCost = ::getShuffleCost(
11670 *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
11671 auto *SubVecTy = cast<VectorType>(
11672 getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size())));
11673 InstructionCost NewShuffleCost =
11674 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
 // For non-compare bundles that fit in at most one register part, reject the
 // split when no reorder is needed or the insert is not cheaper than the
 // permute.
11675 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11676 (Mask.empty() || InsertCost >= NewShuffleCost))
11677 return false;
 // For binary/cast/unary main+alt pairs, compare the alternate-opcode form
 // (two full-width ops plus a blending shuffle) with the split form (two
 // narrower ops plus the subvector insert).
11678 if ((LocalState.getMainOp()->isBinaryOp() &&
11679 LocalState.getAltOp()->isBinaryOp() &&
11680 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11681 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11682 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11683 (LocalState.getMainOp()->isUnaryOp() &&
11684 LocalState.getAltOp()->isUnaryOp())) {
11685 InstructionCost OriginalVecOpsCost =
11686 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
11687 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
 // Blend mask: Op1 lanes select from the first source, all other non-poison
 // lanes from the second source (index offset by VL.size()).
11688 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
11689 for (unsigned Idx : seq<unsigned>(VL.size())) {
11690 if (isa<PoisonValue>(VL[Idx]))
11691 continue;
11692 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
11693 }
11694 InstructionCost OriginalCost =
11695 OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
11696 VecTy, OriginalMask, Kind);
11697 InstructionCost NewVecOpsCost =
11698 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
11699 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
 // NewShuffleCost is added only when the tree's first entry has a state with
 // the Store opcode.
11700 InstructionCost NewCost =
11701 NewVecOpsCost + InsertCost +
11702 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11703 VectorizableTree.front()->getOpcode() == Instruction::Store
11704 ? NewShuffleCost
11705 : 0);
11706 // If not profitable to split - exit.
11707 if (NewCost >= OriginalCost)
11708 return false;
11709 }
11710 return true;
11711}
11712
11713namespace {
11714/// Class accepts incoming list of values, checks if it is able to model
11715/// "copyable" values as compatible operations, and generates the list of values
11716/// for scheduling and list of operands for the new nodes.
11717class InstructionsCompatibilityAnalysis {
11718 DominatorTree &DT;
11719 const DataLayout &DL;
11720 const TargetTransformInfo &TTI;
11721 const TargetLibraryInfo &TLI;
11722 unsigned MainOpcode = 0;
11723 Instruction *MainOp = nullptr;
11724
11725 /// Checks if the opcode is supported as the main opcode for copyable
11726 /// elements.
11727 static bool isSupportedOpcode(const unsigned Opcode) {
11728 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11729 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11730 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11731 Opcode == Instruction::And || Opcode == Instruction::Or ||
11732 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11733 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11734 Opcode == Instruction::FDiv;
11735 }
11736
11737 /// Identifies the best candidate value, which represents main opcode
11738 /// operation.
11739 /// Currently the best candidate is the Add instruction with the parent
11740 /// block with the highest DFS incoming number (block, that dominates other).
11741 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
11742 BasicBlock *Parent = nullptr;
11743 // Checks if the instruction has supported opcode.
11744 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
11745 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
11746 return false;
11747 return I && isSupportedOpcode(I->getOpcode()) &&
11748 (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
11749 };
11750 // Exclude operands instructions immediately to improve compile time, it
11751 // will be unable to schedule anyway.
11752 SmallDenseSet<Value *, 8> Operands;
11753 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11754 bool AnyUndef = false;
11755 for (Value *V : VL) {
11756 auto *I = dyn_cast<Instruction>(V);
11757 if (!I) {
11758 AnyUndef |= isa<UndefValue>(V);
11759 continue;
11760 }
11761 if (!DT.isReachableFromEntry(I->getParent()))
11762 continue;
11763 if (Candidates.empty()) {
11764 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11765 Parent = I->getParent();
11766 Operands.insert(I->op_begin(), I->op_end());
11767 continue;
11768 }
11769 if (Parent == I->getParent()) {
11770 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11771 Operands.insert(I->op_begin(), I->op_end());
11772 continue;
11773 }
11774 auto *NodeA = DT.getNode(Parent);
11775 auto *NodeB = DT.getNode(I->getParent());
11776 assert(NodeA && "Should only process reachable instructions");
11777 assert(NodeB && "Should only process reachable instructions");
11778 assert((NodeA == NodeB) ==
11779 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11780 "Different nodes should have different DFS numbers");
11781 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11782 Candidates.clear();
11783 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11784 Parent = I->getParent();
11785 Operands.clear();
11786 Operands.insert(I->op_begin(), I->op_end());
11787 }
11788 }
11789 unsigned BestOpcodeNum = 0;
11790 MainOp = nullptr;
11791 bool UsedOutside = false;
11792 for (const auto &P : Candidates) {
11793 bool PUsedOutside = all_of(P.second, isUsedOutsideBlock);
11794 if (UsedOutside && !PUsedOutside)
11795 continue;
11796 if (!UsedOutside && PUsedOutside)
11797 BestOpcodeNum = 0;
11798 if (P.second.size() < BestOpcodeNum)
11799 continue;
11800 // If have inner dependencies - skip.
11801 if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
11802 return Operands.contains(I);
11803 }))
11804 continue;
11805 // On a tie, keep the outer binary op as MainOp rather than replacing it
11806 // with an inner op that appears as its direct operand. For example, in
11807 // (2.0f * A) + B the fadd and fmul each appear once in VL; without this
11808 // check the fmul could win and prevent vectorization of the fadd pair.
11809 if (P.second.size() == BestOpcodeNum) {
11810 auto *I = P.second.front();
11811 if (auto *MainBO = dyn_cast<BinaryOperator>(MainOp)) {
11812 auto *MainBOOp0 = dyn_cast<Instruction>(MainBO->getOperand(0));
11813 auto *MainBOOp1 = dyn_cast<Instruction>(MainBO->getOperand(1));
11814 if (MainBOOp0 && MainBOOp0->getOpcode() == I->getOpcode() &&
11815 MainBOOp0->getParent() == I->getParent())
11816 continue;
11817 if (MainBOOp1 && MainBOOp1->getOpcode() == I->getOpcode() &&
11818 MainBOOp1->getParent() == I->getParent())
11819 continue;
11820 }
11821 }
11822 UsedOutside = PUsedOutside;
11823 for (Instruction *I : P.second) {
11824 if (IsSupportedInstruction(I, AnyUndef)) {
11825 MainOp = I;
11826 BestOpcodeNum = P.second.size();
11827 break;
11828 }
11829 }
11830 }
11831 if (MainOp) {
11832 // Do not match, if any copyable is a terminator from the same block as
11833 // the main operation.
11834 if (any_of(VL, [&](Value *V) {
11835 auto *I = dyn_cast<Instruction>(V);
11836 return I && I->getParent() == MainOp->getParent() &&
11837 I->isTerminator();
11838 })) {
11839 MainOp = nullptr;
11840 return;
11841 }
11842 MainOpcode = MainOp->getOpcode();
11843 }
11844 }
11845
11846 /// Returns the idempotent value for the \p MainOp with the detected \p
11847 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
11848 /// the operand itself, since V or V == V.
11849 Value *selectBestIdempotentValue() const {
11850 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11851 return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
11852 !MainOp->isCommutative());
11853 }
11854
11855 /// Returns the value and operands for the \p V, considering if it is original
11856 /// instruction and its actual operands should be returned, or it is a
11857 /// copyable element and its should be represented as idempotent instruction.
11858 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
11859 if (isa<PoisonValue>(V))
11860 return {V, V};
11861 if (!S.isCopyableElement(V))
11862 return convertTo(cast<Instruction>(V), S).second;
11863 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11864 return {V, selectBestIdempotentValue()};
11865 }
11866
11867 /// Builds operands for the original instructions.
/// Fills \p Operands with one value list per operand position; each list is
/// sized by the number of lanes in \p VL (the ExtractValue path may instead
/// take over the `Calls` list). The layout is selected by a switch on the
/// opcode of \p S, where alternate-opcode states are handled as ShuffleVector.
/// \param S        state describing the main (and alternate) instruction.
/// \param VL       list of scalar values (lanes).
/// \param Operands output, overwritten by this function.
/// Opcodes not handled by the switch reach llvm_unreachable.
/// NOTE(review): this listing skips a few numbered lines (11891, 12008, 12016,
/// 12033, 12035), so the declarations of `Calls` and `GEP` and part of the
/// call-argument loop are not visible here; consult the full file.
11868 void
11869 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
11870 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
11871
11872 unsigned ShuffleOrOp =
11873 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11874 Instruction *VL0 = S.getMainOp();
11875
11876 switch (ShuffleOrOp) {
11877 case Instruction::PHI: {
11878 auto *PH = cast<PHINode>(VL0);
11879
11880 // Keeps the reordered operands to avoid code duplication.
11881 PHIHandler Handler(DT, PH, VL);
11882 Handler.buildOperands();
// One operand list per PHI operand, copied from the handler.
11883 Operands.assign(PH->getNumOperands(), {});
11884 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
11885 Operands[I].assign(Handler.getOperands(I).begin(),
11886 Handler.getOperands(I).end());
11887 return;
11888 }
11889 case Instruction::ExtractValue: {
11890 SmallVector<unsigned> Indices;
// If checkEVsForVecCalls succeeds, the single operand list is `Calls`;
// otherwise fall through to the ExtractElement handling.
11892 if (checkEVsForVecCalls(VL, S, TLI, Indices, Calls)) {
11893 Operands.assign(1, {});
11894 Operands[0].swap(Calls);
11895 return;
11896 }
11897 [[fallthrough]];
11898 }
11899 case Instruction::ExtractElement:
11900 // This is a special case, as it does not gather, but at the same time
11901 // we are not extending buildTree_rec() towards the operands.
11902 Operands.assign(1, {VL.size(), VL0->getOperand(0)});
11903 return;
11904 case Instruction::InsertElement:
// Every lane is expected to be an insertelement (cast<> asserts otherwise);
// only its first two operands are recorded.
11905 Operands.assign(2, {VL.size(), nullptr});
11906 for (auto [Idx, V] : enumerate(VL)) {
11907 auto *IE = cast<InsertElementInst>(V);
11908 for (auto [OpIdx, Ops] : enumerate(Operands))
11909 Ops[Idx] = IE->getOperand(OpIdx);
11910 }
11911 return;
11912 case Instruction::Load:
// Single operand list of pointers; lanes that are not loads keep the poison
// placeholder of the main load's pointer type.
11913 Operands.assign(
11914 1, {VL.size(),
11915 PoisonValue::get(cast<LoadInst>(VL0)->getPointerOperandType())});
11916 for (auto [V, Op] : zip(VL, Operands.back())) {
11917 auto *LI = dyn_cast<LoadInst>(V);
11918 if (!LI)
11919 continue;
11920 Op = LI->getPointerOperand();
11921 }
11922 return;
11923 case Instruction::ZExt:
11924 case Instruction::SExt:
11925 case Instruction::FPToUI:
11926 case Instruction::FPToSI:
11927 case Instruction::FPExt:
11928 case Instruction::PtrToInt:
11929 case Instruction::IntToPtr:
11930 case Instruction::SIToFP:
11931 case Instruction::UIToFP:
11932 case Instruction::Trunc:
11933 case Instruction::FPTrunc:
11934 case Instruction::BitCast:
11935 case Instruction::ICmp:
11936 case Instruction::FCmp:
11937 case Instruction::FNeg:
11938 case Instruction::Add:
11939 case Instruction::FAdd:
11940 case Instruction::Sub:
11941 case Instruction::FSub:
11942 case Instruction::Mul:
11943 case Instruction::FMul:
11944 case Instruction::UDiv:
11945 case Instruction::SDiv:
11946 case Instruction::FDiv:
11947 case Instruction::URem:
11948 case Instruction::SRem:
11949 case Instruction::FRem:
11950 case Instruction::Shl:
11951 case Instruction::LShr:
11952 case Instruction::AShr:
11953 case Instruction::And:
11954 case Instruction::Or:
11955 case Instruction::Xor:
11956 case Instruction::Freeze:
11957 case Instruction::Store:
11958 case Instruction::ShuffleVector:
// Generic path: non-instruction lanes get poison of the matching operand
// type, instruction lanes get the operands returned by convertTo() for S.
11959 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11960 for (auto [Idx, V] : enumerate(VL)) {
11961 auto *I = dyn_cast<Instruction>(V);
11962 if (!I) {
11963 for (auto [OpIdx, Ops] : enumerate(Operands))
11964 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11965 continue;
11966 }
11967 auto [Op, ConvertedOps] = convertTo(I, S);
11968 for (auto [OpIdx, Ops] : enumerate(Operands))
11969 Ops[Idx] = ConvertedOps[OpIdx];
11970 }
11971 return;
11972 case Instruction::Select:
// Same as the generic path, except that zext lanes are rewritten as
// select(cond, 1, 0) operands.
11973 Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
11974 for (auto [Idx, V] : enumerate(VL)) {
11975 auto *I = dyn_cast<Instruction>(V);
11976 if (!I) {
11977 for (auto [OpIdx, Ops] : enumerate(Operands))
11978 Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
11979 continue;
11980 }
11981 if (isa<ZExtInst>(I)) {
11982 // Special case for select + zext i1 to avoid explosion of different
11983 // types. We want to keep the condition as i1 to be able to match
11984 // different selects together and reuse the vectorized condition
11985 // rather than trying to gather it.
11986 Operands[0][Idx] = I->getOperand(0);
11987 Operands[1][Idx] = ConstantInt::get(I->getType(), 1);
11988 Operands[2][Idx] = ConstantInt::getNullValue(I->getType());
11989 continue;
11990 }
11991 auto [Op, ConvertedOps] = convertTo(I, S);
11992 for (auto [OpIdx, Ops] : enumerate(Operands))
11993 Ops[Idx] = ConvertedOps[OpIdx];
11994 }
11995 return;
11996 case Instruction::GetElementPtr: {
// Two operand lists: [0] pointer operands, [1] the index at operand 1.
11997 Operands.assign(2, {VL.size(), nullptr});
11998 // Need to cast all indices to the same type before vectorization to
11999 // avoid crash.
12000 // Required to be able to find correct matches between different gather
12001 // nodes and reuse the vectorized values rather than trying to gather them
12002 // again.
12003 const unsigned IndexIdx = 1;
12004 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
// Keep the main GEP's index type if all GEP lanes agree on it; otherwise use
// the DataLayout index type of the pointer operand's scalar type.
12005 Type *Ty =
12006 all_of(VL,
12007 [&](Value *V) {
12009 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
12010 })
12011 ? VL0Ty
12012 : DL.getIndexType(cast<GetElementPtrInst>(VL0)
12013 ->getPointerOperandType()
12014 ->getScalarType());
12015 for (auto [Idx, V] : enumerate(VL)) {
// Non-GEP lanes are modeled as the value itself with a zero index.
12017 if (!GEP) {
12018 Operands[0][Idx] = V;
12019 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
12020 continue;
12021 }
12022 Operands[0][Idx] = GEP->getPointerOperand();
12023 auto *Op = GEP->getOperand(IndexIdx);
12024 auto *CI = dyn_cast<ConstantInt>(Op);
// Constant indices are folded to Ty (signedness taken from the sign bit);
// non-constant indices are used unchanged.
12025 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
12026 CI, Ty, CI->getValue().isSignBitSet(), DL)
12027 : Op;
12028 }
12029 return;
12030 }
12031 case Instruction::Call: {
12032 auto *CI = cast<CallInst>(VL0);
// One operand list per call argument that is not skipped by the `continue`
// below (the skip condition is on a line not visible in this listing).
12034 for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
12036 continue;
12037 auto &Ops = Operands.emplace_back();
12038 for (Value *V : VL) {
12039 auto *I = dyn_cast<Instruction>(V);
12040 Ops.push_back(I ? I->getOperand(Idx)
12041 : PoisonValue::get(VL0->getOperand(Idx)->getType()));
12042 }
12043 }
12044 return;
12045 }
12046 default:
12047 break;
12048 }
12049 llvm_unreachable("Unexpected vectorization of the instructions.");
12050 }
12051
12052 /// Check if the specified \p VL list of values is better to represent as
12053 /// uniform with copyables, as modeled via \p CopyableS, or as alternate (or
12054 /// uniform with compatible ops), modeled via \p S.
12055 /// Performs the analysis of the operands, choosing the preferred main
12056 /// instruction and checking the matching of the operands for the main
12057 /// instruction and copyable elements.
/// \param VL        list of scalar values (lanes).
/// \param R         vectorizer state; used for isVectorized() and
///                  findBestRootPair().
/// \param S         original (alternate or uniform) state.
/// \param CopyableS state that models \p VL with copyable elements.
/// \returns true if the copyable model should be used, false to keep \p S.
/// Reads the MainOp member, which is expected to be set by the caller.
/// NOTE(review): this listing skips several numbered lines (12072, 12099,
/// 12113, 12118, 12141, 12153-12154, 12167, 12176, 12243, 12248), so the
/// declarations of `Ops`, `VOps`, `MainOps`, `Candidates` and parts of some
/// expressions are not visible here; consult the full file before editing.
12058 bool isCopyablePreferable(ArrayRef<Value *> VL, const BoUpSLP &R,
12059 const InstructionsState &S,
12060 const InstructionsState &CopyableS) {
12061 // If all elements are vectorized already - keep as is.
12062 if (all_of(VL, [&](Value *V) {
12063 return isa<PoisonValue>(V) || R.isVectorized(V);
12064 }))
12065 return false;
12066 Instruction *SMain = S.getMainOp();
12067 Instruction *SAlt = S.isAltShuffle() ? S.getAltOp() : nullptr;
12068 const bool IsCommutative = ::isCommutative(SMain);
12069 const bool IsAltCommutative =
12070 S.isAltShuffle() ? ::isCommutative(SAlt) : false;
12071 const bool IsMainCommutative = ::isCommutative(MainOp);
// Operands of the single instruction SMain, as seen through state S.
12073 buildOriginalOperands(S, SMain, Ops);
12074 // Support only binary operations for now.
12075 if (Ops.size() != 2)
12076 return false;
12077 // Try to find better candidate for S main instruction, which operands have
12078 // better matching.
// CheckOperands(Op, SMainOp): false if Op is not a binary operator; true if
// SMainOp is not a binary operator; otherwise true if any operand of Op is
// an instruction with SMainOp's opcode.
12079 auto CheckOperands = [](Value *Op, Value *SMainOp) {
12080 auto *OpI = dyn_cast<BinaryOperator>(Op);
12081 if (!OpI)
12082 return false;
12083 auto *SMainOpI = dyn_cast<BinaryOperator>(SMainOp);
12084 if (!SMainOpI)
12085 return true;
12086 return any_of(OpI->operands(), [&](Value *V) {
12087 auto *I = dyn_cast<Instruction>(V);
12088 return I && I->getOpcode() == SMainOpI->getOpcode();
12089 });
12090 };
// Operands of the instructions visited so far that match the main op of S.
12091 SmallPtrSet<Value *, 8> Operands;
12092 for (Value *V : VL) {
12093 auto *I = dyn_cast<Instruction>(V);
12094 if (!I || I == SMain)
12095 continue;
12096 Instruction *MatchingOp = S.getMatchingMainOpOrAltOp(I);
12097 if (MatchingOp != SMain)
12098 continue;
12100 buildOriginalOperands(S, I, VOps);
12101 Operands.insert(I->op_begin(), I->op_end());
12102 assert(VOps.size() == 2 && Ops.size() == 2 &&
12103 "Expected binary operations only.");
// The first lane whose operands pass CheckOperands replaces SMain.
12104 if (CheckOperands(VOps[0][0], Ops[0][0]) ||
12105 CheckOperands(VOps[1][0], Ops[1][0]) ||
12106 (IsCommutative && (CheckOperands(VOps[0][0], Ops[1][0]) ||
12107 CheckOperands(VOps[1][0], Ops[0][0])))) {
12108 SMain = I;
12109 Ops.swap(VOps);
12110 break;
12111 }
12112 }
// Operands of the copyable main instruction, as seen through state S.
12114 buildOriginalOperands(S, MainOp, MainOps);
12115
// Appends (Ops[0][0], Op0) and, for commutative ops, (Ops[0][0], Op1).
12116 auto BuildFirstOperandCandidates =
12117 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
12119 bool IsCommutative) {
12120 Candidates.emplace_back(Ops[0][0], Op0);
12121 if (IsCommutative)
12122 Candidates.emplace_back(Ops[0][0], Op1);
12123 };
12124
// Same for the second operand; PrevBestIdx excludes the operand that was
// already chosen as best for the first operand (-1 excludes nothing).
12125 auto BuildSecondOperandCandidates =
12126 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
12127 ArrayRef<BoUpSLP::ValueList> Ops, int PrevBestIdx, Value *Op0,
12128 Value *Op1, bool IsCommutative) {
12129 if (PrevBestIdx != 1)
12130 Candidates.emplace_back(Ops[1][0], Op1);
12131 if (PrevBestIdx != 0 && IsCommutative)
12132 Candidates.emplace_back(Ops[1][0], Op0);
12133 };
12134
// Returns the index of the best pair according to R.findBestRootPair (may be
// empty); writes its score to Score and sets IsConst. A non-constant splat
// pair (first == second) overrides a constant best pair.
12135 auto FindBestCandidate =
12136 [&](ArrayRef<std::pair<Value *, Value *>> Candidates, bool &IsConst,
12137 int &Score) {
12138 auto Res = R.findBestRootPair(Candidates);
12139 Score = Res.second;
12140 IsConst =
12142 isConstant(Candidates[Res.first.value_or(0)].first) &&
12143 isConstant(Candidates[Res.first.value_or(0)].second);
12144 if (IsConst) {
12145 // Check if there are splat candidates and consider them better
12146 // option.
12147 for (const auto [Idx, P] : enumerate(Candidates)) {
12148 if (!isConstant(P.first) && !isConstant(P.second) &&
12149 P.second == P.first) {
12150 Res.first = Idx;
12151 IsConst = false;
12152 Score = isa<LoadInst>(Candidates[Res.first.value_or(0)].first)
12155 break;
12156 }
12157 }
12158 }
12159 return Res.first;
12160 };
12161
// Compare, lane by lane, how well the lane's operands match under the
// copyable model versus the original model. Any lane for which the original
// model wins makes the function return false.
12162 for (Value *V : VL) {
12163 auto *I = dyn_cast<Instruction>(V);
12164 if (!I || (I == MainOp && (!S.isAltShuffle() || I == SMain)) ||
12165 (!S.isAltShuffle() && I == SMain))
12166 continue;
12168 buildOriginalOperands(S, I == SMain ? MainOp : I, VOps);
12169 SmallVector<Value *> CopyableOps =
12170 getOperands(CopyableS, I == MainOp ? SMain : I);
// Identical operands under both models - nothing to compare for this lane.
12171 if (CopyableOps.size() == VOps.size() &&
12172 all_of(zip(CopyableOps, VOps), [&](const auto &P) {
12173 return std::get<0>(P) == std::get<1>(P)[0];
12174 }))
12175 continue;
// Candidates for the first operand: copyable-model pairs first, then
// original-model pairs. OpSize marks the boundary between the two groups.
12177 BuildFirstOperandCandidates(Candidates, MainOps, CopyableOps[0],
12178 CopyableOps[1], IsMainCommutative);
12179 const unsigned OpSize = Candidates.size();
12180 Instruction *MatchingOp =
12181 S.getMatchingMainOpOrAltOp(I) == S.getMainOp() ? SMain : SAlt;
12182 const bool IsCommutativeInst =
12183 (MatchingOp == SMain ? IsCommutative : IsAltCommutative) ||
12184 ::isCommutative(I, MatchingOp);
// An alternate-op lane whose first operand values include a binary operator
// already recorded in Operands rejects the copyable model.
12185 if (S.isAltShuffle() && MatchingOp == SAlt &&
12186 any_of(VOps, [&](const BoUpSLP::ValueList &Ops) {
12187 auto *I = dyn_cast<BinaryOperator>(Ops[0]);
12188 return I && Operands.contains(I);
12189 }))
12190 return false;
12191 if (S.isAltShuffle() && MatchingOp == SMain)
12192 Operands.insert(I->op_begin(), I->op_end());
12193 BuildFirstOperandCandidates(Candidates, Ops, VOps[0][0], VOps[1][0],
12194 IsCommutativeInst);
12195 bool IsBestConst;
12196 int Score;
12197 std::optional<int> BestOp =
12198 FindBestCandidate(Candidates, IsBestConst, Score);
// An index at or past OpSize (or no result) counts as "original is better".
12199 const bool IsOriginalBetter =
12200 static_cast<unsigned>(BestOp.value_or(OpSize)) >= OpSize;
12201 Candidates.clear();
// Repeat for the second operand, excluding what the first operand took.
12202 BuildSecondOperandCandidates(
12203 Candidates, MainOps, IsOriginalBetter ? -1 : *BestOp, CopyableOps[0],
12204 CopyableOps[1], IsMainCommutative);
12205 const unsigned SecondOpSize = Candidates.size();
12206 BuildSecondOperandCandidates(
12207 Candidates, Ops,
12208 IsOriginalBetter ? BestOp.value_or(OpSize - 1) - OpSize : -1,
12209 VOps[0][0], VOps[1][0], IsCommutativeInst);
12210 bool IsSecondBestConst;
12211 int SecondScore;
12212 std::optional<int> SecondBestOp =
12213 FindBestCandidate(Candidates, IsSecondBestConst, SecondScore);
12214 // No best candidates.
12215 if (!BestOp && !SecondBestOp)
12216 return false;
12217 // Original better in both ops combinations.
12218 const bool IsSecondOriginalBetter =
12219 static_cast<unsigned>(SecondBestOp.value_or(SecondOpSize)) >=
12220 SecondOpSize;
12221 if (IsOriginalBetter && IsSecondOriginalBetter)
12222 return false;
12223 // Original is better in second combination, but in the first combination
12224 // no best candidates.
12225 if (!BestOp && IsSecondOriginalBetter)
12226 return false;
12227 // Original is better in first combination, but in the second combination
12228 // no best candidates.
12229 if (!SecondBestOp && IsOriginalBetter)
12230 return false;
12231 // Copyable is best in the first combination, but it is constant, but
12232 // original is better in second non-constant combination.
12233 if (!IsOriginalBetter && IsBestConst && IsSecondOriginalBetter &&
12234 !IsSecondBestConst)
12235 return false;
12236 // Copyable is best in the second combination, but it is constant, but
12237 // original is better in the first non-constant combination.
12238 if (BestOp && IsOriginalBetter && !IsBestConst &&
12239 !IsSecondOriginalBetter && IsSecondBestConst)
12240 return false;
12241 // Original combination score is better.
12242 if (((Score > SecondScore ||
12244 Score == SecondScore)) &&
12245 IsOriginalBetter) ||
12246 (IsSecondOriginalBetter &&
12247 (SecondScore > Score ||
12249 Score == SecondScore))))
12250 return false;
12251 }
12252 return true;
12253 }
12254
12255public:
12256 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
12257 const TargetTransformInfo &TTI,
12258 const TargetLibraryInfo &TLI)
12259 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
12260
/// Builds the InstructionsState used to vectorize \p VL, optionally modeling
/// some lanes as copyable elements.
///
/// Outline:
///  1. Compute the regular state via getSameOpcode(), unless
///     \p SkipSameCodeCheck is set or the values are not all in one block.
///  2. If there is no regular state, try the select + zext-from-i1 pattern.
///  3. Return the regular state early when it is a target-legal alternate
///     pattern, or when copyables are disabled / not applicable.
///  4. Otherwise pick a main instruction (findAndSetMainInstruction) and build
///     a state with copyables; fall back to the original state whenever
///     isCopyablePreferable() or the profitability checks say so.
///
/// \param VL                     list of scalar values (lanes).
/// \param R                      vectorizer state used by the checks.
/// \param WithProfitabilityCheck if false, the copyable state is returned
///                               without the profitability analysis.
/// \param SkipSameCodeCheck      if true, step 1 yields an invalid state.
/// \returns the chosen state; it may be invalid.
/// NOTE(review): this listing skips numbered lines 12279, 12304 and 12348, so
/// the tail of the select/zext predicate, the guard in front of the first
/// `return S;` after the alternate check, and the declaration of `Kind` are
/// not visible here; consult the full file before editing.
12261 InstructionsState buildInstructionsState(ArrayRef<Value *> VL,
12262 const BoUpSLP &R,
12263 bool WithProfitabilityCheck = false,
12264 bool SkipSameCodeCheck = false) {
12265 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
12266 ? InstructionsState::invalid()
12267 : getSameOpcode(VL, TLI);
12268 // Check if series of selects + zext i1 %x to iN can be combined into
12269 // selects + select %x, i32 1, i32 0.
12270 Instruction *SelectOp = nullptr;
12271 if (!S && allSameBlock(VL) && all_of(VL, [&](Value *V) {
12272 if (match(V, m_Select(m_Value(), m_Value(), m_Value()))) {
12273 if (!SelectOp)
12274 SelectOp = cast<Instruction>(V);
12275 return true;
12276 }
12277 auto *ZExt = dyn_cast<ZExtInst>(V);
12278 return (ZExt && ZExt->getSrcTy()->isIntegerTy(1)) ||
12280 })) {
// The first select found acts as both main and alternate instruction.
12281 if (SelectOp)
12282 return InstructionsState(SelectOp, SelectOp);
12283 }
12284 if (S && S.isAltShuffle()) {
12285 Type *ScalarTy = S.getMainOp()->getType();
12286 auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, VL.size()));
12287 unsigned Opcode0 = S.getOpcode();
12288 unsigned Opcode1 = S.getAltOpcode();
12289 SmallBitVector OpcodeMask(
12290 getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
12291 // If this pattern is supported by the target then we consider the order.
12292 if (TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
12293 return S;
// Keep a valid non-alternate state as is when copyables are disabled, the
// main op is not a binary operator, or every instruction already has the
// state's opcode (Shl is also accepted when the opcode is Add).
12294 } else if (S && (!VectorizeCopyableElements ||
12295 !isa<BinaryOperator>(S.getMainOp()) ||
12296 all_of(VL, [&](Value *V) {
12297 auto *I = dyn_cast<Instruction>(V);
12298 return !I || I->getOpcode() == S.getOpcode() ||
12299 (S.getOpcode() == Instruction::Add &&
12300 I->getOpcode() == Instruction::Shl);
12301 }))) {
12302 return S;
12303 }
12305 return S;
12306 findAndSetMainInstruction(VL, R);
12307 if (!MainOp)
12308 return S;
12309 InstructionsState OrigS = S;
12310 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
12311 if (OrigS && !isCopyablePreferable(VL, R, OrigS, S))
12312 return OrigS;
12313 if (!WithProfitabilityCheck)
12314 return S;
12315 // Check if it is profitable to vectorize the instruction.
12316 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
// Adds the pair (V1, V2) as a root-pair candidate unless V1 is a PHI that
// differs from V2, or both are instructions with the same opcode in
// different blocks. If neither is an instruction, (V1, V1) is added.
12317 auto BuildCandidates =
12318 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
12319 Value *V2) {
12320 if (V1 != V2 && isa<PHINode>(V1))
12321 return;
12322 auto *I1 = dyn_cast<Instruction>(V1);
12323 auto *I2 = dyn_cast<Instruction>(V2);
12324 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
12325 I1->getParent() != I2->getParent())
12326 return;
12327 Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
12328 };
// Two-lane bundles: require a best root pair for both operands (also trying
// the swapped pairing for commutative ops) and a vector cost that does not
// exceed the scalar cost of the main instruction.
12329 if (VL.size() == 2) {
12330 // Check if the operands allow better vectorization.
12331 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
12332 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
12333 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
12334 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
12335 R.findBestRootPair(Candidates1).first &&
12336 R.findBestRootPair(Candidates2).first;
12337 if (!Res && isCommutative(MainOp)) {
12338 Candidates1.clear();
12339 Candidates2.clear();
12340 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
12341 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
12342 Res = !Candidates1.empty() && !Candidates2.empty() &&
12343 R.findBestRootPair(Candidates1).first &&
12344 R.findBestRootPair(Candidates2).first;
12345 }
12346 if (!Res)
12347 return OrigS;
12349 InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
12350 InstructionCost VectorCost;
12351 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
// The case list mirrors the opcodes accepted by isSupportedOpcode().
12352 switch (MainOpcode) {
12353 case Instruction::Add:
12354 case Instruction::Sub:
12355 case Instruction::LShr:
12356 case Instruction::Shl:
12357 case Instruction::SDiv:
12358 case Instruction::UDiv:
12359 case Instruction::And:
12360 case Instruction::Or:
12361 case Instruction::Xor:
12362 case Instruction::FAdd:
12363 case Instruction::FMul:
12364 case Instruction::FSub:
12365 case Instruction::FDiv:
12366 VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
12367 break;
12368 default:
12369 llvm_unreachable("Unexpected instruction.");
12370 }
12371 if (VectorCost > ScalarCost)
12372 return OrigS;
12373 return S;
12374 }
12375 assert(Operands.size() == 2 && "Unexpected number of operands!");
12376 unsigned CopyableNum =
12377 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
// Fewer than half of the lanes are copyable - accept without more checks.
12378 if (CopyableNum < VL.size() / 2)
12379 return S;
12380 // Too many phi copyables - exit.
12381 const unsigned Limit = VL.size() / 24;
12382 if ((CopyableNum >= VL.size() - Limit ||
12383 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
12384 CopyableNum >= MaxPHINumOperands) &&
12385 all_of(VL, [&](Value *V) {
12386 return isa<PHINode>(V) || !S.isCopyableElement(V);
12387 }))
12388 return OrigS;
12389 // Check profitability if number of copyables >= VL.size() / 2.
12390 // 1. Reorder operands for better matching.
12391 if (isCommutative(MainOp)) {
12392 Value *BestFrontOp = nullptr;
12393 for (auto [OpL, OpR] : zip(Operands.front(), Operands.back())) {
12394 // Make instructions the first operands.
12395 if (!isa<Instruction>(OpL) && isa<Instruction>(OpR)) {
12396 BestFrontOp = OpR;
12397 std::swap(OpL, OpR);
12398 continue;
12399 }
12400 // Make constants the second operands.
12401 if ((isa<Constant>(OpL) && !match(OpR, m_Zero())) ||
12402 match(OpL, m_Zero())) {
12403 if (isa<Instruction>(OpR))
12404 BestFrontOp = OpR;
12405 std::swap(OpL, OpR);
12406 continue;
12407 }
12408 if (isa<Instruction>(OpL))
12409 BestFrontOp = OpL;
12410 }
12411 // If some of the RHS operands better match most of LHS - swap such
12412 // operands to increase matching rate.
12413 if (auto *BestLHS = dyn_cast_if_present<Instruction>(BestFrontOp)) {
12414 const unsigned BestOpcode = BestLHS->getOpcode();
12415 for (auto [OpL, OpR] : zip(Operands.front(), Operands.back())) {
12416 auto *OpRI = dyn_cast<Instruction>(OpR);
12417 if (!OpRI)
12418 continue;
12419 if (OpRI->getOpcode() == BestOpcode)
12420 std::swap(OpL, OpR);
12421 }
12422 }
12423 }
12424 // 2. Check, if operands can be vectorized.
// More than one instruction left among the second operands - give up.
12425 if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
12426 return OrigS;
// Accepts an operand list that is all-constant, a splat, an "almost" splat,
// or that itself yields a valid state with at most VL.size() / 2 copyables.
12427 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
12428 if (allConstant(Ops) || isSplat(Ops))
12429 return true;
12430 // Check if it is "almost" splat, i.e. has >= 4 elements and only single
12431 // one is different.
12432 constexpr unsigned Limit = 4;
12433 if (Operands.front().size() >= Limit) {
12434 SmallDenseMap<const Value *, unsigned> Counters;
12435 for (Value *V : Ops) {
12436 if (isa<UndefValue>(V))
12437 continue;
12438 ++Counters[V];
12439 }
12440 if (Counters.size() == 2 &&
12441 any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
12442 return C.second == 1;
12443 }))
12444 return true;
12445 }
12446 // First operand not a constant or splat? Last attempt - check for
12447 // potential vectorization.
// A fresh analysis object is used, so this object's MainOp/MainOpcode are
// not modified by the nested query.
12448 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12449 InstructionsState OpS = Analysis.buildInstructionsState(Ops, R);
12450 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
12451 return false;
12452 unsigned CopyableNum =
12453 count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
12454 return CopyableNum <= VL.size() / 2;
12455 };
12456 if (!CheckOperand(Operands.front()))
12457 return OrigS;
12458
12459 return S;
12460 }
12461
/// Builds the per-operand value lists for \p VL under state \p S.
///
/// For a state with copyable elements, the operands of each lane come from
/// getOperands() and, if the main op is a commutative two-operand
/// instruction, the operands of non-copyable lanes are normalized (see the
/// two steps below). For any other state the work is delegated to
/// buildOriginalOperands().
///
/// \param S  valid state (asserted).
/// \param VL list of scalar values (lanes).
/// \returns one value list per operand position.
/// Side effect: for states with copyables, MainOp and MainOpcode are
/// overwritten from \p S.
12462 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
12463 ArrayRef<Value *> VL) {
12464 assert(S && "Invalid state!");
// NOTE(review): the declaration of `Operands` (numbered line 12465) is not
// visible in this listing; consult the full file.
12466 if (S.areInstructionsWithCopyableElements()) {
12467 MainOp = S.getMainOp();
12468 MainOpcode = S.getOpcode();
12469 const bool IsCommutative =
12470 isCommutative(MainOp) && MainOp->getNumOperands() == 2;
12471 Operands.assign(MainOp->getNumOperands(),
12472 BoUpSLP::ValueList(VL.size(), nullptr));
12473 // Populate operands for every lane.
12474 for (auto [Idx, V] : enumerate(VL)) {
12475 SmallVector<Value *> OperandsForValue = getOperands(S, V);
12476 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
12477 Operands[OperandIdx][Idx] = Operand;
12478 }
12479 // Operand-order normalization below swaps OpIdx 0 and OpIdx 1
12480 // of non-copyable lanes. That is only safe when the main op is
12481 // commutative (e.g. 0 - X is not X - 0, so `sub` must be
12482 // excluded).
12483 if (IsCommutative) {
12484 // Count (ID0, ID1) pair frequencies for operand normalization.
12485 // Pairs and their inverses are tracked under a canonical key
12486 // so that (Load, Add) and (Add, Load) contribute to the same
12487 // bucket.
12488 struct PairInfo {
// Lanes seen as (smaller ID, larger ID).
12489 unsigned FwdCount = 0;
// Lanes seen as (larger ID, smaller ID).
12490 unsigned RevCount = 0;
12491 };
12492 SmallMapVector<std::pair<unsigned, unsigned>, PairInfo, 8> PairCounts;
12493 unsigned MajID0 = 0, MajID1 = 0;
12494 for (auto [Idx, V] : enumerate(VL)) {
12495 if (S.isCopyableElement(V) || isa<PoisonValue>(V))
12496 continue;
12497 unsigned ID0 = Operands[0][Idx]->getValueID();
12498 unsigned ID1 = Operands[1][Idx]->getValueID();
// Lanes whose operands have the same value ID carry no ordering information.
12499 if (ID0 == ID1)
12500 continue;
12501 unsigned MinID = std::min(ID0, ID1);
12502 unsigned MaxID = std::max(ID0, ID1);
12503 auto [It, Inserted] =
12504 PairCounts.try_emplace(std::make_pair(MinID, MaxID));
12505 PairInfo &Info = It->second;
12506 if (ID0 < ID1)
12507 ++Info.FwdCount;
12508 else
12509 ++Info.RevCount;
12510 }
12511 // Find the most frequent (ID0, ID1) pair across non-copyable
12512 // lanes. Select the orientation (original or inverse) that
12513 // has more votes as the majority pattern.
12514 unsigned BestCount = 0;
12515 for (const auto &P : PairCounts) {
12516 const PairInfo &Info = P.second;
12517 unsigned Total = Info.FwdCount + Info.RevCount;
// Strict comparison: on equal totals the earlier-inserted pair is kept.
12518 if (Total > BestCount) {
12519 BestCount = Total;
12520 if (Info.FwdCount >= Info.RevCount) {
12521 MajID0 = P.first.first;
12522 MajID1 = P.first.second;
12523 } else {
12524 MajID0 = P.first.second;
12525 MajID1 = P.first.first;
12526 }
12527 }
12528 }
12529 // Normalize non-copyable lanes in two steps:
12530 // 1) Swap lanes whose operand types are the exact inverse of
12531 // the majority pattern, making the non-copyable lanes
12532 // consistent.
12533 // 2) Independently, if a strict majority of non-copyable lanes
12534 // have loads at OpIdx 1, swap those lanes to put loads at
12535 // OpIdx 0 for better downstream vectorization.
// LAt0/LAt1 count loads at operand 0/1 after step 1; TotalNC counts the
// non-copyable, non-poison lanes.
12536 unsigned LAt0 = 0, LAt1 = 0, TotalNC = 0;
12537 for (auto [Idx, V] : enumerate(VL)) {
12538 if (S.isCopyableElement(V) || isa<PoisonValue>(V))
12539 continue;
12540 // Step 1: swap exact-inverse lanes.
12541 if (BestCount > 0) {
12542 unsigned ID0 = Operands[0][Idx]->getValueID();
12543 unsigned ID1 = Operands[1][Idx]->getValueID();
12544 if (ID0 == MajID1 && ID1 == MajID0)
12545 std::swap(Operands[0][Idx], Operands[1][Idx]);
12546 }
12547 ++TotalNC;
12548 LAt0 += isa<LoadInst>(Operands[0][Idx]);
12549 LAt1 += isa<LoadInst>(Operands[1][Idx]);
12550 }
12551 // Step 2: if most non-copyable lanes have loads at OpIdx 1,
12552 // swap those lanes to put loads at OpIdx 0.
12553 if (TotalNC > 1 && LAt1 > LAt0 && LAt1 * 2 > TotalNC) {
12554 for (auto [Idx, V] : enumerate(VL)) {
12555 if (S.isCopyableElement(V) || isa<PoisonValue>(V))
12556 continue;
// Only lanes with a load at operand 1 and a non-load at operand 0 move.
12557 if (!isa<LoadInst>(Operands[0][Idx]) &&
12558 isa<LoadInst>(Operands[1][Idx]))
12559 std::swap(Operands[0][Idx], Operands[1][Idx]);
12560 }
12561 }
12562 }
12563 } else {
12564 buildOriginalOperands(S, VL, Operands);
12565 }
12566 return Operands;
12567 }
12568};
12569} // namespace
12570
/// Decides whether the scalars in \p VL may become a vectorized tree node or
/// must be gathered.
///
/// The instruction state for \p VL is computed through
/// InstructionsCompatibilityAnalysis::buildInstructionsState() and then a
/// sequence of rejection checks is applied. The first check that fires returns
/// a ScalarsVectorizationLegality with IsLegal=false (a few of them also pass
/// explicit TryToFindDuplicates / TrySplitVectorize flags); if none fires the
/// result carries IsLegal=true.
///
/// \param VL          The candidate bundle of scalars.
/// \param Depth       Current recursion depth, compared with
///                    RecursionMaxDepth.
/// \param UserTreeIdx Edge to the user tree entry; its UserTE is inspected to
///                    detect a ScatterVectorize user.
///
/// NOTE(review): this listing appears to be missing several source lines
/// (parts of some conditions below are not visible), so the comments added here
/// describe only what the visible code shows.
12571BoUpSLP::ScalarsVectorizationLegality
12572BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
12573 const EdgeInfo &UserTreeIdx) const {
12574 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
12575
12576 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
12577 InstructionsState S = Analysis.buildInstructionsState(
12578 VL, *this, /*WithProfitabilityCheck=*/true);
12579
      // Without a valid state the only shape still accepted is a bundle of
      // pointers under a ScatterVectorize user: more than two lanes, every GEP
      // lane has exactly two operands and all GEPs share one block, non-GEP
      // lanes satisfy doesNotNeedToBeScheduled(), at least one GEP is present
      // (BB != nullptr) and sortPtrAccesses() succeeds.
12580 bool AreScatterAllGEPSameBlock = false;
12581 if (!S) {
12582 SmallVector<unsigned> SortedIndices;
12583 BasicBlock *BB = nullptr;
12584 bool IsScatterVectorizeUserTE =
12585 UserTreeIdx.UserTE &&
12586 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
12587 AreScatterAllGEPSameBlock =
12588 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
12589 VL.size() > 2 &&
12590 all_of(VL,
12591 [&BB](Value *V) {
12592 auto *I = dyn_cast<GetElementPtrInst>(V);
12593 if (!I)
12594 return doesNotNeedToBeScheduled(V);
12595 if (!BB)
12596 BB = I->getParent();
12597 return BB == I->getParent() && I->getNumOperands() == 2;
12598 }) &&
12599 BB &&
12600 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
12601 *SE, SortedIndices));
      // Not the scatter/GEP shape: report illegal, but ask the caller to look
      // for duplicates and to attempt a split node.
12602 if (!AreScatterAllGEPSameBlock) {
12603 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
12604 "C,S,B,O, small shuffle. \n";
12605 dbgs() << "[";
12606 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
12607 dbgs() << "]\n");
12608 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
12609 /*TryToFindDuplicates=*/true,
12610 /*TrySplitVectorize=*/true);
12611 }
12612 // Reset S to make it GetElementPtr kind of node.
12613 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
12614 assert(It != VL.end() && "Expected at least one GEP.");
12615 S = getSameOpcode(*It, *TLI);
12616 }
12617 assert(S && "Must be valid.");
12618
12619 // Gather very wide PHI bundles. Wide PHIs (e.g. produced by
12620 // jump threading) are not profitable to vectorize and make this analysis
12621 // explode, so gather them to keep the compile time bounded.
      // NOTE(review): the right-hand side of the comparison below (the
      // threshold) is not visible in this listing.
12622 if (S.getOpcode() == Instruction::PHI) {
12623 unsigned NumIncomingValues =
12624 cast<PHINode>(S.getMainOp())->getNumIncomingValues();
12625 if (static_cast<uint64_t>(VL.size()) * NumIncomingValues >
12627 LLVM_DEBUG(dbgs() << "SLP: Gathering due to wide PHI operand fan-out ("
12628 << VL.size() << " lanes x " << NumIncomingValues
12629 << " incoming values).\n");
12630 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12631 }
12632 }
12633
12634 // Don't handle vectors.
12635 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
12636 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
12637 // Do not try to pack to avoid extra instructions here.
12638 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
12639 /*TryToFindDuplicates=*/false);
12640 }
12641
12642 // Check that all of the users of the scalars that we want to vectorize are
12643 // schedulable.
12644 BasicBlock *BB = S.getMainOp()->getParent();
12645
      // NOTE(review): the opening of this condition is not visible in this
      // listing; only the reachability test on BB can be seen.
12647 !DT->isReachableFromEntry(BB)) {
12648 // Don't go into unreachable blocks. They may contain instructions with
12649 // dependency cycles which confuse the final scheduling.
12650 // Do not vectorize EH and non-returning blocks, not profitable in most
12651 // cases.
12652 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
12653 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12654 }
12655
12656 // Don't go into catchswitch blocks, which can happen with PHIs.
12657 // Such blocks can only have PHIs and the catchswitch. There is no
12658 // place to insert a shuffle if we need to, so just avoid that issue.
12660 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
12661 // Do not try to pack to avoid extra instructions here.
12662 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
12663 /*TryToFindDuplicates=*/false);
12664 }
12665
12666 // Don't handle scalable vectors
12667 if (S.getOpcode() == Instruction::ExtractElement &&
12669 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
12670 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
12671 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12672 }
12673
12674 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
12675 // a load), in which case peek through to include it in the tree, without
12676 // ballooning over-budget.
      // The exception applies only to non-alternate bundles with at least four
      // lanes; alternate-opcode or narrower bundles are gathered at the limit.
12677 if (Depth >= RecursionMaxDepth &&
12678 (S.isAltShuffle() || VL.size() < 4 ||
12679 !(match(S.getMainOp(), m_Load(m_Value())) ||
12680 all_of(VL, [&S](const Value *I) {
12681 return match(I,
12683 cast<Instruction>(I)->getOpcode() == S.getOpcode();
12684 })))) {
12685 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
12686 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12687 }
12688
12689 // Check if this is a duplicate of another entry.
12690 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
12691 // Cache invariants to avoid recomputing for every V in VL (and every E).
12692 const bool IsPHIWithLoop =
12693 S.getOpcode() == Instruction::PHI &&
12694 LI->getLoopFor(S.getMainOp()->getParent()) != nullptr;
      // For each existing entry that contains the main op: reject an exact
      // match (E->isSame(VL)), and reject a bundle whose every lane is poison,
      // already a scalar of E, or (for PHIs inside a loop) an already
      // vectorized PHI.
12695 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
12696 if (E->isSame(VL)) {
12697 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
12698 << ".\n");
12699 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12700 }
12701 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
12702 if (all_of(VL, [&](Value *V) {
12703 return isa<PoisonValue>(V) || Values.contains(V) ||
12704 (IsPHIWithLoop && isa<PHINode>(V) && isVectorized(V));
12705 })) {
12706 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
12707 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12708 }
12709 }
12710
      // NOTE(review): as written, AreAllSameInsts is (!X || X) and therefore
      // always true, so the first disjunct of the condition below never fires.
      // Parts of that condition are not visible in this listing.
12711 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
12712 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
12713 if (!AreAllSameInsts || isSplat(VL) ||
12715 S.getMainOp()) &&
12717 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O conditions. \n";
12718 dbgs() << "[";
12719 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
12720 dbgs() << "]\n");
12721 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12722 }
12723
12724 // Don't vectorize ephemeral values.
12725 if (!EphValues.empty()) {
12726 for (Value *V : VL) {
12727 if (EphValues.count(V)) {
12728 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
12729 << ") is ephemeral.\n");
12730 // Do not try to pack to avoid extra instructions here.
12731 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
12732 /*TryToFindDuplicates=*/false);
12733 }
12734 }
12735 }
12736
12737 // We now know that this is a vector of instructions of the same type from
12738 // the same block.
12739
12740 // Check that none of the instructions in the bundle are already in the tree
12741 // and the node may be not profitable for the vectorization as the small
12742 // alternate node.
12743 if (S.isAltShuffle()) {
      // Builds two per-lane bitmasks. Vectorized starts with all bits set and
      // has the bit cleared for every lane that is already vectorized.
      // Extracted gets the bit set for lanes that are not vectorized, have
      // more than one user and whose users are not all vectorized. Lanes that
      // are not instructions, need no scheduling, or whose operands are all
      // extractelement instructions are left untouched in both masks.
12744 auto GetNumVectorizedExtracted = [&]() {
12745 APInt Extracted = APInt::getZero(VL.size());
12746 APInt Vectorized = APInt::getAllOnes(VL.size());
12747 for (auto [Idx, V] : enumerate(VL)) {
12748 auto *I = dyn_cast<Instruction>(V);
12749 if (!I || doesNotNeedToBeScheduled(I) ||
12750 all_of(I->operands(), [&](const Use &U) {
12751 return isa<ExtractElementInst>(U.get());
12752 }))
12753 continue;
12754 if (isVectorized(I))
12755 Vectorized.clearBit(Idx);
12756 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
12757 Extracted.setBit(Idx);
12758 }
12759 return std::make_pair(Vectorized, Extracted);
12760 };
12761 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
      // With exactly two lanes and at least one of them already vectorized,
      // scalarization is preferred outright; otherwise, if any lane is already
      // vectorized, the choice is made by the rough cost comparison below.
      // NOTE(review): `Kind` used below is not declared in the visible lines.
12763 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
12764 if (!Vectorized.isAllOnes() && !PreferScalarize) {
12765 // Rough cost estimation, if the vector code (+ potential extracts) is
12766 // more profitable than the scalar + buildvector.
12767 Type *ScalarTy = VL.front()->getType();
12768 auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, VL.size()));
12769 InstructionCost VectorizeCostEstimate =
12770 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
12771 ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
12772 /*Insert=*/false, /*Extract=*/true, Kind);
12773 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
12774 *TTI, ScalarTy, VecTy, Vectorized,
12775 /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
12776 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
12777 }
12778 if (PreferScalarize) {
12779 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
12780 "node is not profitable.\n");
12781 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12782 }
12783 }
12784
12785 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
12786 if (UserIgnoreList && !UserIgnoreList->empty()) {
12787 for (Value *V : VL) {
12788 if (UserIgnoreList->contains(V)) {
12789 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
12790 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
12791 }
12792 }
12793 }
12794
      // None of the rejection checks fired.
12795 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
12796}
12797
12798void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
12799 const EdgeInfo &UserTreeIdx,
12800 unsigned InterleaveFactor) {
12801 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
12802
12803 SmallVector<int> ReuseShuffleIndices;
12804 SmallVector<Value *> VL(VLRef);
12805
12806 // Tries to build split node.
      // Returns false when canBuildSplitNode() rejects VL for LocalState.
      // Otherwise a SplitVectorize entry is created for VL (with the
      // ReorderIndices filled in by canBuildSplitNode()), one child node is
      // added per half (Op1, Op2) and true is returned.
12807 auto TrySplitNode = [&](const InstructionsState &LocalState) {
12808 SmallVector<Value *> Op1, Op2;
12809 OrdersType ReorderIndices;
12810 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
12811 return false;
12812
12813 auto Invalid = ScheduleBundle::invalid();
12814 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
12815 UserTreeIdx, {}, ReorderIndices);
12816 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
      // Adds the child node for one half. In both branches the pair
      // (VectorizableTree.size(), offset) is recorded first, where offset is 0
      // for the first half and Op1.size() for the second; presumably the first
      // element is the index the child entry is about to receive.
12817 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
12818 InstructionsState S = getSameOpcode(Op, *TLI);
      // Halves that are loads, or that getSameValuesTreeEntry() already finds
      // with the same VF, become gather nodes; everything else recurses at the
      // same Depth.
12819 if (S && (isa<LoadInst>(S.getMainOp()) ||
12820 getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
12821 // Build gather node for loads, they will be gathered later.
12822 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
12823 Idx == 0 ? 0 : Op1.size());
12824 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
12825 } else {
12826 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
12827 Idx == 0 ? 0 : Op1.size());
12828 buildTreeRec(Op, Depth, {TE, Idx});
12829 }
12830 };
12831 AddNode(Op1, 0);
12832 AddNode(Op2, 1);
12833 return true;
12834 };
12835
      // Returns true when VL contains at least one non-poison constant and
      // every remaining lane is either poison or a PHI node. Poison lanes are
      // skipped before the constant test, so they do not count as constants.
      // A bundle of only PHIs and/or poison yields false.
12836 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
12837 bool AreConsts = false;
12838 for (Value *V : VL) {
12839 if (isa<PoisonValue>(V))
12840 continue;
12841 if (isa<Constant>(V)) {
12842 AreConsts = true;
12843 continue;
12844 }
      // Anything that is neither poison, constant nor PHI disqualifies VL.
12845 if (!isa<PHINode>(V))
12846 return false;
12847 }
12848 return AreConsts;
12849 };
12850 if (AreOnlyConstsWithPHIs(VL)) {
12851 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
12852 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
12853 return;
12854 }
12855
12856 ScalarsVectorizationLegality Legality =
12857 getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
12858 InstructionsState S = Legality.getInstructionsState();
12859 if (!Legality.isLegal()) {
12860 if (Legality.trySplitVectorize()) {
12861 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
12862 // Last chance to try to vectorize alternate node.
12863 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
12864 return;
12865 }
12866 if (Legality.tryToFindDuplicates())
12867 (void)tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
12868 UserTreeIdx, *this);
12869
12870 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12871 return;
12872 }
12873
12874 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
12875 if (S.isAltShuffle() && TrySplitNode(S))
12876 return;
12877
12878 // Check that every instruction appears once in this bundle.
12879 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
12880 *this, /*BuildGatherOnly=*/false)) {
12881 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12882 return;
12883 }
12884
12885 // Perform specific checks for each particular instruction kind.
12886 bool IsScatterVectorizeUserTE =
12887 UserTreeIdx.UserTE &&
12888 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
12889 OrdersType CurrentOrder;
12890 SmallVector<Value *> PointerOps;
12891 StridedPtrInfo SPtrInfo;
12892 TreeEntry::EntryState State = getScalarsVectorizationState(
12893 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
12894 if (State == TreeEntry::NeedToGather) {
12895 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12896 return;
12897 }
12898
12899 // Check the loop nest. We need to be sure we handle a single loop nest at a
12900 // time to avoid incorrect cost estimation because of the loop aware cost
12901 // model.
12902 if (VectorizableTree.empty()) {
12903 assert(CurrentLoopNest.empty() && "Expected empty loop nest");
12904 // Process the first node? Initial fill of the loop nest.
12905 BasicBlock *Parent = S.getMainOp()->getParent();
12906 if (const Loop *L = LI->getLoopFor(Parent)) {
12908 if (L)
12909 CurrentLoopNest.assign(getLoopNest(L));
12910 }
12911 } else if (!UserTreeIdx ||
12912 UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize ||
12913 UserTreeIdx.UserTE->isGather() ||
12914 UserTreeIdx.UserTE->getMainOp()->getParent() !=
12915 S.getMainOp()->getParent()) {
12916 BasicBlock *Parent = S.getMainOp()->getParent();
12917 if (const Loop *L = LI->getLoopFor(Parent)) {
12918 // Check that the new loop nest shares the same outer structure as the
12919 // tree's current loop nest. Completely disjoint nests (different
12920 // outermost loops) are forced to gather because their scales cannot be
12921 // meaningfully combined. Sibling inner loops (inside a common outer
12922 // loop or outside any loops at all) are allowed: the cost model scales
12923 // each entry by its own loop via getScaleToLoopIterations(), so a tree
12924 // that spans sibling inner loops (e.g. a PHI at their merge block) can
12925 // still be costed correctly. Contract CurrentLoopNest to the longest
12926 // common prefix with the new entry's nest so subsequent entries in yet
12927 // another sibling can also be admitted.
12929 if (L) {
12930 SmallVector<const Loop *> NewLoopNest(getLoopNest(L));
12931 unsigned CommonLen = 0;
12932 for (const auto [L1, L2] : zip(CurrentLoopNest, NewLoopNest)) {
12933 if (L1 != L2)
12934 break;
12935 ++CommonLen;
12936 }
12937 auto ValidateMergedBTCs = [&](unsigned StartDepth) -> bool {
12938 unsigned EndDepth =
12939 std::min<unsigned>(NewLoopNest.size(), MergedLoopBTCs.size());
12940 for (unsigned D = StartDepth; D < EndDepth; ++D) {
12941 const SCEV *Constraint = MergedLoopBTCs[D];
12942 if (!Constraint)
12943 continue;
12944 const SCEV *NewBTC = SE->getBackedgeTakenCount(NewLoopNest[D]);
12945 if (isa<SCEVCouldNotCompute>(NewBTC) || NewBTC != Constraint)
12946 return false;
12947 }
12948 return true;
12949 };
12950 auto BailOutToGather = [&]() {
12952 << "SLP: Sibling loops have different trip counts.\n");
12953 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12954 };
12955 if (CurrentLoopNest.empty()) {
12956 if (!ValidateMergedBTCs(0)) {
12957 BailOutToGather();
12958 return;
12959 }
12960 CurrentLoopNest.assign(NewLoopNest);
12961 } else if (CommonLen < CurrentLoopNest.size() &&
12962 CommonLen < NewLoopNest.size()) {
12963 // Divergence below the common prefix: the tree now spans sibling
12964 // loops at depth CommonLen. Admitting them into one tree makes
12965 // the profitability decision JOINT across both siblings, so a
12966 // very hot sibling could otherwise let an unprofitable cold
12967 // sibling ride along "for free" (per-entry scaling of the cold
12968 // sibling's entries would be dwarfed by the hot one). Require
12969 // SCEV-proven equal backedge-taken counts for the diverging
12970 // siblings before joining; otherwise force gather.
12971 const Loop *SibA = CurrentLoopNest[CommonLen];
12972 const Loop *SibB = NewLoopNest[CommonLen];
12973 const SCEV *BecA = SE->getBackedgeTakenCount(SibA);
12974 const SCEV *BecB = SE->getBackedgeTakenCount(SibB);
12975 if (isa<SCEVCouldNotCompute>(BecA) || BecA != BecB) {
12976 BailOutToGather();
12977 return;
12978 }
12979 if (!ValidateMergedBTCs(CommonLen + 1)) {
12980 BailOutToGather();
12981 return;
12982 }
12983 if (MergedLoopBTCs.size() <= CommonLen)
12984 MergedLoopBTCs.resize(CommonLen + 1, nullptr);
12985 MergedLoopBTCs[CommonLen] = BecA;
12986 CurrentLoopNest.truncate(CommonLen);
12987 } else if (NewLoopNest.size() > CurrentLoopNest.size()) {
12988 if (!ValidateMergedBTCs(CurrentLoopNest.size())) {
12989 BailOutToGather();
12990 return;
12991 }
12992 CurrentLoopNest.append(
12993 std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
12994 NewLoopNest.end());
12995 }
12996 // Otherwise NewLoopNest is a prefix of CurrentLoopNest: keep as-is.
12997 }
12998 }
12999 }
13000
13001 Instruction *VL0 = S.getMainOp();
13002 BasicBlock *BB = VL0->getParent();
13003 auto &BSRef = BlocksSchedules[BB];
13004 if (!BSRef)
13005 BSRef = std::make_unique<BlockScheduling>(BB);
13006
13007 BlockScheduling &BS = *BSRef;
13008
13009 SetVector<Value *> UniqueValues(llvm::from_range, VL);
13010 std::optional<ScheduleBundle *> BundlePtr =
13011 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
13012#ifdef EXPENSIVE_CHECKS
13013 // Make sure we didn't break any internal invariants
13014 BS.verify();
13015#endif
13016 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
13017 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
13018 // Last chance to try to vectorize alternate node.
13019 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
13020 return;
13021 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
13022 NonScheduledFirst.insert(VL.front());
13023 if (S.getOpcode() == Instruction::Load &&
13024 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
13026 return;
13027 }
13028 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
13029 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
13030 ScheduleBundle Empty;
13031 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
13032 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
13033
13034 unsigned ShuffleOrOp =
13035 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
      // Builds the child nodes of TE, one per non-empty operand list, at
      // Depth + 1. Operand lists whose common state is a non-alternate PHI are
      // only recorded in the first pass and built in a second pass, after all
      // other operands.
13036 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
13037 // Postpone PHI nodes creation
13038 SmallVector<unsigned> PHIOps;
13039 for (unsigned I : seq<unsigned>(Operands.size())) {
13040 ArrayRef<Value *> Op = Operands[I];
13041 if (Op.empty())
13042 continue;
13043 InstructionsState S = getSameOpcode(Op, *TLI);
      // Build immediately unless this operand is a plain (non-alternate) PHI
      // bundle.
13044 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
13045 buildTreeRec(Op, Depth + 1, {TE, I});
13046 else
13047 PHIOps.push_back(I);
13048 }
13049 for (unsigned I : PHIOps)
13050 buildTreeRec(Operands[I], Depth + 1, {TE, I});
13051 };
13052 switch (ShuffleOrOp) {
13053 case Instruction::PHI: {
13054 TreeEntry *TE =
13055 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
13056 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
13057 TE->dump());
13058
13059 TE->setOperands(Operands);
13060 CreateOperandNodes(TE, Operands);
13061 return;
13062 }
13063 case Instruction::ExtractValue:
13064 case Instruction::ExtractElement: {
13065 if (CurrentOrder.empty()) {
13066 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
13067 } else {
13068 LLVM_DEBUG({
13069 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
13070 "with order";
13071 for (unsigned Idx : CurrentOrder)
13072 dbgs() << " " << Idx;
13073 dbgs() << "\n";
13074 });
13075 fixupOrderingIndices(CurrentOrder);
13076 }
13077 // Insert new order with initial value 0, if it does not exist,
13078 // otherwise return the iterator to the existing one.
13079 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13080 ReuseShuffleIndices, CurrentOrder);
13081 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
13082 "(ExtractValueInst/ExtractElementInst).\n";
13083 TE->dump());
13084 // This is a special case, as it does not gather, but at the same time
13085 // we are not extending buildTreeRec() towards the operands.
13086 TE->setOperands(Operands);
13087 if (ShuffleOrOp == Instruction::ExtractValue) {
13088 SmallVector<unsigned> Indices;
13090 if (checkEVsForVecCalls(VL, S, *TLI, Indices, Calls)) {
13091 TE->StructEVIndices = std::move(Indices);
13092 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
13093 }
13094 }
13095 return;
13096 }
13097 case Instruction::InsertElement: {
13098 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
13099
13100 auto OrdCompare = [](const std::pair<int, int> &P1,
13101 const std::pair<int, int> &P2) {
13102 return P1.first > P2.first;
13103 };
13104 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
13105 decltype(OrdCompare)>
13106 Indices(OrdCompare);
13107 for (int I = 0, E = VL.size(); I < E; ++I) {
13108 unsigned Idx = *getElementIndex(VL[I]);
13109 Indices.emplace(Idx, I);
13110 }
13111 OrdersType CurrentOrder(VL.size(), VL.size());
13112 bool IsIdentity = true;
13113 for (int I = 0, E = VL.size(); I < E; ++I) {
13114 CurrentOrder[Indices.top().second] = I;
13115 IsIdentity &= Indices.top().second == I;
13116 Indices.pop();
13117 }
13118 if (IsIdentity)
13119 CurrentOrder.clear();
13120 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13121 {}, CurrentOrder);
13122 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
13123 TE->dump());
13124
13125 TE->setOperands(Operands);
13126 buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
13127 return;
13128 }
13129 case Instruction::Load: {
13130 // Check that a vectorized load would load the same memory as a scalar
13131 // load. For example, we don't want to vectorize loads that are smaller
13132 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
13133 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
13134 // from such a struct, we read/write packed bits disagreeing with the
13135 // unvectorized version.
13136 TreeEntry *TE = nullptr;
13137 fixupOrderingIndices(CurrentOrder);
13138 switch (State) {
13139 case TreeEntry::Vectorize:
13140 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13141 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
13142 if (CurrentOrder.empty())
13143 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
13144 TE->dump());
13145 else
13147 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
13148 TE->dump());
13149 break;
13150 case TreeEntry::CompressVectorize:
13151 // Vectorizing non-consecutive loads with (masked)load + compress.
13152 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
13153 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
13154 LLVM_DEBUG(
13155 dbgs()
13156 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
13157 TE->dump());
13158 break;
13159 case TreeEntry::StridedVectorize:
13160 // Vectorizing non-consecutive loads as a single strided load.
13161 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
13162 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
13163 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
13164 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
13165 TE->dump());
13166 break;
13167 case TreeEntry::ScatterVectorize:
13168 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
13169 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
13170 UserTreeIdx, ReuseShuffleIndices);
13171 LLVM_DEBUG(
13172 dbgs()
13173 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
13174 TE->dump());
13175 break;
13176 case TreeEntry::CombinedVectorize:
13177 case TreeEntry::SplitVectorize:
13178 case TreeEntry::NeedToGather:
13179 llvm_unreachable("Unexpected loads state.");
13180 }
13181 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
13182 assert(Operands.size() == 1 && "Expected a single operand only");
13183 SmallVector<int> Mask;
13184 inversePermutation(CurrentOrder, Mask);
13185 reorderScalars(Operands.front(), Mask);
13186 }
13187 TE->setOperands(Operands);
13188 if (State == TreeEntry::ScatterVectorize)
13189 buildTreeRec(PointerOps, Depth + 1, {TE, 0});
13190 return;
13191 }
13192 case Instruction::ZExt:
13193 case Instruction::SExt:
13194 case Instruction::FPToUI:
13195 case Instruction::FPToSI:
13196 case Instruction::FPExt:
13197 case Instruction::PtrToInt:
13198 case Instruction::IntToPtr:
13199 case Instruction::SIToFP:
13200 case Instruction::UIToFP:
13201 case Instruction::Trunc:
13202 case Instruction::FPTrunc:
13203 case Instruction::BitCast: {
13204 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
13205 std::make_pair(std::numeric_limits<unsigned>::min(),
13206 std::numeric_limits<unsigned>::max()));
13207 if (ShuffleOrOp == Instruction::ZExt ||
13208 ShuffleOrOp == Instruction::SExt) {
13209 CastMaxMinBWSizes = std::make_pair(
13210 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
13211 PrevMaxBW),
13212 std::min<unsigned>(
13213 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
13214 PrevMinBW));
13215 } else if (ShuffleOrOp == Instruction::Trunc) {
13216 CastMaxMinBWSizes = std::make_pair(
13217 std::max<unsigned>(
13218 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
13219 PrevMaxBW),
13220 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
13221 PrevMinBW));
13222 }
13223 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13224 ReuseShuffleIndices);
13225 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
13226 TE->dump());
13227
13228 TE->setOperands(Operands);
13229 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
13230 buildTreeRec(TE->getOperand(I), Depth, {TE, I});
13231 if (ShuffleOrOp == Instruction::Trunc) {
13232 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
13233 } else if (ShuffleOrOp == Instruction::SIToFP ||
13234 ShuffleOrOp == Instruction::UIToFP) {
13235 unsigned NumSignBits =
13236 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
13237 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
13238 APInt Mask = DB->getDemandedBits(OpI);
13239 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
13240 }
13241 if (NumSignBits * 2 >=
13242 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
13243 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
13244 }
13245 return;
13246 }
13247 case Instruction::ICmp:
13248 case Instruction::FCmp: {
13249 // Check that all of the compares have the same predicate.
13250 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
13251 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13252 ReuseShuffleIndices);
13253 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
13254 TE->dump());
13255
13256 VLOperands Ops(VL, Operands, S, *this);
13257 if (cast<CmpInst>(VL0)->isCommutative()) {
13258 // Commutative predicate - collect + sort operands of the instructions
13259 // so that each side is more likely to have the same opcode.
13261 "Commutative Predicate mismatch");
13262 Ops.reorder();
13263 Operands.front() = Ops.getVL(0);
13264 Operands.back() = Ops.getVL(1);
13265 } else {
13266 // Collect operands - commute if it uses the swapped predicate.
13267 for (auto [Idx, V] : enumerate(VL)) {
13268 if (isa<PoisonValue>(V))
13269 continue;
13270 auto *Cmp = cast<CmpInst>(V);
13271 if (Cmp->getPredicate() != P0)
13272 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
13273 }
13274 }
13275 TE->setOperands(Operands);
13276 buildTreeRec(Operands.front(), Depth, {TE, 0});
13277 buildTreeRec(Operands.back(), Depth, {TE, 1});
13278 if (ShuffleOrOp == Instruction::ICmp) {
13279 unsigned NumSignBits0 =
13280 ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
13281 if (NumSignBits0 * 2 >=
13282 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
13283 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
13284 unsigned NumSignBits1 =
13285 ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
13286 if (NumSignBits1 * 2 >=
13287 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
13288 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
13289 }
13290 return;
13291 }
13292 case Instruction::Select:
13293 case Instruction::FNeg:
13294 case Instruction::Add:
13295 case Instruction::FAdd:
13296 case Instruction::Sub:
13297 case Instruction::FSub:
13298 case Instruction::Mul:
13299 case Instruction::FMul:
13300 case Instruction::UDiv:
13301 case Instruction::SDiv:
13302 case Instruction::FDiv:
13303 case Instruction::URem:
13304 case Instruction::SRem:
13305 case Instruction::FRem:
13306 case Instruction::Shl:
13307 case Instruction::LShr:
13308 case Instruction::AShr:
13309 case Instruction::And:
13310 case Instruction::Or:
13311 case Instruction::Xor:
13312 case Instruction::Freeze: {
13313 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13314 ReuseShuffleIndices);
13315 LLVM_DEBUG(
13316 dbgs() << "SLP: added a new TreeEntry "
13317 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
13318 TE->dump());
13319
13320 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
13321 VLOperands Ops(VL, Operands, S, *this);
13322 Ops.reorder();
13323 Operands[0] = Ops.getVL(0);
13324 Operands[1] = Ops.getVL(1);
13325 }
13326 TE->setOperands(Operands);
13327 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
13328 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
13329 return;
13330 }
13331 case Instruction::GetElementPtr: {
13332 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13333 ReuseShuffleIndices);
13334 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
13335 TE->dump());
13336 TE->setOperands(Operands);
13337
13338 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
13339 buildTreeRec(Operands[I], Depth + 1, {TE, I});
13340 return;
13341 }
13342 case Instruction::Store: {
13343 assert(CurrentOrder.empty() &&
13344 "Expected ordered store during tree building");
13345 if (State == TreeEntry::StridedVectorize) {
13346 TreeEntry *TE =
13347 newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
13348 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
13349 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
13350 LLVM_DEBUG(
13351 dbgs() << "SLP: added a new TreeEntry (strided StoreInst).\n";
13352 TE->dump());
13353 TE->setOperands(Operands);
13354 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
13355 return;
13356 }
13357 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13358 ReuseShuffleIndices, CurrentOrder);
13359 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
13360 TE->dump());
13361 TE->setOperands(Operands);
13362 buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
13363 return;
13364 }
13365 case Instruction::Call: {
13366 // Check if the calls are all to the same vectorizable intrinsic or
13367 // library function.
13368 CallInst *CI = cast<CallInst>(VL0);
13370
13371 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13372 ReuseShuffleIndices);
13373 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
13374 TE->dump());
13375 if (isCommutative(VL0)) {
13376 VLOperands Ops(VL, Operands, S, *this);
13377 Ops.reorder();
13378 Operands[0] = Ops.getVL(0);
13379 Operands[1] = Ops.getVL(1);
13380 }
13381 TE->setOperands(Operands);
13382 for (unsigned I : seq<unsigned>(CI->arg_size())) {
13383 // For scalar operands no need to create an entry since no need to
13384 // vectorize it.
13386 continue;
13387 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
13388 }
13389 return;
13390 }
13391 case Instruction::ShuffleVector: {
13392 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
13393 ReuseShuffleIndices);
13394 if (S.isAltShuffle()) {
13395 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
13396 TE->dump());
13397 } else {
13398 assert(SLPReVec && "Only supported by REVEC.");
13399 LLVM_DEBUG(
13400 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
13401 TE->dump());
13402 }
13403
13404 // Reorder operands if reordering would enable vectorization.
13405 auto *CI = dyn_cast<CmpInst>(VL0);
13406 if (CI && any_of(VL, [](Value *V) {
13407 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
13408 })) {
13409 auto *MainCI = cast<CmpInst>(S.getMainOp());
13410 auto *AltCI = cast<CmpInst>(S.getAltOp());
13411 CmpInst::Predicate MainP = MainCI->getPredicate();
13412 CmpInst::Predicate AltP = AltCI->getPredicate();
13413 assert(MainP != AltP &&
13414 "Expected different main/alternate predicates.");
13415 // Collect operands - commute if it uses the swapped predicate or
13416 // alternate operation.
13417 for (auto [Idx, V] : enumerate(VL)) {
13418 if (isa<PoisonValue>(V))
13419 continue;
13420 auto *Cmp = cast<CmpInst>(V);
13421
13422 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
13423 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
13424 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
13425 } else {
13426 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
13427 std::swap(Operands.front()[Idx], Operands.back()[Idx]);
13428 }
13429 }
13430 TE->setOperands(Operands);
13431 buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
13432 buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
13433 return;
13434 }
13435
13436 if (isa<BinaryOperator>(VL0) || CI) {
13437 VLOperands Ops(VL, Operands, S, *this);
13438 Ops.reorder();
13439 Operands[0] = Ops.getVL(0);
13440 Operands[1] = Ops.getVL(1);
13441 }
13442 TE->setOperands(Operands);
13443 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
13444 buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
13445 return;
13446 }
13447 default:
13448 break;
13449 }
13450 llvm_unreachable("Unexpected vectorization of the instructions.");
13451}
13452
/// Computes the number of scalar elements that the aggregate or vector type
/// \p T flattens to, or 0 when \p T cannot be mapped onto a vector.
///
/// Nested struct/array/fixed-vector layers are peeled one at a time; the
/// element counts of the layers are multiplied into N and EltTy is replaced by
/// the element type of the layer.
///
/// \returns 0 when an empty type is met, when a struct has elements of
/// different types, when the innermost element type is rejected by
/// isValidElementType(), or when the store size of the widened type lies
/// outside [MinVecRegSize, MaxVecRegSize] or differs from the store size of
/// \p T. Otherwise returns the accumulated element count N.
13453unsigned BoUpSLP::canMapToVector(Type *T) const {
13454 unsigned N = 1;
13455 Type *EltTy = T;
13456
 // NOTE(review): the statement that opens the unwrapping loop (source line
 // 13457) is absent from this listing; the closing brace at 13475 belongs to
 // it. Restore it from the original file before building.
13458 if (EltTy->isEmptyTy())
13459 return 0;
13460 if (auto *ST = dyn_cast<StructType>(EltTy)) {
13461 // Check that struct is homogeneous.
13462 for (const auto *Ty : ST->elements())
13463 if (Ty != *ST->element_begin())
13464 return 0;
13465 N *= ST->getNumElements();
13466 EltTy = *ST->element_begin();
13467 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
13468 N *= AT->getNumElements();
13469 EltTy = AT->getElementType();
13470 } else {
 // Neither a struct nor an array: cast<> asserts that this layer is a
 // fixed vector.
13471 auto *VT = cast<FixedVectorType>(EltTy);
13472 N *= VT->getNumElements();
13473 EltTy = VT->getElementType();
13474 }
13475 }
13476
13477 if (!isValidElementType(EltTy))
13478 return 0;
 // The widened type must fit the configured register-size window and must
 // occupy exactly as many stored bits as the original type T.
13479 size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
13480 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
13481 VTSize != DL->getTypeStoreSizeInBits(T))
13482 return 0;
13483 return N;
13484}
13485
/// Checks whether the extract instructions in \p VL all read from one source
/// vector/aggregate so that the source can be reused instead of gathering.
///
/// \param VL bundle of values; non-instruction entries are skipped.
/// \param CurrentOrder cleared on entry. When the function returns false after
/// the ordering pass at the end, it holds, for every rebased extract index,
/// the position in \p VL that supplies it (the value E == VL.size() marks a
/// slot no extract maps to). On every other return path it is left empty.
/// \param ResizeAllowed when false, the source must have exactly VL.size()
/// elements.
/// \returns true only when the extracts already appear in lane order.
13486bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
13487 SmallVectorImpl<unsigned> &CurrentOrder,
13488 bool ResizeAllowed) const {
 // NOTE(review): the definition of `It` (source line 13489) and the condition
 // of the second assert (source line 13493) are absent from this listing;
 // restore them from the original file.
13490 assert(It != VL.end() && "Expected at least one extract instruction.");
13491 auto *E0 = cast<Instruction>(*It);
13492 assert(
13494 "Invalid opcode");
13495 // Check if all of the extracts come from the same vector and from the
13496 // correct offset.
13497 Value *Vec = E0->getOperand(0);
13498
13499 CurrentOrder.clear();
13500
13501 // We have to extract from a vector/aggregate with the same number of elements.
13502 unsigned NElts;
13503 if (E0->getOpcode() == Instruction::ExtractValue) {
13504 NElts = canMapToVector(Vec->getType());
13505 if (!NElts)
13506 return false;
13507 // Check if load can be rewritten as load of vector.
13508 LoadInst *LI = dyn_cast<LoadInst>(Vec);
13509 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
13510 return false;
13511 } else {
13512 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
13513 }
13514
13515 unsigned E = VL.size();
13516 if (!ResizeAllowed && NElts != E)
13517 return false;
 // Indices[I] is the extract index used by VL[I]; PoisonMaskElem marks lanes
 // that are skipped below. MinIdx/MaxIdx track the range of recorded indices.
13518 SmallVector<int> Indices(E, PoisonMaskElem);
13519 unsigned MinIdx = NElts, MaxIdx = 0;
13520 for (auto [I, V] : enumerate(VL)) {
13521 auto *Inst = dyn_cast<Instruction>(V);
13522 if (!Inst)
13523 continue;
13524 if (Inst->getOperand(0) != Vec)
13525 return false;
13526 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
13527 if (isa<UndefValue>(EE->getIndexOperand()))
13528 continue;
13529 std::optional<unsigned> Idx = getExtractIndex(Inst);
13530 if (!Idx)
13531 return false;
13532 const unsigned ExtIdx = *Idx;
 // Indices past the end of the source are left as PoisonMaskElem.
13533 if (ExtIdx >= NElts)
13534 continue;
13535 Indices[I] = ExtIdx;
13536 if (MinIdx > ExtIdx)
13537 MinIdx = ExtIdx;
13538 if (MaxIdx < ExtIdx)
13539 MaxIdx = ExtIdx;
13540 }
 // The recorded indices must span at most E consecutive positions.
13541 if (MaxIdx - MinIdx + 1 > E)
13542 return false;
 // If every recorded index already fits into the first E positions, do not
 // rebase the indices.
13543 if (MaxIdx + 1 <= E)
13544 MinIdx = 0;
13545
13546 // Check that all of the indices extract from the correct offset.
13547 bool ShouldKeepOrder = true;
13548 // Assign to all items the initial value E so we can check if the extract
13549 // instruction index was used already.
13550 // Also, later we can check that all the indices are used and we have a
13551 // consecutive access in the extract instructions, by checking that no
13552 // element of CurrentOrder still has value E.
13553 CurrentOrder.assign(E, E);
13554 for (unsigned I = 0; I < E; ++I) {
13555 if (Indices[I] == PoisonMaskElem)
13556 continue;
13557 const unsigned ExtIdx = Indices[I] - MinIdx;
 // The same (rebased) index used twice: not a permutation, give up.
13558 if (CurrentOrder[ExtIdx] != E) {
13559 CurrentOrder.clear();
13560 return false;
13561 }
13562 ShouldKeepOrder &= ExtIdx == I;
13563 CurrentOrder[ExtIdx] = I;
13564 }
 // Identity order needs no reordering information.
13565 if (ShouldKeepOrder)
13566 CurrentOrder.clear();
13567
13568 return ShouldKeepOrder;
13569}
13570
13571bool BoUpSLP::areAllUsersVectorized(
13572 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
13573 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
13574 all_of(I->users(), [this](User *U) {
13575 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
13576 (isa<ExtractElementInst>(U) && MustGather.contains(U));
13577 });
13578}
13579
// NOTE(review): this is the tail of a forward declaration whose first line
// (source line 13580: return type, function name and leading parameters) is
// absent from this listing. The trailing parameters match the call
// canConvertToFMA(I, InstructionsState(I, I), *DT, *DL, *TTI, *TLI) made in
// BoUpSLP::getNumScalarInsts() -- presumably this declares that function;
// confirm against the original file.
13581 const InstructionsState &S,
13582 DominatorTree &DT, const DataLayout &DL,
13583 TargetTransformInfo &TTI,
13584 const TargetLibraryInfo &TLI);
13585
/// Returns a weighted count of the scalar instructions represented by the
/// current vectorizable tree.
///
/// Deleted nodes and CombinedVectorize nodes contribute nothing. For the
/// remaining non-gather nodes every non-copyable instruction scalar counts
/// once, integer/FP div/rem instructions count one extra time, and calls whose
/// BaseID is not trivially vectorizable count one extra time. For nodes that
/// are not combined, one is subtracted per select or fadd/fsub scalar that
/// passes the fusion checks below.
///
/// NOTE(review): source lines 13597, 13622 and 13642 are absent from this
/// listing (the filter on gathered scalars, the definition of BaseID and the
/// condition guarding the min/max decrement); restore them from the original
/// file.
13586unsigned BoUpSLP::getNumScalarInsts() const {
13587 unsigned Count = 0;
13588 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
13589 const TreeEntry &TE = *Ptr;
13590 if (DeletedNodes.contains(&TE))
13591 continue;
13592 if (TE.isGather() || TransformedToGatherNodes.contains(&TE)) {
13593 // Count extractelement scalars in gathers - they exist in the scalar
13594 // code regardless of vectorization. ExtractElement instructions
13595 // become free when the vector input is used directly.
13596 for (Value *V : TE.Scalars)
13598 ++Count;
13599 continue;
13600 }
13601 // CombinedVectorize entries (e.g. the fmul child of an FMulAdd, or the
13602 // cmp child of a MinMax select) are absorbed into the parent on both
13603 // scalar and vector sides. The backend fuses fadd+fmul → fma and
13604 // select+cmp → smin/smax even for scalar code, so skip to avoid
13605 // double-counting.
13606 if (TE.State == TreeEntry::CombinedVectorize)
13607 continue;
13608 // Each vectorize entry represents a bundle of scalar instructions.
13609 // Count per-entry without cross-entry deduplication, since shared
13610 // scalars across entries still represent separate work in scalar code.
13611 for (Value *V : TE.Scalars) {
13612 if (!isa<Instruction>(V) ||
13613 (TE.hasCopyableElements() && TE.isCopyableElement(V)))
13614 continue;
13615 ++Count;
13616 // Count divs/rems and not trivially vectorizable calls twice: they may
13617 // cost more, so the extra weight roughly mimics their real cost here.
13618 auto *I = dyn_cast<Instruction>(V);
13619 if (I && (I->isIntDivRem() || I->isFPDivRem()))
13620 ++Count;
13621 if (auto *CI = dyn_cast<CallInst>(V)) {
13623 if (!isTriviallyVectorizable(BaseID))
13624 ++Count;
13625 }
13626 }
13627 // Even when the whole node is not combined, individual scalar
13628 // instructions may be fused by the backend. Each fused pair (e.g.
13629 // fadd+fmul → fma, select+cmp → smin/smax) becomes a single scalar
13630 // instruction, absorbing the operand instruction. Subtract 1 for each
13631 // such match to avoid over-counting the scalar side.
13632 if (TE.CombinedOp == TreeEntry::NotCombinedOp && TE.hasState()) {
13633 unsigned Opcode = TE.getOpcode();
13634 if (Opcode == Instruction::Select) {
13635 for (Value *V : TE.Scalars) {
13636 if (TE.hasCopyableElements() && TE.isCopyableElement(V))
13637 continue;
13638 auto *SI = dyn_cast<SelectInst>(V);
13639 if (!SI)
13640 continue;
13641 auto [ID, _] = canConvertToMinOrMaxIntrinsic({V});
13643 assert(Count > 0 && "Underflow in scalar inst count (minmax)");
13644 --Count;
13645 }
13646 }
13647 } else if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
13648 for (Value *V : TE.Scalars) {
13649 if (TE.hasCopyableElements() && TE.isCopyableElement(V))
13650 continue;
13651 auto *I = dyn_cast<Instruction>(V);
 // In an alternate-opcode node only the fadd/fsub lanes are candidates.
13652 if (!I || (TE.isAltShuffle() && I->getOpcode() != Instruction::FAdd &&
13653 I->getOpcode() != Instruction::FSub))
13654 continue;
13655 if (canConvertToFMA(I, InstructionsState(I, I), *DT, *DL, *TTI, *TLI)
13656 .isValid()) {
13657 assert(Count > 0 && "Underflow in scalar inst count (fma)");
13658 --Count;
13659 }
13660 }
13661 }
13662 }
13663 }
13664 return Count;
13665}
13666
/// Returns an estimate of the number of vector instructions that the current
/// vectorizable tree stands for.
///
/// Deleted and CombinedVectorize nodes are skipped. A gather node adds nothing
/// when another tree entry of the same vector factor holds the same values,
/// adds one when an entry holds the values in reverse order, contributes its
/// distinct extractelement source vectors (counted once at the end), or adds
/// one per non-constant scalar. A vectorized node adds one (two for
/// SplitVectorize) plus one more when it carries reorder or reuse-shuffle
/// indices; InsertElement/ExtractElement nodes add nothing. Finally one is
/// added per distinct external-use scalar that survives the filters below.
///
/// NOTE(review): source lines 13696 and 13726 are absent from this listing
/// (the predicate of the all_of() call and the condition guarding the first
/// `continue` in the ExternalUses loop); restore them from the original file.
13667unsigned BoUpSLP::getNumVectorInsts() const {
13668 unsigned Count = 0;
13669 SmallPtrSet<Value *, 4> GatherExtractSourceVecs;
13670 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
13671 const TreeEntry &TE = *Ptr;
13672 if (DeletedNodes.contains(&TE))
13673 continue;
13674 if (TE.State == TreeEntry::CombinedVectorize)
13675 continue;
13676 bool IsGatherOrTransformed =
13677 TE.isGather() || TransformedToGatherNodes.contains(&TE);
13678 if (IsGatherOrTransformed) {
13679 if (TE.hasState()) {
 // Another entry with the same vector factor already holds these values:
 // nothing extra is counted.
13680 if (const TreeEntry *E =
13681 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
13682 E && E != &TE && E->getVectorFactor() == TE.getVectorFactor())
13683 continue;
 // The values exist in an entry in reverse order: count one instruction.
13684 SmallVector<Value *> RevScalars(TE.Scalars.rbegin(), TE.Scalars.rend());
13685 if (const TreeEntry *E =
13686 getSameValuesTreeEntry(TE.getMainOp(), RevScalars);
13687 E && E->getVectorFactor() == TE.getVectorFactor()) {
13688 ++Count;
13689 continue;
13690 }
13691 }
13692 // ExtractElement gathers from the same source vector become a single
13693 // shufflevector. Collect source vectors globally across all gather
13694 // entries and count once at the end.
13695 if (all_of(TE.Scalars,
13697 for (Value *V : TE.Scalars)
13698 if (auto *EE = dyn_cast<ExtractElementInst>(V))
13699 GatherExtractSourceVecs.insert(EE->getVectorOperand());
13700 } else {
 // Otherwise count one per non-constant scalar.
13701 for (Value *V : TE.Scalars) {
13702 if (!isConstant(V))
13703 ++Count;
13704 }
13705 }
13706 continue;
13707 }
13708 // InsertElement/ExtractElement vectorize entries don't produce real
13709 // vector instructions - InsertElement at root IS the result, and
13710 // ExtractElement entries reference the input vector directly.
13711 if (TE.getOpcode() == Instruction::InsertElement ||
13712 TE.getOpcode() == Instruction::ExtractElement)
13713 continue;
13714 if (TE.State == TreeEntry::SplitVectorize)
13715 Count += 2;
13716 else
13717 ++Count;
 // A node with reorder or reuse-shuffle indices counts one more.
13718 if (!TE.ReorderIndices.empty() || !TE.ReuseShuffleIndices.empty())
13719 ++Count;
13720 }
13721 Count += GatherExtractSourceVecs.size();
13722 // Count extract instructions from ExternalUses, skipping insertelements
13723 // (those get folded into shuffles, not real extracts).
13724 SmallPtrSet<Value *, 8> CountedExtracts;
13725 for (const ExternalUser &EU : ExternalUses) {
13727 continue;
13728 if (EU.User && EphValues.count(EU.User))
13729 continue;
13730 if (ExternalUsesAsOriginalScalar.contains(EU.Scalar))
13731 continue;
 // Each externally used scalar is counted once, however many users it has.
13732 if (!CountedExtracts.insert(EU.Scalar).second)
13733 continue;
13734 ++Count;
13735 }
13736 return Count;
13737}
13738
13739void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
13740 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
13741 SmallVectorImpl<Value *> *OpScalars,
13742 SmallVectorImpl<Value *> *AltScalars) const {
13743 unsigned Sz = Scalars.size();
13744 Mask.assign(Sz, PoisonMaskElem);
13745 SmallVector<int> OrderMask;
13746 if (!ReorderIndices.empty())
13747 inversePermutation(ReorderIndices, OrderMask);
13748 for (unsigned I = 0; I < Sz; ++I) {
13749 unsigned Idx = I;
13750 if (!ReorderIndices.empty())
13751 Idx = OrderMask[I];
13752 if (isa<PoisonValue>(Scalars[Idx]))
13753 continue;
13754 auto *OpInst = cast<Instruction>(Scalars[Idx]);
13755 if (IsAltOp(OpInst)) {
13756 Mask[I] = Sz + Idx;
13757 if (AltScalars)
13758 AltScalars->push_back(OpInst);
13759 } else {
13760 Mask[I] = Idx;
13761 if (OpScalars)
13762 OpScalars->push_back(OpInst);
13763 }
13764 }
13765 if (!ReuseShuffleIndices.empty()) {
13766 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
13767 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
13768 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
13769 });
13770 Mask.swap(NewMask);
13771 }
13772}
13773
// NOTE(review): the first line of this definition (source line 13774: return
// type, function name and leading parameters, which must declare `I` and
// `MainOp`) is absent from this listing; restore it from the original file.
13775 Instruction *AltOp,
13776 const TargetLibraryInfo &TLI) {
 // True when the InstructionsState built from (MainOp, AltOp) maps I to
 // MainOp. TLI is not used by the visible body.
13777 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
13778}
13779
// NOTE(review): the first line of this definition (source line 13780: return
// type, function name and leading parameters, which must declare `I` and
// `MainOp`) and the definition of SwappedP (source line 13794) are absent
// from this listing. Presumably this is isAlternateInstruction(), which is
// called elsewhere in the file with (Cmp, MainCI, AltCI, *TLI) -- confirm
// against the original file.
13781 Instruction *AltOp,
13782 const TargetLibraryInfo &TLI) {
 // Compare instructions are classified by predicate rather than by opcode.
13783 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
13784 auto *AltCI = cast<CmpInst>(AltOp);
13785 CmpInst::Predicate MainP = MainCI->getPredicate();
 // AltP is only read by the asserts, hence [[maybe_unused]].
13786 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
13787 assert(MainP != AltP && "Expected different main/alternate predicates.");
13788 auto *CI = cast<CmpInst>(I);
 // The main compare is checked first, so it wins if both would match.
13789 if (isCmpSameOrSwapped(MainCI, CI, TLI))
13790 return false;
13791 if (isCmpSameOrSwapped(AltCI, CI, TLI))
13792 return true;
13793 CmpInst::Predicate P = CI->getPredicate();
13795
13796 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
13797 "CmpInst expected to match either main or alternate predicate or "
13798 "their swap.");
 // Alternate iff the predicate matches neither MainP nor (per the assert
 // text) its swapped form.
13799 return MainP != P && MainP != SwappedP;
13800 }
 // Non-compare case: alternate iff the InstructionsState maps I to AltOp.
13801 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
13802}
13803
/// Classifies the operand bundle \p Ops and returns the result as a
/// TTI::OperandValueInfo {VK, VP}.
///
/// Four properties are computed over all of \p Ops: all are constants (and not
/// undef), all are the same value as the first operand, all are ConstantInt
/// powers of two, all are ConstantInt negated powers of two. \p Ops must not
/// be empty.
///
/// NOTE(review): source lines 13829, 13831, 13833, 13835 and 13837 are absent
/// from this listing (the declaration of VK, the three assignments selected by
/// the if/else-if chain, and the declaration of VP); restore them from the
/// original file.
13804TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) const {
13805 assert(!Ops.empty());
13806 const auto *Op0 = Ops.front();
13807
13808 const bool IsConstant = all_of(Ops, [](Value *V) {
13809 // TODO: We should allow undef elements here
13810 return isConstant(V) && !isa<UndefValue>(V);
13811 });
 // Uniform means every operand is the very same Value as the first one.
13812 const bool IsUniform = all_of(Ops, [=](Value *V) {
13813 // TODO: We should allow undef elements here
13814 return V == Op0;
13815 });
13816 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
13817 // TODO: We should allow undef elements here
13818 if (auto *CI = dyn_cast<ConstantInt>(V))
13819 return CI->getValue().isPowerOf2();
13820 return false;
13821 });
13822 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
13823 // TODO: We should allow undef elements here
13824 if (auto *CI = dyn_cast<ConstantInt>(V))
13825 return CI->getValue().isNegatedPowerOf2();
13826 return false;
13827 });
13828
13830 if (IsConstant && IsUniform)
13832 else if (IsConstant)
13834 else if (IsUniform)
13836
 // The negated-power-of-two check is applied last, so it takes precedence if
 // both flags are set.
13838 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
13839 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
13840
13841 return {VK, VP};
13842}
13843
13844namespace {
13845/// The base class for shuffle instruction emission and shuffle cost estimation.
///
/// Apart from getVF(), all helpers are static and work purely on values and
/// masks; the only state is the scalar type.
///
/// NOTE(review): this listing skips several source lines inside the class (the
/// embedded numbering jumps at 13892->13894, 13966->13968, 14021->14023,
/// 14036->14038, 14052->14054, 14085->14087, 14158->14160 and 14171->14173).
/// The expressions around those gaps are incomplete as shown and must be
/// restored from the original file.
13846class BaseShuffleAnalysis {
13847protected:
 /// Scalar type of the shuffled elements; getVF() divides the vector length
 /// by the number of elements of this type.
13848 Type *ScalarTy = nullptr;
13849
13850 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
13851
13852 /// V is expected to be a vectorized value.
13853 /// When REVEC is disabled, there is no difference between VF and
13854 /// VNumElements.
13855 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
13856 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
13857 /// of 8.
13858 unsigned getVF(Value *V) const {
13859 assert(V && "V cannot be nullptr");
13860 assert(isa<FixedVectorType>(V->getType()) &&
13861 "V does not have FixedVectorType");
13862 assert(ScalarTy && "ScalarTy cannot be nullptr");
13863 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13864 unsigned VNumElements =
13865 cast<FixedVectorType>(V->getType())->getNumElements();
13866 assert(VNumElements > ScalarTyNumElements &&
13867 "the number of elements of V is not large enough");
13868 assert(VNumElements % ScalarTyNumElements == 0 &&
13869 "the number of elements of V is not a vectorized value");
13870 return VNumElements / ScalarTyNumElements;
13871 }
13872
13873 /// Checks if the mask is an identity mask.
13874 /// \param IsStrict if is true the function returns false if mask size does
13875 /// not match vector size.
13876 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
13877 bool IsStrict) {
13878 int Limit = Mask.size();
13879 int VF = VecTy->getNumElements();
13880 int Index = -1;
13881 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
13882 return true;
13883 if (!IsStrict) {
13884 // Consider extract subvector starting from index 0.
13885 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
13886 Index == 0)
13887 return true;
13888 // All VF-size submasks are identity (e.g.
13889 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
13890 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
13891 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
13892 return all_of(Slice, equal_to(PoisonMaskElem)) ||
13894 }))
13895 return true;
13896 }
13897 return false;
13898 }
13899
13900 /// Tries to combine 2 different masks into single one.
13901 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
13902 /// change the size of the vector, \p LocalVF is the original size of the
13903 /// shuffled vector.
 /// \param Mask the inner mask; on return it holds the combined mask, which
 /// has as many elements as \p ExtMask.
 /// \param ExtMask the outer mask applied on top of \p Mask.
13904 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
13905 ArrayRef<int> ExtMask) {
13906 unsigned VF = Mask.size();
13907 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
13908 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
13909 if (ExtMask[I] == PoisonMaskElem)
13910 continue;
 // Look the outer index up in the inner mask (wrapped to the inner mask
 // size), then wrap the result to the length of the original input.
13911 int MaskedIdx = Mask[ExtMask[I] % VF];
13912 NewMask[I] =
13913 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
13914 }
13915 Mask.swap(NewMask);
13916 }
13917
13918 /// Looks through shuffles trying to reduce final number of shuffles in the
13919 /// code. The function looks through the previously emitted shuffle
13920 /// instructions and properly marks indices in mask as undef.
13921 /// For example, given the code
13922 /// \code
13923 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
13924 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
13925 /// \endcode
13926 /// and if we need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
13927 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
13928 /// <0, 1, 2, 3> for the shuffle.
13929 /// If 2 operands are of different size, the smallest one will be resized and
13930 /// the mask recalculated properly.
13931 /// For example, given the code
13932 /// \code
13933 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
13934 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
13935 /// \endcode
13936 /// and if we need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
13937 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
13938 /// <0, 1, 2, 3> for the shuffle.
13939 /// So, it tries to transform permutations to simple vector merge, if
13940 /// possible.
13941 /// \param V The input vector which must be shuffled using the given \p Mask.
13942 /// If the better candidate is found, \p V is set to this best candidate
13943 /// vector.
13944 /// \param Mask The input mask for the shuffle. If the best candidate is found
13945 /// during looking-through-shuffles attempt, it is updated accordingly.
13946 /// \param SinglePermute true if the shuffle operation is originally a
13947 /// single-value-permutation. In this case the look-through-shuffles procedure
13948 /// may look for resizing shuffles as the best candidates.
13949 /// \return true if the shuffle results in the non-resizing identity shuffle
13950 /// (and thus can be ignored), false - otherwise.
13951 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
13952 bool SinglePermute) {
13953 Value *Op = V;
13954 ShuffleVectorInst *IdentityOp = nullptr;
13955 SmallVector<int> IdentityMask;
13956 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
13957 // Exit if not a fixed vector type or changing size shuffle.
13958 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
13959 if (!SVTy)
13960 break;
13961 // Remember the identity or broadcast mask, if it is not a resizing
13962 // shuffle. If no better candidates are found, this Op and Mask will be
13963 // used in the final shuffle.
13964 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
13965 if (!IdentityOp || !SinglePermute ||
13966 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
13968 IdentityMask.size()))) {
13969 IdentityOp = SV;
13970 // Store current mask in the IdentityMask so that later we do not lose
13971 // this info if IdentityOp is selected as the best candidate for the
13972 // permutation.
13973 IdentityMask.assign(Mask);
13974 }
13975 }
13976 // Remember the broadcast mask. If no better candidates are found, this Op
13977 // and Mask will be used in the final shuffle.
13978 // Zero splat can be used as identity too, since it might be used with
13979 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
13980 // E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is
13981 // expensive, and the analysis finds out that the source vector is just a
13982 // broadcast, this original mask can be transformed to identity mask <0,
13983 // 1, 2, 3>.
13984 // \code
13985 // %0 = shuffle %v, poison, zeroinitializer
13986 // %res = shuffle %0, poison, <3, 1, 2, 0>
13987 // \endcode
13988 // may be transformed to
13989 // \code
13990 // %0 = shuffle %v, poison, zeroinitializer
13991 // %res = shuffle %0, poison, <0, 1, 2, 3>
13992 // \endcode
13993 if (SV->isZeroEltSplat()) {
13994 IdentityOp = SV;
13995 IdentityMask.assign(Mask);
13996 }
 // LocalVF is the length of the shuffle's first operand when that is a
 // fixed vector; otherwise the current mask size is used.
13997 int LocalVF = Mask.size();
13998 if (auto *SVOpTy =
13999 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
14000 LocalVF = SVOpTy->getNumElements();
 // ExtMask maps each lane of Mask through SV's own mask; lanes that are
 // poison or index past SV's mask stay poison.
14001 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
14002 for (auto [Idx, I] : enumerate(Mask)) {
14003 if (I == PoisonMaskElem ||
14004 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
14005 continue;
14006 ExtMask[Idx] = SV->getMaskValue(I);
14007 }
14008 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
14009 SV->getOperand(0),
14010 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
14011 .all();
14012 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
14013 SV->getOperand(1),
14014 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
14015 .all();
 // Both operands contribute to the used lanes: cannot look through this
 // shuffle, so stop here.
14016 if (!IsOp1Undef && !IsOp2Undef) {
14017 // Update mask and mark undef elems.
14018 for (int &I : Mask) {
14019 if (I == PoisonMaskElem)
14020 continue;
14021 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
14023 I = PoisonMaskElem;
14024 }
14025 break;
14026 }
 // Fold SV's mask into Mask and continue with the operand that is still
 // in use.
14027 SmallVector<int> ShuffleMask(SV->getShuffleMask());
14028 combineMasks(LocalVF, ShuffleMask, Mask);
14029 Mask.swap(ShuffleMask);
14030 if (IsOp2Undef)
14031 Op = SV->getOperand(0);
14032 else
14033 Op = SV->getOperand(1);
14034 }
14035 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
14036 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
14038 if (IdentityOp) {
 // Fall back to the remembered identity/broadcast candidate.
14039 V = IdentityOp;
14040 assert(Mask.size() == IdentityMask.size() &&
14041 "Expected masks of same sizes.");
14042 // Clear known poison elements.
14043 for (auto [I, Idx] : enumerate(Mask))
14044 if (Idx == PoisonMaskElem)
14045 IdentityMask[I] = PoisonMaskElem;
14046 Mask.swap(IdentityMask);
14047 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
14048 return SinglePermute &&
14049 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
14050 /*IsStrict=*/true) ||
14051 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
14052 Shuffle->isZeroEltSplat() &&
14054 all_of(enumerate(Mask), [&](const auto &P) {
14055 return P.value() == PoisonMaskElem ||
14056 Shuffle->getShuffleMask()[P.index()] == 0;
14057 })));
14058 }
14059 V = Op;
14060 return false;
14061 }
14062 V = Op;
14063 return true;
14064 }
14065
14066 /// Smart shuffle instruction emission, walks through shuffles trees and
14067 /// tries to find the best matching vector for the actual shuffle
14068 /// instruction.
 /// \param V1 first input vector; must not be null.
 /// \param V2 optional second input vector (may be null).
 /// \param Mask shuffle mask; when \p ScalarTy has more than one element it
 /// is first expanded with transformScalarShuffleIndiciesToVector().
 /// \param Builder object providing resizeToMatch(), createShuffleVector(),
 /// createIdentity() and createPoison().
 /// \param ScalarTy scalar type of the shuffled elements.
 /// \param Arguments extra arguments forwarded to
 /// Builder.createShuffleVector() in the single-vector case.
14069 template <typename T, typename ShuffleBuilderTy, typename... Args>
14070 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
14071 ShuffleBuilderTy &Builder, Type *ScalarTy,
14072 Args... Arguments) {
14073 assert(V1 && "Expected at least one vector value.");
14074 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
14075 SmallVector<int> NewMask(Mask);
14076 if (ScalarTyNumElements != 1) {
14077 assert(SLPReVec && "FixedVectorType is not expected.");
14078 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
14079 Mask = NewMask;
14080 }
14081 if (V2)
14082 Builder.resizeToMatch(V1, V2);
14083 int VF = Mask.size();
14084 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
14085 VF = FTy->getNumElements();
14087 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
14088 .all()) {
14089 // Peek through shuffles.
14090 Value *Op1 = V1;
14091 Value *Op2 = V2;
14092 int VF =
14093 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
 // Split Mask into one mask per operand: indices below VF select from V1,
 // the rest select from V2 (rebased by VF).
14094 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
14095 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
14096 for (int I = 0, E = Mask.size(); I < E; ++I) {
14097 if (Mask[I] < VF)
14098 CombinedMask1[I] = Mask[I];
14099 else
14100 CombinedMask2[I] = Mask[I] - VF;
14101 }
14102 Value *PrevOp1;
14103 Value *PrevOp2;
 // Iterate until neither operand changes any more.
14104 do {
14105 PrevOp1 = Op1;
14106 PrevOp2 = Op2;
14107 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
14108 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
14109 // Check if we have 2 resizing shuffles - need to peek through operands
14110 // again.
14111 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
14112 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
14113 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
14114 for (auto [Idx, I] : enumerate(CombinedMask1)) {
14115 if (I == PoisonMaskElem)
14116 continue;
14117 ExtMask1[Idx] = SV1->getMaskValue(I);
14118 }
14119 SmallBitVector UseMask1 = buildUseMask(
14120 cast<FixedVectorType>(SV1->getOperand(1)->getType())
14121 ->getNumElements(),
14122 ExtMask1, UseMask::SecondArg);
14123 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
14124 for (auto [Idx, I] : enumerate(CombinedMask2)) {
14125 if (I == PoisonMaskElem)
14126 continue;
14127 ExtMask2[Idx] = SV2->getMaskValue(I);
14128 }
14129 SmallBitVector UseMask2 = buildUseMask(
14130 cast<FixedVectorType>(SV2->getOperand(1)->getType())
14131 ->getNumElements(),
14132 ExtMask2, UseMask::SecondArg);
 // Both shuffles resize same-typed first operands and do not use
 // their second operands: step through to those first operands.
14133 if (SV1->getOperand(0)->getType() ==
14134 SV2->getOperand(0)->getType() &&
14135 SV1->getOperand(0)->getType() != SV1->getType() &&
14136 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
14137 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
14138 Op1 = SV1->getOperand(0);
14139 Op2 = SV2->getOperand(0);
14140 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
14141 int LocalVF = ShuffleMask1.size();
14142 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
14143 LocalVF = FTy->getNumElements();
14144 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
14145 CombinedMask1.swap(ShuffleMask1);
14146 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
14147 LocalVF = ShuffleMask2.size();
14148 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
14149 LocalVF = FTy->getNumElements();
14150 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
14151 CombinedMask2.swap(ShuffleMask2);
14152 }
14153 }
14154 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
14155 Builder.resizeToMatch(Op1, Op2);
14156 VF = std::max(cast<VectorType>(Op1->getType())
14157 ->getElementCount()
14158 .getKnownMinValue(),
14160 ->getElementCount()
14161 .getKnownMinValue());
 // Merge the two per-operand masks back into one two-source mask; lanes
 // taken from Op2 are offset by VF unless both operands are the same value.
14162 for (int I = 0, E = Mask.size(); I < E; ++I) {
14163 if (CombinedMask2[I] != PoisonMaskElem) {
14164 assert(CombinedMask1[I] == PoisonMaskElem &&
14165 "Expected undefined mask element");
14166 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
14167 }
14168 }
14169 if (Op1 == Op2 &&
14170 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
14171 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
14173 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
14174 ArrayRef(CombinedMask1))))
14175 return Builder.createIdentity(Op1);
14176 return Builder.createShuffleVector(
14177 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
14178 CombinedMask1);
14179 }
 // Single-source case.
14180 if (isa<PoisonValue>(V1))
14181 return Builder.createPoison(
14182 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
14183 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
14184 assert(V1 && "Expected non-null value after looking through shuffles.");
14185
14186 if (!IsIdentity)
14187 return Builder.createShuffleVector(V1, NewMask, Arguments...);
14188 return Builder.createIdentity(V1);
14189 }
14190
14191 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
14192 /// shuffle emission.
 /// Every lane that \p Mask defines (is not PoisonMaskElem) is rewritten in
 /// \p CommonMask to its own lane index; other lanes are left untouched.
14193 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
14194 ArrayRef<int> Mask) {
14195 for (unsigned I : seq<unsigned>(CommonMask.size()))
14196 if (Mask[I] != PoisonMaskElem)
14197 CommonMask[I] = I;
14198 }
14199};
14200} // namespace
14201
14202/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
14203static std::pair<InstructionCost, InstructionCost>
14205 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
14206 Type *ScalarTy, VectorType *VecTy) {
14207 InstructionCost ScalarCost = 0;
14208 InstructionCost VecCost = 0;
14209 // Here we differentiate two cases: (1) when Ptrs represent a regular
14210 // vectorization tree node (as they are pointer arguments of scattered
14211 // loads) or (2) when Ptrs are the arguments of loads or stores being
14212 // vectorized as plane wide unit-stride load/store since all the
14213 // loads/stores are known to be from/to adjacent locations.
14214 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
14215 // Case 2: estimate costs for pointer related costs when vectorizing to
14216 // a wide load/store.
14217 // Scalar cost is estimated as a set of pointers with known relationship
14218 // between them.
14219 // For vector code we will use BasePtr as argument for the wide load/store
14220 // but we also need to account all the instructions which are going to
14221 // stay in vectorized code due to uses outside of these scalar
14222 // loads/stores.
14223 ScalarCost = TTI.getPointersChainCost(
14224 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
14225 CostKind);
14226
14227 SmallVector<const Value *> PtrsRetainedInVecCode;
14228 for (Value *V : Ptrs) {
14229 if (V == BasePtr) {
14230 PtrsRetainedInVecCode.push_back(V);
14231 continue;
14232 }
14233 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
14234 // For simplicity, assume Ptr stays in the vectorized code if it is not a
14235 // GEP instruction. We don't care, since its cost is considered free.
14236 // TODO: We should check for any uses outside of vectorizable tree
14237 // rather than just single use.
14238 if (!Ptr || !Ptr->hasOneUse())
14239 PtrsRetainedInVecCode.push_back(V);
14240 }
14241
14242 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
14243 // If all pointers stay in vectorized code then we don't have
14244 // any savings on that.
14245 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
14246 }
14247 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
14248 TTI::PointersChainInfo::getKnownStride(),
14249 VecTy, CostKind);
14250 } else {
14251 // Case 1: Ptrs are the arguments of loads that we are going to transform
14252 // into masked gather load intrinsic.
14253 // All the scalar GEPs will be removed as a result of vectorization.
14254 // For any external uses of some lanes extract element instructions will
14255 // be generated (which cost is estimated separately).
14256 TTI::PointersChainInfo PtrsInfo =
14257 all_of(Ptrs,
14258 [](const Value *V) {
14259 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
14260 return Ptr && !Ptr->hasAllConstantIndices();
14261 })
14262 ? TTI::PointersChainInfo::getUnknownStride()
14263 : TTI::PointersChainInfo::getKnownStride();
14264
14265 ScalarCost =
14266 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
14267 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
14268 if (!BaseGEP) {
14269 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
14270 if (It != Ptrs.end())
14271 BaseGEP = cast<GEPOperator>(*It);
14272 }
14273 if (BaseGEP) {
14274 SmallVector<const Value *> Indices(BaseGEP->indices());
14275 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
14276 BaseGEP->getPointerOperand(), Indices, VecTy,
14277 CostKind);
14278 }
14279 }
14280
14281 return std::make_pair(ScalarCost, VecCost);
14282}
14283
/// Tries to cluster the scalars of the gather node \p TE so that scalars with
/// the same (key, subkey) pair, as produced by generateKeySubkey(), become
/// adjacent. When the scalars are not already clustered, TE.Scalars is
/// permuted in place and the permutation is recorded in TE.ReorderIndices.
/// Afterwards a simple cost estimate compares the reordered form against a
/// plain build vector; if the reordered form is not cheaper, reorderScalars()
/// is applied with the recorded indices and TE.ReorderIndices is cleared.
/// NOTE(review): several lines of this function are missing from this listing
/// (e.g. the declarations of SubVectors, Cost and CostKind); comments below
/// describe only the visible code.
14284void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
14285 assert(TE.isGather() && TE.ReorderIndices.empty() &&
14286 "Expected gather node without reordering.");
14287 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
14288 SmallSet<size_t, 2> LoadKeyUsed;
14289
14290 // Do not reorder the node if it is small (just 2 elements), all-constant,
14291 // or has a non-alternate state (all instructions have the same opcode).
14292 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
14293 all_of(TE.Scalars, isConstant))
14294 return;
14295
// Nothing to do if some node built earlier (index < TE.Idx) reports
// isSame() for these scalars.
14296 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
14297 return VectorizableTree[Idx]->isSame(TE.Scalars);
14298 }))
14299 return;
14300
// Subkey generator for loads: loads in the same block (Key is combined with
// the parent block number) and with the same underlying object are looked
// up in LoadsMap; if a previously seen load is related to LI, the hash of
// that load's pointer operand is returned so both land in the same cluster.
// Otherwise LI is recorded and the hash of its own pointer operand is used.
14301 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
14302 Key = hash_combine(hash_value(LI->getParent()->getNumber()), Key);
14303 Value *Ptr =
14304 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
14305 if (LoadKeyUsed.contains(Key)) {
14306 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
14307 if (LIt != LoadsMap.end()) {
14308 for (LoadInst *RLI : LIt->second) {
14309 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
14310 LI->getType(), LI->getPointerOperand(), *DL, *SE,
14311 /*StrictCheck=*/true))
14312 return hash_value(RLI->getPointerOperand());
14313 }
14314 for (LoadInst *RLI : LIt->second) {
14316 LI->getPointerOperand(), *TLI)) {
14317 hash_code SubKey = hash_value(RLI->getPointerOperand());
14318 return SubKey;
14319 }
14320 }
14321 if (LIt->second.size() > 2) {
14322 hash_code SubKey =
14323 hash_value(LIt->second.back()->getPointerOperand());
14324 return SubKey;
14325 }
14326 }
14327 }
14328 LoadKeyUsed.insert(Key);
14329 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
14330 return hash_value(LI->getPointerOperand());
14331 };
// SortedValues: key -> subkey -> unique scalars, in first-seen order.
// KeyToIndex: scalar -> all positions in TE.Scalars where it occurs.
14332 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
14333 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
14334 bool IsOrdered = true;
14335 unsigned NumInstructions = 0;
14336 // Try to "cluster" scalar instructions, to be able to build extra vectorized
14337 // nodes.
14338 for (auto [I, V] : enumerate(TE.Scalars)) {
14339 size_t Key = 1, Idx = 1;
14340 if (auto *Inst = dyn_cast<Instruction>(V);
14342 !isDeleted(Inst) && !isVectorized(V)) {
14343 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
14344 /*AllowAlternate=*/false);
14345 ++NumInstructions;
14346 }
14347 auto &Container = SortedValues[Key];
14348 if (IsOrdered && !KeyToIndex.contains(V) &&
14351 ((Container.contains(Idx) &&
14352 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
14353 (!Container.empty() && !Container.contains(Idx) &&
14354 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
14355 IsOrdered = false;
14356 auto &KTI = KeyToIndex[V];
// Each distinct scalar is stored once per cluster; repeats only add their
// position to KeyToIndex.
14357 if (KTI.empty())
14358 Container[Idx].push_back(V);
14359 KTI.push_back(I);
14360 }
14362 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
14363 if (!IsOrdered && NumInstructions > 1) {
14364 unsigned Cnt = 0;
14365 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
// Lay the clusters out contiguously; Cnt is the next free output position.
14366 for (const auto &D : SortedValues) {
14367 for (const auto &P : D.second) {
14368 unsigned Sz = 0;
14369 for (Value *V : P.second) {
14370 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
14371 for (auto [K, Idx] : enumerate(Indices)) {
14372 TE.ReorderIndices[Cnt + K] = Idx;
14373 TE.Scalars[Cnt + K] = V;
14374 }
14375 Sz += Indices.size();
14376 Cnt += Indices.size();
14377 }
// Clusters of more than one instruction are recorded as candidate subvectors
// (their lanes are not demanded for scalar insertion); constant clusters are
// not demanded either.
14378 if (Sz > 1 && isa<Instruction>(P.second.front())) {
14379 const unsigned SubVF = getFloorFullVectorNumberOfElements(
14380 *TTI, TE.Scalars.front()->getType(), Sz);
14381 SubVectors.emplace_back(Cnt - Sz, SubVF);
14382 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
14383 DemandedElts.clearBit(I);
14384 } else if (!P.second.empty() && isConstant(P.second.front())) {
14385 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
14386 DemandedElts.clearBit(I);
14387 }
14388 }
14389 }
14390 }
14391 // Reuses always require shuffles, so consider it as profitable.
14392 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
14393 return;
14394 // Do simple cost estimation.
14397 auto *ScalarTy = TE.Scalars.front()->getType();
14398 auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, TE.Scalars.size()));
14399 for (auto [Idx, Sz] : SubVectors) {
14400 Cost +=
14402 Idx, cast<VectorType>(getWidenedType(ScalarTy, Sz)));
14403 }
14404 Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
14405 /*Insert=*/true,
14406 /*Extract=*/false, CostKind);
14407 int Sz = TE.Scalars.size();
14408 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
14409 TE.ReorderIndices.end());
14410 for (unsigned I : seq<unsigned>(Sz)) {
14411 Value *V = TE.getOrdered(I);
14412 if (isa<PoisonValue>(V)) {
14413 ReorderMask[I] = PoisonMaskElem;
14414 } else if (isConstant(V) || DemandedElts[I]) {
14415 ReorderMask[I] = I + TE.ReorderIndices.size();
14416 }
14417 }
14418 Cost += ::getShuffleCost(*TTI,
14419 any_of(ReorderMask, [&](int I) { return I >= Sz; })
14422 VecTy, ReorderMask);
// Cost of the alternative: a plain build vector of the non-constant scalars,
// plus a two-source shuffle when some lanes are constants.
14423 DemandedElts = APInt::getAllOnes(TE.Scalars.size());
14424 ReorderMask.assign(Sz, PoisonMaskElem);
14425 for (unsigned I : seq<unsigned>(Sz)) {
14426 Value *V = TE.getOrdered(I);
14427 if (isConstant(V)) {
14428 DemandedElts.clearBit(I);
14429 if (!isa<PoisonValue>(V))
14430 ReorderMask[I] = I;
14431 } else {
14432 ReorderMask[I] = I + Sz;
14433 }
14434 }
14435 InstructionCost BVCost =
14436 getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
14437 /*Insert=*/true, /*Extract=*/false, CostKind);
14438 if (!DemandedElts.isAllOnes())
14439 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
// Reordering is not cheaper: apply reorderScalars() with the recorded
// indices (presumably restoring the original scalar order - confirm against
// reorderScalars) and drop the reorder indices.
14440 if (Cost >= BVCost) {
14441 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
14442 reorderScalars(TE.Scalars, Mask);
14443 TE.ReorderIndices.clear();
14444 }
14445}
14446
14447/// Check if we can convert fadd/fsub sequence to FMAD.
14448/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
14450 const InstructionsState &S,
14451 DominatorTree &DT, const DataLayout &DL,
14453 const TargetLibraryInfo &TLI) {
14454 assert(all_of(VL,
14455 [](Value *V) {
14456 return V->getType()->getScalarType()->isFloatingPointTy();
14457 }) &&
14458 "Can only convert to FMA for floating point types");
14459 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
14460
14461 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
14462 FastMathFlags FMF;
14463 FMF.set();
14464 for (Value *V : VL) {
14465 auto *I = dyn_cast<Instruction>(V);
14466 if (!I)
14467 continue;
14468 if (S.isCopyableElement(I))
14469 continue;
14470 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
14471 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
14472 continue;
14473 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
14474 FMF &= FPCI->getFastMathFlags();
14475 }
14476 return FMF.allowContract();
14477 };
14478 if (!CheckForContractable(VL))
14480 // fmul also should be contractable
14481 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
14482 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
14483
14484 InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
14485 if (!OpS.valid())
14487
14488 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
14490 if (!CheckForContractable(Operands.front()))
14492 // Compare the costs.
14493 InstructionCost FMulPlusFAddCost = 0;
14494 InstructionCost FMACost = 0;
14496 FastMathFlags FMF;
14497 FMF.set();
14498 for (Value *V : VL) {
14499 auto *I = dyn_cast<Instruction>(V);
14500 if (!I)
14501 continue;
14502 if (!S.isCopyableElement(I))
14503 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
14504 FMF &= FPCI->getFastMathFlags();
14505 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
14506 }
14507 unsigned NumOps = 0;
14508 for (auto [V, Op] : zip(VL, Operands.front())) {
14509 if (S.isCopyableElement(V))
14510 continue;
14511 auto *I = dyn_cast<Instruction>(Op);
14512 if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
14513 if (auto *OpI = dyn_cast<Instruction>(V))
14514 FMACost += TTI.getInstructionCost(OpI, CostKind);
14515 if (I)
14516 FMACost += TTI.getInstructionCost(I, CostKind);
14517 continue;
14518 }
14519 ++NumOps;
14520 if (auto *FPCI = dyn_cast<FPMathOperator>(I))
14521 FMF &= FPCI->getFastMathFlags();
14522 FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
14523 }
14524 Type *Ty = VL.front()->getType();
14525 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
14526 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
14527 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
14528}
14529
/// Checks whether the Shl node \p TE has the shape
///   shl (zext <narrow lanes>), <constant multiples of the narrow bit width>
/// and whether modeling it as a bitcast of the narrow source vector to one
/// wide integer is estimated to be cheaper than the vector zext + shl +
/// or-reduction sequence.
/// \param Order   Cleared on entry to the matching step; on return holds the
///                lane order needed to put the shift amounts in the form
///                (0, Stride, 2 * Stride, ...), or is empty when no reorder is
///                needed or the bswap form was selected.
/// \param IsBSwap Set to true when the reversed order over 8-bit sources is
///                modeled as bitcast + bswap; reset to false on entry.
/// \param ForLoads Set to true when the zext source is a vectorized load node
///                and a single wide scalar load is modeled; reset to false on
///                entry.
/// \returns true if the bitcast-based cost is strictly lower than the vector
/// cost.
/// NOTE(review): CostKind is used but its declaration is not among the
/// visible lines of this listing.
14530bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
14531 bool &IsBSwap, bool &ForLoads) const {
14532 assert(TE.hasState() && TE.getOpcode() == Instruction::Shl &&
14533 "Expected Shl node.");
14534 IsBSwap = false;
14535 ForLoads = false;
// Only plain vectorized nodes without reordering/reuse or bit width
// minimization, whose scalars each have a single use, are considered.
14536 if (TE.State != TreeEntry::Vectorize || !TE.ReorderIndices.empty() ||
14537 !TE.ReuseShuffleIndices.empty() || MinBWs.contains(&TE) ||
14538 any_of(TE.Scalars, [](Value *V) { return !V->hasOneUse(); }))
14539 return false;
14540 Type *ScalarTy = TE.getMainOp()->getType();
14541 // TODO: Check if same can be done for the vector types.
14542 if (!ScalarTy->isIntegerTy())
14543 return false;
// NOTE(review): looks redundant after the isIntegerTy() check above - confirm.
14544 if (ScalarTy->isVectorTy())
14545 return false;
14546 const unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
14547 const TreeEntry *LhsTE = getOperandEntry(&TE, /*Idx=*/0);
14548 const TreeEntry *RhsTE = getOperandEntry(&TE, /*Idx=*/1);
14549 // Lhs should be zext i<stride> to i<sz>.
14550 if (!(LhsTE->State == TreeEntry::Vectorize &&
14551 LhsTE->getOpcode() == Instruction::ZExt &&
14552 LhsTE->ReorderIndices.empty() && LhsTE->ReuseShuffleIndices.empty() &&
14553 !MinBWs.contains(LhsTE) &&
14554 all_of(LhsTE->Scalars, [](Value *V) { return V->hasOneUse(); })))
14555 return false;
14556 Type *SrcScalarTy = cast<ZExtInst>(LhsTE->getMainOp())->getSrcTy();
// Stride is the bit width of the zext source; it must be a power of 2 that
// evenly divides (and is smaller than) the destination width, and the
// number of lanes must be a power of 2 as well.
14557 unsigned Stride = DL->getTypeSizeInBits(SrcScalarTy);
14558 if (!isPowerOf2_64(Stride) || Stride >= Sz || Sz % Stride != 0 ||
14559 !isPowerOf2_64(LhsTE->getVectorFactor()))
14560 return false;
// The shift amounts must come from a plain gather (build vector) node.
14561 if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
14562 RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(RhsTE)))
14563 return false;
14564 Order.clear();
14565 unsigned CurrentValue = 0;
14566 // Rhs should be (0, Stride, 2 * Stride, ..., N-Stride), where N <= Sz.
// Undef lanes are accepted at any position; CurrentValue advances by Stride
// for every lane visited by all_of.
14567 if (all_of(RhsTE->Scalars,
14568 [&](Value *V) {
14569 CurrentValue += Stride;
14570 if (isa<UndefValue>(V))
14571 return true;
14572 auto *C = dyn_cast<Constant>(V);
14573 if (!C)
14574 return false;
14575 return C->getUniqueInteger() == CurrentValue - Stride;
14576 }) &&
14577 CurrentValue <= Sz) {
14578 Order.clear();
14579 } else {
14580 const unsigned VF = RhsTE->getVectorFactor();
// VF is used as the "not set" marker in Order.
14581 Order.assign(VF, VF);
14582 // Track which logical positions we've seen; reject duplicate shift amounts.
14583 SmallBitVector SeenPositions(VF);
14584 // Check if Rhs needs to be reordered to bring it into the form
14585 // (0, Stride, 2 * Stride, ..., N-Stride), where N <= Sz.
14586 if (VF * Stride > Sz)
14587 return false;
14588 for (const auto [Idx, V] : enumerate(RhsTE->Scalars)) {
14589 if (isa<UndefValue>(V))
14590 continue;
14591 auto *C = dyn_cast<Constant>(V);
14592 if (!C)
14593 return false;
14594 const APInt &Val = C->getUniqueInteger();
14595 if (Val.isNegative() || Val.uge(Sz) || Val.getZExtValue() % Stride != 0)
14596 return false;
14597 unsigned Pos = Val.getZExtValue() / Stride;
14598 // TODO: Support Pos >= VF, in this case need to shift the final value.
14599 if (Order[Idx] != VF || Pos >= VF)
14600 return false;
14601 if (SeenPositions.test(Pos))
14602 return false;
14603 SeenPositions.set(Pos);
14604 Order[Idx] = Pos;
14605 }
14606 // One of the indices not set - exit.
// (This also rejects undef lanes here, since they leave Order[Idx] == VF.)
14607 if (is_contained(Order, VF))
14608 return false;
14609 }
// SrcType is the single wide integer that covers all narrow source lanes.
14611 auto *SrcType = IntegerType::getIntNTy(ScalarTy->getContext(),
14612 Stride * LhsTE->getVectorFactor());
14613 FastMathFlags FMF;
14614 SmallPtrSet<Value *, 4> CheckedExtracts;
14615 auto *VecTy =
14616 cast<VectorType>(getWidenedType(ScalarTy, TE.getVectorFactor()));
14617 auto *SrcVecTy =
14618 cast<VectorType>(getWidenedType(SrcScalarTy, LhsTE->getVectorFactor()));
14619 TTI::CastContextHint CastCtx =
14620 getCastContextHint(*getOperandEntry(LhsTE, /*Idx=*/0));
// Vector form: or-reduction + vector shl + vector zext.
14621 InstructionCost VecCost =
14622 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF, CostKind) +
14623 TTI->getArithmeticInstrCost(Instruction::Shl, VecTy, CostKind,
14624 getOperandInfo(LhsTE->Scalars)) +
14625 TTI->getCastInstrCost(
14626 Instruction::ZExt, VecTy,
14627 getWidenedType(SrcScalarTy, LhsTE->getVectorFactor()), CastCtx,
14628 CostKind);
// Scalar form: bitcast of the narrow vector to SrcType, plus a single-source
// permute when the lanes have to be reordered first.
14629 InstructionCost BitcastCost = TTI->getCastInstrCost(
14630 Instruction::BitCast, SrcType, SrcVecTy, CastCtx, CostKind);
14631 if (!Order.empty()) {
14632 fixupOrderingIndices(Order);
14633 SmallVector<int> Mask;
14634 inversePermutation(Order, Mask);
14635 BitcastCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, SrcVecTy,
14636 Mask, CostKind);
14637 }
14638 // Check if the combination can be modeled as a bitcast+byteswap operation.
14639 constexpr unsigned ByteSize = 8;
14640 if (!Order.empty() && isReverseOrder(Order) &&
14641 DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
14642 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
14643 InstructionCost BSwapCost =
14644 TTI->getCastInstrCost(Instruction::BitCast, SrcType, SrcVecTy, CastCtx,
14645 CostKind) +
14646 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14647 if (BSwapCost <= BitcastCost) {
14648 BitcastCost = BSwapCost;
14649 IsBSwap = true;
// The bswap replaces the shuffle, so no explicit order is reported.
14650 Order.clear();
14651 // Check for loads in the ZExt node.
14652 const TreeEntry *SrcTE = getOperandEntry(LhsTE, /*Idx=*/0);
14653 if (SrcTE->State == TreeEntry::Vectorize &&
14654 SrcTE->ReorderIndices.empty() && SrcTE->ReuseShuffleIndices.empty() &&
14655 SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
14656 all_of(SrcTE->Scalars, [](Value *V) { return V->hasOneUse(); })) {
14657 auto *LI = cast<LoadInst>(SrcTE->getMainOp());
14658 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
// Wide scalar load + bswap; when chosen, the vector form is charged for its
// vector load as well so both sides include the memory access.
14659 InstructionCost BSwapCost =
14660 TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
14661 LI->getPointerAddressSpace(), CostKind) +
14662 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
14663 if (BSwapCost <= BitcastCost) {
14664 VecCost +=
14665 TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
14666 LI->getPointerAddressSpace(), CostKind);
14667 BitcastCost = BSwapCost;
14668 ForLoads = true;
14669 }
14670 }
14671 }
14672 } else if (Order.empty() && DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
14673 // Check for loads in the ZExt node.
14674 const TreeEntry *SrcTE = getOperandEntry(LhsTE, /*Idx=*/0);
14675 if (SrcTE->State == TreeEntry::Vectorize && SrcTE->ReorderIndices.empty() &&
14676 SrcTE->ReuseShuffleIndices.empty() &&
14677 SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
14678 all_of(SrcTE->Scalars, [](Value *V) { return V->hasOneUse(); })) {
14679 auto *LI = cast<LoadInst>(SrcTE->getMainOp());
// In-order 8-bit lanes fed by loads: model a single wide scalar load against
// the vector load.
14680 BitcastCost =
14681 TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
14682 LI->getPointerAddressSpace(), CostKind);
14683 VecCost +=
14684 TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
14685 LI->getPointerAddressSpace(), CostKind);
14686 ForLoads = true;
14687 }
14688 }
// If the combined source integer is narrower than the result type, a scalar
// zext is still required.
14689 if (SrcType != ScalarTy) {
14690 BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
14692 }
14693 return BitcastCost < VecCost;
14694}
14695
/// Checks whether the select node \p SelectTE contains zext scalars and is
/// fed by an alternate-opcode compare node in which some compares use the
/// inverse of the main compare predicate, and whether emitting one vector
/// compare with the main predicate is estimated to be cheaper than building
/// the compare vector from the scalar compares.
/// \param InversedCmpsIndices Receives the lane indices of the compares whose
///        predicate matches the inversed main predicate. Indices are appended
///        as they are found, so the vector may hold partial results even when
///        the function returns false.
/// \returns true if the single vector compare cost is strictly lower than the
/// build vector cost (scalarization overhead plus the scalar compares).
/// NOTE(review): the declarations of ZExts and CostKind are not among the
/// visible lines of this listing.
14696bool BoUpSLP::matchesInversedZExtSelect(
14697 const TreeEntry &SelectTE,
14698 SmallVectorImpl<unsigned> &InversedCmpsIndices) const {
14699 assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
14700 "Expected select node.");
// Collect the scalars of the select node that are actually zext
// instructions, together with their lane index.
14702 for (auto [Idx, V] : enumerate(SelectTE.Scalars)) {
14703 auto *Inst = dyn_cast<Instruction>(V);
14704 if (!Inst || Inst->getOpcode() != Instruction::ZExt)
14705 continue;
14706 ZExts.emplace_back(Inst, Idx);
14707 }
14708 if (ZExts.empty())
14709 return false;
14710 const auto *CmpTE = getOperandEntry(&SelectTE, 0);
14711 const auto *Op1TE = getOperandEntry(&SelectTE, 1);
14712 const auto *Op2TE = getOperandEntry(&SelectTE, 2);
14713 // Compares must be alternate vectorized, and other operands must be gathers
14714 // or copyables.
14715 // TODO: investigate opportunity for reordered/reused nodes.
14716 if (CmpTE->State != TreeEntry::Vectorize || !CmpTE->isAltShuffle() ||
14717 (CmpTE->getOpcode() != Instruction::ICmp &&
14718 CmpTE->getOpcode() != Instruction::FCmp) ||
14719 !CmpTE->ReorderIndices.empty() || !CmpTE->ReuseShuffleIndices.empty() ||
14720 !Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
14721 !Op2TE->ReorderIndices.empty() || !Op2TE->ReuseShuffleIndices.empty())
14722 return false;
14723 // The operands must be buildvectors/copyables.
14724 if (!Op1TE->isGather() || !Op2TE->isGather())
14725 return false;
14726 // TODO: investigate opportunity for the vector nodes with copyables.
14727 auto *Cmp = CmpTE->getMainOp();
14728 CmpPredicate Pred;
// MatchCmp binds the predicate of whatever compare it matches into Pred.
14729 auto MatchCmp = m_Cmp(Pred, m_Value(), m_Value());
14730 if (!match(Cmp, MatchCmp))
14731 return false;
14732 CmpPredicate MainPred = Pred;
14733 CmpPredicate InversedPred(CmpInst::getInversePredicate(Pred),
14734 Pred.hasSameSign());
// Classify every compare lane: lanes matching the main predicate are fine,
// lanes matching the inversed predicate are recorded (and must have a single
// use), anything else rejects the pattern. Non-compare lanes are skipped.
14735 for (const auto [Idx, V] : enumerate(CmpTE->Scalars)) {
14736 if (!match(V, MatchCmp))
14737 continue;
14738 if (CmpPredicate::getMatching(MainPred, Pred))
14739 continue;
14740 if (!CmpPredicate::getMatching(InversedPred, Pred))
14741 return false;
14742 if (!V->hasOneUse())
14743 return false;
14744 InversedCmpsIndices.push_back(Idx);
14745 }
14746
14747 if (InversedCmpsIndices.empty())
14748 return false;
14749 Type *VecTy =
14750 getWidenedType(Cmp->getOperand(0)->getType(), CmpTE->getVectorFactor());
14751 Type *CmpTy = CmpInst::makeCmpResultType(VecTy);
14752
// Cost of one vector compare using the main predicate.
14754 InstructionCost VecCost =
14755 TTI->getCmpSelInstrCost(CmpTE->getOpcode(), VecTy, CmpTy, MainPred,
14756 CostKind, getOperandInfo(CmpTE->getOperand(0)),
14757 getOperandInfo(CmpTE->getOperand(1)));
// Cost of the alternative: inserting every lane into a vector plus the cost
// of each scalar compare instruction.
14758 InstructionCost BVCost =
14759 ::getScalarizationOverhead(*TTI, Cmp->getType(), cast<VectorType>(CmpTy),
14760 APInt::getAllOnes(CmpTE->getVectorFactor()),
14761 /*Insert=*/true, /*Extract=*/false, CostKind);
14762 for (Value *V : CmpTE->Scalars) {
14763 auto *I = dyn_cast<Instruction>(V);
14764 if (!I)
14765 continue;
14766 BVCost += TTI->getInstructionCost(I, CostKind);
14767 }
14768 return VecCost < BVCost;
14769}
14770
/// Checks whether the root select node \p SelectTE of an 'or' reduction has
/// the form select(cond, <1, 2, 4, ...>, <0, 0, 0, ...>), i.e. lane I selects
/// between the constant 1 << I and zero, and whether modeling it as a bitcast
/// of the condition vector to an integer (plus a zext if the widths differ) is
/// estimated to be no more expensive than the vector select followed by the
/// or-reduction. Only little-endian targets are handled.
/// \returns true if the bitcast-based cost is less than or equal to the
/// select + reduction cost.
/// NOTE(review): CostKind and parts of two calls are not among the visible
/// lines of this listing.
14771bool BoUpSLP::matchesSelectOfBits(const TreeEntry &SelectTE) const {
14772 assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
14773 "Expected select node.");
14774 if (DL->isBigEndian())
14775 return false;
14776 if (!SelectTE.ReorderIndices.empty() || !SelectTE.ReuseShuffleIndices.empty())
14777 return false;
// Only the root node (Idx == 0) of a tree built with a user ignore list is
// considered.
14778 if (!UserIgnoreList || SelectTE.Idx != 0)
14779 return false;
14780 if (any_of(SelectTE.Scalars, [](Value *V) { return !V->hasOneUse(); }))
14781 return false;
14782 // Check that all reduction operands are 'or' instructions.
14783 if (any_of(*UserIgnoreList,
14784 [](Value *V) { return !match(V, m_Or(m_Value(), m_Value())); }))
14785 return false;
14786 const TreeEntry *Op1TE = getOperandEntry(&SelectTE, 1);
14787 const TreeEntry *Op2TE = getOperandEntry(&SelectTE, 2);
14788 if (!Op1TE->isGather() || !Op2TE->isGather())
14789 return false;
14790 // No need to check for zeroes reordering.
14791 if (!Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
14792 !Op2TE->ReuseShuffleIndices.empty())
14793 return false;
14794 Type *ScalarTy = Op1TE->Scalars.front()->getType();
14795 if (!ScalarTy->isIntegerTy())
14796 return false;
14797 // Check that second operand is all zeroes.
14798 if (any_of(Op2TE->Scalars, [](Value *V) { return !match(V, m_ZeroInt()); }))
14799 return false;
14800 // Check that first operand is 1,2,4,..., i.e. lane I holds 1 << I.
14801 if (any_of(enumerate(Op1TE->Scalars), [](const auto &P) {
14802 uint64_t V;
14803 return !(match(P.value(), m_ConstantInt(V)) && isPowerOf2_64(V) &&
14804 Log2_64(V) == P.index());
14805 }))
14806 return false;
14807 // Check if bitcast is cheaper than select.
// DstTy is an integer with one bit per lane; CmpTy is the compare result
// vector type with the same number of lanes.
14808 auto *DstTy = IntegerType::getIntNTy(ScalarTy->getContext(),
14809 SelectTE.getVectorFactor());
14810 Type *OpTy = getWidenedType(DstTy, SelectTE.getVectorFactor());
14811 Type *CmpTy = CmpInst::makeCmpResultType(OpTy);
14812 auto *VecTy =
14813 cast<VectorType>(getWidenedType(ScalarTy, SelectTE.getVectorFactor()));
// If the node has a MinBWs entry, cost the select on the narrower element
// type recorded there.
14814 auto It = MinBWs.find(&SelectTE);
14815 if (It != MinBWs.end()) {
14816 auto *EffectiveScalarTy =
14817 IntegerType::get(F->getContext(), It->second.first);
14818 VecTy = cast<VectorType>(
14819 getWidenedType(EffectiveScalarTy, SelectTE.getVectorFactor()));
14820 }
14822 InstructionCost BitcastCost = TTI->getCastInstrCost(
14823 Instruction::BitCast, DstTy, CmpTy, TTI::CastContextHint::None, CostKind);
14824 if (DstTy != ScalarTy) {
14825 BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, DstTy,
14827 }
14828 FastMathFlags FMF;
14829 InstructionCost SelectCost =
14830 TTI->getCmpSelInstrCost(Instruction::Select, VecTy, CmpTy,
14832 getOperandInfo(Op1TE->Scalars),
14833 getOperandInfo(Op2TE->Scalars)) +
14834 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF, CostKind);
14835 return BitcastCost <= SelectCost;
14836}
14837
14840 BaseGraphSize = VectorizableTree.size();
14841 // Turn graph transforming mode on and off, when done.
14842 class GraphTransformModeRAAI {
14843 bool &SavedIsGraphTransformMode;
14844
14845 public:
14846 GraphTransformModeRAAI(bool &IsGraphTransformMode)
14847 : SavedIsGraphTransformMode(IsGraphTransformMode) {
14848 IsGraphTransformMode = true;
14849 }
14850 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
14851 } TransformContext(IsGraphTransformMode);
14852 // Operands are profitable if they are:
14853 // 1. At least one constant
14854 // or
14855 // 2. Splats
14856 // or
14857 // 3. Results in good vectorization opportunity, i.e. may generate vector
14858 // nodes and reduce cost of the graph.
14859 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
14860 const InstructionsState &S) {
14862 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
14863 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
14864 I2->getOperand(Op));
14865 return all_of(Candidates, [this](
14866 ArrayRef<std::pair<Value *, Value *>> Cand) {
14867 return all_of(Cand,
14868 [](const std::pair<Value *, Value *> &P) {
14869 return isa<Constant>(P.first) ||
14870 isa<Constant>(P.second) || P.first == P.second;
14871 }) ||
14873 });
14874 };
14875
14876 // Try to reorder gather nodes for better vectorization opportunities.
14877 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
14878 TreeEntry &E = *VectorizableTree[Idx];
14879 if (E.isGather())
14880 reorderGatherNode(E);
14881 }
14882
14883 // Better to use the full gathered loads analysis if there are only 2
14884 // gathered load nodes, each having fewer than 16 elements.
14885 constexpr unsigned VFLimit = 16;
14886 bool ForceLoadGather =
14887 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14888 return TE->isGather() && TE->hasState() &&
14889 TE->getOpcode() == Instruction::Load &&
14890 TE->getVectorFactor() < VFLimit;
14891 }) == 2;
14892
14893 // Checks if the scalars are used in other node.
14894 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
14895 function_ref<bool(Value *)> CheckContainer) {
14896 return TE->isSame(VL) || all_of(VL, [&](Value *V) {
14897 if (isa<PoisonValue>(V))
14898 return true;
14899 auto *I = dyn_cast<Instruction>(V);
14900 if (!I)
14901 return false;
14902 return is_contained(TE->Scalars, I) || CheckContainer(I);
14903 });
14904 };
14905 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
14906 if (E.hasState()) {
14907 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
14908 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
14909 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
14910 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
14911 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
14912 return is_contained(TEs, TE);
14913 });
14914 });
14915 }))
14916 return true;
14917 ;
14918 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
14919 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
14920 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
14921 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
14922 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
14923 return is_contained(TEs, TE);
14924 });
14925 });
14926 }))
14927 return true;
14928 } else {
14929 // Check if the gather node full copy of split node.
14930 auto *It = find_if(E.Scalars, IsaPred<Instruction>);
14931 if (It != E.Scalars.end()) {
14932 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
14933 !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
14934 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
14935 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
14936 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
14937 return is_contained(TEs, TE);
14938 });
14939 });
14940 }))
14941 return true;
14942 }
14943 }
14944 return false;
14945 };
14946 // The tree may grow here, so iterate over nodes, built before.
14947 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
14948 TreeEntry &E = *VectorizableTree[Idx];
14949 if (E.isGather()) {
14950 ArrayRef<Value *> VL = E.Scalars;
14951 const unsigned Sz = getVectorElementSize(VL.front());
14952 unsigned MinVF = getMinVF(2 * Sz);
14953 // Do not try partial vectorization for small nodes (<= 2), nodes with the
14954 // same opcode and same parent block or all constants.
14955 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
14956 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
14957 // We use allSameOpcode instead of isAltShuffle because we don't
14958 // want to use interchangeable instruction here.
14959 !allSameOpcode(VL) || !allSameBlock(VL)) ||
14960 allConstant(VL) || isSplat(VL))
14961 continue;
14962 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
14963 continue;
14964 // Check if the node is a copy of other vector nodes.
14965 if (CheckForSameVectorNodes(E))
14966 continue;
14967 // Try to find vectorizable sequences and transform them into a series of
14968 // insertvector instructions.
14969 unsigned StartIdx = 0;
14970 unsigned End = VL.size();
14971 SmallBitVector Processed(End);
14972 for (unsigned VF = getFloorFullVectorNumberOfElements(
14973 *TTI, VL.front()->getType(), VL.size() - 1);
14974 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
14975 *TTI, VL.front()->getType(), VF - 1)) {
14976 if (StartIdx + VF > End)
14977 continue;
14979 bool AllStrided = true;
14980 // Walk the range in steps of VF, but allow the trailing slice to be
14981 // shorter (SliceVF < VF) so non-power-of-2 tails can be vectorized.
14982 // Processed only records slice starts (Cnt), and downstream consumers
14983 // (test(Cnt), AddCombinedNode's range set/StartIdx update) operate on
14984 // start positions, so partial coverage is consistent.
14985 for (unsigned Cnt = StartIdx; Cnt < End; Cnt += VF) {
14986 const unsigned SliceVF = std::min(VF, End - Cnt);
14987 if (SliceVF <= 1)
14988 continue;
14989 ArrayRef<Value *> Slice = VL.slice(Cnt, SliceVF);
14990 // If any instruction is vectorized already - do not try again.
14991 // Reuse the existing node, if it fully matches the slice.
14992 if ((Processed.test(Cnt) || isVectorized(Slice.front())) &&
14993 !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
14994 continue;
14995 // Constant already handled effectively - skip.
14996 if (allConstant(Slice))
14997 continue;
14998 // Do not try to vectorize small splats (less than vector register and
14999 // only with the single non-undef element).
15000 bool IsSplat = isSplat(Slice);
15001 bool IsTwoRegisterSplat = true;
15002 if (IsSplat && VF == 2) {
15003 unsigned NumRegs2VF = ::getNumberOfParts(
15004 *TTI, getWidenedType(getValueType(Slice.front()), 2 * VF),
15005 getValueType(Slice.front()));
15006 IsTwoRegisterSplat = NumRegs2VF == 2;
15007 }
15008 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
15009 count(Slice, Slice.front()) ==
15010 static_cast<long>(isa<UndefValue>(Slice.front()) ? SliceVF - 1
15011 : 1)) {
15012 if (IsSplat)
15013 continue;
15014 InstructionsState S = getSameOpcode(Slice, *TLI);
15015 if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
15016 (S.getOpcode() == Instruction::Load &&
15018 continue;
15019 if (VF == 2) {
15020 // Cache the cost check lazily - both branches below may need it.
15021 std::optional<bool> MainOpIsCheap;
15022 auto IsMainOpCheap = [&] {
15023 if (!MainOpIsCheap)
15024 MainOpIsCheap =
15025 TTI->getInstructionCost(S.getMainOp(), CostKind) <
15027 return *MainOpIsCheap;
15028 };
15029 // Try to vectorize reduced values or if all users are vectorized.
15030 // For expensive instructions extra extracts might be profitable.
15031 if ((!UserIgnoreList || E.Idx != 0) && IsMainOpCheap() &&
15032 !all_of(Slice, [&](Value *V) {
15033 if (isa<PoisonValue>(V))
15034 return true;
15035 return areAllUsersVectorized(cast<Instruction>(V),
15036 UserIgnoreList);
15037 }))
15038 continue;
15039 if (S.getOpcode() == Instruction::Load) {
15040 OrdersType Order;
15041 SmallVector<Value *> PointerOps;
15042 StridedPtrInfo SPtrInfo;
15043 LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
15044 PointerOps, SPtrInfo);
15045 AllStrided &= Res == LoadsState::StridedVectorize ||
15047 Res == LoadsState::Gather;
15048 // Do not vectorize gathers.
15049 if (Res == LoadsState::ScatterVectorize ||
15050 Res == LoadsState::Gather) {
15051 if (Res == LoadsState::Gather) {
15053 // If reductions and the scalars from the root node are
15054 // analyzed - mark as non-vectorizable reduction.
15055 if (UserIgnoreList && E.Idx == 0)
15056 analyzedReductionVals(Slice);
15057 }
15058 continue;
15059 }
15060 } else if (S.getOpcode() == Instruction::ExtractElement ||
15061 (IsMainOpCheap() &&
15062 !CheckOperandsProfitability(
15063 S.getMainOp(),
15066 S))) {
15067 // Do not vectorize extractelements (handled effectively
15068 // already). Do not vectorize non-profitable instructions (with
15069 // low cost and non-vectorizable operands.)
15070 continue;
15071 }
15072 }
15073 }
15074 Slices.emplace_back(Cnt, Slice.size());
15075 }
15076 // Do not try to vectorize if all slices are strided or gathered with
15077 // vector factor 2 and there are more than 2 slices. Better to handle
15078 // them in gathered loads analysis, may result in better vectorization.
15079 if (VF == 2 && AllStrided && Slices.size() > 2)
15080 continue;
15081 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
15082 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
15083 Processed.set(Cnt, Cnt + Sz);
15084 if (StartIdx == Cnt)
15085 StartIdx = Cnt + Sz;
15086 if (End == Cnt + Sz)
15087 End = Cnt;
15088 };
15089 for (auto [Cnt, Sz] : Slices) {
15090 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
15091 const TreeEntry *SameTE = nullptr;
15092 if (const auto *It = find_if(Slice, IsaPred<Instruction>);
15093 It != Slice.end()) {
15094 // If any instruction is vectorized already - do not try again.
15095 SameTE = getSameValuesTreeEntry(*It, Slice);
15096 }
15097 unsigned PrevSize = VectorizableTree.size();
15098 [[maybe_unused]] unsigned PrevEntriesSize =
15099 LoadEntriesToVectorize.size();
15100 buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
15101 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
15102 VectorizableTree[PrevSize]->isGather() &&
15103 VectorizableTree[PrevSize]->hasState() &&
15104 VectorizableTree[PrevSize]->getOpcode() !=
15105 Instruction::ExtractElement &&
15106 !isSplat(Slice)) {
15107 if (UserIgnoreList && E.Idx == 0 && VF == 2)
15108 analyzedReductionVals(Slice);
15109 VectorizableTree.pop_back();
15110 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
15111 "LoadEntriesToVectorize expected to remain the same");
15112 continue;
15113 }
15114 AddCombinedNode(PrevSize, Cnt, Sz);
15115 }
15116 }
15117 // Restore ordering, if no extra vectorization happened.
15118 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
15119 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
15120 reorderScalars(E.Scalars, Mask);
15121 E.ReorderIndices.clear();
15122 }
15123 }
15124 if (!E.hasState())
15125 continue;
15126 switch (E.getOpcode()) {
15127 case Instruction::Load: {
15128 // No need to reorder masked gather loads, just reorder the scalar
15129 // operands.
15130 if (E.State != TreeEntry::Vectorize)
15131 break;
15132 Type *ScalarTy = E.getMainOp()->getType();
15133 auto *VecTy =
15134 cast<FixedVectorType>(getWidenedType(ScalarTy, E.Scalars.size()));
15135 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
15136 // Check if profitable to represent consecutive load + reverse as strided
15137 // load with stride -1.
15138 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
15139 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
15140 SmallVector<int> Mask;
15141 inversePermutation(E.ReorderIndices, Mask);
15142 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
15143 InstructionCost OriginalVecCost =
15144 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
15145 BaseLI->getPointerAddressSpace(), CostKind,
15147 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
15148 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
15149 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
15150 VecTy, BaseLI->getPointerOperand(),
15151 /*VariableMask=*/false, CommonAlignment,
15152 BaseLI),
15153 CostKind);
15154 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
15155 // Strided load is more profitable than consecutive load + reverse -
15156 // transform the node to strided load.
15157 Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
15158 ->getPointerOperand()
15159 ->getType());
15160 StridedPtrInfo SPtrInfo;
15161 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
15162 SPtrInfo.Ty = VecTy;
15163 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
15164 E.State = TreeEntry::StridedVectorize;
15165 }
15166 }
15167 break;
15168 }
15169 case Instruction::Store: {
15170 Type *ScalarTy =
15171 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
15172 auto *VecTy =
15173 cast<FixedVectorType>(getWidenedType(ScalarTy, E.Scalars.size()));
15174 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
15175 // Check if profitable to represent reverse + consecutive store as strided
15176 // store with stride -1.
15177 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
15178 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
15179 SmallVector<int> Mask;
15180 inversePermutation(E.ReorderIndices, Mask);
15181 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
15182 InstructionCost OriginalVecCost =
15183 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
15184 BaseSI->getPointerAddressSpace(), CostKind,
15186 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
15187 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
15188 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
15189 VecTy, BaseSI->getPointerOperand(),
15190 /*VariableMask=*/false, CommonAlignment,
15191 BaseSI),
15192 CostKind);
15193 if (StridedCost < OriginalVecCost) {
15194 // Strided store is more profitable than reverse + consecutive store -
15195 // transform the node to strided store.
15196 E.State = TreeEntry::StridedVectorize;
15197 Type *StrideTy = DL->getIndexType(cast<StoreInst>(E.Scalars.front())
15198 ->getPointerOperand()
15199 ->getType());
15200 StridedPtrInfo SPtrInfo;
15201 SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, -1);
15202 SPtrInfo.Ty = VecTy;
15203 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
15204 }
15205 } else if (!E.ReorderIndices.empty()) {
15206 // Check for interleaved stores.
15207 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
15208 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
15209 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
15210 if (Mask.size() < 4)
15211 return 0u;
15212 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
15214 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
15215 TTI.isLegalInterleavedAccessType(
15216 VecTy, Factor, BaseSI->getAlign(),
15217 BaseSI->getPointerAddressSpace()))
15218 return Factor;
15219 }
15220
15221 return 0u;
15222 };
15223 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
15224 unsigned InterleaveFactor = IsInterleaveMask(Mask);
15225 if (InterleaveFactor != 0)
15226 E.setInterleave(InterleaveFactor);
15227 }
15228 break;
15229 }
15230 case Instruction::Select: {
15231 if (E.State != TreeEntry::Vectorize)
15232 break;
15233 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
15234 if (MinMaxID != Intrinsic::not_intrinsic) {
15235 // This node is a minmax node.
15236 E.CombinedOp = TreeEntry::MinMax;
15237 TreeEntry *CondEntry = getOperandEntry(&E, 0);
15238 if (SelectOnly && CondEntry->UserTreeIndex &&
15239 CondEntry->State == TreeEntry::Vectorize) {
15240 // The condition node is part of the combined minmax node.
15241 CondEntry->State = TreeEntry::CombinedVectorize;
15242 }
15243 break;
15244 }
15245 // Check for zext + selects, which can be reordered.
15246 SmallVector<unsigned> InversedCmpsIndices;
15247 if (matchesInversedZExtSelect(E, InversedCmpsIndices)) {
15248 auto *CmpTE = getOperandEntry(&E, 0);
15249 auto *Op1TE = getOperandEntry(&E, 1);
15250 auto *Op2TE = getOperandEntry(&E, 2);
15251 // State now is uniform, not alternate opcode.
15252 CmpTE->setOperations(
15253 InstructionsState(CmpTE->getMainOp(), CmpTE->getMainOp()));
15254 // Update mapping between the swapped values and their internal matching
15255 // nodes.
15256 auto UpdateGatherEntry = [&](TreeEntry *OldTE, TreeEntry *NewTE,
15257 Value *V) {
15258 if (isConstant(V))
15259 return;
15260 auto It = ValueToGatherNodes.find(V);
15261 assert(It != ValueToGatherNodes.end() &&
15262 "Expected to find the value in the map.");
15263 auto &C = It->getSecond();
15264 if (!is_contained(OldTE->Scalars, V))
15265 C.remove(OldTE);
15266 C.insert(NewTE);
15267 };
15268 ValueList &Op1 = E.getOperand(1);
15269 ValueList &Op2 = E.getOperand(2);
15270 for (const unsigned Idx : InversedCmpsIndices) {
15271 Value *V1 = Op1TE->Scalars[Idx];
15272 Value *V2 = Op2TE->Scalars[Idx];
15273 std::swap(Op1TE->Scalars[Idx], Op2TE->Scalars[Idx]);
15274 std::swap(Op1[Idx], Op2[Idx]);
15275 UpdateGatherEntry(Op1TE, Op2TE, V1);
15276 UpdateGatherEntry(Op2TE, Op1TE, V2);
15277 }
15278 OperandsToTreeEntry.emplace_or_assign(std::make_pair(&E, 1), Op1TE);
15279 OperandsToTreeEntry.emplace_or_assign(std::make_pair(&E, 2), Op2TE);
15280 // NB: Fallback to check if select can be converted to cmp bitcast.
15281 }
15282 if (matchesSelectOfBits(E)) {
15283 // This node is a (reduced or) cmp bitcast node.
15284 const TreeEntry::CombinedOpcode Code = TreeEntry::ReducedCmpBitcast;
15285 E.CombinedOp = Code;
15286 auto *Op1TE = getOperandEntry(&E, 1);
15287 auto *Op2TE = getOperandEntry(&E, 2);
15288 Op1TE->State = TreeEntry::CombinedVectorize;
15289 Op1TE->CombinedOp = Code;
15290 Op2TE->State = TreeEntry::CombinedVectorize;
15291 Op2TE->CombinedOp = Code;
15292 break;
15293 }
15294 break;
15295 }
15296 case Instruction::FSub:
15297 case Instruction::FAdd: {
15298 // Check if possible to convert (a*b)+c to fma.
15299 if (E.State != TreeEntry::Vectorize ||
15300 !E.getOperations().isAddSubLikeOp() ||
15301 E.getOperations().isAltShuffle())
15302 break;
15303 const TreeEntry *LHS = getOperandEntry(&E, 0);
15304 const TreeEntry *RHS = getOperandEntry(&E, 1);
15305 auto IsOneUseVectorFMulOperand = [](const TreeEntry *TE) {
15306 return TE->State == TreeEntry::Vectorize &&
15307 TE->ReorderIndices.empty() && TE->ReuseShuffleIndices.empty() &&
15308 TE->getOpcode() == Instruction::FMul && !TE->isAltShuffle() &&
15309 all_of(TE->Scalars, [&](Value *V) {
15310 return (TE->hasCopyableElements() &&
15311 TE->isCopyableElement(V)) ||
15312 V->hasOneUse();
15313 });
15314 };
15315 if (!IsOneUseVectorFMulOperand(LHS) &&
15316 (E.getOpcode() == Instruction::FSub ||
15317 !IsOneUseVectorFMulOperand(RHS)))
15318 break;
15319 if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
15320 .isValid())
15321 break;
15322 // This node is a fmuladd node.
15323 E.CombinedOp = TreeEntry::FMulAdd;
15324 TreeEntry *FMulEntry = getOperandEntry(&E, 0);
15325 if (FMulEntry->UserTreeIndex &&
15326 FMulEntry->State == TreeEntry::Vectorize) {
15327 // The FMul node is part of the combined fmuladd node.
15328 FMulEntry->State = TreeEntry::CombinedVectorize;
15329 }
15330 break;
15331 }
15332 case Instruction::Shl: {
15333 if (E.Idx != 0 || DL->isBigEndian())
15334 break;
15335 if (!UserIgnoreList)
15336 break;
15337 // Check that all reduction operands are disjoint or instructions.
15338 if (any_of(*UserIgnoreList, [](Value *V) {
15339 return !match(V, m_DisjointOr(m_Value(), m_Value()));
15340 }))
15341 break;
15342 OrdersType Order;
15343 bool IsBSwap;
15344 bool ForLoads;
15345 if (!matchesShlZExt(E, Order, IsBSwap, ForLoads))
15346 break;
15347 // This node is a (reduced disjoint or) bitcast node.
15348 TreeEntry::CombinedOpcode Code =
15349 IsBSwap ? (ForLoads ? TreeEntry::ReducedBitcastBSwapLoads
15350 : TreeEntry::ReducedBitcastBSwap)
15351 : (ForLoads ? TreeEntry::ReducedBitcastLoads
15352 : TreeEntry::ReducedBitcast);
15353 E.CombinedOp = Code;
15354 E.ReorderIndices = std::move(Order);
15355 TreeEntry *ZExtEntry = getOperandEntry(&E, 0);
15356 assert(ZExtEntry->UserTreeIndex &&
15357 ZExtEntry->State == TreeEntry::Vectorize &&
15358 ZExtEntry->getOpcode() == Instruction::ZExt &&
15359 "Expected ZExt node.");
15360 // The ZExt node is part of the combined node.
15361 ZExtEntry->State = TreeEntry::CombinedVectorize;
15362 ZExtEntry->CombinedOp = Code;
15363 if (ForLoads) {
15364 TreeEntry *LoadsEntry = getOperandEntry(ZExtEntry, 0);
15365 assert(LoadsEntry->UserTreeIndex &&
15366 LoadsEntry->State == TreeEntry::Vectorize &&
15367 LoadsEntry->getOpcode() == Instruction::Load &&
15368 "Expected Load node.");
15369 // The Load node is part of the combined node.
15370 LoadsEntry->State = TreeEntry::CombinedVectorize;
15371 LoadsEntry->CombinedOp = Code;
15372 }
15373 TreeEntry *ConstEntry = getOperandEntry(&E, 1);
15374 assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
15375 "Expected ZExt node.");
15376 // The ConstNode node is part of the combined node.
15377 ConstEntry->State = TreeEntry::CombinedVectorize;
15378 ConstEntry->CombinedOp = Code;
15379 break;
15380 }
15381 default:
15382 break;
15383 }
15384 }
15385
15386 if (LoadEntriesToVectorize.empty()) {
15387 // Single load node - exit.
15388 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
15389 VectorizableTree.front()->getOpcode() == Instruction::Load)
15390 return;
15391 // Small graph with small VF - exit.
15392 constexpr unsigned SmallTree = 3;
15393 constexpr unsigned SmallVF = 2;
15394 if ((VectorizableTree.size() <= SmallTree &&
15395 VectorizableTree.front()->Scalars.size() == SmallVF) ||
15396 (VectorizableTree.size() <= 2 && UserIgnoreList))
15397 return;
15398
15399 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15400 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
15401 getCanonicalGraphSize() <= SmallTree &&
15402 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
15403 [](const std::unique_ptr<TreeEntry> &TE) {
15404 return TE->isGather() && TE->hasState() &&
15405 TE->getOpcode() == Instruction::Load &&
15406 !allSameBlock(TE->Scalars);
15407 }) == 1)
15408 return;
15409 }
15410
15411 // A list of loads to be gathered during the vectorization process. We can
15412 // try to vectorize them at the end, if profitable.
15413 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
15415 GatheredLoads;
15416
15417 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15418 TreeEntry &E = *TE;
15419 if (E.isGather() &&
15420 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
15421 (!E.hasState() && any_of(E.Scalars,
15422 [&](Value *V) {
15423 return isa<LoadInst>(V) &&
15424 !isVectorized(V) &&
15425 !isDeleted(cast<Instruction>(V));
15426 }))) &&
15427 !isSplat(E.Scalars)) {
15428 for (Value *V : E.Scalars) {
15429 auto *LI = dyn_cast<LoadInst>(V);
15430 if (!LI)
15431 continue;
15432 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
15433 continue;
15435 *this, V, *DL, *SE, *TTI,
15436 GatheredLoads[std::make_tuple(
15437 LI->getParent(),
15438 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
15439 LI->getType())]);
15440 }
15441 }
15442 }
15443 // Try to vectorize gathered loads if this is not just a gather of loads.
15444 if (!GatheredLoads.empty())
15445 tryToVectorizeGatheredLoads(GatheredLoads);
15446}
15447
15448/// Merges shuffle masks and emits final shuffle instruction, if required. It
15449/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
15450/// when the actual shuffle instruction is generated only if this is actually
15451/// required. Otherwise, the shuffle instruction emission is delayed till the
15452/// end of the process, to reduce the number of emitted instructions and further
15453/// analysis/transformations.
15454class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
// NOTE(review): presumably records that the final cost has been produced; no
// use is visible in this part of the class - confirm.
15455 bool IsFinalized = false;
/// Mask accumulated for the pending inputs. estimateNodesPermuteCost() copies
/// per-part sub-masks into it and rewrites it after every costed shuffle.
15456 SmallVector<int> CommonMask;
// NOTE(review): the embedded numbering skips 15457 here, and the methods below
// read a member named InVectors that is not declared in the visible lines -
// its declaration was most likely dropped from this listing.
15458 /// Captures the original scalar VL of a single, "clean" gather() call so
15459 /// the values can be forwarded as the Args operand to getShuffleCost() for
15460 /// the final permutation in finalize(). This lets the target cost model
15461 /// recognize patterns such as broadcast-of-load (e.g. on X86,
15462 /// vbroadcast{ss,sd} folds the broadcast and the load into one instruction
15463 /// under AVX/AVX2 and is reported as TCC_Free by getShuffleCost). The
15464 /// state machine is:
15465 /// * engaged + empty: tracking active, no qualifying gather seen yet.
15466 /// * engaged + non-empty: exactly one qualifying gather observed and its
15467 /// VL still corresponds to InVectors.front().
15468 /// * disengaged: the cached VL is no longer trustworthy (multiple
15469 /// gather() calls, or a state-mutating add() happened).
15470 std::optional<SmallVector<Value *>> BVValues = SmallVector<Value *>();
/// Target cost model queried for the insert, shuffle and cast costs below.
15471 const TargetTransformInfo &TTI;
/// Running total; estimateNodesPermuteCost() adds every createShuffle()
/// result to it.
15472 InstructionCost Cost = 0;
15473 SmallDenseSet<Value *> VectorizedVals;
/// Parent vectorizer; the methods below use it for getGatherCost(), MinBWs,
/// DL and TTI.
15474 BoUpSLP &R;
15475 SmallPtrSetImpl<Value *> &CheckedExtracts;
/// Cost kind handed to the TTI queries made by this class.
15476 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15477 /// While set, still trying to estimate the cost for the same nodes and we
15478 /// can delay actual cost estimation (virtual shuffle instruction emission).
15479 /// May help better estimate the cost if same nodes must be permuted + allows
15480 /// to move most of the long shuffles cost estimation to TTI.
15481 bool SameNodesEstimated = true;
15482
/// Returns an all-ones constant of type \p Ty.
///
/// Non-pointer types are forwarded to Constant::getAllOnesValue(). For a type
/// whose scalar type is a pointer, the value is built from an integer type as
/// wide as the pointer's store size in bits (queried from \p DL) and, when
/// \p Ty is a vector, splatted to its element count.
// NOTE(review): the embedded numbering jumps from 15484 to 15487, so the two
// lines that declare Res and open its initializer are missing from this
// listing; how the integer is turned into a pointer cannot be confirmed here.
15483 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
15484 if (Ty->getScalarType()->isPointerTy()) {
15487 IntegerType::get(Ty->getContext(),
15488 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
15489 Ty->getScalarType());
// Widen the scalar constant to every lane when a vector type was requested.
15490 if (auto *VTy = dyn_cast<VectorType>(Ty))
15491 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
15492 return Res;
15493 }
15494 return Constant::getAllOnesValue(Ty);
15495 }
15496
/// Estimates the cost of materializing the scalars \p VL as one vector.
///
/// \param VL   Scalars to place into the vector; its size is the lane count.
/// \param Root When null, all-constant lists are free and splats take the
///             dedicated splat path below.
/// \returns TCC_Free for all-undef lists (and for all-constant lists without
/// a root), an insert (plus a shuffle when required) for a rootless splat,
/// and otherwise the result of R.getGatherCost().
15497 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
15498 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
15499 return TTI::TCC_Free;
15500 auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, VL.size()));
// NOTE(review): in the visible code GatherCost stays 0 and Gathers stays an
// unmodified copy of VL until the final return.
15501 InstructionCost GatherCost = 0;
15502 SmallVector<Value *> Gathers(VL);
15503 if (!Root && isSplat(VL)) {
15504 // Found the broadcasting of the single scalar, calculate the cost as
15505 // the broadcast.
15506 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
15507 assert(It != VL.end() && "Expected at least one non-undef value.");
15508 // Add broadcast for non-identity shuffle only.
15509 bool NeedShuffle =
15510 count(VL, *It) > 1 &&
15511 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
15512 if (!NeedShuffle) {
// The value occupies a single lane: cost one insertion at that lane. When
// ScalarTy is itself a fixed vector the insertion is costed as a subvector
// insert at the matching element offset.
15513 if (isa<FixedVectorType>(ScalarTy)) {
15514 assert(SLPReVec && "FixedVectorType is not expected.");
15515 return TTI.getShuffleCost(
15516 TTI::SK_InsertSubvector, VecTy, VecTy, {}, CostKind,
15517 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
15518 cast<FixedVectorType>(ScalarTy));
15519 }
15520 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
15521 CostKind, std::distance(VL.begin(), It),
15522 PoisonValue::get(VecTy), *It);
15523 }
15524
// Mask selecting lane 0 for every non-poison element of VL; poison elements
// stay poison in the mask.
15525 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
15526 transform(VL, ShuffleMask.begin(), [](Value *V) {
15527 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
15528 });
// Cost = insert of the scalar into lane 0 + the shuffle that spreads it.
15529 InstructionCost InsertCost =
15530 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
15531 PoisonValue::get(VecTy), *It);
// NOTE(review): the line numbered 15533, which carries the shuffle-kind
// argument of this call, is missing from this listing.
15532 return InsertCost + ::getShuffleCost(TTI,
15534 VecTy, ShuffleMask, CostKind,
15535 /*Index=*/0, /*SubTp=*/nullptr,
15536 /*Args=*/*It);
15537 }
// NOTE(review): the line numbered 15540 (the value used when every element of
// Gathers is undef) is missing from this listing.
15538 return GatherCost +
15539 (all_of(Gathers, IsaPred<UndefValue>)
15541 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
15542 ScalarTy));
15543 };
15544
15545 /// Compute the cost of creating a vector containing the extracted values from
15546 /// \p VL.
///
/// \param VL           Scalars being gathered; must hold more elements than
///                     \p NumParts (asserted).
/// \param Mask         Shuffle mask over the source vector(s); it is sliced
///                     per part.
/// \param ShuffleKinds One optional shuffle kind per part; parts without a
///                     kind are skipped.
/// \param NumParts     Number of parts \p VL is split into.
/// \returns the accumulated shuffle cost over all parts.
// NOTE(review): the line numbered 15547 (the return type of this method) is
// missing from this listing; the body returns an InstructionCost.
15548 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
15549 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
15550 unsigned NumParts) {
15551 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
// NumElts = the largest element count among the fixed-vector operands of the
// extractelement instructions in VL (0 if there are none).
15552 unsigned NumElts = accumulate(VL, 0, [](unsigned Sz, Value *V) {
15553 auto *EE = dyn_cast<ExtractElementInst>(V);
15554 if (!EE)
15555 return Sz;
15556 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
15557 if (!VecTy)
15558 return Sz;
15559 return std::max(Sz, VecTy->getNumElements());
15560 });
15561 // FIXME: this must be moved to TTI for better estimation.
15562 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
// Tries to express one part's mask in terms of at most two
// EltsPerVector-wide source registers. On success the mask is rewritten in
// place to register-local indices (lanes from the second register are offset
// by EltsPerVector) and the shuffle kind is returned. Returns std::nullopt
// when the widest source is not wider than one register or when more than
// two registers are referenced.
// NOTE(review): the lines numbered 15564 and 15579 are missing from this
// listing; the call site passes `Indices` as the second argument, and
// `ShuffleKind` is used below without a visible declaration.
// NOTE(review): Indices is seeded with one entry while SubVecSizes only
// receives an entry once a second register is seen, so the two lists can
// differ in length when they are zipped after the call - confirm the pairing
// is intended.
15563 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
15565 SmallVectorImpl<unsigned> &SubVecSizes)
15566 -> std::optional<TTI::ShuffleKind> {
15567 if (NumElts <= EltsPerVector)
15568 return std::nullopt;
// Smallest non-poison mask index, rounded down to a register boundary.
15569 int OffsetReg0 = alignDown(accumulate(Mask, INT_MAX,
15570 [](int S, int I) {
15571 if (I == PoisonMaskElem)
15572 return S;
15573 return std::min(S, I);
15574 }),
15575 EltsPerVector);
15576 int OffsetReg1 = OffsetReg0;
15577 DenseSet<int> RegIndices;
15578 // Check that if trying to permute same single/2 input vectors.
15580 int FirstRegId = -1;
15581 Indices.assign(1, OffsetReg0);
15582 for (auto [Pos, I] : enumerate(Mask)) {
15583 if (I == PoisonMaskElem)
15584 continue;
15585 int Idx = I - OffsetReg0;
15586 int RegId =
15587 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
15588 if (FirstRegId < 0)
15589 FirstRegId = RegId;
15590 RegIndices.insert(RegId);
15591 if (RegIndices.size() > 2)
15592 return std::nullopt;
15593 if (RegIndices.size() == 2) {
15594 ShuffleKind = TTI::SK_PermuteTwoSrc;
15595 if (Indices.size() == 1) {
// First time a second register shows up: its base offset is the smallest
// remaining mask index that does not belong to the first register, rounded
// down to a register boundary.
15596 OffsetReg1 = alignDown(
15597 std::accumulate(
15598 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
15599 [&](int S, int I) {
15600 if (I == PoisonMaskElem)
15601 return S;
15602 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
15603 ((I - OffsetReg0) % NumElts) / EltsPerVector;
15604 if (RegId == FirstRegId)
15605 return S;
15606 return std::min(S, I);
15607 }),
15608 EltsPerVector);
15609 unsigned Index = OffsetReg1 % NumElts;
15610 Indices.push_back(Index);
15611 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
15612 }
15613 Idx = I - OffsetReg1;
15614 }
// Rewrite the mask element (I is a reference into Mask) to a register-local
// index.
15615 I = (Idx % NumElts) % EltsPerVector +
15616 (RegId == FirstRegId ? 0 : EltsPerVector);
15617 }
15618 return ShuffleKind;
15619 };
15620 InstructionCost Cost = 0;
15621
15622 // Process extracts in blocks of EltsPerVector to check if the source vector
15623 // operand can be re-used directly. If not, add the cost of creating a
15624 // shuffle to extract the values into a vector register.
15625 for (unsigned Part : seq<unsigned>(NumParts)) {
15626 if (!ShuffleKinds[Part])
15627 continue;
15628 ArrayRef<int> MaskSlice = Mask.slice(
15629 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
// Work on a register-wide copy padded with poison so the lambda can rewrite
// it without touching the caller's mask.
15630 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
15631 copy(MaskSlice, SubMask.begin());
// NOTE(review): the line numbered 15632, presumably the declaration of
// `Indices`, is missing from this listing.
15633 SmallVector<unsigned, 2> SubVecSizes;
15634 std::optional<TTI::ShuffleKind> RegShuffleKind =
15635 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
15636 if (!RegShuffleKind) {
// No per-register form: cost the part with its original kind on the
// NumElts-wide type.
// NOTE(review): the line numbered 15638, which opens the second operand of
// this condition, is missing from this listing.
15637 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
15639 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
15640 Cost += ::getShuffleCost(
15641 TTI, *ShuffleKinds[Part],
15642 cast<VectorType>(getWidenedType(ScalarTy, NumElts)), MaskSlice);
15643 continue;
15644 }
// Per-register permute; skipped when it is a single-source identity.
15645 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
15646 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
15647 Cost += ::getShuffleCost(
15648 TTI, *RegShuffleKind,
15649 cast<VectorType>(getWidenedType(ScalarTy, EltsPerVector)), SubMask);
15650 }
15651 const unsigned BaseVF = getFullVectorNumberOfElements(
15652 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
// Add one cost per recorded (start index, size) pair.
// NOTE(review): the line numbered 15657, which carries the leading arguments
// of this getShuffleCost call, is missing from this listing; the assert
// message names SK_ExtractSubvector.
15653 for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
15654 assert((Idx + SubVecSize) <= BaseVF &&
15655 "SK_ExtractSubvector index out of range");
15656 Cost += ::getShuffleCost(
15658 cast<VectorType>(getWidenedType(ScalarTy, BaseVF)), {}, CostKind,
15659 Idx, cast<VectorType>(getWidenedType(ScalarTy, SubVecSize)));
15660 }
15661 // Second attempt to check, if just a permute is better estimated than
15662 // subvector extract.
15663 SubMask.assign(NumElts, PoisonMaskElem);
15664 copy(MaskSlice, SubMask.begin());
15665 InstructionCost OriginalCost = ::getShuffleCost(
15666 TTI, *ShuffleKinds[Part],
15667 cast<VectorType>(getWidenedType(ScalarTy, NumElts)), SubMask);
// NOTE(review): Cost is the running total over all parts handled so far,
// while OriginalCost covers only the current part - confirm this comparison
// is intended.
15668 if (OriginalCost < Cost)
15669 Cost = OriginalCost;
15670 }
15671 return Cost;
15672 }
15673 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
15674 /// mask \p Mask, register number \p Part, that includes \p SliceSize
15675 /// elements.
///
/// While SameNodesEstimated is set and the pending inputs are exactly
/// \p E1 / \p E2, nothing is costed: the slice of \p Mask belonging to
/// \p Part is copied into CommonMask and the function returns. Otherwise the
/// pending shuffle described by CommonMask is costed first, the flag is
/// cleared, and the new nodes are folded into CommonMask with further
/// createShuffle() costs added to Cost.
15676 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
15677 ArrayRef<int> Mask, unsigned Part,
15678 unsigned SliceSize) {
15679 if (SameNodesEstimated) {
15680 // Delay the cost estimation if the same nodes are reshuffling.
15681 // If we already requested the cost of reshuffling of E1 and E2 before, no
15682 // need to estimate another cost with the sub-Mask, instead include this
15683 // sub-Mask into the CommonMask to estimate it later and avoid double cost
15684 // estimation.
15685 if ((InVectors.size() == 2 &&
15686 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
15687 cast<const TreeEntry *>(InVectors.back()) == E2) ||
15688 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
// Number of mask elements that belong to this part.
15689 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
15690 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
15691 [](int Idx) { return Idx == PoisonMaskElem; }) &&
15692 "Expected all poisoned elements.");
15693 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
15694 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
15695 return;
15696 }
15697 // Found non-matching nodes - need to estimate the cost for the matched
15698 // and transform mask.
15699 Cost += createShuffle(InVectors.front(),
15700 InVectors.size() == 1 ? nullptr : InVectors.back(),
15701 CommonMask);
15702 transformMaskAfterShuffle(CommonMask, CommonMask);
15703 } else if (InVectors.size() == 2) {
// Two inputs are pending: cost their shuffle before adding new nodes.
15704 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
15705 transformMaskAfterShuffle(CommonMask, CommonMask);
15706 }
// From here on costs are estimated immediately rather than deferred.
15707 SameNodesEstimated = false;
15708 if (!E2 && InVectors.size() == 1) {
// One new node against one pending input. VF is the larger of E1's vector
// factor and the pending input's.
15709 unsigned VF = E1.getVectorFactor();
15710 if (Value *V1 = dyn_cast<Value *>(InVectors.front())) {
15711 VF = std::max(VF, getVF(V1));
15712 } else {
15713 const auto *E = cast<const TreeEntry *>(InVectors.front());
15714 VF = std::max(VF, E->getVectorFactor());
15715 }
// Lanes requested by Mask that are still poison in CommonMask are taken from
// E1, addressed as the second shuffle source (offset by VF).
15716 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
15717 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
15718 CommonMask[Idx] = Mask[Idx] + VF;
15719 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
15720 transformMaskAfterShuffle(CommonMask, CommonMask);
15721 } else {
// Cost the shuffle of E1 and E2 by Mask first, then a second shuffle driven
// by CommonMask.
15722 auto P = InVectors.front();
15723 Cost += createShuffle(&E1, E2, Mask);
15724 unsigned VF = Mask.size();
15725 if (Value *V1 = dyn_cast<Value *>(P)) {
15726 VF = std::max(VF,
15727 getNumElements(V1->getType()));
15728 } else {
15729 const auto *E = cast<const TreeEntry *>(P);
15730 VF = std::max(VF, E->getVectorFactor());
15731 }
// NOTE(review): InVectors.front() has already been read above, so the
// InVectors.empty() test below looks like it can never be true - confirm.
15732 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
15733 if (Mask[Idx] != PoisonMaskElem)
15734 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
// NOTE(review): P was copied from InVectors.front() and nothing visible in
// between reassigns InVectors, so both operands appear to be the same input
// here - confirm.
15735 Cost += createShuffle(P, InVectors.front(), CommonMask);
15736 transformMaskAfterShuffle(CommonMask, CommonMask);
15737 }
15738 }
15739
/// Helper instantiated by createShuffle(). Its create* callbacks return an
/// InstructionCost for the requested operation; the visible bodies only query
/// TTI or return TCC_Free.
15740 class ShuffleCostBuilder {
15741 const TargetTransformInfo &TTI;
15742
/// Tells whether \p Mask needs no shuffle for a source of \p VF elements.
// NOTE(review): the lines numbered 15747-15748 are missing from this listing,
// so only the empty-mask test, the VF == Mask.size() test and the trailing
// Index == 0 test are visible.
15743 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
15744 int Index = -1;
15745 return Mask.empty() ||
15746 (VF == Mask.size() &&
15749 Index == 0);
15750 }
15751
15752 public:
15753 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
15754 ~ShuffleCostBuilder() = default;
/// Cost of shuffling two sources by \p Mask. The second source is unused:
/// the cost is queried as SK_PermuteTwoSrc on the type of \p V1.
15755 InstructionCost createShuffleVector(Value *V1, Value *,
15756 ArrayRef<int> Mask) const {
15757 // Empty mask or identity mask are free.
15758 unsigned VF =
15759 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
15760 if (isEmptyOrIdentity(Mask, VF))
15761 return TTI::TCC_Free;
15762 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
15763 cast<VectorType>(V1->getType()), Mask);
15764 }
/// Cost of shuffling the single source \p V1 by \p Mask; \p VL is forwarded
/// to getShuffleCost() as its last argument.
// NOTE(review): the line numbered 15773, which carries the leading arguments
// of the getShuffleCost call (including the shuffle kind), is missing from
// this listing.
15765 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask,
15766 ArrayRef<Value *> VL) const {
15767 // Empty mask or identity mask are free.
15768 unsigned VF =
15769 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
15770 if (isEmptyOrIdentity(Mask, VF))
15771 return TTI::TCC_Free;
15772 return ::getShuffleCost(
15774 TTI::TCK_RecipThroughput, /*Index=*/0, /*SubTp=*/nullptr, VL);
15775 }
/// Reusing a value unchanged is free.
15776 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
/// A poison vector is free regardless of \p Ty and \p VF.
15777 InstructionCost createPoison(Type *Ty, unsigned VF) const {
15778 return TTI::TCC_Free;
15779 }
/// Intentionally empty in this builder.
15780 void resizeToMatch(Value *&, Value *&) const {}
15781 };
15782
15783 /// Smart shuffle instruction emission, walks through shuffles trees and
15784 /// tries to find the best matching vector for the actual shuffle
15785 /// instruction.
// In this cost-estimating variant the inputs \p P1 / P2 are each either a
// Value or a TreeEntry. The routine copies \p Mask into a local CommonMask,
// rewrites it according to which kind of input is present, replaces the
// inputs with placeholder constants of the widened ScalarTy, and returns the
// accumulated cast cost (ExtraCost) plus the result of
// BaseShuffleAnalysis::createShuffle<InstructionCost>() on the placeholders.
// NOTE(review): listing line 15786 is absent from this extract (presumably
// the return type) - confirm against the upstream file.
15787 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
// NOTE(review): listing line 15788 is absent from this extract (presumably
// the declaration of the P2 parameter used below) - confirm against upstream.
15789 ArrayRef<int> Mask, ArrayRef<Value *> VL = {}) {
15790 ShuffleCostBuilder Builder(TTI);
15791 SmallVector<int> CommonMask(Mask);
15792 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
15793 unsigned CommonVF = Mask.size();
15794 InstructionCost ExtraCost = 0;
// Cost of the cast required when the scalar type of entry E (narrowed via
// R.MinBWs when an entry is recorded there) differs from ScalarTy. Gathers
// made only of constants are free.
15795 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
15796 unsigned VF) -> InstructionCost {
15797 if (E.isGather() && allConstant(E.Scalars))
15798 return TTI::TCC_Free;
15799 Type *EScalarTy = E.Scalars.front()->getType();
15800 bool IsSigned = true;
15801 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
15802 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
15803 IsSigned = It->second.second;
15804 }
15805 if (EScalarTy != ScalarTy) {
// Default to a truncation; switch to sext/zext only when widening.
15806 unsigned CastOpcode = Instruction::Trunc;
15807 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
15808 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
15809 if (DstSz > SrcSz)
15810 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
15811 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
15812 getWidenedType(EScalarTy, VF),
15813 TTI::CastContextHint::None, CostKind);
15814 }
15815 return TTI::TCC_Free;
15816 };
// Same idea for a vector value V: cost of casting its element type to
// ScalarTy. Constants are free; signedness comes from isKnownNonNegative().
15817 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
15818 if (isa<Constant>(V))
15819 return TTI::TCC_Free;
15820 auto *VecTy = cast<VectorType>(V->getType());
15821 Type *EScalarTy = VecTy->getElementType();
15822 if (EScalarTy != ScalarTy) {
15823 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
15824 unsigned CastOpcode = Instruction::Trunc;
15825 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
15826 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
15827 if (DstSz > SrcSz)
15828 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
15829 return TTI.getCastInstrCost(
15830 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
15831 VecTy, TTI::CastContextHint::None, CostKind);
15832 }
15833 return TTI::TCC_Free;
15834 };
15835 if (!V1 && !V2 && !P2.isNull()) {
15836 // Shuffle 2 entry nodes.
15837 const TreeEntry *E = cast<const TreeEntry *>(P1);
15838 unsigned VF = E->getVectorFactor();
15839 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
15840 CommonVF = std::max(VF, E2->getVectorFactor());
15841 assert(all_of(Mask,
15842 [=](int Idx) {
15843 return Idx < 2 * static_cast<int>(CommonVF);
15844 }) &&
15845 "All elements in mask must be less than 2 * CommonVF.");
15846 if (E->Scalars.size() == E2->Scalars.size()) {
// Both entries hold the same number of scalars: fold each entry's own common
// mask into CommonMask and index in units of Scalars.size() instead.
15847 SmallVector<int> EMask = E->getCommonMask();
15848 SmallVector<int> E2Mask = E2->getCommonMask();
15849 if (!EMask.empty() || !E2Mask.empty()) {
15850 for (int &Idx : CommonMask) {
15851 if (Idx == PoisonMaskElem)
15852 continue;
15853 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
15854 Idx = EMask[Idx];
15855 else if (Idx >= static_cast<int>(CommonVF))
15856 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
15857 E->Scalars.size();
15858 }
15859 }
15860 CommonVF = E->Scalars.size();
15861 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
15862 GetNodeMinBWAffectedCost(*E2, CommonVF);
15863 } else {
15864 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
15865 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
15866 }
// Stand-in operands: only their type (ScalarTy x CommonVF) matters below.
15867 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
15868 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
15869 } else if (!V1 && P2.isNull()) {
15870 // Shuffle single entry node.
15871 const TreeEntry *E = cast<const TreeEntry *>(P1);
15872 unsigned VF = E->getVectorFactor();
15873 CommonVF = VF;
15874 assert(
15875 all_of(Mask,
15876 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
15877 "All elements in mask must be less than CommonVF.");
15878 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
15879 SmallVector<int> EMask = E->getCommonMask();
15880 assert(!EMask.empty() && "Expected non-empty common mask.");
15881 for (int &Idx : CommonMask) {
15882 if (Idx != PoisonMaskElem)
15883 Idx = EMask[Idx];
15884 }
15885 CommonVF = E->Scalars.size();
15886 } else if (unsigned Factor = E->getInterleaveFactor();
15887 Factor > 0 && E->Scalars.size() != Mask.size() &&
// NOTE(review): listing line 15888 is absent from this extract; the condition
// is incomplete as shown (presumably a deinterleave-mask check taking
// Factor) - confirm against upstream.
15889 Factor)) {
15890 // Deinterleaved nodes are free.
15891 std::iota(CommonMask.begin(), CommonMask.end(), 0);
15892 }
15893 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
15894 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
15895 // Not identity/broadcast? Try to see if the original vector is better.
15896 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
15897 CommonVF == CommonMask.size() &&
15898 any_of(enumerate(CommonMask),
15899 [](const auto &&P) {
15900 return P.value() != PoisonMaskElem &&
15901 static_cast<unsigned>(P.value()) != P.index();
15902 }) &&
15903 any_of(CommonMask,
15904 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
15905 SmallVector<int> ReorderMask;
15906 inversePermutation(E->ReorderIndices, ReorderMask);
15907 ::addMask(CommonMask, ReorderMask);
15908 }
15909 } else if (V1 && P2.isNull()) {
15910 // Shuffle single vector.
15911 ExtraCost += GetValueMinBWAffectedCost(V1);
15912 CommonVF = getVF(V1);
15913 assert(
15914 all_of(Mask,
15915 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
15916 "All elements in mask must be less than CommonVF.");
15917 } else if (V1 && !V2) {
15918 // Shuffle vector and tree node.
15919 unsigned VF = getVF(V1);
15920 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
15921 CommonVF = std::max(VF, E2->getVectorFactor());
15922 assert(all_of(Mask,
15923 [=](int Idx) {
15924 return Idx < 2 * static_cast<int>(CommonVF);
15925 }) &&
15926 "All elements in mask must be less than 2 * CommonVF.");
15927 if (E2->Scalars.size() == VF && VF != CommonVF) {
15928 SmallVector<int> E2Mask = E2->getCommonMask();
15929 assert(!E2Mask.empty() && "Expected non-empty common mask.");
15930 for (int &Idx : CommonMask) {
15931 if (Idx == PoisonMaskElem)
15932 continue;
15933 if (Idx >= static_cast<int>(CommonVF))
15934 Idx = E2Mask[Idx - CommonVF] + VF;
15935 }
15936 CommonVF = VF;
15937 }
15938 ExtraCost += GetValueMinBWAffectedCost(V1);
15939 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
15940 ExtraCost += GetNodeMinBWAffectedCost(
15941 *E2, std::min(CommonVF, E2->getVectorFactor()));
15942 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
15943 } else if (!V1 && V2) {
15944 // Shuffle vector and tree node.
15945 unsigned VF = getVF(V2);
15946 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
15947 CommonVF = std::max(VF, E1->getVectorFactor());
15948 assert(all_of(Mask,
15949 [=](int Idx) {
15950 return Idx < 2 * static_cast<int>(CommonVF);
15951 }) &&
15952 "All elements in mask must be less than 2 * CommonVF.");
15953 if (E1->Scalars.size() == VF && VF != CommonVF) {
15954 SmallVector<int> E1Mask = E1->getCommonMask();
15955 assert(!E1Mask.empty() && "Expected non-empty common mask.");
// NOTE(review): lanes >= CommonVF select from the second input (V2), yet they
// are remapped through E1Mask as well - confirm this is intended.
15956 for (int &Idx : CommonMask) {
15957 if (Idx == PoisonMaskElem)
15958 continue;
15959 if (Idx >= static_cast<int>(CommonVF))
15960 Idx = E1Mask[Idx - CommonVF] + VF;
15961 else
15962 Idx = E1Mask[Idx];
15963 }
15964 CommonVF = VF;
15965 }
15966 ExtraCost += GetNodeMinBWAffectedCost(
15967 *E1, std::min(CommonVF, E1->getVectorFactor()));
15968 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
15969 ExtraCost += GetValueMinBWAffectedCost(V2);
15970 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
15971 } else {
15972 assert(V1 && V2 && "Expected both vectors.");
15973 unsigned VF = getVF(V1);
15974 CommonVF = std::max(VF, getVF(V2));
15975 assert(all_of(Mask,
15976 [=](int Idx) {
15977 return Idx < 2 * static_cast<int>(CommonVF);
15978 }) &&
15979 "All elements in mask must be less than 2 * CommonVF.");
15980 ExtraCost +=
15981 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
// Replace both inputs when their types differ; otherwise replace only the
// ones whose element type is not ScalarTy.
15982 if (V1->getType() != V2->getType()) {
15983 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
15984 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
15985 } else {
15986 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
15987 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
15988 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
15989 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
15990 }
15991 }
// Collapse the pending inputs into one placeholder vector whose length is
// the size of the final mask.
15992 InVectors.front() =
15993 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
15994 if (InVectors.size() == 2)
15995 InVectors.pop_back();
15996 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
15997 V1, V2, CommonMask, Builder, ScalarTy, VL);
15998 }
15999
16000public:
// Constructor: forwards ScalarTy to BaseShuffleAnalysis, stores the TTI and
// BoUpSLP references, copies VectorizedVals into the member of the same name
// and binds the caller-owned CheckedExtracts set.
// NOTE(review): listing line 16001 is absent from this extract (presumably
// the constructor name and its leading ScalarTy/TTI parameters) - confirm
// against upstream.
16002 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
16003 SmallPtrSetImpl<Value *> &CheckedExtracts)
16004 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
16005 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
16006 CheckedExtracts(CheckedExtracts) {}
// Adjusts Cost for the extractelement scalars of entry \p E selected by
// \p Mask and records E as the single pending shuffle input (InVectors /
// CommonMask). Returns nullptr when \p Mask is empty; otherwise returns the
// vector operand of the last extractelement visited, or a null-constant
// placeholder when NumParts != 1 and the number of distinct vector operands
// seen is not exactly one (in which case \p UseVecBaseAsInput is set).
16007 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
16008 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
16009 unsigned NumParts, bool &UseVecBaseAsInput) {
16010 UseVecBaseAsInput = false;
16011 if (Mask.empty())
16012 return nullptr;
16013 Value *VecBase = nullptr;
16014 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
16015 if (!E->ReorderIndices.empty()) {
16016 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
16017 E->ReorderIndices.end());
16018 reorderScalars(VL, ReorderMask);
16019 }
16020 // Check if it can be considered reused if same extractelements were
16021 // vectorized already.
// Only tree entries created before E (take_front(E->Idx)) are inspected.
16022 bool PrevNodeFound = any_of(
16023 ArrayRef(R.VectorizableTree).take_front(E->Idx),
16024 [&](const std::unique_ptr<TreeEntry> &TE) {
16025 return ((TE->hasState() && !TE->isAltShuffle() &&
16026 TE->getOpcode() == Instruction::ExtractElement) ||
16027 TE->isGather()) &&
16028 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
16029 return VL.size() > Data.index() &&
16030 (Mask[Data.index()] == PoisonMaskElem ||
16031 isa<UndefValue>(VL[Data.index()]) ||
16032 Data.value() == VL[Data.index()]);
16033 });
16034 });
16035 SmallPtrSet<Value *, 4> UniqueBases;
16036 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
// Per source vector: which of its elements are extracted by scalars that are
// expected to become dead.
16037 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
16038 for (unsigned Part : seq<unsigned>(NumParts)) {
16039 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
16040 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
16041 for (auto [I, V] :
16042 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
16043 // Ignore non-extractelement scalars.
16044 if (isa<UndefValue>(V) ||
16045 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
16046 continue;
16047 // If all users of instruction are going to be vectorized and this
16048 // instruction itself is not going to be vectorized, consider this
16049 // instruction as dead and remove its cost from the final cost of the
16050 // vectorized tree.
16051 // Also, avoid adjusting the cost for extractelements with multiple uses
16052 // in different graph entries.
16053 auto *EE = cast<ExtractElementInst>(V);
16054 VecBase = EE->getVectorOperand();
16055 UniqueBases.insert(VecBase);
16056 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
16057 if (!CheckedExtracts.insert(V).second ||
16058 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
16059 any_of(VEs,
16060 [&](const TreeEntry *TE) {
16061 return R.DeletedNodes.contains(TE) ||
16062 R.TransformedToGatherNodes.contains(TE);
16063 }) ||
16064 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
16065 !R.isVectorized(EE) &&
16066 count_if(E->Scalars, [&](Value *V) { return V == EE; }) !=
16067 count_if(E->UserTreeIndex.UserTE->Scalars,
16068 [&](Value *V) { return V == EE; })) ||
16069 any_of(EE->users(),
16070 [&](User *U) {
16071 return isa<GetElementPtrInst>(U) &&
16072 !R.areAllUsersVectorized(cast<Instruction>(U),
16073 &VectorizedVals);
16074 }) ||
16075 (!VEs.empty() && !is_contained(VEs, E)))
16076 continue;
16077 std::optional<unsigned> EEIdx = getExtractIndex(EE);
16078 if (!EEIdx)
16079 continue;
16080 unsigned Idx = *EEIdx;
16081 // Take credit for instruction that will become dead.
16082 if (EE->hasOneUse() || !PrevNodeFound) {
16083 Instruction *Ext = EE->user_back();
16084 if (isa<SExtInst, ZExtInst>(Ext) &&
// NOTE(review): listing line 16085 is absent from this extract; the rest of
// this condition is not visible here - confirm against upstream.
16086 // Use getExtractWithExtendCost() to calculate the cost of
16087 // extractelement/ext pair.
16088 Cost -= TTI.getExtractWithExtendCost(
16089 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
16090 Idx, CostKind);
16091 // Add back the cost of s|zext which is subtracted separately.
16092 Cost += TTI.getCastInstrCost(
16093 Ext->getOpcode(), Ext->getType(), EE->getType(),
// NOTE(review): listing line 16094 is absent from this extract (presumably
// the trailing arguments of getCastInstrCost) - confirm against upstream.
16095 continue;
16096 }
16097 }
16098 APInt &DemandedElts =
16099 VectorOpsToExtracts
16100 .try_emplace(VecBase,
16101 APInt::getZero(getNumElements(VecBase->getType())))
16102 .first->getSecond();
16103 DemandedElts.setBit(Idx);
16104 }
16105 }
16106 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
// NOTE(review): listing line 16107 is absent from this extract; the loop body
// statement is incomplete as shown (the visible arguments request an
// extract-only scalarization overhead) - confirm against upstream.
16108 DemandedElts, /*Insert=*/false,
16109 /*Extract=*/true, CostKind);
16110 // Check that gather of extractelements can be represented as just a
16111 // shuffle of a single/two vectors the scalars are extracted from.
16112 // Found the bunch of extractelement instructions that must be gathered
16113 // into a vector and can be represented as a permutation elements in a
16114 // single input vector or of 2 input vectors.
16115 // Done for reused if same extractelements were vectorized already.
16116 if (!PrevNodeFound)
16117 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
16118 InVectors.assign(1, E);
16119 CommonMask.assign(Mask.begin(), Mask.end());
16120 transformMaskAfterShuffle(CommonMask, CommonMask);
16121 SameNodesEstimated = false;
16122 if (NumParts != 1 && UniqueBases.size() != 1) {
16123 UseVecBaseAsInput = true;
16124 VecBase =
16125 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
16126 }
16127 return VecBase;
16128 }
16129 /// Checks if the specified entry \p E needs to be delayed because of its
16130 /// dependency nodes.
// This implementation ignores its arguments and always returns std::nullopt.
16131 std::optional<InstructionCost>
16132 needToDelay(const TreeEntry *,
// NOTE(review): listing line 16133 is absent from this extract (presumably
// the remaining unnamed parameter(s) and the opening brace) - confirm against
// upstream.
16134 // No need to delay the cost estimation during analysis.
16135 return std::nullopt;
16136 }
16137 /// Reset the builder to handle perfect diamond match.
// NOTE(review): listing line 16138 (the function signature) is absent from
// this extract - confirm the name and signature against upstream.
// Body: clears the finalized flag, the pending mask and inputs, the
// accumulated Cost and VectorizedVals, and re-arms SameNodesEstimated.
16139 IsFinalized = false;
16140 CommonMask.clear();
16141 InVectors.clear();
16142 Cost = 0;
16143 VectorizedVals.clear();
16144 SameNodesEstimated = true;
16145 }
16146 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
16147 BVValues.reset();
16148 if (&E1 == &E2) {
16149 assert(all_of(Mask,
16150 [&](int Idx) {
16151 return Idx < static_cast<int>(E1.getVectorFactor());
16152 }) &&
16153 "Expected single vector shuffle mask.");
16154 add(E1, Mask);
16155 return;
16156 }
16157 if (InVectors.empty()) {
16158 CommonMask.assign(Mask.begin(), Mask.end());
16159 InVectors.assign({&E1, &E2});
16160 return;
16161 }
16162 assert(!CommonMask.empty() && "Expected non-empty common mask.");
16163 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
16164 unsigned NumParts =
16165 ::getNumberOfParts(TTI, MaskVecTy, ScalarTy, Mask.size());
16166 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
16167 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
16168 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
16169 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
16170 }
16171 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
16172 BVValues.reset();
16173 if (InVectors.empty()) {
16174 CommonMask.assign(Mask.begin(), Mask.end());
16175 InVectors.assign(1, &E1);
16176 return;
16177 }
16178 assert(!CommonMask.empty() && "Expected non-empty common mask.");
16179 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
16180 unsigned NumParts =
16181 ::getNumberOfParts(TTI, MaskVecTy, ScalarTy, Mask.size());
16182 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
16183 const auto *It = find_if(Mask, not_equal_to(PoisonMaskElem));
16184 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
16185 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
16186 if (!SameNodesEstimated && InVectors.size() == 1)
16187 InVectors.emplace_back(&E1);
16188 }
16189 /// Adds 2 input vectors and the mask for their shuffling.
16190 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
16191 // May come only for shuffling of 2 vectors with extractelements, already
16192 // handled in adjustExtracts.
16193 assert(InVectors.size() == 1 &&
16194 all_of(enumerate(CommonMask),
16195 [&](auto P) {
16196 if (P.value() == PoisonMaskElem)
16197 return Mask[P.index()] == PoisonMaskElem;
16198 auto *EI = cast<ExtractElementInst>(
16199 cast<const TreeEntry *>(InVectors.front())
16200 ->getOrdered(P.index()));
16201 return EI->getVectorOperand() == V1 ||
16202 EI->getVectorOperand() == V2;
16203 }) &&
16204 "Expected extractelement vectors.");
16205 }
16206 /// Adds another one input vector and the mask for the shuffling.
16207 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
16208 if (BVValues && !isa<Constant>(V1))
16209 BVValues.reset();
16210 if (InVectors.empty()) {
16211 assert(CommonMask.empty() && !ForExtracts &&
16212 "Expected empty input mask/vectors.");
16213 CommonMask.assign(Mask.begin(), Mask.end());
16214 InVectors.assign(1, V1);
16215 return;
16216 }
16217 if (ForExtracts) {
16218 // No need to add vectors here, already handled them in adjustExtracts.
16219 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
16220 !CommonMask.empty() &&
16221 all_of(enumerate(CommonMask),
16222 [&](auto P) {
16223 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
16224 ->getOrdered(P.index());
16225 if (P.value() == PoisonMaskElem)
16226 return P.value() == Mask[P.index()] ||
16227 isa<UndefValue>(Scalar);
16228 if (isa<Constant>(V1))
16229 return true;
16230 auto *EI = cast<ExtractElementInst>(Scalar);
16231 return EI->getVectorOperand() == V1;
16232 }) &&
16233 "Expected only tree entry for extractelement vectors.");
16234 return;
16235 }
16236 assert(!InVectors.empty() && !CommonMask.empty() &&
16237 "Expected only tree entries from extracts/reused buildvectors.");
16238 unsigned VF = getVF(V1);
16239 if (InVectors.size() == 2) {
16240 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
16241 transformMaskAfterShuffle(CommonMask, CommonMask);
16242 VF = std::max<unsigned>(VF, CommonMask.size());
16243 } else if (const auto *InTE =
16244 InVectors.front().dyn_cast<const TreeEntry *>()) {
16245 VF = std::max(VF, InTE->getVectorFactor());
16246 } else {
16247 VF = std::max(
16248 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
16249 ->getNumElements());
16250 }
16251 InVectors.push_back(V1);
16252 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16253 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
16254 CommonMask[Idx] = Mask[Idx] + VF;
16255 }
// Adds the buildvector cost of \p VL (optionally on top of \p Root) to Cost
// and returns a constant placeholder vector. Without a Root the placeholder
// keeps poison/undef lanes of VL and uses a null value for every other lane,
// limited to min(VL.size(), MaskVF) lanes when MaskVF is non-zero.
16256 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
16257 Value *Root = nullptr) {
16258 Cost += getBuildVectorCost(VL, Root);
// BVValues remembers VL only for the very first gather with no other inputs
// queued; any other sequence drops it.
16259 if (BVValues) {
16260 if (BVValues->empty() && InVectors.empty())
16261 BVValues->assign(VL.begin(), VL.end());
16262 else
16263 BVValues.reset();
16264 }
16265 if (!Root) {
16266 // FIXME: Need to find a way to avoid use of getNullValue here.
// NOTE(review): listing line 16267 is absent from this extract (presumably
// the declaration of Vals used below) - confirm against upstream.
16268 unsigned VF = VL.size();
16269 if (MaskVF != 0)
16270 VF = std::min(VF, MaskVF);
16271 Type *VLScalarTy = VL.front()->getType();
16272 for (Value *V : VL.take_front(VF)) {
16273 Type *ScalarTy = VLScalarTy->getScalarType();
16274 if (isa<PoisonValue>(V)) {
16275 Vals.push_back(PoisonValue::get(ScalarTy));
16276 continue;
16277 }
16278 if (isa<UndefValue>(V)) {
16279 Vals.push_back(UndefValue::get(ScalarTy));
16280 continue;
16281 }
16282 Vals.push_back(Constant::getNullValue(ScalarTy));
16283 }
16284 if (auto *VecTy = dyn_cast<FixedVectorType>(VLScalarTy)) {
16285 assert(SLPReVec && "FixedVectorType is not expected.");
16286 // When REVEC is enabled, we need to expand vector types into scalar
16287 // types.
16288 Vals = replicateMask(Vals, VecTy->getNumElements());
16289 }
16290 return ConstantVector::get(Vals);
16291 }
// NOTE(review): listing lines 16292-16293 are absent from this extract; the
// return expression for the Root case is incomplete as shown (the visible
// arguments are Root's element count and an all-ones scalar) - confirm
// against upstream.
16294 cast<FixedVectorType>(Root->getType())->getNumElements()),
16295 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
16296 }
16298 /// Finalize emission of the shuffles.
// Marks the estimator finalized and returns the total cost: the pending
// inputs are folded through createShuffle() before running \p Action and
// before inserting \p SubVectors, \p ExtMask is then composed with
// CommonMask, and one last createShuffle() is costed unless no mask remains.
// NOTE(review): listing line 16299 is absent from this extract (presumably
// the return type and function name) - confirm against upstream.
16300 ArrayRef<int> ExtMask,
16301 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
16302 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
// NOTE(review): listing lines 16303-16304 are absent from this extract
// (presumably the type of the Action callback parameter) - confirm against
// upstream.
16305 Action = {}) {
16306 IsFinalized = true;
16307 if (Action) {
16308 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
16309 if (InVectors.size() == 2)
16310 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
16311 else
16312 Cost += createShuffle(Vec, nullptr, CommonMask);
16313 transformMaskAfterShuffle(CommonMask, CommonMask);
16314 assert(VF > 0 &&
16315 "Expected vector length for the final value before action.");
// createShuffle() above replaced InVectors.front() with a Value, so the cast
// below is expected to succeed.
16316 Value *V = cast<Value *>(Vec);
16317 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
16318 Cost += createShuffle(V1, V2, Mask);
16319 return V1;
16320 });
16321 InVectors.front() = V;
16322 }
16323 if (!SubVectors.empty()) {
16324 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
16325 if (InVectors.size() == 2)
16326 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
16327 else
16328 Cost += createShuffle(Vec, nullptr, CommonMask);
16329 transformMaskAfterShuffle(CommonMask, CommonMask);
16330 // Add subvectors permutation cost.
16331 if (!SubVectorsMask.empty()) {
16332 assert(SubVectorsMask.size() <= CommonMask.size() &&
16333 "Expected same size of masks for subvectors and common mask.");
16334 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
16335 copy(SubVectorsMask, SVMask.begin());
// Lanes already defined by CommonMask are addressed as second-source lanes
// (offset by CommonMask.size()).
16336 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
16337 if (I2 != PoisonMaskElem) {
16338 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
16339 I1 = I2 + CommonMask.size();
16340 }
16341 }
// NOTE(review): listing lines 16342-16343 are absent from this extract; the
// statement is incomplete as shown (presumably a shuffle-cost query whose
// result is added to Cost) - confirm against upstream.
16344 cast<VectorType>(getWidenedType(ScalarTy, CommonMask.size())),
16345 SVMask, CostKind);
16346 }
16347 for (auto [E, Idx] : SubVectors) {
16348 Type *EScalarTy = E->Scalars.front()->getType();
16349 bool IsSigned = true;
16350 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
16351 EScalarTy =
16352 IntegerType::get(EScalarTy->getContext(), It->second.first);
16353 IsSigned = It->second.second;
16354 }
16355 if (ScalarTy != EScalarTy) {
// Default to a truncation; switch to sext/zext only when widening.
16356 unsigned CastOpcode = Instruction::Trunc;
16357 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
16358 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
16359 if (DstSz > SrcSz)
16360 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
16361 Cost += TTI.getCastInstrCost(
16362 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
16363 getWidenedType(EScalarTy, E->getVectorFactor()),
// NOTE(review): listing line 16364 is absent from this extract (presumably
// the trailing arguments of getCastInstrCost) - confirm against upstream.
16365 }
// NOTE(review): listing lines 16366-16367 are absent from this extract; the
// statement is incomplete as shown (presumably the cost of inserting the
// subvector at Idx) - confirm against upstream.
16368 cast<VectorType>(getWidenedType(ScalarTy, CommonMask.size())), {},
16369 CostKind, Idx,
16370 cast<VectorType>(getWidenedType(ScalarTy, E->getVectorFactor())));
16371 if (!CommonMask.empty()) {
// The lanes covered by the inserted subvector become identity lanes.
16372 std::iota(std::next(CommonMask.begin(), Idx),
16373 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
16374 Idx);
16375 }
16376 }
16377 }
16378
16379 if (!ExtMask.empty()) {
16380 if (CommonMask.empty()) {
16381 CommonMask.assign(ExtMask.begin(), ExtMask.end());
16382 } else {
// Compose: each ExtMask lane selects a lane of the current CommonMask.
16383 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
16384 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
16385 if (ExtMask[I] == PoisonMaskElem)
16386 continue;
16387 NewMask[I] = CommonMask[ExtMask[I]];
16388 }
16389 CommonMask.swap(NewMask);
16390 }
16391 }
16392 if (CommonMask.empty()) {
16393 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
16394 return Cost;
16395 }
// NOTE(review): listing line 16396 is absent from this extract (presumably
// the declaration of VL assigned below) - confirm against upstream.
16397 if (BVValues)
16398 VL = *BVValues;
16399 return Cost +
16400 createShuffle(InVectors.front(),
16401 InVectors.size() == 2 ? InVectors.back() : nullptr,
16402 CommonMask, VL);
16403 }
16404
// NOTE(review): listing line 16405 (the signature that opens this body) is
// absent from this extract; given the closing of the class right after, it
// is presumably the destructor - confirm against upstream.
// Sanity check: either the estimator was finalized or no mask was ever
// queued.
16406 assert((IsFinalized || CommonMask.empty()) &&
16407 "Shuffle construction must be finalized.");
16408 }
16409};
16410
16411const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
16412 unsigned Idx) const {
16413 TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
16414 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
16415 return Op;
16416}
16417
// Selects a TTI::CastContextHint for \p TE based on its State, and for plain
// (non-alternate) vectorized loads on whether ReorderIndices is empty or
// describes a reverse mask.
// NOTE(review): every `return` line of this function (listing lines 16421,
// 16423, 16427, 16431 and 16433) is absent from this extract, so the hint
// chosen in each case is not visible here - confirm against upstream.
16418TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
16419 if (TE.State == TreeEntry::ScatterVectorize ||
16420 TE.State == TreeEntry::StridedVectorize)
16422 if (TE.State == TreeEntry::CompressVectorize)
16424 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
16425 !TE.isAltShuffle()) {
16426 if (TE.ReorderIndices.empty())
16428 SmallVector<int> Mask;
16429 inversePermutation(TE.ReorderIndices, Mask);
16430 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
16432 }
16434}
16435
16436/// Get the assumed loop trip count for the loop \p L.
16437static unsigned getLoopTripCount(const Loop *L, ScalarEvolution &SE) {
16438 if (LoopAwareTripCount == 0)
16439 return 1;
16440 unsigned Scale = SE.getSmallConstantTripCount(L);
16441 if (Scale == 0)
16442 Scale = getLoopEstimatedTripCount(const_cast<Loop *>(L)).value_or(0);
16443 if (Scale != 0) {
16444 // Multiple exiting blocks - choose the minimum between trip count (scale)
16445 // and LoopAwareTripCount, since the multiple exit loops can be terminated
16446 // early.
16447 if (!L->getExitingBlock())
16448 return std::min<unsigned>(LoopAwareTripCount, Scale);
16449 return Scale;
16450 }
16451 return LoopAwareTripCount;
16452}
16453
16454uint64_t BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
16455 Instruction *U) {
16456 BasicBlock *Parent = nullptr;
16457 if (U) {
16458 // The extractelement for a PHI-node user is created in the incoming
16459 // block that feeds the matching operand, not in the PHI block itself
16460 // When the PHI is inside a loop that incoming block can belong to a deeper
16461 // loop than the PHI block. Scaling by the PHI block would use
16462 // the outer trip count instead of inner*outer, and because
16463 // ExtractCostCalculated deduplicates by scalar (only the first external
16464 // user fixes the scale) it would also make the cost depend on external-user
16465 // ordering. A PHI outside all loops is a plain loop-exit phi: its live-out
16466 // lanes are normally rebuilt as a vector LCSSA phi in the exit block, which
16467 // hoists the extract out of the loop, so scale = 1 (via U->getParent()
16468 // below) is kept and the adjustment is restricted to in-loop PHIs.
16469 if (auto *PHI = dyn_cast<PHINode>(U); PHI && Scalar) {
16470 if (LI->getLoopFor(PHI->getParent())) {
16471 // Use the deepest incoming block among all slots where Scalar
16472 // appears, to be conservative when the same value appears in
16473 // multiple predecessors.
16474 for (unsigned I : seq<unsigned>(PHI->getNumIncomingValues())) {
16475 if (PHI->getIncomingValue(I) != Scalar)
16476 continue;
16477 BasicBlock *InBB = PHI->getIncomingBlock(I);
16478 if (!Parent || LI->getLoopDepth(InBB) > LI->getLoopDepth(Parent))
16479 Parent = InBB;
16480 }
16481 }
16482 }
16483 if (!Parent)
16484 Parent = U->getParent();
16485 } else if (TE.isGather() || TE.State == TreeEntry::SplitVectorize) {
16486 EdgeInfo EI = TE.UserTreeIndex;
16487 while (EI.UserTE) {
16488 if (EI.UserTE->isGather() ||
16489 EI.UserTE->State == TreeEntry::SplitVectorize) {
16490 EI = EI.UserTE->UserTreeIndex;
16491 continue;
16492 }
16493 if (EI.UserTE->State == TreeEntry::Vectorize &&
16494 EI.UserTE->getOpcode() == Instruction::PHI) {
16495 auto *PH = cast<PHINode>(EI.UserTE->getMainOp());
16496 Parent = PH->getIncomingBlock(EI.EdgeIdx);
16497 } else {
16498 Parent = EI.UserTE->getMainOp()->getParent();
16499 }
16500 break;
16501 }
16502 if (!Parent)
16503 return 1;
16504 } else {
16505 Parent = TE.getMainOp()->getParent();
16506 }
16507 const Loop *L = LI->getLoopFor(Parent);
16508 if (!L)
16509 return 1;
16510 // The entry's cost is paid once per execution of the innermost loop in
16511 // which some of its operands are variant. Operands that are invariant in
16512 // all enclosing loops are executed once (LICM will hoist them out).
16513 return getLoopNestScale(findInnermostNonInvariantLoop(
16514 L, Scalar ? ArrayRef(Scalar) : ArrayRef(TE.Scalars)));
16515}
16516
16517uint64_t BoUpSLP::getLoopNestScale(const Loop *L) {
16518 if (!L || LoopAwareTripCount == 0)
16519 return 1;
16520 if (auto It = LoopNestScaleCache.find(L); It != LoopNestScaleCache.end())
16521 return It->second;
16522 // Collect loops from L outward up to (but not including) the first cached
16523 // ancestor or the function top, then walk back inward multiplying trip
16524 // counts. Use uint64_t to avoid silent overflow on deep/large nests.
16525 SmallVector<const Loop *> Chain;
16526 for (const Loop *Cur = L; Cur; Cur = Cur->getParentLoop()) {
16527 if (LoopNestScaleCache.contains(Cur))
16528 break;
16529 Chain.push_back(Cur);
16530 }
16531 assert(!Chain.empty() && "Early-return above should have handled cache hit.");
16532 uint64_t Scale = 1;
16533 if (const Loop *Parent = Chain.back()->getParentLoop())
16534 Scale = LoopNestScaleCache.lookup(Parent);
16535 // Walk from the outermost uncached loop inward, accumulating trip counts.
16536 // Use SaturatingMultiply to clamp at uint64_t max on deep/large nests
16537 // rather than wrapping around.
16538 for (const Loop *Cur : reverse(Chain)) {
16539 uint64_t TC = std::max<uint64_t>(1, getLoopTripCount(Cur, *SE));
16540 Scale = SaturatingMultiply(Scale, TC);
16541 LoopNestScaleCache.try_emplace(Cur, std::max<uint64_t>(1, Scale));
16542 }
16543 return std::max<uint64_t>(1, Scale);
16544}
16545
16546uint64_t BoUpSLP::getGatherNodeEffectiveScale(const TreeEntry &TE,
16547 Instruction *U) {
16548 // Only meaningful for gather/buildvector-like entries; the per-lane
16549 // insertelements that make up such an entry are LICM-hoistable by
16550 // optimizeGatherSequence() when their operand is loop-invariant.
16551 assert((TE.isGather() || TE.State == TreeEntry::SplitVectorize) &&
16552 "Expected gather/split tree entry.");
16553
16554 uint64_t BaseScale = getScaleToLoopIterations(TE, nullptr, U);
16555 if (!PerLaneGatherScale || LoopAwareTripCount == 0 || BaseScale <= 1)
16556 return BaseScale;
16557
16558 // Average the per-lane execution scales: for each lane, reuse the same
16559 // scale helper the rest of the cost model uses, but ask it about that
16560 // one lane's value. Lanes that are loop-invariant in the current nest
16561 // collapse to their outer-loop scale (or 1 for fully invariant/constant
16562 // lanes), which matches the LICM hoisting performed by
16563 // optimizeGatherSequence(). Cap per-lane contributions by BaseScale so a
16564 // refinement can never raise the cost above the whole-entry scale.
16565 // Each lane contributes at most BaseScale, so Sum is bounded above by
16566 // N * BaseScale. If BaseScale is near uint64_t max (saturated by
16567 // getLoopNestScale on a deep nest) Sum can still overflow uint64_t,
16568 // which would silently wrap and produce a wrong average. Use
16569 // SaturatingAdd and bail out to BaseScale on overflow: the true average
16570 // is bounded above by BaseScale anyway, so this preserves the
16571 // refinement's invariant that it can never raise cost.
16572 uint64_t Sum = 0;
16573 unsigned N = 0;
16574 bool Overflow = false;
16575 for (Value *V : TE.Scalars) {
16576 if (isConstant(V))
16577 continue;
16578 ++N;
16579 uint64_t LaneScale =
16580 std::min(getScaleToLoopIterations(TE, V, U), BaseScale);
16581 Sum = SaturatingAdd(Sum, LaneScale, &Overflow);
16582 if (Overflow)
16583 return BaseScale;
16584 }
16585 if (N == 0)
16586 return BaseScale;
16587 // Ceil-divide so we never round the effective scale down below 1.
16588 uint64_t Numerator = SaturatingAdd(Sum, uint64_t(N - 1), &Overflow);
16589 if (Overflow)
16590 return BaseScale;
16591 uint64_t Avg = Numerator / N;
16592 return std::clamp<uint64_t>(Avg, 1, BaseScale);
16593}
16594
/// Estimates the extra spill/reload cost of vectorizing tree entry \p E from a
/// simple per-register-class pressure model: the register parts needed by the
/// entry's vector operands and by its result are summed per class, and every
/// part beyond what the target reports as available is charged a reload (and
/// usually a spill) cost.
///
/// \param E          Tree entry being costed.
/// \param ScalarTy   Scalar type used when computing the result's parts.
/// \param VecTy      Vector type of the result.
/// \param FinalVecTy Final vector type of the result; counted in addition to
///                   \p VecTy when the two differ.
/// CostKind (used below) is forwarded to the TTI spill/reload cost queries.
/// \returns the accumulated cost; zero when the entry is not modeled or no
/// register class is oversubscribed.
16596BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
16597 Type *VecTy, Type *FinalVecTy,
16599 InstructionCost SpillsReloads = 0;
16600
16601 // Estimate vector register pressure per target register class: operand
16602 // vectors plus the result. In the generic path the same operand entry is
16603 // counted once (CountedOpEntries) and all-constant or splat operand
16604 // bundles are skipped. PHIs take the max operand pressure across incoming
16605 // slots (only one predecessor is live at a time) plus the result.
// Entries without a state, stores, extractelement/extractvalue, freeze and
// loads other than ScatterVectorize are not modeled and cost nothing here.
16606 if (!E->hasState() || E->getOpcode() == Instruction::Store ||
16607 E->getOpcode() == Instruction::ExtractElement ||
16608 E->getOpcode() == Instruction::ExtractValue ||
16609 E->getOpcode() == Instruction::Freeze ||
16610 (E->getOpcode() == Instruction::Load &&
16611 E->State != TreeEntry::ScatterVectorize))
16612 return SpillsReloads;
16613
16614 const bool IsPHI =
16615 E->State == TreeEntry::Vectorize && E->getOpcode() == Instruction::PHI;
// Operand entries already accounted for (used by the split and generic paths).
16616 SmallPtrSet<const TreeEntry *, 8> CountedOpEntries;
// Register class id -> number of register parts required by this entry.
16617 SmallDenseMap<unsigned, unsigned> PressureByClass;
16618 auto AddPartsToClass = [&](unsigned RegClass, unsigned Parts) {
16619 assert(Parts != 0 && "Expected non-zero number of parts (registers).");
16620 PressureByClass[RegClass] += Parts;
16621 };
16622
// Returns {scalar type, vector type} for entry TE. When TE has a MinBWs
// record, the scalar type is replaced by an integer of the recorded width
// (re-widened if the original scalar type was itself a fixed vector). The
// vector type is the scalar type widened by TE's vector factor.
16623 auto GetEntryVecTy = [&](const TreeEntry *TE) -> std::pair<Type *, Type *> {
16624 Type *ScalarTy = getValueType(TE->Scalars.front());
16625 auto BWIt = MinBWs.find(TE);
16626 if (BWIt != MinBWs.end()) {
16627 auto *VTy = dyn_cast<FixedVectorType>(ScalarTy);
16628 ScalarTy = IntegerType::get(F->getContext(), BWIt->second.first);
16629 if (VTy)
16630 ScalarTy = getWidenedType(ScalarTy, VTy->getNumElements());
16631 }
16632 return std::make_pair(ScalarTy,
16633 getWidenedType(ScalarTy, TE->getVectorFactor()));
16634 };
16635
16636 if (E->State == TreeEntry::SplitVectorize) {
// Split nodes: the operands are the combined entries, each counted once.
16637 for (const auto &[Idx, _] : E->CombinedEntriesWithIndices) {
16638 const TreeEntry *OpTE = VectorizableTree[Idx].get();
16639
16640 if (!CountedOpEntries.insert(OpTE).second)
16641 continue;
16642 auto [ScalarTy, OpVecTy] = GetEntryVecTy(OpTE);
16643 const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy, ScalarTy);
16644 if (Parts == 0)
16645 continue;
16646 const unsigned RC =
16647 TTI->getRegisterClassForType(/*Vector=*/true, OpVecTy);
16648 AddPartsToClass(RC, Parts);
16649 }
16650 } else if (IsPHI) {
16651 // Only one predecessor is live at a time - take the max operand pressure
16652 // across incoming slots. Note that this branch neither deduplicates
// operand entries nor filters constant/splat operands.
16653 SmallDenseMap<unsigned, unsigned> MaxOpPressureByClass;
16654 for (unsigned Idx : seq<unsigned>(E->getNumOperands())) {
16655 const TreeEntry *OpTE = getOperandEntry(E, Idx);
16656 auto [ScalarTy, OpVecTy] = GetEntryVecTy(OpTE);
16657 const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy, ScalarTy);
16658 if (Parts == 0)
16659 continue;
16660 const unsigned RC =
16661 TTI->getRegisterClassForType(/*Vector=*/true, OpVecTy);
16662 MaxOpPressureByClass[RC] = std::max(MaxOpPressureByClass[RC], Parts);
16663 }
16664 for (auto [RC, Parts] : MaxOpPressureByClass)
16665 AddPartsToClass(RC, Parts);
16666 } else {
16667 for (unsigned Idx : seq<unsigned>(E->getNumOperands())) {
16668 // InsertElement operand 0 is the vector being inserted into, which is
16669 // built incrementally and does not occupy an extra register.
16670 if (E->getOpcode() == Instruction::InsertElement && Idx == 0)
16671 continue;
16672 ArrayRef<Value *> Ops = E->getOperand(Idx);
// Empty, all-constant and splat operand bundles are not counted.
16673 if (Ops.empty() || allConstant(Ops) || isSplat(Ops))
16674 continue;
16675 Value *Op = Ops.front();
16676 if (!Op)
16677 continue;
16678 const TreeEntry *OpTE = getOperandEntry(E, Idx);
16679
16680 if (!CountedOpEntries.insert(OpTE).second)
16681 continue;
// The operand vector type is derived from the first operand value's own
// type and the bundle size (GetEntryVecTy/MinBWs is not consulted here).
16682 auto *OpVecTy = getWidenedType(Op->getType(), Ops.size());
16683 const unsigned Parts = ::getNumberOfParts(*TTI, OpVecTy, Op->getType());
16684 if (Parts == 0)
16685 continue;
16686 const unsigned RC =
16687 TTI->getRegisterClassForType(/*Vector=*/true, OpVecTy);
16688 AddPartsToClass(RC, Parts);
16689 }
16690 }
16691
// Account for the result register(s). Loads are excluded; given the early
// return above, the only loads reaching this point are ScatterVectorize.
16692 if (E->getOpcode() != Instruction::Load) {
16693 const unsigned ResParts = ::getNumberOfParts(*TTI, VecTy, ScalarTy);
16694 if (ResParts != 0) {
16695 const unsigned RC = TTI->getRegisterClassForType(/*Vector=*/true, VecTy);
16696 AddPartsToClass(RC, ResParts);
16697 }
// A final vector type that differs from VecTy is counted on top of VecTy.
16698 if (VecTy != FinalVecTy) {
16699 const unsigned FinalResParts =
16700 ::getNumberOfParts(*TTI, FinalVecTy, ScalarTy);
16701 if (FinalResParts != 0) {
16702 const unsigned RC =
16703 TTI->getRegisterClassForType(/*Vector=*/true, FinalVecTy);
16704 AddPartsToClass(RC, FinalResParts);
16705 }
16706 }
16707 }
16708
// Charge every register part that exceeds what the target reports for its
// class. Classes reporting zero registers are ignored.
16709 for (auto [RegClass, UsedRegs] : PressureByClass) {
16710 const unsigned NumAvailRegs = TTI->getNumberOfRegisters(RegClass);
16711 if (NumAvailRegs == 0 || UsedRegs <= NumAvailRegs)
16712 continue;
16713 const unsigned SpillCount = UsedRegs - NumAvailRegs;
16714 InstructionCost SingleRegSpillReload =
16715 TTI->getRegisterClassReloadCost(RegClass, CostKind);
16716 // The spill cost is omitted only for the root entry (Idx == 0) of a
16717 // reduction whose scalars are non-returning instructions, like void calls.
// NOTE(review): as written, the spill is skipped only when all three hold at
// once (E->Idx is not positive, UserIgnoreList is set, and the first scalar
// is void-typed). Confirm that reductions and void-typed roots were not
// meant to be two separate cases.
16718 if (E->Idx > 0 || !UserIgnoreList || !E->Scalars[0]->getType()->isVoidTy())
16719 SingleRegSpillReload +=
16720 TTI->getRegisterClassSpillCost(RegClass, CostKind);
16721 SpillsReloads += SingleRegSpillReload * SpillCount;
16722 }
16723 return SpillsReloads;
16724}
16725
16726/// Calculates a VectorInstrContext from the values in \p VL at indices in
16727/// \p DemandedElts. Lanes that are not demanded and lanes holding undef
/// values do not contribute. The first context found is adopted as the
/// running result, which is what the function returns after the scan.
16731 for (unsigned I : seq(VL.size())) {
// Only demanded lanes take part in the computation.
16732 if (!DemandedElts[I])
16733 continue;
16734 Value *V = VL[I];
// Undef lanes place no constraint on the context.
16735 if (isa<UndefValue>(V))
16736 continue;
16737 auto *Inst = dyn_cast<Instruction>(V);
16738 if (!Inst)
// While no context has been recorded yet, adopt this lane's context (IVIC).
16741 if (VIC == TTI::VectorInstrContext::None) {
16742 VIC = IVIC;
16743 continue;
16744 }
// A lane whose context differs from the recorded one is handled here.
16745 if (VIC != IVIC)
16747 }
16748 return VIC;
16749}
16750
16752BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
16753 SmallPtrSetImpl<Value *> &CheckedExtracts) {
16754 ArrayRef<Value *> VL = E->Scalars;
16755
16756 Type *ScalarTy = getValueType(VL[0]);
16757 if (SLPReVec && E->State == TreeEntry::Vectorize &&
16758 E->getOpcode() == Instruction::InsertElement &&
16759 !E->getOperand(1).back()->getType()->isVectorTy())
16760 ScalarTy = ScalarTy->getScalarType();
16761 if (!isValidElementType(ScalarTy))
16762 return InstructionCost::getInvalid();
16764
16765 // If we have computed a smaller type for the expression, update VecTy so
16766 // that the costs will be accurate.
16767 auto It = MinBWs.find(E);
16768 Type *OrigScalarTy = ScalarTy;
16769 if (It != MinBWs.end()) {
16770 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
16771 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
16772 if (VecTy)
16773 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
16774 } else if (E->Idx == 0 && isReducedBitcastRoot()) {
16775 const TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
16776 ScalarTy = cast<CastInst>(ZExt->getMainOp())->getSrcTy();
16777 }
16778 auto *VecTy = getWidenedType(ScalarTy, VL.size());
16779 unsigned EntryVF = E->getVectorFactor();
16780 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
16781
16782 const InstructionCost SpillsReloads =
16783 getVectorSpillReloadCost(E, ScalarTy, VecTy, FinalVecTy, CostKind);
16784 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
16785 if (allConstant(VL))
16786 return 0;
16787 if (isa<InsertElementInst>(VL[0]))
16788 return InstructionCost::getInvalid();
16789 return SpillsReloads +
16790 processBuildVector<ShuffleCostEstimator, InstructionCost>(
16791 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
16792 }
16793 if (E->State == TreeEntry::SplitVectorize) {
16794 assert(E->CombinedEntriesWithIndices.size() == 2 &&
16795 "Expected exactly 2 combined entries.");
16796 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
16797 InstructionCost VectorCost = 0;
16798 if (E->ReorderIndices.empty()) {
16799 VectorCost = ::getShuffleCost(
16800 *TTI, TTI::SK_InsertSubvector, cast<VectorType>(FinalVecTy), {},
16801 CostKind, E->CombinedEntriesWithIndices.back().second,
16803 ScalarTy,
16804 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
16805 ->getVectorFactor())));
16806 } else {
16807 unsigned CommonVF =
16808 std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
16809 ->getVectorFactor(),
16810 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
16811 ->getVectorFactor());
16812 VectorCost =
16814 cast<VectorType>(getWidenedType(ScalarTy, CommonVF)),
16815 E->getSplitMask(), CostKind);
16816 }
16817 VectorCost += SpillsReloads;
16818 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
16819 return VectorCost;
16820 }
16821 InstructionCost CommonCost = 0;
16822 SmallVector<int> Mask;
16823 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
16824 (E->State != TreeEntry::StridedVectorize ||
16825 !isReverseOrder(E->ReorderIndices))) {
16826 SmallVector<int> NewMask;
16827 if (E->getOpcode() == Instruction::Store) {
16828 // For stores the order is actually a mask.
16829 NewMask.resize(E->ReorderIndices.size());
16830 copy(E->ReorderIndices, NewMask.begin());
16831 } else {
16832 inversePermutation(E->ReorderIndices, NewMask);
16833 }
16834 ::addMask(Mask, NewMask);
16835 }
16836 if (!E->ReuseShuffleIndices.empty())
16837 ::addMask(Mask, E->ReuseShuffleIndices);
16838 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) {
16839 assert(!isa<StructType>(FinalVecTy) &&
16840 "Expected non-struct vector type for shuffle cost calculation.");
16841 CommonCost = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
16842 cast<VectorType>(FinalVecTy), Mask, CostKind,
16843 /*Index=*/0, cast<VectorType>(VecTy));
16844 }
16845 assert((E->State == TreeEntry::Vectorize ||
16846 E->State == TreeEntry::ScatterVectorize ||
16847 E->State == TreeEntry::StridedVectorize ||
16848 E->State == TreeEntry::CompressVectorize) &&
16849 "Unhandled state");
16850 assert(E->getOpcode() &&
16851 ((allSameType(VL) && allSameBlock(VL)) ||
16852 (E->getOpcode() == Instruction::GetElementPtr &&
16853 E->getMainOp()->getType()->isPointerTy()) ||
16854 E->hasCopyableElements()) &&
16855 "Invalid VL");
16856 Instruction *VL0 = E->getMainOp();
16857 unsigned ShuffleOrOp =
16858 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
16859 if (E->CombinedOp != TreeEntry::NotCombinedOp)
16860 ShuffleOrOp = E->CombinedOp;
16861 SmallSetVector<Value *, 16> UniqueValues;
16862 SmallVector<unsigned, 16> UniqueIndexes;
16863 for (auto [Idx, V] : enumerate(VL))
16864 if (UniqueValues.insert(V))
16865 UniqueIndexes.push_back(Idx);
16866 const unsigned Sz = UniqueValues.size();
16867 SmallBitVector UsedScalars(Sz, false);
16868 for (unsigned I = 0; I < Sz; ++I) {
16869 if (isa<Instruction>(UniqueValues[I]) &&
16870 !E->isCopyableElement(UniqueValues[I]) &&
16871 getTreeEntries(UniqueValues[I]).front() == E)
16872 continue;
16873 UsedScalars.set(I);
16874 }
16875 auto GetCastContextHint = [&](Value *V) {
16876 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
16877 return getCastContextHint(*OpTEs.front());
16878 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
16879 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
16880 !SrcState.isAltShuffle())
16883 };
16884 auto GetCostDiff =
16885 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
16886 function_ref<InstructionCost(InstructionCost)> VectorCost) {
16887 // Calculate the cost of this instruction.
16888 InstructionCost ScalarCost = 0;
16889 if (isa<CastInst, CallInst>(VL0)) {
16890 // For some of the instructions no need to calculate cost for each
16891 // particular instruction, we can use the cost of the single
16892 // instruction x total number of scalar instructions.
16893 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
16894 } else {
16895 for (unsigned I = 0; I < Sz; ++I) {
16896 if (UsedScalars.test(I))
16897 continue;
16898 ScalarCost += ScalarEltCost(I);
16899 }
16900 }
16901
16902 InstructionCost VecCost = VectorCost(CommonCost);
16903 // Check if the current node must be resized, if the parent node is not
16904 // resized.
16905 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
16906 E->Idx != 0 &&
16907 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
16908 const EdgeInfo &EI = E->UserTreeIndex;
16909 if (!EI.UserTE->hasState() ||
16910 EI.UserTE->getOpcode() != Instruction::Select ||
16911 EI.EdgeIdx != 0) {
16912 auto UserBWIt = MinBWs.find(EI.UserTE);
16913 Type *UserScalarTy =
16914 (EI.UserTE->isGather() ||
16915 EI.UserTE->State == TreeEntry::SplitVectorize)
16916 ? EI.UserTE->Scalars.front()->getType()
16917 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
16918 if (UserBWIt != MinBWs.end())
16919 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
16920 UserBWIt->second.first);
16921 if (ScalarTy != UserScalarTy) {
16922 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16923 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
16924 unsigned VecOpcode;
16925 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
16926 if (BWSz > SrcBWSz)
16927 VecOpcode = Instruction::Trunc;
16928 else
16929 VecOpcode =
16930 It->second.second ? Instruction::SExt : Instruction::ZExt;
16931 TTI::CastContextHint CCH = GetCastContextHint(VL0);
16932 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
16933 CostKind);
16934 }
16935 }
16936 }
16937 VecCost += SpillsReloads;
16938 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
16939 ScalarCost, "Calculated costs for Tree"));
16940 return VecCost - ScalarCost;
16941 };
16942 // Calculate cost difference from vectorizing set of GEPs.
16943 // Negative value means vectorizing is profitable.
16944 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
16945 assert((E->State == TreeEntry::Vectorize ||
16946 E->State == TreeEntry::StridedVectorize ||
16947 E->State == TreeEntry::CompressVectorize) &&
16948 "Entry state expected to be Vectorize, StridedVectorize or "
16949 "MaskedLoadCompressVectorize here.");
16950 InstructionCost ScalarCost = 0;
16951 InstructionCost VecCost = 0;
16952 std::tie(ScalarCost, VecCost) =
16953 getGEPCosts(*TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy,
16954 cast<VectorType>(VecTy));
16955 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
16956 "Calculated GEPs cost for Tree"));
16957
16958 return VecCost - ScalarCost + SpillsReloads;
16959 };
16960
16961 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
16962 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
16963 if (MinMaxID == Intrinsic::not_intrinsic)
16964 return InstructionCost::getInvalid();
16965 Type *CanonicalType = Ty;
16966 if (CanonicalType->isPtrOrPtrVectorTy())
16967 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
16968 CanonicalType->getContext(),
16969 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
16970
16971 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
16972 {CanonicalType, CanonicalType});
16974 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
16975 // If the selects are the only uses of the compares, they will be
16976 // dead and we can adjust the cost by removing their cost.
16977 if (VI && SelectOnly) {
16978 assert((!Ty->isVectorTy() || SLPReVec) &&
16979 "Expected only for scalar type.");
16980 auto *CI = cast<CmpInst>(VI->getOperand(0));
16981 IntrinsicCost -= TTI->getCmpSelInstrCost(
16982 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
16983 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
16984 {TTI::OK_AnyValue, TTI::OP_None}, CI);
16985 }
16986 return IntrinsicCost;
16987 };
16988 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
16989 Instruction *VI) {
16990 InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
16991 return Cost;
16992 };
16993 switch (ShuffleOrOp) {
16994 case Instruction::PHI: {
16995 // Count reused scalars.
16996 InstructionCost ScalarCost = 0;
16997 SmallPtrSet<const TreeEntry *, 4> CountedOps;
16998 for (Value *V : UniqueValues) {
16999 auto *PHI = dyn_cast<PHINode>(V);
17000 if (!PHI)
17001 continue;
17002
17003 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
17004 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
17005 Value *Op = PHI->getIncomingValue(I);
17006 Operands[I] = Op;
17007 }
17008 if (const TreeEntry *OpTE =
17009 getSameValuesTreeEntry(Operands.front(), Operands))
17010 if (CountedOps.insert(OpTE).second &&
17011 !OpTE->ReuseShuffleIndices.empty())
17012 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
17013 OpTE->Scalars.size());
17014 }
17015
17016 return CommonCost - ScalarCost + SpillsReloads;
17017 }
17018 case Instruction::ExtractValue:
17019 case Instruction::ExtractElement: {
17020 // For ExtractValue entries vectorized via the struct-call path the scalar
17021 // extractvalue instructions are free (they become extractvalue from a
17022 // struct-of-vectors, which is also free). The cost is fully captured by
17023 // the underlying Call entry and the external-use extraction costs.
17024 if (ShuffleOrOp == Instruction::ExtractValue && !E->StructEVIndices.empty())
17025 return CommonCost;
17026 APInt DemandedElts;
17027 VectorType *SrcVecTy = nullptr;
17028 auto GetScalarCost = [&](unsigned Idx) {
17029 if (isa<PoisonValue>(UniqueValues[Idx]))
17031
17032 auto *I = cast<Instruction>(UniqueValues[Idx]);
17033 if (!SrcVecTy) {
17034 if (ShuffleOrOp == Instruction::ExtractElement) {
17035 auto *EE = cast<ExtractElementInst>(I);
17036 SrcVecTy = EE->getVectorOperandType();
17037 } else {
17038 auto *EV = cast<ExtractValueInst>(I);
17039 Type *AggregateTy = EV->getAggregateOperand()->getType();
17040 unsigned NumElts;
17041 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
17042 NumElts = ATy->getNumElements();
17043 else
17044 NumElts = AggregateTy->getStructNumElements();
17045 SrcVecTy = cast<VectorType>(getWidenedType(OrigScalarTy, NumElts));
17046 }
17047 }
17048 if (I->hasOneUse()) {
17049 Instruction *Ext = I->user_back();
17050 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
17052 // Use getExtractWithExtendCost() to calculate the cost of
17053 // extractelement/ext pair.
17054 InstructionCost Cost = TTI->getExtractWithExtendCost(
17055 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I),
17056 CostKind);
17057 // Subtract the cost of s|zext which is subtracted separately.
17058 Cost -= TTI->getCastInstrCost(
17059 Ext->getOpcode(), Ext->getType(), I->getType(),
17061 return Cost;
17062 }
17063 }
17064 if (DemandedElts.isZero())
17065 DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
17066 DemandedElts.setBit(*getExtractIndex(I));
17068 };
17069 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
17070 return CommonCost - (DemandedElts.isZero()
17072 : TTI.getScalarizationOverhead(
17073 SrcVecTy, DemandedElts, /*Insert=*/false,
17074 /*Extract=*/true, CostKind));
17075 };
17076 return GetCostDiff(GetScalarCost, GetVectorCost);
17077 }
17078 case Instruction::InsertElement: {
17079 assert(E->ReuseShuffleIndices.empty() &&
17080 "Unique insertelements only are expected.");
17081 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
17082 unsigned const NumElts = SrcVecTy->getNumElements();
17083 unsigned const NumScalars = VL.size();
17084
17085 unsigned NumOfParts =
17086 ::getNumberOfParts(*TTI, SrcVecTy, VL0->getOperand(1)->getType());
17087
17088 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
17089 unsigned OffsetBeg = *getElementIndex(VL.front());
17090 unsigned OffsetEnd = OffsetBeg;
17091 InsertMask[OffsetBeg] = 0;
17092 for (auto [I, V] : enumerate(VL.drop_front())) {
17093 unsigned Idx = *getElementIndex(V);
17094 if (OffsetBeg > Idx)
17095 OffsetBeg = Idx;
17096 else if (OffsetEnd < Idx)
17097 OffsetEnd = Idx;
17098 InsertMask[Idx] = I + 1;
17099 }
17100 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
17101 if (NumOfParts > 0 && NumOfParts < NumElts)
17102 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
17103 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
17104 VecScalarsSz;
17105 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
17106 unsigned InsertVecSz = std::min<unsigned>(
17107 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
17108 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
17109 bool IsWholeSubvector =
17110 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
17111 // Check if we can safely insert a subvector. If it is not possible, just
17112 // generate a whole-sized vector and shuffle the source vector and the new
17113 // subvector.
17114 if (OffsetBeg + InsertVecSz > VecSz) {
17115 // Align OffsetBeg to generate correct mask.
17116 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
17117 InsertVecSz = VecSz;
17118 }
17119
17120 APInt DemandedElts = APInt::getZero(NumElts);
17121 // TODO: Add support for Instruction::InsertValue.
17122 SmallVector<int> Mask;
17123 if (!E->ReorderIndices.empty()) {
17124 inversePermutation(E->ReorderIndices, Mask);
17125 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
17126 } else {
17127 Mask.assign(VecSz, PoisonMaskElem);
17128 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
17129 }
17130 bool IsIdentity = true;
17131 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
17132 SmallVector<Value *> AdjustedVL(NumElts, PoisonValue::get(ScalarTy));
17133 Mask.swap(PrevMask);
17134 for (unsigned I = 0; I < NumScalars; ++I) {
17135 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
17136 DemandedElts.setBit(InsertIdx);
17137 AdjustedVL[InsertIdx] = VL[PrevMask[I]];
17138 IsIdentity &= InsertIdx - OffsetBeg == I;
17139 Mask[InsertIdx - OffsetBeg] = I;
17140 }
17141 assert(Offset < NumElts && "Failed to find vector index offset");
17142
17144 // First cost - resize to actual vector size if not identity shuffle or
17145 // need to shift the vector.
17146 // Do not calculate the cost if the actual size is the register size and
17147 // we can merge this shuffle with the following SK_Select.
17148 auto *InsertVecTy = cast<VectorType>(getWidenedType(ScalarTy, InsertVecSz));
17149 if (!IsIdentity)
17151 InsertVecTy, Mask);
17152 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
17153 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
17154 }));
17155 // Second cost - permutation with subvector, if some elements are from the
17156 // initial vector or inserting a subvector.
17157 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
17158 // subvector of ActualVecTy.
17159 SmallBitVector InMask =
17160 isUndefVector(FirstInsert->getOperand(0),
17161 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
17163 *TTI, ScalarTy, SrcVecTy, DemandedElts,
17164 /*Insert=*/true, /*Extract=*/false, CostKind, InMask.all(), AdjustedVL,
17165 getVectorInstrContextHint(AdjustedVL, DemandedElts));
17166 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
17167 if (InsertVecSz != VecSz) {
17168 auto *ActualVecTy = cast<VectorType>(getWidenedType(ScalarTy, VecSz));
17169 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
17170 CostKind, OffsetBeg - Offset, InsertVecTy);
17171 } else {
17172 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
17173 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
17174 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
17175 I <= End; ++I)
17176 if (Mask[I] != PoisonMaskElem)
17177 Mask[I] = I + VecSz;
17178 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
17179 Mask[I] =
17180 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
17181 Cost +=
17182 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
17183 }
17184 }
17185 return Cost + SpillsReloads;
17186 }
17187 case Instruction::ZExt:
17188 case Instruction::SExt:
17189 case Instruction::FPToUI:
17190 case Instruction::FPToSI:
17191 case Instruction::FPExt:
17192 case Instruction::PtrToInt:
17193 case Instruction::IntToPtr:
17194 case Instruction::SIToFP:
17195 case Instruction::UIToFP:
17196 case Instruction::Trunc:
17197 case Instruction::FPTrunc:
17198 case Instruction::BitCast: {
17199 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
17200 Type *SrcScalarTy = VL0->getOperand(0)->getType();
17201 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
17202 unsigned Opcode = ShuffleOrOp;
17203 unsigned VecOpcode = Opcode;
17204 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
17205 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
17206 // Check if the values are candidates to demote.
17207 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
17208 if (SrcIt != MinBWs.end()) {
17209 SrcBWSz = SrcIt->second.first;
17210 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
17211 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
17212 SrcVecTy =
17213 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
17214 }
17215 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
17216 if (BWSz == SrcBWSz) {
17217 VecOpcode = Instruction::BitCast;
17218 } else if (BWSz < SrcBWSz) {
17219 VecOpcode = Instruction::Trunc;
17220 } else if (It != MinBWs.end()) {
17221 assert(BWSz > SrcBWSz && "Invalid cast!");
17222 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
17223 } else if (SrcIt != MinBWs.end()) {
17224 assert(BWSz > SrcBWSz && "Invalid cast!");
17225 VecOpcode =
17226 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
17227 }
17228 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
17229 !SrcIt->second.second) {
17230 VecOpcode = Instruction::UIToFP;
17231 }
17232 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
17233 assert(Idx == 0 && "Expected 0 index only");
17234 return TTI->getCastInstrCost(Opcode, VL0->getType(),
17235 VL0->getOperand(0)->getType(),
17237 };
17238 auto GetVectorCost = [=](InstructionCost CommonCost) {
17239 // Do not count cost here if minimum bitwidth is in effect and it is just
17240 // a bitcast (here it is just a noop).
17241 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
17242 return CommonCost;
17243 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
17244 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
17245
17246 bool IsArithmeticExtendedReduction =
17247 E->Idx == 0 && UserIgnoreList &&
17248 all_of(*UserIgnoreList, [](Value *V) {
17249 auto *I = cast<Instruction>(V);
17250 return is_contained({Instruction::Add, Instruction::FAdd,
17251 Instruction::Mul, Instruction::FMul,
17252 Instruction::And, Instruction::Or,
17253 Instruction::Xor},
17254 I->getOpcode());
17255 });
17256 if (IsArithmeticExtendedReduction &&
17257 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
17258 return CommonCost;
17259 return CommonCost +
17260 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
17261 VecOpcode == Opcode ? VI : nullptr);
17262 };
17263 return GetCostDiff(GetScalarCost, GetVectorCost);
17264 }
17265 case Instruction::FCmp:
17266 case Instruction::ICmp:
17267 // Override ScalarTy/VecTy with the compared operand type (not i1). The
17268 // cost of a compare instruction is determined by the operand width, and
17269 // getCmpSelInstrCost expects the compared type as its first type arg.
17270 OrigScalarTy = ScalarTy = getValueType(VL0, /*LookThroughCmp=*/true);
17271 VecTy = getWidenedType(ScalarTy, VL.size());
17272 [[fallthrough]];
17273 case Instruction::Select: {
17274 CmpPredicate VecPred, SwappedVecPred;
17275 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
17276 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
17277 match(VL0, MatchCmp))
17278 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
17279 else
17280 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
17283 auto GetScalarCost = [&](unsigned Idx) {
17284 if (isa<PoisonValue>(UniqueValues[Idx]))
17286
17287 if (!isa<SelectInst>(UniqueValues[Idx]))
17288 return TTI->getInstructionCost(cast<Instruction>(UniqueValues[Idx]),
17289 CostKind);
17290
17291 auto *VI = cast<Instruction>(UniqueValues[Idx]);
17292 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
17295 Value *LHS = nullptr, *RHS = nullptr;
17296 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
17297 bool IsSelect =
17298 ShuffleOrOp == Instruction::Select &&
17299 (match(VI, m_Select(MatchCmp, m_Value(LHS), m_Value(RHS))) ||
17301 if ((!IsSelect && !match(VI, MatchCmp)) ||
17302 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
17303 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
17304 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
17307
17308 // Check if operands are of i1 types, like a condition expression.
17309 // TODO: consider implementing this in TTI.
17310 InstructionCost ScalarCost = InstructionCost::getInvalid();
17311 if (IsSelect && LHS->getType() == VI->getOperand(0)->getType()) {
17312 assert(LHS->getType() == RHS->getType() &&
17313 "Expected same type for LHS/RHS");
17314 // select i1 v, i1 true, i1 b -> or i1 v, i1 b
17315 if (match(LHS, m_AllOnes())) {
17316 ScalarCost = TTI->getArithmeticInstrCost(
17317 Instruction::Or, LHS->getType(), CostKind,
17318 getOperandInfo(VI->getOperand(0)), getOperandInfo(RHS));
17319 } else if (match(RHS, m_Zero())) {
17320 // select i1 v, i1 b, i1 false -> and i1 v, i1 b
17321 ScalarCost = TTI->getArithmeticInstrCost(
17322 Instruction::And, LHS->getType(), CostKind,
17323 getOperandInfo(VI->getOperand(0)), getOperandInfo(LHS));
17324 }
17325 }
17326 if (!ScalarCost.isValid()) {
17327 // For selects, the "condition type" arg is the condition operand's
17328 // type; for standalone compares, it is the result type (i1).
17329 ScalarCost = TTI->getCmpSelInstrCost(
17330 E->getOpcode(), OrigScalarTy,
17331 ShuffleOrOp == Instruction::Select ? VL0->getOperand(0)->getType()
17332 : VL0->getType(),
17333 CurrentPred, CostKind,
17334 getOperandInfo(
17335 VI->getOperand(ShuffleOrOp == Instruction::Select ? 1 : 0)),
17336 getOperandInfo(
17337 VI->getOperand(ShuffleOrOp == Instruction::Select ? 2 : 1)),
17338 VI);
17339 }
17340 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
17341 if (IntrinsicCost.isValid())
17342 ScalarCost = IntrinsicCost;
17343
17344 return ScalarCost;
17345 };
17346 auto GetVectorCost = [&](InstructionCost CommonCost) {
17347 // For selects, the condition type may differ from the result type
17348 // (e.g. condition is <N x i1> while result is <N x i32>). For
17349 // compares, the result type IS the mask (i1/vNi1). Construct the
17350 // right type so getCmpSelInstrCost sees the actual mask/result width.
17351 auto *MaskTy = cast<VectorType>(getWidenedType(
17352 ShuffleOrOp == Instruction::Select ? VL0->getOperand(0)->getType()
17353 : VL0->getType(),
17354 VL.size()));
17355
17356 InstructionCost VecCost = InstructionCost::getInvalid();
17357 if (ShuffleOrOp == Instruction::Select) {
17358 ArrayRef<Value *> Cond = E->getOperand(0);
17359 ArrayRef<Value *> LHS = E->getOperand(1);
17360 ArrayRef<Value *> RHS = E->getOperand(2);
17361 // select <VF x i1>, <VF x i1>, <VF x i1>?
17362 // TODO: consider implementing this in TTI.
17363 if (Cond.front()->getType() == LHS.front()->getType()) {
17364 // select <VF x i1> v, <VF x i1> true, <VF x i1> b -> or <VF x i1> v,
17365 // <VF x i1> b
17366 if (all_of(LHS, [&](Value *V) { return match(V, m_AllOnes()); })) {
17367 VecCost = TTI->getArithmeticInstrCost(
17368 Instruction::Or, VecTy, CostKind, getOperandInfo(Cond),
17369 getOperandInfo(RHS));
17370 } else if (all_of(RHS,
17371 [&](Value *V) { return match(V, m_Zero()); })) {
17372 // select <VF x i1> v, <VF x i1> b, <VF x i1> false -> and <VF x i1>
17373 // v, <VF x i1> b
17374 VecCost = TTI->getArithmeticInstrCost(
17375 Instruction::And, VecTy, CostKind, getOperandInfo(Cond),
17376 getOperandInfo(LHS));
17377 }
17378 }
17379 }
17380 if (!VecCost.isValid()) {
17381 VecCost = TTI->getCmpSelInstrCost(
17382 E->getOpcode(), VecTy, MaskTy, VecPred, CostKind,
17383 getOperandInfo(
17384 E->getOperand(ShuffleOrOp == Instruction::Select ? 1 : 0)),
17385 getOperandInfo(
17386 E->getOperand(ShuffleOrOp == Instruction::Select ? 2 : 1)),
17387 VL0);
17388 if (isa<SelectInst>(VL0)) {
17389 unsigned CondNumElements = getNumElements(MaskTy);
17390 unsigned VecTyNumElements = getNumElements(VecTy);
17391 assert(VecTyNumElements >= CondNumElements &&
17392 VecTyNumElements % CondNumElements == 0 &&
17393 "Cannot vectorize Instruction::Select");
17394 if (CondNumElements != VecTyNumElements) {
17395 // When the return type is i1 but the source is fixed vector type,
17396 // we need to duplicate the condition value.
17397 VecCost += ::getShuffleCost(
17398 *TTI, TTI::SK_PermuteSingleSrc, MaskTy,
17399 createReplicatedMask(VecTyNumElements / CondNumElements,
17400 CondNumElements));
17401 }
17402 }
17403 }
17404 return VecCost + CommonCost;
17405 };
17406 return GetCostDiff(GetScalarCost, GetVectorCost);
17407 }
17408 case TreeEntry::MinMax: {
17409 auto GetScalarCost = [&](unsigned Idx) {
17410 return GetMinMaxCost(OrigScalarTy);
17411 };
17412 auto GetVectorCost = [&](InstructionCost CommonCost) {
17413 InstructionCost VecCost = GetMinMaxCost(VecTy);
17414 return VecCost + CommonCost;
17415 };
17416 return GetCostDiff(GetScalarCost, GetVectorCost);
17417 }
17418 case TreeEntry::FMulAdd: {
17419 auto GetScalarCost = [&](unsigned Idx) {
17420 if (isa<PoisonValue>(UniqueValues[Idx]))
17422 return GetFMulAddCost(E->getOperations(),
17423 cast<Instruction>(UniqueValues[Idx]));
17424 };
17425 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
17426 FastMathFlags FMF;
17427 FMF.set();
17428 for (Value *V : E->Scalars) {
17429 if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
17430 FMF &= FPCI->getFastMathFlags();
17431 if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
17432 FMF &= FPCIOp->getFastMathFlags();
17433 }
17434 }
17435 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
17436 {VecTy, VecTy, VecTy}, FMF);
17437 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
17438 return VecCost + CommonCost;
17439 };
17440 return GetCostDiff(GetScalarCost, GetVectorCost);
17441 }
17442 case TreeEntry::ReducedBitcast:
17443 case TreeEntry::ReducedBitcastBSwap: {
17444 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
17445 if (isa<PoisonValue>(UniqueValues[Idx]))
17447 auto *Shl = dyn_cast<Instruction>(UniqueValues[Idx]);
17448 if (!Shl)
17450 InstructionCost ScalarCost = TTI.getInstructionCost(Shl, CostKind);
17451 auto *ZExt = dyn_cast<Instruction>(Shl->getOperand(0));
17452 if (!ZExt)
17453 return ScalarCost;
17454 ScalarCost += TTI.getInstructionCost(ZExt, CostKind);
17455 return ScalarCost;
17456 };
17457 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
17458 const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
17459 TTI::CastContextHint CastCtx =
17460 getCastContextHint(*getOperandEntry(LhsTE, /*Idx=*/0));
17461 Type *SrcScalarTy = cast<ZExtInst>(LhsTE->getMainOp())->getSrcTy();
17462 auto *SrcVecTy = getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
17463 InstructionCost BitcastCost = TTI.getCastInstrCost(
17464 Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx, CostKind);
17465 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
17466 auto *SrcType = IntegerType::getIntNTy(
17467 ScalarTy->getContext(),
17468 DL->getTypeSizeInBits(SrcScalarTy) * EntryVF);
17469 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
17471 TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
17472 BitcastCost += IntrinsicCost;
17473 if (SrcType != ScalarTy) {
17474 BitcastCost +=
17475 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
17477 }
17478 }
17479 return BitcastCost + CommonCost;
17480 };
17481 return GetCostDiff(GetScalarCost, GetVectorCost);
17482 }
17483 case TreeEntry::ReducedBitcastLoads:
17484 case TreeEntry::ReducedBitcastBSwapLoads: {
17485 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
17486 if (isa<PoisonValue>(UniqueValues[Idx]))
17488 auto *Shl = dyn_cast<Instruction>(UniqueValues[Idx]);
17489 if (!Shl)
17491 InstructionCost ScalarCost = TTI.getInstructionCost(Shl, CostKind);
17492 auto *ZExt = dyn_cast<Instruction>(Shl->getOperand(0));
17493 if (!ZExt)
17494 return ScalarCost;
17495 ScalarCost += TTI.getInstructionCost(ZExt, CostKind);
17496 auto *Load = dyn_cast<Instruction>(ZExt->getOperand(0));
17497 if (!Load)
17498 return ScalarCost;
17499 ScalarCost += TTI.getInstructionCost(Load, CostKind);
17500 return ScalarCost;
17501 };
17502 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
17503 const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
17504 const TreeEntry *LoadTE = getOperandEntry(LhsTE, /*Idx=*/0);
17505 auto *LI0 = cast<LoadInst>(LoadTE->getMainOp());
17506 auto *SrcType = IntegerType::getIntNTy(
17507 ScalarTy->getContext(),
17508 DL->getTypeSizeInBits(LI0->getType()) * EntryVF);
17509 InstructionCost LoadCost =
17510 TTI.getMemoryOpCost(Instruction::Load, SrcType, LI0->getAlign(),
17511 LI0->getPointerAddressSpace(), CostKind);
17512 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
17513 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
17515 TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
17516 LoadCost += IntrinsicCost;
17517 if (SrcType != ScalarTy) {
17518 LoadCost +=
17519 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
17521 }
17522 }
17523 return LoadCost + CommonCost;
17524 };
17525 return GetCostDiff(GetScalarCost, GetVectorCost);
17526 }
17527 case TreeEntry::ReducedCmpBitcast: {
17528 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
17529 if (isa<PoisonValue>(UniqueValues[Idx]))
17531 auto *Sel = dyn_cast<Instruction>(UniqueValues[Idx]);
17532 if (!Sel)
17534 InstructionCost ScalarCost = TTI.getInstructionCost(Sel, CostKind);
17535 return ScalarCost;
17536 };
17537 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
17538 Type *CmpTy = CmpInst::makeCmpResultType(VecTy);
17539 auto *DstTy =
17540 IntegerType::getIntNTy(ScalarTy->getContext(), E->getVectorFactor());
17541 InstructionCost BitcastCost =
17542 TTI.getCastInstrCost(Instruction::BitCast, DstTy, CmpTy,
17544 if (DstTy != ScalarTy) {
17545 BitcastCost +=
17546 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, DstTy,
17548 }
17549 return BitcastCost + CommonCost;
17550 };
17551 return GetCostDiff(GetScalarCost, GetVectorCost);
17552 }
17553 case Instruction::FNeg:
17554 case Instruction::Add:
17555 case Instruction::FAdd:
17556 case Instruction::Sub:
17557 case Instruction::FSub:
17558 case Instruction::Mul:
17559 case Instruction::FMul:
17560 case Instruction::UDiv:
17561 case Instruction::SDiv:
17562 case Instruction::FDiv:
17563 case Instruction::URem:
17564 case Instruction::SRem:
17565 case Instruction::FRem:
17566 case Instruction::Shl:
17567 case Instruction::LShr:
17568 case Instruction::AShr:
17569 case Instruction::And:
17570 case Instruction::Or:
17571 case Instruction::Xor: {
17572 auto GetScalarCost = [&](unsigned Idx) {
17573 if (isa<PoisonValue>(UniqueValues[Idx]))
17575
17576 // We cannot retrieve the operand from UniqueValues[Idx] because an
17577 // interchangeable instruction may be used. The order and the actual
17578 // operand might differ from what is retrieved from UniqueValues[Idx].
17579 unsigned Lane = UniqueIndexes[Idx];
17580 Value *Op1 = E->getOperand(0)[Lane];
17581 Value *Op2;
17582 SmallVector<const Value *, 2> Operands(1, Op1);
17583 if (isa<UnaryOperator>(UniqueValues[Idx])) {
17584 Op2 = Op1;
17585 } else {
17586 Op2 = E->getOperand(1)[Lane];
17587 Operands.push_back(Op2);
17588 }
17591 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
17592 ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
17593 if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
17594 I && (ShuffleOrOp == Instruction::FAdd ||
17595 ShuffleOrOp == Instruction::FSub)) {
17596 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
17597 if (IntrinsicCost.isValid())
17598 ScalarCost = IntrinsicCost;
17599 }
17600 return ScalarCost;
17601 };
17602 auto GetVectorCost = [=](InstructionCost CommonCost) {
17603 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
17604 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
17605 ArrayRef<Value *> Ops = E->getOperand(I);
17606 if (all_of(Ops, [&](Value *Op) {
17607 auto *CI = dyn_cast<ConstantInt>(Op);
17608 return CI && CI->getValue().countr_one() >= It->second.first;
17609 }))
17610 return CommonCost;
17611 }
17612 }
17613 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
17614 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
17615 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
17616 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
17617 Op2Info, {}, nullptr, TLI) +
17618 CommonCost;
17619 };
17620 return GetCostDiff(GetScalarCost, GetVectorCost);
17621 }
17622 case Instruction::GetElementPtr: {
17623 return CommonCost + GetGEPCostDiff(VL, VL0);
17624 }
17625 case Instruction::Load: {
17626 auto GetScalarCost = [&](unsigned Idx) {
17627 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
17628 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
17629 VI->getAlign(), VI->getPointerAddressSpace(),
17631 };
17632 auto *LI0 = cast<LoadInst>(VL0);
17633 auto GetVectorCost = [&](InstructionCost CommonCost) {
17634 InstructionCost VecLdCost;
17635 switch (E->State) {
17636 case TreeEntry::Vectorize:
17637 if (unsigned Factor = E->getInterleaveFactor()) {
17638 VecLdCost = TTI->getInterleavedMemoryOpCost(
17639 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
17640 LI0->getPointerAddressSpace(), CostKind);
17641
17642 } else {
17643 VecLdCost = TTI->getMemoryOpCost(
17644 Instruction::Load, VecTy, LI0->getAlign(),
17645 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
17646 }
17647 break;
17648 case TreeEntry::StridedVectorize: {
17649 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
17650 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
17651 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
17652 Align CommonAlignment =
17653 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
17654 VecLdCost = TTI->getMemIntrinsicInstrCost(
17655 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
17656 StridedLoadTy, LI0->getPointerOperand(),
17657 /*VariableMask=*/false, CommonAlignment),
17658 CostKind);
17659 if (StridedLoadTy != VecTy)
17660 VecLdCost +=
17661 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
17662 getCastContextHint(*E), CostKind);
17663
17664 break;
17665 }
17666 case TreeEntry::CompressVectorize: {
17667 bool IsMasked;
17668 unsigned InterleaveFactor;
17669 SmallVector<int> CompressMask;
17670 VectorType *LoadVecTy;
17671 SmallVector<Value *> Scalars(VL);
17672 if (!E->ReorderIndices.empty()) {
17673 SmallVector<int> Mask(E->ReorderIndices.begin(),
17674 E->ReorderIndices.end());
17675 reorderScalars(Scalars, Mask);
17676 }
17677 SmallVector<Value *> PointerOps(Scalars.size());
17678 for (auto [I, V] : enumerate(Scalars))
17679 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
17680 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
17681 Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
17682 *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
17683 CompressMask, LoadVecTy);
17684 CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
17685 InterleaveFactor, IsMasked);
17686 Align CommonAlignment = LI0->getAlign();
17687 if (InterleaveFactor) {
17688 VecLdCost = TTI->getInterleavedMemoryOpCost(
17689 Instruction::Load, LoadVecTy, InterleaveFactor, {},
17690 CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
17691 } else if (IsMasked) {
17692 VecLdCost = TTI->getMemIntrinsicInstrCost(
17693 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
17694 CommonAlignment,
17695 LI0->getPointerAddressSpace()),
17696 CostKind);
17697 // TODO: include this cost into CommonCost.
17698 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
17699 LoadVecTy, CompressMask, CostKind);
17700 } else {
17701 VecLdCost = TTI->getMemoryOpCost(
17702 Instruction::Load, LoadVecTy, CommonAlignment,
17703 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
17704 // TODO: include this cost into CommonCost.
17705 VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
17706 LoadVecTy, CompressMask, CostKind);
17707 }
17708 break;
17709 }
17710 case TreeEntry::ScatterVectorize: {
17711 Align CommonAlignment =
17712 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
17713 VecLdCost = TTI->getMemIntrinsicInstrCost(
17714 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
17715 LI0->getPointerOperand(),
17716 /*VariableMask=*/false, CommonAlignment),
17717 CostKind);
17718 break;
17719 }
17720 case TreeEntry::CombinedVectorize:
17721 case TreeEntry::SplitVectorize:
17722 case TreeEntry::NeedToGather:
17723 llvm_unreachable("Unexpected vectorization state.");
17724 }
17725 return VecLdCost + CommonCost;
17726 };
17727
17728 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
17729 // If this node generates masked gather load then it is not a terminal node.
17730 // Hence address operand cost is estimated separately.
17731 if (E->State == TreeEntry::ScatterVectorize)
17732 return Cost;
17733
17734 // Estimate cost of GEPs since this tree node is a terminator.
17735 SmallVector<Value *> PointerOps(VL.size());
17736 for (auto [I, V] : enumerate(VL))
17737 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
17738 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
17739 }
17740 case Instruction::Store: {
17741 bool IsReorder = !E->ReorderIndices.empty();
17742 auto GetScalarCost = [=](unsigned Idx) {
17743 auto *VI = cast<StoreInst>(VL[Idx]);
17744 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
17745 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
17746 VI->getAlign(), VI->getPointerAddressSpace(),
17747 CostKind, OpInfo, VI);
17748 };
17749 auto *BaseSI =
17750 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
17751 auto GetVectorCost = [=](InstructionCost CommonCost) {
17752 // We know that we can merge the stores. Calculate the cost.
17753 InstructionCost VecStCost;
17754 if (E->State == TreeEntry::StridedVectorize) {
17755 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
17756 FixedVectorType *StridedStoreTy = SPtrInfo.Ty;
17757 assert(StridedStoreTy && "Missing StridedPointerInfo for tree entry.");
17758 Align CommonAlignment =
17759 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
17760 VecStCost = TTI->getMemIntrinsicInstrCost(
17761 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
17762 StridedStoreTy,
17763 BaseSI->getPointerOperand(),
17764 /*VariableMask=*/false, CommonAlignment),
17765 CostKind);
17766 if (StridedStoreTy != VecTy)
17767 VecStCost +=
17768 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedStoreTy,
17769 getCastContextHint(*E), CostKind);
17770
17771 } else {
17772 assert(E->State == TreeEntry::Vectorize &&
17773 "Expected either strided or consecutive stores.");
17774 if (unsigned Factor = E->getInterleaveFactor()) {
17775 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
17776 "No reused shuffles expected");
17777 CommonCost = 0;
17778 VecStCost = TTI->getInterleavedMemoryOpCost(
17779 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
17780 BaseSI->getPointerAddressSpace(), CostKind);
17781 } else {
17782 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
17783 VecStCost = TTI->getMemoryOpCost(
17784 Instruction::Store, VecTy, BaseSI->getAlign(),
17785 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
17786 }
17787 }
17788 return VecStCost + CommonCost;
17789 };
17790 SmallVector<Value *> PointerOps(VL.size());
17791 for (auto [I, V] : enumerate(VL)) {
17792 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
17793 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
17794 }
17795
17796 return GetCostDiff(GetScalarCost, GetVectorCost) +
17797 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
17798 }
17799 case Instruction::Call: {
17800 auto GetScalarCost = [&](unsigned Idx) {
17801 auto *CI = cast<CallInst>(UniqueValues[Idx]);
17804 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
17805 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
17806 }
17807 return TTI->getCallInstrCost(CI->getCalledFunction(),
17809 CI->getFunctionType()->params(), CostKind);
17810 };
17811 auto GetVectorCost = [=](InstructionCost CommonCost) {
17812 auto *CI = cast<CallInst>(VL0);
17815 CI, ID, getNumElements(VecTy),
17816 It != MinBWs.end() ? It->second.first : 0, TTI);
17817 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
17818 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
17819 };
17820 return GetCostDiff(GetScalarCost, GetVectorCost);
17821 }
17822 case Instruction::ShuffleVector: {
17823 if (!SLPReVec || E->isAltShuffle())
17824 assert(E->isAltShuffle() &&
17825 ((Instruction::isBinaryOp(E->getOpcode()) &&
17826 Instruction::isBinaryOp(E->getAltOpcode())) ||
17827 (Instruction::isCast(E->getOpcode()) &&
17828 Instruction::isCast(E->getAltOpcode())) ||
17829 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
17830 "Invalid Shuffle Vector Operand");
17831 // Try to find the previous shuffle node with the same operands and same
17832 // main/alternate ops.
17833 auto TryFindNodeWithEqualOperands = [=]() {
17834 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17835 if (TE.get() == E)
17836 break;
17837 if (TE->hasState() && TE->isAltShuffle() &&
17838 ((TE->getOpcode() == E->getOpcode() &&
17839 TE->getAltOpcode() == E->getAltOpcode()) ||
17840 (TE->getOpcode() == E->getAltOpcode() &&
17841 TE->getAltOpcode() == E->getOpcode())) &&
17842 TE->hasEqualOperands(*E))
17843 return true;
17844 }
17845 return false;
17846 };
17847 auto GetScalarCost = [&](unsigned Idx) {
17848 if (isa<PoisonValue>(UniqueValues[Idx]))
17850
17851 auto *VI = cast<Instruction>(UniqueValues[Idx]);
17852 assert(E->getMatchingMainOpOrAltOp(VI) &&
17853 "Unexpected main/alternate opcode");
17854 (void)E;
17855 return TTI->getInstructionCost(VI, CostKind);
17856 };
17857 // Need to clear CommonCost since the final shuffle cost is included into
17858 // vector cost.
17859 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
17860 // VecCost is equal to sum of the cost of creating 2 vectors
17861 // and the cost of creating shuffle.
17862 InstructionCost VecCost = 0;
17863 if (TryFindNodeWithEqualOperands()) {
17864 LLVM_DEBUG({
17865 dbgs() << "SLP: diamond match for alternate node found.\n";
17866 E->dump();
17867 });
17868 // No need to add new vector costs here since we're going to reuse
17869 // same main/alternate vector ops, just do different shuffling.
17870 } else if (Instruction::isBinaryOp(E->getOpcode())) {
17871 VecCost =
17872 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
17873 VecCost +=
17874 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
17875 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
17876 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
17877 VecCost = TTIRef.getCmpSelInstrCost(
17878 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
17879 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
17880 VL0);
17881 VecCost += TTIRef.getCmpSelInstrCost(
17882 E->getOpcode(), VecTy, MaskTy,
17883 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
17884 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
17885 E->getAltOp());
17886 } else {
17887 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
17888 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
17889 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
17890 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
17891 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
17892 unsigned SrcBWSz =
17893 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
17894 if (SrcIt != MinBWs.end()) {
17895 SrcBWSz = SrcIt->second.first;
17896 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
17897 SrcTy = getWidenedType(SrcSclTy, VL.size());
17898 }
17899 if (BWSz <= SrcBWSz) {
17900 if (BWSz < SrcBWSz)
17901 VecCost =
17902 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
17904 LLVM_DEBUG({
17905 dbgs()
17906 << "SLP: alternate extension, which should be truncated.\n";
17907 E->dump();
17908 });
17909 return VecCost;
17910 }
17911 }
17912 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
17914 VecCost +=
17915 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
17917 }
17918 SmallVector<int> Mask;
17919 E->buildAltOpShuffleMask(
17920 [&](Instruction *I) {
17921 assert(E->getMatchingMainOpOrAltOp(I) &&
17922 "Unexpected main/alternate opcode");
17923 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
17924 *TLI);
17925 },
17926 Mask);
17928 cast<VectorType>(FinalVecTy), Mask, CostKind);
17929 // Patterns like [fadd,fsub] can be combined into a single instruction
17930 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
17931 // need to take into account their order when looking for the most used
17932 // order.
17933 unsigned Opcode0 = E->getOpcode();
17934 unsigned Opcode1 = E->getAltOpcode();
17935 SmallBitVector OpcodeMask(
17936 getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
17937 // If this pattern is supported by the target then we consider the
17938 // order.
17939 if (TTIRef.isLegalAltInstr(cast<VectorType>(VecTy), Opcode0, Opcode1,
17940 OpcodeMask)) {
17941 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
17942 cast<VectorType>(VecTy), Opcode0, Opcode1, OpcodeMask, CostKind);
17943 return AltVecCost < VecCost ? AltVecCost : VecCost;
17944 }
17945 // TODO: Check the reverse order too.
17946 return VecCost;
17947 };
17948 if (SLPReVec && !E->isAltShuffle())
17949 return GetCostDiff(
17950 GetScalarCost, [&](InstructionCost) -> InstructionCost {
17951 // If a group uses mask in order, the shufflevector can be
17952 // eliminated by instcombine. Then the cost is 0.
17954 "Not supported shufflevector usage.");
17955 auto *SV = cast<ShuffleVectorInst>(VL.front());
17956 unsigned SVNumElements =
17957 cast<FixedVectorType>(SV->getOperand(0)->getType())
17958 ->getNumElements();
17959 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
17960 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
17961 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
17962 int NextIndex = 0;
17963 if (!all_of(Group, [&](Value *V) {
17965 "Not supported shufflevector usage.");
17966 auto *SV = cast<ShuffleVectorInst>(V);
17967 int Index;
17968 [[maybe_unused]] bool IsExtractSubvectorMask =
17969 SV->isExtractSubvectorMask(Index);
17970 assert(IsExtractSubvectorMask &&
17971 "Not supported shufflevector usage.");
17972 if (NextIndex != Index)
17973 return false;
17974 NextIndex += SV->getShuffleMask().size();
17975 return true;
17976 }))
17977 return ::getShuffleCost(
17979 cast<VectorType>(VecTy),
17980 calculateShufflevectorMask(E->Scalars));
17981 }
17982 return TTI::TCC_Free;
17983 });
17984 return GetCostDiff(GetScalarCost, GetVectorCost);
17985 }
17986 case Instruction::Freeze:
17987 return CommonCost;
17988 default:
17989 llvm_unreachable("Unknown instruction");
17990 }
17991}
17992
// Decides whether a VectorizableTree holding only one or two entries is still
// treated as fully vectorizable.
//
// ForReduction: when true, a lone gather entry may also be accepted (see the
// size-1 check below).
//
// Returns true if one of the size-1 or size-2 checks below accepts the tree;
// returns false for every tree whose size is neither 1 nor 2, and for size-1
// trees that fail the first check.
17993bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
17994 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
17995 << VectorizableTree.size() << " is fully vectorizable .\n");
17996
// Predicate over a single tree entry. It requires TE to be a gather entry with
// no scalar contained in EphValues, and additionally at least one of:
//  - allConstant(TE->Scalars) or isSplat(TE->Scalars) holds,
//  - TE has fewer scalars than Limit,
//  - the ExtractElement / isFixedVectorShuffle group holds,
//  - TE has state, its opcode is Load and it is not an alt-shuffle,
//  - any scalar is a LoadInst.
// NOTE(review): this listing jumps from line 18005 to 18007 and the visible
// parentheses of the return expression do not balance, so one operand of the
// ExtractElement / isFixedVectorShuffle group is not visible here - confirm
// against the real source before touching this expression.
17997 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
17998 SmallVector<int> Mask;
17999 return TE->isGather() &&
18000 !any_of(TE->Scalars,
18001 [this](Value *V) { return EphValues.contains(V); }) &&
18002 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
18003 TE->Scalars.size() < Limit ||
18004 (((TE->hasState() &&
18005 TE->getOpcode() == Instruction::ExtractElement) ||
18007 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
18008 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
18009 !TE->isAltShuffle()) ||
18010 any_of(TE->Scalars, IsaPred<LoadInst>));
18011 };
18012
18013 // We only handle trees of heights 1 and 2.
// A single entry is accepted when its state is Vectorize, StridedVectorize or
// CompressVectorize. For reductions, a gather entry is accepted too if it
// passes AreVectorizableGathers and its vector factor exceeds 2. Limit is the
// entry's own scalar count here, so the "Scalars.size() < Limit" alternative
// can never be the reason the predicate succeeds in this call.
18014 if (VectorizableTree.size() == 1 &&
18015 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
18016 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
18017 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
18018 (ForReduction &&
18019 AreVectorizableGathers(VectorizableTree[0].get(),
18020 VectorizableTree[0]->Scalars.size()) &&
18021 VectorizableTree[0]->getVectorFactor() > 2)))
18022 return true;
18023
// Everything below indexes entries 0 and 1, so any size other than 2 is
// rejected first.
18024 if (VectorizableTree.size() != 2)
18025 return false;
18026
18027 // Handle splat and all-constants stores. Also try to vectorize tiny trees
18028 // with the second gather nodes if they have less scalar operands rather than
18029 // the initial tree element (may be profitable to shuffle the second gather)
18030 // or they are extractelements, which form shuffle.
// Limit is the scalar count of entry 0, so here the size alternative of the
// predicate compares entry 1 against entry 0.
18031 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
18032 AreVectorizableGathers(VectorizableTree[1].get(),
18033 VectorizableTree[0]->Scalars.size()))
18034 return true;
18035
18036 // Gathering cost would be too much for tiny trees.
// Reject when entry 0 is a gather, or when entry 1 is a gather and entry 0 is
// not in the ScatterVectorize, StridedVectorize or CompressVectorize state.
18037 if (VectorizableTree[0]->isGather() ||
18038 (VectorizableTree[1]->isGather() &&
18039 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
18040 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
18041 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
18042 return false;
18043
// Remaining two-entry trees are accepted.
18044 return true;
18045}
18046
18047bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
18048 if (!DebugCounter::shouldExecute(VectorizedGraphs))
18049 return true;
18050
18051 // Graph is empty - do nothing.
18052 if (VectorizableTree.empty()) {
18053 assert(ExternalUses.empty() && "We shouldn't have any external users");
18054
18055 return true;
18056 }
18057
18058 // FIXME: support buildvector of the gather nodes with struct types.
18059 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18060 return TE->isGather() &&
18061 isa<StructType>(TE->Scalars.front()->getType());
18062 })) {
18063 LLVM_DEBUG(
18064 dbgs() << "SLP: rejecting tree with buildvector struct values of size "
18065 << VectorizableTree.size() << ".\n");
18066 return true;
18067 }
18068
18069 // Cache values from the root node and the cost-threshold options to avoid
18070 // re-querying them inside hot predicates below.
18071 const unsigned TreeSize = VectorizableTree.size();
18072 const TreeEntry &Front = *VectorizableTree.front();
18073 const bool FrontIsGather = Front.isGather();
18074 const bool FrontHasState = Front.hasState();
18075 const unsigned FrontOpcode = FrontHasState ? Front.getOpcode() : 0u;
18076 const bool ThresholdSet = SLPCostThreshold.getNumOccurrences() > 0;
18077 const bool ThresholdNonNegative = SLPCostThreshold >= 0;
18078
18079 constexpr unsigned Limit = 4;
18080 constexpr unsigned LargeTree = 20;
18081 constexpr unsigned LimitTreeSize = 36;
18082
18083 // The remaining size-1/size-<=MinTreeSize early bail-outs only apply to
18084 // non-reduction trees; group them under a single guard to avoid 3 separate
18085 // !ForReduction short-circuits when reducing.
18086 if (!ForReduction) {
18087 // Single gather node: bail out for ExtractElement or any node containing a
18088 // real Instruction scalar.
18089 if (TreeSize == 1 && FrontIsGather) {
18090 if (FrontHasState && FrontOpcode == Instruction::ExtractElement)
18091 return true;
18092 if (any_of(Front.Scalars, IsaPred<Instruction>))
18093 return true;
18094 }
18095 if (TreeSize <= MinTreeSize &&
18096 all_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
18097 return TE->isGather() || TE->State == TreeEntry::SplitVectorize;
18098 }))
18099 return true;
18100 if (TreeSize == 1 && SLPCostThreshold < 0 && FrontHasState &&
18101 FrontOpcode == Instruction::ExtractElement &&
18102 (Front.getVectorFactor() == 2 ||
18103 all_of(
18104 Front.Scalars,
18105 [&](Value *V) {
18106 auto *I = dyn_cast<Instruction>(V);
18107 return !I || !areAllUsersVectorized(I, UserIgnoreList);
18108 })))
18109 return true;
18110 }
18111 // No need to vectorize inserts of gathered values.
18112 if (TreeSize == 2 && isa<InsertElementInst>(Front.Scalars[0]) &&
18113 VectorizableTree[1]->isGather() &&
18114 (VectorizableTree[1]->getVectorFactor() <= 2 ||
18115 !(isSplat(VectorizableTree[1]->Scalars) ||
18116 allConstant(VectorizableTree[1]->Scalars))))
18117 return true;
18118
18119 // The tree with only 3 nodes, where 2 last are gathers/buildvectors, not
18120 // profitable for vectorization.
18121 if (TreeSize == 3 && SLPCostThreshold == 0 &&
18122 (!ForReduction || Front.getVectorFactor() <= 2) &&
18123 all_of(ArrayRef(VectorizableTree).drop_front(),
18124 [&](const std::unique_ptr<TreeEntry> &TE) {
18125 return TE->isGather() && TE->getVectorFactor() <= Limit &&
18126 !all_of(
18127 TE->Scalars,
18129 }))
18130 return true;
18131
18132 // All remaining bail-out heuristics require !ForReduction. Group them under
18133 // a single guard so reduction trees skip them with one branch instead of one
18134 // per check.
18135 if (!ForReduction) {
18136 // If the graph includes only PHI nodes and gathers, it is definitely not
18137 // profitable for the vectorization, we can skip it, if the cost threshold
18138 // is default. The cost of vectorized PHI nodes is almost always 0 + the
18139 // cost of gathers/buildvectors.
18140 if (!ThresholdSet &&
18141 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18142 const bool IsGather = TE->isGather();
18143 const bool HasState = TE->hasState();
18144 const unsigned Op = HasState ? TE->getOpcode() : 0u;
18145 if (IsGather && (!HasState || Op != Instruction::ExtractElement) &&
18146 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit)
18147 return true;
18148 return HasState && Op == Instruction::PHI;
18149 }))
18150 return true;
18151
18152 // Do not vectorize small tree of phis only, if all vector phis are also
18153 // gathered.
18154 if (ThresholdSet && TreeSize <= Limit) {
18155 bool HasVectorPhi = false;
18156 auto Compatible = [&](const std::unique_ptr<TreeEntry> &TE) {
18157 const bool IsGather = TE->isGather();
18158 const bool HasState = TE->hasState();
18159 const unsigned Op = HasState ? TE->getOpcode() : 0u;
18160 if (IsGather && (!HasState || Op != Instruction::ExtractElement) &&
18161 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit)
18162 return true;
18163 if (!HasState)
18164 return false;
18165 if (Op == Instruction::InsertElement)
18166 return true;
18167 if (Op != Instruction::PHI)
18168 return false;
18169 if (TE->State == TreeEntry::Vectorize)
18170 HasVectorPhi = true;
18171 return all_of(TE->Scalars, [&](Value *V) {
18172 return isa<PoisonValue>(V) || MustGather.contains(V);
18173 });
18174 };
18175 if (all_of(VectorizableTree, Compatible) && HasVectorPhi)
18176 return true;
18177 }
18178
18179 // PHI nodes only and gathers cannot be vectorized, skip.
18180 if (ThresholdNonNegative) {
18181 const bool IsLargeTree = TreeSize >= LargeTree;
18182 bool HasSingleLoad = false;
18183 if (all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18184 const bool IsGather = TE->isGather();
18185 const bool HasState = TE->hasState();
18186 const unsigned Op = HasState ? TE->getOpcode() : 0u;
18187 // HasSingleLoad/PrevLoad are only consulted in the
18188 // IsLargeTree branch; skip the bookkeeping otherwise.
18189 if (IsLargeTree) {
18190 const bool PrevLoad = HasSingleLoad;
18191 HasSingleLoad |=
18192 HasState && !IsGather &&
18193 (Op == Instruction::Load || TE->hasCopyableElements()) &&
18194 (TE->getVectorFactor() > 2 || TE->ReorderIndices.empty());
18195 if (HasState) {
18196 if (Op == Instruction::PHI)
18197 return true;
18198 if (TE->getVectorFactor() <= Limit &&
18199 (Op == Instruction::Store ||
18200 (Op == Instruction::Load && !PrevLoad)))
18201 return true;
18202 }
18203 } else if (HasState && Op == Instruction::PHI) {
18204 return true;
18205 }
18206 return IsGather && (!HasState || Op != Instruction::ExtractElement);
18207 }))
18208 return true;
18209
18210 // Single non-phi vector node - skip the tree.
18211 if (TreeSize >= 5 && Front.getVectorFactor() <= 2 &&
18212 Front.Scalars.front()->getType()->isIntegerTy()) {
18213 bool VectorNodeFound = false;
18214 bool AnyNonConst = false;
18215 if (all_of(VectorizableTree,
18216 [&](const std::unique_ptr<TreeEntry> &TE) {
18217 if (TE->State == TreeEntry::Vectorize && TE->hasState()) {
18218 const unsigned Op = TE->getOpcode();
18219 if (Op == Instruction::PHI ||
18220 !TE->ReorderIndices.empty())
18221 return true;
18222 if (VectorNodeFound)
18223 return false;
18224 VectorNodeFound = true;
18225 return true;
18226 }
18227 // Once AnyNonConst is true, skip the O(n) allConstant
18228 // walk for subsequent entries.
18229 if (!AnyNonConst)
18230 AnyNonConst = !allConstant(TE->Scalars);
18231 return TE->isGather() ||
18232 TE->State == TreeEntry::SplitVectorize;
18233 }) &&
18234 AnyNonConst)
18235 return true;
18236 }
18237 }
18238
18239 // Common predicate for "phis, buildvectors, split nodes and small nodes
18240 // with reuses" used by the two checks below. Cheap checks are evaluated
18241 // before expensive Scalars walks.
18242 auto IsBenignNode = [&](const TreeEntry &TE) {
18243 if (TE.State == TreeEntry::SplitVectorize)
18244 return true;
18245 const bool IsGather = TE.isGather();
18246 const bool HasState = TE.hasState();
18247 if (HasState) {
18248 const unsigned Op = TE.getOpcode();
18249 if (Op == Instruction::PHI)
18250 return true;
18251 const unsigned ScalarsSize = TE.Scalars.size();
18252 if (TE.Idx == 0 && ScalarsSize == 2 && Op == Instruction::ICmp &&
18253 TreeSize > LimitTreeSize)
18254 return true;
18255 if (ScalarsSize == 2 &&
18256 (!TE.ReuseShuffleIndices.empty() || !TE.ReorderIndices.empty() ||
18257 TE.isAltShuffle()))
18258 return true;
18259 if (TE.hasCopyableElements() &&
18260 static_cast<unsigned>(count_if(
18261 TE.Scalars, IsaPred<PHINode, Constant>)) >= ScalarsSize / 2)
18262 return true;
18263 }
18264 return IsGather && none_of(TE.Scalars, IsaPred<ExtractElementInst>);
18265 };
18266
18267 // If the tree contains only phis, buildvectors, split nodes and
18268 // small nodes with reuses, we can skip it.
18269 if (!ThresholdSet) {
18270 SmallVector<const TreeEntry *> StoreLoadNodes;
18271 unsigned NumGathers = 0;
18272 if (all_of(VectorizableTree,
18273 [&](const std::unique_ptr<TreeEntry> &TE) {
18274 const bool IsGather = TE->isGather();
18275 if (!IsGather && TE->hasState()) {
18276 const unsigned Op = TE->getOpcode();
18277 if (Op == Instruction::Load || Op == Instruction::Store) {
18278 StoreLoadNodes.push_back(TE.get());
18279 return true;
18280 }
18281 }
18282 if (IsGather)
18283 ++NumGathers;
18284 return IsBenignNode(*TE);
18285 }) &&
18286 (StoreLoadNodes.empty() ||
18287 (TreeSize > LimitTreeSize * StoreLoadNodes.size() &&
18288 (NumGathers > 0 ||
18289 none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
18290 return TE->getOpcode() == Instruction::Store ||
18291 all_of(TE->Scalars, [&](Value *V) {
18292 return !isa<LoadInst>(V) ||
18293 areAllUsersVectorized(cast<Instruction>(V));
18294 });
18295 })))))
18296 return true;
18297 }
18298
18299 // If the tree contains only phis, buildvectors, split nodes and
18300 // small nodes with reuses, we can skip it.
18301 if (ThresholdNonNegative && TreeSize > LimitTreeSize) {
18302 const TreeEntry *VectorNode = nullptr;
18303 if (all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18304 if (!TE->isGather() && TE->hasState() &&
18305 TE->State != TreeEntry::SplitVectorize &&
18306 TE->getOpcode() != Instruction::PHI) {
18307 if (VectorNode)
18308 return false;
18309 VectorNode = TE.get();
18310 return true;
18311 }
18312 return IsBenignNode(*TE);
18313 }))
18314 return true;
18315 }
18316
18317 // If the tree contains only buildvector, 2 non-buildvectors (with root
18318 // user tree node) and other buildvectors, we can skip it.
18319 if (ThresholdSet && TreeSize >= Limit &&
18320 Front.State == TreeEntry::SplitVectorize &&
18321 count_if(ArrayRef(VectorizableTree).drop_front(),
18322 [](const std::unique_ptr<TreeEntry> &TE) {
18323 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
18324 TE->UserTreeIndex.UserTE->Idx == 0;
18325 }) == 2)
18326 return true;
18327
18328 // If the tree contains only vectorization of the phi node from the
18329 // buildvector - skip it.
18330 if (ThresholdSet && TreeSize > 2 && Front.State == TreeEntry::Vectorize &&
18331 FrontOpcode == Instruction::InsertElement &&
18332 VectorizableTree[1]->State == TreeEntry::Vectorize &&
18333 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
18334 all_of(ArrayRef(VectorizableTree).drop_front(2),
18335 [](const std::unique_ptr<TreeEntry> &TE) {
18336 return TE->isGather();
18337 }))
18338 return true;
18339 }
18340
18341 // We can vectorize the tree if its size is greater than or equal to the
18342 // minimum size specified by the MinTreeSize command line option.
18343 if (TreeSize >= MinTreeSize)
18344 return false;
18345
18346 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
18347 // can vectorize it if we can prove it fully vectorizable.
18348 if (isFullyVectorizableTinyTree(ForReduction))
18349 return false;
18350
18351 // Check if any of the gather node forms an insertelement buildvector
18352 // somewhere. TreeSize >= 1 is guaranteed, so the multi-node case reduces to
18353 // a simple TreeSize > 1 short-circuit.
18354 const bool IsAllowedSingleBVNode =
18355 TreeSize > 1 || (FrontHasState && !Front.isAltShuffle() &&
18356 FrontOpcode != Instruction::PHI &&
18357 FrontOpcode != Instruction::GetElementPtr &&
18358 allSameBlock(Front.Scalars));
18359 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18360 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
18361 return isa<ExtractElementInst, Constant>(V) ||
18362 (IsAllowedSingleBVNode &&
18363 !V->hasNUsesOrMore(UsesLimit) &&
18364 any_of(V->users(), IsaPred<InsertElementInst>));
18365 });
18366 }))
18367 return false;
18368
18369 const TreeEntry &Back = *VectorizableTree.back();
18370 if (Back.isGather() && Back.hasState() && Back.isAltShuffle()) {
18371 const unsigned BackVF = Back.getVectorFactor();
18372 if (BackVF > 2 && allSameBlock(Back.Scalars) &&
18373 !Back.Scalars.front()->getType()->isVectorTy() &&
18374 TTI->getScalarizationOverhead(
18376 getWidenedType(Back.Scalars.front()->getType(), BackVF)),
18377 APInt::getAllOnes(BackVF),
18378 /*Insert=*/true, /*Extract=*/false,
18380 return false;
18381 }
18382
18383 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
18384 // vectorizable.
18385 return true;
18386}
18387
18390 constexpr unsigned SmallTree = 3;
18391 if (VectorizableTree.front()->isNonPowOf2Vec() &&
18392 getCanonicalGraphSize() <= SmallTree &&
18393 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
18394 [](const std::unique_ptr<TreeEntry> &TE) {
18395 return TE->isGather() && TE->hasState() &&
18396 TE->getOpcode() == Instruction::Load &&
18397 !allSameBlock(TE->Scalars);
18398 }) == 1)
18399 return true;
18400 return false;
18401 }
18402 bool Res = false;
18403 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
18404 TreeEntry &E = *VectorizableTree[Idx];
18405 if (E.State == TreeEntry::SplitVectorize)
18406 return false;
18407 if (!E.isGather())
18408 continue;
18409 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
18410 (!E.hasState() &&
18412 (isa<ExtractElementInst>(E.Scalars.front()) &&
18413 getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
18414 return false;
18415 if (isSplat(E.Scalars) || allConstant(E.Scalars))
18416 continue;
18417 Res = true;
18418 }
18419 return Res;
18420}
18421
18423 // Walk the vectorizable tree from the root towards its leaves, tracking
18424 // which vectorized operand values would be live across each tree edge
18425 // (i.e. between the last instruction of an operand entry and the last
18426 // instruction of its user entry). When the live range crosses a call
18427 // instruction that is not part of the vectorized tree, query TTI for the
18428 // cost of keeping the value live across it (for example, if spills and
18429 // fills are required).
18430
18431 const TreeEntry *Root = VectorizableTree.front().get();
18432 if (Root->isGather())
18433 return 0;
18434
18435 InstructionCost Cost = 0;
18437 EntriesToOperands;
18438 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
18439 SmallPtrSet<const Instruction *, 8> LastInstructions;
18440 SmallPtrSet<const TreeEntry *, 8> ScalarOrPseudoEntries;
18441 for (const auto &TEPtr : VectorizableTree) {
18442 if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
18443 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
18444 TEPtr->CombinedOp == TreeEntry::ReducedBitcastLoads ||
18445 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
18446 TEPtr->CombinedOp == TreeEntry::ReducedCmpBitcast) {
18447 ScalarOrPseudoEntries.insert(TEPtr.get());
18448 continue;
18449 }
18450 if (!TEPtr->isGather()) {
18451 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
18452 EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
18453 LastInstructions.insert(LastInst);
18454 }
18455 if (TEPtr->UserTreeIndex)
18456 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
18457 }
18458
18459 // Cache NoCallIntrinsicOrDoesNotReturn results - the same intrinsic call may
18460 // be queried many times during the spill cost scan, and each computation
18461 // involves two potentially expensive TTI virtual calls.
18462 SmallDenseMap<const IntrinsicInst *, bool> NoCallIntrinsicCache;
18463 auto NoCallIntrinsicOrDoesNotReturn = [this, &NoCallIntrinsicCache](
18464 const Instruction *I) {
18465 const auto *CB = dyn_cast<CallBase>(I);
18466 if (!CB)
18467 return false;
18468 if (CB->doesNotReturn())
18469 return true;
18470 const auto *II = dyn_cast<IntrinsicInst>(CB);
18471 if (!II)
18472 return false;
18473 if (II->isAssumeLikeIntrinsic())
18474 return true;
18475 auto [It, Inserted] = NoCallIntrinsicCache.try_emplace(II);
18476 if (!Inserted)
18477 return It->second;
18478 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
18479 InstructionCost IntrCost =
18480 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
18481 InstructionCost CallCost = TTI->getCallInstrCost(
18482 nullptr, II->getType(), ICA.getArgTypes(), TTI::TCK_RecipThroughput);
18483 bool Res = IntrCost < CallCost;
18484 It->second = Res;
18485 return Res;
18486 };
18487
18488 // Maps last instruction in the entry to the last instruction for the one of
18489 // operand entries and the flag. If the flag is true, there are no calls in
18490 // between these instructions.
18492 CheckedInstructions;
18493 unsigned Budget = 0;
18494 const unsigned BudgetLimit =
18495 ScheduleRegionSizeBudget / VectorizableTree.size();
18496 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
18497 const Instruction *Last) {
18498 assert(First->getParent() == Last->getParent() &&
18499 "Expected instructions in same block.");
18500 if (auto It = CheckedInstructions.find(Last);
18501 It != CheckedInstructions.end()) {
18502 const Instruction *Checked = It->second.getPointer();
18503 const bool NoCallsInCachedRange = It->second.getInt() != 0;
18504 if (Checked == First)
18505 return NoCallsInCachedRange;
18506 if (Checked->comesBefore(First))
18507 // In every cached state (full clean scan, call-found, or
18508 // budget-exhausted) the region strictly above `Checked` up to `Last`
18509 // was inspected and proved call-free. Since `First` is above
18510 // `Checked`, the queried range [First, Last] is contained in that
18511 // call-free region, regardless of whether bit is 0 or 1.
18512 return true;
18513 Last = Checked;
18514 } else if (Last == First || Last->comesBefore(First)) {
18515 // Empty range.
18516 return true;
18517 }
18519 ++First->getIterator().getReverse(),
18520 PrevInstIt =
18521 Last->getIterator().getReverse();
18522 SmallVector<const Instruction *> LastInstsInRange;
18523 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
18524 // Debug information does not impact spill cost.
18525 // Vectorized calls, represented as vector intrinsics, do not impact spill
18526 // cost.
18527 if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
18528 CB && !NoCallIntrinsicOrDoesNotReturn(CB) && !isVectorized(CB)) {
18529 for (const Instruction *LastInst : LastInstsInRange)
18530 CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
18531 return false;
18532 }
18533 if (LastInstructions.contains(&*PrevInstIt))
18534 LastInstsInRange.push_back(&*PrevInstIt);
18535
18536 ++PrevInstIt;
18537 ++Budget;
18538 }
18539 // If we reached the scan's lower bound (`PrevInstIt == InstIt`) then the
18540 // whole [First, Last] range was inspected and found call-free, even if
18541 // Budget just overflowed at the very last step; do not mislabel such a
18542 // completed scan as "has call".
18543 const bool Completed = PrevInstIt == InstIt;
18544 const bool NoCallsInRange = Completed || Budget <= BudgetLimit;
18545 for (const Instruction *LastInst : LastInstsInRange)
18546 CheckedInstructions.try_emplace(
18547 LastInst, Completed ? First : &*PrevInstIt, NoCallsInRange ? 1 : 0);
18548 return NoCallsInRange;
18549 };
18550 auto AddCosts = [&](const TreeEntry *Op) {
18551 if (ScalarOrPseudoEntries.contains(Op))
18552 return;
18553 Type *ScalarTy = Op->Scalars.front()->getType();
18554 auto It = MinBWs.find(Op);
18555 if (It != MinBWs.end())
18556 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
18557 auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
18558 uint64_t Scale = getScaleToLoopIterations(*Op);
18559 InstructionCost KeepLiveCost = TTI->getCostOfKeepingLiveOverCall(VecTy);
18560 KeepLiveCost *= Scale;
18561 Cost += KeepLiveCost;
18562 if (ScalarTy->isVectorTy()) {
18563 // Handle revec dead vector instructions.
18564 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy) *
18565 Scale;
18566 }
18567 };
18568 // Memoize the relationship between blocks, i.e. if there is (at least one)
18569 // non-vectorized call between the blocks. This allows to skip the analysis of
18570 // the same block paths multiple times.
18572 ParentOpParentToPreds;
18573 // Memoize whether a basic block contains a non-terminator no-return call.
18574 // Such blocks are dead-end paths in normal control flow (execution does not
18575 // exit them past the no-return call), so the block is excluded from the
18576 // spill cost analysis. Terminator no-return calls (invoke/callbr) are not
18577 // block-killing because they still have live CFG successors (e.g. the
18578 // unwind destination of an invoke).
18579 SmallDenseMap<const BasicBlock *, bool> BlockHasNoReturnCallCache;
18580 auto BlockHasNoReturnCall = [&](const BasicBlock *BB) {
18581 auto [It, Inserted] = BlockHasNoReturnCallCache.try_emplace(BB, false);
18582 if (!Inserted)
18583 return It->second;
18584 for (const Instruction &I : *BB) {
18585 const auto *CB = dyn_cast<CallBase>(&I);
18586 if (CB && CB->doesNotReturn() && !CB->isTerminator()) {
18587 It->second = true;
18588 return true;
18589 }
18590 }
18591 return false;
18592 };
18593 // Memoize whether a loop's body (all blocks of the loop, including
18594 // sub-loops) contains any non-vec call.
18595 SmallDenseMap<const Loop *, bool> LoopBodyHasNonVecCall;
18596 auto LoopBodyHasCall = [&](const Loop *L) {
18597 if (auto It = LoopBodyHasNonVecCall.find(L);
18598 It != LoopBodyHasNonVecCall.end())
18599 return It->second;
18600 for (BasicBlock *BB : L->blocks()) {
18602 continue;
18603 // Blocks containing a no-return call are dead-end paths and never
18604 // actually flow back through the loop's back-edge, so their calls do
18605 // not keep loop-invariant vector values live across calls.
18606 if (BlockHasNoReturnCall(BB))
18607 continue;
18608 for (const Instruction &I : *BB) {
18609 const auto *CB = dyn_cast<CallBase>(&I);
18610 if (!CB || NoCallIntrinsicOrDoesNotReturn(CB) || isVectorized(CB))
18611 continue;
18612 LoopBodyHasNonVecCall.try_emplace(L, true);
18613 return true;
18614 }
18615 }
18616 LoopBodyHasNonVecCall.try_emplace(L, false);
18617 return false;
18618 };
18619 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
18620 BasicBlock *OpParent) {
18621 auto Key = std::make_pair(Root, OpParent);
18622 if (auto It = ParentOpParentToPreds.find(Key);
18623 It != ParentOpParentToPreds.end())
18624 return It->second;
18625 bool Res = false;
18626 scope_exit Cleanup([&]() { ParentOpParentToPreds.try_emplace(Key, Res); });
18627 // If Op is loop-invariant, a call anywhere in the loop body forces a spill,
18628 // even when a call-free forward path from Root back to OpParent exists on
18629 // the first iteration. Find the outermost such enclosing loop and reject if
18630 // its body contains a non-vec call.
18631 const Loop *L = LI->getLoopFor(Root);
18632 const Loop *Outermost = nullptr;
18633 while (L && !L->contains(OpParent)) {
18634 Outermost = L;
18635 L = L->getParentLoop();
18636 }
18637 if (Outermost && LoopBodyHasCall(Outermost))
18638 return Res;
18640 if (Pred)
18641 Worklist.push_back(Pred);
18642 else
18643 Worklist.append(pred_begin(Root), pred_end(Root));
18645 // With "at least one call-free path" semantics we can only reliably
18646 // memoize the exact (Root, OpParent) query. Pairs for intermediate
18647 // blocks that were visited during the BFS are not necessarily
18648 // call-free-reachable to OpParent themselves - we may have reached
18649 // OpParent through a *sibling* path that bypassed them.
18650 // We return `true` (no spill cost) if at least one backward path from
18651 // some predecessor of Root back to OpParent is call-free. Only when
18652 // *every* such path goes through a non-vec call do we charge the spill
18653 // cost: only then is it actually necessary to keep the vectorized value
18654 // live across a call and therefore spill/reload it.
18655 //
18656 // A BB is only explored further (its predecessors added to the worklist)
18657 // when it is itself call-free and not strictly dominated by Root (blocks
18658 // dominated by Root are only reachable via loop back-edges - they sit
18659 // *after* Root in forward execution and must not be counted).
18660 //
18661 // If we ever pop OpParent from the worklist, we have reached it through
18662 // a chain of call-free, non-dominated blocks: a call-free path exists
18663 // and we return true. If the worklist is exhausted without reaching
18664 // OpParent, every admissible path is blocked by a call and we return
18665 // false so the caller charges the spill cost.
18666 while (!Worklist.empty()) {
18667 BasicBlock *BB = Worklist.pop_back_val();
18668 if (BB == OpParent) {
18669 Res = true;
18670 return Res;
18671 }
18672 if (!Visited.insert(BB).second)
18673 continue;
18674 // Blocks strictly dominated by Root are reached only *after* Root in
18675 // forward execution (via loop back-edges); skip them and their
18676 // dominated predecessors.
18677 if (DT->properlyDominates(Root, BB))
18678 continue;
18679 // A block containing a no-return call cannot reach Root via the
18680 // forward edge being analyzed: execution does not continue past the
18681 // no-return call, so the BB -> ... -> Root path is dead. Drop the
18682 // block from the analysis without following its predecessors.
18683 if (BlockHasNoReturnCall(BB))
18684 continue;
18685 auto Pair = std::make_pair(BB, OpParent);
18686 if (auto It = ParentOpParentToPreds.find(Pair);
18687 It != ParentOpParentToPreds.end()) {
18688 if (It->second) {
18689 // BB is known to reach OpParent via a call-free path.
18690 Res = true;
18691 return Res;
18692 }
18693 // BB is known to be blocked from OpParent by calls; keep checking
18694 // other paths.
18695 continue;
18696 }
18697 unsigned BlockSize = BB->size();
18698 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
18699 continue;
18700 Budget += BlockSize;
18701 if (Budget > BudgetLimit)
18702 return Res;
18703 if (!isa<CatchSwitchInst>(BB->getTerminator()) &&
18704 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
18705 BB->getTerminator()))
18706 continue;
18707 Worklist.append(pred_begin(BB), pred_end(BB));
18708 }
18709 // Worklist drained without ever reaching OpParent: every path between
18710 // Root and OpParent is blocked by a non-vec call.
18711 return Res;
18712 };
18713 SmallVector<const TreeEntry *> LiveEntries(1, Root);
18714 auto FindNonScalarParentEntry = [&](const TreeEntry *E) -> const TreeEntry * {
18715 assert(ScalarOrPseudoEntries.contains(E) &&
18716 "Expected scalar or pseudo entry.");
18717 const TreeEntry *Entry = E;
18718 while (Entry->UserTreeIndex) {
18719 Entry = Entry->UserTreeIndex.UserTE;
18720 if (!ScalarOrPseudoEntries.contains(Entry))
18721 return Entry;
18722 }
18723 return nullptr;
18724 };
18725 while (!LiveEntries.empty()) {
18726 const TreeEntry *Entry = LiveEntries.pop_back_val();
18727 const auto OpIt = EntriesToOperands.find(Entry);
18728 if (OpIt == EntriesToOperands.end())
18729 continue;
18730 ArrayRef<const TreeEntry *> Operands = OpIt->second;
18731 if (Operands.empty())
18732 continue;
18733 if (ScalarOrPseudoEntries.contains(Entry)) {
18734 Entry = FindNonScalarParentEntry(Entry);
18735 if (!Entry) {
18736 for (const TreeEntry *Op : Operands) {
18737 if (!Op->isGather())
18738 LiveEntries.push_back(Op);
18739 }
18740 continue;
18741 }
18742 }
18743 Instruction *LastInst = EntriesToLastInstruction.at(Entry);
18744 BasicBlock *Parent = LastInst->getParent();
18745 for (const TreeEntry *Op : Operands) {
18746 if (!Op->isGather())
18747 LiveEntries.push_back(Op);
18748 if (ScalarOrPseudoEntries.contains(Op))
18749 continue;
18750 if (Entry->State == TreeEntry::SplitVectorize ||
18751 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
18752 (Op->isGather() && allConstant(Op->Scalars)))
18753 continue;
18754 Budget = 0;
18755 BasicBlock *Pred = nullptr;
18756 if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
18757 Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
18758 BasicBlock *OpParent;
18759 Instruction *OpLastInst;
18760 if (Op->isGather()) {
18761 assert(Entry->getOpcode() == Instruction::PHI &&
18762 "Expected phi node only.");
18763 OpParent = cast<PHINode>(Entry->getMainOp())
18764 ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
18765 OpLastInst = OpParent->getTerminator();
18766 for (Value *V : Op->Scalars) {
18767 auto *Inst = dyn_cast<Instruction>(V);
18768 if (!Inst)
18769 continue;
18770 if (isVectorized(V)) {
18771 OpParent = Inst->getParent();
18772 OpLastInst = Inst;
18773 break;
18774 }
18775 }
18776 } else {
18777 OpLastInst = EntriesToLastInstruction.at(Op);
18778 OpParent = OpLastInst->getParent();
18779 }
18780 // Check the call instructions within the same basic blocks.
18781 if (OpParent == Parent) {
18782 if (Entry->getOpcode() == Instruction::PHI) {
18783 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
18784 AddCosts(Op);
18785 continue;
18786 }
18787 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
18788 AddCosts(Op);
18789 continue;
18790 }
18791 // Check for call instruction in between blocks.
18792 // 1. Check entry's block to the head.
18793 if (Entry->getOpcode() != Instruction::PHI &&
18794 !CheckForNonVecCallsInSameBlock(
18795 &*Parent->getFirstNonPHIOrDbgOrAlloca(), LastInst)) {
18796 AddCosts(Op);
18797 continue;
18798 }
18799 // 2. Check op's block from the end.
18800 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
18801 OpParent->getTerminator())) {
18802 AddCosts(Op);
18803 continue;
18804 }
18805 // 3. Check the predecessors of entry's block till op's block.
18806 if (!CheckPredecessors(Parent, Pred, OpParent)) {
18807 AddCosts(Op);
18808 continue;
18809 }
18810 }
18811 }
18812
18813 return Cost;
18814}
18815
/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
/// the buildvector sequence.
18819 const InsertElementInst *IE2) {
18820 if (IE1 == IE2)
18821 return false;
18822 const auto *I1 = IE1;
18823 const auto *I2 = IE2;
18824 const InsertElementInst *PrevI1;
18825 const InsertElementInst *PrevI2;
18826 unsigned Idx1 = *getElementIndex(IE1);
18827 unsigned Idx2 = *getElementIndex(IE2);
18828 do {
18829 if (I2 == IE1)
18830 return true;
18831 if (I1 == IE2)
18832 return false;
18833 PrevI1 = I1;
18834 PrevI2 = I2;
18835 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
18836 getElementIndex(I1).value_or(Idx2) != Idx2)
18837 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
18838 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
18839 getElementIndex(I2).value_or(Idx1) != Idx1)
18840 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
18841 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
18842 llvm_unreachable("Two different buildvectors not expected.");
18843}
18844
18845namespace {
18846/// Returns incoming Value *, if the requested type is Value * too, or a default
18847/// value, otherwise.
18848struct ValueSelect {
18849 template <typename U>
18850 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
18851 return V;
18852 }
18853 template <typename U>
18854 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
18855 return U();
18856 }
18857};
18858} // namespace
18859
/// Analyzes the provided shuffle masks and performs the requested actions on
/// the vectors with the given shuffle masks. It does this in several steps.
/// 1. If the Base vector is not an undef vector, resize the very first mask to
/// have a common VF and perform the action for 2 input vectors (including the
/// non-undef Base). The other shuffle masks are combined with the result of
/// step 1 and processed as a shuffle of 2 vectors.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If more than 2 masks are used, perform the remaining shuffle actions for
/// 2 vectors, combining the masks properly between the steps.
18872template <typename T>
18874 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
18875 function_ref<unsigned(T *)> GetVF,
18876 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
18878 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
18879 SmallVector<int> Mask(ShuffleMask.begin()->second);
18880 auto VMIt = std::next(ShuffleMask.begin());
18881 T *Prev = nullptr;
18882 SmallBitVector UseMask =
18883 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
18884 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
18885 if (!IsBaseUndef.all()) {
18886 // Base is not undef, need to combine it with the next subvectors.
18887 std::pair<T *, bool> Res =
18888 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
18889 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
18890 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
18891 if (Mask[Idx] == PoisonMaskElem)
18892 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
18893 else
18894 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
18895 }
18896 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
18897 assert((!V || GetVF(V) == Mask.size()) &&
18898 "Expected base vector of VF number of elements.");
18899 Prev = Action(Mask, {nullptr, Res.first});
18900 } else if (ShuffleMask.size() == 1) {
18901 // Base is undef and only 1 vector is shuffled - perform the action only for
18902 // single vector, if the mask is not the identity mask.
18903 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
18904 /*ForSingleMask=*/true);
18905 if (Res.second)
18906 // Identity mask is found.
18907 Prev = Res.first;
18908 else
18909 Prev = Action(Mask, {ShuffleMask.begin()->first});
18910 } else {
18911 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
18912 // shuffles step by step, combining shuffle between the steps.
18913 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
18914 unsigned Vec2VF = GetVF(VMIt->first);
18915 if (Vec1VF == Vec2VF) {
18916 // No need to resize the input vectors since they are of the same size, we
18917 // can shuffle them directly.
18918 ArrayRef<int> SecMask = VMIt->second;
18919 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
18920 if (SecMask[I] != PoisonMaskElem) {
18921 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
18922 Mask[I] = SecMask[I] + Vec1VF;
18923 }
18924 }
18925 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
18926 } else {
18927 // Vectors of different sizes - resize and reshuffle.
18928 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
18929 /*ForSingleMask=*/false);
18930 std::pair<T *, bool> Res2 =
18931 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
18932 ArrayRef<int> SecMask = VMIt->second;
18933 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
18934 if (Mask[I] != PoisonMaskElem) {
18935 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
18936 if (Res1.second)
18937 Mask[I] = I;
18938 } else if (SecMask[I] != PoisonMaskElem) {
18939 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
18940 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
18941 }
18942 }
18943 Prev = Action(Mask, {Res1.first, Res2.first});
18944 }
18945 VMIt = std::next(VMIt);
18946 }
18947 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
18948 // Perform requested actions for the remaining masks/vectors.
18949 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
18950 // Shuffle other input vectors, if any.
18951 std::pair<T *, bool> Res =
18952 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
18953 ArrayRef<int> SecMask = VMIt->second;
18954 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
18955 if (SecMask[I] != PoisonMaskElem) {
18956 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
18957 "Multiple uses of scalars.");
18958 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
18959 } else if (Mask[I] != PoisonMaskElem) {
18960 Mask[I] = I;
18961 }
18962 }
18963 Prev = Action(Mask, {Prev, Res.first});
18964 }
18965 return Prev;
18966}
18967
18970 Instruction *RdxRoot) {
// NOTE(review): the opening lines of this definition's signature are not
// present in this listing; only the trailing `RdxRoot` parameter is visible.
//
// What the visible body does:
//  1. Computes a (loop-scaled) cost for every entry of VectorizableTree and a
//     quick estimate of the extract cost for externally used scalars.
//  2. Accumulates these into per-subtree totals (SubtreeCosts).
//  3. Visits entries in order of decreasing subtree cost and, when a subtree
//     costs more than gathering its root's scalars, marks its nodes in
//     DeletedNodes / TransformedToGatherNodes.
//  4. Recomputes the cost of the trimmed tree and keeps the trimming only if
//     it does not make the tree more expensive.
// Returns the resulting cost, or an invalid cost when what would remain is
// just a buildvector.
18971 // FIXME: support buildvector of the gather nodes with struct types.
18972 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18973 return TE->isGather() &&
18974 isa<StructType>(TE->Scalars.front()->getType());
18975 })) {
18976 LLVM_DEBUG(
18977 dbgs() << "SLP: rejecting tree with buildvector struct values of size "
18978 << VectorizableTree.size() << ".\n");
// NOTE(review): listing line 18979 (the statement executed on rejection) is
// missing from this extract.
18980 }
18981
18983 SmallPtrSet<Value *, 4> CheckedExtracts;
18984 SmallSetVector<TreeEntry *, 4> GatheredLoadsNodes;
18986 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
18987 << VectorizableTree.size() << ".\n");
18988 // The V-only-dependent part of the predicate. Same V is commonly seen in
18989 // multiple TEs (shared scalars), so cache the result across calls.
18990 // DeletedNodes is read-only during this cost loop, so caching is safe.
18991 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
18992 SmallDenseMap<Value *, bool> ExternalUseVCache;
18993 auto IsExternallyUsedV = [&](Value *V) {
// Reserve the cache slot first; a repeated query returns the stored answer.
18994 auto [It, Inserted] = ExternalUseVCache.try_emplace(V);
18995 if (!Inserted)
18996 return It->second;
18997 bool Res = false;
18998 if (V->hasOneUse() || V->getType()->isVoidTy()) {
18999 // Res stays false.
19000 } else if (V->hasNUsesOrMore(NumVectScalars)) {
// V has at least ScalarToTreeEntries.size() + 1 uses: it is reported as
// externally used without walking the user list (presumably because not all
// of those users can be vectorized scalars - TODO confirm).
19001 Res = true;
19002 } else if (auto *I = dyn_cast<Instruction>(V)) {
19003 Res = any_of(I->users(), [&](const User *U) {
19004 // store/insertelt v, [cast]U will likely be vectorized.
19005 if (match(U,
19006 m_InsertElt(m_Value(), m_OneUse(m_CastOrSelf(m_Specific(I))),
19007 m_ConstantInt())))
19008 return false;
19009 if (match(U, m_InsertElt(m_Value(), m_Specific(I), m_ConstantInt())))
19010 return false;
19011 if (match(U, m_Store(m_OneUse(m_CastOrSelf(m_Specific(I))), m_Value())))
19012 return false;
19013 if (match(U, m_Store(m_Specific(I), m_Value())))
19014 return false;
// Otherwise the user is external when it has no tree entry and is not in
// MustGather, or when a tree entry / gather node containing it is in
// DeletedNodes.
19015 ArrayRef<TreeEntry *> Entries = getTreeEntries(U);
19016 if (Entries.empty() && !MustGather.contains(U))
19017 return true;
19018 if (any_of(Entries,
19019 [&](TreeEntry *TE) { return DeletedNodes.contains(TE); }))
19020 return true;
19021 return any_of(ValueToGatherNodes.lookup(U), [&](const TreeEntry *TE) {
19022 return DeletedNodes.contains(TE);
19023 });
19024 });
19025 }
19026 It->second = Res;
19027 return Res;
19028 };
// Per-entry wrapper: a copyable element of TE is never reported as externally
// used; every other scalar defers to the cached per-value check above.
19029 auto IsExternallyUsed = [&](const TreeEntry &TE, Value *V) {
19030 assert(TE.hasState() && !TE.isGather() &&
19031 TE.State != TreeEntry::SplitVectorize && "Expected vector node.");
19032 if (TE.hasCopyableElements() && TE.isCopyableElement(V))
19033 return false;
19034 return IsExternallyUsedV(V);
19035 };
// NOTE(review): listing lines 19036 and 19038 are missing from this extract;
// NodesCosts, ExtractCosts and EntryToScale used below are not declared in the
// visible text.
19037 InstructionCost Cost = 0;
// Scale and parent block of the most recent non-gather entry with state whose
// scale was computed; reused for following entries in the same block.
19039 uint64_t PrevScale = 0;
19040 BasicBlock *PrevVecParent = nullptr;
19041 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
19042 TreeEntry &TE = *Ptr;
19043 // No need to count the cost for combined entries: they are already
19044 // combined, so just skip their cost.
19045 if (TE.State == TreeEntry::CombinedVectorize) {
19046 LLVM_DEBUG(
19047 dbgs() << "SLP: Skipping cost for combined node that starts with "
19048 << *TE.Scalars[0] << ".\n";
19049 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
19050 NodesCosts.try_emplace(&TE);
19051 continue;
19052 }
19053 if (TE.hasState() &&
19054 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
19055 if (const TreeEntry *E =
19056 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
19057 E && E->getVectorFactor() == TE.getVectorFactor()) {
19058 // Some gather nodes might be absolutely the same as some vectorizable
19059 // nodes after reordering, need to handle it.
19060 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
19061 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
19062 << "SLP: Current total cost = " << Cost << "\n");
19063 NodesCosts.try_emplace(&TE);
19064 continue;
19065 }
19066 }
19067
19068 // Exclude cost of gather loads nodes which are not used. These nodes were
19069 // built as part of the final attempt to vectorize gathered loads.
19070 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
19071 "Expected gather nodes with users only.");
19072
19073 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
// Scale == 0 is used below as "no scale chosen yet"; zero-cost entries are
// never scaled.
19074 uint64_t Scale = 0;
19075 bool CostIsFree = C == 0;
19076 // For gather/buildvector (and split-vectorize) entries, prefer the
19077 // per-lane refined scale that accounts for LICM-hoistable insertelements
19078 // when an operand is invariant in the current loop nest but defined in
19079 // an outer loop. This prevents over-costing cross-loop-nest buildvectors.
19080 const bool IsGatherLike =
19081 TE.isGather() || TE.State == TreeEntry::SplitVectorize;
// Non-gather entry in the same block as the previously scaled entry: reuse
// that entry's scale instead of recomputing it.
19082 if (!CostIsFree && !TE.isGather() && TE.hasState()) {
19083 if (PrevVecParent == TE.getMainOp()->getParent()) {
19084 Scale = PrevScale;
19085 C *= Scale;
19086 EntryToScale.try_emplace(&TE, Scale);
19087 }
19088 }
// No scale applied yet: compute one for this entry (the reduction root is
// passed only for the root gather-like node) and remember it for later
// non-gather entries of the same block.
19089 if (!CostIsFree && !Scale) {
19090 Scale =
19091 IsGatherLike
19092 ? getGatherNodeEffectiveScale(TE, TE.Idx == 0 ? RdxRoot : nullptr)
19093 : getScaleToLoopIterations(TE);
19094 C *= Scale;
19095 EntryToScale.try_emplace(&TE, Scale);
19096 if (!TE.isGather() && TE.hasState()) {
19097 PrevVecParent = TE.getMainOp()->getParent();
19098 PrevScale = Scale;
19099 }
19100 }
19101 Cost += C;
19102 NodesCosts.try_emplace(&TE, C);
19103 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
19104 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
19105 << "SLP: Current total cost = " << Cost << "\n");
19106 // Add gathered loads nodes to the set for later processing.
19107 if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
19108 TE.getOpcode() == Instruction::Load)
19109 GatheredLoadsNodes.insert(&TE);
// External-use extract estimate. Skipped for gather and split nodes, for a
// root insertelement or store node, and for struct-typed scalars.
19110 if (!TE.isGather() && TE.State != TreeEntry::SplitVectorize &&
19111 !(TE.Idx == 0 && (TE.getOpcode() == Instruction::InsertElement ||
19112 TE.getOpcode() == Instruction::Store)) &&
19113 !isa<StructType>(getValueType(TE.Scalars.front()))) {
19114 // Calculate costs of external uses.
19115 APInt DemandedElts = APInt::getZero(TE.getVectorFactor());
19116 for (Value *V : TE.Scalars) {
19117 if (IsExternallyUsed(TE, V))
19118 DemandedElts.setBit(TE.findLaneForValue(V));
19119 }
19120 if (!DemandedElts.isZero()) {
// Use the narrowed integer element type when MinBWs records one for TE.
19121 Type *ScalarTy = TE.Scalars.front()->getType();
19122 auto It = MinBWs.find(&TE);
19123 if (It != MinBWs.end())
19124 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
19125 auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor());
19126 InstructionCost ExtCost =
19127 ::getScalarizationOverhead(*TTI, ScalarTy, cast<VectorType>(VecTy),
19128 DemandedElts, /*Insert=*/false,
19129 /*Extract=*/true, CostKind);
19130 if (ExtCost.isValid() && ExtCost != 0) {
19131 if (!Scale)
19132 Scale = getScaleToLoopIterations(TE);
19133 ExtCost *= Scale;
19134 EntryToScale.try_emplace(&TE, Scale);
19135 }
19136 ExtractCosts.try_emplace(&TE, ExtCost);
19137 }
19138 }
19139 }
// NOTE(review): listing line 19142 (the remainder of the condition below) is
// missing from this extract.
19140 // Bail out if the cost threshold is negative and cost already below it.
19141 if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
19143 return Cost;
19144 // The narrow non-profitable tree in loop? Skip, may cause regressions.
19145 constexpr unsigned PartLimit = 2;
19146 const unsigned Sz =
19147 getVectorElementSize(VectorizableTree.front()->Scalars.front());
19148 const unsigned MinVF = getMinVF(Sz);
19149 if (Cost >= -SLPCostThreshold &&
19150 VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
19151 (!VectorizableTree.front()->hasState() ||
19152 (VectorizableTree.front()->getOpcode() != Instruction::Store &&
19153 LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
19154 return Cost;
19155 // Store the cost + external uses estimation as the first element of the
19156 // tuple, just the cost as the second element of the tuple. Required to return
19157 // correct cost estimation for the tree, extracts are calculated separately.
19158 // Extracts, calculated here, are just quick estimations.
// The third tuple element collects the indices of the nodes whose costs were
// added to this node's subtree (see UpdateParentNodes below).
// NOTE(review): listing line 19159 (the start of this declaration) is missing
// from this extract.
19160 std::tuple<InstructionCost, InstructionCost, SmallVector<unsigned>>>
19161 SubtreeCosts(VectorizableTree.size());
// Walks up the UserTreeIndex chain starting at UserTE and adds TE's costs to
// each ancestor's subtree totals. VisitedUser stops the walk at the first
// (TE, ancestor) pair that was already processed. With AddToList set, TE's
// index is also recorded in each ancestor's node list.
// NOTE(review): listing line 19164 (presumably the TotalCost/Cost parameters
// used in the body) is missing from this extract.
19162 auto UpdateParentNodes =
19163 [&](const TreeEntry *UserTE, const TreeEntry *TE,
19165 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
19166 &VisitedUser,
19167 bool AddToList = true) {
19168 while (UserTE &&
19169 VisitedUser.insert(std::make_pair(TE, UserTE)).second) {
19170 std::get<0>(SubtreeCosts[UserTE->Idx]) += TotalCost;
19171 std::get<1>(SubtreeCosts[UserTE->Idx]) += Cost;
19172 if (AddToList)
19173 std::get<2>(SubtreeCosts[UserTE->Idx]).push_back(TE->Idx);
19174 UserTE = UserTE->UserTreeIndex.UserTE;
19175 }
19176 };
// Seed every node with its own cost (+ extract estimate) and propagate it to
// all of its ancestors.
19177 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
19178 TreeEntry &TE = *Ptr;
19179 InstructionCost C = NodesCosts.at(&TE);
19180 InstructionCost ExtractCost = ExtractCosts.lookup(&TE);
19181 std::get<0>(SubtreeCosts[TE.Idx]) += C + ExtractCost;
19182 std::get<1>(SubtreeCosts[TE.Idx]) += C;
19183 if (const TreeEntry *UserTE = TE.UserTreeIndex.UserTE) {
19184 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
19185 VisitedUser;
19186 UpdateParentNodes(UserTE, &TE, C + ExtractCost, C, VisitedUser);
19187 }
19188 }
// Gathered-loads nodes have no user entry, so charge their subtree costs to
// the gather nodes that contain their scalars (and to those nodes'
// ancestors). AddToList=false keeps them out of the ancestors' node lists.
19189 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4> Visited;
19190 for (TreeEntry *TE : GatheredLoadsNodes) {
19191 InstructionCost TotalCost = std::get<0>(SubtreeCosts[TE->Idx]);
19192 InstructionCost Cost = std::get<1>(SubtreeCosts[TE->Idx]);
19193 for (Value *V : TE->Scalars) {
19194 for (const TreeEntry *BVTE : ValueToGatherNodes.lookup(V))
19195 UpdateParentNodes(BVTE, TE, TotalCost, Cost, Visited,
19196 /*AddToList=*/false);
19197 }
19198 }
19199 Visited.clear();
19200 using CostIndicesTy =
19201 std::pair<TreeEntry *, std::tuple<InstructionCost, InstructionCost,
19202 SmallVector<unsigned>>>;
// Strict "less than" on the total subtree cost (first tuple element), with
// the entry index as tie-breaker. The loop below relies on top() being the
// entry with the greatest total cost - TODO confirm PriorityQueue follows the
// std::priority_queue convention.
19203 struct FirstGreater {
19204 bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
19205 return std::get<0>(LHS.second) < std::get<0>(RHS.second) ||
19206 (std::get<0>(LHS.second) == std::get<0>(RHS.second) &&
19207 LHS.first->Idx < RHS.first->Idx);
19208 }
19209 };
19210 PriorityQueue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
19211 Worklist;
// Queue every entry together with a copy of its accumulated subtree costs.
19212 for (const auto [Idx, P] : enumerate(SubtreeCosts))
19213 Worklist.emplace(VectorizableTree[Idx].get(), P);
19214
19215 // Narrow store trees with non-profitable immediate values - exit.
19216 if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
19217 VectorizableTree.front()->hasState() &&
19218 VectorizableTree.front()->getOpcode() == Instruction::Store &&
19219 (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
19220 return Cost;
19221
// Changed: at least one subtree was trimmed. PreferTrimmedTree: some trimmed
// subtree had a cost exactly equal to its gather cost.
19222 bool Changed = false;
19223 bool PreferTrimmedTree = false;
// Visit entries while the top one still has a positive total subtree cost.
19224 while (!Worklist.empty() && std::get<0>(Worklist.top().second) > 0) {
19225 TreeEntry *TE = Worklist.top().first;
// Gather nodes, the root, already deleted nodes and struct-typed nodes are
// never trimming candidates.
19226 if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE) ||
19227 isa<StructType>(getValueType(TE->Scalars.front())) ||
19228 // Exit early if the parent node is split node and any of scalars is
19229 // used in other split nodes.
19230 (TE->UserTreeIndex &&
19231 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
19232 any_of(TE->Scalars, [&](Value *V) {
19233 ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
19234 return Entries.size() > 1;
19235 }))) {
19236 Worklist.pop();
19237 continue;
19238 }
19239 // Skip inverted compare nodes, they cannot be transformed to buildvectors.
19240 if (TE->State == TreeEntry::Vectorize && !TE->isAltShuffle() &&
19241 (TE->getOpcode() == Instruction::ICmp ||
19242 TE->getOpcode() == Instruction::FCmp) &&
19243 any_of(TE->Scalars, [&](Value *V) {
19244 auto *I = dyn_cast<CmpInst>(V);
19245 if (!I)
19246 return false;
19247 return I->getPredicate() !=
19248 cast<CmpInst>(TE->getMainOp())->getPredicate();
19249 })) {
19250 Worklist.pop();
19251 continue;
19252 }
19253
19254 // Calculate the gather cost of the root node.
19255 InstructionCost TotalSubtreeCost = std::get<0>(Worklist.top().second);
19256 InstructionCost SubtreeCost = std::get<1>(Worklist.top().second);
// A subtree cheaper than one unit per scalar is not considered for trimming.
19257 if (TotalSubtreeCost < TE->Scalars.size()) {
19258 Worklist.pop();
19259 continue;
19260 }
// For descendants that were already turned into gathers, swap their recorded
// subtree cost for the gather cost stored at transformation time, then
// re-check the threshold.
19261 if (!TransformedToGatherNodes.empty()) {
19262 for (unsigned Idx : std::get<2>(Worklist.top().second)) {
19263 auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].get());
19264 if (It != TransformedToGatherNodes.end()) {
19265 TotalSubtreeCost -= std::get<0>(SubtreeCosts[Idx]);
19266 SubtreeCost -= std::get<1>(SubtreeCosts[Idx]);
19267 TotalSubtreeCost += It->second;
19268 SubtreeCost += It->second;
19269 }
19270 }
19271 }
19272 if (TotalSubtreeCost < 0 || TotalSubtreeCost < TE->Scalars.size()) {
19273 Worklist.pop();
19274 continue;
19275 }
// Estimate the cost of building this node as a gather instead: an insert for
// every non-constant scalar, using the narrowed element type if MinBWs has
// one for TE.
19276 const unsigned EntryVF = TE->getVectorFactor();
19277 APInt DemandedElts = APInt::getZero(EntryVF);
19278 for (auto [Idx, V] : enumerate(TE->Scalars)) {
19279 if (!isConstant(V))
19280 DemandedElts.setBit(Idx);
19281 }
19282
19283 Type *ScalarTy = getValueType(TE->Scalars.front());
19284 auto It = MinBWs.find(TE);
19285 if (It != MinBWs.end())
19286 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
19287 auto *VecTy = getWidenedType(ScalarTy, EntryVF);
// NOTE(review): listing line 19288 (presumably the declaration of GatherCost
// that the arguments below belong to) is missing from this extract.
19289 *TTI, ScalarTy, cast<VectorType>(VecTy), DemandedElts,
19290 /*Insert=*/true, /*Extract=*/false, CostKind);
// Compose the reorder and reuse masks; a non-identity result adds the cost of
// a single-source permute to the gather estimate.
19291 SmallVector<int> Mask;
19292 if (!TE->ReorderIndices.empty() &&
19293 TE->State != TreeEntry::CompressVectorize &&
19294 (TE->State != TreeEntry::StridedVectorize ||
19295 !isReverseOrder(TE->ReorderIndices))) {
19296 SmallVector<int> NewMask;
19297 if (TE->getOpcode() == Instruction::Store) {
19298 // For stores the order is actually a mask.
19299 NewMask.resize(TE->ReorderIndices.size());
19300 copy(TE->ReorderIndices, NewMask.begin());
19301 } else {
19302 inversePermutation(TE->ReorderIndices, NewMask);
19303 }
19304 ::addMask(Mask, NewMask);
19305 }
19306 if (!TE->ReuseShuffleIndices.empty())
19307 ::addMask(Mask, TE->ReuseShuffleIndices);
19308 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF))
19309 GatherCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
19310 cast<VectorType>(VecTy), Mask);
19311 // If all scalars are reused in gather node(s) or other vector nodes, there
19312 // might be extra cost for inserting them.
19313 if ((!TE->hasState() || !TE->isAltShuffle()) &&
19314 all_of(TE->Scalars, [&](Value *V) {
19315 return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
19316 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
19317 }))
19318 GatherCost *= 2;
19319 // Erase subtree if it is non-profitable.
19320 ArrayRef<unsigned> Nodes = std::get<2>(Worklist.top().second);
19321 // Prefer trimming equal-cost alternate-shuffle subtrees rooted at binary
19322 // ops: alt-shuffles introduce runtime shuffle overhead that the cost model
19323 // may underestimate. Skip if the subtree contains ExtractElement nodes,
19324 // since those operate on already-materialized vectors where the cost model
19325 // is more accurate.
19326 auto IsEqualCostAltShuffleToTrim = [&]() {
19327 return TotalSubtreeCost == GatherCost && TE->hasState() &&
19328 TE->isAltShuffle() && Instruction::isBinaryOp(TE->getOpcode()) &&
19329 none_of(Nodes, [&](unsigned Idx) {
19330 return VectorizableTree[Idx]->hasState() &&
19331 VectorizableTree[Idx]->getOpcode() ==
19332 Instruction::ExtractElement;
19333 });
19334 };
19335 // Non-power-of-2 entries may have inflated costs - add a margin of 1
19336 // before trimming to avoid over-pruning.
19337 bool HasNonPowerOf2 = any_of(Nodes, [&](unsigned Idx) {
19338 return !has_single_bit(VectorizableTree[Idx]->Scalars.size());
19339 });
19340 InstructionCost TrimMargin = HasNonPowerOf2 ? 1 : 0;
19341 if (TotalSubtreeCost > GatherCost + TrimMargin ||
19342 IsEqualCostAltShuffleToTrim()) {
19343 PreferTrimmedTree |= TotalSubtreeCost == GatherCost;
19344 // If the remaining tree is just a buildvector - exit, it will cause
19345 // endless attempts to vectorize. When the tree is already profitable,
19346 // skip trimming this node and let the post-loop logic (including
19347 // gathered loads processing) decide.
19348 if (VectorizableTree.front()->hasState() &&
19349 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
19350 TE->Idx == 1) {
19351 if (Cost < -SLPCostThreshold) {
19352 LLVM_DEBUG(dbgs() << "SLP: Skipping trim of node " << TE->Idx
19353 << " - tree already profitable with cost " << Cost
19354 << ".\n");
19355 Worklist.pop();
19356 continue;
19357 }
19358 return InstructionCost::getInvalid();
19359 }
19360
19361 LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
19362 << TE->Idx << " with cost "
19363 << std::get<0>(Worklist.top().second)
19364 << " and gather cost " << GatherCost << ".\n");
// A node with a user is recorded as transformed to a gather with the
// estimated cost; a node without a user is deleted outright. Every node
// listed in this subtree is deleted. Cached costs of all of them are dropped
// so that they are recomputed after the loop.
19365 if (TE->UserTreeIndex) {
19366 TransformedToGatherNodes.try_emplace(TE, GatherCost);
19367 NodesCosts.erase(TE);
19368 } else {
19369 DeletedNodes.insert(TE);
19370 TransformedToGatherNodes.erase(TE);
19371 NodesCosts.erase(TE);
19372 }
19373 for (unsigned Idx : Nodes) {
19374 TreeEntry &ChildTE = *VectorizableTree[Idx];
19375 DeletedNodes.insert(&ChildTE);
19376 TransformedToGatherNodes.erase(&ChildTE);
19377 NodesCosts.erase(&ChildTE);
19378 }
19379 Changed = true;
19380 }
19381 Worklist.pop();
19382 }
// Nothing was trimmed: return the root's accumulated cost without the extract
// estimates.
19383 if (!Changed)
19384 return std::get<1>(SubtreeCosts.front());
19385
19386 SmallPtrSet<TreeEntry *, 4> GatheredLoadsToDelete;
19387 InstructionCost LoadsExtractsCost = 0;
19388 // Check if all loads of gathered loads nodes are marked for deletion. In this
19389 // case the whole gathered loads subtree must be deleted.
19390 // Also, try to account for extracts, which might be required, if only part of
19391 // gathered load must be vectorized. Keep partially vectorized nodes, if
19392 // extracts are cheaper than gathers.
19393 for (TreeEntry *TE : GatheredLoadsNodes) {
19394 if (DeletedNodes.contains(TE) || TransformedToGatherNodes.contains(TE))
19395 continue;
// Tentatively mark for deletion; the mark is removed below if extracting the
// still-needed lanes is cheaper than rebuilding them.
19396 GatheredLoadsToDelete.insert(TE);
19397 APInt DemandedElts = APInt::getZero(TE->getVectorFactor());
19398 // All loads are removed from gathered? Need to delete the subtree.
19399 SmallDenseMap<const TreeEntry *, SmallVector<Value *>> ValuesToInsert;
19400 for (Value *V : TE->Scalars) {
19401 unsigned Pos = TE->findLaneForValue(V);
19402 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
19403 if (DeletedNodes.contains(BVE))
19404 continue;
19405 DemandedElts.setBit(Pos);
19406 ValuesToInsert.try_emplace(BVE).first->second.push_back(V);
19407 }
19408 }
19409 if (!DemandedElts.isZero()) {
19410 Type *ScalarTy = TE->Scalars.front()->getType();
19411 auto It = MinBWs.find(TE);
19412 if (It != MinBWs.end())
19413 ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
19414 auto *VecTy = getWidenedType(ScalarTy, TE->getVectorFactor());
// NOTE(review): listing line 19415 (presumably the declaration of
// ExtractsCost that the arguments below belong to) is missing from this
// extract.
19416 *TTI, ScalarTy, cast<VectorType>(VecTy), DemandedElts,
19417 /*Insert=*/false, /*Extract=*/true, CostKind);
19418 InstructionCost BVCost = 0;
19419 for (const auto &[BVE, Values] : ValuesToInsert) {
19420 APInt BVDemandedElts = APInt::getZero(BVE->getVectorFactor());
19421 SmallVector<Value *> BVValues(BVE->getVectorFactor(),
19422 PoisonValue::get(ScalarTy));
19423 for (Value *V : Values) {
19424 unsigned Pos = BVE->findLaneForValue(V);
19425 BVValues[Pos] = V;
19426 BVDemandedElts.setBit(Pos);
19427 }
19428 auto *BVVecTy = getWidenedType(ScalarTy, BVE->getVectorFactor());
// NOTE(review): listing line 19429 (presumably the BVCost update that the
// arguments below belong to) is missing from this extract.
19430 *TTI, ScalarTy, cast<VectorType>(BVVecTy), BVDemandedElts,
19431 /*Insert=*/true, /*Extract=*/false, CostKind,
19432 BVDemandedElts.isAllOnes(), BVValues);
19433 }
// Extracts are cheaper than re-inserting the values: keep the node and charge
// the extract cost; otherwise charge the buildvector cost and drop the node.
19434 if (ExtractsCost < BVCost) {
19435 LoadsExtractsCost += ExtractsCost;
19436 GatheredLoadsToDelete.erase(TE);
19437 continue;
19438 }
19439 LoadsExtractsCost += BVCost;
19440 }
19441 NodesCosts.erase(TE);
19442 }
19443
19444 // Delete all subtrees rooted at gathered loads nodes.
19445 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19446 if (TE->UserTreeIndex &&
19447 GatheredLoadsToDelete.contains(TE->UserTreeIndex.UserTE)) {
19448 DeletedNodes.insert(TE.get());
19449 NodesCosts.erase(TE.get());
19450 GatheredLoadsToDelete.insert(TE.get());
19451 }
19452 if (GatheredLoadsToDelete.contains(TE.get()))
19453 DeletedNodes.insert(TE.get());
19454 }
19455
// Recompute the cost of every surviving node whose cached cost was dropped
// above, reusing its previously recorded scale when there is one.
19456 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19457 if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(TE.get())) {
19458 assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
19459 continue;
19460 }
19461 if (DeletedNodes.contains(TE.get()))
19462 continue;
19463 if (!NodesCosts.contains(TE.get())) {
// NOTE(review): listing line 19464 (presumably the declaration of C that the
// call below initializes) is missing from this extract.
19465 getEntryCost(TE.get(), VectorizedVals, CheckedExtracts);
19466 if (!C.isValid() || C == 0) {
19467 NodesCosts.try_emplace(TE.get(), C);
19468 continue;
19469 }
19470 uint64_t Scale = EntryToScale.lookup(TE.get());
19471 if (!Scale) {
19472 const bool IsGatherLike =
19473 TE->isGather() || TE->State == TreeEntry::SplitVectorize;
19474 Scale = IsGatherLike ? getGatherNodeEffectiveScale(*TE.get())
19475 : getScaleToLoopIterations(*TE.get());
19476 }
19477 C *= Scale;
19478 NodesCosts.try_emplace(TE.get(), C);
19479 }
19480 }
19481
19482 LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
19483 InstructionCost NewCost = 0;
19484 for (const auto &P : NodesCosts) {
19485 NewCost += P.second;
19486 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
19487 << shortBundleName(P.first->Scalars, P.first->Idx)
19488 << ".\n"
19489 << "SLP: Current total cost = " << NewCost << "\n");
19490 }
// Keep the trimmed tree only if it is cheaper than the original, or equally
// expensive while PreferTrimmedTree is set. Otherwise undo all trimming marks
// and fall back to the original cost.
19491 if (NewCost + LoadsExtractsCost > Cost ||
19492 (!PreferTrimmedTree && NewCost + LoadsExtractsCost == Cost)) {
19493 DeletedNodes.clear();
19494 TransformedToGatherNodes.clear();
19495 NewCost = Cost;
19496 } else {
19497 // If the remaining tree is just a buildvector - exit, it will cause
19498 // endless attempts to vectorize.
19499 if (VectorizableTree.size() >= 2 && VectorizableTree.front()->hasState() &&
19500 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
19501 TransformedToGatherNodes.contains(VectorizableTree[1].get()))
19502 return InstructionCost::getInvalid();
// Same check when node 1 is a vectorized zext/sext/trunc and node 2 was
// turned into a gather.
19503 if (VectorizableTree.size() >= 3 && VectorizableTree.front()->hasState() &&
19504 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
19505 VectorizableTree[1]->hasState() &&
19506 VectorizableTree[1]->State == TreeEntry::Vectorize &&
19507 (VectorizableTree[1]->getOpcode() == Instruction::ZExt ||
19508 VectorizableTree[1]->getOpcode() == Instruction::SExt ||
19509 VectorizableTree[1]->getOpcode() == Instruction::Trunc) &&
19510 TransformedToGatherNodes.contains(VectorizableTree[2].get()))
19511 return InstructionCost::getInvalid();
19512 }
19513 return NewCost;
19514}
19515
19516namespace {
19517/// Bookkeeping for one buildvector (insertelement) sequence whose inserted
19518/// scalars are reused from other tree entries.
///
/// \tparam T Key identifying a parent vector in ValueMasks. The use visible
/// in this file instantiates it with `const TreeEntry *`.
19519template <typename T> struct ShuffledInsertData {
19520 /// List of insertelements to be replaced by shuffles.
19521 SmallVector<InsertElementInst *> InsertElements;
19522 /// The parent vectors and the shuffle mask for the given list of inserts.
/// In the visible use, the mask is indexed by the insertelement's element
/// index and holds the lane taken from the parent vector; unused positions
/// are PoisonMaskElem.
19523 MapVector<T, SmallVector<int>> ValueMasks;
19524};
19525} // namespace
19526
19528 ArrayRef<Value *> VectorizedVals,
19529 InstructionCost ReductionCost,
19530 Instruction *RdxRoot) {
19531 // Reject vectorization if the vector code would produce more instructions
19532 // than the scalar code. The cost model may underestimate overhead from
19533 // shuffles, inserts, and extracts.
19534 // FIXME: remove this as soon as correct fractional model is landed for all
19535 // targets.
19536 if (SLPInstCountCheck && TTI->preferSLPInstCountCheck() &&
19537 VectorizableTree.front()->getVectorFactor() == 2 &&
19538 SLPCostThreshold == 0 &&
19539 (!SLPReVec ||
19541 VectorizableTree.front()->Scalars.front()->getType()))) {
19542 unsigned NumScalar = getNumScalarInsts();
19543 unsigned NumVector = getNumVectorInsts();
19544 LLVM_DEBUG(dbgs() << "SLP: Inst count check: vector=" << NumVector
19545 << " scalar=" << NumScalar << "\n");
19546 if (NumVector > NumScalar) {
19547 LLVM_DEBUG(dbgs() << "SLP: Rejecting tree: vector inst count "
19548 << NumVector << " > scalar inst count " << NumScalar
19549 << ".\n");
19551 }
19552 }
19553 InstructionCost Cost = TreeCost;
19554
19556 EntryToScale;
19557 auto ScaleCost = [&](InstructionCost C, const TreeEntry &TE,
19558 Value *Scalar = nullptr, Instruction *U = nullptr) {
19559 if (!C.isValid() || C == 0)
19560 return C;
19561 uint64_t &Scale =
19562 EntryToScale.try_emplace(std::make_tuple(&TE, Scalar, U), 0)
19563 .first->getSecond();
19564 if (!Scale)
19565 Scale = getScaleToLoopIterations(TE, Scalar, U);
19566 LLVM_DEBUG(dbgs() << "Scale " << Scale << " For entry " << TE.Idx << "\n");
19567 return C * Scale;
19568 };
19569 Instruction *ReductionRoot = RdxRoot;
19570 if (UserIgnoreList) {
19571 // Scale reduction cost to the factor of the loop nest trip count.
19572 ReductionCost = ScaleCost(ReductionCost, *VectorizableTree.front().get(),
19573 /*Scalar=*/nullptr, ReductionRoot);
19574 }
19575
19576 // Add the cost for reduction.
19577 Cost += ReductionCost;
19578
19579 if (any_of(ExternalUses, [](const ExternalUser &EU) {
19580 return isa<StructType>(EU.Scalar->getType()) &&
19582 }))
19584
19585 // Skip trees, which are non-profitable even if there are insertelements with
19586 // external uses.
19587 constexpr unsigned CostLimit = 100;
19588 if (Cost >= -SLPCostThreshold + CostLimit &&
19589 (VectorizableTree.size() - DeletedNodes.size()) *
19590 VectorizableTree.front()->getVectorFactor() <
19591 CostLimit)
19592 return Cost;
19593
19594 if (Cost >= -SLPCostThreshold &&
19595 none_of(ExternalUses, [](const ExternalUser &EU) {
19596 return isa_and_nonnull<InsertElementInst>(EU.User);
19597 }))
19598 return Cost;
19599
19601 ExtractCostCalculated;
19602 InstructionCost ExtractCost = 0;
19604 SmallVector<APInt> DemandedElts;
19605 SmallDenseSet<Value *, 4> UsedInserts;
19607 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
19609 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
19610 // Keep track {Scalar, Index, User} tuple.
19611 // On AArch64, this helps in fusing a mov instruction, associated with
19612 // extractelement, with fmul in the backend so that extractelement is free.
19614 bool AllUsersGEPSWithStoresLoads = true;
19615 SmallBitVector UsedLanes(VectorizableTree.front()->getVectorFactor());
19617 Type *UserScalarTy = nullptr;
19618 for (ExternalUser &EU : ExternalUses) {
19619 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
19620 if (EU.E.Idx == 0) {
19621 UsedLanes.set(EU.Lane);
19623 if (User && User->hasOneUse() &&
19625 Type *LocalTy = getValueType(User->user_back());
19626 if (!UserScalarTy && !isa<ScalableVectorType>(LocalTy)) {
19627 UserScalarTy = LocalTy;
19628 } else if (UserScalarTy != LocalTy) {
19629 AllUsersGEPSWithStoresLoads = false;
19630 break;
19631 }
19632 Pointers.push_back(User);
19633 } else {
19634 AllUsersGEPSWithStoresLoads = false;
19635 break;
19636 }
19637 }
19638 }
19639 AllUsersGEPSWithStoresLoads &= UsedLanes.all();
19640
19641 // Pre-pass: for each externally-used scalar, find the basic block at which
19642 // the extractelement will be placed by codegen. This mirrors what
19643 // vectorizeTree does: the extract is placed at the nearest common dominator
19644 // of all effective use sites. For a non-PHI user the effective site is the
19645 // user's own block; for a PHI user it is the incoming block for the scalar
19646 // operand (the predecessor of the PHI on the edge that carries the scalar).
19647 // Using the NCD of all effective sites rather than the first-encountered
19648 // user's block makes the extract-cost scale order-independent and correct
19649 // even when users live in different loop nests.
19650 SmallDenseMap<Value *, BasicBlock *> ScalarToExtractBlock;
19651 for (const ExternalUser &EU : ExternalUses) {
19652 if (!EU.User || isa<InsertElementInst>(EU.User))
19653 continue;
19654 if (EphValues.count(EU.User))
19655 continue;
19656 BasicBlock *UserParent = cast<Instruction>(EU.User)->getParent();
19657 if (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
19659 continue;
19660 BasicBlock *UseBlock = nullptr;
19661 if (auto *PHI = dyn_cast<PHINode>(EU.User)) {
19662 // When the PHI itself is inside a loop, the extractelement is placed
19663 // in the incoming block for the scalar operand (the predecessor edge),
19664 // not in the PHI's own block. This applies to LCSSA phis at an inner-
19665 // loop exit that are still inside an outer loop: the incoming block is
19666 // in the inner loop while the PHI block is in the outer loop.
19667 // When the PHI is outside all loops (a true loop-exit phi), codegen
19668 // uses a vector phi at the exit block and the extract stays there
19669 // (scale = 1), so we keep the PHI's own block as the effective site.
19670 if (LI->getLoopFor(PHI->getParent())) {
19671 for (unsigned Idx : seq<unsigned>(PHI->getNumIncomingValues())) {
19672 if (PHI->getIncomingValue(Idx) != EU.Scalar)
19673 continue;
19674 BasicBlock *InBB = PHI->getIncomingBlock(Idx);
19675 UseBlock =
19676 UseBlock ? DT->findNearestCommonDominator(UseBlock, InBB) : InBB;
19677 }
19678 }
19679 if (!UseBlock)
19680 UseBlock = cast<Instruction>(EU.User)->getParent();
19681 } else {
19682 UseBlock = cast<Instruction>(EU.User)->getParent();
19683 }
19684 auto [It, Inserted] = ScalarToExtractBlock.try_emplace(EU.Scalar, UseBlock);
19685 if (!Inserted && It->second && UseBlock)
19686 It->second = DT->findNearestCommonDominator(It->second, UseBlock);
19687 }
19688
19689 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
19690 for (ExternalUser &EU : ExternalUses) {
19691 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
19692 << EU.E.Idx << " in lane " << EU.Lane << "\n");
19693 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
19694 else dbgs() << " User: nullptr\n");
19695 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
19696
19697 // Uses by ephemeral values are free (because the ephemeral value will be
19698 // removed prior to code generation, and so the extraction will be
19699 // removed as well).
19700 if (EphValues.count(EU.User))
19701 continue;
19702
19703 // Check if the scalar for the given user or all users is accounted already.
19704 if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
19705 (EU.User &&
19706 CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
19707 continue;
19708
19709 // Used in unreachable blocks or in EH pads (rarely executed) or is
19710 // terminated with unreachable instruction.
19711 if (BasicBlock *UserParent =
19712 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
19713 UserParent &&
19714 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
19715 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
19716 continue;
19717
19718 // No extract cost for vector "scalar" if REVEC is disabled.
19719 if (isVectorizedTy(EU.Scalar->getType()) &&
19720 (!SLPReVec ||
19721 (EU.E.hasState() && EU.E.getOpcode() == Instruction::InsertElement)))
19722 continue;
19723
19724 // If found user is an insertelement, do not calculate extract cost but try
19725 // to detect it as a final shuffled/identity match.
19726 // TODO: what if a user is insertvalue when REVEC is enabled?
19727 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
19728 VU && VU->getOperand(1) == EU.Scalar) {
19729 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
19730 if (!UsedInserts.insert(VU).second)
19731 continue;
19732 std::optional<unsigned> InsertIdx = getElementIndex(VU);
19733 if (InsertIdx) {
19734 const TreeEntry *ScalarTE = &EU.E;
19735 auto *It = find_if(
19736 ShuffledInserts,
19737 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
19738 // Checks if 2 insertelements are from the same buildvector.
19739 InsertElementInst *VecInsert = Data.InsertElements.front();
19741 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
19742 Value *Op0 = II->getOperand(0);
19743 if (isVectorized(II) && !isVectorized(Op0))
19744 return nullptr;
19745 return Op0;
19746 });
19747 });
19748 int VecId = -1;
19749 if (It == ShuffledInserts.end()) {
19750 auto &Data = ShuffledInserts.emplace_back();
19751 Data.InsertElements.emplace_back(VU);
19752 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
19753 VecId = ShuffledInserts.size() - 1;
19754 auto It = MinBWs.find(ScalarTE);
19755 if (It != MinBWs.end() &&
19756 VectorCasts
19757 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
19758 .second) {
19759 unsigned BWSz = It->second.first;
19760 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
19761 unsigned VecOpcode;
19762 if (DstBWSz < BWSz)
19763 VecOpcode = Instruction::Trunc;
19764 else
19765 VecOpcode =
19766 It->second.second ? Instruction::SExt : Instruction::ZExt;
19768 InstructionCost C = TTI->getCastInstrCost(
19769 VecOpcode, FTy,
19770 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
19771 FTy->getNumElements()),
19773 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
19774 << " for extending externally used vector with "
19775 "non-equal minimum bitwidth.\n");
19776 Cost += C;
19777 }
19778 } else {
19779 if (isFirstInsertElement(VU, It->InsertElements.front()))
19780 It->InsertElements.front() = VU;
19781 VecId = std::distance(ShuffledInserts.begin(), It);
19782 }
19783 int InIdx = *InsertIdx;
19784 SmallVectorImpl<int> &Mask =
19785 ShuffledInserts[VecId].ValueMasks[ScalarTE];
19786 if (Mask.empty())
19787 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
19788 Mask[InIdx] = EU.Lane;
19789 DemandedElts[VecId].setBit(InIdx);
19790 continue;
19791 }
19792 }
19793 }
19794
19796 // If we plan to rewrite the tree in a smaller type, we will need to sign
19797 // extend the extracted value back to the original type. Here, we account
19798 // for the extract and the added cost of the sign extend if needed.
19799 InstructionCost ExtraCost = TTI::TCC_Free;
19800 auto *ScalarTy = EU.Scalar->getType();
19801 const unsigned BundleWidth = EU.E.getVectorFactor();
19802 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
19803 auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
19804 const TreeEntry *Entry = &EU.E;
19805 auto It = MinBWs.find(Entry);
19807 if (isa<StructType>(VecTy)) {
19808 assert(EU.User && "Expected user for struct extract");
19809 const auto *EV = cast<ExtractValueInst>(EU.User);
19810 Indices.assign(EV->getIndices());
19811 }
19812 // We only add extract cost once for the same scalar and struct field.
19813 auto ExtractKey = std::make_pair(EU.Scalar, Indices);
19814 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
19815 !ExtractCostCalculated.insert(ExtractKey).second)
19816 continue;
19817 if (It != MinBWs.end()) {
19818 Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
19819 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
19820 MinTy = getWidenedType(MinTy, VecTy->getNumElements());
19821 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
19822 ? Instruction::ZExt
19823 : Instruction::SExt;
19824 VecTy = getWidenedType(MinTy, BundleWidth);
19825 ExtraCost = getExtractWithExtendCost(*TTI, Extend, ScalarTy,
19826 cast<VectorType>(VecTy), EU.Lane);
19827 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
19828 << ExtraCost << "\n");
19829 } else {
19830 Type *ExtractTy = VecTy;
19831 if (auto *ST = dyn_cast<StructType>(VecTy)) {
19832 ExtractTy = ExtractValueInst::getIndexedType(ST, Indices);
19833 }
19834 ExtraCost = getVectorInstrCost(
19835 *TTI, ScalarTy, Instruction::ExtractElement, ExtractTy, CostKind,
19836 EU.Lane, EU.Scalar, ScalarUserAndIdx);
19837 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
19838 << *VecTy << ": " << ExtraCost << "\n");
19839 }
19840 // Leave the scalar instructions as is if they are cheaper than extracts.
19841 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
19842 Entry->getOpcode() == Instruction::Load) {
19843 // Checks if the user of the external scalar is phi in loop body.
19844 auto IsPhiInLoop = [&](const ExternalUser &U) {
19845 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
19846 auto *I = cast<Instruction>(U.Scalar);
19847 const Loop *L = LI->getLoopFor(Phi->getParent());
19848 return L && (Phi->getParent() == I->getParent() ||
19849 L == LI->getLoopFor(I->getParent()));
19850 }
19851 return false;
19852 };
19853 if (!ValueToExtUses) {
19854 ValueToExtUses.emplace();
19855 for (const auto &P : enumerate(ExternalUses)) {
19856 // Ignore phis in loops.
19857 if (IsPhiInLoop(P.value()))
19858 continue;
19859
19860 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
19861 }
19862 }
19863 // Can use original instruction, if no operands vectorized or they are
19864 // marked as externally used already.
19865 auto *Inst = cast<Instruction>(EU.Scalar);
19866 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
19867 auto OperandIsScalar = [&](Value *V) {
19868 if (!isVectorized(V)) {
19869 // Some extractelements might be not vectorized, but
19870 // transformed into shuffle and removed from the function,
19871 // consider it here.
19872 if (auto *EE = dyn_cast<ExtractElementInst>(V))
19873 return !EE->hasOneUse() || !MustGather.contains(EE);
19874 return true;
19875 }
19876 if (isa<StructType>(V->getType()))
19877 return false;
19878 return ValueToExtUses->contains(V);
19879 };
19880 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
19881 bool CanBeUsedAsScalarCast = false;
19882 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
19883 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
19884 Op && all_of(Op->operands(), OperandIsScalar)) {
19885 InstructionCost OpCost =
19886 (isVectorized(Op) && !ValueToExtUses->contains(Op))
19887 ? TTI->getInstructionCost(Op, CostKind)
19888 : 0;
19889 if (ScalarCost + OpCost <= ExtraCost) {
19890 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
19891 ScalarCost += OpCost;
19892 }
19893 }
19894 }
19895 if (CanBeUsedAsScalar) {
19896 bool KeepScalar = ScalarCost <= ExtraCost;
19897 // Try to keep original scalar if the user is the phi node from the same
19898 // block as the root phis, currently vectorized. It allows to keep
19899 // better ordering info of PHIs, being vectorized currently.
19900 bool IsProfitablePHIUser =
19901 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
19902 VectorizableTree.front()->Scalars.size() > 2)) &&
19903 VectorizableTree.front()->hasState() &&
19904 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
19905 !Inst->hasNUsesOrMore(UsesLimit) &&
19906 none_of(Inst->users(),
19907 [&](User *U) {
19908 auto *PHIUser = dyn_cast<PHINode>(U);
19909 return (!PHIUser ||
19910 PHIUser->getParent() != VectorizableTree.front()
19911 ->getMainOp()
19912 ->getParent()) &&
19913 !isVectorized(U);
19914 }) &&
19915 count_if(Entry->Scalars, [&](Value *V) {
19916 return ValueToExtUses->contains(V);
19917 }) <= 2;
19918 if (IsProfitablePHIUser) {
19919 KeepScalar = true;
19920 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
19921 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
19922 (!GatheredLoadsEntriesFirst.has_value() ||
19923 Entry->Idx < *GatheredLoadsEntriesFirst)) {
19924 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
19925 return ValueToExtUses->contains(V);
19926 });
19927 auto It = ExtractsCount.find(Entry);
19928 if (It != ExtractsCount.end()) {
19929 assert(ScalarUsesCount >= It->getSecond().size() &&
19930 "Expected total number of external uses not less than "
19931 "number of scalar uses.");
19932 ScalarUsesCount -= It->getSecond().size();
19933 }
19934 // Keep original scalar if number of externally used instructions in
19935 // the same entry is not power of 2. It may help to do some extra
19936 // vectorization for now.
19937 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
19938 }
19939 if (KeepScalar) {
19940 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
19941 for (Value *V : Inst->operands()) {
19942 // Struct operands cannot be rebuilt by the !User extraction
19943 // path (it has no insertvalue chain), so leave their existing
19944 // ExtractValueInst user in place.
19945 if (isa<StructType>(V->getType()))
19946 continue;
19947 auto It = ValueToExtUses->find(V);
19948 if (It != ValueToExtUses->end()) {
19949 // Replace all uses to avoid compiler crash.
19950 ExternalUses[It->second].User = nullptr;
19951 }
19952 }
19953 ExtraCost = ScalarCost;
19954 if (!IsPhiInLoop(EU))
19955 ExtractsCount[Entry].insert(Inst);
19956 if (CanBeUsedAsScalarCast) {
19957 ScalarOpsFromCasts.insert(Inst->getOperand(0));
19958 // Update the users of the operands of the cast operand to avoid
19959 // compiler crash.
19960 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
19961 for (Value *V : IOp->operands()) {
19962 if (isa<StructType>(V->getType()))
19963 continue;
19964 auto It = ValueToExtUses->find(V);
19965 if (It != ValueToExtUses->end()) {
19966 // Replace all uses to avoid compiler crash.
19967 ExternalUses[It->second].User = nullptr;
19968 }
19969 }
19970 }
19971 }
19972 }
19973 }
19974 }
19975
19976 // Scale the extract cost by the execution frequency of the block where
19977 // codegen will place the extractelement. That block is the nearest common
19978 // dominator of all effective use sites (precomputed in ScalarToExtractBlock
19979 // above), which is order-independent. For scalars kept as originals the
19980 // existing ScaleCost path (user-block based) remains correct, since the
19981 // scalar instruction executes at its definition site's frequency.
19982 if (!ExternalUsesAsOriginalScalar.contains(EU.Scalar)) {
19983 if (ExtraCost.isValid() && ExtraCost != 0) {
19984 if (!EU.User) {
19985 // No external user instruction is recorded (User == nullptr): the
19986 // scalar stays live in vectorized instructions or is used as an
19987 // extra arg, and is not present in ScalarToExtractBlock (the
19988 // pre-pass only records sites of real users). vectorizeTree() then
19989 // places the extractelement right after the vectorized instruction
19990 // (in the entry's block) and replaces the scalar uses with it, so
19991 // scale by the entry block's execution frequency to match that
19992 // placement.
19993 ExtraCost = ScaleCost(ExtraCost, *Entry, EU.Scalar, /*U=*/nullptr);
19994 } else {
19995 BasicBlock *ExtractBB = ScalarToExtractBlock.lookup(EU.Scalar);
19996 if (const Loop *L = ExtractBB ? LI->getLoopFor(ExtractBB) : nullptr) {
19997 uint64_t Scale = getLoopNestScale(
20000 << "SLP: Extract scale " << Scale << " (NCD block) for "
20001 << EU.Scalar->getNameOrAsOperand() << "\n");
20002 ExtraCost *= Scale;
20003 }
20004 }
20005 }
20006 } else {
20007 ExtraCost = ScaleCost(ExtraCost, *Entry, EU.Scalar,
20008 cast_or_null<Instruction>(EU.User));
20009 }
20010
20011 ExtractCost += ExtraCost;
20012 }
20013 // Charge the pointer-chain cost difference once for the root entry when
20014 // every external use of its scalars is a GEP feeding a single load/store
20015 // (see the detection loop above). Vectorizing the root in this pattern
20016 // forces lane extracts (or a vector GEP with unknown stride) to drive the
20017 // address computation, which is typically more expensive than keeping the
20018 // indices scalar in a unit-stride address chain. Add the delta once rather
20019 // than per external use.
20020 if (AllUsersGEPSWithStoresLoads && !Pointers.empty()) {
20021 const TreeEntry &RootEntry = *VectorizableTree.front();
20022 const bool AnyRootKeptAsScalar = any_of(RootEntry.Scalars, [&](Value *V) {
20023 return ExternalUsesAsOriginalScalar.contains(V);
20024 });
20025 const Value *CommonBase = nullptr;
20026 bool HaveCommonBase = true;
20027 for (const Value *P : Pointers) {
20028 const Value *Op = getUnderlyingObject(P);
20029 if (!CommonBase)
20030 CommonBase = Op;
20031 else if (CommonBase != Op) {
20032 HaveCommonBase = false;
20033 break;
20034 }
20035 }
20036 if (!AnyRootKeptAsScalar && HaveCommonBase) {
20038 auto *VecTy = getWidenedType(UserScalarTy, RootEntry.Scalars.size());
20039 InstructionCost ScalarGEPCost = TTI->getPointersChainCost(
20040 Pointers, CommonBase, TTI::PointersChainInfo::getUnitStride(),
20041 UserScalarTy, CostKind);
20042 InstructionCost VectorGEPCost = TTI->getPointersChainCost(
20043 Pointers, CommonBase, TTI::PointersChainInfo::getUnknownStride(),
20044 VecTy, CostKind);
20045 ExtractCost += ScaleCost(VectorGEPCost - ScalarGEPCost, RootEntry);
20046 }
20047 }
20048 // Insert externals for extract of operands of casts to be emitted as scalars
20049 // instead of extractelement.
20050 for (Value *V : ScalarOpsFromCasts) {
20051 ExternalUsesAsOriginalScalar.insert(V);
20052 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
20053 const auto *It = find_if_not(TEs, [&](TreeEntry *TE) {
20054 return TransformedToGatherNodes.contains(TE) ||
20055 DeletedNodes.contains(TE);
20056 });
20057 if (It != TEs.end()) {
20058 const TreeEntry *UserTE = *It;
20059 ExternalUses.emplace_back(V, nullptr, *UserTE,
20060 UserTE->findLaneForValue(V));
20061 }
20062 }
20063 }
20064 // Add reduced value cost, if resized.
20065 if (!VectorizedVals.empty()) {
20066 const TreeEntry &Root = *VectorizableTree.front();
20067 auto BWIt = MinBWs.find(&Root);
20068 if (BWIt != MinBWs.end()) {
20069 Type *DstTy = Root.Scalars.front()->getType();
20070 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
20071 unsigned SrcSz =
20072 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
20073 if (OriginalSz != SrcSz) {
20074 unsigned Opcode = Instruction::Trunc;
20075 if (OriginalSz > SrcSz)
20076 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
20077 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
20078 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
20079 assert(SLPReVec && "Only supported by REVEC.");
20080 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
20081 }
20082 InstructionCost CastCost =
20083 TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
20086 CastCost = ScaleCost(CastCost, Root, /*Scalar=*/nullptr, ReductionRoot);
20087 Cost += CastCost;
20088 }
20089 }
20090 }
20091
20092 // Buildvector with externally used scalars, which should remain as scalars,
20093 // should not be vectorized, the compiler may hang.
20094 if (SLPCostThreshold < 0 && VectorizableTree.size() > 1 &&
20095 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
20096 VectorizableTree[1]->hasState() &&
20097 VectorizableTree[1]->State == TreeEntry::Vectorize &&
20098 all_of(VectorizableTree[1]->Scalars, [&](Value *V) {
20099 return ExternalUsesAsOriginalScalar.contains(V);
20100 }))
20102
20103 Cost += ExtractCost;
20104 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
20105 bool ForSingleMask) {
20106 InstructionCost C = 0;
20107 unsigned VF = Mask.size();
20108 unsigned VecVF = TE->getVectorFactor();
20109 bool HasLargeIndex =
20110 any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
20111 if ((VF != VecVF && HasLargeIndex) ||
20113
20114 if (HasLargeIndex) {
20115 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
20116 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
20117 OrigMask.begin());
20120 cast<VectorType>(getWidenedType(TE->getMainOp()->getType(), VecVF)),
20121 OrigMask);
20122 LLVM_DEBUG(
20123 dbgs() << "SLP: Adding cost " << C
20124 << " for final shuffle of insertelement external users.\n";
20125 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
20126 Cost += C;
20127 return std::make_pair(TE, true);
20128 }
20129
20130 if (!ForSingleMask) {
20131 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
20132 for (unsigned I = 0; I < VF; ++I) {
20133 if (Mask[I] != PoisonMaskElem)
20134 ResizeMask[Mask[I]] = Mask[I];
20135 }
20136 if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
20139 TE->getMainOp()->getType(), VecVF)),
20140 ResizeMask);
20141 LLVM_DEBUG(
20142 dbgs() << "SLP: Adding cost " << C
20143 << " for final shuffle of insertelement external users.\n";
20144 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
20145
20146 Cost += C;
20147 }
20148 }
20149 return std::make_pair(TE, false);
20150 };
20151 // Calculate the cost of the reshuffled vectors, if any.
20152 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
20153 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
20154 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
20155 unsigned VF = 0;
20156 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
20158 assert((TEs.size() == 1 || TEs.size() == 2) &&
20159 "Expected exactly 1 or 2 tree entries.");
20160 if (TEs.size() == 1) {
20161 if (VF == 0)
20162 VF = TEs.front()->getVectorFactor();
20163 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
20164 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
20165 !all_of(enumerate(Mask), [=](const auto &Data) {
20166 return Data.value() == PoisonMaskElem ||
20167 (Data.index() < VF &&
20168 static_cast<int>(Data.index()) == Data.value());
20169 })) {
20171 cast<VectorType>(FTy), Mask);
20172 C = ScaleCost(C, *TEs.front());
20173 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
20174 << " for final shuffle of insertelement "
20175 "external users.\n";
20176 TEs.front()->dump();
20177 dbgs() << "SLP: Current total cost = " << Cost << "\n");
20178 Cost += C;
20179 }
20180 } else {
20181 if (VF == 0) {
20182 if (TEs.front() &&
20183 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
20184 VF = TEs.front()->getVectorFactor();
20185 else
20186 VF = Mask.size();
20187 }
20188 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
20190 cast<VectorType>(FTy), Mask);
20191 C = ScaleCost(C, *TEs.back());
20192 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
20193 << " for final shuffle of vector node and external "
20194 "insertelement users.\n";
20195 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
20196 dbgs() << "SLP: Current total cost = " << Cost << "\n");
20197 Cost += C;
20198 }
20199 VF = Mask.size();
20200 return TEs.back();
20201 };
20203 MutableArrayRef(Vector.data(), Vector.size()), Base,
20204 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
20205 EstimateShufflesCost);
20206 InstructionCost InsertCost = TTI->getScalarizationOverhead(
20208 ShuffledInserts[I].InsertElements.front()->getType()),
20209 DemandedElts[I],
20210 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
20211 Cost -= InsertCost;
20212 }
20213
20214 // Add the cost for reduced value resize (if required).
20215 if (ReductionBitWidth != 0) {
20216 assert(UserIgnoreList && "Expected reduction tree.");
20217 const TreeEntry &E = *VectorizableTree.front();
20218 auto It = MinBWs.find(&E);
20219 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
20220 unsigned SrcSize = It->second.first;
20221 unsigned DstSize = ReductionBitWidth;
20222 unsigned Opcode = Instruction::Trunc;
20223 if (SrcSize < DstSize) {
20224 bool IsArithmeticExtendedReduction =
20225 all_of(*UserIgnoreList, [](Value *V) {
20226 auto *I = cast<Instruction>(V);
20227 return is_contained({Instruction::Add, Instruction::FAdd,
20228 Instruction::Mul, Instruction::FMul,
20229 Instruction::And, Instruction::Or,
20230 Instruction::Xor},
20231 I->getOpcode());
20232 });
20233 if (IsArithmeticExtendedReduction)
20234 Opcode =
20235 Instruction::BitCast; // Handle it by getExtendedReductionCost
20236 else
20237 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
20238 }
20239 if (Opcode != Instruction::BitCast) {
20240 auto *SrcVecTy =
20241 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
20242 auto *DstVecTy =
20243 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
20244 TTI::CastContextHint CCH = getCastContextHint(E);
20245 switch (E.getOpcode()) {
20246 case Instruction::SExt:
20247 case Instruction::ZExt:
20248 case Instruction::Trunc: {
20249 const TreeEntry *OpTE = getOperandEntry(&E, 0);
20250 CCH = getCastContextHint(*OpTE);
20251 break;
20252 }
20253 default:
20254 break;
20255 }
20256 InstructionCost CastCost =
20257 TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
20259 CastCost = ScaleCost(CastCost, *VectorizableTree.front().get(),
20260 /*Scalar=*/nullptr, ReductionRoot);
20261 Cost += CastCost;
20262 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
20263 << " for final resize for reduction from " << SrcVecTy
20264 << " to " << DstVecTy << "\n";
20265 dbgs() << "SLP: Current total cost = " << Cost << "\n");
20266 }
20267 }
20268 }
20269
20270 std::optional<InstructionCost> SpillCost;
20271 if (Cost < -SLPCostThreshold) {
20272 SpillCost = getSpillCost();
20273 Cost += *SpillCost;
20274 }
20275#ifndef NDEBUG
20276 SmallString<256> Str;
20277 {
20278 raw_svector_ostream OS(Str);
20279 OS << "SLP: Spill Cost = ";
20280 if (SpillCost)
20281 OS << *SpillCost;
20282 else
20283 OS << "<skipped>";
20284 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n";
20285 if (ReductionRoot)
20286 OS << "SLP: Reduction Cost = " << ReductionCost << ".\n";
20287 OS << "SLP: Total Cost = " << Cost << ".\n";
20288 }
20289 LLVM_DEBUG(dbgs() << Str);
20290 if (ViewSLPTree)
20291 ViewGraph(this, "SLP" + F->getName(), false, Str);
20292#endif
20293
20294 return Cost;
20295}
20296
20297/// Tries to find extractelement instructions with constant indices from fixed
20298/// vector type and gather such instructions into a bunch, which highly likely
20299/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
20300/// successful, the matched scalars are replaced by poison values in \p VL for
20301/// future analysis.
20302std::optional<TTI::ShuffleKind>
20303BoUpSLP::tryToGatherSingleRegisterExtractElements(
20305 // Scan list of gathered scalars for extractelements that can be represented
20306 // as shuffles.
20308 SmallVector<int> UndefVectorExtracts;
20309 for (int I = 0, E = VL.size(); I < E; ++I) {
20310 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
20311 if (!EI) {
20312 if (isa<UndefValue>(VL[I]))
20313 UndefVectorExtracts.push_back(I);
20314 continue;
20315 }
20316 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
20317 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
20318 continue;
20319 std::optional<unsigned> Idx = getExtractIndex(EI);
20320 // Undefined index.
20321 if (!Idx) {
20322 UndefVectorExtracts.push_back(I);
20323 continue;
20324 }
20325 if (Idx >= VecTy->getNumElements()) {
20326 UndefVectorExtracts.push_back(I);
20327 continue;
20328 }
20329 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
20330 ExtractMask.reset(*Idx);
20331 if (isUndefVector</*IsPoisonOnly=*/true>(EI->getVectorOperand(),
20332 ExtractMask)
20333 .all()) {
20334 UndefVectorExtracts.push_back(I);
20335 continue;
20336 }
20337 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
20338 }
20339 // Sort the vector operands by the maximum number of uses in extractelements.
20341 VectorOpToIdx.takeVector();
20342 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
20343 return P1.second.size() > P2.second.size();
20344 });
20345 // Find the best pair of the vectors or a single vector.
20346 const int UndefSz = UndefVectorExtracts.size();
20347 unsigned SingleMax = 0;
20348 unsigned PairMax = 0;
20349 if (!Vectors.empty()) {
20350 SingleMax = Vectors.front().second.size() + UndefSz;
20351 if (Vectors.size() > 1) {
20352 auto *ItNext = std::next(Vectors.begin());
20353 PairMax = SingleMax + ItNext->second.size();
20354 }
20355 }
20356 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
20357 return std::nullopt;
20358 // Check if better to perform a shuffle of 2 vectors or just of a single
20359 // vector.
20360 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
20361 SmallVector<Value *> GatheredExtracts(
20362 VL.size(), PoisonValue::get(VL.front()->getType()));
20363 if (SingleMax >= PairMax && SingleMax) {
20364 for (int Idx : Vectors.front().second)
20365 std::swap(GatheredExtracts[Idx], VL[Idx]);
20366 } else if (!Vectors.empty()) {
20367 for (unsigned Idx : {0, 1})
20368 for (int Idx : Vectors[Idx].second)
20369 std::swap(GatheredExtracts[Idx], VL[Idx]);
20370 }
20371 // Add extracts from undefs too.
20372 for (int Idx : UndefVectorExtracts)
20373 std::swap(GatheredExtracts[Idx], VL[Idx]);
20374 // Check that gather of extractelements can be represented as just a
20375 // shuffle of a single/two vectors the scalars are extracted from.
20376 std::optional<TTI::ShuffleKind> Res =
20377 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
20378 if (!Res || all_of(Mask, equal_to(PoisonMaskElem))) {
20379 // TODO: try to check other subsets if possible.
20380 // Restore the original VL if attempt was not successful.
20381 copy(SavedVL, VL.begin());
20382 return std::nullopt;
20383 }
20384 // Restore unused scalars from mask, if some of the extractelements were not
20385 // selected for shuffle.
20386 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
20387 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
20388 isa<UndefValue>(GatheredExtracts[I])) {
20389 std::swap(VL[I], GatheredExtracts[I]);
20390 continue;
20391 }
20392 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
20393 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
20394 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
20395 is_contained(UndefVectorExtracts, I))
20396 continue;
20397 }
20398 return Res;
20399}
20400
20401/// Tries to find extractelement instructions with constant indices from fixed
20402/// vector type and gather such instructions into a bunch, which highly likely
20403/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
20404/// successful, the matched scalars are replaced by poison values in \p VL for
20405/// future analysis.
20407BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
20408 SmallVectorImpl<int> &Mask,
20409 unsigned NumParts) const {
20410 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
20411 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
20412 Mask.assign(VL.size(), PoisonMaskElem);
20413 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
20414 for (unsigned Part : seq<unsigned>(NumParts)) {
20415 // Scan list of gathered scalars for extractelements that can be represented
20416 // as shuffles.
20417 const unsigned PartOffset = Part * SliceSize;
20418 const unsigned PartSize = getNumElems(VL.size(), SliceSize, Part);
20419 // It may happen in case of revec, need to check no access out of bounds.
20420 if (PartOffset + PartSize > VL.size())
20421 break;
20423 MutableArrayRef(VL).slice(PartOffset, PartSize);
20424 SmallVector<int> SubMask;
20425 std::optional<TTI::ShuffleKind> Res =
20426 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
20427 ShufflesRes[Part] = Res;
20428 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
20429 if (SubVL.size() != SliceSize)
20430 break;
20431 }
20432 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
20433 return Res.has_value();
20434 }))
20435 ShufflesRes.clear();
20436 return ShufflesRes;
20437}
20438
20439std::optional<TargetTransformInfo::ShuffleKind>
20440BoUpSLP::isGatherShuffledSingleRegisterEntry(
20441 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
20442 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder,
20443 unsigned SliceSize) {
20444 Entries.clear();
20445 if (TE->Idx == 0)
20446 return std::nullopt;
20447 const unsigned MaskBase = Part * SliceSize;
20448 // TODO: currently checking only for Scalars in the tree entry, need to count
20449 // reused elements too for better cost estimation.
20450 auto GetUserEntry = [&](const TreeEntry *TE) {
20451 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
20452 TE = TE->UserTreeIndex.UserTE;
20453 if (TE == VectorizableTree.front().get())
20454 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
20455 return TE->UserTreeIndex;
20456 };
20457 auto HasGatherUser = [&](const TreeEntry *TE) {
20458 while (TE->Idx != 0 && TE->UserTreeIndex) {
20459 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
20460 return true;
20461 TE = TE->UserTreeIndex.UserTE;
20462 }
20463 return false;
20464 };
20465 const EdgeInfo TEUseEI = GetUserEntry(TE);
20466 if (!TEUseEI || (TEUseEI.UserTE->Idx == 0 && TEUseEI.UserTE->isGather() &&
20467 !TEUseEI.UserTE->hasState()))
20468 return std::nullopt;
20469 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
20470 const BasicBlock *TEInsertBlock = nullptr;
20471 // Main node of PHI entries keeps the correct order of operands/incoming
20472 // blocks.
20473 if (auto *PHI = dyn_cast_or_null<PHINode>(
20474 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
20475 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
20476 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
20477 TEInsertPt = TEInsertBlock->getTerminator();
20478 } else {
20479 TEInsertBlock = TEInsertPt->getParent();
20480 }
20481 if (!DT->isReachableFromEntry(TEInsertBlock))
20482 return std::nullopt;
20483 auto *NodeUI = DT->getNode(TEInsertBlock);
20484 assert(NodeUI && "Should only process reachable instructions");
20485 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
20486 auto CheckOrdering = [&](const Instruction *InsertPt) {
20487 // Argument InsertPt is an instruction where vector code for some other
20488 // tree entry (one that shares one or more scalars with TE) is going to be
20489 // generated. This lambda returns true if insertion point of vector code
20490 // for the TE dominates that point (otherwise dependency is the other way
20491 // around). The other node is not limited to be of a gather kind. Gather
20492 // nodes are not scheduled and their vector code is inserted before their
20493 // first user. If user is PHI, that is supposed to be at the end of a
20494 // predecessor block. Otherwise it is the last instruction among scalars of
20495 // the user node. So, instead of checking dependency between instructions
20496 // themselves, we check dependency between their insertion points for vector
20497 // code (since each scalar instruction ends up as a lane of a vector
20498 // instruction).
20499 const BasicBlock *InsertBlock = InsertPt->getParent();
20500 auto *NodeEUI = DT->getNode(InsertBlock);
20501 if (!NodeEUI)
20502 return false;
20503 assert((NodeUI == NodeEUI) ==
20504 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
20505 "Different nodes should have different DFS numbers");
20506 // Check the order of the gather nodes users.
20507 if (TEInsertPt->getParent() != InsertBlock &&
20508 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
20509 return false;
20510 if (TEInsertPt->getParent() == InsertBlock &&
20511 TEInsertPt->comesBefore(InsertPt))
20512 return false;
20513 return true;
20514 };
20515 // Find all tree entries used by the gathered values. If no common entries
20516 // found - not a shuffle.
20517 // Here we build a set of tree nodes for each gathered value and trying to
20518 // find the intersection between these sets. If we have at least one common
20519 // tree node for each gathered value - we have just a permutation of the
20520 // single vector. If we have 2 different sets, we're in situation where we
20521 // have a permutation of 2 input vectors.
20523 SmallDenseMap<Value *, int> UsedValuesEntry;
20524 SmallPtrSet<const Value *, 16> VisitedValue;
20525 bool IsReusedNodeFound = false;
20526 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
20527 // The node is reused - exit.
20528 if (IsReusedNodeFound)
20529 return false;
20530 if ((TEPtr->getVectorFactor() != VL.size() &&
20531 TEPtr->Scalars.size() != VL.size()) ||
20532 (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
20533 return false;
20534 IsReusedNodeFound =
20535 equal(TE->Scalars, TEPtr->Scalars) &&
20536 equal(TE->ReorderIndices, TEPtr->ReorderIndices) &&
20537 equal(TE->ReuseShuffleIndices, TEPtr->ReuseShuffleIndices);
20538 UsedTEs.clear();
20539 UsedTEs.emplace_back().insert(TEPtr);
20540 for (Value *V : VL) {
20541 if (isConstant(V))
20542 continue;
20543 UsedValuesEntry.try_emplace(V, 0);
20544 }
20545 return true;
20546 };
20547 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
20548 unsigned EdgeIdx) {
20549 const TreeEntry *Ptr1 = User1;
20550 const TreeEntry *Ptr2 = User2;
20551 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
20552 while (Ptr2) {
20553 PtrToIdx.try_emplace(Ptr2, EdgeIdx);
20554 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
20555 Ptr2 = Ptr2->UserTreeIndex.UserTE;
20556 }
20557 while (Ptr1) {
20558 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
20559 Ptr1 = Ptr1->UserTreeIndex.UserTE;
20560 if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
20561 return Idx < It->second;
20562 }
20563 return false;
20564 };
20565 // Cache `isUsedOutsideBlock(TEInsertPt)` - TEInsertPt is loop-invariant and
20566 // the function walks the instruction's user list.
20567 std::optional<bool> TEInsertPtUsedOutsideBlock;
20568 auto IsTEInsertPtUsedOutsideBlock = [&] {
20569 if (!TEInsertPtUsedOutsideBlock)
20570 TEInsertPtUsedOutsideBlock =
20571 isUsedOutsideBlock(const_cast<Instruction *>(TEInsertPt));
20572 return *TEInsertPtUsedOutsideBlock;
20573 };
20574 // Cache the TEUseEI/TEInsertPt-only prefix of the per-call lambda predicate
20575 // below - all of these depend only on outer-scope state, not the lambda's
20576 // arguments.
20577 const bool TEUseEIInsertPtUsedOutside =
20578 TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
20579 !TEUseEI.UserTE->isCopyableElement(
20580 const_cast<Instruction *>(TEInsertPt)) &&
20581 IsTEInsertPtUsedOutsideBlock();
20582 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
20583 Instruction *InsertPt) {
20584 return TEUseEIInsertPtUsedOutside &&
20585 InsertPt->getNextNode() == TEInsertPt &&
20586 (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
20587 !isUsedOutsideBlock(InsertPt));
20588 };
20589 // Cache the TEUseEI.UserTE-dependent predicate - it is invariant across the
20590 // double loop below. all_of with isUsedOutsideBlock walks each scalar's
20591 // users and is the expensive component.
20592 const bool TEUserNeedsEmitFirst =
20593 TEUseEI.UserTE->State == TreeEntry::Vectorize &&
20594 TEUseEI.UserTE->hasState() &&
20595 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
20596 TEUseEI.UserTE->isAltShuffle()) &&
20597 all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock);
20598 // Cache `all_of(UserTE->Scalars, isUsedOutsideBlock)` per UserTE - the
20599 // same UserTE may be encountered for many TEPtr values inside the loop.
20600 SmallDenseMap<const TreeEntry *, bool> ScalarsUsedOutsideBlockCache;
20601 auto AllScalarsUsedOutsideBlock = [&](const TreeEntry *UserTE) {
20602 auto [It, Inserted] = ScalarsUsedOutsideBlockCache.try_emplace(UserTE);
20603 if (!Inserted)
20604 return It->second;
20605 bool Res = all_of(UserTE->Scalars, isUsedOutsideBlock);
20606 It->second = Res;
20607 return Res;
20608 };
20609 for (Value *V : VL) {
20610 if (isConstant(V) || !VisitedValue.insert(V).second)
20611 continue;
20612 // Build a list of tree entries where V is used.
20613 SmallPtrSet<const TreeEntry *, 4> VToTEs;
20615 ValueToGatherNodes.lookup(V).takeVector());
20616 if (TransformedToGatherNodes.contains(TE)) {
20617 for (TreeEntry *E : getSplitTreeEntries(V)) {
20618 if (TE == E || !TransformedToGatherNodes.contains(E) ||
20619 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
20620 continue;
20621 GatherNodes.push_back(E);
20622 }
20623 for (TreeEntry *E : getTreeEntries(V)) {
20624 if (TE == E || !TransformedToGatherNodes.contains(E) ||
20625 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
20626 continue;
20627 GatherNodes.push_back(E);
20628 }
20629 }
20630 for (const TreeEntry *TEPtr : GatherNodes) {
20631 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
20632 continue;
20633 assert(any_of(TEPtr->Scalars,
20634 [&](Value *V) { return GatheredScalars.contains(V); }) &&
20635 "Must contain at least single gathered value.");
20636 assert(TEPtr->UserTreeIndex &&
20637 "Expected only single user of a gather node.");
20638 if (any_of(TEPtr->CombinedEntriesWithIndices,
20639 [&](const auto &P) { return P.first == TE->Idx; }))
20640 continue;
20641 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
20642
20643 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
20644 UseEI.UserTE->hasState())
20645 ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
20646 : nullptr;
20647 Instruction *InsertPt =
20648 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
20649 : &getLastInstructionInBundle(UseEI.UserTE);
20650 if (TEInsertPt == InsertPt) {
20651 // Check nodes, which might be emitted first.
20652 if (TEUserNeedsEmitFirst) {
20653 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
20654 (UseEI.UserTE->hasState() &&
20655 UseEI.UserTE->getOpcode() == Instruction::PHI &&
20656 !UseEI.UserTE->isAltShuffle()) ||
20657 !AllScalarsUsedOutsideBlock(UseEI.UserTE))
20658 continue;
20659 }
20660
20661 // If the schedulable insertion point is used in multiple entries - just
20662 // exit, no known ordering at this point, available only after real
20663 // scheduling.
20664 if (!doesNotNeedToBeScheduled(InsertPt) &&
20665 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
20666 continue;
20667 // If the users are the PHI nodes with the same incoming blocks - skip.
20668 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
20669 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
20670 UseEI.UserTE->State == TreeEntry::Vectorize &&
20671 UseEI.UserTE->getOpcode() == Instruction::PHI &&
20672 TEUseEI.UserTE != UseEI.UserTE)
20673 continue;
20674 // If 2 gathers are operands of the same entry (regardless of whether
20675 // user is PHI or else), compare operands indices, use the earlier one
20676 // as the base.
20677 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
20678 continue;
20679 // If the user instruction is used for some reason in different
20680 // vectorized nodes - make it depend on index.
20681 if (TEUseEI.UserTE != UseEI.UserTE &&
20682 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
20683 HasGatherUser(TEUseEI.UserTE)))
20684 continue;
20685 // If the user node is the operand of the other user node - skip.
20686 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
20687 continue;
20688 }
20689
20690 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
20691 TEUseEI.UserTE->doesNotNeedToSchedule() !=
20692 UseEI.UserTE->doesNotNeedToSchedule() &&
20693 is_contained(UseEI.UserTE->Scalars, TEInsertPt))
20694 continue;
20695 // Check if the user node of the TE comes after user node of TEPtr,
20696 // otherwise TEPtr depends on TE.
20697 if ((TEInsertBlock != InsertPt->getParent() ||
20698 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
20699 (!CheckOrdering(InsertPt) ||
20700 (UseEI.UserTE->hasCopyableElements() &&
20701 IsTEInsertPtUsedOutsideBlock() &&
20702 is_contained(UseEI.UserTE->Scalars, TEInsertPt))))
20703 continue;
20704 // The node is reused - exit.
20705 if (CheckAndUseSameNode(TEPtr))
20706 break;
20707 // The parent node is copyable with last inst used outside? And the last
20708 // inst is the next inst for the lastinst of TEPtr? Exit, if yes, to
20709 // preserve def-use chain.
20710 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
20711 continue;
20712 VToTEs.insert(TEPtr);
20713 }
20714 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
20715 const auto *It = find_if(VTEs, [&](const TreeEntry *MTE) {
20716 return MTE != TE && MTE != TEUseEI.UserTE &&
20717 !DeletedNodes.contains(MTE) &&
20718 !TransformedToGatherNodes.contains(MTE);
20719 });
20720 if (It != VTEs.end()) {
20721 const TreeEntry *VTE = *It;
20722 if (none_of(TE->CombinedEntriesWithIndices,
20723 [&](const auto &P) { return P.first == VTE->Idx; })) {
20724 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
20725 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
20726 continue;
20727 }
20728 // The node is reused - exit.
20729 if (CheckAndUseSameNode(VTE))
20730 break;
20731 VToTEs.insert(VTE);
20732 }
20733 }
20734 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
20735 const auto *It = find_if(VTEs, [&, MainTE = TE](const TreeEntry *TE) {
20736 return TE != MainTE && !DeletedNodes.contains(TE) &&
20737 !TransformedToGatherNodes.contains(TE);
20738 });
20739 if (It != VTEs.end()) {
20740 const TreeEntry *VTE = *It;
20741 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
20742 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
20743 VTEs = VTEs.drop_front();
20744 // Iterate through all vectorized nodes.
20745 const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
20746 return MTE->State == TreeEntry::Vectorize;
20747 });
20748 if (MIt == VTEs.end())
20749 continue;
20750 VTE = *MIt;
20751 }
20752 if (none_of(TE->CombinedEntriesWithIndices,
20753 [&](const auto &P) { return P.first == VTE->Idx; })) {
20754 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
20755 if (&LastBundleInst == TEInsertPt ||
20756 !CheckOrdering(&LastBundleInst) ||
20757 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
20758 continue;
20759 }
20760 // The node is reused - exit.
20761 if (CheckAndUseSameNode(VTE))
20762 break;
20763 VToTEs.insert(VTE);
20764 }
20765 }
20766 if (IsReusedNodeFound)
20767 break;
20768 if (VToTEs.empty())
20769 continue;
20770 if (UsedTEs.empty()) {
20771 // The first iteration, just insert the list of nodes to vector.
20772 UsedTEs.push_back(VToTEs);
20773 UsedValuesEntry.try_emplace(V, 0);
20774 } else {
20775 // Need to check if there are any previously used tree nodes which use V.
20776 // If there are no such nodes, consider that we have another one input
20777 // vector.
20778 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
20779 unsigned Idx = 0;
20780 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
20781 // Do we have a non-empty intersection of previously listed tree entries
20782 // and tree entries using current V?
20783 set_intersect(VToTEs, Set);
20784 if (!VToTEs.empty()) {
20785 // Yes, write the new subset and continue analysis for the next
20786 // scalar.
20787 Set.swap(VToTEs);
20788 break;
20789 }
20790 VToTEs = SavedVToTEs;
20791 ++Idx;
20792 }
20793 // No non-empty intersection found - need to add a second set of possible
20794 // source vectors.
20795 if (Idx == UsedTEs.size()) {
20796 // If the number of input vectors is greater than 2 - not a permutation,
20797 // fallback to the regular gather.
20798 // TODO: support multiple reshuffled nodes.
20799 if (UsedTEs.size() == 2)
20800 continue;
20801 UsedTEs.push_back(SavedVToTEs);
20802 Idx = UsedTEs.size() - 1;
20803 }
20804 UsedValuesEntry.try_emplace(V, Idx);
20805 }
20806 }
20807
20808 if (UsedTEs.empty()) {
20809 Entries.clear();
20810 return std::nullopt;
20811 }
20812
20813 unsigned VF = 0;
20814 if (UsedTEs.size() == 1) {
20815 // Keep the order to avoid non-determinism.
20816 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
20817 UsedTEs.front().end());
20818 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
20819 return TE1->Idx < TE2->Idx;
20820 });
20821 // Try to find the perfect match in another gather node at first.
20822 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
20823 return (EntryPtr->getVectorFactor() == TE->Scalars.size() &&
20824 EntryPtr->isSame(TE->Scalars)) ||
20825 EntryPtr->isSame(VL);
20826 });
20827 if (It != FirstEntries.end() &&
20828 (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size() ||
20829 ((*It)->getVectorFactor() == TE->Scalars.size() &&
20830 TE->ReuseShuffleIndices.size() == VL.size() &&
20831 (*It)->isSame(TE->Scalars)))) {
20832 Entries.push_back(*It);
20833 if (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size()) {
20834 std::iota(std::next(Mask.begin(), MaskBase),
20835 std::next(Mask.begin(), MaskBase + VL.size()), 0);
20836 } else {
20837 SmallVector<int> CommonMask = TE->getCommonMask();
20838 copy(CommonMask, Mask.begin());
20839 }
20840 // Clear undef scalars.
20841 for (unsigned I : seq<unsigned>(VL.size()))
20842 if (isa<PoisonValue>(VL[I]))
20843 Mask[MaskBase + I] = PoisonMaskElem;
20845 }
20846 // No perfect match, just shuffle, so choose the first tree node from the
20847 // tree.
20848 Entries.push_back(FirstEntries.front());
20849 // Update mapping between values and corresponding tree entries.
20850 for (auto &P : UsedValuesEntry)
20851 P.second = 0;
20852 VF = FirstEntries.front()->getVectorFactor();
20853 } else {
20854 // Try to find nodes with the same vector factor.
20855 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
20856 // Keep the order of tree nodes to avoid non-determinism.
20857 DenseMap<int, const TreeEntry *> VFToTE;
20858 for (const TreeEntry *TE : UsedTEs.front()) {
20859 unsigned VF = TE->getVectorFactor();
20860 auto It = VFToTE.find(VF);
20861 if (It != VFToTE.end()) {
20862 if (It->second->Idx > TE->Idx)
20863 It->getSecond() = TE;
20864 continue;
20865 }
20866 VFToTE.try_emplace(VF, TE);
20867 }
20868 // Same, keep the order to avoid non-determinism.
20869 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
20870 UsedTEs.back().end());
20871 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
20872 return TE1->Idx < TE2->Idx;
20873 });
20874 for (const TreeEntry *TE : SecondEntries) {
20875 auto It = VFToTE.find(TE->getVectorFactor());
20876 if (It != VFToTE.end()) {
20877 VF = It->first;
20878 Entries.push_back(It->second);
20879 Entries.push_back(TE);
20880 break;
20881 }
20882 }
20883 // No 2 source vectors with the same vector factor - just choose 2 with max
20884 // index.
20885 if (Entries.empty()) {
20887 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
20888 return TE1->Idx < TE2->Idx;
20889 }));
20890 Entries.push_back(SecondEntries.front());
20891 VF = std::max(Entries.front()->getVectorFactor(),
20892 Entries.back()->getVectorFactor());
20893 } else {
20894 VF = Entries.front()->getVectorFactor();
20895 }
20896 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
20897 for (const TreeEntry *E : Entries)
20898 ValuesToEntries.emplace_back().insert(E->Scalars.begin(),
20899 E->Scalars.end());
20900 // Update mapping between values and corresponding tree entries.
20901 for (auto &P : UsedValuesEntry) {
20902 for (unsigned Idx : seq<unsigned>(ValuesToEntries.size()))
20903 if (ValuesToEntries[Idx].contains(P.first)) {
20904 P.second = Idx;
20905 break;
20906 }
20907 }
20908 }
20909
20910 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
20911 // Checks if the 2 PHIs are compatible in terms of high possibility to be
20912 // vectorized.
20913 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
20914 auto *PHI = cast<PHINode>(V);
20915 auto *PHI1 = cast<PHINode>(V1);
20916 // Check that all incoming values are compatible/from same parent (if they
20917 // are instructions).
20918 // The incoming values are compatible if they all are constants, or
20919 // instruction with the same/alternate opcodes from the same basic block.
20920 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
20921 Value *In = PHI->getIncomingValue(I);
20922 Value *In1 = PHI1->getIncomingValue(I);
20923 if (isConstant(In) && isConstant(In1))
20924 continue;
20925 if (!getSameOpcode({In, In1}, *TLI))
20926 return false;
20927 if (cast<Instruction>(In)->getParent() !=
20929 return false;
20930 }
20931 return true;
20932 };
20933 // Check if the value can be ignored during analysis for shuffled gathers.
20934 // We suppose it is better to ignore instruction, which do not form splats,
20935 // are not vectorized/not extractelements (these instructions will be handled
20936 // by extractelements processing) or may form vector node in future.
20937 // Cache results - each V in VL is queried up to 3 times (direct +
20938 // NeighborMightBeIgnored from both neighbors), and areAllUsersVectorized
20939 // walks each instruction's user list.
20940 SmallDenseMap<Value *, bool> MightBeIgnoredCache;
20941 auto MightBeIgnored = [=, &MightBeIgnoredCache](Value *V) {
20942 auto [It, Inserted] = MightBeIgnoredCache.try_emplace(V);
20943 if (!Inserted)
20944 return It->second;
20945 auto *I = dyn_cast<Instruction>(V);
20946 bool Res = I && !IsSplatOrUndefs && !isVectorized(I) &&
20948 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
20949 It->second = Res;
20950 return Res;
20951 };
20952 // Check that the neighbor instruction may form a full vector node with the
20953 // current instruction V. It is possible, if they have same/alternate opcode
20954 // and same parent basic block.
20955 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
20956 Value *V1 = VL[Idx];
20957 bool UsedInSameVTE = false;
20958 auto It = UsedValuesEntry.find(V1);
20959 if (It != UsedValuesEntry.end())
20960 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
20961 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
20962 getSameOpcode({V, V1}, *TLI) &&
20963 cast<Instruction>(V)->getParent() ==
20964 cast<Instruction>(V1)->getParent() &&
20965 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
20966 };
20967 // Build a shuffle mask for better cost estimation and vector emission.
20968 SmallBitVector UsedIdxs(Entries.size());
20970 for (int I = 0, E = VL.size(); I < E; ++I) {
20971 Value *V = VL[I];
20972 auto It = UsedValuesEntry.find(V);
20973 if (It == UsedValuesEntry.end())
20974 continue;
20975 // Do not try to shuffle scalars, if they are constants, or instructions
20976 // that can be vectorized as a result of the following vector build
20977 // vectorization.
20978 if (isConstant(V) || (MightBeIgnored(V) &&
20979 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
20980 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
20981 continue;
20982 unsigned Idx = It->second;
20983 EntryLanes.emplace_back(Idx, I);
20984 UsedIdxs.set(Idx);
20985 }
20986 // Iterate through all shuffled scalars and select entries, which can be used
20987 // for final shuffle.
20989 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
20990 if (!UsedIdxs.test(I))
20991 continue;
20992 // Fix the entry number for the given scalar. If it is the first entry, set
20993 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
20994 // These indices are used when calculating final shuffle mask as the vector
20995 // offset.
20996 for (std::pair<unsigned, int> &Pair : EntryLanes)
20997 if (Pair.first == I)
20998 Pair.first = TempEntries.size();
20999 TempEntries.push_back(Entries[I]);
21000 }
21001 Entries.swap(TempEntries);
21002 if (EntryLanes.size() == Entries.size() &&
21003 !VL.equals(ArrayRef(TE->Scalars)
21004 .slice(MaskBase, getNumElems(TE->Scalars.size(), SliceSize,
21005 Part)))) {
21006 // We may have here 1 or 2 entries only. If the number of scalars is equal
21007 // to the number of entries, no need to do the analysis, it is not very
21008 // profitable. Since VL is not the same as TE->Scalars, it means we already
21009 // have some shuffles before. Cut off not profitable case.
21010 Entries.clear();
21011 return std::nullopt;
21012 }
21013 // Build the final mask, check for the identity shuffle, if possible.
21014 bool IsIdentity = Entries.size() == 1;
21015 // Pair.first is the offset to the vector, while Pair.second is the index of
21016 // scalar in the list.
21017 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
21018 unsigned Idx = MaskBase + Pair.second;
21019 Mask[Idx] =
21020 Pair.first * VF +
21021 (ForOrder ? std::distance(
21022 Entries[Pair.first]->Scalars.begin(),
21023 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
21024 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
21025 IsIdentity &= Mask[Idx] == Pair.second;
21026 }
21027 if (ForOrder || IsIdentity || Entries.empty()) {
21028 switch (Entries.size()) {
21029 case 1:
21030 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
21032 break;
21033 case 2:
21034 if (EntryLanes.size() > 2 || VL.size() <= 2)
21036 break;
21037 default:
21038 break;
21039 }
21040 } else if (!isa<VectorType>(VL.front()->getType()) &&
21041 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
21042 // Do the cost estimation: is the shuffle more beneficial than buildvector?
21043 SmallVector<int> SubMask(std::next(Mask.begin(), MaskBase),
21044 std::next(Mask.begin(), MaskBase + VL.size()));
21045 int MinElement = SubMask.front(), MaxElement = SubMask.front();
21046 for (int Idx : SubMask) {
21047 if (Idx == PoisonMaskElem)
21048 continue;
21049 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
21050 MinElement = Idx;
21051 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
21052 MaxElement = Idx;
21053 }
21054 assert(MaxElement >= 0 && MinElement >= 0 &&
21055 MaxElement % VF >= MinElement % VF &&
21056 "Expected at least single element.");
21057 // If the leading [0, MinIdx) range sits in its own register part(s),
21058 // skip those whole parts when sizing the destination - everything below
21059 // the register-aligned floor is unused and never indexed.
21060 unsigned Offset = 0;
21061 unsigned MinIdx = MinElement % VF;
21062 if (MinIdx > 1) {
21063 unsigned RegFloor = getFloorFullVectorNumberOfElements(
21064 *TTI, VL.front()->getType(), MinIdx);
21065 auto *RegFloorTy = getWidenedType(VL.front()->getType(), RegFloor);
21066 unsigned RegFloorParts =
21067 ::getNumberOfParts(*TTI, RegFloorTy, VL.front()->getType(), RegFloor);
21068 if (RegFloorParts > 1)
21069 Offset = RegFloor;
21070 }
21071 unsigned NewVF =
21072 std::max<unsigned>(VL.size(), (MaxElement % VF) - Offset + 1);
21073 if (NewVF < VF) {
21074 for (int &Idx : SubMask) {
21075 if (Idx == PoisonMaskElem)
21076 continue;
21077 Idx = (Idx % VF) - Offset + (Idx >= static_cast<int>(VF) ? NewVF : 0);
21078 }
21079 } else {
21080 NewVF = VF;
21081 }
21082
21084 auto *VecTy =
21085 cast<VectorType>(getWidenedType(VL.front()->getType(), NewVF));
21086 auto *MaskVecTy =
21087 cast<VectorType>(getWidenedType(VL.front()->getType(), SubMask.size()));
21088 auto GetShuffleCost = [&,
21089 &TTI = *TTI](ArrayRef<int> Mask,
21091 VectorType *VecTy) -> InstructionCost {
21092 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
21094 Mask, Entries.front()->getInterleaveFactor()))
21095 return TTI::TCC_Free;
21096 return ::getShuffleCost(TTI,
21097 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
21099 VecTy, Mask, CostKind);
21100 };
21101 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
21102 InstructionCost FirstShuffleCost = 0;
21103 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
21104 if (Entries.size() == 1 || !Entries[0]->isGather()) {
21105 FirstShuffleCost = ShuffleCost;
21106 } else {
21107 // Transform mask to include only first entry.
21108 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
21109 bool IsIdentity = true;
21110 for (auto [I, Idx] : enumerate(FirstMask)) {
21111 if (Idx >= static_cast<int>(NewVF)) {
21112 Idx = PoisonMaskElem;
21113 } else {
21114 DemandedElts.clearBit(I);
21115 if (Idx != PoisonMaskElem)
21116 IsIdentity &= static_cast<int>(I) == Idx;
21117 }
21118 }
21119 if (!IsIdentity)
21120 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
21121 FirstShuffleCost += getScalarizationOverhead(
21122 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
21123 /*Extract=*/false, CostKind);
21124 }
21125 InstructionCost SecondShuffleCost = 0;
21126 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
21127 if (Entries.size() == 1 || !Entries[1]->isGather()) {
21128 SecondShuffleCost = ShuffleCost;
21129 } else {
21130 // Transform mask to include only second entry.
21131 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
21132 bool IsIdentity = true;
21133 for (auto [I, Idx] : enumerate(SecondMask)) {
21134 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
21135 Idx = PoisonMaskElem;
21136 } else {
21137 DemandedElts.clearBit(I);
21138 if (Idx != PoisonMaskElem) {
21139 Idx -= NewVF;
21140 IsIdentity &= static_cast<int>(I) == Idx;
21141 }
21142 }
21143 }
21144 if (!IsIdentity)
21145 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
21146 SecondShuffleCost += getScalarizationOverhead(
21147 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
21148 /*Extract=*/false, CostKind);
21149 }
21150 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
21151 for (auto [I, Idx] : enumerate(SubMask))
21152 if (Idx == PoisonMaskElem)
21153 DemandedElts.clearBit(I);
21154 InstructionCost BuildVectorCost = getScalarizationOverhead(
21155 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
21156 /*Extract=*/false, CostKind);
21157 const TreeEntry *BestEntry = nullptr;
21158 auto MaskSlice = MutableArrayRef(Mask).slice(MaskBase, VL.size());
21159 if (FirstShuffleCost < ShuffleCost) {
21160 for (int &Idx : MaskSlice)
21161 if (Idx >= static_cast<int>(VF))
21162 Idx = PoisonMaskElem;
21163 BestEntry = Entries.front();
21164 ShuffleCost = FirstShuffleCost;
21165 }
21166 if (SecondShuffleCost < ShuffleCost) {
21167 for (int &Idx : MaskSlice) {
21168 if (Idx < static_cast<int>(VF))
21169 Idx = PoisonMaskElem;
21170 else
21171 Idx -= VF;
21172 }
21173 BestEntry = Entries[1];
21174 ShuffleCost = SecondShuffleCost;
21175 }
21176 if (BuildVectorCost >= ShuffleCost) {
21177 if (BestEntry) {
21178 Entries.clear();
21179 Entries.push_back(BestEntry);
21180 }
21181 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
21183 }
21184 }
21185 Entries.clear();
21186 // Clear the corresponding mask elements.
21187 std::fill(std::next(Mask.begin(), MaskBase),
21188 std::next(Mask.begin(), MaskBase + VL.size()), PoisonMaskElem);
21189 return std::nullopt;
21190}
21191
21193BoUpSLP::isGatherShuffledEntry(
21194 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
21195 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
21196 bool ForOrder) {
      // Checks whether the gather node TE, whose scalars are VL, can be built by
      // shuffling other tree entries. VL is analysed in up to NumParts slices;
      // each slice is handed to isGatherShuffledSingleRegisterEntry together
      // with the shared Mask, the slice index and the slice size.
      //
      // On return, Entries holds one list of source tree entries per analysed
      // slice (an empty list for a slice that cannot be shuffled), and the
      // returned sequence holds one optional shuffle kind per analysed slice.
      // An empty result is returned when the analysis is skipped or when no
      // slice can be shuffled.
      //
      // NOTE(review): this listing elides some source lines, including the
      // return type and the declaration of Res; the comments below describe
      // only the visible code.
21197 assert(NumParts > 0 && NumParts < VL.size() &&
21198 "Expected positive number of registers.");
21199 Entries.clear();
21200 // No need to check for the topmost gather node.
      // The root is skipped when there are no gathered-loads entries, or when
      // every other node in the tree is a gather node as well.
21201 if (TE == VectorizableTree.front().get() &&
21202 (!GatheredLoadsEntriesFirst.has_value() ||
21203 none_of(ArrayRef(VectorizableTree).drop_front(),
21204 [](const std::unique_ptr<TreeEntry> &TE) {
21205 return !TE->isGather();
21206 })))
21207 return {};
      // Start with an all-poison mask, one element per scalar in VL.
21208 Mask.assign(VL.size(), PoisonMaskElem);
21209 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
21210 "Expected only single user of the gather node.");
21211 unsigned PWSz =
21212 getFullVectorNumberOfElements(*TTI, VL.front()->getType(), VL.size());
      // Bail out when the user of TE is itself a gather node reached through
      // EdgeIdx == UINT_MAX (presumably a sentinel edge index - TODO confirm)
      // and TE is the root, an extractelement node, a splat, or a node whose
      // scalars already have a same-values tree entry.
21213 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
21214 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
21215 (TE->Idx == 0 ||
21216 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
21217 isSplat(TE->Scalars) ||
21218 (TE->hasState() &&
21219 getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
21220 return {};
      // Number of scalars per slice, derived from the full-vector element count
      // PWSz rather than from VL.size() directly.
21221 unsigned SliceSize = getPartNumElems(PWSz, NumParts);
21223 for (unsigned Part : seq<unsigned>(NumParts)) {
      // Since SliceSize is based on PWSz, trailing parts may start past the end
      // of VL; stop once that happens.
21224 if (Part * SliceSize >= VL.size())
21225 break;
21226 ArrayRef<Value *> SubVL =
21227 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
      // Every analysed slice gets its own entry list, even if it ends up empty.
21228 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
21229 std::optional<TTI::ShuffleKind> SubRes =
21230 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
21231 ForOrder, SliceSize);
      // A failed slice must not leave stale candidates behind.
21232 if (!SubRes)
21233 SubEntries.clear();
21234 Res.push_back(SubRes);
      // Whole-node match: a single source entry that covers all of VL and has
      // the same scalars as TE or VL replaces the per-slice results with one
      // entry. Dereferencing SubRes is safe here: SubEntries is cleared when
      // SubRes is empty, so size() == 1 implies SubRes holds a value.
21235 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
21236 SubEntries.front()->getVectorFactor() == VL.size() &&
21237 (SubEntries.front()->isSame(TE->Scalars) ||
21238 SubEntries.front()->isSame(VL))) {
      // Move the entry out first: SubEntries refers to an element of Entries,
      // which is cleared on the next lines.
21239 SmallVector<const TreeEntry *> LocalSubEntries;
21240 LocalSubEntries.swap(SubEntries);
21241 Entries.clear();
21242 Res.clear();
      // The matching entry is used as is, so the mask becomes the identity.
21243 std::iota(Mask.begin(), Mask.end(), 0);
21244 // Clear undef scalars.
      // NOTE(review): the statement guarded by the poison check below is elided
      // in this listing; presumably it resets that lane of Mask to
      // PoisonMaskElem - TODO confirm. A statement between the emplace_back and
      // the return is elided as well.
21245 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
21246 if (isa<PoisonValue>(VL[I]))
21248 Entries.emplace_back(1, LocalSubEntries.front());
21250 return Res;
21251 }
21252 }
      // If no slice produced a shuffle kind, report that nothing can be
      // shuffled.
21253 if (all_of(Res,
21254 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
21255 Entries.clear();
21256 return {};
21257 }
21258 return Res;
21259}
21260
// Estimates the cost of building a vector of VL.size() elements of ScalarTy
// from the scalars in VL.
//
// VL is the list of scalars, one per vector lane. ForPoisonSrc, when true,
// makes constants in VL be skipped entirely and disables the shuffle with a
// constant vector (presumably because the vector being filled starts as
// poison - TODO confirm). ScalarTy is the element type of the built vector.
//
// Returns the accumulated cost in Cost.
//
// NOTE(review): this listing elides some source lines, including the
// declarations of Cost and CostKind and the heads of several calls; the
// comments below describe only the visible code.
21261InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
21262 Type *ScalarTy) const {
21263 const unsigned VF = VL.size();
21264 auto *VecTy = getWidenedType(ScalarTy, VF);
21265 // Find the cost of inserting/extracting values from the vector.
21266 // Check if the same elements are inserted several times and count them as
21267 // shuffle candidates.
      // One bit per lane; a set bit marks a lane that needs a scalar insert.
21268 APInt DemandedElements = APInt::getZero(VF);
      // Marks lane I as needing an insert and, when the scalar's type differs
      // from ScalarTy, adds the cost of a Trunc cast (the remaining arguments
      // of that call are elided in this listing).
21271 auto EstimateInsertCost = [&](unsigned I, Value *V) {
21272 DemandedElements.setBit(I);
21273 if (V->getType() != ScalarTy)
21274 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
21276 };
      // Starts as the identity mask; lanes holding constants are redirected to
      // index I + VF below, which per the step 1 comment corresponds to the
      // constant vector in a two-source shuffle.
21277 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
21278 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
21279 for (auto [I, V] : enumerate(VL)) {
21280 // No need to shuffle duplicates for constants.
      // Undef values are always skipped; constants are skipped only when
      // ForPoisonSrc is set.
21281 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V))
21282 continue;
21283
21284 if (isConstant(V)) {
21285 ConstantShuffleMask[I] = I + VF;
21286 continue;
21287 }
      // Everything else is a non-constant scalar that must be inserted.
21288 EstimateInsertCost(I, V);
21289 }
21290 // FIXME: add a cost for constant vector materialization.
21291 bool IsAnyNonUndefConst =
21292 any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
21293 // 1. Shuffle input source vector and constant vector.
      // NOTE(review): the head of the call that consumes ConstantShuffleMask is
      // elided in this listing.
21294 if (!ForPoisonSrc && IsAnyNonUndefConst) {
21296 cast<VectorType>(VecTy), ConstantShuffleMask);
21297 }
21298
21299 // 2. Insert unique non-constants.
      // NOTE(review): the head of this call is elided in this listing. The
      // visible arguments request insert-only cost for the demanded lanes and
      // pass ForPoisonSrc && !IsAnyNonUndefConst as a flag.
21300 if (!DemandedElements.isZero())
21302 *TTI, ScalarTy, cast<VectorType>(VecTy), DemandedElements,
21303 /*Insert=*/true,
21304 /*Extract=*/false, CostKind, ForPoisonSrc && !IsAnyNonUndefConst, VL);
21305 return Cost;
21306}
21307
21308Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
21309 auto It = EntryToLastInstruction.find(E);
21310 if (It != EntryToLastInstruction.end())
21311 return *cast<Instruction>(It->second);
21312 Instruction *Res = nullptr;
21313 // Get the basic block this bundle is in. All instructions in the bundle
21314 // should be in this block (except for extractelement-like instructions with
21315 // constant indices or gathered loads or copyables).
21316 Instruction *Front;
21317 unsigned Opcode;
21318 if (E->hasState()) {
21319 Front = E->getMainOp();
21320 Opcode = E->getOpcode();
21321 } else {
21322 Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
21323 Opcode = Front->getOpcode();
21324 }
21325 auto *BB = Front->getParent();
21326 assert(
21327 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
21328 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
21329 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
21330 all_of(E->Scalars,
21331 [=](Value *V) -> bool {
21332 if (Opcode == Instruction::GetElementPtr &&
21333 !isa<GetElementPtrInst>(V))
21334 return true;
21335 auto *I = dyn_cast<Instruction>(V);
21336 return !I || !E->getMatchingMainOpOrAltOp(I) ||
21337 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
21338 })) &&
21339 "Expected gathered loads or GEPs or instructions from same basic "
21340 "block.");
21341
21342 auto FindLastInst = [&]() {
21343 Instruction *LastInst = Front;
21344 for (Value *V : E->Scalars) {
21345 auto *I = dyn_cast<Instruction>(V);
21346 if (!I)
21347 continue;
21348 if (E->isCopyableElement(I))
21349 continue;
21350 if (LastInst->getParent() == I->getParent()) {
21351 if (LastInst->comesBefore(I))
21352 LastInst = I;
21353 continue;
21354 }
21355 assert(((Opcode == Instruction::GetElementPtr &&
21357 E->State == TreeEntry::SplitVectorize ||
21358 (isVectorLikeInstWithConstOps(LastInst) &&
21360 (GatheredLoadsEntriesFirst.has_value() &&
21361 Opcode == Instruction::Load && E->isGather() &&
21362 E->Idx < *GatheredLoadsEntriesFirst)) &&
21363 "Expected vector-like or non-GEP in GEP node insts only.");
21364 if (!DT->isReachableFromEntry(LastInst->getParent())) {
21365 LastInst = I;
21366 continue;
21367 }
21368 if (!DT->isReachableFromEntry(I->getParent()))
21369 continue;
21370 auto *NodeA = DT->getNode(LastInst->getParent());
21371 auto *NodeB = DT->getNode(I->getParent());
21372 assert(NodeA && "Should only process reachable instructions");
21373 assert(NodeB && "Should only process reachable instructions");
21374 assert((NodeA == NodeB) ==
21375 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
21376 "Different nodes should have different DFS numbers");
21377 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
21378 LastInst = I;
21379 }
21380 BB = LastInst->getParent();
21381 return LastInst;
21382 };
21383
21384 auto FindFirstInst = [&]() {
21385 Instruction *FirstInst = Front;
21386 for (Value *V : E->Scalars) {
21387 auto *I = dyn_cast<Instruction>(V);
21388 if (!I)
21389 continue;
21390 if (E->isCopyableElement(I))
21391 continue;
21392 if (FirstInst->getParent() == I->getParent()) {
21393 if (I->comesBefore(FirstInst))
21394 FirstInst = I;
21395 continue;
21396 }
21397 assert(((Opcode == Instruction::GetElementPtr &&
21399 (isVectorLikeInstWithConstOps(FirstInst) &&
21401 "Expected vector-like or non-GEP in GEP node insts only.");
21402 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
21403 FirstInst = I;
21404 continue;
21405 }
21406 if (!DT->isReachableFromEntry(I->getParent()))
21407 continue;
21408 auto *NodeA = DT->getNode(FirstInst->getParent());
21409 auto *NodeB = DT->getNode(I->getParent());
21410 assert(NodeA && "Should only process reachable instructions");
21411 assert(NodeB && "Should only process reachable instructions");
21412 assert((NodeA == NodeB) ==
21413 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
21414 "Different nodes should have different DFS numbers");
21415 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
21416 FirstInst = I;
21417 }
21418 return FirstInst;
21419 };
21420
21421 if (E->State == TreeEntry::SplitVectorize) {
21422 Res = FindLastInst();
21423 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
21424 for (auto *E : Entries) {
21425 auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
21426 if (!I)
21427 I = &getLastInstructionInBundle(E);
21428 if (Res->getParent() == I->getParent() && Res->comesBefore(I))
21429 Res = I;
21430 }
21431 }
21432 EntryToLastInstruction.try_emplace(E, Res);
21433 return *Res;
21434 }
21435
21436 // Set insertpoint for gathered loads to the very first load.
21437 if (GatheredLoadsEntriesFirst.has_value() &&
21438 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
21439 Opcode == Instruction::Load) {
21440 Res = FindFirstInst();
21441 EntryToLastInstruction.try_emplace(E, Res);
21442 return *Res;
21443 }
21444
21445 // Set the insert point to the beginning of the basic block if the entry
21446 // should not be scheduled.
21447 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
21448 if (E->isGather())
21449 return nullptr;
21450 // Found previously that the instruction do not need to be scheduled.
21451 const auto *It = BlocksSchedules.find(BB);
21452 if (It == BlocksSchedules.end())
21453 return nullptr;
21454 for (Value *V : E->Scalars) {
21455 auto *I = dyn_cast<Instruction>(V);
21456 if (!I || isa<PHINode>(I) ||
21457 (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
21458 continue;
21459 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
21460 if (Bundles.empty())
21461 continue;
21462 const auto *It = find_if(
21463 Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
21464 if (It != Bundles.end())
21465 return *It;
21466 }
21467 return nullptr;
21468 };
21469 const ScheduleBundle *Bundle = FindScheduleBundle(E);
21470 if (!E->isGather() && !Bundle) {
21471 if ((Opcode == Instruction::GetElementPtr &&
21472 any_of(E->Scalars,
21473 [](Value *V) {
21474 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
21475 })) ||
21476 (all_of(E->Scalars,
21477 [&](Value *V) {
21478 return isa<PoisonValue>(V) ||
21479 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
21480 E->isCopyableElement(V) ||
21481 (!isVectorLikeInstWithConstOps(V) &&
21482 isUsedOutsideBlock(V));
21483 }) &&
21484 (!E->doesNotNeedToSchedule() ||
21485 any_of(E->Scalars,
21486 [&](Value *V) {
21487 if (!isa<Instruction>(V) ||
21488 (E->hasCopyableElements() && E->isCopyableElement(V)))
21489 return false;
21490 return !areAllOperandsNonInsts(V);
21491 }) ||
21492 none_of(E->Scalars, [&](Value *V) {
21493 if (!isa<Instruction>(V) ||
21494 (E->hasCopyableElements() && E->isCopyableElement(V)))
21495 return false;
21496 return MustGather.contains(V);
21497 }))))
21498 Res = FindLastInst();
21499 else
21500 Res = FindFirstInst();
21501 EntryToLastInstruction.try_emplace(E, Res);
21502 return *Res;
21503 }
21504
21505 // Find the last instruction. The common case should be that BB has been
21506 // scheduled, and the last instruction is VL.back(). So we start with
21507 // VL.back() and iterate over schedule data until we reach the end of the
21508 // bundle. The end of the bundle is marked by null ScheduleData.
21509 if (Bundle) {
21510 assert(!E->isGather() && "Gathered instructions should not be scheduled");
21511 Res = Bundle->getBundle().back()->getInst();
21512 EntryToLastInstruction.try_emplace(E, Res);
21513 return *Res;
21514 }
21515
21516 // LastInst can still be null at this point if there's either not an entry
21517 // for BB in BlocksSchedules or there's no ScheduleData available for
21518 // VL.back(). This can be the case if buildTreeRec aborts for various
21519 // reasons (e.g., the maximum recursion depth is reached, the maximum region
21520 // size is reached, etc.). ScheduleData is initialized in the scheduling
21521 // "dry-run".
21522 //
21523 // If this happens, we can still find the last instruction by brute force. We
21524 // iterate forwards from Front (inclusive) until we either see all
21525 // instructions in the bundle or reach the end of the block. If Front is the
21526 // last instruction in program order, LastInst will be set to Front, and we
21527 // will visit all the remaining instructions in the block.
21528 //
21529 // One of the reasons we exit early from buildTreeRec is to place an upper
21530 // bound on compile-time. Thus, taking an additional compile-time hit here is
21531 // not ideal. However, this should be exceedingly rare since it requires that
21532 // we both exit early from buildTreeRec and that the bundle be out-of-order
21533 // (causing us to iterate all the way to the end of the block).
21534 if (!Res)
21535 Res = FindLastInst();
21536 assert(Res && "Failed to find last instruction in bundle");
21537 EntryToLastInstruction.try_emplace(E, Res);
21538 return *Res;
21539}
21540
21541void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
21542 auto *Front = E->getMainOp();
21543 Instruction *LastInst = &getLastInstructionInBundle(E);
21544 assert(LastInst && "Failed to find last instruction in bundle");
21545 BasicBlock::iterator LastInstIt = LastInst->getIterator();
21546 // If the instruction is PHI, set the insert point after all the PHIs.
21547 bool IsPHI = isa<PHINode>(LastInst);
21548 if (IsPHI) {
21549 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
21550 if (LastInstIt != LastInst->getParent()->end() &&
21551 LastInstIt->getParent()->isLandingPad())
21552 LastInstIt = std::next(LastInstIt);
21553 }
21554 if (IsPHI ||
21555 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
21556 (E->doesNotNeedToSchedule() ||
21557 (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
21558 isUsedOutsideBlock(LastInst)))) ||
21559 (GatheredLoadsEntriesFirst.has_value() &&
21560 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
21561 E->getOpcode() == Instruction::Load)) {
21562 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
21563 } else {
21564 // Set the insertion point after the last instruction in the bundle. Set the
21565 // debug location to Front.
21566 Builder.SetInsertPoint(
21567 LastInst->getParent(),
21568 LastInst->getNextNode()->getIterator());
21569 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
21570 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
21571 } else {
21572 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
21573 PoisonValue::get(Builder.getPtrTy()),
21574 MaybeAlign());
21575 Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
21576 eraseInstruction(Res);
21577 if (E->State != TreeEntry::SplitVectorize)
21578 LastInstructionToPos.try_emplace(LastInst, Res);
21579 }
21580 }
21581 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
21582}
21583
21584Value *BoUpSLP::gather(
21585 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
21586 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
21587 // List of instructions/lanes from current block and/or the blocks which are
21588 // part of the current loop. These instructions will be inserted at the end to
21589 // make it possible to optimize loops and hoist invariant instructions out of
21590 // the loops body with better chances for success.
21592 SmallSet<int, 4> PostponedIndices;
21593 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
21594 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
21595 SmallPtrSet<BasicBlock *, 4> Visited;
21596 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
21597 InsertBB = InsertBB->getSinglePredecessor();
21598 return InsertBB && InsertBB == InstBB;
21599 };
21600 for (int I = 0, E = VL.size(); I < E; ++I) {
21601 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
21602 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
21603 isVectorized(Inst) ||
21604 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
21605 PostponedIndices.insert(I).second)
21606 PostponedInsts.emplace_back(Inst, I);
21607 }
21608
21609 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
21610 Type *Ty) {
21611 Value *Scalar = V;
21612 // Drop NUW from trunc to avoid incorrect codegen.
21613 Value *Trunced;
21614 if (match(Scalar, m_NUWTrunc(m_Value(Trunced))))
21615 cast<TruncInst>(Scalar)->setHasNoUnsignedWrap(/*B=*/false);
21616 if (Scalar->getType() != Ty) {
21617 assert(Scalar->getType()->isIntOrIntVectorTy() &&
21618 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
21619 Value *V = Scalar;
21620 if (auto *CI = dyn_cast<CastInst>(Scalar);
21622 Value *Op = CI->getOperand(0);
21623 if (auto *IOp = dyn_cast<Instruction>(Op);
21624 !IOp || !(isDeleted(IOp) || isVectorized(IOp)))
21625 V = Op;
21626 }
21627 Scalar = Builder.CreateIntCast(
21628 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
21629 }
21630
21631 Instruction *InsElt;
21632 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
21633 assert(SLPReVec && "FixedVectorType is not expected.");
21634 Vec =
21635 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
21636 auto *II = dyn_cast<Instruction>(Vec);
21637 if (!II)
21638 return Vec;
21639 InsElt = II;
21640 } else {
21641 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
21642 InsElt = dyn_cast<InsertElementInst>(Vec);
21643 if (!InsElt)
21644 return Vec;
21645 }
21646 GatherShuffleExtractSeq.insert(InsElt);
21647 CSEBlocks.insert(InsElt->getParent());
21648 // Add to our 'need-to-extract' list.
21649 if (isa<Instruction>(V)) {
21650 ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
21651 const auto *It = find_if(Entries, [&](const TreeEntry *E) {
21652 return !TransformedToGatherNodes.contains(E) &&
21653 !DeletedNodes.contains(E);
21654 });
21655 if (It != Entries.end()) {
21656 // Find which lane we need to extract.
21657 User *UserOp = nullptr;
21658 if (Scalar != V) {
21659 if (auto *SI = dyn_cast<Instruction>(Scalar))
21660 UserOp = SI;
21661 } else {
21662 if (V->getType()->isVectorTy()) {
21663 if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
21664 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
21665 // Find shufflevector, caused by resize.
21666 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
21667 if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
21668 if (SV->getOperand(0) == V)
21669 return SV;
21670 if (SV->getOperand(1) == V)
21671 return SV;
21672 }
21673 return nullptr;
21674 };
21675 InsElt = nullptr;
21676 if (Instruction *User = FindOperand(SV->getOperand(0), V))
21677 InsElt = User;
21678 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
21679 InsElt = User;
21680 assert(InsElt &&
21681 "Failed to find shufflevector, caused by resize.");
21682 } else if (SLPReVec && isa<ShuffleVectorInst>(InsElt)) {
21683 // ReVec gather used V directly as a shufflevector operand.
21684 // Register a nullptr-User external use so all remaining
21685 // in-IR uses of V get rewritten via replaceAllUsesWith,
21686 // and track V in ExternalUsesWithNonUsers to match the
21687 // bookkeeping done by buildExternalUses.
21688 unsigned FoundLane = (*It)->findLaneForValue(V);
21689 ExternalUses.emplace_back(V, nullptr, **It, FoundLane);
21690 ExternalUsesWithNonUsers.insert(V);
21691 }
21692 }
21693 UserOp = InsElt;
21694 }
21695 if (UserOp) {
21696 unsigned FoundLane = (*It)->findLaneForValue(V);
21697 ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
21698 }
21699 }
21700 }
21701 return Vec;
21702 };
21703 auto *VecTy = getWidenedType(ScalarTy, VL.size());
21704 Value *Vec = PoisonValue::get(VecTy);
21705 SmallVector<int> NonConsts;
21706 SmallVector<int> Mask(VL.size());
21707 std::iota(Mask.begin(), Mask.end(), 0);
21708 Value *OriginalRoot = Root;
21709 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
21710 SV && isa<PoisonValue>(SV->getOperand(1)) &&
21711 SV->getOperand(0)->getType() == VecTy) {
21712 Root = SV->getOperand(0);
21713 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
21714 }
21715 // Insert constant values at first.
21716 for (int I = 0, E = VL.size(); I < E; ++I) {
21717 if (PostponedIndices.contains(I))
21718 continue;
21719 if (!isConstant(VL[I])) {
21720 NonConsts.push_back(I);
21721 continue;
21722 }
21723 if (isa<PoisonValue>(VL[I]))
21724 continue;
21725 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
21726 Mask[I] = I + E;
21727 }
21728 if (Root) {
21729 if (isa<PoisonValue>(Vec)) {
21730 Vec = OriginalRoot;
21731 } else {
21732 Vec = CreateShuffle(Root, Vec, Mask);
21733 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
21734 OI && OI->use_empty() &&
21735 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
21736 return TE->VectorizedValue == OI;
21737 }))
21738 eraseInstruction(OI);
21739 }
21740 }
21741 // Insert non-constant values.
21742 for (int I : NonConsts)
21743 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
21744 // Append instructions, which are/may be part of the loop, in the end to make
21745 // it possible to hoist non-loop-based instructions.
21746 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
21747 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
21748
21749 return Vec;
21750}
21751
21752/// Merges shuffle masks and emits final shuffle instruction, if required. It
21753/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
21754/// when the actual shuffle instruction is generated only if this is actually
21755/// required. Otherwise, the shuffle instruction emission is delayed till the
21756/// end of the process, to reduce the number of emitted instructions and further
21757/// analysis/transformations.
21758/// The class also will look through the previously emitted shuffle instructions
21759/// and properly mark indices in mask as undef.
21760/// For example, given the code
21761/// \code
21762/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
21763/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
21764/// \endcode
21765/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
21766/// look through %s1 and %s2 and emit
21767/// \code
21768/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
21769/// \endcode
21770/// instead.
21771/// If the 2 operands are of different sizes, the smaller one will be resized
21772/// and the mask recalculated properly.
21773/// For example, given the code
21774/// \code
21775/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
21776/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
21777/// \endcode
21778/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
21779/// look through %s1 and %s2 and emit
21780/// \code
21781/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
21782/// \endcode
21783/// instead.
21784class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
21785 bool IsFinalized = false;
21786 /// Combined mask for all applied operands and masks. It is built during
21787 /// analysis and actual emission of shuffle vector instructions.
21788 SmallVector<int> CommonMask;
21789 /// List of operands for the shuffle vector instruction. It holds at most 2
21790 /// operands; if a 3rd one is going to be added, the first 2 are combined into
21791 /// a shuffle with the \p CommonMask mask, the first operand is set to the
21792 /// resulting shuffle and the second operand is set to the newly added
21793 /// operand. The \p CommonMask is transformed in the proper way after that.
21794 SmallVector<Value *, 2> InVectors;
21795 IRBuilderBase &Builder;
21796 BoUpSLP &R;
21797
21798 class ShuffleIRBuilder {
21799 IRBuilderBase &Builder;
21800 /// Holds all of the instructions that we gathered.
21801 SetVector<Instruction *> &GatherShuffleExtractSeq;
21802 /// A list of blocks that we are going to CSE.
21803 DenseSet<BasicBlock *> &CSEBlocks;
21804 /// Data layout.
21805 const DataLayout &DL;
21806
21807 public:
21808 ShuffleIRBuilder(IRBuilderBase &Builder,
21809 SetVector<Instruction *> &GatherShuffleExtractSeq,
21810 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
21811 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
21812 CSEBlocks(CSEBlocks), DL(DL) {}
21813 ~ShuffleIRBuilder() = default;
21814 /// Creates shufflevector for the 2 operands with the given mask.
21815 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
21816 if (V1->getType() != V2->getType()) {
21818 V1->getType()->isIntOrIntVectorTy() &&
21819 "Expected integer vector types only.");
21820 if (V1->getType() != V2->getType()) {
21821 if (cast<VectorType>(V2->getType())
21822 ->getElementType()
21823 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
21824 ->getElementType()
21825 ->getIntegerBitWidth())
21826 V2 = Builder.CreateIntCast(
21827 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
21828 else
21829 V1 = Builder.CreateIntCast(
21830 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
21831 }
21832 }
21833 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
21834 if (auto *I = dyn_cast<Instruction>(Vec)) {
21835 GatherShuffleExtractSeq.insert(I);
21836 CSEBlocks.insert(I->getParent());
21837 }
21838 return Vec;
21839 }
21840 /// Creates permutation of the single vector operand with the given mask, if
21841 /// it is not identity mask.
21842 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
21843 if (Mask.empty())
21844 return V1;
21845 unsigned VF = Mask.size();
21846 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
21847 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
21848 return V1;
21849 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
21850 if (auto *I = dyn_cast<Instruction>(Vec)) {
21851 GatherShuffleExtractSeq.insert(I);
21852 CSEBlocks.insert(I->getParent());
21853 }
21854 return Vec;
21855 }
21856 Value *createIdentity(Value *V) { return V; }
21857 Value *createPoison(Type *Ty, unsigned VF) {
21858 return PoisonValue::get(getWidenedType(Ty, VF));
21859 }
21860 /// Resizes 2 input vector to match the sizes, if the they are not equal
21861 /// yet. The smallest vector is resized to the size of the larger vector.
21862 void resizeToMatch(Value *&V1, Value *&V2) {
21863 if (V1->getType() == V2->getType())
21864 return;
21865 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
21866 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
21867 int VF = std::max(V1VF, V2VF);
21868 int MinVF = std::min(V1VF, V2VF);
21869 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
21870 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
21871 0);
21872 Value *&Op = MinVF == V1VF ? V1 : V2;
21873 Op = Builder.CreateShuffleVector(Op, IdentityMask);
21874 if (auto *I = dyn_cast<Instruction>(Op)) {
21875 GatherShuffleExtractSeq.insert(I);
21876 CSEBlocks.insert(I->getParent());
21877 }
21878 if (MinVF == V1VF)
21879 V1 = Op;
21880 else
21881 V2 = Op;
21882 }
21883 };
21884
21885 /// Smart shuffle instruction emission, walks through shuffles trees and
21886 /// tries to find the best matching vector for the actual shuffle
21887 /// instruction.
21888 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
21889 assert(V1 && "Expected at least one vector value.");
21890 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
21891 R.CSEBlocks, *R.DL);
21892 return BaseShuffleAnalysis::createShuffle<Value *>(
21893 V1, V2, Mask, ShuffleBuilder, ScalarTy);
21894 }
21895
21896 /// Cast value \p V to the vector type with the same number of elements, but
21897 /// the base type \p ScalarTy.
21898 Value *castToScalarTyElem(Value *V,
21899 std::optional<bool> IsSigned = std::nullopt) {
21900 auto *VecTy = cast<VectorType>(V->getType());
21901 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
21902 if (VecTy->getElementType() == ScalarTy->getScalarType())
21903 return V;
21904 return Builder.CreateIntCast(
21905 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
21906 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
21907 }
21908
21909 Value *getVectorizedValue(const TreeEntry &E) {
21910 Value *Vec = E.VectorizedValue;
21911 if (!Vec->getType()->isIntOrIntVectorTy())
21912 return Vec;
21913 return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
21914 return !isa<PoisonValue>(V) &&
21915 !isKnownNonNegative(
21916 V, SimplifyQuery(*R.DL));
21917 }));
21918 }
21919
21920public:
21922 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
21923
21924 /// Adjusts extractelements after reusing them.
21925 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
21926 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
21927 unsigned NumParts, bool &UseVecBaseAsInput) {
21928 UseVecBaseAsInput = false;
21929 SmallPtrSet<Value *, 4> UniqueBases;
21930 Value *VecBase = nullptr;
21931 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
21932 if (!E->ReorderIndices.empty()) {
21933 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
21934 E->ReorderIndices.end());
21935 reorderScalars(VL, ReorderMask);
21936 }
21937 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
21938 int Idx = Mask[I];
21939 if (Idx == PoisonMaskElem)
21940 continue;
21941 auto *EI = cast<ExtractElementInst>(VL[I]);
21942 VecBase = EI->getVectorOperand();
21943 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
21944 VecBase = TEs.front()->VectorizedValue;
21945 assert(VecBase && "Expected vectorized value.");
21946 UniqueBases.insert(VecBase);
21947 // If the only one use is vectorized - can delete the extractelement
21948 // itself.
21949 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
21950 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
21951 !R.isVectorized(EI) &&
21952 count_if(E->Scalars, [&](Value *V) { return V == EI; }) !=
21953 count_if(E->UserTreeIndex.UserTE->Scalars,
21954 [&](Value *V) { return V == EI; })) ||
21955 (NumParts != 1 && count(VL, EI) > 1) ||
21956 any_of(EI->users(), [&](User *U) {
21957 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
21958 return UTEs.empty() || UTEs.size() > 1 ||
21959 any_of(UTEs,
21960 [&](const TreeEntry *TE) {
21961 return R.DeletedNodes.contains(TE) ||
21962 R.TransformedToGatherNodes.contains(TE);
21963 }) ||
21965 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
21966 (!UTEs.empty() &&
21967 count_if(R.VectorizableTree,
21968 [&](const std::unique_ptr<TreeEntry> &TE) {
21969 return TE->UserTreeIndex.UserTE ==
21970 UTEs.front() &&
21971 is_contained(VL, EI);
21972 }) != 1);
21973 }))
21974 continue;
21975 R.eraseInstruction(EI);
21976 }
21977 if (NumParts == 1 || UniqueBases.size() == 1) {
21978 assert(VecBase && "Expected vectorized value.");
21979 return castToScalarTyElem(VecBase);
21980 }
21981 UseVecBaseAsInput = true;
21982 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
21983 for (auto [I, Idx] : enumerate(Mask))
21984 if (Idx != PoisonMaskElem)
21985 Idx = I;
21986 };
21987 // Perform multi-register vector shuffle, joining them into a single virtual
21988 // long vector.
21989 // Need to shuffle each part independently and then insert all this parts
21990 // into a long virtual vector register, forming the original vector.
21991 Value *Vec = nullptr;
21992 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
21993 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
21994 for (unsigned Part : seq<unsigned>(NumParts)) {
21995 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
21996 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
21997 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
21998 constexpr int MaxBases = 2;
21999 SmallVector<Value *, MaxBases> Bases(MaxBases);
22000 auto VLMask = zip(SubVL, SubMask);
22001 const unsigned VF =
22002 accumulate(VLMask, 0U, [&](unsigned S, const auto &D) {
22003 if (std::get<1>(D) == PoisonMaskElem)
22004 return S;
22005 Value *VecOp =
22006 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
22007 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
22008 !TEs.empty())
22009 VecOp = TEs.front()->VectorizedValue;
22010 assert(VecOp && "Expected vectorized value.");
22011 const unsigned Size =
22012 cast<FixedVectorType>(VecOp->getType())->getNumElements();
22013 return std::max(S, Size);
22014 });
22015 for (const auto [V, I] : VLMask) {
22016 if (I == PoisonMaskElem)
22017 continue;
22018 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
22019 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
22020 VecOp = TEs.front()->VectorizedValue;
22021 assert(VecOp && "Expected vectorized value.");
22022 VecOp = castToScalarTyElem(VecOp);
22023 Bases[I / VF] = VecOp;
22024 }
22025 if (!Bases.front())
22026 continue;
22027 Value *SubVec;
22028 if (Bases.back()) {
22029 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
22030 TransformToIdentity(SubMask);
22031 } else {
22032 SubVec = Bases.front();
22033 }
22034 if (!Vec) {
22035 Vec = SubVec;
22036 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
22037 [&](unsigned P) {
22038 ArrayRef<int> SubMask =
22039 Mask.slice(P * SliceSize,
22040 getNumElems(Mask.size(),
22041 SliceSize, P));
22042 return all_of(SubMask, [](int Idx) {
22043 return Idx == PoisonMaskElem;
22044 });
22045 })) &&
22046 "Expected first part or all previous parts masked.");
22047 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
22048 } else {
22049 unsigned NewVF =
22050 cast<FixedVectorType>(Vec->getType())->getNumElements();
22051 if (Vec->getType() != SubVec->getType()) {
22052 unsigned SubVecVF =
22053 cast<FixedVectorType>(SubVec->getType())->getNumElements();
22054 NewVF = std::max(NewVF, SubVecVF);
22055 }
22056 // Adjust SubMask.
22057 for (int &Idx : SubMask)
22058 if (Idx != PoisonMaskElem)
22059 Idx += NewVF;
22060 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
22061 Vec = createShuffle(Vec, SubVec, VecMask);
22062 TransformToIdentity(VecMask);
22063 }
22064 }
22065 copy(VecMask, Mask.begin());
22066 return Vec;
22067 }
22068 /// Checks if the specified entry \p E needs to be delayed because of its
22069 /// dependency nodes.
22070 std::optional<Value *>
22071 needToDelay(const TreeEntry *E,
22073 // No need to delay emission if all deps are ready.
22074 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
22075 return all_of(
22076 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
22077 }))
22078 return std::nullopt;
22079 // Postpone gather emission, will be emitted after the end of the
22080 // process to keep correct order.
22081 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
22082 return Builder.CreateAlignedLoad(
22083 ResVecTy,
22084 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
22085 MaybeAlign());
22086 }
22087 /// Reset the builder to handle perfect diamond match.
22089 IsFinalized = false;
22090 CommonMask.clear();
22091 InVectors.clear();
22092 }
22093 /// Adds 2 input vectors (in form of tree entries) and the mask for their
22094 /// shuffling.
22095 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
22096 Value *V1 = getVectorizedValue(E1);
22097 Value *V2 = getVectorizedValue(E2);
22098 add(V1, V2, Mask);
22099 }
22100 /// Adds single input vector (in form of tree entry) and the mask for its
22101 /// shuffling.
22102 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
22103 Value *V1 = getVectorizedValue(E1);
22104 add(V1, Mask);
22105 }
22106 /// Adds 2 input vectors and the mask for their shuffling.
22107 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
22108 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
22111 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
22112 V1 = castToScalarTyElem(V1);
22113 V2 = castToScalarTyElem(V2);
22114 if (InVectors.empty()) {
22115 InVectors.push_back(V1);
22116 InVectors.push_back(V2);
22117 CommonMask.assign(Mask.begin(), Mask.end());
22118 return;
22119 }
22120 Value *Vec = InVectors.front();
22121 if (InVectors.size() == 2) {
22122 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
22123 transformMaskAfterShuffle(CommonMask, CommonMask);
22124 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
22125 Mask.size()) {
22126 Vec = createShuffle(Vec, nullptr, CommonMask);
22127 transformMaskAfterShuffle(CommonMask, CommonMask);
22128 }
22129 V1 = createShuffle(V1, V2, Mask);
22130 unsigned VF = std::max(getVF(V1), getVF(Vec));
22131 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
22132 if (Mask[Idx] != PoisonMaskElem)
22133 CommonMask[Idx] = Idx + VF;
22134 InVectors.front() = Vec;
22135 if (InVectors.size() == 2)
22136 InVectors.back() = V1;
22137 else
22138 InVectors.push_back(V1);
22139 }
22140 /// Adds another one input vector and the mask for the shuffling.
22141 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
22143 "castToScalarTyElem expects V1 to be FixedVectorType");
22144 V1 = castToScalarTyElem(V1);
22145 if (InVectors.empty()) {
22146 InVectors.push_back(V1);
22147 CommonMask.assign(Mask.begin(), Mask.end());
22148 return;
22149 }
22150 const auto *It = find(InVectors, V1);
22151 if (It == InVectors.end()) {
22152 if (InVectors.size() == 2 ||
22153 InVectors.front()->getType() != V1->getType()) {
22154 Value *V = InVectors.front();
22155 if (InVectors.size() == 2) {
22156 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
22157 transformMaskAfterShuffle(CommonMask, CommonMask);
22158 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
22159 CommonMask.size()) {
22160 V = createShuffle(InVectors.front(), nullptr, CommonMask);
22161 transformMaskAfterShuffle(CommonMask, CommonMask);
22162 }
22163 unsigned VF = std::max(CommonMask.size(), Mask.size());
22164 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
22165 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
22166 CommonMask[Idx] = V->getType() != V1->getType()
22167 ? Idx + VF
22168 : Mask[Idx] + getVF(V1);
22169 if (V->getType() != V1->getType())
22170 V1 = createShuffle(V1, nullptr, Mask);
22171 InVectors.front() = V;
22172 if (InVectors.size() == 2)
22173 InVectors.back() = V1;
22174 else
22175 InVectors.push_back(V1);
22176 return;
22177 }
22178 // Check if second vector is required if the used elements are already
22179 // used from the first one.
22180 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
22181 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
22182 InVectors.push_back(V1);
22183 break;
22184 }
22185 }
22186 unsigned VF = 0;
22187 for (Value *V : InVectors)
22188 VF = std::max(VF, getVF(V));
22189 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
22190 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
22191 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
22192 }
22193 /// Adds another one input vector and the mask for the shuffling.
22195 SmallVector<int> NewMask;
22196 inversePermutation(Order, NewMask);
22197 add(V1, NewMask);
22198 }
22199 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
22200 Value *Root = nullptr) {
22201 return R.gather(VL, Root, ScalarTy,
22202 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
22203 return createShuffle(V1, V2, Mask);
22204 });
22205 }
22206 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
22207 /// Finalize emission of the shuffles.
22208 /// \param Action the action (if any) to be performed before final applying of
22209 /// the \p ExtMask mask.
22211 ArrayRef<int> ExtMask,
22212 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
22213 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
22216 Action = {}) {
22217 IsFinalized = true;
22218 if (Action) {
22219 Value *Vec = InVectors.front();
22220 if (InVectors.size() == 2) {
22221 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
22222 InVectors.pop_back();
22223 } else {
22224 Vec = createShuffle(Vec, nullptr, CommonMask);
22225 }
22226 transformMaskAfterShuffle(CommonMask, CommonMask);
22227 assert(VF > 0 &&
22228 "Expected vector length for the final value before action.");
22229 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
22230 if (VecVF < VF) {
22231 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
22232 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
22233 Vec = createShuffle(Vec, nullptr, ResizeMask);
22234 }
22235 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
22236 return createShuffle(V1, V2, Mask);
22237 });
22238 InVectors.front() = Vec;
22239 }
22240 if (!SubVectors.empty()) {
22241 Value *Vec = InVectors.front();
22242 if (InVectors.size() == 2) {
22243 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
22244 InVectors.pop_back();
22245 } else {
22246 Vec = createShuffle(Vec, nullptr, CommonMask);
22247 }
22248 transformMaskAfterShuffle(CommonMask, CommonMask);
22249 auto CreateSubVectors = [&](Value *Vec,
22250 SmallVectorImpl<int> &CommonMask) {
22251 for (auto [E, Idx] : SubVectors) {
22252 Value *V = getVectorizedValue(*E);
22253 unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
22254 // Use scalar version of the SCalarType to correctly handle shuffles
22255 // for revectorization. The revectorization mode operates by the
22256 // vectors, but here we need to operate on the scalars, because the
22257 // masks were already transformed for the vector elements and we don't
22258 // need doing this transformation again.
22259 Type *OrigScalarTy = ScalarTy;
22260 ScalarTy = ScalarTy->getScalarType();
22261 Vec = createInsertVector(
22262 Builder, Vec, V, InsertionIndex,
22263 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
22264 _3));
22265 ScalarTy = OrigScalarTy;
22266 if (!CommonMask.empty()) {
22267 std::iota(std::next(CommonMask.begin(), Idx),
22268 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
22269 Idx);
22270 }
22271 }
22272 return Vec;
22273 };
22274 if (SubVectorsMask.empty()) {
22275 Vec = CreateSubVectors(Vec, CommonMask);
22276 } else {
22277 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
22278 copy(SubVectorsMask, SVMask.begin());
22279 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
22280 if (I2 != PoisonMaskElem) {
22281 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
22282 I1 = I2 + CommonMask.size();
22283 }
22284 }
22285 Value *InsertVec =
22286 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
22287 Vec = createShuffle(InsertVec, Vec, SVMask);
22288 transformMaskAfterShuffle(CommonMask, SVMask);
22289 }
22290 InVectors.front() = Vec;
22291 }
22292
22293 if (!ExtMask.empty()) {
22294 if (CommonMask.empty()) {
22295 CommonMask.assign(ExtMask.begin(), ExtMask.end());
22296 } else {
22297 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
22298 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
22299 if (ExtMask[I] == PoisonMaskElem)
22300 continue;
22301 NewMask[I] = CommonMask[ExtMask[I]];
22302 }
22303 CommonMask.swap(NewMask);
22304 }
22305 }
22306 if (CommonMask.empty()) {
22307 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
22308 return InVectors.front();
22309 }
22310 if (InVectors.size() == 2)
22311 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
22312 return createShuffle(InVectors.front(), nullptr, CommonMask);
22313 }
22314
22316 assert((IsFinalized || CommonMask.empty()) &&
22317 "Shuffle construction must be finalized.");
22318 }
22319};
22320
22321Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
22322 return vectorizeTree(getOperandEntry(E, NodeIdx));
22323}
22324
22325template <typename BVTy, typename ResTy, typename... Args>
22326ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
22327 Args &...Params) {
22328 assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
22329 "Expected gather node.");
22330 unsigned VF = E->getVectorFactor();
22331
22332 bool NeedFreeze = false;
22333 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
22334 // Do not process split vectorize node, marked to be gathers/buildvectors.
22336 E->CombinedEntriesWithIndices.size());
22337 if (E->State == TreeEntry::SplitVectorize &&
22338 TransformedToGatherNodes.contains(E)) {
22339 SubVectors.clear();
22340 } else {
22341 // Clear values, to be replaced by insertvector instructions.
22342 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
22343 for_each(MutableArrayRef(GatheredScalars)
22344 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
22345 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
22346 transform(
22347 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
22348 return std::make_pair(VectorizableTree[P.first].get(), P.second);
22349 });
22350 }
22351 // Build a mask out of the reorder indices and reorder scalars per this
22352 // mask.
22353 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
22354 E->ReorderIndices.end());
22355 if (!ReorderMask.empty())
22356 reorderScalars(GatheredScalars, ReorderMask);
22357 SmallVector<int> SubVectorsMask;
22358 inversePermutation(E->ReorderIndices, SubVectorsMask);
22359 // Transform non-clustered elements in the mask to poison (-1).
22360 // "Clustered" operations will be reordered using this mask later.
22361 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
22362 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
22363 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
22364 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
22365 } else {
22366 SubVectorsMask.clear();
22367 }
22368 SmallVector<Value *> StoredGS(GatheredScalars);
22369 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
22370 unsigned I, unsigned SliceSize,
22371 bool IsNotPoisonous) {
22372 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
22373 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
22374 }))
22375 return false;
22376 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
22377 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
22378 if (UserTE->getNumOperands() != 2)
22379 return false;
22380 if (!IsNotPoisonous) {
22381 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
22382 [=](const std::unique_ptr<TreeEntry> &TE) {
22383 return TE->UserTreeIndex.UserTE == UserTE &&
22384 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
22385 });
22386 if (It == VectorizableTree.end())
22387 return false;
22388 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
22389 if (!(*It)->ReorderIndices.empty()) {
22390 inversePermutation((*It)->ReorderIndices, ReorderMask);
22391 reorderScalars(GS, ReorderMask);
22392 }
22393 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
22394 Value *V0 = std::get<0>(P);
22395 Value *V1 = std::get<1>(P);
22396 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
22397 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
22398 is_contained(E->Scalars, V1));
22399 }))
22400 return false;
22401 }
22402 int Idx;
22403 if ((Mask.size() < InputVF &&
22404 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
22405 Idx == 0) ||
22406 (Mask.size() == InputVF &&
22407 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
22408 std::iota(
22409 std::next(Mask.begin(), I * SliceSize),
22410 std::next(Mask.begin(),
22411 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
22412 0);
22413 } else {
22414 unsigned IVal =
22415 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
22416 std::fill(
22417 std::next(Mask.begin(), I * SliceSize),
22418 std::next(Mask.begin(),
22419 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
22420 IVal);
22421 }
22422 return true;
22423 };
22424 BVTy ShuffleBuilder(ScalarTy, Params...);
22425 ResTy Res = ResTy();
22426 SmallVector<int> Mask;
22427 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
22429 Value *ExtractVecBase = nullptr;
22430 bool UseVecBaseAsInput = false;
22433 Type *OrigScalarTy = GatheredScalars.front()->getType();
22434 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
22435 unsigned NumParts =
22436 ::getNumberOfParts(*TTI, VecTy, ScalarTy, GatheredScalars.size());
22437 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
22438 // Check for gathered extracts.
22439 bool Resized = false;
22440 ExtractShuffles =
22441 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
22442 if (!ExtractShuffles.empty()) {
22443 SmallVector<const TreeEntry *> ExtractEntries;
22444 for (auto [Idx, I] : enumerate(ExtractMask)) {
22445 if (I == PoisonMaskElem)
22446 continue;
22447 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
22448 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
22449 !TEs.empty())
22450 ExtractEntries.append(TEs.begin(), TEs.end());
22451 }
22452 if (std::optional<ResTy> Delayed =
22453 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
22454 // Delay emission of gathers which are not ready yet.
22455 PostponedGathers.insert(E);
22456 // Postpone gather emission, will be emitted after the end of the
22457 // process to keep correct order.
22458 return *Delayed;
22459 }
22460 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
22461 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
22462 ExtractVecBase = VecBase;
22463 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
22464 if (VF == VecBaseTy->getNumElements() &&
22465 GatheredScalars.size() != VF) {
22466 Resized = true;
22467 GatheredScalars.append(VF - GatheredScalars.size(),
22468 PoisonValue::get(OrigScalarTy));
22469 NumParts = ::getNumberOfParts(
22470 *TTI, getWidenedType(OrigScalarTy, VF), OrigScalarTy, VF);
22471 }
22472 }
22473 }
22474 // Gather extracts after we check for full matched gathers only.
22475 if (!ExtractShuffles.empty() || !E->hasState() ||
22476 E->getOpcode() != Instruction::Load ||
22477 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
22478 any_of(E->Scalars, IsaPred<LoadInst>)) &&
22479 any_of(E->Scalars,
22480 [this](Value *V) {
22481 return isa<LoadInst>(V) && isVectorized(V);
22482 })) ||
22483 (E->hasState() && E->isAltShuffle()) ||
22484 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
22485 isSplat(E->Scalars) ||
22486 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
22487 GatherShuffles =
22488 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
22489 }
22490 if (!GatherShuffles.empty()) {
22491 if (std::optional<ResTy> Delayed =
22492 ShuffleBuilder.needToDelay(E, Entries)) {
22493 // Delay emission of gathers which are not ready yet.
22494 PostponedGathers.insert(E);
22495 // Postpone gather emission, will be emitted after the end of the
22496 // process to keep correct order.
22497 return *Delayed;
22498 }
22499 if (GatherShuffles.size() == 1 &&
22500 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
22501 (Entries.front().front()->isSame(E->Scalars) ||
22502 E->isSame(Entries.front().front()->Scalars))) {
22503 // Perfect match in the graph, will reuse the previously vectorized
22504 // node. Cost is 0.
22505 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
22506 << shortBundleName(E->Scalars, E->Idx) << ".\n");
22507 // Restore the mask for previous partially matched values.
22508 Mask.resize(E->Scalars.size());
22509 const TreeEntry *FrontTE = Entries.front().front();
22510 if (FrontTE->ReorderIndices.empty() && E->ReorderIndices.empty() &&
22511 ((FrontTE->ReuseShuffleIndices.empty() &&
22512 E->Scalars.size() == FrontTE->Scalars.size()) ||
22513 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
22514 std::iota(Mask.begin(), Mask.end(), 0);
22515 } else {
22516 for (auto [I, V] : enumerate(E->Scalars)) {
22517 if (isa<PoisonValue>(V)) {
22518 Mask[I] = PoisonMaskElem;
22519 continue;
22520 }
22521 Mask[I] = FrontTE->findLaneForValue(V);
22522 }
22523 }
22524 // Reset the builder(s) to correctly handle perfect diamond matched
22525 // nodes.
22526 ShuffleBuilder.resetForSameNode();
22527 // Full matched entry found, no need to insert subvectors.
22528 if ((E->isSame(FrontTE->Scalars) &&
22529 FrontTE->ReuseShuffleIndices.empty() &&
22530 FrontTE->ReorderIndices.empty() &&
22531 E->getVectorFactor() == FrontTE->getVectorFactor()) ||
22532 (equal(E->Scalars, FrontTE->Scalars) &&
22533 equal(E->ReorderIndices, FrontTE->ReorderIndices) &&
22534 equal(E->ReuseShuffleIndices, FrontTE->ReuseShuffleIndices))) {
22535 Mask.resize(FrontTE->getVectorFactor());
22536 std::iota(Mask.begin(), Mask.end(), 0);
22537 ShuffleBuilder.add(*FrontTE, Mask);
22538 Res = ShuffleBuilder.finalize({}, {}, {});
22539 } else {
22540 ShuffleBuilder.add(*FrontTE, Mask);
22541 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
22542 }
22543 return Res;
22544 }
22545 if (!Resized) {
22546 if (GatheredScalars.size() != VF &&
22547 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
22548 return any_of(TEs, [&](const TreeEntry *TE) {
22549 return TE->getVectorFactor() == VF;
22550 });
22551 }))
22552 GatheredScalars.append(VF - GatheredScalars.size(),
22553 PoisonValue::get(OrigScalarTy));
22554 }
22555 // Remove shuffled elements from list of gathers.
22556 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
22557 if (Mask[I] != PoisonMaskElem)
22558 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
22559 }
22560 }
22561 }
22562 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
22563 SmallVectorImpl<int> &ReuseMask,
22564 bool IsRootPoison) {
22565 // For splats with can emit broadcasts instead of gathers, so try to find
22566 // such sequences.
22567 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
22568 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
22569 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
22570 SmallVector<int> UndefPos;
22571 DenseMap<Value *, unsigned> UniquePositions;
22572 // Gather unique non-const values and all constant values.
22573 // For repeated values, just shuffle them.
22574 int NumNonConsts = 0;
22575 int SinglePos = 0;
22576 for (auto [I, V] : enumerate(Scalars)) {
22577 if (isa<UndefValue>(V)) {
22578 if (!isa<PoisonValue>(V)) {
22579 ReuseMask[I] = I;
22580 UndefPos.push_back(I);
22581 }
22582 continue;
22583 }
22584 if (isConstant(V)) {
22585 ReuseMask[I] = I;
22586 continue;
22587 }
22588 ++NumNonConsts;
22589 SinglePos = I;
22590 Value *OrigV = V;
22591 Scalars[I] = PoisonValue::get(OrigScalarTy);
22592 if (IsSplat) {
22593 Scalars.front() = OrigV;
22594 ReuseMask[I] = 0;
22595 } else {
22596 const auto Res = UniquePositions.try_emplace(OrigV, I);
22597 Scalars[Res.first->second] = OrigV;
22598 ReuseMask[I] = Res.first->second;
22599 }
22600 }
22601 if (NumNonConsts == 1) {
22602 // Restore single insert element.
22603 if (IsSplat) {
22604 ReuseMask.assign(VF, PoisonMaskElem);
22605 std::swap(Scalars.front(), Scalars[SinglePos]);
22606 if (!UndefPos.empty() && UndefPos.front() == 0)
22607 Scalars.front() = UndefValue::get(OrigScalarTy);
22608 }
22609 ReuseMask[SinglePos] = SinglePos;
22610 } else if (!UndefPos.empty() && IsSplat) {
22611 // For undef values, try to replace them with the simple broadcast.
22612 // We can do it if the broadcasted value is guaranteed to be
22613 // non-poisonous, or by freezing the incoming scalar value first.
22614 auto *It = find_if(Scalars, [this, E](Value *V) {
22615 return !isa<UndefValue>(V) &&
22617 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
22618 // Check if the value already used in the same operation in
22619 // one of the nodes already.
22620 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
22621 is_contained(E->UserTreeIndex.UserTE->Scalars,
22622 U.getUser());
22623 })));
22624 });
22625 if (It != Scalars.end()) {
22626 // Replace undefs by the non-poisoned scalars and emit broadcast.
22627 int Pos = std::distance(Scalars.begin(), It);
22628 for (int I : UndefPos) {
22629 // Set the undef position to the non-poisoned scalar.
22630 ReuseMask[I] = Pos;
22631 // Replace the undef by the poison, in the mask it is replaced by
22632 // non-poisoned scalar already.
22633 if (I != Pos)
22634 Scalars[I] = PoisonValue::get(OrigScalarTy);
22635 }
22636 } else {
22637 // Replace undefs by the poisons, emit broadcast and then emit
22638 // freeze.
22639 for (int I : UndefPos) {
22640 ReuseMask[I] = PoisonMaskElem;
22641 if (isa<UndefValue>(Scalars[I]))
22642 Scalars[I] = PoisonValue::get(OrigScalarTy);
22643 }
22644 NeedFreeze = true;
22645 }
22646 }
22647 };
22648 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
22649 bool IsNonPoisoned = true;
22650 bool IsUsedInExpr = true;
22651 Value *Vec1 = nullptr;
22652 if (!ExtractShuffles.empty()) {
22653 // Gather of extractelements can be represented as just a shuffle of
22654 // a single/two vectors the scalars are extracted from.
22655 // Find input vectors.
22656 Value *Vec2 = nullptr;
22657 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
22658 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
22659 ExtractMask[I] = PoisonMaskElem;
22660 }
22661 if (UseVecBaseAsInput) {
22662 Vec1 = ExtractVecBase;
22663 } else {
22664 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
22665 if (ExtractMask[I] == PoisonMaskElem)
22666 continue;
22667 if (isa<UndefValue>(StoredGS[I]))
22668 continue;
22669 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
22670 Value *VecOp = EI->getVectorOperand();
22671 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
22672 !TEs.empty() && TEs.front()->VectorizedValue)
22673 VecOp = TEs.front()->VectorizedValue;
22674 if (!Vec1) {
22675 Vec1 = VecOp;
22676 } else if (Vec1 != VecOp) {
22677 assert((!Vec2 || Vec2 == VecOp) &&
22678 "Expected only 1 or 2 vectors shuffle.");
22679 Vec2 = VecOp;
22680 }
22681 }
22682 }
22683 if (Vec2) {
22684 IsUsedInExpr = false;
22685 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
22686 isGuaranteedNotToBePoison(Vec2, AC);
22687 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
22688 } else if (Vec1) {
22689 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
22690 IsUsedInExpr &= FindReusedSplat(
22691 ExtractMask,
22692 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
22693 ExtractMask.size(), IsNotPoisonedVec);
22694 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
22695 IsNonPoisoned &= IsNotPoisonedVec;
22696 } else {
22697 IsUsedInExpr = false;
22698 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
22699 /*ForExtracts=*/true);
22700 }
22701 }
22702 if (!GatherShuffles.empty()) {
22703 unsigned SliceSize = getPartNumElems(VF, NumParts);
22704 if (Mask.size() == E->Scalars.size())
22705 SliceSize = getPartNumElems(
22706 E->Scalars.size(),
22707 ::getNumberOfParts(*TTI, VecTy, ScalarTy, E->Scalars.size()));
22708 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
22709 for (const auto [I, TEs] : enumerate(Entries)) {
22710 if (TEs.empty()) {
22711 assert(!GatherShuffles[I] &&
22712 "No shuffles with empty entries list expected.");
22713 continue;
22714 }
22715 assert((TEs.size() == 1 || TEs.size() == 2) &&
22716 "Expected shuffle of 1 or 2 entries.");
22717 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
22718 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
22719 VecMask.assign(VecMask.size(), PoisonMaskElem);
22720 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
22721 if (TEs.size() == 1) {
22722 bool IsNotPoisonedVec =
22723 TEs.front()->VectorizedValue
22724 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
22725 : true;
22726 IsUsedInExpr &=
22727 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
22728 SliceSize, IsNotPoisonedVec);
22729 ShuffleBuilder.add(*TEs.front(), VecMask);
22730 IsNonPoisoned &= IsNotPoisonedVec;
22731 } else {
22732 IsUsedInExpr = false;
22733 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
22734 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
22735 IsNonPoisoned &=
22736 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
22737 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
22738 }
22739 }
22740 }
22741 // Try to figure out best way to combine values: build a shuffle and insert
22742 // elements or just build several shuffles.
22743 // Insert non-constant scalars.
22744 SmallVector<Value *> NonConstants(GatheredScalars);
22745 int EMSz = ExtractMask.size();
22746 int MSz = Mask.size();
22747 // Try to build constant vector and shuffle with it only if currently we
22748 // have a single permutation and more than 1 scalar constants.
22749 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
22750 bool IsIdentityShuffle =
22751 ((UseVecBaseAsInput ||
22752 all_of(ExtractShuffles,
22753 [](const std::optional<TTI::ShuffleKind> &SK) {
22754 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
22756 })) &&
22757 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
22758 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
22759 (!GatherShuffles.empty() &&
22760 all_of(GatherShuffles,
22761 [](const std::optional<TTI::ShuffleKind> &SK) {
22762 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
22764 }) &&
22765 none_of(Mask, [&](int I) { return I >= MSz; }) &&
22767 bool EnoughConstsForShuffle =
22768 IsSingleShuffle &&
22769 (none_of(GatheredScalars,
22770 [](Value *V) {
22771 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
22772 }) ||
22773 any_of(GatheredScalars,
22774 [](Value *V) {
22775 return isa<Constant>(V) && !isa<UndefValue>(V);
22776 })) &&
22777 (!IsIdentityShuffle ||
22778 (GatheredScalars.size() == 2 &&
22779 any_of(GatheredScalars,
22780 [](Value *V) { return !isa<UndefValue>(V); })) ||
22781 count_if(GatheredScalars, [](Value *V) {
22782 return isa<Constant>(V) && !isa<PoisonValue>(V);
22783 }) > 1);
22784 // NonConstants array contains just non-constant values, GatheredScalars
22785 // contains only constant to build final vector and then shuffle.
22786 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
22787 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
22788 NonConstants[I] = PoisonValue::get(OrigScalarTy);
22789 else
22790 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
22791 }
22792 // Generate constants for final shuffle and build a mask for them.
22793 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
22794 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
22795 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
22796 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
22797 ShuffleBuilder.add(BV, BVMask);
22798 }
22799 if (all_of(NonConstants, [=](Value *V) {
22800 return isa<PoisonValue>(V) ||
22801 (IsSingleShuffle && ((IsIdentityShuffle &&
22802 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
22803 }))
22804 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
22805 SubVectorsMask);
22806 else
22807 Res = ShuffleBuilder.finalize(
22808 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
22809 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
22810 bool IsSplat = isSplat(NonConstants);
22811 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
22812 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
22813 auto CheckIfSplatIsProfitable = [&]() {
22814 // Estimate the cost of splatting + shuffle and compare with
22815 // insert + shuffle.
22816 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
22817 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
22818 if (isa<ExtractElementInst>(V) || isVectorized(V))
22819 return false;
22820 InstructionCost SplatCost = TTI->getVectorInstrCost(
22821 Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
22822 PoisonValue::get(VecTy), V);
22823 SmallVector<int> NewMask(Mask.begin(), Mask.end());
22824 for (auto [Idx, I] : enumerate(BVMask))
22825 if (I != PoisonMaskElem)
22826 NewMask[Idx] = Mask.size();
22827 SplatCost +=
22828 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
22829 cast<VectorType>(VecTy), NewMask, CostKind);
22830 InstructionCost BVCost = TTI->getVectorInstrCost(
22831 Instruction::InsertElement, VecTy, CostKind,
22832 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
22833 // Shuffle required?
22834 if (count(BVMask, PoisonMaskElem) <
22835 static_cast<int>(BVMask.size() - 1)) {
22836 SmallVector<int> NewMask(Mask.begin(), Mask.end());
22837 for (auto [Idx, I] : enumerate(BVMask))
22838 if (I != PoisonMaskElem)
22839 NewMask[Idx] = I;
22840 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
22841 cast<VectorType>(VecTy), NewMask,
22842 CostKind);
22843 }
22844 return SplatCost <= BVCost;
22845 };
22846 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
22847 for (auto [Idx, I] : enumerate(BVMask))
22848 if (I != PoisonMaskElem)
22849 Mask[Idx] = I;
22850 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
22851 } else {
22852 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
22853 SmallVector<Value *> Values(NonConstants.size(),
22854 PoisonValue::get(ScalarTy));
22855 Values[0] = V;
22856 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
22857 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
22858 transform(BVMask, SplatMask.begin(), [](int I) {
22859 return I == PoisonMaskElem ? PoisonMaskElem : 0;
22860 });
22861 if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
22862 BV = CreateShuffle(BV, nullptr, SplatMask);
22863 for (auto [Idx, I] : enumerate(BVMask))
22864 if (I != PoisonMaskElem)
22865 Mask[Idx] = BVMask.size() + Idx;
22866 Vec = CreateShuffle(Vec, BV, Mask);
22867 for (auto [Idx, I] : enumerate(Mask))
22868 if (I != PoisonMaskElem)
22869 Mask[Idx] = Idx;
22870 }
22871 });
22872 } else if (!allConstant(GatheredScalars)) {
22873 // Gather unique scalars and all constants.
22874 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
22875 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
22876 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
22877 ShuffleBuilder.add(BV, ReuseMask);
22878 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
22879 SubVectorsMask);
22880 } else {
22881 // Gather all constants.
22882 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
22883 for (auto [I, V] : enumerate(GatheredScalars)) {
22884 if (!isa<PoisonValue>(V))
22885 Mask[I] = I;
22886 }
22887 Value *BV = ShuffleBuilder.gather(GatheredScalars);
22888 ShuffleBuilder.add(BV, Mask);
22889 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
22890 SubVectorsMask);
22891 }
22892
22893 if (NeedFreeze)
22894 Res = ShuffleBuilder.createFreeze(Res);
22895 return Res;
22896}
22897
22898Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
22899 // Do not do this for split vectorize node, marked to be gathers/buildvectors.
22900 if (E->State != TreeEntry::SplitVectorize ||
22901 !TransformedToGatherNodes.contains(E)) {
22902 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
22903 (void)vectorizeTree(VectorizableTree[EIdx].get());
22904 }
22905 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
22906 Builder, *this);
22907}
22908
22909/// \returns \p I after propagating metadata from \p VL only for instructions in
22910/// \p VL.
22913 for (Value *V : VL)
22914 if (isa<Instruction>(V))
22915 Insts.push_back(V);
22916 return llvm::propagateMetadata(Inst, Insts);
22917}
22918
22920 if (DebugLoc DL = PN.getDebugLoc())
22921 return DL;
22922 return DebugLoc::getUnknown();
22923}
22924
22925Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
22926 IRBuilderBase::InsertPointGuard Guard(Builder);
22927
22928 Value *V = E->Scalars.front();
22929 Type *ScalarTy = getValueType(V);
22930 auto It = MinBWs.find(E);
22931 if (It != MinBWs.end()) {
22932 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
22933 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
22934 if (VecTy)
22935 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
22936 }
22937 if (E->VectorizedValue)
22938 return E->VectorizedValue;
22939 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
22940 if (E->isGather() || TransformedToGatherNodes.contains(E)) {
22941 // Set insert point for non-reduction initial nodes.
22942 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
22943 setInsertPointAfterBundle(E);
22944 Value *Vec = createBuildVector(E, ScalarTy);
22945 E->VectorizedValue = Vec;
22946 return Vec;
22947 }
22948 if (E->State == TreeEntry::SplitVectorize) {
22949 assert(E->CombinedEntriesWithIndices.size() == 2 &&
22950 "Expected exactly 2 combined entries.");
22951 setInsertPointAfterBundle(E);
22952 TreeEntry &OpTE1 =
22953 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
22954 assert(OpTE1.isSame(
22955 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
22956 "Expected same first part of scalars.");
22957 Value *Op1 = vectorizeTree(&OpTE1);
22958 TreeEntry &OpTE2 =
22959 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
22960 assert(
22961 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
22962 "Expected same second part of scalars.");
22963 Value *Op2 = vectorizeTree(&OpTE2);
22964 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
22965 bool IsSigned = false;
22966 auto It = MinBWs.find(OpE);
22967 if (It != MinBWs.end())
22968 IsSigned = It->second.second;
22969 else
22970 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
22971 if (isa<PoisonValue>(V))
22972 return false;
22973 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22974 });
22975 return IsSigned;
22976 };
22977 if (cast<VectorType>(Op1->getType())->getElementType() !=
22978 ScalarTy->getScalarType()) {
22979 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
22980 Op1 = Builder.CreateIntCast(
22981 Op1,
22983 ScalarTy,
22984 cast<FixedVectorType>(Op1->getType())->getNumElements()),
22985 GetOperandSignedness(&OpTE1));
22986 }
22987 if (cast<VectorType>(Op2->getType())->getElementType() !=
22988 ScalarTy->getScalarType()) {
22989 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
22990 Op2 = Builder.CreateIntCast(
22991 Op2,
22993 ScalarTy,
22994 cast<FixedVectorType>(Op2->getType())->getNumElements()),
22995 GetOperandSignedness(&OpTE2));
22996 }
22997 if (E->ReorderIndices.empty()) {
22998 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
22999 std::iota(
23000 Mask.begin(),
23001 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
23002 0);
23003 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
23004 if (ScalarTyNumElements != 1) {
23005 assert(SLPReVec && "Only supported by REVEC.");
23006 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, Mask);
23007 }
23008 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
23009 Vec = createInsertVector(Builder, Vec, Op2,
23010 E->CombinedEntriesWithIndices.back().second *
23011 ScalarTyNumElements);
23012 E->VectorizedValue = Vec;
23013 return Vec;
23014 }
23015 unsigned CommonVF =
23016 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
23017 const unsigned Scale = getNumElements(ScalarTy);
23018 CommonVF *= Scale;
23019 if (getNumElements(Op1->getType()) != CommonVF) {
23020 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
23021 copy(createReplicatedMask(Scale, OpTE1.getVectorFactor() * Scale),
23022 Mask.begin());
23023 Op1 = Builder.CreateShuffleVector(Op1, Mask);
23024 }
23025 if (getNumElements(Op2->getType()) != CommonVF) {
23026 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
23027 copy(createReplicatedMask(Scale, OpTE2.getVectorFactor() * Scale),
23028 Mask.begin());
23029 Op2 = Builder.CreateShuffleVector(Op2, Mask);
23030 }
23031 Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
23032 E->VectorizedValue = Vec;
23033 return Vec;
23034 }
23035
23036 bool IsReverseOrder =
23037 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
// Applies the reordering / reuse shuffles recorded on tree entry E to the
// vectorized value V and returns the value produced by
// ShuffleInstructionBuilder::finalize (or V itself for struct types).
// Note that the parameter E shadows the entry of the enclosing function,
// while IsReverseOrder is captured from the enclosing scope and was computed
// from the enclosing entry's ReorderIndices.
23038 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
// Struct-typed values are returned untouched; both asserts document that no
// reordering or reuse mask may be attached to such an entry.
23039 if (isa<StructType>(ScalarTy)) {
23040 // TODO: Reordering of struct types is not supported.
23041 assert(E->ReorderIndices.empty() &&
23042 "Expected no reordering for struct types.");
23043 assert(E->ReuseShuffleIndices.empty() &&
23044 "Expected no reuse shuffle indices for struct types.");
23045 return V;
23046 }
23047 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
// Reverse-order strided entries and compress entries are added with an empty
// order, i.e. E->ReorderIndices is not applied here.
23048 if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
23049 E->State == TreeEntry::CompressVectorize) {
23050 ShuffleBuilder.addOrdered(V, {});
// For stores (Vectorize or StridedVectorize state) ReorderIndices is handed
// to add() directly as a shuffle mask instead of going through addOrdered().
23051 } else if (E->getOpcode() == Instruction::Store &&
23052 (E->State == TreeEntry::Vectorize ||
23053 E->State == TreeEntry::StridedVectorize)) {
// NOTE(review): the reinterpret_cast presumes the elements of ReorderIndices
// have the same size and layout as int - confirm against its declaration.
23054 ArrayRef<int> Mask =
23055 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
23056 E->ReorderIndices.size());
23057 ShuffleBuilder.add(V, Mask);
23058 } else {
23059 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
23060 }
// NOTE(review): the listing skips number 23061 here; the line that declares
// SubVectors (sized by the expression below) is missing from this view.
23062 E->CombinedEntriesWithIndices.size());
// Translate every (tree index, second) pair of CombinedEntriesWithIndices
// into a (TreeEntry pointer, second) pair.
23063 transform(
23064 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
23065 return std::make_pair(VectorizableTree[P.first].get(), P.second);
23066 });
// An entry may carry combined sub-entries or a reordering, never both.
23067 assert(
23068 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
23069 "Expected either combined subnodes or reordering");
23070 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
23071 };
23072
23073 assert(!E->isGather() && "Unhandled state");
23074 unsigned ShuffleOrOp =
23075 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
23076 if (!E->isAltShuffle()) {
23077 switch (E->CombinedOp) {
23078 case TreeEntry::ReducedBitcast:
23079 case TreeEntry::ReducedBitcastBSwap:
23080 case TreeEntry::ReducedBitcastLoads:
23081 case TreeEntry::ReducedBitcastBSwapLoads:
23082 case TreeEntry::ReducedCmpBitcast:
23083 ShuffleOrOp = E->CombinedOp;
23084 break;
23085 default:
23086 break;
23087 }
23088 }
23089 Instruction *VL0 = E->getMainOp();
23090 auto GetOperandSignedness = [&](unsigned Idx) {
23091 const TreeEntry *OpE = getOperandEntry(E, Idx);
23092 bool IsSigned = false;
23093 auto It = MinBWs.find(OpE);
23094 if (It != MinBWs.end())
23095 IsSigned = It->second.second;
23096 else
23097 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
23098 if (isa<PoisonValue>(V))
23099 return false;
23100 return !isKnownNonNegative(R, SimplifyQuery(*DL));
23101 });
23102 return IsSigned;
23103 };
23104 auto PropagateIRFlags = [&](Value *V, unsigned Opcode = 0,
23105 ArrayRef<Value *> VL = {}) -> Value * {
23106 ArrayRef<Value *> Scalars = VL.empty() ? E->Scalars : VL;
23107 SmallSetVector<Value *, 4> UniqueInsts;
23108 for (Value *Scalar : Scalars) {
23109 auto *I = dyn_cast<Instruction>(Scalar);
23110 if (!I)
23111 continue;
23112 if (E->hasCopyableElements() && E->isCopyableElement(I))
23113 continue;
23114 UniqueInsts.insert(I);
23115 }
23116 if (!Opcode)
23117 Opcode = E->getOpcode();
23118 propagateIRFlags(V, UniqueInsts.getArrayRef(), nullptr, It == MinBWs.end());
23119 auto *I = dyn_cast<Instruction>(V);
23120 if (!I)
23121 return V;
23122 I = ::propagateMetadata(I, UniqueInsts.getArrayRef());
23123 // For copyable elements the lane is synthesized using a binop identity
23124 // value, so the operand at that lane is the copyable scalar's value.
23125 // fast-math flags that turn defined NaN/Inf inputs into poison (nnan,
23126 // ninf) are only justified if every copyable scalar at the synthesized
23127 // lane is itself provably non-NaN/non-Inf - either an FPMathOperator
23128 // with the matching flag set, or a constant FP that is not NaN/Inf.
23129 if (E->hasCopyableElements() && isa<FPMathOperator>(I)) {
23130 bool AllNoNaNs = true;
23131 bool AllNoInfs = true;
23132 for (Value *Scalar : Scalars) {
23133 if (!E->isCopyableElement(Scalar))
23134 continue;
23135 if (auto *FPMO = dyn_cast<FPMathOperator>(Scalar)) {
23136 AllNoNaNs &= FPMO->hasNoNaNs();
23137 AllNoInfs &= FPMO->hasNoInfs();
23138 continue;
23139 }
23140 if (auto *CFP = dyn_cast<ConstantFP>(Scalar)) {
23141 AllNoNaNs &= !CFP->isNaN();
23142 AllNoInfs &= !CFP->isInfinity();
23143 continue;
23144 }
23145 AllNoNaNs = false;
23146 AllNoInfs = false;
23147 break;
23148 }
23149 if (!AllNoNaNs)
23150 I->setHasNoNaNs(false);
23151 if (!AllNoInfs)
23152 I->setHasNoInfs(false);
23153 }
23154 // Drop nuw flags for abs(sub(commutative), true).
23155 if (!MinBWs.contains(E) && Opcode == Instruction::Sub &&
23156 (E->hasCopyableElements() || any_of(Scalars, [](Value *Scalar) {
23157 auto *SI = dyn_cast<Instruction>(Scalar);
23158 return !SI || isCommutative(SI);
23159 })))
23160 I->setHasNoUnsignedWrap(/*b=*/false);
23161 if (auto *ICmp = dyn_cast<ICmpInst>(I); ICmp && It == MinBWs.end())
23162 ICmp->setSameSign(/*B=*/false);
23163 return I;
23164 };
23165 switch (ShuffleOrOp) {
23166 case Instruction::PHI: {
23167 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
23168 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
23169 "PHI reordering is free.");
23170 auto *PH = cast<PHINode>(VL0);
23171 Builder.SetInsertPoint(PH->getParent(),
23172 PH->getParent()->getFirstNonPHIIt());
23173 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
23174 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
23175 Value *V = NewPhi;
23176
23177 // Adjust insertion point once all PHI's have been generated.
23178 Builder.SetInsertPoint(PH->getParent(),
23179 PH->getParent()->getFirstInsertionPt());
23180 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
23181
23182 V = FinalShuffle(V, E);
23183
23184 E->VectorizedValue = V;
23185 // If phi node is fully emitted - exit.
23186 if (NewPhi->getNumIncomingValues() != 0)
23187 return NewPhi;
23188
23189 // PHINodes may have multiple entries from the same block. We want to
23190 // visit every block once.
23191 SmallDenseMap<BasicBlock *, unsigned, 4> VisitedBBs;
23192 for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
23193 BasicBlock *IBB = PH->getIncomingBlock(I);
23194
23195 // Stop emission if all incoming values are generated.
23196 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
23197 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
23198 return NewPhi;
23199 }
23200
23201 auto Res = VisitedBBs.try_emplace(IBB, I);
23202 if (!Res.second) {
23203 TreeEntry *OpTE = getOperandEntry(E, I);
23204 if (OpTE->isGather() || DeletedNodes.contains(OpTE) ||
23205 TransformedToGatherNodes.contains(OpTE)) {
23206 Value *VecOp = NewPhi->getIncomingValue(Res.first->getSecond());
23207 NewPhi->addIncoming(VecOp, IBB);
23208 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
23209 OpTE->VectorizedValue = VecOp;
23210 continue;
23211 }
23212 }
23213
23214 Builder.SetInsertPoint(IBB->getTerminator());
23215 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
23216 Value *Vec = vectorizeOperand(E, I);
23217 if (VecTy != Vec->getType()) {
23218 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
23219 MinBWs.contains(getOperandEntry(E, I))) &&
23220 "Expected item in MinBWs.");
23221 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
23222 }
23223 NewPhi->addIncoming(Vec, IBB);
23224 }
23225
23226 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
23227 "Invalid number of incoming values");
23228 assert(E->VectorizedValue && "Expected vectorized value.");
23229 return E->VectorizedValue;
23230 }
23231
23232 case Instruction::ExtractElement: {
23233 Value *V = E->getSingleOperand(0);
23234 setInsertPointAfterBundle(E);
23235 V = FinalShuffle(V, E);
23236 E->VectorizedValue = V;
23237 return V;
23238 }
23239 case Instruction::ExtractValue: {
23240 if (!E->StructEVIndices.empty()) {
23241 setInsertPointAfterBundle(E);
23242 Value *V = vectorizeOperand(E, 0);
23243 V = Builder.CreateExtractValue(V, E->StructEVIndices);
23244 if (auto *I = dyn_cast<Instruction>(V))
23245 V = ::propagateMetadata(I, E->Scalars);
23246 V = FinalShuffle(V, E);
23247 E->VectorizedValue = V;
23248 return V;
23249 }
23250 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
23251 Builder.SetInsertPoint(LI);
23252 Value *Ptr = LI->getPointerOperand();
23253 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
23254 Value *NewV = PropagateIRFlags(V);
23255 NewV = FinalShuffle(NewV, E);
23256 E->VectorizedValue = NewV;
23257 return NewV;
23258 }
23259 case Instruction::InsertElement: {
23260 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
23261 if (const TreeEntry *OpE = getOperandEntry(E, 1);
23262 OpE && !OpE->isGather() && OpE->hasState() &&
23263 !OpE->hasCopyableElements())
23264 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
23265 else
23266 setInsertPointAfterBundle(E);
23267 Value *V = vectorizeOperand(E, 1);
23268 ArrayRef<Value *> Op = E->getOperand(1);
23269 Type *ScalarTy = Op.front()->getType();
23270 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
23271 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
23272 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
23273 assert(Res.first > 0 && "Expected item in MinBWs.");
23274 V = Builder.CreateIntCast(
23275 V,
23277 ScalarTy,
23278 cast<FixedVectorType>(V->getType())->getNumElements()),
23279 Res.second);
23280 }
23281
23282 // Create InsertVector shuffle if necessary
23283 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
23284 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
23285 }));
23286 const unsigned NumElts =
23287 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
23288 const unsigned NumScalars = E->Scalars.size();
23289
23290 unsigned Offset = *getElementIndex(VL0);
23291 assert(Offset < NumElts && "Failed to find vector index offset");
23292
23293 // Create shuffle to resize vector
23294 SmallVector<int> Mask;
23295 if (!E->ReorderIndices.empty()) {
23296 inversePermutation(E->ReorderIndices, Mask);
23297 Mask.append(NumElts - NumScalars, PoisonMaskElem);
23298 } else {
23299 Mask.assign(NumElts, PoisonMaskElem);
23300 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
23301 }
23302 // Create InsertVector shuffle if necessary
23303 bool IsIdentity = true;
23304 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
23305 Mask.swap(PrevMask);
23306 for (unsigned I = 0; I < NumScalars; ++I) {
23307 Value *Scalar = E->Scalars[PrevMask[I]];
23308 unsigned InsertIdx = *getElementIndex(Scalar);
23309 IsIdentity &= InsertIdx - Offset == I;
23310 Mask[InsertIdx - Offset] = I;
23311 }
23312 if (!IsIdentity || NumElts != NumScalars) {
23313 Value *V2 = nullptr;
23314 bool IsVNonPoisonous =
23316 SmallVector<int> InsertMask(Mask);
23317 if (NumElts != NumScalars && Offset == 0) {
23318 // Follow all insert element instructions from the current buildvector
23319 // sequence.
23320 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
23321 do {
23322 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
23323 if (!InsertIdx)
23324 break;
23325 if (InsertMask[*InsertIdx] == PoisonMaskElem)
23326 InsertMask[*InsertIdx] = *InsertIdx;
23327 if (!Ins->hasOneUse())
23328 break;
23331 } while (Ins);
23332 SmallBitVector UseMask =
23333 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
23334 SmallBitVector IsFirstPoison =
23335 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
23336 SmallBitVector IsFirstUndef =
23337 isUndefVector(FirstInsert->getOperand(0), UseMask);
23338 if (!IsFirstPoison.all()) {
23339 unsigned Idx = 0;
23340 for (unsigned I = 0; I < NumElts; I++) {
23341 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
23342 IsFirstUndef.test(I)) {
23343 if (IsVNonPoisonous) {
23344 InsertMask[I] = I < NumScalars ? I : 0;
23345 continue;
23346 }
23347 if (!V2)
23348 V2 = UndefValue::get(V->getType());
23349 if (Idx >= NumScalars)
23350 Idx = NumScalars - 1;
23351 InsertMask[I] = NumScalars + Idx;
23352 ++Idx;
23353 } else if (InsertMask[I] != PoisonMaskElem &&
23354 Mask[I] == PoisonMaskElem) {
23355 InsertMask[I] = PoisonMaskElem;
23356 }
23357 }
23358 } else {
23359 InsertMask = Mask;
23360 }
23361 }
23362 if (!V2)
23363 V2 = PoisonValue::get(V->getType());
23364 V = Builder.CreateShuffleVector(V, V2, InsertMask);
23365 if (auto *I = dyn_cast<Instruction>(V)) {
23366 GatherShuffleExtractSeq.insert(I);
23367 CSEBlocks.insert(I->getParent());
23368 }
23369 }
23370
23371 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
23372 for (unsigned I = 0; I < NumElts; I++) {
23373 if (Mask[I] != PoisonMaskElem)
23374 InsertMask[Offset + I] = I;
23375 }
23376 SmallBitVector UseMask =
23377 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
23378 SmallBitVector IsFirstUndef =
23379 isUndefVector(FirstInsert->getOperand(0), UseMask);
23380 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
23381 NumElts != NumScalars) {
23382 if (IsFirstUndef.all()) {
23383 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
23384 SmallBitVector IsFirstPoison =
23385 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
23386 if (!IsFirstPoison.all()) {
23387 for (unsigned I = 0; I < NumElts; I++) {
23388 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
23389 InsertMask[I] = I + NumElts;
23390 }
23391 }
23392 V = Builder.CreateShuffleVector(
23393 V,
23394 IsFirstPoison.all() ? PoisonValue::get(V->getType())
23395 : FirstInsert->getOperand(0),
23396 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
23397 if (auto *I = dyn_cast<Instruction>(V)) {
23398 GatherShuffleExtractSeq.insert(I);
23399 CSEBlocks.insert(I->getParent());
23400 }
23401 }
23402 } else {
23403 SmallBitVector IsFirstPoison =
23404 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
23405 for (unsigned I = 0; I < NumElts; I++) {
23406 if (InsertMask[I] == PoisonMaskElem)
23407 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
23408 else
23409 InsertMask[I] += NumElts;
23410 }
23411 V = Builder.CreateShuffleVector(
23412 FirstInsert->getOperand(0), V, InsertMask,
23413 cast<Instruction>(E->Scalars.back())->getName());
23414 if (auto *I = dyn_cast<Instruction>(V)) {
23415 GatherShuffleExtractSeq.insert(I);
23416 CSEBlocks.insert(I->getParent());
23417 }
23418 }
23419 }
23420
23421 ++NumVectorInstructions;
23422 E->VectorizedValue = V;
23423 return V;
23424 }
23425 case Instruction::ZExt:
23426 case Instruction::SExt:
23427 case Instruction::FPToUI:
23428 case Instruction::FPToSI:
23429 case Instruction::FPExt:
23430 case Instruction::PtrToInt:
23431 case Instruction::IntToPtr:
23432 case Instruction::SIToFP:
23433 case Instruction::UIToFP:
23434 case Instruction::Trunc:
23435 case Instruction::FPTrunc:
23436 case Instruction::BitCast: {
23437 setInsertPointAfterBundle(E);
23438
23439 Value *InVec = vectorizeOperand(E, 0);
23440
23441 auto *CI = cast<CastInst>(VL0);
23442 Instruction::CastOps VecOpcode = CI->getOpcode();
23443 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
23444 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
23445 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
23446 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
23447 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
23448 // Check if the values are candidates to demote.
23449 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
23450 if (SrcIt != MinBWs.end())
23451 SrcBWSz = SrcIt->second.first;
23452 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
23453 if (BWSz == SrcBWSz) {
23454 VecOpcode = Instruction::BitCast;
23455 } else if (BWSz < SrcBWSz) {
23456 VecOpcode = Instruction::Trunc;
23457 } else if (It != MinBWs.end()) {
23458 assert(BWSz > SrcBWSz && "Invalid cast!");
23459 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
23460 } else if (SrcIt != MinBWs.end()) {
23461 assert(BWSz > SrcBWSz && "Invalid cast!");
23462 VecOpcode =
23463 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
23464 }
23465 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
23466 !SrcIt->second.second) {
23467 VecOpcode = Instruction::UIToFP;
23468 } else if (VecOpcode == Instruction::BitCast && SrcIt != MinBWs.end() &&
23469 ScalarTy->isFPOrFPVectorTy()) {
23470 Type *OrigSrcScalarTy = CI->getSrcTy();
23471 auto *OrigSrcVectorTy =
23472 getWidenedType(OrigSrcScalarTy, E->Scalars.size());
23473 InVec =
23474 Builder.CreateIntCast(InVec, OrigSrcVectorTy, SrcIt->second.second);
23475 }
23476 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
23477 ? InVec
23478 : Builder.CreateCast(VecOpcode, InVec, VecTy);
23479 V = FinalShuffle(V, E);
23480
23481 E->VectorizedValue = V;
23482 ++NumVectorInstructions;
23483 return V;
23484 }
23485 case Instruction::FCmp:
23486 case Instruction::ICmp: {
23487 setInsertPointAfterBundle(E);
23488
23489 Value *L = vectorizeOperand(E, 0);
23490 Value *R = vectorizeOperand(E, 1);
23491 if (L->getType() != R->getType()) {
23492 assert((getOperandEntry(E, 0)->isGather() ||
23493 getOperandEntry(E, 1)->isGather() ||
23494 MinBWs.contains(getOperandEntry(E, 0)) ||
23495 MinBWs.contains(getOperandEntry(E, 1))) &&
23496 "Expected item in MinBWs.");
23497 const unsigned LBW = cast<VectorType>(L->getType())
23498 ->getElementType()
23499 ->getIntegerBitWidth();
23500 const unsigned RBW = cast<VectorType>(R->getType())
23501 ->getElementType()
23502 ->getIntegerBitWidth();
23503 if ((LBW < RBW && (!allConstant(E->getOperand(1)) ||
23504 any_of(
23505 E->getOperand(1),
23506 [&](Value *V) {
23507 auto *CI = dyn_cast<ConstantInt>(V);
23508 return !CI ||
23509 CI->getValue().getActiveBits() > LBW;
23510 }))) ||
23511 (LBW > RBW && allConstant(E->getOperand(0)) &&
23512 all_of(E->getOperand(1), [&](Value *V) {
23513 auto *CI = dyn_cast<ConstantInt>(V);
23514 return CI && CI->getValue().getActiveBits() <= RBW;
23515 }))) {
23516 Type *CastTy = R->getType();
23517 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
23518 } else {
23519 Type *CastTy = L->getType();
23520 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
23521 }
23522 }
23523
23524 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
23525 Value *V = Builder.CreateCmp(P0, L, R);
23526 V = PropagateIRFlags(V);
23527 // Do not cast for cmps.
23528 VecTy = cast<FixedVectorType>(V->getType());
23529 V = FinalShuffle(V, E);
23530
23531 E->VectorizedValue = V;
23532 ++NumVectorInstructions;
23533 return V;
23534 }
23535 case Instruction::Select: {
23536 setInsertPointAfterBundle(E);
23537
23538 Value *Cond = vectorizeOperand(E, 0);
23539 Value *True = vectorizeOperand(E, 1);
23540 Value *False = vectorizeOperand(E, 2);
23541 if (True->getType() != VecTy || False->getType() != VecTy) {
23542 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
23543 getOperandEntry(E, 2)->isGather() ||
23544 MinBWs.contains(getOperandEntry(E, 1)) ||
23545 MinBWs.contains(getOperandEntry(E, 2))) &&
23546 "Expected item in MinBWs.");
23547 if (True->getType() != VecTy)
23548 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
23549 if (False->getType() != VecTy)
23550 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
23551 }
23552
23553 unsigned CondNumElements = getNumElements(Cond->getType());
23554 unsigned TrueNumElements = getNumElements(True->getType());
23555 assert(TrueNumElements >= CondNumElements &&
23556 TrueNumElements % CondNumElements == 0 &&
23557 "Cannot vectorize Instruction::Select");
23558 assert(TrueNumElements == getNumElements(False->getType()) &&
23559 "Cannot vectorize Instruction::Select");
23560 if (CondNumElements != TrueNumElements) {
23561 // When the return type is i1 but the source is fixed vector type, we
23562 // need to duplicate the condition value.
23563 Cond = Builder.CreateShuffleVector(
23564 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
23565 CondNumElements));
23566 }
23567 assert(getNumElements(Cond->getType()) == TrueNumElements &&
23568 "Cannot vectorize Instruction::Select");
23569 Value *V =
23570 Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
23571 V = FinalShuffle(V, E);
23572
23573 E->VectorizedValue = V;
23574 ++NumVectorInstructions;
23575 return V;
23576 }
23577 case Instruction::FNeg: {
23578 setInsertPointAfterBundle(E);
23579
23580 Value *Op = vectorizeOperand(E, 0);
23581
23582 Value *V = Builder.CreateUnOp(
23583 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
23584 V = PropagateIRFlags(V);
23585
23586 V = FinalShuffle(V, E);
23587
23588 E->VectorizedValue = V;
23589 ++NumVectorInstructions;
23590
23591 return V;
23592 }
23593 case Instruction::Freeze: {
23594 setInsertPointAfterBundle(E);
23595
23596 Value *Op = vectorizeOperand(E, 0);
23597
23598 if (Op->getType() != VecTy) {
23599 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
23600 MinBWs.contains(getOperandEntry(E, 0))) &&
23601 "Expected item in MinBWs.");
23602 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
23603 }
23604 Value *V = Builder.CreateFreeze(Op);
23605 V = FinalShuffle(V, E);
23606
23607 E->VectorizedValue = V;
23608 ++NumVectorInstructions;
23609
23610 return V;
23611 }
23612 case Instruction::Add:
23613 case Instruction::FAdd:
23614 case Instruction::Sub:
23615 case Instruction::FSub:
23616 case Instruction::Mul:
23617 case Instruction::FMul:
23618 case Instruction::UDiv:
23619 case Instruction::SDiv:
23620 case Instruction::FDiv:
23621 case Instruction::URem:
23622 case Instruction::SRem:
23623 case Instruction::FRem:
23624 case Instruction::Shl:
23625 case Instruction::LShr:
23626 case Instruction::AShr:
23627 case Instruction::And:
23628 case Instruction::Or:
23629 case Instruction::Xor: {
23630 setInsertPointAfterBundle(E);
23631
23632 Value *LHS = vectorizeOperand(E, 0);
23633 Value *RHS = vectorizeOperand(E, 1);
23634 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
23635 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
23636 ArrayRef<Value *> Ops = E->getOperand(I);
23637 if (all_of(Ops, [&](Value *Op) {
23638 auto *CI = dyn_cast<ConstantInt>(Op);
23639 return CI && CI->getValue().countr_one() >= It->second.first;
23640 })) {
23641 V = FinalShuffle(I == 0 ? RHS : LHS, E);
23642 E->VectorizedValue = V;
23643 ++NumVectorInstructions;
23644 return V;
23645 }
23646 }
23647 }
23648 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
23649 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
23650 getOperandEntry(E, 1)->isGather() ||
23651 MinBWs.contains(getOperandEntry(E, 0)) ||
23652 MinBWs.contains(getOperandEntry(E, 1))) &&
23653 "Expected item in MinBWs.");
23654 if (LHS->getType() != VecTy)
23655 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
23656 if (RHS->getType() != VecTy)
23657 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
23658 }
23659
23660 Value *V = Builder.CreateBinOp(
23661 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
23662 RHS);
23663 V = PropagateIRFlags(V);
23664
23665 V = FinalShuffle(V, E);
23666
23667 E->VectorizedValue = V;
23668 ++NumVectorInstructions;
23669
23670 return V;
23671 }
23672 case Instruction::Load: {
23673 // Loads are inserted at the head of the tree because we don't want to
23674 // sink them all the way down past store instructions.
23675 setInsertPointAfterBundle(E);
23676
23677 LoadInst *LI = cast<LoadInst>(VL0);
23678 Instruction *NewLI;
23679 FixedVectorType *StridedLoadTy = nullptr;
23680 Value *PO = LI->getPointerOperand();
23681 if (E->State == TreeEntry::Vectorize) {
23682 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
23683 } else if (E->State == TreeEntry::CompressVectorize) {
23684 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
23685 CompressEntryToData.at(E);
23686 Align CommonAlignment = LI->getAlign();
23687 if (IsMasked) {
23688 unsigned VF = getNumElements(LoadVecTy);
23689 SmallVector<Constant *> MaskValues(
23690 VF / getNumElements(LI->getType()),
23692 for (int I : CompressMask)
23693 MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
23694 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
23695 assert(SLPReVec && "Only supported by REVEC.");
23696 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
23697 }
23698 Constant *MaskValue = ConstantVector::get(MaskValues);
23699 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
23700 MaskValue);
23701 } else {
23702 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
23703 }
23704 NewLI = cast<Instruction>(PropagateIRFlags(NewLI));
23705 // TODO: include this cost into CommonCost.
23706 if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
23707 assert(SLPReVec && "FixedVectorType is not expected.");
23708 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
23709 CompressMask);
23710 }
23711 NewLI =
23712 cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
23713 } else if (E->State == TreeEntry::StridedVectorize) {
23714 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
23715 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
23716 PO = IsReverseOrder ? PtrN : Ptr0;
23717 Type *StrideTy = DL->getIndexType(PO->getType());
23718 Value *StrideVal;
23719 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
23720 StridedLoadTy = SPtrInfo.Ty;
23721 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
23722 unsigned StridedLoadEC =
23723 StridedLoadTy->getElementCount().getKnownMinValue();
23724
23725 Value *Stride = SPtrInfo.StrideVal;
23726 if (!Stride) {
23727 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
23728 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
23729 SCEVExpander Expander(*SE, "strided-load-vec");
23730 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
23731 &*Builder.GetInsertPoint());
23732 }
23733 Value *NewStride =
23734 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
23735 StrideVal = Builder.CreateMul(
23736 NewStride, ConstantInt::getSigned(
23737 StrideTy, (IsReverseOrder ? -1 : 1) *
23738 static_cast<int>(
23739 DL->getTypeAllocSize(ScalarTy))));
23740 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
23741 auto *Inst = Builder.CreateIntrinsic(
23742 Intrinsic::experimental_vp_strided_load,
23743 {StridedLoadTy, PO->getType(), StrideTy},
23744 {PO, StrideVal,
23745 Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
23746 Builder.getInt32(StridedLoadEC)});
23747 Inst->addParamAttr(
23748 /*ArgNo=*/0,
23749 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
23750 NewLI = Inst;
23751 } else {
23752 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
23753 Value *VecPtr = vectorizeOperand(E, 0);
23754 if (isa<FixedVectorType>(ScalarTy)) {
23755 assert(SLPReVec && "FixedVectorType is not expected.");
23756 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
23757 // to expand VecPtr if ScalarTy is a vector type.
23758 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
23759 unsigned VecTyNumElements = getNumElements(VecTy);
23760 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
23761 "Cannot expand getelementptr.");
23762 unsigned VF = VecTyNumElements / ScalarTyNumElements;
23763 SmallVector<Constant *> Indices(VecTyNumElements);
23764 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
23765 return Builder.getInt64(I % ScalarTyNumElements);
23766 });
23767 VecPtr = Builder.CreateGEP(
23768 toScalarizedTy(VecTy),
23769 Builder.CreateShuffleVector(
23770 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
23771 ConstantVector::get(Indices));
23772 }
23773 // Use the minimum alignment of the gathered loads.
23774 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
23775 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
23776 }
23777 Value *V = E->State == TreeEntry::CompressVectorize
23778 ? NewLI
23779 : PropagateIRFlags(NewLI);
23780
23781 if (StridedLoadTy != VecTy)
23782 V = Builder.CreateBitOrPointerCast(V, VecTy);
23783 V = FinalShuffle(V, E);
23784 E->VectorizedValue = V;
23785 ++NumVectorInstructions;
23786 return V;
23787 }
23788 case Instruction::Store: {
23789 auto *SI = cast<StoreInst>(VL0);
23790
23791 setInsertPointAfterBundle(E);
23792
23793 Value *VecValue = vectorizeOperand(E, 0);
23794 if (VecValue->getType() != VecTy)
23795 VecValue =
23796 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
23797 VecValue = FinalShuffle(VecValue, E);
23798
23799 Value *Ptr = SI->getPointerOperand();
23800 Instruction *ST;
23801 if (E->State == TreeEntry::Vectorize) {
23802 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
23803 } else {
23804 assert(E->State == TreeEntry::StridedVectorize &&
23805 "Expected either strided or consecutive stores.");
23806 bool IsReverseOrder =
23807 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
23808 if (IsReverseOrder) {
23809 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
23810 Ptr = SI->getPointerOperand();
23811 }
23812 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
23813 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
23814
23815 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
23816 FixedVectorType *StridedStoreTy = SPtrInfo.Ty;
23817 assert(StridedStoreTy && "Missing StridedPointerInfo for tree entry.");
23818 unsigned StridedStoreEC = getNumElements(StridedStoreTy);
23819 Value *Stride = SPtrInfo.StrideVal;
23820 assert(Stride && "Missing StridedPointerInfo for tree entry.");
23821 Value *StrideVal =
23822 Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
23823 // vp_strided_store::stride is defined in bytes
23824 StrideVal = Builder.CreateMul(
23825 StrideVal,
23827 StrideTy, static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
23828 if (StridedStoreTy != VecTy)
23829 VecValue = Builder.CreateBitOrPointerCast(VecValue, StridedStoreTy);
23830 auto *Inst = Builder.CreateIntrinsic(
23831 Intrinsic::experimental_vp_strided_store,
23832 {StridedStoreTy, Ptr->getType(), StrideTy},
23833 {VecValue, Ptr, StrideVal,
23834 Builder.getAllOnesMask(ElementCount::getFixed(StridedStoreEC)),
23835 Builder.getInt32(StridedStoreEC)});
23836 Inst->addParamAttr(
23837 /*ArgNo=*/1,
23838 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
23839 ST = Inst;
23840 }
23841
23842 Value *V = PropagateIRFlags(ST);
23843
23844 E->VectorizedValue = V;
23845 ++NumVectorInstructions;
23846 return V;
23847 }
23848 case Instruction::GetElementPtr: {
23849 auto *GEP0 = cast<GetElementPtrInst>(VL0);
23850 setInsertPointAfterBundle(E);
23851
23852 Value *Op0 = vectorizeOperand(E, 0);
23853
23854 SmallVector<Value *> OpVecs;
23855 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
23856 Value *OpVec = vectorizeOperand(E, J);
23857 OpVecs.push_back(OpVec);
23858 }
23859
23860 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
23861 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
23863 for (Value *V : E->Scalars) {
23865 GEPs.push_back(V);
23866 }
23867 V = PropagateIRFlags(I);
23868 }
23869
23870 V = FinalShuffle(V, E);
23871
23872 E->VectorizedValue = V;
23873 ++NumVectorInstructions;
23874
23875 return V;
23876 }
23877 case Instruction::Call: {
23878 CallInst *CI = cast<CallInst>(VL0);
23879 setInsertPointAfterBundle(E);
23880
23882
23884 CI, ID, getNumElements(VecTy),
23885 It != MinBWs.end() ? It->second.first : 0, TTI);
23886 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
23887 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
23888 VecCallCosts.first <= VecCallCosts.second;
23889
23890 Value *ScalarArg = nullptr;
23891 SmallVector<Value *> OpVecs;
23892 SmallVector<Type *, 2> TysForDecl;
23893 // Add return type if intrinsic is overloaded on it.
23894 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI)) {
23895 ArrayRef<Type *> ContainedTys = getContainedTypes(VecTy);
23896 for (auto [Idx, Ty] : enumerate(ContainedTys)) {
23898 TysForDecl.push_back(Ty);
23899 }
23900 }
23901 auto *CEI = cast<CallInst>(VL0);
23902 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
23903 // Some intrinsics have scalar arguments. This argument should not be
23904 // vectorized.
23905 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
23906 ScalarArg = CEI->getArgOperand(I);
23907 // if decided to reduce bitwidth of abs intrinsic, its second argument
23908 // must be set false (do not return poison, if value is signed min).
23909 if (ID == Intrinsic::abs && It != MinBWs.end() &&
23910 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
23911 ScalarArg = Builder.getFalse();
23912 OpVecs.push_back(ScalarArg);
23914 TysForDecl.push_back(ScalarArg->getType());
23915 continue;
23916 }
23917
23918 Value *OpVec = vectorizeOperand(E, I);
23919 ScalarArg = CEI->getArgOperand(I);
23920 if (cast<VectorType>(OpVec->getType())->getElementType() !=
23921 ScalarArg->getType()->getScalarType() &&
23922 It == MinBWs.end()) {
23923 auto *CastTy =
23924 getWidenedType(ScalarArg->getType(), getNumElements(VecTy));
23925 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
23926 } else if (It != MinBWs.end()) {
23927 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
23928 }
23929 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
23930 OpVecs.push_back(OpVec);
23931 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
23932 TysForDecl.push_back(OpVec->getType());
23933 }
23934
23935 Function *CF;
23936 if (!UseIntrinsic) {
23937 VFShape Shape =
23940 false /*HasGlobalPred*/);
23941 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
23942 } else {
23943 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
23944 }
23945
23947 CI->getOperandBundlesAsDefs(OpBundles);
23948 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
23949
23950 V = PropagateIRFlags(V);
23951 cast<CallInst>(V)->setCallingConv(CF->getCallingConv());
23952 V = FinalShuffle(V, E);
23953
23954 E->VectorizedValue = V;
23955 ++NumVectorInstructions;
23956 return V;
23957 }
23958 case Instruction::ShuffleVector: {
23959 Value *V;
23960 if (SLPReVec && !E->isAltShuffle()) {
23961 setInsertPointAfterBundle(E);
23962 Value *Src = vectorizeOperand(E, 0);
23963 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
23964 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
23965 SmallVector<int> NewMask(ThisMask.size());
23966 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
23967 return SVSrc->getShuffleMask()[Mask];
23968 });
23969 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
23970 SVSrc->getOperand(1), NewMask);
23971 } else {
23972 V = Builder.CreateShuffleVector(Src, ThisMask);
23973 }
23974 V = PropagateIRFlags(V);
23975 V = FinalShuffle(V, E);
23976 } else {
23977 assert(E->isAltShuffle() &&
23978 ((Instruction::isBinaryOp(E->getOpcode()) &&
23979 Instruction::isBinaryOp(E->getAltOpcode())) ||
23980 (Instruction::isCast(E->getOpcode()) &&
23981 Instruction::isCast(E->getAltOpcode())) ||
23982 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
23983 "Invalid Shuffle Vector Operand");
23984
23985 Value *LHS = nullptr, *RHS = nullptr;
23986 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
23987 setInsertPointAfterBundle(E);
23988 LHS = vectorizeOperand(E, 0);
23989 RHS = vectorizeOperand(E, 1);
23990 } else {
23991 setInsertPointAfterBundle(E);
23992 LHS = vectorizeOperand(E, 0);
23993 }
23994 if (LHS && RHS &&
23995 ((Instruction::isBinaryOp(E->getOpcode()) &&
23996 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
23997 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
23998 assert((It != MinBWs.end() ||
23999 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
24000 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
24001 MinBWs.contains(getOperandEntry(E, 0)) ||
24002 MinBWs.contains(getOperandEntry(E, 1))) &&
24003 "Expected item in MinBWs.");
24004 Type *CastTy = VecTy;
24005 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
24007 ->getElementType()
24008 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
24009 ->getElementType()
24010 ->getIntegerBitWidth())
24011 CastTy = RHS->getType();
24012 else
24013 CastTy = LHS->getType();
24014 }
24015 if (LHS->getType() != CastTy)
24016 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
24017 if (RHS->getType() != CastTy)
24018 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
24019 }
24020
24021 Value *V0, *V1;
24022 if (Instruction::isBinaryOp(E->getOpcode())) {
24023 V0 = Builder.CreateBinOp(
24024 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
24025 V1 = Builder.CreateBinOp(
24026 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
24027 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
24028 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
24029 auto *AltCI = cast<CmpInst>(E->getAltOp());
24030 CmpInst::Predicate AltPred = AltCI->getPredicate();
24031 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
24032 } else {
24033 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
24034 unsigned SrcBWSz = DL->getTypeSizeInBits(
24035 cast<VectorType>(LHS->getType())->getElementType());
24036 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
24037 if (BWSz <= SrcBWSz) {
24038 if (BWSz < SrcBWSz)
24039 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
24040 assert(LHS->getType() == VecTy &&
24041 "Expected same type as operand.");
24042 LHS = PropagateIRFlags(LHS);
24043 LHS = FinalShuffle(LHS, E);
24044 E->VectorizedValue = LHS;
24045 ++NumVectorInstructions;
24046 return LHS;
24047 }
24048 }
24049 V0 = Builder.CreateCast(
24050 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
24051 V1 = Builder.CreateCast(
24052 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
24053 }
24054 // Add V0 and V1 to later analysis to try to find and remove matching
24055 // instruction, if any.
24056 for (Value *V : {V0, V1}) {
24057 if (auto *I = dyn_cast<Instruction>(V)) {
24058 GatherShuffleExtractSeq.insert(I);
24059 CSEBlocks.insert(I->getParent());
24060 }
24061 }
24062
24063 // Create shuffle to take alternate operations from the vector.
24064 // Also, gather up main and alt scalar ops to propagate IR flags to
24065 // each vector operation.
24066 ValueList OpScalars, AltScalars;
24067 SmallVector<int> Mask;
24068 E->buildAltOpShuffleMask(
24069 [E, this](Instruction *I) {
24070 assert(E->getMatchingMainOpOrAltOp(I) &&
24071 "Unexpected main/alternate opcode");
24072 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
24073 *TLI);
24074 },
24075 Mask, &OpScalars, &AltScalars);
24076
24077 PropagateIRFlags(V0, E->getOpcode(), OpScalars);
24078 PropagateIRFlags(V1, E->getAltOpcode(), AltScalars);
24079
24080 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
24081 assert(SLPReVec && "FixedVectorType is not expected.");
24082 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
24083 }
24084 V = Builder.CreateShuffleVector(V0, V1, Mask);
24085 if (auto *I = dyn_cast<Instruction>(V)) {
24086 GatherShuffleExtractSeq.insert(I);
24087 CSEBlocks.insert(I->getParent());
24088 }
24089 }
24090
24091 E->VectorizedValue = V;
24092 ++NumVectorInstructions;
24093
24094 return V;
24095 }
24096 case TreeEntry::ReducedBitcast:
24097 case TreeEntry::ReducedBitcastBSwap: {
24098 assert(UserIgnoreList && "Expected reduction operations only.");
24099 setInsertPointAfterBundle(E);
24100 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
24101 ZExt->VectorizedValue = PoisonValue::get(getWidenedType(
24102 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
24103 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
24104 Const->VectorizedValue = PoisonValue::get(getWidenedType(
24105 Const->Scalars.front()->getType(), Const->getVectorFactor()));
24106 Value *Op = vectorizeOperand(ZExt, 0);
24107 auto *SrcType = IntegerType::get(
24108 Op->getContext(),
24109 DL->getTypeSizeInBits(cast<CastInst>(ZExt->getMainOp())->getSrcTy()) *
24110 E->getVectorFactor());
24111 auto *OrigScalarTy = ScalarTy;
24112 // Set the scalar type properly to avoid casting to the extending type.
24113 ScalarTy = cast<CastInst>(ZExt->getMainOp())->getSrcTy();
24114 Op = FinalShuffle(Op, E);
24115 auto *V = Builder.CreateBitCast(Op, SrcType);
24116 ++NumVectorInstructions;
24117 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
24118 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
24119 ++NumVectorInstructions;
24120 }
24121 if (SrcType != OrigScalarTy) {
24122 V = Builder.CreateIntCast(V, OrigScalarTy, /*isSigned=*/false);
24123 ++NumVectorInstructions;
24124 }
24125 E->VectorizedValue = V;
24126 return V;
24127 }
24128 case TreeEntry::ReducedBitcastLoads:
24129 case TreeEntry::ReducedBitcastBSwapLoads: {
24130 assert(UserIgnoreList && "Expected reduction operations only.");
24131 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
24132 TreeEntry *Load = getOperandEntry(ZExt, /*Idx=*/0);
24133 setInsertPointAfterBundle(Load);
24134 ZExt->VectorizedValue = PoisonValue::get(getWidenedType(
24135 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
24136 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
24137 Const->VectorizedValue = PoisonValue::get(getWidenedType(
24138 Const->Scalars.front()->getType(), Const->getVectorFactor()));
24139 Load->VectorizedValue = PoisonValue::get(getWidenedType(
24140 Load->getMainOp()->getType(), Load->getVectorFactor()));
24141 LoadInst *LI = cast<LoadInst>(Load->getMainOp());
24142 Value *PO = LI->getPointerOperand();
24143 auto *SrcTy = IntegerType::get(
24144 ScalarTy->getContext(),
24145 DL->getTypeSizeInBits(cast<CastInst>(ZExt->getMainOp())->getSrcTy()) *
24146 E->getVectorFactor());
24147 auto *OrigScalarTy = ScalarTy;
24148 ScalarTy = ZExt->getMainOp()->getType();
24149 Value *V = Builder.CreateAlignedLoad(SrcTy, PO, LI->getAlign());
24150 ++NumVectorInstructions;
24151 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
24152 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
24153 ++NumVectorInstructions;
24154 }
24155 if (SrcTy != OrigScalarTy) {
24156 V = Builder.CreateIntCast(V, OrigScalarTy, /*isSigned=*/false);
24157 ++NumVectorInstructions;
24158 }
24159 E->VectorizedValue = V;
24160 return V;
24161 }
24162 case TreeEntry::ReducedCmpBitcast: {
24163 assert(UserIgnoreList && "Expected reduction operations only.");
24164 setInsertPointAfterBundle(E);
24165 TreeEntry *Op1TE = getOperandEntry(E, /*Idx=*/1);
24166 TreeEntry *Op2TE = getOperandEntry(E, /*Idx=*/2);
24167 Op1TE->VectorizedValue =
24168 PoisonValue::get(getWidenedType(ScalarTy, Op1TE->getVectorFactor()));
24169 Op2TE->VectorizedValue =
24170 PoisonValue::get(getWidenedType(ScalarTy, Op2TE->getVectorFactor()));
24171 Value *Cmp = vectorizeOperand(E, /*NodeIdx=*/0);
24172 // Set the scalar type properly to avoid casting to the extending type.
24173 auto *DstTy =
24174 IntegerType::getIntNTy(ScalarTy->getContext(), E->getVectorFactor());
24175 auto *V = Builder.CreateBitCast(Cmp, DstTy);
24176 ++NumVectorInstructions;
24177 if (DstTy != ScalarTy) {
24178 V = Builder.CreateIntCast(V, ScalarTy, /*isSigned=*/false);
24179 ++NumVectorInstructions;
24180 }
24181 E->VectorizedValue = V;
24182 return V;
24183 }
24184 default:
24185 llvm_unreachable("unknown inst");
24186 }
24187 return nullptr;
24188}
24189
24191 ExtraValueToDebugLocsMap ExternallyUsedValues;
24192 return vectorizeTree(ExternallyUsedValues);
24193}
24194
24196 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
24197 Instruction *ReductionRoot,
24198 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
24199 VectorValuesAndScales) {
24200 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
24201 // need to rebuild it.
24202 EntryToLastInstruction.clear();
24203 // All blocks must be scheduled before any instructions are inserted.
24204 for (auto &BSIter : BlocksSchedules)
24205 scheduleBlock(*this, BSIter.second.get());
24206 // Cache last instructions for the nodes to avoid side effects, which may
24207 // appear during vectorization, like extra uses, etc.
24208 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
24209 // Need to generate insertion point for loads nodes of the bitcast/bswap
24210 // ops.
24211 if (TE->isGather() || DeletedNodes.contains(TE.get()) ||
24212 (TE->State == TreeEntry::CombinedVectorize &&
24213 (TE->CombinedOp == TreeEntry::ReducedBitcast ||
24214 TE->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
24215 ((TE->CombinedOp == TreeEntry::ReducedBitcastLoads ||
24216 TE->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
24217 TE->CombinedOp == TreeEntry::ReducedCmpBitcast) &&
24218 (!TE->hasState() || TE->getOpcode() != Instruction::Load)))))
24219 continue;
24220 (void)getLastInstructionInBundle(TE.get());
24221 }
24222
24223 if (ReductionRoot)
24224 Builder.SetInsertPoint(ReductionRoot->getParent(),
24225 ReductionRoot->getIterator());
24226 else
24227 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
24228
24229 // Vectorize gather operands of the nodes with the external uses only.
24231 // Multiple gather TEs may share the same UserTE - cache the per-UserTE
24232 // all_of-isUsedOutsideBlock result to avoid re-walking each scalar's
24233 // user list.
24234 SmallDenseMap<const TreeEntry *, bool> UserTEScalarsUsedOutsideBlockCache;
24235 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
24236 if (DeletedNodes.contains(TE.get()))
24237 continue;
24238 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
24239 TE->UserTreeIndex.UserTE->hasState() &&
24240 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
24241 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
24242 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
24243 !TE->UserTreeIndex.UserTE->hasCopyableElements()) {
24244 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
24245 auto [It, Inserted] =
24246 UserTEScalarsUsedOutsideBlockCache.try_emplace(UserTE);
24247 if (Inserted)
24248 It->second = all_of(UserTE->Scalars,
24249 [](Value *V) { return isUsedOutsideBlock(V); });
24250 if (!It->second)
24251 continue;
24252 Instruction &LastInst = getLastInstructionInBundle(UserTE);
24253 GatherEntries.emplace_back(TE.get(), &LastInst);
24254 }
24255 }
24256 for (auto &Entry : GatherEntries) {
24257 IRBuilderBase::InsertPointGuard Guard(Builder);
24258 Builder.SetInsertPoint(Entry.second);
24259 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
24260 (void)vectorizeTree(Entry.first);
24261 }
24262 // Emit gathered loads first to emit better code for the users of those
24263 // gathered loads.
24264 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
24265 if (DeletedNodes.contains(TE.get()))
24266 continue;
24267 if (GatheredLoadsEntriesFirst.has_value() &&
24268 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
24269 (!TE->isGather() || TE->UserTreeIndex)) {
24270 assert((TE->UserTreeIndex ||
24271 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
24272 "Expected gathered load node.");
24273 (void)vectorizeTree(TE.get());
24274 }
24275 }
24276 (void)vectorizeTree(VectorizableTree[0].get());
24277 // Run through the list of postponed gathers and emit them, replacing the temp
24278 // emitted allocas with actual vector instructions.
24279 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
24281 for (const TreeEntry *E : PostponedNodes) {
24282 auto *TE = const_cast<TreeEntry *>(E);
24283 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
24284 TE->VectorizedValue = nullptr;
24285 auto *UserI = cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
24286 // If user is a PHI node, its vector code has to be inserted right before
24287 // block terminator. Since the node was delayed, there were some unresolved
24288 // dependencies at the moment when stub instruction was emitted. In a case
24289 // when any of these dependencies turn out to be an operand of another PHI, coming
24290 // from this same block, position of a stub instruction will become invalid.
24291 // This is because source vector that was supposed to feed this gather node was
24292 // inserted at the end of the block [after stub instruction]. So we need
24293 // to adjust insertion point again to the end of block.
24294 if (isa<PHINode>(UserI) ||
24295 (TE->UserTreeIndex.UserTE->hasState() &&
24296 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
24297 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
24298 // Insert before all users.
24299 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
24300 for (User *U : PrevVec->users()) {
24301 if (U == UserI)
24302 continue;
24303 auto *UI = dyn_cast<Instruction>(U);
24304 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
24305 continue;
24306 if (UI->comesBefore(InsertPt))
24307 InsertPt = UI;
24308 }
24309 Builder.SetInsertPoint(InsertPt);
24310 } else {
24311 Builder.SetInsertPoint(PrevVec);
24312 }
24313 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
24314 Value *Vec = vectorizeTree(TE);
24315 if (auto *VecI = dyn_cast<Instruction>(Vec);
24316 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
24317 Builder.GetInsertPoint()->comesBefore(VecI))
24318 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
24319 Builder.GetInsertPoint());
24320 if (Vec->getType() != PrevVec->getType()) {
24321 assert(Vec->getType()->isIntOrIntVectorTy() &&
24322 PrevVec->getType()->isIntOrIntVectorTy() &&
24323 "Expected integer vector types only.");
24324 std::optional<bool> IsSigned;
24325 for (Value *V : TE->Scalars) {
24326 if (isVectorized(V)) {
24327 for (const TreeEntry *MNTE : getTreeEntries(V)) {
24328 auto It = MinBWs.find(MNTE);
24329 if (It != MinBWs.end()) {
24330 IsSigned = IsSigned.value_or(false) || It->second.second;
24331 if (*IsSigned)
24332 break;
24333 }
24334 }
24335 if (IsSigned.value_or(false))
24336 break;
24337 // Scan through gather nodes.
24338 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
24339 auto It = MinBWs.find(BVE);
24340 if (It != MinBWs.end()) {
24341 IsSigned = IsSigned.value_or(false) || It->second.second;
24342 if (*IsSigned)
24343 break;
24344 }
24345 }
24346 if (IsSigned.value_or(false))
24347 break;
24348 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
24349 IsSigned =
24350 IsSigned.value_or(false) ||
24351 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
24352 continue;
24353 }
24354 if (IsSigned.value_or(false))
24355 break;
24356 }
24357 }
24358 if (IsSigned.value_or(false)) {
24359 // Final attempt - check user node.
24360 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
24361 if (It != MinBWs.end())
24362 IsSigned = It->second.second;
24363 }
24364 assert(IsSigned &&
24365 "Expected user node or perfect diamond match in MinBWs.");
24366 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
24367 }
24368 PrevVec->replaceAllUsesWith(Vec);
24369 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
24370 // Replace the stub vector node, if it was used before for one of the
24371 // buildvector nodes already.
24372 auto It = PostponedValues.find(PrevVec);
24373 if (It != PostponedValues.end()) {
24374 for (TreeEntry *VTE : It->getSecond())
24375 VTE->VectorizedValue = Vec;
24376 }
24377 eraseInstruction(PrevVec);
24378 }