//===- LoopVectorizationLegality.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides loop vectorization legality analysis. Original code
// resided in LoopVectorize.cpp for a long time.
//
// At this point, it is implemented as a utility class, not as an analysis
// pass. It should be easy to create an analysis pass around it if there
// is a need (but D45420 needs to happen first).
//

#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

using namespace llvm;
using namespace PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

static cl::opt<bool> AllowStridedPointerIVs(
    "lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
    cl::desc("Enable recognition of non-constant strided "
             "pointer induction variables."));

namespace llvm {
cl::opt<bool>
    HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
                         cl::desc("Allow enabling loop hints to reorder "
                                  "FP operations during vectorization."));
} // namespace llvm

// TODO: Move size-based thresholds out of legality checking, make cost based
// decisions instead of hard thresholds.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));

66 "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified),
68 cl::desc("Control whether the compiler can use scalable vectors to "
69 "vectorize a loop"),
72 "Scalable vectorization is disabled."),
75 "Scalable vectorization is available and favored when the "
76 "cost is inconclusive."),
79 "Scalable vectorization is available and favored when the "
80 "cost is inconclusive.")));
81
83 "enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
84 cl::desc("Enables autovectorization of some loops containing histograms"));
85
86/// Maximum vectorization interleave count.
87static const unsigned MaxInterleaveFactor = 16;
88
89namespace llvm {
90
bool LoopVectorizeHints::Hint::validate(unsigned Val) {
  switch (Kind) {
  case HK_WIDTH:
    return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
  case HK_INTERLEAVE:
    return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
  case HK_FORCE:
    return (Val <= 1);
  case HK_ISVECTORIZED:
  case HK_PREDICATE:
  case HK_SCALABLE:
    return (Val == 0 || Val == 1);
  }
  return false;
}
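
// For example, "llvm.loop.vectorize.width" accepts 4 (a power of two within
// the supported range) but rejects 3; an invalid value is simply ignored with
// a debug message in setHint() below rather than reported as an error.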

LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
                                       bool InterleaveOnlyWhenForced,
                                       OptimizationRemarkEmitter &ORE,
                                       const TargetTransformInfo *TTI)
    : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
      Force("vectorize.enable", FK_Undefined, HK_FORCE),
      IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
      Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
      TheLoop(L), ORE(ORE) {
  // Populate values with existing loop metadata.
  getHintsFromMetadata();

  // force-vector-interleave overrides DisableInterleaving.
  if (VectorizerParams::isInterleaveForced())
    Interleave.Value = VectorizerParams::VectorizationInterleave;

  // If the metadata doesn't explicitly specify whether to enable scalable
  // vectorization, then decide based on the following criteria (increasing
  // level of priority):
  //  - Target default
  //  - Metadata width
  //  - Force option (always overrides)
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
    if (TTI)
      Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
                                                          : SK_FixedWidthOnly;

    if (Width.Value)
      // If the width is set, but the metadata says nothing about the scalable
      // property, then assume it concerns only a fixed-width UserVF.
      // If width is not set, the flag takes precedence.
      Scalable.Value = SK_FixedWidthOnly;
  }

  // If the flag is set to force any use of scalable vectors, override the loop
  // hints.
  if (ForceScalableVectorization.getValue() !=
      LoopVectorizeHints::SK_Unspecified)
    Scalable.Value = ForceScalableVectorization.getValue();

  // Scalable vectorization is disabled if no preference is specified.
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
    Scalable.Value = SK_FixedWidthOnly;

  if (IsVectorized.Value != 1)
    // If the vectorization width and interleaving count are both 1 then
    // consider the loop to have been already vectorized because there's
    // nothing more that we can do.
    IsVectorized.Value =
        getWidth() == ElementCount::getFixed(1) && getInterleave() == 1;
  LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs()
             << "LV: Interleaving disabled by the pass manager\n");
}

void LoopVectorizeHints::setAlreadyVectorized() {
  LLVMContext &Context = TheLoop->getHeader()->getContext();

  MDNode *IsVectorizedMD = MDNode::get(
      Context,
      {MDString::get(Context, "llvm.loop.isvectorized"),
       ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
  MDNode *LoopID = TheLoop->getLoopID();
  MDNode *NewLoopID =
      makePostTransformationMetadata(Context, LoopID,
                                     {Twine(Prefix(), "vectorize.").str(),
                                      Twine(Prefix(), "interleave.").str()},
                                     {IsVectorizedMD});
  TheLoop->setLoopID(NewLoopID);

  // Update internal cache.
  IsVectorized.Value = 1;
}

bool LoopVectorizeHints::allowVectorization(
    Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
  if (getForce() == LoopVectorizeHints::FK_Disabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (getIsVectorized() == 1) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    // FIXME: Add interleave.disable metadata. This will allow
    // vectorize.disable to be used without disabling the pass and errors
    // to differentiate between disabled vectorization and a width of 1.
    ORE.emit([&]() {
      return OptimizationRemarkMissed(vectorizeAnalysisPassName(),
                                      "AllDisabled", L->getStartLoc(),
                                      L->getHeader())
             << "loop not vectorized: vectorization and interleaving are "
                "explicitly disabled, or the loop has already been "
                "vectorized";
    });
    return false;
  }

  return true;
}

void LoopVectorizeHints::emitRemarkWithHints() const {
  using namespace ore;

  ORE.emit([&]() {
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                      TheLoop->getStartLoc(),
                                      TheLoop->getHeader())
             << "loop not vectorized: vectorization is explicitly disabled";

    OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(),
                               TheLoop->getHeader());
    R << "loop not vectorized";
    if (Force.Value == LoopVectorizeHints::FK_Enabled) {
      R << " (Force=" << NV("Force", true);
      if (Width.Value != 0)
        R << ", Vector Width=" << NV("VectorWidth", getWidth());
      if (getInterleave() != 0)
        R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
      R << ")";
    }
    return R;
  });
}

const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
  if (getWidth() == ElementCount::getFixed(1))
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Disabled)
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
    return LV_NAME;
  return OptimizationRemarkAnalysis::AlwaysPrint;
}

bool LoopVectorizeHints::allowReordering() const {
  // Allow the vectorizer to change the order of operations if loop hints
  // that enable vectorization are provided.
  ElementCount EC = getWidth();
  return HintsAllowReordering &&
         (Force.Value == LoopVectorizeHints::FK_Enabled ||
          EC.getKnownMinValue() > 1);
}
258
259void LoopVectorizeHints::getHintsFromMetadata() {
260 MDNode *LoopID = TheLoop->getLoopID();
261 if (!LoopID)
262 return;
263
264 // First operand should refer to the loop id itself.
265 assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
266 assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
267
268 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
269 const MDString *S = nullptr;
271
272 // The expected hint is either a MDString or a MDNode with the first
273 // operand a MDString.
274 if (const MDNode *MD = dyn_cast<MDNode>(MDO)) {
275 if (!MD || MD->getNumOperands() == 0)
276 continue;
277 S = dyn_cast<MDString>(MD->getOperand(0));
278 for (unsigned Idx = 1; Idx < MD->getNumOperands(); ++Idx)
279 Args.push_back(MD->getOperand(Idx));
280 } else {
281 S = dyn_cast<MDString>(MDO);
282 assert(Args.size() == 0 && "too many arguments for MDString");
283 }
284
285 if (!S)
286 continue;
287
288 // Check if the hint starts with the loop metadata prefix.
289 StringRef Name = S->getString();
290 if (Args.size() == 1)
291 setHint(Name, Args[0]);
292 }
293}
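
// As an illustrative example (not taken from this file), the hints parsed
// above are encoded in IR as loop metadata:
//
//   br i1 %exitcond, label %exit, label %loop, !llvm.loop !0
//
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.vectorize.width", i32 4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//
// Each (name, value) pair is handed to setHint() below, which strips the
// "llvm.loop." prefix before matching the hint name.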

void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
  if (!Name.starts_with(Prefix()))
    return;
  Name = Name.substr(Prefix().size(), StringRef::npos);

  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
  if (!C)
    return;
  unsigned Val = C->getZExtValue();

  Hint *Hints[] = {&Width,        &Interleave, &Force,
                   &IsVectorized, &Predicate,  &Scalable};
  for (auto *H : Hints) {
    if (Name == H->Name) {
      if (H->validate(Val))
        H->Value = Val;
      else
        LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
      break;
    }
  }
}

// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
// very constrained for now but it will be relaxed in the future. \p Lp is
// considered uniform if it meets all the following conditions:
//  1) it has a canonical IV (starting from 0 and with stride 1),
//  2) its latch terminator is a conditional branch and,
//  3) its latch condition is a compare instruction whose operands are the
//     canonical IV and an OuterLp invariant.
// This check doesn't take into account the uniformity of other conditions not
// related to the loop latch because they don't affect the loop uniformity.
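//
// For example (illustrative), the inner loop below is uniform with regard to
// the outer loop: its IV starts at 0 and steps by 1, and its latch compares
// the IV update against M, which is invariant in the outer loop:
//
//   for (int i = 0; i < N; ++i)   // outer loop being vectorized
//     for (int j = 0; j < M; ++j) // uniform inner loop
//       A[i][j] += B[i][j];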
//
// NOTE: We decided to keep all these checks and their associated documentation
// together so that we can easily have a picture of the currently supported
// loop nests. However, some of the current checks don't depend on \p OuterLp
// and would be redundantly executed for each \p Lp if we invoked this function
// for different candidate outer loops. This is not the case for now because we
// don't currently have the infrastructure to evaluate multiple candidate outer
// loops and \p OuterLp will be a fixed parameter while we only support explicit
// outer loop vectorization. It's also very likely that these checks go away
// before introducing the aforementioned infrastructure. However, if this is
// not the case, we should move the \p OuterLp independent checks to a separate
// function that is only executed once for each \p Lp.
static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
  assert(Lp->getLoopLatch() && "Expected loop with a single latch.");

  // If Lp is the outer loop, it's uniform by definition.
  if (Lp == OuterLp)
    return true;
  assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");

  // 1.
  PHINode *IV = Lp->getCanonicalInductionVariable();
  if (!IV) {
    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
    return false;
  }

  // 2.
  BasicBlock *Latch = Lp->getLoopLatch();
  auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBr || LatchBr->isUnconditional()) {
    LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
    return false;
  }

  // 3.
  auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
  if (!LatchCmp) {
    LLVM_DEBUG(
        dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
    return false;
  }

  Value *CondOp0 = LatchCmp->getOperand(0);
  Value *CondOp1 = LatchCmp->getOperand(1);
  Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
  if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
      !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
    LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
    return false;
  }

  return true;
}

// Return true if \p Lp and all its nested loops are uniform with regard to \p
// OuterLp.
static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
  if (!isUniformLoop(Lp, OuterLp))
    return false;

  // Check if nested loops are uniform.
  for (Loop *SubLp : *Lp)
    if (!isUniformLoopNest(SubLp, OuterLp))
      return false;

  return true;
}

static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  // It is possible that chars or shorts overflow when we ask for the loop's
  // trip count; work around this by widening the type.
  if (Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ty->getContext());

  return Ty;
}
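
// For instance, an i8 induction variable would wrap once the trip count
// exceeds 255, so it is widened to i32 here; pointer-typed inductions are
// mapped to the target's pointer-sized integer type.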

static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  Ty0 = convertPointerToIntegerType(DL, Ty0);
  Ty1 = convertPointerToIntegerType(DL, Ty1);
  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
    return Ty0;
  return Ty1;
}

/// Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSetImpl<Value *> &AllowedExit) {
  // Reductions, inductions and non-header phis are allowed to have exit users.
  // All other instructions must not have external users.
  if (!AllowedExit.count(Inst))
    // Check that all of the users of the instruction are inside the loop.
    for (User *U : Inst->users()) {
      Instruction *UI = cast<Instruction>(U);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
        return true;
      }
    }
  return false;
}

/// Returns true if A and B have the same pointer operand or the same SCEV
/// address.
static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
                               StoreInst *B) {
  // Compare store instructions.
  if (A == B)
    return true;

  // Otherwise compare pointer operands.
  Value *APtr = A->getPointerOperand();
  Value *BPtr = B->getPointerOperand();
  if (APtr == BPtr)
    return true;

  // Otherwise compare address SCEVs.
  return SE->getSCEV(APtr) == SE->getSCEV(BPtr);
}

int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
                                                Value *Ptr) const {
  // FIXME: Currently, the set of symbolic strides is sometimes queried before
  // it's collected. This happens from canVectorizeWithIfConvert, when the
  // pointer is checked to reference consecutive elements suitable for a
  // masked access.
  const auto &Strides =
      LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();

  bool CanAddPredicate = !llvm::shouldOptimizeForSize(
      TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
                            CanAddPredicate, false)
                   .value_or(0);
  if (Stride == 1 || Stride == -1)
    return Stride;
  return 0;
}

bool LoopVectorizationLegality::isInvariant(Value *V) const {
  return LAI->isInvariant(V);
}

namespace {
/// A rewriter to build the SCEVs for each of the VF lanes in the expected
/// vectorized loop, which can then be compared to detect their uniformity.
/// This is done by replacing the AddRec SCEVs of the original scalar loop
/// (TheLoop) with new AddRecs where the step is multiplied by StepMultiplier
/// and Offset * Step is added. Also checks if all sub-expressions are
/// analyzable w.r.t. uniformity.
class SCEVAddRecForUniformityRewriter
    : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
  /// Multiplier to be applied to the step of AddRecs in TheLoop.
  unsigned StepMultiplier;

  /// Offset to be added to the AddRecs in TheLoop.
  unsigned Offset;

  /// Loop for which to rewrite AddRecs.
  Loop *TheLoop;

  /// Set to true if any sub-expression is not analyzable w.r.t. uniformity.
  bool CannotAnalyze = false;

  bool canAnalyze() const { return !CannotAnalyze; }

public:
  SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
                                  unsigned Offset, Loop *TheLoop)
      : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
        TheLoop(TheLoop) {}

  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
    assert(Expr->getLoop() == TheLoop &&
           "addrec outside of TheLoop must be invariant and should have been "
           "handled earlier");
    // Build a new AddRec by multiplying the step by StepMultiplier and
    // incrementing the start by Offset * step.
    Type *Ty = Expr->getType();
    const SCEV *Step = Expr->getStepRecurrence(SE);
    if (!SE.isLoopInvariant(Step, TheLoop)) {
      CannotAnalyze = true;
      return Expr;
    }
    const SCEV *NewStep =
        SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
    const SCEV *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
    const SCEV *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
    return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
  }

  const SCEV *visit(const SCEV *S) {
    if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
      return S;
    return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
  }

  const SCEV *visitUnknown(const SCEVUnknown *S) {
    if (SE.isLoopInvariant(S, TheLoop))
      return S;
    // The value could vary across iterations.
    CannotAnalyze = true;
    return S;
  }

  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
    // Could not analyze the expression.
    CannotAnalyze = true;
    return S;
  }

  static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
                             unsigned StepMultiplier, unsigned Offset,
                             Loop *TheLoop) {
    /// Bail out if the expression does not contain an UDiv expression.
    /// Uniform values which are not loop invariant require operations to strip
    /// out the lowest bits. For now just look for UDivs and use them to avoid
    /// re-writing UDIV-free expressions for other lanes to limit compile time.
    if (!SCEVExprContains(S,
                          [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
      return SE.getCouldNotCompute();

    SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
                                             TheLoop);
    const SCEV *Result = Rewriter.visit(S);

    if (Rewriter.canAnalyze())
      return Result;
    return SE.getCouldNotCompute();
  }
};

} // namespace
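
// As a worked example (illustrative): for VF = 4, the rewriter maps an AddRec
// {Start,+,Step} in TheLoop to {Start + Offset * Step,+,4 * Step} for lane
// Offset. isUniform() below compares the lane-0 expression against lanes
// 1..3; a value such as (udiv <IV>, C) is treated as uniform exactly when all
// rewritten lane expressions fold to the same SCEV.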

bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
  if (isInvariant(V))
    return true;
  if (VF.isScalable())
    return false;
  if (VF.isScalar())
    return true;

  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  // never considered uniform.
  auto *SE = PSE.getSE();
  if (!SE->isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);

  // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
  // lane 0 matches the expressions for all other lanes.
  unsigned FixedVF = VF.getKnownMinValue();
  const SCEV *FirstLaneExpr =
      SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
  if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
    return false;

  // Make sure the expressions for lanes FixedVF-1..1 match the expression for
  // lane 0. We check lanes in reverse order for compile-time, as frequently
  // checking the last lane is sufficient to rule out uniformity.
  return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
    const SCEV *IthLaneExpr =
        SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
    return FirstLaneExpr == IthLaneExpr;
  });
}

bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                               ElementCount VF) const {
  Value *Ptr = getLoadStorePointerOperand(&I);
  if (!Ptr)
    return false;
  // Note: There's nothing inherent which prevents predicated loads and
  // stores from being uniform. The current lowering simply doesn't handle
  // it; in particular, the cost model distinguishes scatter/gather from
  // scalar w/predication, and we currently rely on the scalar path.
  return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
}

bool LoopVectorizationLegality::canVectorizeOuterLoop() {
  assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
  // Store the result and return it at the end instead of exiting early, in case
  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  for (BasicBlock *BB : TheLoop->blocks()) {
    // Check whether the BB terminator is a BranchInst. Any other terminator is
    // not supported yet.
    auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
    if (!Br) {
      reportVectorizationFailure("Unsupported basic block terminator",
                                 "loop control flow is not understood by vectorizer",
                                 "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

    // Check whether the BranchInst is a supported one. Only unconditional
    // branches, conditional branches with an outer loop invariant condition or
    // backedges are supported.
    // FIXME: We skip these checks when VPlan predication is enabled as we
    // want to allow divergent branches. This whole check will be removed
    // once VPlan predication is on by default.
    if (Br && Br->isConditional() &&
        !TheLoop->isLoopInvariant(Br->getCondition()) &&
        !LI->isLoopHeader(Br->getSuccessor(0)) &&
        !LI->isLoopHeader(Br->getSuccessor(1))) {
      reportVectorizationFailure("Unsupported conditional branch",
                                 "loop control flow is not understood by vectorizer",
                                 "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }
  }

  // Check whether inner loops are uniform. At this point, we only support
  // simple outer loops scenarios with uniform nested loops.
  if (!isUniformLoopNest(TheLoop /*loop nest*/,
                         TheLoop /*context outer loop*/)) {
    reportVectorizationFailure("Outer loop contains divergent loops",
                               "loop control flow is not understood by vectorizer",
                               "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check whether we are able to set up outer loop induction.
  if (!setupOuterLoopInductions()) {
    reportVectorizationFailure("Unsupported outer loop Phi(s)",
                               "UnsupportedPhi", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;

  // In case this induction also comes with casts that we know we can ignore
  // in the vectorized loop body, record them here. All casts could be recorded
  // here for ignoring, but it suffices to record only the first (as it is the
  // only one that may be used outside the cast sequence).
  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (!Casts.empty())
    InductionCastsToIgnore.insert(*Casts.begin());

  Type *PhiTy = Phi->getType();
  const DataLayout &DL = Phi->getDataLayout();

  // Get the widest type.
  if (!PhiTy->isFloatingPointTy()) {
    if (!WidestIndTy)
      WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
    else
      WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
      ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
      isa<Constant>(ID.getStartValue()) &&
      cast<Constant>(ID.getStartValue())->isNullValue()) {

    // Use the phi node with the widest type as induction. Use the last
    // one if there are multiple (no good reason for doing this other
    // than it is expedient). We've checked that it begins at zero and
    // steps by one, so this is a canonical induction variable.
    if (!PrimaryInduction || PhiTy == WidestIndTy)
      PrimaryInduction = Phi;
  }

  // Both the PHI node itself, and the "post-increment" value feeding
  // back into the PHI node may have external users.
  // We can allow those uses, except if the SCEVs we have for them rely
  // on predicates that only hold within the loop, since allowing the exit
  // currently means re-using this SCEV outside the loop (see PR33706 for more
  // details).
  if (PSE.getPredicate().isAlwaysTrue()) {
    AllowedExit.insert(Phi);
    AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
  }

  LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
}

bool LoopVectorizationLegality::setupOuterLoopInductions() {
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if a given Phi is a supported induction.
  auto IsSupportedPhi = [&](PHINode &Phi) -> bool {
    InductionDescriptor ID;
    if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
        ID.getKind() == InductionDescriptor::IK_IntInduction) {
      addInductionPhi(&Phi, ID, AllowedExit);
      return true;
    }
    // Bail out for any Phi in the outer loop header that is not a supported
    // induction.
    LLVM_DEBUG(
        dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
    return false;
  };

  return llvm::all_of(Header->phis(), IsSupportedPhi);
}

/// Checks if a function is scalarizable according to the TLI, in
/// the sense that it should be vectorized and then expanded in
/// multiple scalar calls. This is represented in the
/// TLI via mappings that do not specify a vector name, as in the
/// following example:
///
/// const VecDesc VecIntrinsics[] = {
///   {"llvm.phx.abs.i32", "", 4}
/// };
static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
  const StringRef ScalarName = CI.getCalledFunction()->getName();
  bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
  // Check that all known VFs are not associated to a vector
  // function, i.e. the vector name is empty.
  if (Scalarize) {
    ElementCount WidestFixedVF, WidestScalableVF;
    TLI.getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF);
    for (ElementCount VF = ElementCount::getFixed(2);
         ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    for (ElementCount VF = ElementCount::getScalable(1);
         ElementCount::isKnownLE(VF, WidestScalableVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    assert((WidestScalableVF.isZero() || !Scalarize) &&
           "Caller may decide to scalarize a variant using a scalable VF");
  }
  return Scalarize;
}

/// Returns true if the call return type `Ty` can be widened by the loop
/// vectorizer.
static bool canWidenCallReturnType(Type *Ty) {
  auto *StructTy = dyn_cast<StructType>(Ty);
  // TODO: Remove the homogeneous types restriction. This is just an initial
  // simplification. When we want to support things like the overflow
  // intrinsics we will have to lift this restriction.
  if (StructTy && !StructTy->containsHomogeneousTypes())
    return false;
  return canVectorizeTy(StructTy);
}
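
// For example (illustrative): a call returning the homogeneous struct
// { float, float }, as a sincos-style routine might, can pass this check,
// while one returning { i64, i1 }, like the overflow intrinsics, is rejected
// for now.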

bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *Header = TheLoop->getHeader();

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      if (auto *Phi = dyn_cast<PHINode>(&I)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          reportVectorizationFailure("Found a non-int non-pointer PHI",
                                     "loop control flow is not understood by vectorizer",
                                     "CFGNotUnderstood", ORE, TheLoop);
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to select during if-conversion. No need to check if
        // the PHIs in this block are induction or reduction variables.
        if (BB != Header) {
          // Non-header phi nodes that have outside uses can be vectorized. Add
          // them to the list of allowed exits.
          // Unsafe cyclic dependencies with header phis are identified during
          // legalization for reduction, induction and fixed order
          // recurrences.
          AllowedExit.insert(&I);
          continue;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          reportVectorizationFailure("Found an invalid PHI",
                                     "loop control flow is not understood by vectorizer",
                                     "CFGNotUnderstood", ORE, TheLoop, Phi);
          return false;
        }

        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
                                                 DT, PSE.getSE())) {
          Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
          AllowedExit.insert(RedDes.getLoopExitInstr());
          Reductions[Phi] = RedDes;
          continue;
        }

        // We prevent matching non-constant strided pointer IVs to preserve
        // historical vectorizer behavior after a generalization of the
        // IVDescriptor code. The intent is to remove this check, but we
        // have to fix issues around code quality for such loops first.
        auto IsDisallowedStridedPointerInduction =
            [](const InductionDescriptor &ID) {
              if (AllowStridedPointerIVs)
                return false;
              return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
                     ID.getConstIntStepValue() == nullptr;
            };

        // TODO: Instead of recording the AllowedExit, it would be good to
        // record the complementary set: NotAllowedExit. These include (but may
        // not be limited to):
        // 1. Reduction phis as they represent the one-before-last value, which
        // is not available when vectorized
        // 2. Induction phis and increment when SCEV predicates cannot be used
        // outside the loop - see addInductionPhi
        // 3. Non-Phis with outside uses when SCEV predicates cannot be used
        // outside the loop - see call to hasOutsideLoopUser in the non-phi
        // handling below
        // 4. FixedOrderRecurrence phis that can possibly be handled by
        // extraction.
        // By recording these, we can then reason about ways to vectorize each
        // of these NotAllowedExit.
        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
            !IsDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          Requirements->addExactFPMathInst(ID.getExactFPMathInst());
          continue;
        }

        if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
          AllowedExit.insert(Phi);
          FixedOrderRecurrences.insert(Phi);
          continue;
        }

        // As a last resort, coerce the PHI to an AddRec expression
        // and re-try classifying it as an induction PHI.
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
            !IsDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        reportVectorizationFailure("Found an unidentified PHI",
                                   "value that could not be identified as "
                                   "reduction is used outside the loop",
                                   "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
        return false;
      } // end of PHI handling

      // We handle calls that:
      //   * Are debug info intrinsics.
      //   * Have a mapping to an IR intrinsic.
      //   * Have a vector version available.
      auto *CI = dyn_cast<CallInst>(&I);

      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
          !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            (!VFDatabase::getMappings(*CI).empty() ||
             isTLIScalarize(*TLI, *CI)))) {
        // If the call is a recognized math library call, it is likely that
        // we can vectorize it given loosened floating-point constraints.
        LibFunc Func;
        bool IsMathLibCall =
            TLI && CI->getCalledFunction() &&
            CI->getType()->isFloatingPointTy() &&
            TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
            TLI->hasOptimizedCodeGen(Func);

        if (IsMathLibCall) {
          // TODO: Ideally, we should not use clang-specific language here,
          // but it's hard to provide meaningful yet generic advice.
          // Also, should this be guarded by allowExtraAnalysis() and/or be part
          // of the returned info from isFunctionVectorizable()?
          reportVectorizationFailure(
              "Found a non-intrinsic callsite",
              "library call cannot be vectorized. "
              "Try compiling with -fno-math-errno, -ffast-math, "
              "or similar flags",
              "CantVectorizeLibcall", ORE, TheLoop, CI);
        } else {
          reportVectorizationFailure("Found a non-intrinsic callsite",
                                     "call instruction cannot be vectorized",
                                     "CantVectorizeLibcall", ORE, TheLoop, CI);
        }
        return false;
      }

      // Some intrinsics have scalar arguments, which must remain scalar
      // (i.e. loop invariant) for the call to be vectorized.
      if (CI) {
        auto *SE = PSE.getSE();
        Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
        for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx)) {
            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)),
                                     TheLoop)) {
              reportVectorizationFailure("Found unvectorizable intrinsic",
                                         "intrinsic instruction cannot be vectorized",
                                         "CantVectorizeIntrinsic", ORE, TheLoop, CI);
              return false;
            }
          }
      }

      // If we found a vectorized variant of a function, note that so LV can
      // make better decisions about maximum VF.
      if (CI && !VFDatabase::getMappings(*CI).empty())
        VecCallVariantsFound = true;

      auto CanWidenInstructionTy = [this](Instruction const &Inst) {
        Type *InstTy = Inst.getType();
        if (!isa<StructType>(InstTy))
          return canVectorizeTy(InstTy);

        // For now, we only recognize struct values returned from calls where
        // all users are extractvalue as vectorizable. All element types of the
        // struct must be types that can be widened.
        if (isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
            all_of(Inst.users(), IsaPred<ExtractValueInst>)) {
          // TODO: Remove the `StructVecCallFound` flag once vectorizing calls
          // with struct returns is supported.
          StructVecCallFound = true;
          return true;
        }

        return false;
      };

      // Check that the instruction return type is vectorizable.
      // We can't vectorize casts from vector type to scalar type.
      // Also, we can't vectorize extractelement instructions.
      if (!CanWidenInstructionTy(I) ||
          (isa<CastInst>(I) &&
           !VectorType::isValidElementType(I.getOperand(0)->getType())) ||
          isa<ExtractElementInst>(I)) {
        reportVectorizationFailure("Found unvectorizable type",
                                   "instruction return type cannot be vectorized",
                                   "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
        return false;
      }

      // Check that the stored type is vectorizable.
      if (auto *ST = dyn_cast<StoreInst>(&I)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          reportVectorizationFailure("Store instruction cannot be vectorized",
                                     "CantVectorizeStore", ORE, TheLoop, ST);
          return false;
        }

        // For nontemporal stores, check that a nontemporal vector version is
        // supported on the target.
        if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
          // Arbitrarily try a vector of 2 elements.
          auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of stored type");
          if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
            reportVectorizationFailure(
                "nontemporal store instruction cannot be vectorized",
                "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
            return false;
          }
        }

      } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
        if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
          // For nontemporal loads, check that a nontemporal vector version is
          // supported on the target (arbitrarily try a vector of 2 elements).
          auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of load type");
          if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
            reportVectorizationFailure(
                "nontemporal load instruction cannot be vectorized",
                "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
            return false;
          }
        }

        // FP instructions can allow unsafe algebra, thus vectorizable by
        // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not memory
        // operations, shuffles, or casts, as they don't change precision or
        // semantics.
      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
                 !I.isFast()) {
        LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
        Hints->setPotentiallyUnsafe();
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
        // We can safely vectorize loops where instructions within the loop are
        // used outside the loop only if the SCEV predicates within the loop
        // are the same as outside the loop. Allowing the exit means reusing
        // the SCEV outside the loop.
        if (PSE.getPredicate().isAlwaysTrue()) {
          AllowedExit.insert(&I);
          continue;
        }
        reportVectorizationFailure("Value cannot be used outside the loop",
                                   "ValueUsedOutsideLoop", ORE, TheLoop, &I);
        return false;
      }
    } // next instr.
  }

  if (!PrimaryInduction) {
    if (Inductions.empty()) {
      reportVectorizationFailure("Did not find one integer induction var",
                                 "loop induction variable could not be identified",
                                 "NoInductionVariable", ORE, TheLoop);
      return false;
    }
    if (!WidestIndTy) {
      reportVectorizationFailure("Did not find one integer induction var",
                                 "integer loop induction variable could not be identified",
                                 "NoIntegerInductionVariable", ORE, TheLoop);
      return false;
    }
    LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
    PrimaryInduction = nullptr;

  return true;
}

/// Find histogram operations that match high-level code in loops:
/// \code
/// buckets[indices[i]]+=step;
/// \endcode
///
/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
/// array the computed histogram. It uses a BinOp to sum all counts, storing
/// them using a loop-variant index Load from the 'indices' input array.
///
/// On successful matches it updates the STATISTIC 'HistogramsDetected',
/// regardless of hardware support. When there is support, it additionally
/// stores the matched Load/BinOp/Store sequence in \p Histograms.
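///
/// An illustrative IR shape this matches (all names hypothetical):
/// \code
///   %idx = load i32, ptr %gep.indices
///   %ext = sext i32 %idx to i64
///   %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %ext
///   %old = load i32, ptr %gep.bucket
///   %inc = add i32 %old, 1
///   store i32 %inc, ptr %gep.bucket
/// \endcode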
static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
                          const PredicatedScalarEvolution &PSE,
                          SmallVectorImpl<HistogramInfo> &Histograms) {

  // Store value must come from a Binary Operation.
  Instruction *HPtrInstr = nullptr;
  BinaryOperator *HBinOp = nullptr;
  if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr))))
    return false;

  // BinOp must be an Add or a Sub modifying the bucket value by a
  // loop invariant amount.
  // FIXME: We assume the loop invariant term is on the RHS.
  // Fine for an immediate/constant, but maybe not a generic value?
  Value *HIncVal = nullptr;
  if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
      !match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))))
    return false;

  // Make sure the increment value is loop invariant.
  if (!TheLoop->isLoopInvariant(HIncVal))
    return false;

  // The address to store is calculated through a GEP Instruction.
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(HPtrInstr);
  if (!GEP)
    return false;

  // Restrict address calculation to constant indices except for the last term.
  Value *HIdx = nullptr;
  for (Value *Index : GEP->indices()) {
    if (HIdx)
      return false;
    if (!isa<ConstantInt>(Index))
      HIdx = Index;
  }

  if (!HIdx)
    return false;

  // Check that the index is calculated by loading from another array. Ignore
  // any extensions.
  // FIXME: Support indices from other sources than a linear load from memory?
  // We're currently trying to match an operation looping over an array
  // of indices, but there could be additional levels of indirection
  // in place, or possibly some additional calculation to form the index
  // from the loaded data.
  Value *VPtrVal;
  if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Load(m_Value(VPtrVal)))))
    return false;

  // Make sure the index address varies in this loop, not an outer loop.
  const auto *AR = dyn_cast<SCEVAddRecExpr>(PSE.getSE()->getSCEV(VPtrVal));
  if (!AR || AR->getLoop() != TheLoop)
    return false;

  // Ensure we'll have the same mask by checking that all parts of the histogram
  // (gather load, update, scatter store) are in the same block.
  LoadInst *IndexedLoad = cast<LoadInst>(HBinOp->getOperand(0));
  BasicBlock *LdBB = IndexedLoad->getParent();
  if (LdBB != HBinOp->getParent() || LdBB != HSt->getParent())
    return false;

  LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");

  // Store the operations that make up the histogram.
  Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
  return true;
}

bool LoopVectorizationLegality::canVectorizeIndirectUnsafeDependences() {
  // For now, we only support an IndirectUnsafe dependency that calculates
  // a histogram.
  if (!EnableHistogramVectorization)
    return false;

  // Find a single IndirectUnsafe dependency.
  const MemoryDepChecker::Dependence *IUDep = nullptr;
  const MemoryDepChecker &DepChecker = LAI->getDepChecker();
  const auto *Deps = DepChecker.getDependences();
  // If there were too many dependences, LAA abandons recording them. We can't
  // proceed safely if we don't know what the dependences are.
  if (!Deps)
    return false;

  for (const MemoryDepChecker::Dependence &Dep : *Deps) {
    // Ignore dependencies that are either known to be safe or can be
    // checked at runtime.
    if (MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) !=
        MemoryDepChecker::VectorizationSafetyStatus::Unsafe)
      continue;

    // We're only interested in IndirectUnsafe dependencies here, where the
    // address might come from a load from memory. We also only want to handle
    // one such dependency, at least for now.
    if (Dep.Type != MemoryDepChecker::Dependence::IndirectUnsafe || IUDep)
      return false;

    IUDep = &Dep;
  }
  if (!IUDep)
    return false;

  // For now only normal loads and stores are supported.
  LoadInst *LI = dyn_cast<LoadInst>(IUDep->getSource(DepChecker));
  StoreInst *SI = dyn_cast<StoreInst>(IUDep->getDestination(DepChecker));

  if (!LI || !SI)
    return false;

  LLVM_DEBUG(dbgs() << "LV: Checking for a histogram on: " << *SI << "\n");
  return findHistogram(LI, SI, TheLoop, LAI->getPSE(), Histograms);
}

bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &LAIs.getInfo(*TheLoop);
  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
  if (LAR) {
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
                                        "loop not vectorized: ", *LAR);
    });
  }

  if (!LAI->canVectorizeMemory())
    return canVectorizeIndirectUnsafeDependences();

  if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
    reportVectorizationFailure("We don't allow storing to uniform addresses",
                               "write to a loop invariant address could not "
                               "be vectorized",
                               "CantVectorizeStoreToLoopInvariantAddress", ORE,
                               TheLoop);
    return false;
  }

  // We can vectorize stores to invariant address when final reduction value is
  // guaranteed to be stored at the end of the loop. Also, if decision to
  // vectorize loop is made, runtime checks are added so as to make sure that
  // invariant address won't alias with any other objects.
  if (!LAI->getStoresToInvariantAddresses().empty()) {
    // For each invariant address, check if last stored value is unconditional
    // and the address is not calculated inside the loop.
    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
      if (!isInvariantStoreOfReduction(SI))
        continue;

      if (blockNeedsPredication(SI->getParent())) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write of conditional recurring variant value to a loop "
            "invariant address could not be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }

      // Invariant address should be defined outside of loop. LICM pass usually
      // makes sure it happens, but in rare cases it does not, and we do not
      // want to overcomplicate vectorization to support this case.
      if (Instruction *Ptr = dyn_cast<Instruction>(SI->getPointerOperand())) {
        if (TheLoop->contains(Ptr)) {
          reportVectorizationFailure(
              "Invariant address is calculated inside the loop",
              "write to a loop invariant address could not "
              "be vectorized",
              "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
          return false;
        }
      }
    }

    if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
      // For each invariant address, check its last stored value is the result
      // of one of our reductions.
      //
      // We do not check if dependence with loads exists because that is
      // already checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress.
      ScalarEvolution *SE = PSE.getSE();
      SmallVector<StoreInst *, 4> UnhandledStores;
      for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
        if (isInvariantStoreOfReduction(SI)) {
          // Earlier stores to this address are effectively deadcode.
          // With opaque pointers it is possible for one pointer to be used with
          // different sizes of stored values:
          //    store i32 0, ptr %x
          //    store i8 0, ptr %x
          // The latest store doesn't completely overwrite the first one in the
          // example. That is why we have to make sure that types of stored
          // values are the same.
          // TODO: Check that bitwidth of unhandled store is smaller than the
          // one that overwrites it and add a test.
          erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
            return storeToSameAddress(SE, SI, I) &&
                   I->getValueOperand()->getType() ==
                       SI->getValueOperand()->getType();
          });
          continue;
        }
        UnhandledStores.push_back(SI);
      }

      bool IsOK = UnhandledStores.empty();
      // TODO: we should also validate against InvariantMemSets.
      if (!IsOK) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write to a loop invariant address could not "
            "be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }
    }
  }

  PSE.addPredicate(LAI->getPSE().getPredicate());
  return true;
}

bool LoopVectorizationLegality::canVectorizeFPMath(
    bool EnableStrictReductions) {

  // First check if there is any ExactFP math or if we allow reassociations.
  if (!Requirements->getExactFPInst() || Hints->allowReordering())
    return true;

  // If the above is false, we have ExactFPMath & do not allow reordering.
  // If the EnableStrictReductions flag is set, first check if we have any
  // Exact FP induction vars, which we cannot vectorize.
  if (!EnableStrictReductions ||
      any_of(getInductionVars(), [&](auto &Induction) -> bool {
        InductionDescriptor IndDesc = Induction.second;
        return IndDesc.getExactFPMathInst();
      }))
    return false;

  // We can now only vectorize if all reductions with Exact FP math also
  // have the isOrdered flag set, which indicates that we can move the
  // reduction operations in-loop.
  return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
  }));
}

bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return RdxDesc.IntermediateStore == SI;
  });
}

bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    if (!RdxDesc.IntermediateStore)
      return false;

    ScalarEvolution *SE = PSE.getSE();
    Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
    return V == InvariantAddress ||
           SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
  });
}

bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
  Value *In0 = const_cast<Value *>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;

  return Inductions.count(PN);
}

const InductionDescriptor *
LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
      ID.getKind() == InductionDescriptor::IK_FpInduction)
    return &ID;
  return nullptr;
}

const InductionDescriptor *
LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
    return &ID;
  return nullptr;
}

bool LoopVectorizationLegality::isCastedInductionVariable(
    const Value *V) const {
  auto *Inst = dyn_cast<Instruction>(V);
  return (Inst && InductionCastsToIgnore.count(Inst));
}

bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
  return isInductionPhi(V) || isCastedInductionVariable(V);
}

bool LoopVectorizationLegality::isFixedOrderRecurrence(
    const PHINode *Phi) const {
  return FixedOrderRecurrences.count(Phi);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
  // When vectorizing early exits, create predicates for the latch block only.
  // The early exiting block must be a direct predecessor of the latch at the
  // moment.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  if (hasUncountableEarlyExit()) {
    assert(
        is_contained(predecessors(Latch), getUncountableEarlyExitingBlock()) &&
        "Uncountable exiting block must be a direct predecessor of latch");
    return BB == Latch;
  }
  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}

bool LoopVectorizationLegality::blockCanBePredicated(
    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
    SmallPtrSetImpl<const Instruction *> &MaskedOp) const {
  for (Instruction &I : *BB) {
    // We can predicate blocks with calls to assume, as long as we drop them in
    // case we flatten the CFG via predication.
    if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
      MaskedOp.insert(&I);
      continue;
    }

    // Do not let llvm.experimental.noalias.scope.decl block the vectorization.
    // TODO: there might be cases that it should block the vectorization. Let's
    // ignore those for now.
    if (isa<NoAliasScopeDeclInst>(&I))
      continue;

    // We can allow masked calls if there's at least one vector variant, even
    // if we end up scalarizing due to the cost model calculations.
    // TODO: Allow other calls if they have appropriate attributes... readonly
    // and argmemonly?
    if (CallInst *CI = dyn_cast<CallInst>(&I))
      if (VFDatabase::hasMaskedVariant(*CI)) {
        MaskedOp.insert(CI);
        continue;
      }

    // Loads are handled via masking (or speculated if safe to do so.)
    if (auto *LI = dyn_cast<LoadInst>(&I)) {
      if (!SafePtrs.count(LI->getPointerOperand()))
        MaskedOp.insert(LI);
      continue;
    }

    // Predicated store requires some form of masking:
    // 1) masked store HW instruction,
    // 2) emulation via load-blend-store (only if safe and legal to do so,
    //    be aware of the race conditions), or
    // 3) element-by-element predicate check and scalar store.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      MaskedOp.insert(SI);
      continue;
    }

    if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
      return false;
  }

  return true;
}

bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    reportVectorizationFailure("If-conversion is disabled",
                               "IfConversionDisabled", ORE, TheLoop);
    return false;
  }

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A list of pointers which are known to be dereferenceable within scope of
  // the loop body for each iteration of the loop which executes. That is,
  // the memory pointed to can be dereferenced (with the access size implied by
  // the value's type) unconditionally within the loop header without
  // introducing a new fault.
  SmallPtrSet<Value *, 8> SafePointers;

  // Collect safe addresses.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB)) {
      for (Instruction &I : *BB)
        if (auto *Ptr = getLoadStorePointerOperand(&I))
          SafePointers.insert(Ptr);
      continue;
    }

    // For a block which requires predication, an address may be safe to access
    // in the loop w/o predication if we can prove dereferenceability facts
    // sufficient to ensure it'll never fault within the loop. For the moment,
    // we restrict this to loads; stores are more complicated due to
    // concurrency restrictions.
    ScalarEvolution &SE = *PSE.getSE();
    SmallVector<const SCEVPredicate *, 4> Predicates;
    for (Instruction &I : *BB) {
      LoadInst *LI = dyn_cast<LoadInst>(&I);
      // Pass the Predicates pointer to isDereferenceableAndAlignedInLoop so
      // that it will consider loops that need guarding by SCEV checks. The
      // vectoriser will generate these checks if we decide to vectorise.
      if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) &&
          isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC,
                                            &Predicates))
        SafePointers.insert(LI->getPointerOperand());
      Predicates.clear();
    }
  }

  // Collect the blocks that need predication.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // We support only branches and switch statements as terminators inside the
    // loop.
    if (isa<SwitchInst>(BB->getTerminator())) {
      if (TheLoop->isLoopExiting(BB)) {
        reportVectorizationFailure("Loop contains an unsupported switch",
                                   "LoopContainsUnsupportedSwitch", ORE,
                                   TheLoop, BB->getTerminator());
        return false;
      }
    } else if (!isa<BranchInst>(BB->getTerminator())) {
      reportVectorizationFailure("Loop contains an unsupported terminator",
                                 "LoopContainsUnsupportedTerminator", ORE,
                                 TheLoop, BB->getTerminator());
      return false;
    }

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB) &&
        !blockCanBePredicated(BB, SafePointers, MaskedOp)) {
      reportVectorizationFailure(
          "Control flow cannot be substituted for a select", "NoCFGForSelect",
          ORE, TheLoop, BB->getTerminator());
      return false;
    }
  }

  // We can if-convert this loop.
  return true;
}

// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                    bool UseVPlanNativePath) {
  assert((UseVPlanNativePath || Lp->isInnermost()) &&
         "VPlan-native path is not enabled.");

  // TODO: ORE should be improved to show more accurate information when an
  // outer loop can't be vectorized because a nested loop is not understood or
  // legal. Something like: "outer_loop_location: loop not vectorized:
  // (inner_loop_location) loop control flow is not understood by vectorizer".

  // Store the result and return it at the end instead of exiting early, in case
  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  // We must have a loop in canonical form. Loops with indirectbr in them cannot
  // be canonicalized.
  if (!Lp->getLoopPreheader()) {
    reportVectorizationFailure("Loop doesn't have a legal pre-header",
                               "loop control flow is not understood by vectorizer",
                               "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We must have a single backedge.
  if (Lp->getNumBackEdges() != 1) {
    reportVectorizationFailure("The loop must have a single backedge",
                               "loop control flow is not understood by vectorizer",
                               "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}
1585
1586bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
1587 Loop *Lp, bool UseVPlanNativePath) {
1588 // Store the result and return it at the end instead of exiting early, in case
1589 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1590 bool Result = true;
1591 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1592 if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
1593 if (DoExtraAnalysis)
1594 Result = false;
1595 else
1596 return false;
1597 }
1598
1599 // Recursively check whether the loop control flow of nested loops is
1600 // understood.
1601 for (Loop *SubLp : *Lp)
1602 if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
1603 if (DoExtraAnalysis)
1604 Result = false;
1605 else
1606 return false;
1607 }
1608
1609 return Result;
1610}
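// Illustrative nest (assumed example): in the VPlan-native path the
// recursion above visits every nested loop, e.g. both levels of
//
//   for (int i = 0; i < n; ++i)    // Lp (outer)
//     for (int j = 0; j < m; ++j)  // SubLp, must also pass canVectorizeLoopCFG
//       a[i][j] = 0;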
1611
1612bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
1613 BasicBlock *LatchBB = TheLoop->getLoopLatch();
1614 if (!LatchBB) {
1615 reportVectorizationFailure("Loop does not have a latch",
1616 "Cannot vectorize early exit loop",
1617 "NoLatchEarlyExit", ORE, TheLoop);
1618 return false;
1619 }
1620
1621 if (Reductions.size() || FixedOrderRecurrences.size()) {
1622 reportVectorizationFailure(
1623 "Found reductions or recurrences in early-exit loop",
1624 "Cannot vectorize early exit loop with reductions or recurrences",
1625 "RecurrencesInEarlyExitLoop", ORE, TheLoop);
1626 return false;
1627 }
1628
1629 SmallVector<BasicBlock *, 8> ExitingBlocks;
1630 TheLoop->getExitingBlocks(ExitingBlocks);
1631
1632 // Keep a record of all the exiting blocks.
1633 SmallVector<const SCEVPredicate *, 4> Predicates;
1634 for (BasicBlock *BB : ExitingBlocks) {
1635 const SCEV *EC =
1636 PSE.getSE()->getPredicatedExitCount(TheLoop, BB, &Predicates);
1637 if (isa<SCEVCouldNotCompute>(EC)) {
1638 UncountableExitingBlocks.push_back(BB);
1639
1640 SmallVector<BasicBlock *, 2> Succs(successors(BB));
1641 if (Succs.size() != 2) {
1642 reportVectorizationFailure(
1643 "Early exiting block does not have exactly two successors",
1644 "Incorrect number of successors from early exiting block",
1645 "EarlyExitTooManySuccessors", ORE, TheLoop);
1646 return false;
1647 }
1648
1649 BasicBlock *ExitBlock;
1650 if (!TheLoop->contains(Succs[0]))
1651 ExitBlock = Succs[0];
1652 else {
1653 assert(!TheLoop->contains(Succs[1]));
1654 ExitBlock = Succs[1];
1655 }
1656 UncountableExitBlocks.push_back(ExitBlock);
1657 } else
1658 CountableExitingBlocks.push_back(BB);
1659 }
1660 // We can safely ignore the predicates here because when vectorizing the loop
1661 // the PredicatedScalarEvolution class will keep track of all predicates
1662 // for each exiting block anyway. This happens when calling
1663 // PSE.getSymbolicMaxBackedgeTakenCount() below.
1664 Predicates.clear();
1665
1666 // We only support one uncountable early exit.
1667 if (getUncountableExitingBlocks().size() != 1) {
1668 reportVectorizationFailure(
1669 "Loop has too many uncountable exits",
1670 "Cannot vectorize early exit loop with more than one early exit",
1671 "TooManyUncountableEarlyExits", ORE, TheLoop);
1672 return false;
1673 }
1674
1675 // The only supported early exit loops so far are ones where the early
1676 // exiting block is the unique predecessor of the latch block.
1677 BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
1678 if (LatchPredBB != getUncountableEarlyExitingBlock()) {
1679 reportVectorizationFailure("Early exit is not the latch predecessor",
1680 "Cannot vectorize early exit loop",
1681 "EarlyExitNotLatchPredecessor", ORE, TheLoop);
1682 return false;
1683 }
1684
1685 // The latch block must have a countable exit.
1686 if (isa<SCEVCouldNotCompute>(
1687 PSE.getSE()->getPredicatedExitCount(TheLoop, LatchBB, &Predicates))) {
1688 reportVectorizationFailure(
1689 "Cannot determine exact exit count for latch block",
1690 "Cannot vectorize early exit loop",
1691 "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop);
1692 return false;
1693 }
1694 assert(llvm::is_contained(CountableExitingBlocks, LatchBB) &&
1695 "Latch block not found in list of countable exits!");
1696
1697 // Check to see if there are instructions that could potentially generate
1698 // exceptions or have side-effects.
1699 auto IsSafeOperation = [](Instruction *I) -> bool {
1700 switch (I->getOpcode()) {
1701 case Instruction::Load:
1702 case Instruction::Store:
1703 case Instruction::PHI:
1704 case Instruction::Br:
1705 // These are checked separately.
1706 return true;
1707 default:
1708 return isSafeToSpeculativelyExecute(I);
1709 }
1710 };
1711
1712 for (auto *BB : TheLoop->blocks())
1713 for (auto &I : *BB) {
1714 if (I.mayWriteToMemory()) {
1715 // We don't support writes to memory.
1716 reportVectorizationFailure(
1717 "Writes to memory unsupported in early exit loops",
1718 "Cannot vectorize early exit loop with writes to memory",
1719 "WritesInEarlyExitLoop", ORE, TheLoop);
1720 return false;
1721 } else if (!IsSafeOperation(&I)) {
1722 reportVectorizationFailure("Early exit loop contains operations that "
1723 "cannot be speculatively executed",
1724 "UnsafeOperationsEarlyExitLoop", ORE,
1725 TheLoop);
1726 return false;
1727 }
1728 }
1729
1730 // The vectoriser cannot handle loads that occur after the early exit block.
1731 assert(LatchBB->getUniquePredecessor() == getUncountableEarlyExitingBlock() &&
1732 "Expected latch predecessor to be the early exiting block");
1733
1734 // TODO: Handle loops that may fault.
1735 Predicates.clear();
1736 if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
1737 &Predicates)) {
1738 reportVectorizationFailure(
1739 "Loop may fault",
1740 "Cannot vectorize potentially faulting early exit loop",
1741 "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1742 return false;
1743 }
1744
1745 [[maybe_unused]] const SCEV *SymbolicMaxBTC =
1746 PSE.getSymbolicMaxBackedgeTakenCount();
1747 // Since we have an exact exit count for the latch and the early exit
1748 // dominates the latch, this should guarantee a computed SCEV value.
1749 assert(!isa<SCEVCouldNotCompute>(SymbolicMaxBTC) &&
1750 "Failed to get symbolic expression for backedge taken count");
1751 LLVM_DEBUG(dbgs() << "LV: Found an early exit loop with symbolic max "
1752 "backedge taken count: "
1753 << *SymbolicMaxBTC << '\n');
1754 return true;
1755}
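// Minimal sketch (assumed example) of an early-exit loop satisfying the
// constraints above: read-only memory, one uncountable exit that is the
// unique latch predecessor, and a countable latch exit.
//
//   int find(const int *a, int n, int key) {
//     for (int i = 0; i < n; ++i) // latch exit: countable (i < n)
//       if (a[i] == key)          // early exit: uncountable, read-only body
//         return i;
//     return -1;
//   }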
1756
1757bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
1758 // Store the result and return it at the end instead of exiting early, in case
1759 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1760 bool Result = true;
1761
1762 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1763 // Check whether the loop-related control flow in the loop nest is understood
1764 // by the vectorizer.
1765 if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
1766 if (DoExtraAnalysis) {
1767 LLVM_DEBUG(dbgs() << "LV: legality check failed: loop nest");
1768 Result = false;
1769 } else {
1770 return false;
1771 }
1772 }
1773
1774 // We need to have a loop header.
1775 LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
1776 << '\n');
1777
1778 // Specific checks for outer loops. We skip the remaining legal checks at this
1779 // point because they don't support outer loops.
1780 if (!TheLoop->isInnermost()) {
1781 assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
1782
1783 if (!canVectorizeOuterLoop()) {
1784 reportVectorizationFailure("Unsupported outer loop",
1785 "UnsupportedOuterLoop", ORE, TheLoop);
1786 // TODO: Implement DoExtraAnalysis when subsequent legal checks support
1787 // outer loops.
1788 return false;
1789 }
1790
1791 LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
1792 return Result;
1793 }
1794
1795 assert(TheLoop->isInnermost() && "Inner loop expected.");
1796 // Check if we can if-convert non-single-bb loops.
1797 unsigned NumBlocks = TheLoop->getNumBlocks();
1798 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
1799 LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
1800 if (DoExtraAnalysis)
1801 Result = false;
1802 else
1803 return false;
1804 }
1805
1806 // Check if we can vectorize the instructions and CFG in this loop.
1807 if (!canVectorizeInstrs()) {
1808 LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
1809 if (DoExtraAnalysis)
1810 Result = false;
1811 else
1812 return false;
1813 }
1814
1815 HasUncountableEarlyExit = false;
1816 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
1817 if (TheLoop->getExitingBlock()) {
1818 reportVectorizationFailure("Cannot vectorize uncountable loop",
1819 "UnsupportedUncountableLoop", ORE, TheLoop);
1820 if (DoExtraAnalysis)
1821 Result = false;
1822 else
1823 return false;
1824 } else {
1825 HasUncountableEarlyExit = true;
1826 if (!isVectorizableEarlyExitLoop()) {
1827 UncountableExitingBlocks.clear();
1828 HasUncountableEarlyExit = false;
1829 if (DoExtraAnalysis)
1830 Result = false;
1831 else
1832 return false;
1833 }
1834 }
1835 }
1836
1837 // Go over each instruction and look at memory deps.
1838 if (!canVectorizeMemory()) {
1839 LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
1840 if (DoExtraAnalysis)
1841 Result = false;
1842 else
1843 return false;
1844 }
1845
1846 if (Result) {
1847 LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
1848 << (LAI->getRuntimePointerChecking()->Need
1849 ? " (with a runtime bound check)"
1850 : "")
1851 << "!\n");
1852 }
1853
1854 unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
1855 if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
1856 SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
1857
1858 if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
1859 LLVM_DEBUG(dbgs() << "LV: Vectorization not profitable "
1860 "due to SCEVThreshold");
1861 reportVectorizationFailure("Too many SCEV checks needed",
1862 "Too many SCEV assumptions need to be made and checked at runtime",
1863 "TooManySCEVRunTimeChecks", ORE, TheLoop);
1864 if (DoExtraAnalysis)
1865 Result = false;
1866 else
1867 return false;
1868 }
1869
1870 // Okay! We've done all the tests. If any have failed, return false. Otherwise
1871 // we can vectorize, and at this point we don't have any other mem analysis
1872 // which may limit our maximum vectorization factor, so just return true with
1873 // no restrictions.
1874 return Result;
1875}
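// Hypothetical caller sketch (constructor arguments elided and assumed):
// LoopVectorize queries legality before any cost modelling, roughly:
//
//   LoopVectorizationLegality LVL(/*L, PSE, DT, TTI, TLI, ...*/);
//   if (!LVL.canVectorize(EnableVPlanNativePath))
//     return false; // remarks were already emitted above
//   // ...continue to the cost model and VPlan construction...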
1876
1877bool LoopVectorizationLegality::canFoldTailByMasking() const {
1878
1879 LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
1880
1881 SmallPtrSet<const Value *, 8> ReductionLiveOuts;
1882
1883 for (const auto &Reduction : getReductionVars())
1884 ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
1885
1886 // TODO: handle non-reduction outside users when tail is folded by masking.
1887 for (auto *AE : AllowedExit) {
1888 // Check that all users of allowed exit values are inside the loop or
1889 // are the live-out of a reduction.
1890 if (ReductionLiveOuts.count(AE))
1891 continue;
1892 for (User *U : AE->users()) {
1893 Instruction *UI = cast<Instruction>(U);
1894 if (TheLoop->contains(UI))
1895 continue;
1896 LLVM_DEBUG(
1897 dbgs()
1898 << "LV: Cannot fold tail by masking, loop has an outside user for "
1899 << *UI << "\n");
1900 return false;
1901 }
1902 }
1903
1904 for (const auto &Entry : getInductionVars()) {
1905 PHINode *OrigPhi = Entry.first;
1906 for (User *U : OrigPhi->users()) {
1907 auto *UI = cast<Instruction>(U);
1908 if (!TheLoop->contains(UI)) {
1909 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
1910 "outside user for "
1911 << *UI << "\n");
1912 return false;
1913 }
1914 }
1915 }
1916
1917 // The list of pointers that we can safely read and write to remains empty.
1918 SmallPtrSet<Value *, 8> SafePointers;
1919
1920 // Check all blocks for predication, including those that ordinarily do not
1921 // need predication such as the header block.
1922 SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
1923 for (BasicBlock *BB : TheLoop->blocks()) {
1924 if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
1925 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
1926 return false;
1927 }
1928 }
1929
1930 LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
1931
1932 return true;
1933}
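// Conceptual sketch (assumed pseudocode) of tail folding by masking: instead
// of running a scalar epilogue, the final vector iteration executes with the
// out-of-range lanes disabled, so every block must tolerate predication.
//
//   for (i = 0; i < alignUp(n, VF); i += VF) {
//     mask = {i+0 < n, i+1 < n, ..., i+VF-1 < n}; // header mask per lane
//     v = masked.load(&a[i], mask);
//     masked.store(v, &b[i], mask);
//   }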
1934
1935void LoopVectorizationLegality::prepareToFoldTailByMasking() {
1936 // The list of pointers that we can safely read and write to remains empty.
1937 SmallPtrSet<Value *, 8> SafePointers;
1938
1939 // Mark all blocks for predication, including those that ordinarily do not
1940 // need predication such as the header block.
1941 for (BasicBlock *BB : TheLoop->blocks()) {
1942 [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
1943 assert(R && "Must be able to predicate block when tail-folding.");
1944 }
1945}
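// Expected call order (sketch, assumed): prepareToFoldTailByMasking() should
// only run once canFoldTailByMasking() has succeeded, since the assert above
// requires every block to be predicable.
//
//   if (LVL.canFoldTailByMasking())
//     LVL.prepareToFoldTailByMasking(); // records MaskedOp for later widening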
1946
1947} // namespace llvm