LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
75 Phi->getName());
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
83 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
84 NewRecipe = new VPWidenStoreRecipe(
85 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
86 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
87 Ingredient.getDebugLoc());
89 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
90 Ingredient.getDebugLoc());
91 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
92 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
93 if (VectorID == Intrinsic::not_intrinsic)
94 return false;
95
96 // The noalias.scope.decl intrinsic declares a noalias scope that
97 // is valid for a single iteration. Emitting it as a single-scalar
98 // replicate would incorrectly extend the scope across multiple
99 // original iterations packed into one vector iteration.
100 // FIXME: If we want to vectorize this loop, then we have to drop
101 // all the associated !alias.scope and !noalias.
102 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
103 return false;
104
105 // These intrinsics are recognized by getVectorIntrinsicIDForCall
106 // but are not widenable. Emit them as replicate instead of widening.
107 if (VectorID == Intrinsic::assume ||
108 VectorID == Intrinsic::lifetime_end ||
109 VectorID == Intrinsic::lifetime_start ||
110 VectorID == Intrinsic::sideeffect ||
111 VectorID == Intrinsic::pseudoprobe) {
112 // If the operand of llvm.assume holds before vectorization, it will
113 // also hold per lane.
114 // llvm.pseudoprobe requires to be duplicated per lane for accurate
115 // sample count.
116 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
117 VectorID != Intrinsic::pseudoprobe;
118 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
119 /*IsSingleScalar=*/IsSingleScalar,
120 /*Mask=*/nullptr, *VPI, *VPI,
121 Ingredient.getDebugLoc());
122 } else {
123 NewRecipe = new VPWidenIntrinsicRecipe(
124 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
125 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
126 }
127 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
128 NewRecipe = new VPWidenCastRecipe(
129 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
130 VPIRFlags(*CI), VPIRMetadata(*CI));
131 } else {
132 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
133 *VPI, Ingredient.getDebugLoc());
134 }
135 } else {
137 "inductions must be created earlier");
138 continue;
139 }
140
141 NewRecipe->insertBefore(&Ingredient);
142 if (NewRecipe->getNumDefinedValues() == 1)
143 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
144 else
145 assert(NewRecipe->getNumDefinedValues() == 0 &&
146 "Only recpies with zero or one defined values expected");
147 Ingredient.eraseFromParent();
148 }
149 }
150 return true;
151}
152
153/// Helper for extra no-alias checks via known-safe recipe and SCEV.
155 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
156 VPReplicateRecipe &GroupLeader;
158 const Loop &L;
159 VPTypeAnalysis &TypeInfo;
160
161 // Return true if \p A and \p B are known to not alias for all VFs in the
162 // plan, checked via the distance between the accesses
 // Both recipes must be stores: the address is read from operand 1 and the
 // stored value (used only for its type/size) from operand 0. Returns false
 // whenever this check cannot establish no-alias.
163 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
164 if (A->getOpcode() != Instruction::Store ||
165 B->getOpcode() != Instruction::Store)
166 return false;
167
168 VPValue *AddrA = A->getOperand(1);
169 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
170 VPValue *AddrB = B->getOperand(1);
171 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
 // NOTE(review): the condition guarding the following early return is not
 // visible in this extract; presumably it rejects addresses for which no SCEV
 // could be computed -- confirm against the full source.
173 return false;
174
 // Give up unless the difference of the two address SCEVs matches
 // m_scev_APInt, which binds it to Distance.
175 const APInt *Distance;
176 ScalarEvolution &SE = *PSE.getSE();
177 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
178 return false;
179
180 const DataLayout &DL = SE.getDataLayout();
181 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
182 uint64_t SizeA = DL.getTypeStoreSize(TyA);
183 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
184 uint64_t SizeB = DL.getTypeStoreSize(TyB);
185
186 // Use the maximum store size to ensure no overlap from either direction.
187 // Currently only handles fixed sizes, as it is only used for
188 // replicating VPReplicateRecipes.
189 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
190
191 auto VFs = B->getParent()->getPlan()->vectorFactors();
 // NOTE(review): the definition of MaxVF is not visible in this extract;
 // presumably it is the largest element of VFs -- confirm.
193 if (MaxVF.isScalable())
194 return false;
 // No-alias holds if the absolute distance is at least MaxVF * MaxStoreSize.
195 return Distance->abs().uge(
196 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
197 }
198
199public:
202 const Loop &L, VPTypeAnalysis &TypeInfo)
203 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
204 L(L), TypeInfo(TypeInfo) {}
205
206 /// Return true if \p R should be skipped during alias checking, either
207 /// because it's in the exclude set or because no-alias can be proven via
208 /// SCEV.
209 bool shouldSkip(VPRecipeBase &R) const {
210 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
211 return ExcludeRecipes.contains(&R) ||
212 (Store && isNoAliasViaDistance(Store, &GroupLeader));
213 }
214};
215
216/// Check if a memory operation doesn't alias with memory operations using
217/// scoped noalias metadata, in blocks in the single-successor chain between \p
218/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
219/// write to memory are checked (for load hoisting). Otherwise recipes that both
220/// read and write memory are checked, and SCEV is used to prove no-alias
221/// between the group leader and other replicate recipes (for store sinking).
222static bool
224 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
225 std::optional<SinkStoreInfo> SinkInfo = {}) {
226 bool CheckReads = SinkInfo.has_value();
227 if (!MemLoc.AATags.Scope)
228 return false;
229
230 for (VPBasicBlock *VPBB :
232 for (VPRecipeBase &R : *VPBB) {
233 if (SinkInfo && SinkInfo->shouldSkip(R))
234 continue;
235
236 // Skip recipes that don't need checking.
237 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
238 continue;
239
241 if (!Loc)
242 // Conservatively assume aliasing for memory operations without
243 // location.
244 return false;
245
247 return false;
248 }
249 }
250 return true;
251}
252
253/// Collect either replicated Loads or Stores grouped by their address SCEV, in
254/// a deep-traversal of the vector loop region in \p Plan.
255template <unsigned Opcode>
258 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
259 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
260 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
261 "Only Load and Store opcodes supported");
262 constexpr bool IsLoad = (Opcode == Instruction::Load);
264 RecipesByAddress;
267 for (VPRecipeBase &R : *VPBB) {
268 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
269 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
270 continue;
271
272 // For loads, operand 0 is address; for stores, operand 1 is address.
273 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
274 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
275 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
276 RecipesByAddress[AddrSCEV].push_back(RepR);
277 }
278 }
279 auto Groups = to_vector(RecipesByAddress.values());
280 VPDominatorTree VPDT(Plan);
281 for (auto &Group : Groups) {
282 // Sort mem ops by dominance order, with earliest (most dominating) first.
284 return VPDT.properlyDominates(A, B);
285 });
286 }
287 return Groups;
288}
289
/// Sink recipes that define operands of recipes inside replicate regions into
/// the block that uses them. A candidate is duplicated when users outside that
/// block only need its first lane. Operands of a sunk recipe are queued as
/// further candidates. Returns true if at least one recipe was moved.
290static bool sinkScalarOperands(VPlan &Plan) {
291 auto Iter = vp_depth_first_deep(Plan.getEntry());
292 bool ScalarVFOnly = Plan.hasScalarVFOnly();
293 bool Changed = false;
294
 // NOTE(review): the declaration of WorkList is not visible in this extract.
 // It holds (SinkTo, Candidate) pairs and is appended to while it is being
 // iterated by index further below.
 //
 // Queue the recipe defining Op for sinking into SinkTo, unless it has no
 // single-def defining recipe, already lives in SinkTo, may not be sunk, or is
 // a single-scalar replicate recipe while the plan is not scalar-VF-only.
296 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
297 VPBasicBlock *SinkTo, VPValue *Op) {
298 auto *Candidate =
299 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
300 if (!Candidate)
301 return;
302
303 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
304 // for now.
 // NOTE(review): the condition guarding this early return is not visible in
 // this extract.
306 return;
307
308 if (Candidate->getParent() == SinkTo ||
309 vputils::cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
310 return;
311
312 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
313 if (!ScalarVFOnly && RepR->isSingleScalar())
314 return;
315
316 WorkList.insert({SinkTo, Candidate});
317 };
318
319 // First, collect the operands of all recipes in replicate blocks as seeds for
320 // sinking.
 // NOTE(review): the loop header introducing VPR is not visible in this
 // extract. Only replicator regions are considered whose entry block has two
 // successors and whose first entry successor leads directly to the region's
 // exiting block.
322 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
323 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
324 continue;
325 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
326 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
327 continue;
328 for (auto &Recipe : *VPBB)
329 for (VPValue *Op : Recipe.operands())
330 InsertIfValidSinkCandidate(VPBB, Op);
331 }
332
333 // Try to sink each replicate or scalar IV steps recipe in the worklist.
 // The size is re-read on every iteration because sinking a recipe may append
 // its operands to the worklist.
334 for (unsigned I = 0; I != WorkList.size(); ++I) {
335 VPBasicBlock *SinkTo;
336 VPSingleDefRecipe *SinkCandidate;
337 std::tie(SinkTo, SinkCandidate) = WorkList[I];
338
339 // All recipe users of SinkCandidate must be in the same block SinkTo or all
340 // users outside of SinkTo must only use the first lane of SinkCandidate. In
341 // the latter case, we need to duplicate SinkCandidate.
342 auto UsersOutsideSinkTo =
343 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
344 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
345 });
346 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
347 return !U->usesFirstLaneOnly(SinkCandidate);
348 }))
349 continue;
350 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
351
352 if (NeedsDuplicating) {
 // Candidates that would need duplicating are left in place for
 // scalar-VF-only plans.
353 if (ScalarVFOnly)
354 continue;
355 VPSingleDefRecipe *Clone;
356 if (auto *SinkCandidateRepR =
357 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
358 // TODO: Handle converting to uniform recipes as separate transform,
359 // then cloning should be sufficient here.
360 Instruction *I = SinkCandidate->getUnderlyingInstr();
361 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
362 nullptr /*Mask*/, *SinkCandidateRepR,
363 *SinkCandidateRepR);
364 // TODO: add ".cloned" suffix to name of Clone's VPValue.
365 } else {
366 Clone = SinkCandidate->clone();
367 }
368
 // The clone is inserted at the original position and takes over every use
 // located outside SinkTo.
369 Clone->insertBefore(SinkCandidate);
370 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
371 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
372 });
373 }
374 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
 // The operands of the recipe just sunk become new sink candidates.
375 for (VPValue *Op : SinkCandidate->operands())
376 InsertIfValidSinkCandidate(SinkTo, Op);
377 Changed = true;
378 }
379 return Changed;
380}
381
382/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
383/// the mask.
/// Returns nullptr unless the region entry is a VPBasicBlock that contains
/// exactly one recipe and that recipe is a VPBranchOnMaskRecipe; the mask is
/// that recipe's operand 0.
// NOTE(review): the signature line of this function is not visible in this
// extract.
385 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
386 if (!EntryBB || EntryBB->size() != 1 ||
387 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
388 return nullptr;
389
390 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
391}
392
393/// If \p R is a triangle region, return the 'then' block of the triangle.
/// Returns nullptr if the entry does not have exactly two VPBasicBlock
/// successors, or if they do not form a triangle as checked below.
// NOTE(review): the signature line of this function is not visible in this
// extract.
395 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
396 if (EntryBB->getNumSuccessors() != 2)
397 return nullptr;
398
399 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
400 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
401 if (!Succ0 || !Succ1)
402 return nullptr;
403
 // Exactly one of the two entry successors may itself have a successor (and
 // only one); the 'then' block is the one whose single successor is the other
 // entry successor.
404 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
405 return nullptr;
406 if (Succ0->getSingleSuccessor() == Succ1)
407 return Succ0;
408 if (Succ1->getSingleSuccessor() == Succ0)
409 return Succ1;
410 return nullptr;
411}
412
413// Merge replicate regions in their successor region, if a replicate region
414// is connected to a successor replicate region with the same predicate by a
415// single, empty VPBasicBlock.
// Returns true if at least one region was merged into its successor.
// NOTE(review): the signature line of this function is not visible in this
// extract.
417 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
418
419 // Collect replicate regions followed by an empty block, followed by another
420 // replicate region with matching masks to process up front. This is to avoid
421 // iterator invalidation issues while merging regions.
 // NOTE(review): the declaration of WorkList and the start of the loop header
 // introducing Region1 are not visible in this extract.
424 vp_depth_first_deep(Plan.getEntry()))) {
425 if (!Region1->isReplicator())
426 continue;
427 auto *MiddleBasicBlock =
428 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
429 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
430 continue;
431
432 auto *Region2 =
433 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
434 if (!Region2 || !Region2->isReplicator())
435 continue;
436
 // Both regions must branch on the very same mask value.
437 VPValue *Mask1 = getPredicatedMask(Region1);
438 VPValue *Mask2 = getPredicatedMask(Region2);
439 if (!Mask1 || Mask1 != Mask2)
440 continue;
441
442 assert(Mask1 && Mask2 && "both region must have conditions");
443 WorkList.push_back(Region1);
444 }
445
446 // Move recipes from Region1 to its successor region, if both are triangles.
447 for (VPRegionBlock *Region1 : WorkList) {
448 if (TransformedRegions.contains(Region1))
449 continue;
450 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
451 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
452
453 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
454 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
455 if (!Then1 || !Then2)
456 continue;
457
458 // Note: No fusion-preventing memory dependencies are expected in either
459 // region. Such dependencies should be rejected during earlier dependence
460 // checks, which guarantee accesses can be re-ordered for vectorization.
461 //
462 // Move recipes to the successor region.
 // Iterating in reverse while always inserting at the first non-phi position
 // of Then2 keeps the moved recipes in their original relative order.
463 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
464 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
465
466 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
467 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
468
469 // Move VPPredInstPHIRecipes from the merge block to the successor region's
470 // merge block. Update all users inside the successor region to use the
471 // original values.
472 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
473 VPValue *PredInst1 =
474 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
475 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
476 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
477 return cast<VPRecipeBase>(&U)->getParent() == Then2;
478 });
479
480 // Remove phi recipes that are unused after merging the regions.
481 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
482 Phi1ToMove.eraseFromParent();
483 continue;
484 }
485 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
486 }
487
488 // Remove the dead recipes in Region1's entry block.
489 for (VPRecipeBase &R :
490 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
491 R.eraseFromParent();
492
493 // Finally, remove the first region.
 // Every predecessor of Region1 is rewired to MiddleBasicBlock, then Region1
 // is detached from it.
494 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
495 VPBlockUtils::disconnectBlocks(Pred, Region1);
496 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
497 }
498 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
499 TransformedRegions.insert(Region1);
500 }
501
502 return !TransformedRegions.empty();
503}
504
 // NOTE(review): the first line of this function's signature (return type,
 // name and the PredRecipe parameter) is not visible in this extract.
 //
 // Builds a replicate region named "pred.<opcode>" for the masked replicate
 // recipe PredRecipe. The region consists of an ".entry" block holding a
 // VPBranchOnMaskRecipe on PredRecipe's mask, an ".if" block holding an
 // unmasked copy of PredRecipe, and a ".continue" exiting block. If PredRecipe
 // has users, a VPPredInstPHIRecipe is appended to the exiting block and
 // replaces all its uses. PredRecipe is erased and the new region returned.
506 VPRegionBlock *ParentRegion,
507 VPlan &Plan) {
508 Instruction *Instr = PredRecipe->getUnderlyingInstr();
509 // Build the triangular if-then region.
510 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
511 assert(Instr->getParent() && "Predicated instruction not in any basic block");
512 auto *BlockInMask = PredRecipe->getMask();
513 auto *MaskDef = BlockInMask->getDefiningRecipe();
 // The branch takes the debug location of the mask's defining recipe, if
 // there is one.
514 auto *BOMRecipe = new VPBranchOnMaskRecipe(
515 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
516 auto *Entry =
517 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
518
519 // Replace predicated replicate recipe with a replicate recipe without a
520 // mask but in the replicate region.
 // drop_end removes the last operand of PredRecipe.
 // NOTE(review): presumably that last operand is the mask -- confirm.
521 auto *RecipeWithoutMask = new VPReplicateRecipe(
522 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
523 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
524 PredRecipe->getDebugLoc());
525 auto *Pred =
526 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
527 auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
 // NOTE(review): the line declaring Region, which receives the result of the
 // following call, is not visible in this extract.
529 Plan.createReplicateRegion(Entry, Exiting, RegionName);
530
531 // Note: first set Entry as region entry and then connect successors starting
532 // from it in order, to propagate the "parent" of each VPBasicBlock.
533 Region->setParent(ParentRegion);
534 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
535 VPBlockUtils::connectBlocks(Pred, Exiting);
536
 // A phi in the exiting block is only needed when the predicated value is
 // actually used.
537 if (PredRecipe->getNumUsers() != 0) {
538 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
539 RecipeWithoutMask->getDebugLoc());
540 Exiting->appendRecipe(PHIRecipe);
541 PredRecipe->replaceAllUsesWith(PHIRecipe);
542 }
543 PredRecipe->eraseFromParent();
544 return Region;
545}
546
/// Wrap every predicated VPReplicateRecipe reachable from the plan entry in
/// its own replicate region created by createReplicateRegion, splitting the
/// containing block at the recipe.
547static void addReplicateRegions(VPlan &Plan) {
 // NOTE(review): the declaration of WorkList and the start of the loop header
 // introducing VPBB are not visible in this extract. Predicated replicate
 // recipes are collected first and transformed in a second loop.
550 vp_depth_first_deep(Plan.getEntry()))) {
551 for (VPRecipeBase &R : *VPBB)
552 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
553 if (RepR->isPredicated())
554 WorkList.push_back(RepR);
555 }
556 }
557
 // Counter used to give the split-off blocks distinct names.
558 unsigned BBNum = 0;
559 for (VPReplicateRecipe *RepR : WorkList) {
560 VPBasicBlock *CurrentBlock = RepR->getParent();
561 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
562
 // Name the split block after the IR block of the underlying instruction, or
 // leave it unnamed if that block has no name.
563 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
564 SplitBlock->setName(
565 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
566 // Record predicated instructions for above packing optimizations.
 // NOTE(review): the line declaring Region (assigned from the call below) and
 // the statement that followed it are not visible in this extract.
568 createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
570
 // If the block that was split used to be the exiting block of its parent
 // region, the split-off tail becomes the new exiting block.
571 VPRegionBlock *ParentRegion = Region->getParent();
572 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
573 ParentRegion->setExiting(SplitBlock);
574 }
575}
576
 // NOTE(review): the signature of this function, the declaration of WorkList
 // and the start of the loop header introducing VPBB are not visible in this
 // extract.
 //
 // Folds each collected VPBasicBlock into its single VPBasicBlock predecessor
 // when that predecessor has exactly one successor and is not a
 // VPIRBasicBlock. Returns true if any block was folded.
580 vp_depth_first_deep(Plan.getEntry()))) {
581 // Don't fold the blocks in the skeleton of the Plan into their single
582 // predecessors for now.
583 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
584 if (!VPBB->getParent())
585 continue;
586 auto *PredVPBB =
587 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
588 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
589 isa<VPIRBasicBlock>(PredVPBB))
590 continue;
591 WorkList.push_back(VPBB);
592 }
593
594 for (VPBasicBlock *VPBB : WorkList) {
595 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
 // Append all recipes of VPBB to the end of the predecessor.
596 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
597 R.moveBefore(*PredVPBB, PredVPBB->end());
598 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
 // Keep the parent region's exiting block valid if VPBB used to be it.
599 auto *ParentRegion = VPBB->getParent();
600 if (ParentRegion && ParentRegion->getExiting() == VPBB)
601 ParentRegion->setExiting(PredVPBB);
602 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
603 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
604 }
605 return !WorkList.empty();
606}
607
609 // Convert masked VPReplicateRecipes to if-then region blocks.
611
612 bool ShouldSimplify = true;
613 while (ShouldSimplify) {
614 ShouldSimplify = sinkScalarOperands(Plan);
615 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
616 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
617 }
618}
619
620/// Remove redundant casts of inductions.
621///
622/// Such redundant casts are casts of induction variables that can be ignored,
623/// because we already proved that the casted phi is equal to the uncasted phi
624/// in the vectorized loop. There is no need to vectorize the cast - the same
625/// value can be used for both the phi and casts in the vector loop.
627 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
629 if (!IV || IV->getTruncInst())
630 continue;
631
632 // A sequence of IR Casts has potentially been recorded for IV, which
633 // *must be bypassed* when the IV is vectorized, because the vectorized IV
634 // will produce the desired casted value. This sequence forms a def-use
635 // chain and is provided in reverse order, ending with the cast that uses
636 // the IV phi. Search for the recipe of the last cast in the chain and
637 // replace it with the original IV. Note that only the final cast is
638 // expected to have users outside the cast-chain and the dead casts left
639 // over will be cleaned up later.
640 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
641 VPValue *FindMyCast = IV;
642 for (Instruction *IRCast : reverse(Casts)) {
643 VPSingleDefRecipe *FoundUserCast = nullptr;
644 for (auto *U : FindMyCast->users()) {
645 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
646 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
647 FoundUserCast = UserCast;
648 break;
649 }
650 }
651 // A cast recipe in the chain may have been removed by earlier DCE.
652 if (!FoundUserCast)
653 break;
654 FindMyCast = FoundUserCast;
655 }
656 if (FindMyCast != IV)
657 FindMyCast->replaceAllUsesWith(IV);
658 }
659}
660
 // NOTE(review): the leading lines of this function's signature (return type,
 // name and the first parameters, including Plan and Kind used below) are not
 // visible in this extract.
 //
 // Creates a derived IV from the loop region's canonical IV, StartV and Step,
 // truncates it to TruncI's type when TruncI is set, truncates Step to the
 // resulting type if the types differ, and returns the scalar IV steps built
 // from them with Builder.
663 Instruction::BinaryOps InductionOpcode,
664 FPMathOperator *FPBinOp, Instruction *TruncI,
665 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
666 VPBuilder &Builder) {
667 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
668 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
669 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
670 VPSingleDefRecipe *BaseIV =
671 Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
672
673 // Truncate base induction if needed.
674 VPTypeAnalysis TypeInfo(Plan);
675 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
676 if (TruncI) {
677 Type *TruncTy = TruncI->getType();
678 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
679 "Not truncating.");
680 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
681 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
682 ResultTy = TruncTy;
683 }
684
685 // Truncate step if needed.
686 Type *StepTy = TypeInfo.inferScalarType(Step);
687 if (ResultTy != StepTy) {
688 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
689 "Not truncating.");
690 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
691 auto *VecPreheader =
 // NOTE(review): the initializer of VecPreheader is not visible in this
 // extract. The guard below restores Builder's insert point when this scope
 // ends, so the step truncation is the only recipe emitted at VecPreheader.
693 VPBuilder::InsertPointGuard Guard(Builder);
694 Builder.setInsertPoint(VecPreheader);
695 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
696 }
697 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
698 &Plan.getVF(), DL);
699}
700
702 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
704 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
705 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
706 if (!LoopRegion)
707 return;
708
710 LoopRegion->getCanonicalIV());
711 if (!WideCanIV)
712 return;
713
714 Type *CanIVTy = LoopRegion->getCanonicalIVType();
715
716 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
717 // IV.
718 if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(WideCanIV)) {
719 VPBuilder Builder(WideCanIV);
720 WideCanIV->replaceAllUsesWith(createScalarIVSteps(
721 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
722 nullptr, Plan.getZero(CanIVTy), Plan.getConstantInt(CanIVTy, 1),
723 WideCanIV->getDebugLoc(), Builder));
724 WideCanIV->eraseFromParent();
725 return;
726 }
727
728 if (vputils::onlyScalarValuesUsed(WideCanIV))
729 return;
730
731 // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
732 // in the header, reuse it instead of introducing another wide induction phi.
733 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
734 for (VPRecipeBase &Phi : Header->phis()) {
735 auto *WidenIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
736 if (!WidenIV || !WidenIV->isCanonical())
737 continue;
738 // The reused wide IV feeds the header mask, whose lanes may extend past
739 // the trip count; drop flags that only hold inside the scalar loop.
740 WidenIV->dropPoisonGeneratingFlags();
741 WideCanIV->replaceAllUsesWith(WidenIV);
742 WideCanIV->eraseFromParent();
743 return;
744 }
745
746 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable.
747 auto *VecTy = VectorType::get(CanIVTy, VF);
748 InstructionCost BroadcastCost = TTI.getShuffleCost(
750 InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
751 if (PHICost > BroadcastCost)
752 return;
753
754 // Bail out if the additional wide induction phi increases the expected spill
755 // cost.
756 VPRegisterUsage UnrolledBase =
757 calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
758 for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
759 NumUsers *= UF;
760 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
761 VPRegisterUsage Projected = UnrolledBase;
762 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
763 if (Projected.spillCost(TTI, CostKind) >
764 UnrolledBase.spillCost(TTI, CostKind))
765 return;
766
769 VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
770 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
771 /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
772 WideCanIV->getNoWrapFlags(), WideCanIV->getDebugLoc());
773 NewWideIV->insertBefore(&*Header->getFirstNonPhi());
774 WideCanIV->replaceAllUsesWith(NewWideIV);
775 WideCanIV->eraseFromParent();
776}
777
778/// Returns true if \p R is dead and can be removed.
/// A recipe flagged as a conditional assume is always reported dead. Any other
/// recipe is dead only if it has no side effects and none of the values it
/// defines has a user.
779static bool isDeadRecipe(VPRecipeBase &R) {
780 // Do remove conditional assume instructions as their conditions may be
781 // flattened.
782 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
783 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
 // NOTE(review): the final operand of the expression above is not visible in
 // this extract; presumably it checks that RepR is an assume -- confirm.
785 if (IsConditionalAssume)
786 return true;
787
788 if (R.mayHaveSideEffects())
789 return false;
790
791 // Recipe is dead if no user keeps the recipe alive.
792 return all_of(R.definedValues(),
793 [](VPValue *V) { return V->getNumUsers() == 0; });
794}
795
798 Plan.getEntry());
800 // The recipes in the block are processed in reverse order, to catch chains
801 // of dead recipes.
802 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
803 if (isDeadRecipe(R)) {
804 R.eraseFromParent();
805 continue;
806 }
807
808 // Check if R is a dead VPPhi <-> update cycle and remove it.
809 VPValue *Start, *Incoming;
810 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
811 continue;
812 auto *PhiR = cast<VPPhi>(&R);
813 VPUser *PhiUser = PhiR->getSingleUser();
814 if (!PhiUser)
815 continue;
816 if (PhiUser != Incoming->getDefiningRecipe() ||
817 Incoming->getNumUsers() != 1)
818 continue;
819 PhiR->replaceAllUsesWith(Start);
820 PhiR->eraseFromParent();
821 Incoming->getDefiningRecipe()->eraseFromParent();
822 }
823 }
824}
825
 // NOTE(review): the signature of this function, the declaration and seeding
 // of Users, and the line defining Cur are not visible in this extract.
 //
 // Worklist-style closure over users: Users is extended with the users of
 // every value defined by Cur while it is being walked by index, so the size
 // is re-read on each iteration. The collected users are returned as a vector.
828 for (unsigned I = 0; I != Users.size(); ++I) {
830 for (VPValue *V : Cur->definedValues())
831 Users.insert_range(V->users());
832 }
833 return Users.takeVector();
834}
835
836/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
837/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
838/// generates scalar values.
/// Returns the created PtrAdd; the visible part of this function does not
/// replace the uses of \p PtrIV itself.
839static VPValue *
 // NOTE(review): the line carrying the function name and the PtrIV parameter
 // is not visible in this extract.
841 VPlan &Plan, VPBuilder &Builder) {
 // NOTE(review): the line defining ID is not visible in this extract;
 // presumably it is PtrIV's induction descriptor -- confirm.
 // The steps start from a zero of the step's type.
843 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
844 VPValue *StepV = PtrIV->getOperand(1);
 // NOTE(review): the line declaring Steps and naming the callee for the
 // argument list below is not visible in this extract.
846 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
847 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
848
849 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
850 PtrIV->getDebugLoc(), "next.gep");
851}
852
853/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
854/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
855/// VPWidenPointerInductionRecipe will generate vectors only. If some users
856 /// require vectors while others require scalars, the scalar uses need to extract
857/// the scalars from the generated vectors (Note that this is different to how
858/// int/fp inductions are handled). Legalize extract-from-ends using uniform
859/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
860/// the correct end value is available. Also optimize
861/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
862/// providing them scalar steps built on the canonical scalar IV and update the
863/// original IV's users. This is an optional optimization to reduce the needs of
864/// vector extracts.
867 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
868 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
869 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
870 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
871 if (!PhiR)
872 continue;
873
874 // Try to narrow wide and replicating recipes to uniform recipes, based on
875 // VPlan analysis.
876 // TODO: Apply to all recipes in the future, to replace legacy uniformity
877 // analysis.
878 auto Users = collectUsersRecursively(PhiR);
879 for (VPUser *U : reverse(Users)) {
880 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
881 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
882 // Skip recipes that shouldn't be narrowed.
883 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
884 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
885 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
886 continue;
887
888 // Skip recipes that may have other lanes than their first used.
890 continue;
891
892 // TODO: Support scalarizing ExtractValue.
893 if (match(Def,
895 continue;
896
897 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
898 Def->operands(), /*IsUniform*/ true,
899 /*Mask*/ nullptr, /*Flags*/ *Def);
900 Clone->insertAfter(Def);
901 Def->replaceAllUsesWith(Clone);
902 }
903
904 // Replace wide pointer inductions which have only their scalars used by
905 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
906 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
907 if (!Plan.hasScalarVFOnly() &&
908 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
909 continue;
910
911 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
912 PtrIV->replaceAllUsesWith(PtrAdd);
913 continue;
914 }
915
916 // Replace widened induction with scalar steps for users that only use
917 // scalars.
918 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
919 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
920 return U->usesScalars(WideIV);
921 }))
922 continue;
923
924 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
926 Plan, ID.getKind(), ID.getInductionOpcode(),
927 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
928 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
929 WideIV->getDebugLoc(), Builder);
930
931 // Update scalar users of IV to use Step instead.
932 if (!HasOnlyVectorVFs) {
933 assert(!Plan.hasScalableVF() &&
934 "plans containing a scalar VF cannot also include scalable VFs");
935 WideIV->replaceAllUsesWith(Steps);
936 } else {
937 bool HasScalableVF = Plan.hasScalableVF();
938 WideIV->replaceUsesWithIf(Steps,
939 [WideIV, HasScalableVF](VPUser &U, unsigned) {
940 if (HasScalableVF)
941 return U.usesFirstLaneOnly(WideIV);
942 return U.usesScalars(WideIV);
943 });
944 }
945 }
946}
947
948/// Check if \p VPV is an untruncated wide induction, either before or after the
949/// increment. If so return the header IV (before the increment), otherwise
950/// return null.
// NOTE(review): the signature (source lines 951-952) is missing from this
// listing; the body uses a value VPV and a PSE object.
953 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
954 if (WideIV) {
955 // VPV itself is a wide induction, separately compute the end value for exit
956 // users if it is not a truncated IV.
957 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
958 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
959 }
960
961 // Check if VPV is an optimizable induction increment.
// A candidate must be defined by a recipe with exactly two operands, one of
// which is a wide induction; operand 0 is tried first, then operand 1.
962 VPRecipeBase *Def = VPV->getDefiningRecipe();
963 if (!Def || Def->getNumOperands() != 2)
964 return nullptr;
965 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
966 if (!WideIV)
967 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
968 if (!WideIV)
969 return nullptr;
970
971 auto IsWideIVInc = [&]() {
972 auto &ID = WideIV->getInductionDescriptor();
973
974 // Check if VPV increments the induction by the induction step.
975 VPValue *IVStep = WideIV->getStepValue();
// Dispatch on the opcode recorded in the induction descriptor.
976 switch (ID.getInductionOpcode()) {
977 case Instruction::Add:
978 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
979 case Instruction::FAdd:
980 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
981 case Instruction::FSub:
982 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
983 m_Specific(IVStep)));
984 case Instruction::Sub: {
985 // IVStep will be the negated step of the subtraction. Check if Step == -1
986 // * IVStep.
987 VPValue *Step;
988 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
989 return false;
// Compare via SCEV: both expressions must be computable and IVStep must be
// the negation of the subtracted operand.
990 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
991 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
992 ScalarEvolution &SE = *PSE.getSE();
993 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
994 !isa<SCEVCouldNotCompute>(StepSCEV) &&
995 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
996 }
997 default:
// Any other opcode only qualifies for pointer inductions, where the
// increment must be a GEP of the IV by its step.
998 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
999 match(VPV, m_GetElementPtr(m_Specific(WideIV),
1000 m_Specific(WideIV->getStepValue())));
1001 }
// Every switch case above returns.
1002 llvm_unreachable("should have been covered by switch above");
1003 };
1004 return IsWideIVInc() ? WideIV : nullptr;
1005}
1006
1007/// Attempts to optimize the induction variable exit values for users in the
1008/// early exit block.
/// Returns the newly computed end value, or nullptr if \p Op does not match the
/// expected pattern or its incoming value is not an optimizable wide IV.
// NOTE(review): source lines 1009, 1012 and 1014 (parts of the signature and
// the start of the pattern match on Op) are missing from this listing.
1010 VPTypeAnalysis &TypeInfo,
1011 VPValue *Op,
1013 VPValue *Incoming, *Mask;
1015 m_VPValue(Incoming))))
1016 return nullptr;
1017
1018 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1019 if (!WideIV)
1020 return nullptr;
1021
// Truncated int/fp inductions are not handled here.
1022 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1023 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1024 return nullptr;
1025
1026 // Calculate the final index.
1027 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1028 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1029 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
// New recipes are created via a builder positioned at the recipe defining Op.
1030 auto *ExtractR = cast<VPInstruction>(Op);
1031 VPBuilder B(ExtractR);
1032
// Index = canonical IV + first active lane of Mask, with the lane number
// zero-extended or truncated to the canonical IV type.
1033 DebugLoc DL = ExtractR->getDebugLoc();
1034 VPValue *FirstActiveLane = B.createFirstActiveLane(Mask, DL);
1035 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
1036 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
1037 FirstActiveLaneType, DL);
1038 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1039
1040 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1041 // changed it means the exit is using the incremented value, so we need to
1042 // add the step.
// The adjustment is applied to the index (a constant 1 of the canonical IV
// type); for non-canonical IVs the index is then fed, together with Step,
// into the derived IV created below.
1043 if (Incoming != WideIV) {
1044 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1045 EndValue = B.createAdd(EndValue, One, DL);
1046 }
1047
// Anything but a canonical int/fp induction needs the index converted into
// the IV's own value via a derived IV built from Start, the index and Step.
1048 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1049 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1050 VPIRValue *Start = WideIV->getStartValue();
1051 VPValue *Step = WideIV->getStepValue();
1052 EndValue = B.createDerivedIV(
1053 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1054 Start, EndValue, Step);
1055 }
1056
1057 return EndValue;
1058}
1059
1060/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1061/// VPDerivedIVRecipe for non-canonical inductions.
/// Returns nullptr for truncated inductions. New recipes are created through
/// \p VectorPHBuilder.
// NOTE(review): source lines 1062 and 1074 (first signature line and the
// definition of ID) are missing from this listing.
1063 VPBuilder &VectorPHBuilder,
1064 VPTypeAnalysis &TypeInfo,
1065 VPValue *VectorTC) {
1066 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1067 // Truncated wide inductions resume from the last lane of their vector value
1068 // in the last vector iteration which is handled elsewhere.
1069 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1070 return nullptr;
1071
1072 VPIRValue *Start = WideIV->getStartValue();
1073 VPValue *Step = WideIV->getStepValue();
// For a canonical int/fp induction the end value is VectorTC itself; all
// other inductions derive it from Start, VectorTC and Step.
1075 VPValue *EndValue = VectorTC;
1076 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1077 EndValue = VectorPHBuilder.createDerivedIV(
1078 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1079 Start, VectorTC, Step);
1080 }
1081
1082 // EndValue is derived from the vector trip count (which has the same type as
1083 // the widest induction) and thus may be wider than the induction here.
// On a scalar type mismatch, truncate EndValue to the induction's type.
1084 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1085 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1086 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1087 ScalarTypeOfWideIV,
1088 WideIV->getDebugLoc());
1089 }
1090
1091 return EndValue;
1092}
1093
1094/// Attempts to optimize the induction variable exit values for users in the
1095/// exit block coming from the latch in the original scalar loop.
/// Returns the value to use for the exit user, or nullptr if \p Op does not
/// match the expected pattern or its incoming value is not an optimizable wide
/// IV. The end value is looked up in EndValues and must already be present.
// NOTE(review): source lines 1096, 1098 and 1100 (parts of the signature and
// the pattern match on Op) are missing from this listing.
1097 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPValue *Op,
1099 VPValue *Incoming;
1101 return nullptr;
1102
1103 VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
1104 if (!WideIV)
1105 return nullptr;
1106
1107 VPValue *EndValue = EndValues.lookup(WideIV);
1108 assert(EndValue && "Must have computed the end value up front");
1109
1110 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1111 // changed it means the exit is using the incremented value, so we don't
1112 // need to subtract the step.
1113 if (Incoming != WideIV)
1114 return EndValue;
1115
1116 // Otherwise, subtract the step from the EndValue.
// The way the step is subtracted depends on the scalar type of the IV.
1117 auto *ExtractR = cast<VPInstruction>(Op);
1118 VPBuilder B(ExtractR);
1119 VPValue *Step = WideIV->getStepValue();
1120 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
// Integer IV: EndValue - Step.
1121 if (ScalarTy->isIntegerTy())
1122 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
// Pointer IV: ptradd of EndValue and the negated step (0 - Step).
1123 if (ScalarTy->isPointerTy()) {
1124 Type *StepTy = TypeInfo.inferScalarType(Step);
1125 auto *Zero = Plan.getZero(StepTy);
1126 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1127 DebugLoc::getUnknown(), "ind.escape");
1128 }
// Floating-point IV: apply the inverse of the induction's binary operator
// (FSub for an FAdd induction, FAdd otherwise), reusing its fast-math flags.
1129 if (ScalarTy->isFloatingPointTy()) {
1130 const auto &ID = WideIV->getInductionDescriptor();
1131 return B.createNaryOp(
1132 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1133 ? Instruction::FSub
1134 : Instruction::FAdd,
1135 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1136 }
1137 llvm_unreachable("all possible induction types must be handled");
1138 return nullptr;
1139}
1140
// Precomputes end values for the wide inductions of the vector loop header,
// uses them to replace matching exiting-IV recipes in the middle block, and
// then rewrites exit-block phi operands that can be expressed via those values.
// NOTE(review): source lines 1141, 1148, 1155, 1180 and 1183 (start of the
// signature, the declaration of EndValues and the starts of three call
// statements) are missing from this listing.
1142 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1143 // Compute end values for all inductions.
1144 VPTypeAnalysis TypeInfo(Plan);
1145 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1146 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
// End-value recipes are inserted at the start of the vector preheader.
1147 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
// With tail folding the end values are based on the plan's trip count,
// otherwise on its vector trip count.
1149 VPValue *ResumeTC =
1150 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1151 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1152 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1153 if (!WideIV)
1154 continue;
1156 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1157 EndValues[WideIV] = EndValue;
1158 }
1159
// Replace exiting-IV-value recipes in the middle block by the precomputed end
// value of their wide IV, when one was recorded, and erase them.
1160 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1161 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1162 VPValue *Op;
1163 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1164 continue;
1165 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1166 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1167 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1168 R.eraseFromParent();
1169 }
1170 }
1171
1172 // Then, optimize exit block users.
1173 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1174 for (VPRecipeBase &R : ExitVPBB->phis()) {
1175 auto *ExitIRI = cast<VPIRPhi>(&R);
1176
// Handle each incoming edge separately: operands coming from the middle
// block and operands coming from any other predecessor use different
// helpers. The operand is only rewritten if the helper returned a value.
1177 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1178 VPValue *Escape = nullptr;
1179 if (PredVPBB == MiddleVPBB)
1181 Plan, TypeInfo, ExitIRI->getOperand(Idx), EndValues, PSE);
1182 else
1184 Plan, TypeInfo, ExitIRI->getOperand(Idx), PSE);
1185 if (Escape)
1186 ExitIRI->setOperand(Idx, Escape);
1187 }
1188 }
1189 }
1190}
1191
1192/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1193/// them with already existing recipes expanding the same SCEV expression.
// NOTE(review): source lines 1194-1195 and 1198 (the signature, the
// declaration of SCEV2VPV and the range being iterated) are missing from this
// listing.
1196
1197 for (VPRecipeBase &R :
1199 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1200 if (!ExpR)
1201 continue;
1202
// The first recipe seen for a given SCEV is recorded in SCEV2VPV; any later
// recipe for the same SCEV has its uses redirected to that recipe and is
// erased.
1203 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1204 if (Inserted)
1205 continue;
1206 ExpR->replaceAllUsesWith(V->second);
1207 ExpR->eraseFromParent();
1208 }
1209}
1210
// Worklist-driven deletion starting at V: erase the defining recipe of each
// visited value if isDeadRecipe() holds for it, then visit that recipe's
// operands, which may have become dead in turn.
// NOTE(review): source lines 1211 and 1213 (the signature and the declaration
// of Seen) are missing from this listing.
1212 SmallVector<VPValue *> WorkList;
1214 WorkList.push_back(V);
1215
1216 while (!WorkList.empty()) {
1217 VPValue *Cur = WorkList.pop_back_val();
// Process each value at most once.
1218 if (!Seen.insert(Cur).second)
1219 continue;
// Values without a defining recipe have nothing to erase.
1220 VPRecipeBase *R = Cur->getDefiningRecipe();
1221 if (!R)
1222 continue;
1223 if (!isDeadRecipe(*R))
1224 continue;
// Queue the operands before erasing R.
1225 append_range(WorkList, R->operands());
1226 R->eraseFromParent();
1227 }
1228}
1229
1230/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1231/// Returns an optional pair, where the first element indicates whether it is
1232/// an intrinsic ID.
/// Returns std::nullopt for recipe kinds not handled by any case below.
// NOTE(review): source lines 1234 and 1237-1238 (the function name/parameter
// line and the type list of the first .Case) are missing from this listing.
1233static std::optional<std::pair<bool, unsigned>>
1235 return TypeSwitch<const VPSingleDefRecipe *,
1236 std::optional<std::pair<bool, unsigned>>>(R)
// Recipes exposing getOpcode(): report it as a plain opcode.
1239 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
// Widened intrinsic calls: report the vector intrinsic ID.
1240 .Case([](const VPWidenIntrinsicRecipe *I) {
1241 return std::make_pair(true, I->getVectorIntrinsicID());
1242 })
1243 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1244 [](auto *I) {
1245 // For recipes that do not directly map to LLVM IR instructions,
1246 // assign opcodes after the last VPInstruction opcode (which is also
1247 // after the last IR Instruction opcode), based on the VPRecipeID.
1248 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1249 I->getVPRecipeID());
1250 })
1251 .Default([](auto *) { return std::nullopt; });
1252}
1253
1254/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1255/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1256/// Operands are foldable live-ins.
/// Returns nullptr if \p R has no opcode/intrinsic ID, if any operand is not a
/// live-in with an underlying IR value, or if the folder produces no value.
// NOTE(review): source lines 1257, 1265, 1292, 1294, 1297 and 1310-1311 (first
// signature line, the declaration of Ops, and several case labels/arguments)
// are missing from this listing.
1258 ArrayRef<VPValue *> Operands,
1259 const DataLayout &DL,
1260 VPTypeAnalysis &TypeInfo) {
1261 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1262 if (!OpcodeOrIID)
1263 return nullptr;
1264
// Collect the underlying IR value of every operand; bail out as soon as one
// operand is not a live-in or has no underlying value.
1266 for (VPValue *Op : Operands) {
1267 if (!match(Op, m_LiveIn()))
1268 return nullptr;
1269 Value *V = Op->getUnderlyingValue();
1270 if (!V)
1271 return nullptr;
1272 Ops.push_back(V);
1273 }
1274
// Computes the folded IR value, or nullptr if R cannot be folded.
1275 auto FoldToIRValue = [&]() -> Value * {
1276 InstSimplifyFolder Folder(DL);
// Intrinsics: only recipes with exactly two operands are folded.
1277 if (OpcodeOrIID->first) {
1278 if (R.getNumOperands() != 2)
1279 return nullptr;
1280 unsigned ID = OpcodeOrIID->second;
1281 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1282 TypeInfo.inferScalarType(&R));
1283 }
// Plain opcodes: binary operators and casts first, then a switch over the
// remaining supported opcodes.
1284 unsigned Opcode = OpcodeOrIID->second;
1285 if (Instruction::isBinaryOp(Opcode))
1286 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1287 Ops[0], Ops[1]);
1288 if (Instruction::isCast(Opcode))
1289 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1290 TypeInfo.inferScalarType(R.getVPSingleValue()));
1291 switch (Opcode) {
1293 return Folder.FoldSelect(Ops[0], Ops[1],
1295 case VPInstruction::Not:
1296 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1298 case Instruction::Select:
1299 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1300 case Instruction::ICmp:
1301 case Instruction::FCmp:
1302 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1303 Ops[1]);
1304 case Instruction::GetElementPtr: {
// The source element type is taken from the underlying IR GEP.
1305 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1306 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1307 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1308 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1309 }
// Folded as an i8-typed GEP of Ops[0] by Ops[1].
1312 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1313 Ops[0], Ops[1],
1314 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1315 // An extract of a live-in is an extract of a broadcast, so return the
1316 // broadcasted element.
1317 case Instruction::ExtractElement:
1318 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1319 return Ops[0];
1320 }
1321 return nullptr;
1322 };
1323
// Wrap a successfully folded IR value as a live-in of R's plan.
1324 if (Value *V = FoldToIRValue())
1325 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1326 return nullptr;
1327}
1328
1329/// Try to simplify VPSingleDefRecipe \p Def.
/// Tries a sequence of peephole patterns; most of them replace the uses of
/// \p Def (or rewrite its operands in place) and return. Patterns after the
/// isUnrolled() check only run on unrolled plans.
// NOTE(review): numerous source lines are missing from this listing (see the
// gaps in the embedded numbering), including the signature at line 1330 and
// the pattern lines of several match() calls.
1331 VPlan *Plan = Def->getParent()->getPlan();
1332
1333 // Simplification of live-in IR values for SingleDef recipes using
1334 // InstSimplifyFolder.
1335 const DataLayout &DL = Plan->getDataLayout();
1336 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1337 return Def->replaceAllUsesWith(V);
1338
1339 // Fold PredPHI LiveIn -> LiveIn.
// There is no early return after this replacement; the patterns below are
// still tried.
1340 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1341 VPValue *Op = PredPHI->getOperand(0);
1342 if (isa<VPIRValue>(Op))
1343 PredPHI->replaceAllUsesWith(Op);
1344 }
1345
// New recipes created by the folds below are inserted via this builder,
// positioned at Def.
1346 VPBuilder Builder(Def);
1347
1348 // Avoid replacing VPInstructions with underlying values with new
1349 // VPInstructions, as we would fail to create widen/replicate recipes from the
1350 // new VPInstructions without an underlying value, and miss out on some
1351 // transformations that only apply to widened/replicated recipes later, by
1352 // doing so.
1353 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1354 // VPInstructions without underlying values, as those will get skipped during
1355 // cost computation.
1356 bool CanCreateNewRecipe =
1357 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1358
// trunc (zext/sext A): fold to A if the types agree, otherwise (for widened
// casts only) to a single ext or trunc of A.
1359 VPValue *A;
1360 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1361 Type *TruncTy = TypeInfo.inferScalarType(Def);
1362 Type *ATy = TypeInfo.inferScalarType(A);
1363 if (TruncTy == ATy) {
1364 Def->replaceAllUsesWith(A);
1365 } else {
1366 // Don't replace a non-widened cast recipe with a widened cast.
1367 if (!isa<VPWidenCastRecipe>(Def))
1368 return;
1369 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1370
// A is narrower than the result: extend A directly, keeping the
// signedness of the original extend.
1371 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1372 ? Instruction::SExt
1373 : Instruction::ZExt;
1374 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1375 TruncTy);
1376 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1377 // UnderlyingExt has distinct return type, used to retain legacy cost.
1378 Ext->setUnderlyingValue(UnderlyingExt);
1379 }
1380 Def->replaceAllUsesWith(Ext);
1381 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
// A is wider than the result: truncate A directly.
1382 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1383 Def->replaceAllUsesWith(Trunc);
1384 }
1385 }
1386#ifndef NDEBUG
1387 // Verify that the cached type info for both A and its users is still
1388 // accurate by comparing it to freshly computed types.
1389 VPTypeAnalysis TypeInfo2(*Plan);
1390 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1391 for (VPUser *U : A->users()) {
1392 auto *R = cast<VPRecipeBase>(U);
1393 for (VPValue *VPV : R->definedValues())
1394 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1395 }
1396#endif
1397 }
1398
1399 // Simplify (X && Y) | (X && !Y) -> X.
1400 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1401 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1402 // recipes to be visited during simplification.
1403 VPValue *X, *Y, *Z;
1404 if (match(Def,
1407 Def->replaceAllUsesWith(X);
1408 Def->eraseFromParent();
1409 return;
1410 }
1411
1412 // x | AllOnes -> AllOnes
1413 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1414 return Def->replaceAllUsesWith(
1415 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1416
1417 // x | 0 -> x
1418 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1419 return Def->replaceAllUsesWith(X);
1420
1421 // x | !x -> AllOnes
1423 return Def->replaceAllUsesWith(
1424 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1425
1426 // x & 0 -> 0
1427 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1428 return Def->replaceAllUsesWith(
1429 Plan->getZero(TypeInfo.inferScalarType(Def)));
1430
1431 // x & AllOnes -> x
1432 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1433 return Def->replaceAllUsesWith(X);
1434
1435 // x && false -> false
1436 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1437 return Def->replaceAllUsesWith(Plan->getFalse());
1438
1439 // x && true -> x
1440 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1441 return Def->replaceAllUsesWith(X);
1442
1443 // (x && y) | (x && z) -> x && (y | z)
1444 if (CanCreateNewRecipe &&
1447 // Simplify only if one of the operands has one use to avoid creating an
1448 // extra recipe.
1449 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1450 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1451 return Def->replaceAllUsesWith(
1452 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1453
1454 // x && (x && y) -> x && y
1455 if (match(Def, m_LogicalAnd(m_VPValue(X),
1457 return Def->replaceAllUsesWith(Def->getOperand(1));
1458
1459 // x && (y && x) -> x && y
1460 if (match(Def, m_LogicalAnd(m_VPValue(X),
1462 return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1463
1464 // x && !x -> 0
1466 return Def->replaceAllUsesWith(Plan->getFalse());
1467
// select c, x, x -> x
1468 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1469 return Def->replaceAllUsesWith(X);
1470
1471 // select c, false, true -> not c
1472 VPValue *C;
1473 if (CanCreateNewRecipe &&
1474 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1475 return Def->replaceAllUsesWith(Builder.createNot(C));
1476
1477 // select !c, x, y -> select c, y, x
// Done in place by rewriting Def's operands.
1478 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1479 Def->setOperand(0, C);
1480 Def->setOperand(1, Y);
1481 Def->setOperand(2, X);
1482 return;
1483 }
1484
1485 // select x, (i1 y | z), y -> y | (x && z)
1486 if (CanCreateNewRecipe &&
1487 match(Def, m_Select(m_VPValue(X),
1489 m_Deferred(Y))) &&
1490 TypeInfo.inferScalarType(Y)->isIntegerTy(1))
1491 return Def->replaceAllUsesWith(
1492 Builder.createOr(Y, Builder.createLogicalAnd(X, Z)));
1493
// a + 0 -> a
1494 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1495 return Def->replaceAllUsesWith(A);
1496
// a * 1 -> a
1497 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1498 return Def->replaceAllUsesWith(A);
1499
// a * 0 -> 0
1500 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1501 return Def->replaceAllUsesWith(
1502 Plan->getZero(TypeInfo.inferScalarType(Def)));
1503
// a * AllOnes -> 0 - a
1504 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1505 // Preserve nsw from the Mul on the new Sub.
1507 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1508 return Def->replaceAllUsesWith(
1509 Builder.createSub(Plan->getZero(TypeInfo.inferScalarType(A)), A,
1510 Def->getDebugLoc(), "", NW));
1511 }
1512
// NOTE(review): the pattern matched here (line 1514) is missing from this
// listing; on a match Def is replaced by X - Y.
1513 if (CanCreateNewRecipe &&
1515 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1516 // new Sub.
1518 false,
1519 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1520 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1521 ->hasNoSignedWrap()};
1522 return Def->replaceAllUsesWith(
1523 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1524 }
1525
// a * 2^k -> a << k, keeping Def's flags.
1526 const APInt *APC;
1527 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1528 APC->isPowerOf2())
1529 return Def->replaceAllUsesWith(Builder.createNaryOp(
1530 Instruction::Shl,
1531 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1532 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1533
// a udiv 2^k -> a lshr k, keeping Def's flags.
1534 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1535 APC->isPowerOf2())
1536 return Def->replaceAllUsesWith(Builder.createNaryOp(
1537 Instruction::LShr,
1538 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1539 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1540
1541 if (match(Def, m_Not(m_VPValue(A)))) {
// not (not a) -> a
1542 if (match(A, m_Not(m_VPValue(A))))
1543 return Def->replaceAllUsesWith(A);
1544
1545 // Try to fold Not into compares by adjusting the predicate in-place.
1546 CmpPredicate Pred;
1547 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1548 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
// Only done if every user of the compare is one of the forms that can be
// adjusted below (a Not of the compare, or a select using it as
// condition).
1549 if (all_of(Cmp->users(),
1551 m_Not(m_Specific(Cmp)),
1552 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1553 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
// Iterate over a copy of the user list, as users are modified below.
1554 for (VPUser *U : to_vector(Cmp->users())) {
1555 auto *R = cast<VPSingleDefRecipe>(U);
1556 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1557 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1558 R->setOperand(1, Y);
1559 R->setOperand(2, X);
1560 } else {
1561 // not (cmp pred) -> cmp inv_pred
1562 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1563 R->replaceAllUsesWith(Cmp);
1564 }
1565 }
1566 // If Cmp doesn't have a debug location, use the one from the negation,
1567 // to preserve the location.
1568 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1569 Cmp->setDebugLoc(Def->getDebugLoc());
1570 }
1571 }
1572 }
1573
1574 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1575 // any-of (fcmp uno %A, %B), ...
1576 if (match(Def, m_AnyOf())) {
// Operands that do not qualify are kept as they are; qualifying compares are
// combined two at a time, and a left-over unpaired compare is kept as is.
1578 VPRecipeBase *UnpairedCmp = nullptr;
1579 for (VPValue *Op : Def->operands()) {
1580 VPValue *X;
1581 if (Op->getNumUsers() > 1 ||
1583 m_Deferred(X)))) {
1584 NewOps.push_back(Op);
1585 } else if (!UnpairedCmp) {
1586 UnpairedCmp = Op->getDefiningRecipe();
1587 } else {
1588 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1589 UnpairedCmp->getOperand(0), X));
1590 UnpairedCmp = nullptr;
1591 }
1592 }
1593
1594 if (UnpairedCmp)
1595 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1596
// Only create a new any-of if at least one pair was combined.
1597 if (NewOps.size() < Def->getNumOperands()) {
1598 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1599 return Def->replaceAllUsesWith(NewAnyOf);
1600 }
1601 }
1602
1603 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1604 // This is useful for fmax/fmin without fast-math flags, where we need to
1605 // check if any operand is NaN.
1606 if (CanCreateNewRecipe &&
1608 m_Deferred(X)),
1610 m_Deferred(Y))))) {
1611 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1612 return Def->replaceAllUsesWith(NewCmp);
1613 }
1614
1615 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
// Only done if the scalar types of operand 1 and of Def agree.
1616 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1617 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1618 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1619 TypeInfo.inferScalarType(Def))
1620 return Def->replaceAllUsesWith(Def->getOperand(1));
1621
// NOTE(review): the start of this pattern (line 1622) is missing from this
// listing; on a match Def is replaced by X, truncated to Def's scalar type if
// the types differ.
1623 m_One()))) {
1624 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1625 if (TypeInfo.inferScalarType(X) != WideStepTy)
1626 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1627 Def->replaceAllUsesWith(X);
1628 return;
1629 }
1630
1631 // For i1 vp.merges produced by AnyOf reductions:
1632 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1634 m_VPValue(X), m_VPValue())) &&
1636 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1637 Def->setOperand(1, Def->getOperand(0));
1638 Def->setOperand(0, Y);
1639 return;
1640 }
1641
1642 // Simplify MaskedCond with no block mask to its single operand.
1644 !cast<VPInstruction>(Def)->isMasked())
1645 return Def->replaceAllUsesWith(Def->getOperand(0));
1646
1647 // Look through ExtractLastLane.
1648 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
// The last lane of a build-vector is its last operand.
1649 if (match(A, m_BuildVector())) {
1650 auto *BuildVector = cast<VPInstruction>(A);
1651 Def->replaceAllUsesWith(
1652 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1653 return;
1654 }
1655
// The last lane of a broadcast is the broadcast value.
1656 if (match(A, m_Broadcast(m_VPValue(X))))
1657 return Def->replaceAllUsesWith(X);
1658
1660 return Def->replaceAllUsesWith(A);
1661
// With only scalar VFs, A itself is used.
1662 if (Plan->hasScalarVFOnly())
1663 return Def->replaceAllUsesWith(A);
1664 }
1665
1666 // Look through ExtractPenultimateElement (BuildVector ....).
1668 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1669 Def->replaceAllUsesWith(
1670 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1671 return;
1672 }
1673
// NOTE(review): the pattern binding Idx (line 1675) is missing from this
// listing; on a match Def is replaced by operand Idx of the build-vector.
1674 uint64_t Idx;
1676 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1677 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1678 return;
1679 }
1680
// build-vector x, x, ..., x -> broadcast x
1681 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1682 Def->replaceAllUsesWith(
1683 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1684 return;
1685 }
1686
1687 // Look through broadcast of single-scalar when used as select conditions; in
1688 // that case the scalar condition can be used directly.
1689 if (match(Def,
1692 "broadcast operand must be single-scalar");
1693 Def->setOperand(0, C);
1694 return;
1695 }
1696
// Users that only use the scalars of a broadcast can use the broadcast
// operand directly.
1697 if (match(Def, m_Broadcast(m_VPValue(X))))
1698 return Def->replaceUsesWithIf(
1699 X, [Def](const VPUser &U, unsigned) { return U.usesScalars(Def); });
1700
// NOTE(review): the condition opening this block (line 1701) is missing from
// this listing. Inside it, a recipe with a single operand is replaced by that
// operand, and a first-order recurrence phi whose incoming values are all
// equal is replaced by its first operand.
1702 if (Def->getNumOperands() == 1) {
1703 Def->replaceAllUsesWith(Def->getOperand(0));
1704 return;
1705 }
1706 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1707 if (all_equal(Phi->incoming_values()))
1708 Phi->replaceAllUsesWith(Phi->getOperand(0));
1709 }
1710 return;
1711 }
1712
1713 VPIRValue *IRV;
1714 if (Def->getNumOperands() == 1 &&
1716 return Def->replaceAllUsesWith(IRV);
1717
1718 // Some simplifications can only be applied after unrolling. Perform them
1719 // below.
1720 if (!Plan->isUnrolled())
1721 return;
1722
1723 // After unrolling, extract-lane may be used to extract values from multiple
1724 // scalar sources. Only simplify when extracting from a single scalar source.
1725 VPValue *LaneToExtract;
1726 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1727 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1729 return Def->replaceAllUsesWith(A);
1730
1731 // Simplify extract-lane with single source to extract-element.
1732 Def->replaceAllUsesWith(Builder.createNaryOp(
1733 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1734 return;
1735 }
1736
1737 // Look for cycles where Def is of the form:
1738 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1739 // IVInc = X + Step ; used by X and Def
1740 // Def = IVInc + Y
1741 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1742 // and if Inc exists, replace it with X.
1743 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1744 isa<VPIRValue>(Y) &&
1745 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1746 auto *Phi = cast<VPPhi>(X);
1747 auto *IVInc = Def->getOperand(0);
1748 if (IVInc->getNumUsers() == 2) {
1749 // If Phi has a second user (besides IVInc's defining recipe), it must
1750 // be Inc = Phi + Y for the fold to apply.
// NOTE(review): the definition of Inc (lines 1751-1752) is missing from
// this listing.
1753 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1754 Def->replaceAllUsesWith(IVInc);
1755 if (Inc)
1756 Inc->replaceAllUsesWith(Phi);
1757 Phi->setOperand(0, Y);
1758 return;
1759 }
1760 }
1761 }
1762
1763 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1764 // just the pointer operand.
1765 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1766 if (!VPR->getVFxPart() || match(VPR->getVFxPart(), m_ZeroInt()))
1767 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1768
1769 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1770 // the start index is zero and only the first lane 0 is demanded.
1771 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1772 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1773 Steps->replaceAllUsesWith(Steps->getOperand(0));
1774 return;
1775 }
1776 }
1777 // Simplify redundant ReductionStartVector recipes after unrolling.
// Only uses by in-loop reduction phis are redirected to the start value.
1778 VPValue *StartV;
1780 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1781 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1782 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1783 return PhiR && PhiR->isInLoop();
1784 });
1785 return;
1786 }
1787
// With a concrete UF of 1, extracting the last part of A yields A itself.
1788 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1789 return Def->replaceAllUsesWith(A);
1790}
1791
// Runs simplifyRecipe() on every single-def recipe of each visited block,
// sharing one VPTypeAnalysis across all calls.
// NOTE(review): source lines 1792-1793 and 1796 (the signature, the start of
// the traversal setup and the loop over blocks that defines VPBB) are missing
// from this listing.
1794 Plan.getEntry());
1795 VPTypeAnalysis TypeInfo(Plan);
// Early-inc iteration lets the loop continue if the current recipe is erased
// during simplification.
1797 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1798 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1799 simplifyRecipe(Def, TypeInfo);
1800 }
1801}
1802
1803/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1804/// header mask to be simplified further when tail folding, e.g. in
1805/// optimizeEVLMasks.
/// Does nothing if vputils::findHeaderMask() finds no header mask in \p Plan.
1806static void reassociateHeaderMask(VPlan &Plan) {
1807 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1808 if (!HeaderMask)
1809 return;
1810
// Start from the users of the header mask of the form (headermask && x).
// NOTE(review): the statement guarded by the `if` below (line 1814) is
// missing from this listing.
1811 SmallVector<VPUser *> Worklist;
1812 for (VPUser *U : HeaderMask->users())
1813 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1815
1816 while (!Worklist.empty()) {
1817 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1818 VPValue *X, *Y;
// Only recipes of the exact form (headermask && X) && Y are rewritten.
1819 if (!R || !match(R, m_LogicalAnd(
1820 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1821 m_VPValue(Y))))
1822 continue;
// Queue R's users before its uses are replaced, so they are visited as well.
1823 append_range(Worklist, R->users());
1824 VPBuilder Builder(R);
1825 R->replaceAllUsesWith(
1826 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1827 }
1828}
1829
/// Return the binary opcode corresponding to the masked div/rem intrinsic
/// \p ID (udiv, sdiv, urem or srem), or an empty optional for any other ID.
1830 static std::optional<Instruction::BinaryOps>
1832 switch (ID) {
1833 case Intrinsic::masked_udiv:
1834 return Instruction::UDiv;
1835 case Intrinsic::masked_sdiv:
1836 return Instruction::SDiv;
1837 case Intrinsic::masked_urem:
1838 return Instruction::URem;
1839 case Intrinsic::masked_srem:
1840 return Instruction::SRem;
1841 default:
     // Not one of the masked div/rem intrinsics.
1842 return {};
1843 }
1844}
1845
1847 if (Plan.hasScalarVFOnly())
1848 return;
1849
1851 vp_depth_first_deep(Plan.getEntry()))) {
     // Walk each block bottom-up; the early-increment range allows the current
     // recipe to be erased while iterating.
1852 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1855 continue;
1856 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
     // Replicates that are already single-scalar, or that are predicated, are
     // left untouched.
1857 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1858 continue;
1859
1860 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
     // Replicated store whose operand 1 (presumably the address, as in an IR
     // store - TODO confirm) is a single scalar: replace it with a single-scalar
     // store of the last lane of operand 0. When operand 1 is also uniform across
     // VFs and UFs, the lane is taken from the last part.
1861 if (RepR && RepR->getOpcode() == Instruction::Store &&
1862 vputils::isSingleScalar(RepR->getOperand(1))) {
1863 auto *Clone = new VPReplicateRecipe(
1864 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1865 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1866 *RepR /*Metadata*/, RepR->getDebugLoc());
1867 Clone->insertBefore(RepOrWidenR);
1868 VPBuilder Builder(Clone);
1869 VPValue *ExtractOp = Clone->getOperand(0);
1870 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1871 ExtractOp =
1872 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1873 ExtractOp =
1874 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1875 Clone->setOperand(0, ExtractOp);
1876 RepR->eraseFromParent();
1877 continue;
1878 }
1879
1880 // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor.
1881 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(RepOrWidenR)) {
     // Only applies when just the first lane of the result is used.
1882 if (!vputils::onlyFirstLaneUsed(IntrR))
1883 continue;
1884 auto Opc = getUnmaskedDivRemOpcode(IntrR->getVectorIntrinsicID());
1885 if (!Opc)
1886 continue;
1887 VPBuilder Builder(IntrR);
     // SafeDivisor = select(operand 2, operand 1, 1); the plain div/rem then
     // uses operand 0 and SafeDivisor.
1888 VPValue *SafeDivisor = Builder.createSelect(
1889 IntrR->getOperand(2), IntrR->getOperand(1),
1890 Plan.getConstantInt(IntrR->getScalarType(), 1));
1891 VPValue *Clone = Builder.createNaryOp(
1892 *Opc, {IntrR->getOperand(0), SafeDivisor},
1893 VPIRFlags::getDefaultFlags(*Opc), IntrR->getDebugLoc());
1894 IntrR->replaceAllUsesWith(Clone);
1895 IntrR->eraseFromParent();
1896 continue;
1897 }
1898
1899 // Skip recipes that aren't single scalars.
1900 if (!vputils::isSingleScalar(RepOrWidenR))
1901 continue;
1902
1903 // Predicate to check if a user of Op introduces extra broadcasts.
1904 auto IntroducesBCastOf = [](const VPValue *Op) {
1905 return [Op](const VPUser *U) {
1906 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1910 VPI->getOpcode()))
1911 return false;
1912 }
1913 return !U->usesScalars(Op);
1914 };
1915 };
1916
     // Do not narrow when some user of the recipe would introduce a broadcast
     // of it, while none of its operands is already broadcast for another user,
     // is a non-constant live-in, or is a single-scalar replicate.
1917 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1918 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1919 if (any_of(
1920 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1921 IntroducesBCastOf(Op)))
1922 return false;
1923 // Non-constant live-ins require broadcasts, while constants do not
1924 // need explicit broadcasts.
1925 auto *IRV = dyn_cast<VPIRValue>(Op);
1926 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1927 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1928 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1929 }))
1930 continue;
1931
     // Replace the recipe with a single-scalar replicate of the same underlying
     // instruction, operands and flags; erase the original once it is dead.
1932 auto *Clone = new VPReplicateRecipe(
1933 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1934 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1935 Clone->insertBefore(RepOrWidenR);
1936 RepOrWidenR->replaceAllUsesWith(Clone);
1937 if (isDeadRecipe(*RepOrWidenR))
1938 RepOrWidenR->eraseFromParent();
1939 }
1940 }
1941}
1942
1943 /// Try to see if all of \p Blend's masks share a common value logically and'ed
1944 /// and remove it from the masks.
     // Normalized blends are not handled here.
1946 if (Blend->isNormalized())
1947 return;
     // The first operand of mask 0's logical-and is the candidate common mask.
1948 VPValue *CommonEdgeMask;
1949 if (!match(Blend->getMask(0),
1950 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1951 return;
     // Bail out unless every mask has the form (CommonEdgeMask && <any value>).
1952 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1953 if (!match(Blend->getMask(I),
1954 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1955 return;
     // Strip the common part: each mask becomes operand 1 of its logical-and.
1956 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1957 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1958}
1959
1960 /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1961 /// to make sure the masks are simplified.
1962 static void simplifyBlends(VPlan &Plan) {
1965 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1966 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1967 if (!Blend)
1968 continue;
1969
1970 removeCommonBlendMask(Blend);
1971
1972 // Try to remove redundant blend recipes.
     // Collect the distinct incoming values whose mask is not known to be
     // false. Incoming value 0 of a normalized blend is always collected.
1973 SmallPtrSet<VPValue *, 4> UniqueValues;
1974 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1975 UniqueValues.insert(Blend->getIncomingValue(0));
1976 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1977 if (!match(Blend->getMask(I), m_False()))
1978 UniqueValues.insert(Blend->getIncomingValue(I));
1979
     // A single distinct value remains: forward it and drop the blend.
1980 if (UniqueValues.size() == 1) {
1981 Blend->replaceAllUsesWith(*UniqueValues.begin());
1982 Blend->eraseFromParent();
1983 continue;
1984 }
1985
1986 if (Blend->isNormalized())
1987 continue;
1988
1989 // Normalize the blend so its first incoming value is used as the initial
1990 // value with the others blended into it.
1991
1992 unsigned StartIndex = 0;
1993 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1994 // If a value's mask is used only by the blend then it can be deadcoded.
1995 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1996 // that's used by multiple blends where it can be removed from them all.
1997 VPValue *Mask = Blend->getMask(I);
1998 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1999 StartIndex = I;
2000 break;
2001 }
2002 }
2003
     // Operand layout of the new blend: the start value first (without a mask),
     // followed by a (value, mask) pair for every other incoming value.
2004 SmallVector<VPValue *, 4> OperandsWithMask;
2005 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
2006
2007 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2008 if (I == StartIndex)
2009 continue;
2010 OperandsWithMask.push_back(Blend->getIncomingValue(I));
2011 OperandsWithMask.push_back(Blend->getMask(I));
2012 }
2013
2014 auto *NewBlend =
2015 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
2016 OperandsWithMask, *Blend, Blend->getDebugLoc());
2017 NewBlend->insertBefore(&R);
2018
     // Remember the start value's mask before the old blend is erased; the new
     // blend no longer references it.
2019 VPValue *DeadMask = Blend->getMask(StartIndex);
2020 Blend->replaceAllUsesWith(NewBlend);
2021 Blend->eraseFromParent();
2023
2024 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
2025 VPValue *NewMask;
2026 if (NewBlend->getNumOperands() == 3 &&
2027 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2028 VPValue *Inc0 = NewBlend->getOperand(0);
2029 VPValue *Inc1 = NewBlend->getOperand(1);
2030 VPValue *OldMask = NewBlend->getOperand(2);
2031 NewBlend->setOperand(0, Inc1);
2032 NewBlend->setOperand(1, Inc0);
2033 NewBlend->setOperand(2, NewMask);
     // The negated mask is erased once nothing uses it any more.
2034 if (OldMask->getNumUsers() == 0)
2035 cast<VPInstruction>(OldMask)->eraseFromParent();
2036 }
2037 }
2038 }
2039}
2040
2041 /// Optimize the width of vector induction variables in \p Plan based on a known
2042 /// constant Trip Count, \p BestVF and \p BestUF.
/// Returns true if any induction was narrowed.
2044 ElementCount BestVF,
2045 unsigned BestUF) {
2046 // Only proceed if we have not completely removed the vector region.
2047 if (!Plan.getVectorLoopRegion())
2048 return false;
2049
     // Requires a fixed VF and a trip count that is a known constant.
2050 const APInt *TC;
2051 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2052 return false;
2053
2054 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2055 // and UF. Returns at least 8.
2056 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2057 APInt AlignedTC =
2060 APInt MaxVal = AlignedTC - 1;
2061 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2062 };
2063 unsigned NewBitWidth =
2064 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2065
2066 LLVMContext &Ctx = Plan.getContext();
2067 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2068
2069 bool MadeChange = false;
2070
2071 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2072 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2073 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2074
2075 // Currently only handle canonical IVs as it is trivial to replace the start
2076 // and stop values, and we currently only perform the optimization when the
2077 // IV has a single use.
     // IVs that already have the target type are skipped as well.
2078 if (!WideIV || !WideIV->isCanonical() ||
2079 WideIV->hasMoreThanOneUniqueUser() ||
2080 NewIVTy == WideIV->getScalarType())
2081 continue;
2082
2083 // Currently only handle cases where the single user is a header-mask
2084 // comparison with the backedge-taken-count.
2085 VPUser *SingleUser = WideIV->getSingleUser();
2086 if (!SingleUser ||
2087 !match(SingleUser,
2088 m_ICmp(m_Specific(WideIV),
2090 continue;
2091
2092 // Update IV operands and comparison bound to use new narrower type.
2093 assert(!WideIV->getTruncInst() &&
2094 "canonical IV is not expected to have a truncation");
     // The narrow IV starts at zero with a step of one, both of type NewIVTy.
2095 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
2096 WideIV->getPHINode(), Plan.getZero(NewIVTy),
2097 Plan.getConstantInt(NewIVTy, 1), WideIV->getVFValue(),
2098 WideIV->getInductionDescriptor(), *WideIV, WideIV->getDebugLoc());
2099 NewWideIV->insertBefore(WideIV);
2100
     // Truncate the backedge-taken count to NewIVTy in the vector preheader and
     // rebuild the compare, with the same predicate, on the narrow values.
2101 auto *NewBTC = new VPWidenCastRecipe(
2102 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2103 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2104 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2105 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2106 Cmp->replaceAllUsesWith(
2107 VPBuilder(Cmp).createICmp(Cmp->getPredicate(), NewWideIV, NewBTC));
2108
2109 MadeChange = true;
2110 }
2111
2112 return MadeChange;
2113}
2114
2115 /// Return true if \p Cond is known to be true for given \p BestVF and \p
2116 /// BestUF.
/// A condition may be proven through one of the operands of its defining
/// recipe (checked recursively), or by showing via SCEV that the trip count
/// equals \p BestVF * \p BestUF elements.
2118 ElementCount BestVF, unsigned BestUF,
2121 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2122 &PSE](VPValue *C) {
2123 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2124 });
2125
2126 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2129 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2130 m_Specific(&Plan.getVectorTripCount()))))
2131 return false;
2132
2133 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2134 // count is not conveniently available as SCEV so far, so we compare directly
2135 // against the original trip count. This is stricter than necessary, as we
2136 // will only return true if the trip count == vector trip count.
2137 const SCEV *VectorTripCount =
2139 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2140 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2141 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2142 "Trip count SCEV must be computable");
2143 ScalarEvolution &SE = *PSE.getSE();
     // C is BestVF * BestUF elements, expressed in the trip count's type.
2144 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2145 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2146 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2147}
2148
2149 /// Try to replace multiple active lane masks used for control flow with
2150 /// a single, wide active lane mask instruction followed by multiple
2151 /// extract subvector intrinsics. This applies to the active lane mask
2152 /// instructions both in the loop and in the preheader.
2153 /// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2154 /// new extracts from the first active lane mask, which has its last
2155 /// operand (multiplier) set to UF.
/// Returns true if the plan was changed.
2157 unsigned UF) {
     // Requires the feature to be enabled, a vector VF and an unrolled loop.
2158 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2159 return false;
2160
2161 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2162 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2163 auto *Term = &ExitingVPBB->back();
2164
2165 using namespace llvm::VPlanPatternMatch;
2167 m_VPValue(), m_VPValue(), m_VPValue())))))
2168 return false;
2169
2170 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2171 LLVMContext &Ctx = Plan.getContext();
2172
     // Helper: for each unroll part, create a vector.extract intrinsic recipe on
     // ALM at element index VF * Part, store it in Extracts[Part] and insert it
     // directly after ALM.
2173 auto ExtractFromALM = [&](VPInstruction *ALM,
2174 SmallVectorImpl<VPValue *> &Extracts) {
2175 DebugLoc DL = ALM->getDebugLoc();
2176 for (unsigned Part = 0; Part < UF; ++Part) {
2178 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2179 auto *Ext =
2180 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2181 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2182 Extracts[Part] = Ext;
2183 Ext->insertAfter(ALM);
2184 }
2185 };
2186
2187 // Create a list of each active lane mask phi, ordered by unroll part.
2189 for (VPRecipeBase &R : Header->phis()) {
2191 if (!Phi)
2192 continue;
2193 VPValue *Index = nullptr;
2194 match(Phi->getBackedgeValue(),
2196 assert(Index && "Expected index from ActiveLaneMask instruction");
2197
2198 uint64_t Part;
2199 if (match(Index,
2201 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2202 Phis[Part] = Phi;
2203 else {
2204 // Anything other than a CanonicalIVIncrementForPart is part 0
2205 assert(!match(
2206 Index,
2208 Phis[0] = Phi;
2209 }
2210 }
2211
2212 assert(all_of(Phis, not_equal_to(nullptr)) &&
2213 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2214
     // The part-0 phi's incoming values become the wide lane masks.
2215 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2216 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2217
2218 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2219 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2220 "Expected incoming values of Phi to be ActiveLaneMasks");
2221
2222 // When using wide lane masks, the return type of the get.active.lane.mask
2223 // intrinsic is VF x UF (last operand).
2224 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2225 EntryALM->setOperand(2, ALMMultiplier);
2226 LoopALM->setOperand(2, ALMMultiplier);
2227
2228 // Create UF x extract vectors and insert into preheader.
2229 SmallVector<VPValue *> EntryExtracts(UF);
2230 ExtractFromALM(EntryALM, EntryExtracts);
2231
2232 // Create UF x extract vectors and insert before the loop compare & branch,
2233 // updating the compare to use the first extract.
2234 SmallVector<VPValue *> LoopExtracts(UF);
2235 ExtractFromALM(LoopALM, LoopExtracts);
2236 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2237 Not->setOperand(0, LoopExtracts[0]);
2238
2239 // Update the incoming values of active lane mask phis.
2240 for (unsigned Part = 0; Part < UF; ++Part) {
2241 Phis[Part]->setStartValue(EntryExtracts[Part]);
2242 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2243 }
2244
2245 return true;
2246}
2247
2248 /// Try to simplify the branch condition of \p Plan. This may restrict the
2249 /// resulting plan to \p BestVF and \p BestUF.
/// Returns true if the exiting block's terminator was changed.
2251 unsigned BestUF,
2253 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2254 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2255 auto *Term = &ExitingVPBB->back();
2256 VPValue *Cond;
2257 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2258 // Check if the branch condition compares the canonical IV increment (for main
2259 // loop), or the canonical IV increment plus an offset (for epilog loop).
2260 if (match(Term, m_BranchOnCount(
2261 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2262 m_VPValue())) ||
2264 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2265 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2266 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2267 const SCEV *VectorTripCount =
2269 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2270 VectorTripCount =
2272 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2273 "Trip count SCEV must be computable");
2274 ScalarEvolution &SE = *PSE.getSE();
2275 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2276 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
     // Give up unless the trip count is provably at most VF * UF (unsigned).
2277 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2278 return false;
2279 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2281 // For BranchOnCond, check if we can prove the condition to be true using VF
2282 // and UF.
2283 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2284 return false;
2285 } else {
     // Any other kind of terminator is left alone.
2286 return false;
2287 }
2288
2289 // The vector loop region only executes once. Convert terminator of the
2290 // exiting block to exit in the first iteration.
     // A BranchOnTwoConds terminator is kept; only its operand 1 is set to true.
2291 if (match(Term, m_BranchOnTwoConds())) {
2292 Term->setOperand(1, Plan.getTrue());
2293 return true;
2294 }
2295
     // Otherwise replace the terminator with a BranchOnCond on constant true,
     // keeping the old terminator's debug location.
2296 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2297 {}, Term->getDebugLoc());
2298 ExitingVPBB->appendRecipe(BOC);
2299 Term->eraseFromParent();
2300
2301 return true;
2302}
2303
2304 /// From the definition of llvm.experimental.get.vector.length,
2305 /// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
/// Only the first EVL that can be simplified is rewritten; returns true in
/// that case and false if nothing changed.
2309 vp_depth_first_deep(Plan.getEntry()))) {
2310 for (VPRecipeBase &R : *VPBB) {
2311 VPValue *AVL;
2312 if (!match(&R, m_EVL(m_VPValue(AVL))))
2313 continue;
2314
     // Skip EVLs whose AVL has no computable SCEV expression.
2315 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2316 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2317 continue;
     // Only fold when AVL <= VF (unsigned) is provable.
2318 ScalarEvolution &SE = *PSE.getSE();
2319 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2320 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2321 continue;
2322
2324 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2325 R.getDebugLoc());
     // A value distinct from AVL means a conversion recipe was created; try to
     // fold it away (presumably succeeds when its operands are live-ins).
2326 if (Trunc != AVL) {
2327 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2328 const DataLayout &DL = Plan.getDataLayout();
2329 VPTypeAnalysis TypeInfo(Plan);
2330 if (VPValue *Folded =
2331 tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2332 Trunc = Folded;
2333 }
2334 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2335 return true;
2336 }
2337 }
2338 return false;
2339}
2340
2342 unsigned BestUF,
2344 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2345 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2346
     // Every transform is attempted, whether or not an earlier one changed the
     // plan (|= does not short-circuit).
2347 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2348 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2349 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2350
     // Once the plan has been specialized, pin it to BestVF.
2351 if (MadeChange) {
2352 Plan.setVF(BestVF);
2353 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2354 }
2355}
2356
2358 for (VPRecipeBase &R :
2360 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
     // Only reduction phis are of interest.
2361 if (!PhiR)
2362 continue;
     // Restrict to the recurrence kinds listed below (Add, Mul, Sub, ...).
2363 RecurKind RK = PhiR->getRecurrenceKind();
2364 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2366 continue;
2367
     // Drop poison-generating flags from every flag-carrying recipe among the
     // phi's recursively collected users.
2368 for (VPUser *U : collectUsersRecursively(PhiR))
2369 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2370 RecWithFlags->dropPoisonGeneratingFlags();
2371 }
2372 }
2373}
2374
2375 namespace {
/// DenseMapInfo for VPSingleDefRecipe pointers that hashes and compares
/// recipes by their underlying data (recipe ID, opcode, types, operands, ...)
/// instead of by pointer identity. Used as the key info of the CSE map.
2376 struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
     /// Returns true if \p Def is the empty or the tombstone key.
2377 static bool isSentinel(const VPSingleDefRecipe *Def) {
2378 return Def == getEmptyKey() || Def == getTombstoneKey();
2379 }
2380
2381 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2382 /// return that source element type.
     /// Returns nullptr otherwise.
2383 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2384 // All VPInstructions that lower to GEPs must have the i8 source element
2385 // type (as they are PtrAdds), so we omit it.
2387 .Case([](const VPReplicateRecipe *I) -> Type * {
2388 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2389 return GEP->getSourceElementType();
2390 return nullptr;
2391 })
2392 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2393 [](auto *I) { return I->getSourceElementType(); })
2394 .Default([](auto *) { return nullptr; });
2395 }
2396
2397 /// Returns true if recipe \p Def can be safely handed for CSE.
2398 static bool canHandle(const VPSingleDefRecipe *Def) {
2399 // We can extend the list of handled recipes in the future,
2400 // provided we account for the data embedded in them while checking for
2401 // equality or hashing.
2402 auto C = getOpcodeOrIntrinsicID(Def);
2403
2404 // The issue with (Insert|Extract)Value is that the index of the
2405 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2406 // VPlan.
2407 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2408 C->second == Instruction::ExtractValue)))
2409 return false;
2410
2411 // During CSE, we can only handle recipes that don't read from memory: if
2412 // they read from memory, there could be an intervening write to memory
2413 // before the next instance is CSE'd, leading to an incorrect result.
2414 return !Def->mayReadFromMemory();
2415 }
2416
2417 /// Hash the underlying data of \p Def.
2418 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2419 const VPlan *Plan = Def->getParent()->getPlan();
2420 VPTypeAnalysis TypeInfo(*Plan);
2421 hash_code Result = hash_combine(
2422 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2423 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
     // Mix in data that is not part of the operand list: the compare predicate,
     // if any, or else the induction opcode of scalar IV steps.
2425 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2426 if (RFlags->hasPredicate())
2427 return hash_combine(Result, RFlags->getPredicate());
2428 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2429 return hash_combine(Result, SIVSteps->getInductionOpcode());
2430 return Result;
2431 }
2432
2433 /// Check equality of underlying data of \p L and \p R.
2434 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
     // Sentinel keys are only equal to themselves.
2435 if (isSentinel(L) || isSentinel(R))
2436 return L == R;
2437 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2439 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2441 !equal(L->operands(), R->operands()))
2442 return false;
2444 "must have valid opcode info for both recipes");
     // Compare predicates must match when L has one.
2445 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2446 if (LFlags->hasPredicate() &&
2447 LFlags->getPredicate() !=
2448 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2449 return false;
     // Scalar IV steps must use the same induction opcode.
2450 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2451 if (LSIV->getInductionOpcode() !=
2452 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2453 return false;
2454 // Recipes in replicate regions implicitly depend on predicate. If either
2455 // recipe is in a replicate region, only consider them equal if both have
2456 // the same parent.
2457 const VPRegionBlock *RegionL = L->getRegion();
2458 const VPRegionBlock *RegionR = R->getRegion();
2459 if (((RegionL && RegionL->isReplicator()) ||
2460 (RegionR && RegionR->isReplicator())) &&
2461 L->getParent() != R->getParent())
2462 return false;
     // Finally, the inferred scalar result types must agree.
2463 const VPlan *Plan = L->getParent()->getPlan();
2464 VPTypeAnalysis TypeInfo(*Plan);
2465 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2466 }
2467};
2468} // end anonymous namespace
2469
2470 /// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2471 /// Plan.
2473 VPDominatorTree VPDT(Plan);
2475
2477 Plan.getEntry());
2479 for (VPRecipeBase &R : *VPBB) {
2480 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
     // Only single-def recipes accepted by VPCSEDenseMapInfo::canHandle are
     // candidates.
2481 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2482 continue;
2483 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2484 // V must dominate Def for a valid replacement.
2485 if (!VPDT.dominates(V->getParent(), VPBB))
2486 continue;
2487 // Only keep flags present on both V and Def.
2488 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2489 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
     // Redirect Def's users to V. Def itself is not erased here.
2490 Def->replaceAllUsesWith(V);
2491 continue;
2492 }
     // No equivalent recipe recorded yet: Def becomes the representative.
2493 CSEMap[Def] = Def;
2494 }
2495 }
2496}
2497
2498 /// Return true if we do not know how to (mechanically) hoist or sink a
2499 /// non-memory or memory recipe \p R out of a loop region.
2501 VPBasicBlock *LastBB) {
2502 if (!isa<VPReplicateRecipe>(R) || !R.mayReadFromMemory())
2504
2505 // Check that the load doesn't alias with stores between FirstBB and LastBB.
     // Without a known memory location the recipe is conservatively rejected.
2506 auto MemLoc = vputils::getMemoryLocation(R);
2507 return !MemLoc || !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB);
2508}
2509
2510 /// Move loop-invariant recipes out of the vector loop region in \p Plan.
/// Invariant recipes are hoisted to the vector preheader; recipes whose users
/// all live in a single block directly after the region are sunk into it.
2511 static void licm(VPlan &Plan) {
2512 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2513
2514 // Hoist any loop invariant recipes from the vector loop region to the
2515 // preheader. Perform a shallow traversal of the vector loop region, to
2516 // exclude recipes in replicate regions. Since the top-level blocks in the
2517 // vector loop region are guaranteed to execute if the vector pre-header is,
2518 // we don't need to check speculation safety.
2519 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2520 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2521 "Expected vector prehader's successor to be the vector loop region");
2523 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2524 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2525 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2526 LoopRegion->getExitingBasicBlock()))
2527 continue;
     // Every operand must be defined outside of loop regions.
2528 if (any_of(R.operands(), [](VPValue *Op) {
2529 return !Op->isDefinedOutsideLoopRegions();
2530 }))
2531 continue;
     // Invariant: move R to the end of the preheader.
2532 R.moveBefore(*Preheader, Preheader->end());
2533 }
2534 }
2535
     // The dominator tree is only needed by the assertion further down.
2536 #ifndef NDEBUG
2537 VPDominatorTree VPDT(Plan);
2538 #endif
2539 // Sink recipes with no users inside the vector loop region if all users are
2540 // in the same exit block of the region.
2541 // TODO: Extend to sink recipes from inner loops.
2543 LoopRegion->getEntry());
2545 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2546 if (vputils::cannotHoistOrSinkRecipe(R, /*Sinking=*/true))
2547 continue;
2548
2549 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2550 assert(!RepR->isPredicated() &&
2551 "Expected prior transformation of predicated replicates to "
2552 "replicate regions");
2553 // narrowToSingleScalarRecipes should have already maximally narrowed
2554 // replicates to single-scalar replicates.
2555 // TODO: When unrolling, replicateByVF doesn't handle sunk
2556 // non-single-scalar replicates correctly.
2557 if (!RepR->isSingleScalar())
2558 continue;
2559 }
2560
2561 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2562 // support recipes with multiple defined values (e.g., interleaved loads).
2563 auto *Def = cast<VPSingleDefRecipe>(&R);
2564
2565 // Cannot sink the recipe if the user is defined in a loop region or a
2566 // non-successor of the vector loop region. Cannot sink if user is a phi
2567 // either.
     // As a side effect, the predicate records the users' block in SinkBB.
2568 VPBasicBlock *SinkBB = nullptr;
2569 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2570 auto *UserR = cast<VPRecipeBase>(U);
2571 VPBasicBlock *Parent = UserR->getParent();
2572 // TODO: Support sinking when users are in multiple blocks.
2573 if (SinkBB && SinkBB != Parent)
2574 return true;
2575 SinkBB = Parent;
2576 // TODO: If the user is a PHI node, we should check the block of
2577 // incoming value. Support PHI node users if needed.
2578 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2579 Parent->getSinglePredecessor() != LoopRegion;
2580 }))
2581 continue;
2582
     // No users at all: sink into the region's single successor.
2583 if (!SinkBB)
2584 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2585
2586 // TODO: This will need to be a check instead of an assert after
2587 // conditional branches in vectorized loops are supported.
2588 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2589 "Defining block must dominate sink block");
2590 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2591 // just moving.
2592 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2593 }
2594 }
2595}
2596
2598 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
     // Nothing to shrink when the plan only has a scalar VF.
2599 if (Plan.hasScalarVFOnly())
2600 return;
2601 // Keep track of created truncates, so they can be re-used. Note that we
2602 // cannot use RAUW after creating a new truncate, as this could make
2603 // other uses have different types for their operands, making them invalidly
2604 // typed.
2606 VPTypeAnalysis TypeInfo(Plan);
2607 VPBasicBlock *PH = Plan.getVectorPreheader();
2610 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2613 continue;
2614
     // Look up the minimal bit width recorded for the underlying instruction;
     // recipes without an entry (lookup yields 0) are skipped.
2615 VPValue *ResultVPV = R.getVPSingleValue();
2616 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2617 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2618 if (!NewResSizeInBits)
2619 continue;
2620
2621 // If the value wasn't vectorized, we must maintain the original scalar
2622 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2623 // skip casts which do not need to be handled explicitly here, as
2624 // redundant casts will be removed during recipe simplification.
2626 continue;
2627
2628 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2629 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2630 assert(OldResTy->isIntegerTy() && "only integer types supported");
2631 (void)OldResSizeInBits;
2632
2633 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2634
2635 // Any wrapping introduced by shrinking this operation shouldn't be
2636 // considered undefined behavior. So, we can't unconditionally copy
2637 // arithmetic wrapping flags to VPW.
2638 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2639 VPW->dropPoisonGeneratingFlags();
2640
2641 assert((OldResSizeInBits != NewResSizeInBits ||
2642 match(&R, m_ICmp(m_VPValue(), m_VPValue()))) &&
2643 "Only ICmps should not need extending the result.");
2644 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2645
2646 // For loads/intrinsics we don't recreate the recipe; just wrap the
2647 // original wide result in a ZExt to OldResTy.
2649 if (OldResSizeInBits != NewResSizeInBits) {
2651 Instruction::ZExt, ResultVPV, OldResTy);
     // RAUW also rewrote Ext's own operand, so point it back at the result.
2652 ResultVPV->replaceAllUsesWith(Ext);
2653 Ext->setOperand(0, ResultVPV);
2654 }
2655 continue;
2656 }
2657
2658 // Shrink operands by introducing truncates as needed.
     // For selects, operand 0 is left untouched; only the remaining operands
     // are truncated.
2659 unsigned StartIdx =
2660 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2661 SmallVector<VPValue *> NewOperands(R.operands());
2662 for (VPValue *&Op : drop_begin(NewOperands, StartIdx)) {
2663 unsigned OpSizeInBits =
2665 if (OpSizeInBits == NewResSizeInBits)
2666 continue;
2667 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
     // Create at most one truncate per operand value: for VPIRValue operands
     // it is inserted in the vector preheader, otherwise at R.
2668 auto [ProcessedIter, Inserted] = ProcessedTruncs.try_emplace(Op);
2669 if (Inserted) {
2670 VPBuilder Builder;
2671 if (isa<VPIRValue>(Op))
2672 Builder.setInsertPoint(PH);
2673 else
2674 Builder.setInsertPoint(&R);
2675 ProcessedIter->second =
2676 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2677 }
2678 Op = ProcessedIter->second;
2679 }
2680
     // Recreate the recipe on the narrowed operands, right before the original.
2681 auto *NWR = cast<VPWidenRecipe>(&R)->cloneWithOperands(NewOperands);
2682 NWR->insertBefore(&R);
2683
2684 // Wrap NWR in a ZExt to preserve the original wide type for downstream
2685 // users (unless this is an ICmp, which produces i1 regardless).
2686 VPValue *Replacement = NWR->getVPSingleValue();
2687 if (OldResSizeInBits != NewResSizeInBits)
2688 Replacement =
2690 .createWidenCast(Instruction::ZExt, Replacement, OldResTy)
2691 ->getVPSingleValue();
2692 ResultVPV->replaceAllUsesWith(Replacement);
2693 R.eraseFromParent();
2694 }
2695 }
2696}
2697
/// Fold BranchOnCond terminators in \p Plan whose condition matches a constant
/// true or false: the edge to the successor that is never taken is removed
/// (together with the matching incoming values of that successor's phis) and
/// the terminator is erased. Blocks that are not reachable afterwards are
/// detached from their successors and their recipes are erased.
/// If \p OnlyLatches is set, only blocks for which VPBlockUtils::isLatch
/// returns true are considered.
2698void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
// The dominator tree is only built when it is needed for the isLatch query.
2699 std::optional<VPDominatorTree> VPDT;
2700 if (OnlyLatches)
2701 VPDT.emplace(Plan);
2702
2703 // Collect all blocks before modifying the CFG so we can identify unreachable
2704 // ones after constant branch removal.
2706
2707 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2708 VPValue *Cond;
2709 // Skip blocks that are not terminated by BranchOnCond.
2710 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2711 continue;
2712
2713 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2714 continue;
2715
2716 assert(VPBB->getNumSuccessors() == 2 &&
2717 "Two successors expected for BranchOnCond");
// Pick the dead edge: successor 1 for a constant-true condition, successor 0
// for a constant-false one. Non-constant conditions are left untouched.
2718 unsigned RemovedIdx;
2719 if (match(Cond, m_True()))
2720 RemovedIdx = 1;
2721 else if (match(Cond, m_False()))
2722 RemovedIdx = 0;
2723 else
2724 continue;
2725
2726 VPBasicBlock *RemovedSucc =
2727 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2728 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2729 "There must be a single edge between VPBB and its successor");
2730 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2731 // these recipes.
2732 for (VPRecipeBase &R : RemovedSucc->phis())
2733 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2734
2735 // Disconnect blocks and remove the terminator.
2736 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2737 VPBB->back().eraseFromParent();
2738 }
2739
2740 // Compute which blocks are still reachable from the entry after constant
2741 // branch removal.
2744
2745 // Detach all unreachable blocks from their successors, removing their recipes
2746 // and incoming values from phi recipes.
// Tmp is a placeholder that takes over any remaining uses of values defined
// in dead blocks, so the defining recipes can be erased.
2747 VPSymbolicValue Tmp(nullptr);
2748 for (VPBlockBase *B : AllBlocks) {
2749 if (Reachable.contains(B))
2750 continue;
// Iterate over a copy of the successor list (to_vector).
2751 for (VPBlockBase *Succ : to_vector(B->successors())) {
2752 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2753 for (VPRecipeBase &R : SuccBB->phis())
2754 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2756 }
2757 for (VPBasicBlock *DeadBB :
2759 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2760 for (VPValue *Def : R.definedValues())
2761 Def->replaceAllUsesWith(&Tmp);
2762 R.eraseFromParent();
2763 }
2764 }
2765 }
2766}
2767
2787
2788// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2789// the loop terminator with a branch-on-cond recipe with the negated
2790// active-lane-mask as operand. Note that this turns the loop into an
2791// uncountable one. Only the existing terminator is replaced, all other existing
2792// recipes/users remain unchanged, except for poison-generating flags being
2793// dropped from the canonical IV increment. Return the created
2794// VPActiveLaneMaskPHIRecipe.
2795//
2796// The function adds the following recipes:
2797//
2798// vector.ph:
2799// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2800// %EntryALM = active-lane-mask %EntryInc, TC
2801//
2802// vector.body:
2803// ...
2804// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2805// ...
2806// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2807// %ALM = active-lane-mask %InLoopInc, TC
2808// %Negated = Not %ALM
2809// branch-on-cond %Negated
2810//
2813 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2814 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
// The entry lane mask is computed starting from a zero of the canonical IV
// type.
2815 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2816 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2817 // TODO: Check if dropping the flags is needed.
2818 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
// All recipes created below reuse the debug location of the canonical IV
// increment.
2819 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2820 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2821 // we have to take unrolling into account. Each part needs to start at
2822 // Part * VF
2823 auto *VecPreheader = Plan.getVectorPreheader();
2824 VPBuilder Builder(VecPreheader);
2825
2826 // Create the ActiveLaneMask instruction using the correct start values.
2827 VPValue *TC = Plan.getTripCount();
2828 VPValue *VF = &Plan.getVF();
2829
2830 auto *EntryIncrement = Builder.createOverflowingOp(
2831 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2832 DL, "index.part.next");
2833
2834 // Create the active lane mask instruction in the VPlan preheader.
// The constant-1 multiplier operand is shared by the entry and in-loop
// ActiveLaneMask instructions.
2835 VPValue *ALMMultiplier =
2836 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2837 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2838 {EntryIncrement, TC, ALMMultiplier}, DL,
2839 "active.lane.mask.entry");
2840
2841 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2842 // preheader ActiveLaneMask instruction.
2843 auto *LaneMaskPhi =
2845 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
// The phi is placed at the very beginning of the loop header.
2846 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
2847
2848 // Create the active lane mask for the next iteration of the loop before the
2849 // original terminator.
2850 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2851 Builder.setInsertPoint(OriginalTerminator);
2852 auto *InLoopIncrement = Builder.createOverflowingOp(
2854 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2855 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2856 {InLoopIncrement, TC, ALMMultiplier}, DL,
2857 "active.lane.mask.next");
// The in-loop mask is added as a further operand of the lane-mask phi.
2858 LaneMaskPhi->addOperand(ALM);
2859
2860 // Replace the original terminator with BranchOnCond. We have to invert the
2861 // mask here because a true condition means jumping to the exit block.
2862 auto *NotMask = Builder.createNot(ALM, DL);
2863 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2864 OriginalTerminator->eraseFromParent();
2865 return LaneMaskPhi;
2866}
2867
2869 bool UseActiveLaneMaskForControlFlow) {
// Replaces the plan's header mask with an active-lane-mask. With
// UseActiveLaneMaskForControlFlow the mask comes from
// addVPLaneMaskPhiAndUpdateExitBranch; otherwise it is an ActiveLaneMask
// instruction created right after the widened canonical IV.
2870 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2871 auto *WideCanonicalIV = vputils::findUserOf<VPWidenCanonicalIVRecipe>(
2872 LoopRegion->getCanonicalIV());
2873 assert(WideCanonicalIV &&
2874 "Must have widened canonical IV when tail folding!");
// Look up the existing header mask before any new mask recipe is created.
2875 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2876 VPSingleDefRecipe *LaneMask;
2877 if (UseActiveLaneMaskForControlFlow) {
2878 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2879 } else {
2880 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2881 VPValue *ALMMultiplier =
2882 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2883 LaneMask =
2884 B.createNaryOp(VPInstruction::ActiveLaneMask,
2885 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2886 nullptr, "active.lane.mask");
2887 }
2888
2889 // Walk users of WideCanonicalIV and replace the header mask of the form
2890 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2891 // removing the old one to ensure there is always only a single header mask.
// NOTE(review): HeaderMask is dereferenced without a null check; this assumes
// findHeaderMask always finds a mask when this function runs -- confirm.
2892 HeaderMask->replaceAllUsesWith(LaneMask);
2893 HeaderMask->eraseFromParent();
2894}
2895
/// Pattern matcher that recognizes either the specific value \p In itself or a
/// logical-and of \p In with some other value. On a successful match, Out is
/// set to nullptr in the first case and is bound via m_VPValue(Out) in the
/// second.
2896template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2897 Op0_t In;
2899
2900 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2901
2902 template <typename OpTy> bool match(OpTy *V) const {
// V is exactly In: nothing remains once In is removed.
2903 if (m_Specific(In).match(V)) {
2904 Out = nullptr;
2905 return true;
2906 }
// Otherwise V must be a logical-and with In; the other operand is captured
// in Out.
2907 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2908 }
2909};
2910
2911/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2912/// Returns the remaining part \p Out if so, or nullptr otherwise.
2913template <typename Op0_t, typename Op1_t>
2914static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2915 Op1_t &Out) {
2916 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2917}
2918
2919static std::optional<Intrinsic::ID> getVPDivRemIntrinsic(Intrinsic::ID IntrID) {
2920 switch (IntrID) {
2921 case Intrinsic::masked_udiv:
2922 return Intrinsic::vp_udiv;
2923 case Intrinsic::masked_sdiv:
2924 return Intrinsic::vp_sdiv;
2925 case Intrinsic::masked_urem:
2926 return Intrinsic::vp_urem;
2927 case Intrinsic::masked_srem:
2928 return Intrinsic::vp_srem;
2929 default:
2930 return std::nullopt;
2931 }
2932}
2933
2934/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2935/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2936/// recipe could be created.
2937/// \p HeaderMask Header Mask.
2938/// \p CurRecipe Recipe to be transformed.
2939/// \p TypeInfo VPlan-based type analysis.
2940/// \p EVL The explicit vector length parameter of vector-predication
2941/// intrinsics.
2943 VPRecipeBase &CurRecipe,
2944 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
// The patterns below are tried in order and the first one that matches
// returns a newly allocated recipe. The returned recipe is not inserted here,
// whereas helper recipes created along the way (reversed values, adjusted end
// pointers, EVL loads feeding a reverse) are inserted before CurRecipe.
2945 VPlan *Plan = CurRecipe.getParent()->getPlan();
2946 DebugLoc DL = CurRecipe.getDebugLoc();
2947 VPValue *Addr, *Mask, *EndPtr;
2948
2949 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
2950 auto AdjustEndPtr = [&CurRecipe, &EVL, &TypeInfo](VPValue *EndPtr) {
// Work on a clone so the original end-pointer recipe is left unchanged.
2951 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2952 EVLEndPtr->insertBefore(&CurRecipe);
2953 // Cast EVL (i32) to match the VF operand's type.
2954 VPValue *EVLAsVF = VPBuilder(EVLEndPtr).createScalarZExtOrTrunc(
2955 &EVL, TypeInfo.inferScalarType(EVLEndPtr->getOperand(1)),
2956 TypeInfo.inferScalarType(&EVL), DebugLoc::getUnknown());
2957 EVLEndPtr->setOperand(1, EVLAsVF);
2958 return EVLEndPtr;
2959 };
2960
// Wraps V in an experimental_vp_reverse intrinsic (all-true mask, EVL as
// length) inserted before CurRecipe; a null V is passed through as nullptr.
2961 auto GetVPReverse = [&CurRecipe, &EVL, &TypeInfo, Plan,
2963 if (!V)
2964 return nullptr;
2965 auto *Reverse = new VPWidenIntrinsicRecipe(
2966 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
2967 TypeInfo.inferScalarType(V), {}, {}, DL);
2968 Reverse->insertBefore(&CurRecipe);
2969 return Reverse;
2970 };
2971
// Masked load: the header mask is dropped and any remaining mask is kept.
2972 if (match(&CurRecipe,
2973 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
2974 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2975 EVL, Mask);
2976
// Reverse of a masked load from a vector end pointer based on VF, where the
// load's mask is itself a reverse of (header mask [&& Mask]).
2977 VPValue *ReversedVal;
2978 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
2979 match(ReversedVal,
2980 m_MaskedLoad(m_VPValue(EndPtr),
2981 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
2982 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
2983 Mask = GetVPReverse(Mask);
2984 Addr = AdjustEndPtr(EndPtr);
2985 auto *LoadR = new VPWidenLoadEVLRecipe(
2986 *cast<VPWidenLoadRecipe>(ReversedVal), Addr, EVL, Mask);
2987 LoadR->insertBefore(&CurRecipe);
2988 return new VPWidenIntrinsicRecipe(
2989 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
2990 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
2991 }
2992
// Memory intrinsic with (Addr, Stride, mask, VF-or-trunc(VF)) operands: a
// clone gets operand 2 set to the remaining mask (all-true if none) and
// operand 3 set to EVL.
2993 VPValue *Stride;
2995 m_VPValue(Addr), m_VPValue(Stride),
2996 m_RemoveMask(HeaderMask, Mask),
2997 m_TruncOrSelf(m_Specific(&Plan->getVF()))))) {
2998 if (!Mask)
2999 Mask = Plan->getTrue();
3000 auto *NewLoad = cast<VPWidenMemIntrinsicRecipe>(&CurRecipe)->clone();
3001 NewLoad->setOperand(2, Mask);
3002 NewLoad->setOperand(3, &EVL);
3003 return NewLoad;
3004 }
3005
// Masked store: same treatment as the masked load above.
3006 VPValue *StoredVal;
3007 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3008 m_RemoveMask(HeaderMask, Mask))))
3009 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3010 StoredVal, EVL, Mask);
3011
// Masked store of a reversed value through a VF-based end pointer with a
// reversed mask: mask and stored value are re-reversed with EVL and the end
// pointer is adjusted to EVL.
3012 if (match(&CurRecipe,
3013 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3014 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3015 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3016 Mask = GetVPReverse(Mask);
3017 Addr = AdjustEndPtr(EndPtr);
3018 StoredVal = GetVPReverse(ReversedVal);
3019 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3020 StoredVal, EVL, Mask);
3021 }
3022
// Conditional reductions whose condition contains the header mask.
3023 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3024 if (Rdx->isConditional() &&
3025 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3026 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3027
// Masked interleave groups whose mask contains the header mask.
3028 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3029 if (Interleave->getMask() &&
3030 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3031 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3032
// select(header-mask, LHS, RHS) -> vp.merge(true, LHS, RHS, EVL). This case
// is checked first so that the m_RemoveMask form below, which would yield a
// null Mask for an exact header-mask match, only sees the combined case.
3033 VPValue *LHS, *RHS;
3034 if (match(&CurRecipe,
3035 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3036 return new VPWidenIntrinsicRecipe(
3037 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3038 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3039
3040 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3041 m_VPValue(RHS))))
3042 return new VPWidenIntrinsicRecipe(
3043 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3044 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3045
// last-active-lane(header-mask) -> EVL (converted to the result type) - 1.
3046 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3047 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3048 VPValue *ZExt = VPBuilder(&CurRecipe)
3050 &EVL, Ty, TypeInfo.inferScalarType(&EVL), DL);
3051 return new VPInstruction(
3052 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3053 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3054 }
3055
3056 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
3057 if (match(&CurRecipe,
3059 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
3060 return new VPWidenIntrinsicRecipe(
3061 Intrinsic::vp_merge, {RHS, Plan->getTrue(), LHS, &EVL},
3062 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3063
// Masked div/rem intrinsics become their vp.* counterparts; operand 2 is the
// mask, replaced by the remaining mask or all-true.
3064 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(&CurRecipe))
3065 if (auto VPID = getVPDivRemIntrinsic(IntrR->getVectorIntrinsicID()))
3066 if (match(IntrR->getOperand(2), m_RemoveMask(HeaderMask, Mask)))
3067 return new VPWidenIntrinsicRecipe(*VPID,
3068 {IntrR->getOperand(0),
3069 IntrR->getOperand(1),
3070 Mask ? Mask : Plan->getTrue(), &EVL},
3071 IntrR->getScalarType(), {}, {}, DL);
3072
// No EVL-based form is known for this recipe.
3073 return nullptr;
3074}
3075
3076/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3077/// The transforms here need to preserve the original semantics.
3079 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3080 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3083 m_VPValue(EVL))) &&
3084 match(EVL, m_EVL(m_VPValue()))) {
3085 HeaderMask = R.getVPSingleValue();
3086 break;
3087 }
3088 }
// Nothing to do if no EVL-based header mask was found.
3089 if (!HeaderMask)
3090 return;
3091
3092 VPTypeAnalysis TypeInfo(Plan);
3093 SmallVector<VPRecipeBase *> OldRecipes;
// First pass: for every (transitive) user of the header mask that
// optimizeMaskToEVL can convert, insert the new recipe before the old one and
// redirect each defined value. The old recipe is only queued for deletion.
3094 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3096 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3097 NewR->insertBefore(R);
3098 for (auto [Old, New] :
3099 zip_equal(R->definedValues(), NewR->definedValues()))
3100 Old->replaceAllUsesWith(New);
3101 OldRecipes.push_back(R);
3102 }
3103 }
3104
3105 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3106 // False, EVL)
3107 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3108 VPValue *Mask;
3109 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3110 auto *LogicalAnd = cast<VPInstruction>(U);
3111 auto *Merge = new VPWidenIntrinsicRecipe(
3112 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3113 TypeInfo.inferScalarType(Mask), {}, {}, LogicalAnd->getDebugLoc());
3114 Merge->insertBefore(LogicalAnd);
3115 LogicalAnd->replaceAllUsesWith(Merge);
3116 OldRecipes.push_back(LogicalAnd);
3117 }
3118 }
3119
3120 // Erase old recipes at the end so we don't invalidate TypeInfo.
// Recipes are erased in reverse order of collection. The operands are copied
// before erasing so they can still be visited afterwards.
3121 for (VPRecipeBase *R : reverse(OldRecipes)) {
3122 SmallVector<VPValue *> PossiblyDead(R->operands());
3123 R->eraseFromParent();
3124 for (VPValue *Op : PossiblyDead)
3126 }
3127}
3128
3129/// After replacing the canonical IV with an EVL-based IV, fix up recipes that use
3130/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3131/// iteration.
3132static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3133 VPTypeAnalysis TypeInfo(Plan);
3134 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3135 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3136
3137 // EVL is i32 but VF/VFxUF are IdxTy. Convert as needed.
3138 VPValue *EVLAsIdx =
3140 .createScalarZExtOrTrunc(&EVL, Plan.getVF().getType(),
3141 TypeInfo.inferScalarType(&EVL),
3143
// Sanity check: every user of VF (directly, or through a trunc of VF) must be
// one of the recipe kinds listed in IsAllowedUser.
3144 assert(all_of(Plan.getVF().users(),
3145 [&Plan](VPUser *U) {
3146 auto IsAllowedUser =
3147 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
3148 VPWidenIntOrFpInductionRecipe,
3149 VPWidenMemIntrinsicRecipe>;
3150 if (match(U, m_Trunc(m_Specific(&Plan.getVF()))))
3151 return all_of(cast<VPSingleDefRecipe>(U)->users(),
3152 IsAllowedUser);
3153 return IsAllowedUser(U);
3154 }) &&
3155 "User of VF that we can't transform to EVL.");
3156 Plan.getVF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3158 });
3159
3160 assert(all_of(Plan.getVFxUF().users(),
3162 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3163 m_Specific(&Plan.getVFxUF())),
3165 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3166 "increment of the canonical induction.");
3167 Plan.getVFxUF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3168 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3169 // canonical induction must not be updated.
3171 });
3172
3173 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3174 // contained.
3175 bool ContainsFORs =
3177 if (ContainsFORs) {
3178 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3179 VPValue *MaxEVL = &Plan.getVF();
3180 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3181 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3182 MaxEVL = Builder.createScalarZExtOrTrunc(
3183 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3184 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3185
// prev.evl starts at MaxEVL and takes the current EVL as its second incoming
// value.
3186 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3187 VPValue *PrevEVL = Builder.createScalarPhi(
3188 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3189
// Each recipe matched below is replaced by an experimental_vp_splice that
// takes both the previous and the current EVL. The matched recipe is not
// erased here; only its uses are redirected.
3192 for (VPRecipeBase &R : *VPBB) {
3193 VPValue *V1, *V2;
3194 if (!match(&R,
3196 m_VPValue(V1), m_VPValue(V2))))
3197 continue;
3198 VPValue *Imm = Plan.getOrAddLiveIn(
3201 Intrinsic::experimental_vp_splice,
3202 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3203 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3204 R.getDebugLoc());
3205 VPSplice->insertBefore(&R);
3206 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3207 }
3208 }
3209 }
3210
// Without a header mask there is nothing left to rewrite.
3211 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3212 if (!HeaderMask)
3213 return;
3214
3215 // Ensure that any reduction that uses a select to mask off tail lanes does so
3216 // in the vector loop, not the middle block, since EVL tail folding can have
3217 // tail elements in the penultimate iteration.
3218 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3219 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3220 m_VPValue(), m_VPValue()))))
3221 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3222 Plan.getVectorLoopRegion();
3223 return true;
3224 }));
3225
3226 // Replace header masks with a mask equivalent to predicating by EVL:
3227 //
3228 // icmp ule widen-canonical-iv backedge-taken-count
3229 // ->
3230 // icmp ult step-vector, EVL
// The new mask is built immediately after the recipe that defines EVL.
3231 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3232 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3233 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3234 VPValue *EVLMask = Builder.createICmp(
3236 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3237 HeaderMask->replaceAllUsesWith(EVLMask);
3238}
3239
3240/// Converts a tail folded vector loop region to step by
3241/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3242/// iteration.
3243///
3244/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3245/// replaces all uses of the canonical IV except for the canonical IV
3246/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3247/// only for loop iterations counting after this transformation.
3248///
3249/// - The header mask is replaced with a header mask based on the EVL.
3250///
3251/// - Plans with FORs have a new phi added to keep track of the EVL of the
3252/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3253/// @llvm.vp.splice.
3254///
3255/// The function uses the following definitions:
3256/// %StartV is the canonical induction start value.
3257///
3258/// The function adds the following recipes:
3259///
3260/// vector.ph:
3261/// ...
3262///
3263/// vector.body:
3264/// ...
3265/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3266/// [ %NextIter, %vector.body ]
3267/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3268/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3269/// ...
3270/// %OpEVL = cast i32 %VPEVL to IVSize
3271/// %NextIter = add IVSize %OpEVL, %CurrentIter
3272/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3273/// ...
3274///
3275/// If MaxSafeElements is provided, the function adds the following recipes:
3276/// vector.ph:
3277/// ...
3278///
3279/// vector.body:
3280/// ...
3281/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3282/// [ %NextIter, %vector.body ]
3283/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3284/// %cmp = cmp ult %AVL, MaxSafeElements
3285/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3286/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3287/// ...
3288/// %OpEVL = cast i32 %VPEVL to IVSize
3289/// %NextIter = add IVSize %OpEVL, %CurrentIter
3290/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3291/// ...
3292///
3294 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
// Plans that only have a scalar VF are left unchanged.
3295 if (Plan.hasScalarVFOnly())
3296 return;
3297 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3298 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3299
3300 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3301 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3302 VPValue *StartV = Plan.getZero(CanIVTy);
3303 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3304
3305 // Create the CurrentIteration recipe in the vector loop.
3306 auto *CurrentIteration =
3308 CurrentIteration->insertBefore(*Header, Header->begin());
3309 VPBuilder Builder(Header, Header->getFirstNonPhi());
3310 // Create the AVL (application vector length), starting from TC -> 0 in steps
3311 // of EVL.
// The phi is created with only the trip count as operand; the avl.next value
// is added as a further operand further down.
3312 VPPhi *AVLPhi = Builder.createScalarPhi(
3313 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3314 VPValue *AVL = AVLPhi;
3315
3316 if (MaxSafeElements) {
3317 // Support for MaxSafeDist for correct loop emission.
// safe_avl = AVL <u MaxSafeElements ? AVL : MaxSafeElements.
3318 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3319 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3320 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3321 "safe_avl");
3322 }
3323 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3324 DebugLoc::getUnknown(), "evl");
3325
// The remaining recipes are created right before the canonical IV increment.
3326 Builder.setInsertPoint(CanonicalIVIncrement);
3327 VPValue *OpVPEVL = VPEVL;
3328
// Convert the EVL from i32 to the canonical IV type before doing arithmetic
// with it.
3329 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3330 OpVPEVL = Builder.createScalarZExtOrTrunc(
3331 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3332
// The increment reuses the debug location and no-wrap flags of the canonical
// IV increment.
3333 auto *NextIter = Builder.createAdd(
3334 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3335 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3336 CurrentIteration->addOperand(NextIter);
3337
// Note that the subtraction uses the AVL phi itself, not the safe_avl select.
3338 VPValue *NextAVL =
3339 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3340 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3341 AVLPhi->addOperand(NextAVL);
3342
3343 fixupVFUsersForEVL(Plan, *VPEVL);
3344 removeDeadRecipes(Plan);
3345
3346 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3347 // except for the canonical IV increment.
// replaceAllUsesWith also rewrites the increment, so its first operand is
// reset to the canonical IV afterwards.
3348 CanonicalIV->replaceAllUsesWith(CurrentIteration);
3349 CanonicalIVIncrement->setOperand(0, CanonicalIV);
3350 // TODO: support unroll factor > 1.
3351 Plan.setUF(1);
3352}
3353
3355 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3356 // There should be only one VPCurrentIteration in the entire plan.
3357 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3358
// No early exit from this scan: every phi is visited so the assert can catch
// a second VPCurrentIterationPHIRecipe.
3361 for (VPRecipeBase &R : VPBB->phis())
3362 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3363 assert(!CurrentIteration &&
3364 "Found multiple CurrentIteration. Only one expected");
3365 CurrentIteration = PhiR;
3366 }
3367
3368 // Early return if it is not variable-length stepping.
3369 if (!CurrentIteration)
3370 return;
3371
// Read parent block and backedge value now; the phi is erased below.
3372 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3373 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3374
3375 // Convert CurrentIteration to concrete recipe.
3376 auto *ScalarR =
3377 VPBuilder(CurrentIteration)
3379 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3380 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3381 CurrentIteration->replaceAllUsesWith(ScalarR);
3382 CurrentIteration->eraseFromParent();
3383
3384 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
// NOTE(review): this takes the first recipe of the header to be the canonical
// IV phi (cast<VPPhi> asserts on anything else) -- confirm this always holds
// at this point.
3385 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3386 if (auto *CanIVInc = vputils::findUserOf(
3387 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3388 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3389 CanIVInc->eraseFromParent();
3390 }
3391}
3392
// Rewrites the latch branch condition to compare the next AVL value against
// zero. Returns without changes if there is no vector loop region, the header
// is empty, the header does not start with a VPCurrentIterationPHIRecipe, the
// phi's backedge value does not have the expected EVL-add form, or the latch
// already branches on a constant true.
3394 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3395 if (!LoopRegion)
3396 return;
3397 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3398 if (Header->empty())
3399 return;
3400 // The EVL IV is always at the beginning.
3401 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3402 if (!EVLPhi)
3403 return;
3404
3405 // Bail if not an EVL tail folded loop.
// The backedge value must be (zext-or-self(EVL(AVL)) + EVLPhi); AVL is bound
// to the operand of the EVL computation.
3406 VPValue *AVL;
3407 if (!match(EVLPhi->getBackedgeValue(),
3408 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3409 return;
3410
3411 // The AVL may be capped to a safe distance.
3412 VPValue *SafeAVL, *UnsafeAVL;
3413 if (match(AVL,
3415 m_VPValue(SafeAVL)),
3416 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3417 AVL = UnsafeAVL;
3418
// FoundAVLNext is only read by the assert, hence [[maybe_unused]].
3419 VPValue *AVLNext;
3420 [[maybe_unused]] bool FoundAVLNext =
3422 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3423 assert(FoundAVLNext && "Didn't find AVL backedge?");
3424
3425 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3426 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3427 if (match(LatchBr, m_BranchOnCond(m_True())))
3428 return;
3429
3430 VPValue *CanIVInc;
3431 [[maybe_unused]] bool FoundIncrement = match(
3432 LatchBr,
3434 m_Specific(&Plan.getVectorTripCount()))));
3435 assert(FoundIncrement &&
3436 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3437 m_Specific(&Plan.getVFxUF()))) &&
3438 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3439 "trip count");
3440
// New exit condition: AVLNext == 0. The compare is created right before the
// latch branch and becomes its operand 0.
3441 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3442 VPBuilder Builder(LatchBr);
3443 LatchBr->setOperand(
3444 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3445}
3446
3448 VPlan &Plan, PredicatedScalarEvolution &PSE,
3449 const DenseMap<Value *, const SCEV *> &StridesMap) {
3450 // Replace VPValues for known constant strides guaranteed by predicated scalar
3451 // evolution that are guaranteed to be guarded by the runtime checks; that is,
3452 // blocks dominated by the vector preheader.
3453 assert(!Plan.getVectorLoopRegion() &&
3454 "expected to run before loop regions are created");
3455 VPDominatorTree VPDT(Plan);
// NOTE(review): the preheader is taken to be successor 1 of the plan entry --
// confirm against the plan layout at this stage.
3456 VPBlockBase *Preheader = Plan.getEntry()->getSuccessors()[1];
// Predicate for replaceUsesWithIf: a use may be rewritten only if its recipe
// lives in a block dominated by the preheader.
3457 auto CanUseVersionedStride = [&VPDT, Preheader](VPUser &U, unsigned) {
3458 auto *R = cast<VPRecipeBase>(&U);
3459 VPBlockBase *Parent = R->getParent();
3460 return VPDT.dominates(Preheader, Parent);
3461 };
3462 ValueToSCEVMapTy RewriteMap;
3463 for (const SCEV *Stride : StridesMap.values()) {
3464 using namespace SCEVPatternMatch;
3465 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3466 const APInt *StrideConst;
3467 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3468 // Only handle constant strides for now.
3469 continue;
3470
// Replace dominated uses of the stride's live-in (if the plan has one) with
// the constant.
3471 auto *CI = Plan.getConstantInt(*StrideConst);
3472 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3473 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3474
3475 // The versioned value may not be used in the loop directly but through a
3476 // sext/zext. Add new live-ins in those cases.
3477 for (Value *U : StrideV->users()) {
3479 continue;
3480 VPValue *StrideVPV = Plan.getLiveIn(U);
3481 if (!StrideVPV)
3482 continue;
// Extend the constant to the user's bit width: sign-extend for a SExtInst,
// zero-extend otherwise.
3483 unsigned BW = U->getType()->getScalarSizeInBits();
3484 APInt C =
3485 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3486 VPValue *CI = Plan.getConstantInt(C);
3487 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3488 }
// Remember the SCEV of the stride for rewriting the SCEV expansions in the
// entry block below.
3489 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3490 }
3491
// Rewrite VPExpandSCEVRecipes in the entry block with the collected strides.
// If the rewritten recipe was the plan's trip count, the trip count is reset
// to the new value as well.
3492 for (VPRecipeBase &R : *Plan.getEntry()) {
3493 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3494 if (!ExpSCEV)
3495 continue;
3496 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3497 auto *NewSCEV =
3498 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3499 if (NewSCEV != ScevExpr) {
3500 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3501 ExpSCEV->replaceAllUsesWith(NewExp);
3502 if (Plan.getTripCount() == ExpSCEV)
3503 Plan.resetTripCount(NewExp);
3504 }
3505 }
3506}
3507
3509 // Collect recipes in the backward slice of `Root` that may generate a poison
3510 // value that is used after vectorization.
3512 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3514 Worklist.push_back(Root);
3515
3516 // Traverse the backward slice of Root through its use-def chain.
3517 while (!Worklist.empty()) {
3518 VPRecipeBase *CurRec = Worklist.pop_back_val();
3519
3520 if (!Visited.insert(CurRec).second)
3521 continue;
3522
3523 // Prune search if we find another recipe generating a widen memory
3524 // instruction. Widen memory instructions involved in address computation
3525 // will lead to gather/scatter instructions, which don't need to be
3526 // handled.
3528 VPHeaderPHIRecipe>(CurRec))
3529 continue;
3530
3531 // This recipe contributes to the address computation of a widen
3532 // load/store. If the underlying instruction has poison-generating flags,
3533 // drop them directly.
3534 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3535 VPValue *A, *B;
3536 // Dropping disjoint from an OR may yield incorrect results, as some
3537 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3538 // for dependence analysis). Instead, replace it with an equivalent Add.
3539 // This is possible as all users of the disjoint OR only access lanes
3540 // where the operands are disjoint or poison otherwise.
3541 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3542 RecWithFlags->isDisjoint()) {
3543 VPBuilder Builder(RecWithFlags);
3544 VPInstruction *New =
3545 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3546 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3547 RecWithFlags->replaceAllUsesWith(New);
3548 RecWithFlags->eraseFromParent();
3549 CurRec = New;
3550 } else
3551 RecWithFlags->dropPoisonGeneratingFlags();
3552 } else {
3555 (void)Instr;
3556 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3557 "found instruction with poison generating flags not covered by "
3558 "VPRecipeWithIRFlags");
3559 }
3560
3561 // Add new definitions to the worklist.
3562 for (VPValue *Operand : CurRec->operands())
3563 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3564 Worklist.push_back(OpDef);
3565 }
3566 });
3567
3568 // We want to exclude the tail folding case, as we don't need to drop flags
3569 // for operations computing the first lane in this case: the first lane of the
3570 // header mask must always be true.
3571 auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
3572 return Mask && !vputils::isHeaderMask(Mask, Plan);
3573 };
3574
3575 // Traverse all the recipes in the VPlan and collect the poison-generating
3576 // recipes in the backward slice starting at the address of a
3577 // VPWidenMemoryRecipe or VPInterleaveRecipe.
3578 auto Iter =
3581 for (VPRecipeBase &Recipe : *VPBB) {
3582 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3583 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3584 if (AddrDef && WidenRec->isConsecutive() &&
3585 IsNotHeaderMask(WidenRec->getMask()))
3586 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3587 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3588 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3589 if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
3590 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3591 }
3592 }
3593 }
3594}
3595
3597 VPlan &Plan,
3599 &InterleaveGroups,
3600 const bool &EpilogueAllowed) {
3601 if (InterleaveGroups.empty())
3602 return;
3603
3605 for (VPBasicBlock *VPBB :
3608 for (VPRecipeBase &R : make_filter_range(*VPBB, [](VPRecipeBase &R) {
3609 return isa<VPWidenMemoryRecipe>(&R);
3610 })) {
3611 auto *MemR = cast<VPWidenMemoryRecipe>(&R);
3612 IRMemberToRecipe[&MemR->getIngredient()] = MemR;
3613 }
3614
3615 // Interleave memory: for each Interleave Group we marked earlier as relevant
3616 // for this VPlan, replace the Recipes widening its memory instructions with a
3617 // single VPInterleaveRecipe at its insertion point.
3618 VPDominatorTree VPDT(Plan);
3619 for (const auto *IG : InterleaveGroups) {
3620 // Skip interleave groups where members don't have recipes. This can happen
3621 // when removeDeadRecipes removes recipes that are part of interleave groups
3622 // but have no users.
3623 if (llvm::any_of(IG->members(), [&IRMemberToRecipe](Instruction *Member) {
3624 return !IRMemberToRecipe.contains(Member);
3625 }))
3626 continue;
3627
3628 auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
3629 VPIRMetadata InterleaveMD(*Start);
3630 SmallVector<VPValue *, 4> StoredValues;
3631 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start->getAsRecipe()))
3632 StoredValues.push_back(StoreR->getStoredValue());
3633 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3634 Instruction *MemberI = IG->getMember(I);
3635 if (!MemberI)
3636 continue;
3637 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
3638 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR->getAsRecipe()))
3639 StoredValues.push_back(StoreR->getStoredValue());
3640 InterleaveMD.intersect(*MemoryR);
3641 }
3642
3643 bool NeedsMaskForGaps =
3644 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3645 (!StoredValues.empty() && !IG->isFull());
3646
3647 Instruction *IRInsertPos = IG->getInsertPos();
3648 auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
3649 VPRecipeBase *InsertPosR = InsertPos->getAsRecipe();
3650
3652 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3653 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3654 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3655
3656 // Get or create the start address for the interleave group.
3657 VPValue *Addr = Start->getAddr();
3658 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3659 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPosR)) {
3660 // We cannot re-use the address of member zero because it does not
3661 // dominate the insert position. Instead, use the address of the insert
3662 // position and create a PtrAdd adjusting it to the address of member
3663 // zero.
3664 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3665 // InsertPos or sink loads above zero members to join it.
3666 assert(IG->getIndex(IRInsertPos) != 0 &&
3667 "index of insert position shouldn't be zero");
3668 auto &DL = IRInsertPos->getDataLayout();
3669 APInt Offset(32,
3670 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3671 IG->getIndex(IRInsertPos),
3672 /*IsSigned=*/true);
3673 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3674 VPBuilder B(InsertPosR);
3675 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3676 }
3677 // If the group is reverse, adjust the index to refer to the last vector
3678 // lane instead of the first. We adjust the index from the first vector
3679 // lane, rather than directly getting the pointer for lane VF - 1, because
3680 // the pointer operand of the interleaved access is supposed to be uniform.
3681 if (IG->isReverse()) {
3682 auto *ReversePtr = new VPVectorEndPointerRecipe(
3683 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3684 -(int64_t)IG->getFactor(), NW, InsertPosR->getDebugLoc());
3685 ReversePtr->insertBefore(InsertPosR);
3686 Addr = ReversePtr;
3687 }
3688 auto *VPIG = new VPInterleaveRecipe(
3689 IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps,
3690 InterleaveMD, InsertPosR->getDebugLoc());
3691 VPIG->insertBefore(InsertPosR);
3692
3693 unsigned J = 0;
3694 for (unsigned i = 0; i < IG->getFactor(); ++i)
3695 if (Instruction *Member = IG->getMember(i)) {
3696 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member)->getAsRecipe();
3697 if (!Member->getType()->isVoidTy()) {
3698 VPValue *OriginalV = MemberR->getVPSingleValue();
3699 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3700 J++;
3701 }
3702 MemberR->eraseFromParent();
3703 }
3704 }
3705}
3706
3707/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3708/// value, phi and backedge value. In the following example:
3709///
3710/// vector.ph:
3711/// Successor(s): vector loop
3712///
3713/// <x1> vector loop: {
3714/// vector.body:
3715/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3716/// ...
3717/// EMIT branch-on-count ...
3718/// No successors
3719/// }
3720///
3721/// WIDEN-INDUCTION will get expanded to:
3722///
3723/// vector.ph:
3724/// ...
3725/// vp<%induction.start> = ...
3726/// vp<%induction.increment> = ...
3727///
3728/// Successor(s): vector loop
3729///
3730/// <x1> vector loop: {
3731/// vector.body:
3732/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3733/// ...
3734/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3735/// EMIT branch-on-count ...
3736/// No successors
3737/// }
3738static void
3740 VPTypeAnalysis &TypeInfo) {
3741 VPlan *Plan = WidenIVR->getParent()->getPlan();
3742 VPValue *Start = WidenIVR->getStartValue();
3743 VPValue *Step = WidenIVR->getStepValue();
3744 VPValue *VF = WidenIVR->getVFValue();
3745 DebugLoc DL = WidenIVR->getDebugLoc();
3746
3747 // The scalar type of the induction variable being expanded; it may be
3748 // narrower than the step's type (truncated induction).
3749 Type *Ty = WidenIVR->getScalarType();
3750
3751 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3754 VPIRFlags Flags = *WidenIVR;
3755 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3756 AddOp = Instruction::Add;
3757 MulOp = Instruction::Mul;
3758 } else {
3759 AddOp = ID.getInductionOpcode();
3760 MulOp = Instruction::FMul;
3761 }
3762
3763 // If the phi is truncated, truncate the start and step values.
3764 VPBuilder Builder(Plan->getVectorPreheader());
3765 Type *StepTy = Step->getScalarType();
3766 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3767 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3768 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3769 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3770 StepTy = Ty;
3771 }
3772
3773 // Construct the initial value of the vector IV in the vector loop preheader.
3774 Type *IVIntTy =
3776 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3777 if (StepTy->isFloatingPointTy())
3778 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3779
3780 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3781 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3782
3783 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3784 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3785 DebugLoc::getUnknown(), "induction");
3786
3787 // Create the widened phi of the vector IV.
3788 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3789 Init, WidenIVR->getDebugLoc(), "vec.ind");
3790
3791 // Create the backedge value for the vector IV.
3792 VPValue *Inc;
3793 VPValue *Prev;
3794 // If unrolled, use the increment and prev value from the operands.
3795 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3796 Inc = SplatVF;
3797 Prev = WidenIVR->getLastUnrolledPartOperand();
3798 } else {
3799 if (VPRecipeBase *R = VF->getDefiningRecipe())
3800 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3801 // Multiply the vectorization factor by the step using integer or
3802 // floating-point arithmetic as appropriate.
3803 if (StepTy->isFloatingPointTy())
3804 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3805 DL);
3806 else
3807 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3808 TypeInfo.inferScalarType(VF), DL);
3809
3810 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3811 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3812 Prev = WidePHI;
3813 }
3814
3816 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3817 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3818 WidenIVR->getDebugLoc(), "vec.ind.next");
3819
3820 WidePHI->addOperand(Next);
3821
3822 WidenIVR->replaceAllUsesWith(WidePHI);
3823}
3824
3825/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3826/// initial value, phi and backedge value. In the following example:
3827///
3828/// <x1> vector loop: {
3829/// vector.body:
3830/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3831/// ...
3832/// EMIT branch-on-count ...
3833/// }
3834///
3835/// WIDEN-POINTER-INDUCTION will get expanded to:
3836///
3837/// <x1> vector loop: {
3838/// vector.body:
3839/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3840/// EMIT %mul = mul %stepvector, %step
3841/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3842/// ...
3843/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3844/// EMIT branch-on-count ...
3845/// }
3847 VPTypeAnalysis &TypeInfo) {
3848 VPlan *Plan = R->getParent()->getPlan();
3849 VPValue *Start = R->getStartValue();
3850 VPValue *Step = R->getStepValue();
3851 VPValue *VF = R->getVFValue();
3852
3853 assert(R->getInductionDescriptor().getKind() ==
3855 "Not a pointer induction according to InductionDescriptor!");
3856 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3857 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3858 "Recipe should have been replaced");
3859
3860 VPBuilder Builder(R);
3861 DebugLoc DL = R->getDebugLoc();
3862
3863 // Build a scalar pointer phi.
3864 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3865
3866 // Create actual address geps that use the pointer phi as base and a
3867 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3868 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3869 Type *StepTy = TypeInfo.inferScalarType(Step);
3870 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3871 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3872 VPValue *PtrAdd =
3873 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3874 R->replaceAllUsesWith(PtrAdd);
3875
3876 // Create the backedge value for the scalar pointer phi.
3878 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3879 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3880 DL);
3881 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3882
3883 VPValue *InductionGEP =
3884 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3885 ScalarPtrPhi->addOperand(InductionGEP);
3886}
3887
3888/// Expand a VPDerivedIVRecipe into executable recipes.
3890 VPBuilder Builder(R);
3891 VPIRValue *Start = R->getStartValue();
3892 VPValue *Step = R->getStepValue();
3893 VPValue *Index = R->getIndex();
3894 Type *StepTy = TypeInfo.inferScalarType(Step);
3895 Type *IndexTy = TypeInfo.inferScalarType(Index);
3896 Index = StepTy->isIntegerTy()
3897 ? Builder.createScalarSExtOrTrunc(
3898 Index, StepTy, IndexTy, DebugLoc::getCompilerGenerated())
3899 : Builder.createScalarCast(Instruction::SIToFP, Index, StepTy,
3901 switch (R->getInductionKind()) {
3903 assert(TypeInfo.inferScalarType(Index) == TypeInfo.inferScalarType(Start) &&
3904 "Index type does not match StartValue type");
3905 return R->replaceAllUsesWith(Builder.createAdd(
3906 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
3907 }
3909 return R->replaceAllUsesWith(Builder.createPtrAdd(
3910 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
3912 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
3913 const FPMathOperator *FPBinOp = R->getFPBinOp();
3914 assert(FPBinOp &&
3915 (FPBinOp->getOpcode() == Instruction::FAdd ||
3916 FPBinOp->getOpcode() == Instruction::FSub) &&
3917 "Original BinOp should be defined for FP induction");
3918 FastMathFlags FMF = FPBinOp->getFastMathFlags();
3919 VPValue *FMul = Builder.createNaryOp(Instruction::FMul, {Step, Index}, FMF);
3920 return R->replaceAllUsesWith(
3921 Builder.createNaryOp(FPBinOp->getOpcode(), {Start, FMul}, FMF));
3922 }
3924 return;
3925 }
3926 llvm_unreachable("Unhandled induction kind");
3927}
3928
3930 // Replace loop regions with explicit CFG.
3931 SmallVector<VPRegionBlock *> LoopRegions;
3933 vp_depth_first_deep(Plan.getEntry()))) {
3934 if (!R->isReplicator())
3935 LoopRegions.push_back(R);
3936 }
3937 for (VPRegionBlock *R : LoopRegions)
3938 R->dissolveToCFGLoop();
3939}
3940
3943 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3944 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3947 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3948 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3949 }
3950
3951 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3952 // single-condition branches:
3953 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3954 // the first condition is true, and otherwise jumps to a new interim block.
3955 // 2. A branch that ends the interim block, jumps to the second successor if
3956 // the second condition is true, and otherwise jumps to the third
3957 // successor.
3958 for (VPInstruction *Br : WorkList) {
3959 assert(Br->getNumOperands() == 2 &&
3960 "BranchOnTwoConds must have exactly 2 conditions");
3961 DebugLoc DL = Br->getDebugLoc();
3962 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3963 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3964 assert(Successors.size() == 3 &&
3965 "BranchOnTwoConds must have exactly 3 successors");
3966
3967 for (VPBlockBase *Succ : Successors)
3968 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3969
3970 VPValue *Cond0 = Br->getOperand(0);
3971 VPValue *Cond1 = Br->getOperand(1);
3972 VPBlockBase *Succ0 = Successors[0];
3973 VPBlockBase *Succ1 = Successors[1];
3974 VPBlockBase *Succ2 = Successors[2];
3975 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3976 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3977
3978 VPBasicBlock *InterimBB =
3979 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
3980
3981 VPBuilder(BrOnTwoCondsBB)
3983 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
3984 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
3985
3987 VPBlockUtils::connectBlocks(InterimBB, Succ1);
3988 VPBlockUtils::connectBlocks(InterimBB, Succ2);
3989 Br->eraseFromParent();
3990 }
3991}
3992
3994 VPTypeAnalysis TypeInfo(Plan);
3997 vp_depth_first_deep(Plan.getEntry()))) {
3998 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3999 VPBuilder Builder(&R);
4000 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4001 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
4002 ToRemove.push_back(WidenIVR);
4003 continue;
4004 }
4005
4006 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4007 // If the recipe only generates scalars, scalarize it instead of
4008 // expanding it.
4009 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4010 VPValue *PtrAdd =
4011 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4012 WidenIVR->replaceAllUsesWith(PtrAdd);
4013 ToRemove.push_back(WidenIVR);
4014 continue;
4015 }
4016 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
4017 ToRemove.push_back(WidenIVR);
4018 continue;
4019 }
4020
4021 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {
4022 expandVPDerivedIV(DerivedIVR, TypeInfo);
4023 ToRemove.push_back(DerivedIVR);
4024 continue;
4025 }
4026
4027 if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(&R)) {
4028 VPValue *CanIV = WideCanIV->getCanonicalIV();
4029 Type *CanIVTy = TypeInfo.inferScalarType(CanIV);
4030 VPValue *Step = WideCanIV->getStepValue();
4031 if (!Step) {
4032 assert(Plan.getConcreteUF() == 1 &&
4033 "Expected unroller to have materialized step for UF != 1");
4034 Step = Plan.getZero(CanIVTy);
4035 }
4036 CanIV = Builder.createNaryOp(VPInstruction::Broadcast, CanIV);
4037 Step = Builder.createNaryOp(VPInstruction::Broadcast, Step);
4038 Step = Builder.createAdd(
4039 Step, Builder.createNaryOp(VPInstruction::StepVector, {}, CanIVTy));
4040 VPValue *CanVecIV =
4041 Builder.createAdd(CanIV, Step, WideCanIV->getDebugLoc(), "vec.iv",
4042 WideCanIV->getNoWrapFlags());
4043 WideCanIV->replaceAllUsesWith(CanVecIV);
4044 ToRemove.push_back(WideCanIV);
4045 continue;
4046 }
4047
4048 // Expand VPBlendRecipe into VPInstruction::Select.
4049 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
4050 VPValue *Select = Blend->getIncomingValue(0);
4051 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4052 Select = Builder.createSelect(Blend->getMask(I),
4053 Blend->getIncomingValue(I), Select,
4054 R.getDebugLoc(), "predphi", *Blend);
4055 Blend->replaceAllUsesWith(Select);
4056 ToRemove.push_back(Blend);
4057 }
4058
4059 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4060 if (!VEPR->getOffset()) {
4061 assert(Plan.getConcreteUF() == 1 &&
4062 "Expected unroller to have materialized offset for UF != 1");
4063 VEPR->materializeOffset();
4064 }
4065 }
4066
4067 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4068 Expr->decompose();
4069 ToRemove.push_back(Expr);
4070 }
4071
4072 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4073 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4074 if (LastActiveL &&
4075 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4076 // Create Not(Mask) for all operands.
4078 for (VPValue *Op : LastActiveL->operands()) {
4079 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4080 NotMasks.push_back(NotMask);
4081 }
4082
4083 // Create FirstActiveLane on the inverted masks.
4084 VPValue *FirstInactiveLane = Builder.createFirstActiveLane(
4085 NotMasks, LastActiveL->getDebugLoc(), "first.inactive.lane");
4086
4087 // Subtract 1 to get the last active lane.
4088 VPValue *One =
4089 Plan.getConstantInt(TypeInfo.inferScalarType(FirstInactiveLane), 1);
4090 VPValue *LastLane =
4091 Builder.createSub(FirstInactiveLane, One,
4092 LastActiveL->getDebugLoc(), "last.active.lane");
4093
4094 LastActiveL->replaceAllUsesWith(LastLane);
4095 ToRemove.push_back(LastActiveL);
4096 continue;
4097 }
4098
4099 // Lower MaskedCond with block mask to LogicalAnd.
4101 auto *VPI = cast<VPInstruction>(&R);
4102 assert(VPI->isMasked() &&
4103 "Unmasked MaskedCond should be simplified earlier");
4104 VPI->replaceAllUsesWith(Builder.createNaryOp(
4105 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4106 ToRemove.push_back(VPI);
4107 continue;
4108 }
4109
4110 // Lower CanonicalIVIncrementForPart to plain Add.
4111 if (match(
4112 &R,
4114 auto *VPI = cast<VPInstruction>(&R);
4115 VPValue *Add = Builder.createOverflowingOp(
4116 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4117 VPI->getDebugLoc());
4118 VPI->replaceAllUsesWith(Add);
4119 ToRemove.push_back(VPI);
4120 continue;
4121 }
4122
4123 // Lower BranchOnCount to ICmp + BranchOnCond.
4124 VPValue *IV, *TC;
4125 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4126 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4127 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4128 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4129 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4130 ToRemove.push_back(BranchOnCountInst);
4131 continue;
4132 }
4133
4134 VPValue *VectorStep;
4135 VPValue *ScalarStep;
4137 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4138 continue;
4139
4140 // Expand WideIVStep.
4141 auto *VPI = cast<VPInstruction>(&R);
4142 Type *IVTy = TypeInfo.inferScalarType(VPI);
4143 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
4145 ? Instruction::UIToFP
4146 : Instruction::Trunc;
4147 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4148 }
4149
4150 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4151 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4152 ScalarStep =
4153 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4154 }
4155
4156 VPIRFlags Flags;
4157 unsigned MulOpc;
4158 if (IVTy->isFloatingPointTy()) {
4159 MulOpc = Instruction::FMul;
4160 Flags = VPI->getFastMathFlags();
4161 } else {
4162 MulOpc = Instruction::Mul;
4163 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4164 }
4165
4166 VPInstruction *Mul = Builder.createNaryOp(
4167 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4168 VectorStep = Mul;
4169 VPI->replaceAllUsesWith(VectorStep);
4170 ToRemove.push_back(VPI);
4171 }
4172 }
4173
4174 for (VPRecipeBase *R : ToRemove)
4175 R->eraseFromParent();
4176}
4177
4179 VPBasicBlock *HeaderVPBB,
4180 VPBasicBlock *LatchVPBB,
4181 VPBasicBlock *MiddleVPBB,
4182 UncountableExitStyle Style) {
4183 struct EarlyExitInfo {
4184 VPBasicBlock *EarlyExitingVPBB;
4185 VPIRBasicBlock *EarlyExitVPBB;
4186 VPValue *CondToExit;
4187 };
4188
4189 VPDominatorTree VPDT(Plan);
4190 VPBuilder Builder(LatchVPBB->getTerminator());
4192 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4193 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4194 if (Pred == MiddleVPBB)
4195 continue;
4196 // Collect condition for this early exit.
4197 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4198 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4199 VPValue *CondOfEarlyExitingVPBB;
4200 [[maybe_unused]] bool Matched =
4201 match(EarlyExitingVPBB->getTerminator(),
4202 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4203 assert(Matched && "Terminator must be BranchOnCond");
4204
4205 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4206 // the correct block mask.
4207 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4208 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4210 TrueSucc == ExitBlock
4211 ? CondOfEarlyExitingVPBB
4212 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4213 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4214 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4215 VPDT.properlyDominates(
4216 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4217 LatchVPBB)) &&
4218 "exit condition must dominate the latch");
4219 Exits.push_back({
4220 EarlyExitingVPBB,
4221 ExitBlock,
4222 CondToEarlyExit,
4223 });
4224 }
4225 }
4226
4227 assert(!Exits.empty() && "must have at least one early exit");
4228 // Sort exits by RPO order to get correct program order. RPO gives a
4229 // topological ordering of the CFG, ensuring upstream exits are checked
4230 // before downstream exits in the dispatch chain.
4232 HeaderVPBB);
4234 for (const auto &[Num, VPB] : enumerate(RPOT))
4235 RPOIdx[VPB] = Num;
4236 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4237 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4238 });
4239#ifndef NDEBUG
4240 // After RPO sorting, verify that for any pair where one exit dominates
4241 // another, the dominating exit comes first. This is guaranteed by RPO
4242 // (topological order) and is required for the dispatch chain correctness.
4243 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4244 for (unsigned J = I + 1; J < Exits.size(); ++J)
4245 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4246 Exits[I].EarlyExitingVPBB) &&
4247 "RPO sort must place dominating exits before dominated ones");
4248#endif
4249
4250 // Build the AnyOf condition for the latch terminator using logical OR
4251 // to avoid poison propagation from later exit conditions when an earlier
4252 // exit is taken.
4253 VPValue *Combined = Exits[0].CondToExit;
4254 for (const EarlyExitInfo &Info : drop_begin(Exits))
4255 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4256
4257 VPValue *IsAnyExitTaken =
4258 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4259
4261 "Early exit store masking not implemented");
4262
4263 // Create the vector.early.exit blocks.
4264 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4265 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4266 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4267 VPBasicBlock *VectorEarlyExitVPBB =
4268 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4269 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4270 }
4271
4272 // Create the dispatch block (or reuse the single exit block if only one
4273 // exit). The dispatch block computes the first active lane of the combined
4274 // condition and, for multiple exits, chains through conditions to determine
4275 // which exit to take.
4276 VPBasicBlock *DispatchVPBB =
4277 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4278 : Plan.createVPBasicBlock("vector.early.exit.check");
4279 DispatchVPBB->setPredecessors({LatchVPBB});
4280 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4281 VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(
4282 {Combined}, DebugLoc::getUnknown(), "first.active.lane");
4283
4284 // For each early exit, disconnect the original exiting block
4285 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4286 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4287 // values at the first active lane:
4288 //
4289 // Input:
4290 // early.exiting.I:
4291 // ...
4292 // EMIT branch-on-cond vp<%cond.I>
4293 // Successor(s): in.loop.succ, ir-bb<exit.I>
4294 //
4295 // ir-bb<exit.I>:
4296 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4297 //
4298 // Output:
4299 // early.exiting.I:
4300 // ...
4301 // Successor(s): in.loop.succ
4302 //
4303 // vector.early.exit.I:
4304 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4305 // Successor(s): ir-bb<exit.I>
4306 //
4307 // ir-bb<exit.I>:
4308 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4309 // vector.early.exit.I)
4310 //
4311 for (auto [Exit, VectorEarlyExitVPBB] :
4312 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4313 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4314 // Adjust the phi nodes in EarlyExitVPBB.
4315 // 1. remove incoming values from EarlyExitingVPBB,
4316 // 2. extract the incoming value at FirstActiveLane
4317 // 3. add back the extracts as last operands for the phis
4318 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4319 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4320 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4321 // values from VectorEarlyExitVPBB.
4322 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4323 auto *ExitIRI = cast<VPIRPhi>(&R);
4324 VPValue *IncomingVal =
4325 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4326 VPValue *NewIncoming = IncomingVal;
4327 if (!isa<VPIRValue>(IncomingVal)) {
4328 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4329 NewIncoming = EarlyExitBuilder.createNaryOp(
4330 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4331 DebugLoc::getUnknown(), "early.exit.value");
4332 }
4333 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4334 ExitIRI->addOperand(NewIncoming);
4335 }
4336
4337 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4338 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4339 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4340 }
4341
4342 // Chain through exits: for each exit, check if its condition is true at
4343 // the first active lane. If so, take that exit; otherwise, try the next.
4344 // The last exit needs no check since it must be taken if all others fail.
4345 //
4346 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4347 //
4348 // latch:
4349 // ...
4350 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4351 // ...
4352 //
4353 // vector.early.exit.check:
4354 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4355 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4356 // EMIT branch-on-cond vp<%at.cond.0>
4357 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4358 //
4359 // vector.early.exit.check.0:
4360 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4361 // EMIT branch-on-cond vp<%at.cond.1>
4362 // Successor(s): vector.early.exit.1, vector.early.exit.2
4363 VPBasicBlock *CurrentBB = DispatchVPBB;
4364 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4365 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4366 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4367 DebugLoc::getUnknown(), "exit.cond.at.lane");
4368
4369 // For the last dispatch, branch directly to the last exit on false;
4370 // otherwise, create a new check block.
4371 bool IsLastDispatch = (I + 2 == Exits.size());
4372 VPBasicBlock *FalseBB =
4373 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4374 : Plan.createVPBasicBlock(
4375 Twine("vector.early.exit.check.") + Twine(I));
4376
4377 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4378 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4379 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4380 FalseBB->setPredecessors({CurrentBB});
4381
4382 CurrentBB = FalseBB;
4383 DispatchBuilder.setInsertPoint(CurrentBB);
4384 }
4385
4386 // Replace the latch terminator with the new branching logic.
4387 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4388 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4389 "Unexpected terminator");
4390 auto *IsLatchExitTaken =
4391 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4392 LatchExitingBranch->getOperand(1));
4393
4394 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4395 LatchExitingBranch->eraseFromParent();
4396 Builder.setInsertPoint(LatchVPBB);
4397 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4398 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4399 LatchVPBB->clearSuccessors();
4400 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4401}
4402
4403/// This function tries to convert extended in-loop reductions to
4404/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4405/// valid. The created recipe must be decomposed to its constituent
4406/// recipes before execution.
4407static VPExpressionRecipe *
4409 VFRange &Range) {
4410 Type *RedTy = Ctx.Types.inferScalarType(Red);
4411 VPValue *VecOp = Red->getVecOp();
4412
4413 assert(!Red->isPartialReduction() &&
4414 "This path does not support partial reductions");
4415
4416 // Clamp the range if using extended-reduction is profitable.
4417 auto IsExtendedRedValidAndClampRange =
4418 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4420 [&](ElementCount VF) {
4421 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4423
4425 InstructionCost ExtCost =
4426 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4427 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4428
4429 assert(!RedTy->isFloatingPointTy() &&
4430 "getExtendedReductionCost only supports integer types");
4431 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4432 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4433 Red->getFastMathFlags(), CostKind);
4434 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4435 },
4436 Range);
4437 };
4438
4439 VPValue *A;
4440 // Match reduce(ext)).
4442 IsExtendedRedValidAndClampRange(
4443 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4444 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4445 Ctx.Types.inferScalarType(A)))
4446 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4447
4448 return nullptr;
4449}
4450
4451/// This function tries to convert extended in-loop reductions to
4452/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4453/// and valid. The created VPExpressionRecipe must be decomposed to its
4454/// constituent recipes before execution. Patterns of the
4455/// VPExpressionRecipe:
4456/// reduce.add(mul(...)),
4457/// reduce.add(mul(ext(A), ext(B))),
4458/// reduce.add(ext(mul(ext(A), ext(B)))).
4459/// reduce.fadd(fmul(ext(A), ext(B)))
/// Returns the new VPExpressionRecipe, or nullptr if nothing was created.
/// Only Add, Sub and FAdd recurrence opcodes are considered.
4460static VPExpressionRecipe *
4462 VPCostContext &Ctx, VFRange &Range) {
4463 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4464 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4465 Opcode != Instruction::FAdd)
4466 return nullptr;
4467
4468 assert(!Red->isPartialReduction() &&
4469 "This path does not support partial reductions");
4470 Type *RedTy = Ctx.Types.inferScalarType(Red);
4471
4472 // Clamp the range if using multiply-accumulate-reduction is profitable.
4473 auto IsMulAccValidAndClampRange =
4475 VPWidenCastRecipe *OuterExt) -> bool {
4477 [&](ElementCount VF) {
// Without an extend on the first operand, the source type is the
// reduction type itself.
4479 Type *SrcTy =
4480 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4481 InstructionCost MulAccCost;
4482
4483 // getMulAccReductionCost for in-loop reductions does not support
4484 // mixed or floating-point extends.
4485 if (Ext0 && Ext1 &&
4486 (Ext0->getOpcode() != Ext1->getOpcode() ||
4487 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4488 return false;
4489
// With no extend present, the operands are treated as zero-extended for
// the cost query.
4490 bool IsZExt =
4491 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4492 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4493 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4494 SrcVecTy, CostKind);
4495
// Baseline: the summed cost of the individual mul, reduction and any
// extends that would be folded into the expression.
4496 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4497 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4498 InstructionCost ExtCost = 0;
4499 if (Ext0)
4500 ExtCost += Ext0->computeCost(VF, Ctx);
4501 if (Ext1)
4502 ExtCost += Ext1->computeCost(VF, Ctx);
4503 if (OuterExt)
4504 ExtCost += OuterExt->computeCost(VF, Ctx);
4505
4506 return MulAccCost.isValid() &&
4507 MulAccCost < ExtCost + MulCost + RedCost;
4508 },
4509 Range);
4510 };
4511
4512 VPValue *VecOp = Red->getVecOp();
4513 VPRecipeBase *Sub = nullptr;
4514 VPValue *A, *B;
4515 VPValue *Tmp = nullptr;
4516
// NOTE(review): floating-point reduction types bail out here, so the
// reduce.fadd pattern listed in the header comment does not appear to be
// bundled by this path, even though FAdd passes the opcode check above.
// Confirm whether this is intended.
4517 if (RedTy->isFloatingPointTy())
4518 return nullptr;
4519
4520 // Sub reductions could have a sub between the add reduction and vec op.
4521 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4522 Sub = VecOp->getDefiningRecipe();
4523 VecOp = Tmp;
4524 }
4525
4526 // If ValB is a constant and can be safely extended, truncate it to the same
4527 // type as ExtA's operand, then extend it to the same type as ExtA. This
4528 // creates two uniform extends that can more easily be matched by the rest of
4529 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4530 // replaced with the new extend of the constant.
4531 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4532 VPWidenCastRecipe *&ExtB,
4533 VPValue *&ValB, VPWidenRecipe *Mul) {
// Only applies when operand A is an extend, operand B is not, and B is a
// VPIRValue.
4534 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4535 return;
4536 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4537 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4538 const APInt *Const;
4539 if (!match(ValB, m_APInt(Const)) ||
4541 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4542 return;
4543 // The truncate ensures that the type of each extended operand is the
4544 // same, and it's been proven that the constant can be extended from
4545 // NarrowTy safely. Necessary since ExtA's extended operand would be
4546 // e.g. an i8, while the const will likely be an i32. This will be
4547 // elided by later optimisations.
4548 VPBuilder Builder(Mul);
4549 auto *Trunc =
4550 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4551 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4552 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4553 Mul->setOperand(1, ExtB);
4554 };
4555
4556 // Try to match reduce.add(mul(...)).
4557 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4558 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4559 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4560 auto *Mul = cast<VPWidenRecipe>(VecOp);
4561
4562 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4563 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4564
4565 // Match reduce.add/sub(mul(ext, ext)).
4566 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4567 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4568 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
// When a negating sub was peeled off above, it becomes part of the
// expression.
4569 if (Sub)
4570 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4571 cast<VPWidenRecipe>(Sub), Red);
4572 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4573 }
4574 // TODO: Add an expression type for this variant with a negated mul
4575 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4576 return new VPExpressionRecipe(Mul, Red);
4577 }
4578 // TODO: Add an expression type for negated versions of other expression
4579 // variants.
4580 if (Sub)
4581 return nullptr;
4582
4583 // Match reduce.add(ext(mul(A, B))).
4584 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4585 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4586 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4587 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
4588 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
4589
4590 // reduce.add(ext(mul(ext, const)))
4591 // -> reduce.add(ext(mul(ext, ext(const))))
4592 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4593
4594 // reduce.add(ext(mul(ext(A), ext(B))))
4595 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4596 // The inner extends must either have the same opcode as the outer extend or
4597 // be the same, in which case the multiply can never result in a negative
4598 // value and the outer extend can be folded away by doing wider
4599 // extends for the operands of the mul.
// NOTE(review): the profitability check is evaluated before
// Mul->hasOneUse(), so the range may already have been clamped when the
// one-use check fails. Confirm this ordering is intended.
4600 if (Ext0 && Ext1 &&
4601 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4602 Ext0->getOpcode() == Ext1->getOpcode() &&
4603 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
// Re-create the inner extends so they extend directly to the outer
// extend's scalar type.
4604 auto *NewExt0 = new VPWidenCastRecipe(
4605 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getScalarType(), nullptr,
4606 *Ext0, *Ext0, Ext0->getDebugLoc());
4607 NewExt0->insertBefore(Ext0);
4608
// If both mul operands are the same extend, reuse the single new extend.
4609 VPWidenCastRecipe *NewExt1 = NewExt0;
4610 if (Ext0 != Ext1) {
4611 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4612 Ext->getScalarType(), nullptr, *Ext1,
4613 *Ext1, Ext1->getDebugLoc());
4614 NewExt1->insertBefore(Ext1);
4615 }
// Replace the outer extend with a mul of the wider extends, then drop the
// old extend and mul.
4616 auto *NewMul = Mul->cloneWithOperands({NewExt0, NewExt1});
4617 NewMul->insertBefore(Mul);
4618 Ext->replaceAllUsesWith(NewMul);
4619 Ext->eraseFromParent();
4620 Mul->eraseFromParent();
4621 return new VPExpressionRecipe(NewExt0, NewExt1, NewMul, Red);
4622 }
4623 }
4624 return nullptr;
4625}
4626
4627/// This function tries to create abstract recipes from the reduction recipe for
4628/// following optimizations and cost estimation.
/// A multiply-accumulate expression is attempted first and an extended
/// reduction second. If neither is created, the function returns without
/// replacing \p Red.
4630 VPCostContext &Ctx,
4631 VFRange &Range) {
4632 // Creation of VPExpressions for partial reductions is entirely handled in
4633 // transformToPartialReduction.
4634 assert(!Red->isPartialReduction() &&
4635 "This path does not support partial reductions");
4636
4637 VPExpressionRecipe *AbstractR = nullptr;
// Insertion point for the new expression: directly after Red in its block.
4638 auto IP = std::next(Red->getIterator());
4639 auto *VPBB = Red->getParent();
4640 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4641 AbstractR = MulAcc;
4642 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4643 AbstractR = ExtRed;
4644 // Cannot create abstract inloop reduction recipes.
4645 if (!AbstractR)
4646 return;
4647
// Place the expression after Red and redirect all users of Red to it.
4648 AbstractR->insertBefore(*VPBB, IP);
4649 Red->replaceAllUsesWith(AbstractR);
4650}
4651
4662
// Nothing to broadcast when the plan only has a scalar VF.
4664 if (Plan.hasScalarVFOnly())
4665 return;
4666
// The dominator tree is only needed by the assertion below.
4667#ifndef NDEBUG
4668 VPDominatorTree VPDT(Plan);
4669#endif
4670
// Candidate values: the backedge-taken count (if present), the plan's
// live-ins, and every value defined by a recipe in the entry block.
4671 SmallVector<VPValue *> VPValues;
4672 if (VPValue *BTC = Plan.getBackedgeTakenCount())
4673 VPValues.push_back(BTC);
4674 append_range(VPValues, Plan.getLiveIns());
4675 for (VPRecipeBase &R : *Plan.getEntry())
4676 append_range(VPValues, R.definedValues());
4677
4678 auto *VectorPreheader = Plan.getVectorPreheader();
4679 for (VPValue *VPV : VPValues) {
4681 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4682 continue;
4683
4684 // Add explicit broadcast at the insert point that dominates all users.
4685 VPBasicBlock *HoistBlock = VectorPreheader;
4686 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4687 for (VPUser *User : VPV->users()) {
// Users that only need scalars do not constrain the broadcast position.
4688 if (User->usesScalars(VPV))
4689 continue;
// A vector user inside the preheader moves the insert point to the start
// of the block so the broadcast precedes that user.
4690 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4691 HoistPoint = HoistBlock->begin();
4692 else
4693 assert(VPDT.dominates(VectorPreheader,
4694 cast<VPRecipeBase>(User)->getParent()) &&
4695 "All users must be in the vector preheader or dominated by it");
4696 }
4697
4698 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4699 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
// Rewrite only the uses that need a vector, and never the broadcast's own
// operand.
4700 VPV->replaceUsesWithIf(Broadcast,
4701 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4702 return Broadcast != &U && !U.usesScalars(VPV);
4703 });
4704 }
4705}
4706
4707// Collect common metadata from a group of replicate recipes by intersecting
4708// metadata from all recipes in the group.
// The group must be non-empty: the first recipe's metadata seeds the result.
// Start from the first recipe's metadata and intersect with each remaining one.
4710 VPIRMetadata CommonMetadata = *Recipes.front();
4711 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4712 CommonMetadata.intersect(*Recipe);
4713 return CommonMetadata;
4714}
4715
// Collects groups of predicated replicate loads (Opcode == Load) or stores
// (Opcode == Store) where at least one member's mask is the negation of the
// first member's mask, or vice versa. Members of a group share the same
// loaded/stored value type.
4716template <unsigned Opcode>
4720 const Loop *L) {
4721 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4722 "Only Load and Store opcodes supported");
4723 constexpr bool IsLoad = (Opcode == Instruction::Load);
4724 VPTypeAnalysis TypeInfo(Plan);
4725
4726 // For each address, collect operations with the same or complementary masks.
// For a load the value type is the recipe's own result type; for a store it
// is the type of operand 0.
4728 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4729 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4730 };
4732 Plan, PSE, L,
4733 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4734 for (auto Recipes : Groups) {
// A single operation cannot be paired with anything.
4735 if (Recipes.size() < 2)
4736 continue;
4737
4738 // Collect groups with the same or complementary masks.
// Entries are nulled out once they have been placed in a group, so each
// recipe is considered at most once.
4739 for (VPReplicateRecipe *&RecipeI : Recipes) {
4740 if (!RecipeI)
4741 continue;
4742
4743 VPValue *MaskI = RecipeI->getMask();
4744 Type *TypeI = GetLoadStoreValueType(RecipeI);
4746 Group.push_back(RecipeI);
4747 RecipeI = nullptr;
4748
4749 // Find all operations with the same or complementary masks.
4750 bool HasComplementaryMask = false;
4751 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4752 if (!RecipeJ)
4753 continue;
4754
4755 VPValue *MaskJ = RecipeJ->getMask();
4756 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
// NOTE(review): membership is decided by value type alone; the masks are
// only compared against the first member's mask (MaskI), not pairwise
// among all members. Confirm this matches the intent of the comment below.
4757 if (TypeI == TypeJ) {
4758 // Check if any operation in the group has a complementary mask with
4759 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4760 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4761 match(MaskJ, m_Not(m_Specific(MaskI)));
4762 Group.push_back(RecipeJ);
4763 RecipeJ = nullptr;
4764 }
4765 }
4766
// Groups without any complementary pair are dropped.
4767 if (HasComplementaryMask) {
4768 assert(Group.size() >= 2 && "must have at least 2 entries");
4769 AllGroups.push_back(std::move(Group));
4770 }
4771 }
4772 }
4773
4774 return AllGroups;
4775}
4776
4777// Find the recipe with minimum alignment in the group.
// InstType is the IR instruction class (the callers in this file use LoadInst
// and StoreInst) that each recipe's underlying instruction is cast to in order
// to read its alignment. The group must be non-empty, since the result of
// min_element is dereferenced.
4778template <typename InstType>
4779static VPReplicateRecipe *
4781 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4782 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4783 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4784 });
4785}
4786
4789 const Loop *L) {
// For each collected group of predicated loads, replace all members with a
// single unpredicated load placed at the first member.
4790 auto Groups =
4792 if (Groups.empty())
4793 return;
4794
4795 // Process each group of loads.
4796 for (auto &Group : Groups) {
4797 // Try to use the earliest (most dominating) load to replace all others.
// NOTE(review): presumably groups are ordered so that Group[0] dominates the
// rest; the ordering is established by the collection helper, not here.
4798 VPReplicateRecipe *EarliestLoad = Group[0];
4799 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4800 VPBasicBlock *LastBB = Group.back()->getParent();
4801
4802 // Check that the load doesn't alias with stores between first and last.
4803 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4804 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4805 continue;
4806
4807 // Collect common metadata from all loads in the group.
4808 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4809
4810 // Find the load with minimum alignment to use.
4811 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4812
4813 bool IsSingleScalar = EarliestLoad->isSingleScalar();
4814 assert(all_of(Group,
4815 [IsSingleScalar](VPReplicateRecipe *R) {
4816 return R->isSingleScalar() == IsSingleScalar;
4817 }) &&
4818 "all members in group must agree on IsSingleScalar");
4819
4820 // Create an unpredicated version of the earliest load with common
4821 // metadata.
// The underlying instruction comes from the least-aligned member, while the
// address operand comes from the earliest load.
4822 auto *UnpredicatedLoad = new VPReplicateRecipe(
4823 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4824 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
4825
4826 UnpredicatedLoad->insertBefore(EarliestLoad);
4827
4828 // Replace all loads in the group with the unpredicated load.
4829 for (VPReplicateRecipe *Load : Group) {
4830 Load->replaceAllUsesWith(UnpredicatedLoad);
4831 Load->eraseFromParent();
4832 }
4833 }
4834}
4835
// Returns true if the stores in StoresToSink can be sunk to the last store's
// position according to canHoistOrSinkWithNoAliasCheck. Returns false if the
// first store has no memory location or no alias-scope metadata.
4836static bool
4838 PredicatedScalarEvolution &PSE, const Loop &L,
4839 VPTypeAnalysis &TypeInfo) {
// The first store's location is used for the whole group.
4840 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4841 if (!StoreLoc || !StoreLoc->AATags.Scope)
4842 return false;
4843
4844 // When sinking a group of stores, all members of the group alias each other.
4845 // Skip them during the alias checks.
4846 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4847 StoresToSink.end());
4848
// The check covers the range from the first store's block to the last
// store's block.
4849 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4850 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4851 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4852 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4853}
4854
4857 const Loop *L) {
// For each collected group of predicated stores that passes the alias check,
// replace all members with one unpredicated store at the last member's
// position. The stored value is picked by a chain of selects on the masks.
4858 auto Groups =
4860 if (Groups.empty())
4861 return;
4862
4863 VPTypeAnalysis TypeInfo(Plan);
4864
4865 for (auto &Group : Groups) {
4866 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4867 continue;
4868
4869 // Use the last (most dominated) store's location for the unconditional
4870 // store.
4871 VPReplicateRecipe *LastStore = Group.back();
4872 VPBasicBlock *InsertBB = LastStore->getParent();
4873
4874 // Collect common alias metadata from all stores in the group.
4875 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4876
4877 // Build select chain for stored values.
// Start with the first store's value. Each later store overrides it where
// that store's mask is true.
4878 VPValue *SelectedValue = Group[0]->getOperand(0);
4879 VPBuilder Builder(InsertBB, LastStore->getIterator());
4880
4881 bool IsSingleScalar = Group[0]->isSingleScalar();
4882 for (unsigned I = 1; I < Group.size(); ++I) {
4883 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4884 "all members in group must agree on IsSingleScalar");
4885 VPValue *Mask = Group[I]->getMask();
4886 VPValue *Value = Group[I]->getOperand(0);
4887 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4888 Group[I]->getDebugLoc());
4889 }
4890
4891 // Find the store with minimum alignment to use.
4892 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4893
4894 // Create unconditional store with selected value and common metadata.
// The address (operand 1) is taken from the last store.
4895 auto *UnpredicatedStore = new VPReplicateRecipe(
4896 StoreWithMinAlign->getUnderlyingInstr(),
4897 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
4898 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4899 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4900
4901 // Remove all predicated stores from the group.
4902 for (VPReplicateRecipe *Store : Group)
4903 Store->eraseFromParent();
4904 }
4905}
4906
4908 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4910 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4911 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4912
// Nothing to do if the trip count has no users.
4913 VPValue *TC = Plan.getTripCount();
4914 if (TC->getNumUsers() == 0)
4915 return;
4916
4917 // Skip cases for which the trip count may be non-trivial to materialize.
4918 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4919 // tail is required.
4920 if (!Plan.hasScalarTail() ||
4922 Plan.getScalarPreheader() ||
4923 !isa<VPIRValue>(TC))
4924 return;
4925
4926 // Materialize vector trip counts for constants early if it can simply
4927 // be computed as (Original TC / VF * UF) * VF * UF.
4928 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4929 // tail-folded loops.
4930 ScalarEvolution &SE = *PSE.getSE();
4931 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4932 if (!isa<SCEVConstant>(TCScev))
4933 return;
// Build (TC udiv VFxUF) * VFxUF as a SCEV. Only when it is a constant is it
// recorded as the underlying value of the plan's vector trip count.
4934 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4935 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4936 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4937 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4938}
4939
4941 VPBasicBlock *VectorPH) {
// Nothing to materialize if the backedge-taken count has no users.
4943 if (BTC->getNumUsers() == 0)
4944 return;
4945
// Compute "trip count - 1" at the start of VectorPH, in the trip count's
// scalar type, and use it in place of BTC everywhere.
4946 VPBuilder Builder(VectorPH, VectorPH->begin());
4947 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4948 auto *TCMO =
4949 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
4950 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4951 BTC->replaceAllUsesWith(TCMO);
4952}
4953
// Nothing to pack or unpack when the plan only has a scalar VF.
4955 if (Plan.hasScalarVFOnly())
4956 return;
4957
4958 VPTypeAnalysis TypeInfo(Plan);
4959 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4960 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4962 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4963 vp_depth_first_shallow(LoopRegion->getEntry()));
4964 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
4965 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
4966 // regions. Those are not materialized explicitly yet.
4967 // TODO: materialize build vectors for replicating recipes in replicating
4968 // regions.
4969 for (VPBasicBlock *VPBB :
4970 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4971 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4973 continue;
4974 auto *DefR = cast<VPSingleDefRecipe>(&R);
// True for users that need the vector form of DefR, or that are not
// directly inside the vector loop region.
4975 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4976 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4977 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4978 };
// Skip single-scalar replicate recipes, the VPInstructions excluded by the
// condition below, and definitions with no user that needs a packed value.
4979 if ((isa<VPReplicateRecipe>(DefR) &&
4980 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4981 (isa<VPInstruction>(DefR) &&
4983 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4984 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4985 continue;
4986
// The build opcode is chosen based on whether the scalar type is a struct.
4987 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4988 unsigned Opcode = ScalarTy->isStructTy()
4991 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4992 BuildVector->insertAfter(DefR);
4993
// Redirect qualifying users to the build vector, except the build vector
// itself, which must keep DefR as its operand.
4994 DefR->replaceUsesWithIf(
4995 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4996 VPUser &U, unsigned) {
4997 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4998 });
4999 }
5000 }
5001
5002 // Create explicit VPInstructions to convert vectors to scalars. The current
5003 // implementation is conservative - it may miss some cases that may or may not
5004 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5005 // if they are known to operate on scalar values.
5006 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5007 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5009 VPDerivedIVRecipe>(&R))
5010 continue;
5011 for (VPValue *Def : R.definedValues()) {
5012 // Skip recipes that are single-scalar or only have their first lane
5013 // used.
5014 // TODO: The Defs skipped here may or may not be vector values.
5015 // Introduce Unpacks, and remove them later, if they are guaranteed to
5016 // produce scalar values.
5018 continue;
5019
5020 // At the moment, we create unpacks only for scalar users outside
5021 // replicate regions. Recipes inside replicate regions still extract the
5022 // required lanes implicitly.
5023 // TODO: Remove once replicate regions are unrolled completely.
5024 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5025 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5026 return U->usesScalars(Def) &&
5027 (!ParentRegion || !ParentRegion->isReplicator());
5028 };
5029 if (none_of(Def->users(), IsCandidateUnpackUser))
5030 continue;
5031
// An unpack for a phi is placed at the first non-phi position of the
// block; otherwise it goes directly after the defining recipe.
5032 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
5033 if (R.isPhi())
5034 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5035 else
5036 Unpack->insertAfter(&R);
5037 Def->replaceUsesWithIf(Unpack,
5038 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5039 return IsCandidateUnpackUser(&U);
5040 });
5041 }
5042 }
5043 }
5044}
5045
5047 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
5048 bool RequiresScalarEpilogue, VPValue *Step,
5049 std::optional<uint64_t> MaxRuntimeStep) {
5050 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5051 // There's nothing to do if there are no users of the vector trip count or its
5052 // IR value has already been set.
5053 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5054 return;
5055
5056 VPValue *TC = Plan.getTripCount();
5057 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
// Default to the start of the preheader. If Step is defined by a recipe,
// insert after that recipe instead.
5058 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5059 if (auto *StepR = Step->getDefiningRecipe()) {
5060 assert(StepR->getParent() == VectorPHVPBB &&
5061 "Step must be defined in VectorPHVPBB");
5062 // Insert after Step's definition to maintain valid def-use ordering.
5063 InsertPt = std::next(StepR->getIterator());
5064 }
5065 VPBuilder Builder(VectorPHVPBB, InsertPt);
5066
5067 // For scalable steps, if TC is a constant and is divisible by the maximum
5068 // possible runtime step, then TC % Step == 0 for all valid vscale values
5069 // and the vector trip count equals TC directly.
// NOTE(review): assumes *MaxRuntimeStep is non-zero when set; a zero value
// would make the modulo below undefined. TODO confirm against callers.
5070 const APInt *TCVal;
5071 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
5072 TCVal->getZExtValue() % *MaxRuntimeStep == 0) {
5073 VectorTC.replaceAllUsesWith(TC);
5074 return;
5075 }
5076
5077 // If the tail is to be folded by masking, round the number of iterations N
5078 // up to a multiple of Step instead of rounding down. This is done by first
5079 // adding Step-1 and then rounding down. Note that it's ok if this addition
5080 // overflows: the vector induction variable will eventually wrap to zero given
5081 // that it starts at zero and its Step is a power of two; the loop will then
5082 // exit, with the last early-exit vector comparison also producing all-true.
5083 if (TailByMasking) {
5084 TC = Builder.createAdd(
5085 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5086 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5087 }
5088
5089 // Now we need to generate the expression for the part of the loop that the
5090 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5091 // iterations are not required for correctness, or N - Step, otherwise. Step
5092 // is equal to the vectorization factor (number of SIMD elements) times the
5093 // unroll factor (number of SIMD instructions).
5094 VPValue *R =
5095 Builder.createNaryOp(Instruction::URem, {TC, Step},
5096 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5097
5098 // There are cases where we *must* run at least one iteration in the remainder
5099 // loop. See the cost model for when this can happen. If the step evenly
5100 // divides the trip count, we set the remainder to be equal to the step. If
5101 // the step does not evenly divide the trip count, no adjustment is necessary
5102 // since there will already be scalar iterations. Note that the minimum
5103 // iterations check ensures that N >= Step.
5104 if (RequiresScalarEpilogue) {
// NOTE(review): the assertion message below says "fail folding"; presumably
// "tail folding" is meant.
5105 assert(!TailByMasking &&
5106 "requiring scalar epilogue is not supported with fail folding");
5107 VPValue *IsZero =
5108 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5109 R = Builder.createSelect(IsZero, Step, R);
5110 }
5111
// n.vec = (possibly rounded-up) trip count minus the remainder. It replaces
// every use of the symbolic vector trip count.
5112 VPValue *Res =
5113 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5114 VectorTC.replaceAllUsesWith(Res);
5115}
5116
5118 ElementCount VFEC) {
5119 // If VF and VFxUF have already been materialized (no remaining users),
5120 // there's nothing more to do.
5121 if (Plan.getVF().isMaterialized()) {
5122 assert(Plan.getVFxUF().isMaterialized() &&
5123 "VF and VFxUF must be materialized together");
5124 return;
5125 }
5126
// New values are created at the start of VectorPH, in the scalar type of the
// trip count.
5127 VPBuilder Builder(VectorPH, VectorPH->begin());
5128 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5129 VPValue &VF = Plan.getVF();
5130 VPValue &VFxUF = Plan.getVFxUF();
5131 // If there are no users of the runtime VF, compute VFxUF by constant folding
5132 // the multiplication of VF and UF.
5133 if (VF.getNumUsers() == 0) {
5134 VPValue *RuntimeVFxUF =
5135 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5136 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5137 return;
5138 }
5139
5140 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5141 // vscale) * UF.
5142 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
// Users that need VF as a vector get a broadcast of the runtime VF. The
// remaining (scalar) users are rewritten to RuntimeVF below.
5144 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5146 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5147 }
5148 VF.replaceAllUsesWith(RuntimeVF);
5149
// NOTE(review): the {true, false} argument presumably sets the wrap flags
// (NUW, NSW) on the multiply. Confirm against VPBuilder::createOverflowingOp.
5150 VPValue *MulByUF = Builder.createOverflowingOp(
5151 Instruction::Mul,
5152 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5153 {true, false});
5154 VFxUF.replaceAllUsesWith(MulByUF);
5155}
5156
// Expand the VPExpandSCEVRecipes at the start of the plan's entry block into
// IR placed before the entry basic block's terminator. Returns the map from
// each expanded SCEV to its IR value.
5159 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5160
5161 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5162 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5163 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5164 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5166 continue;
// Stop at the first recipe that is not a VPExpandSCEVRecipe.
5167 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5168 if (!ExpSCEV)
5169 break;
5170 const SCEV *Expr = ExpSCEV->getSCEV();
5171 Value *Res =
5172 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5173 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
// Replace the recipe with a live-in for the expanded IR value. If the recipe
// was the plan's trip count, update the trip count as well.
5174 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5175 ExpSCEV->replaceAllUsesWith(Exp);
5176 if (Plan.getTripCount() == ExpSCEV)
5177 Plan.resetTripCount(Exp);
5178 ExpSCEV->eraseFromParent();
5179 }
5181 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5182 "before any VPIRInstructions");
5183 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5184 // to the VPIRBasicBlock.
// Walk the IR instructions (excluding the terminator) in step with the
// block's recipes. An instruction already wrapped by the current
// VPIRInstruction is skipped.
5185 auto EI = Entry->begin();
5186 for (Instruction &I : drop_end(*EntryBB)) {
5187 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5188 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5189 EI++;
5190 continue;
5191 }
5193 }
5194
5195 return ExpandedSCEVs;
5196}
5197
5198/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5199/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5200/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5201/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5202/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5203/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5204/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5205/// is defined at \p Idx of a load interleave group.
5206static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5207 VPValue *OpV, unsigned Idx, bool IsScalable) {
5208 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5209 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5210 if (!Member0OpR)
5211 return Member0Op == OpV;
5212 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5213 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5214 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5215 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5216 Member0Op == OpV;
5217 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5218 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5219 return false;
5220}
5221
// Returns true if the values in Ops, one per interleave-group member, can all
// be narrowed. They must be single-def recipes sharing the opcode or intrinsic
// ID of Ops[0]. For every operand position, the members' operands must either
// be narrowable recursively or each pass canNarrowLoad.
5222static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5224 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5225 if (!WideMember0)
5226 return false;
5227 for (VPValue *V : Ops) {
5229 return false;
5230 auto *R = cast<VPSingleDefRecipe>(V);
5231 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5232 return false;
5233 }
5234
5235 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
// Gather the operand at position Idx from every member.
5237 for (VPValue *Op : Ops)
5238 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5239
// First try to narrow this operand position recursively as wide ops.
5240 if (canNarrowOps(OpsI, IsScalable))
5241 continue;
5242
// Otherwise every member's operand must be a narrowable load. The
// enumeration index is the member's position in the group.
5243 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5244 const auto &[OpIdx, OpV] = P;
5245 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5246 }))
5247 return false;
5248 }
5249
5250 return true;
5251}
5252
5253/// Returns VF from \p VFs if \p InterleaveR is a full interleave group with
5254/// factor and number of members both equal to VF. The interleave group must
5255/// also access the full vector width.
/// Returns std::nullopt if \p InterleaveR is null or masked, if its members
/// do not all have the same scalar type, or if no VF in \p VFs qualifies.
5256static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5258 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5259 if (!InterleaveR || InterleaveR->getMask())
5260 return std::nullopt;
5261
// No stored values: take element types from the defined values. Otherwise
// take them from the stored values. Either way all members must agree.
5262 Type *GroupElementTy = nullptr;
5263 if (InterleaveR->getStoredValues().empty()) {
5264 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
5265 if (!all_of(InterleaveR->definedValues(),
5266 [&TypeInfo, GroupElementTy](VPValue *Op) {
5267 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5268 }))
5269 return std::nullopt;
5270 } else {
5271 GroupElementTy =
5272 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
5273 if (!all_of(InterleaveR->getStoredValues(),
5274 [&TypeInfo, GroupElementTy](VPValue *Op) {
5275 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5276 }))
5277 return std::nullopt;
5278 }
5279
// The group must have no gaps: as many members as its factor.
5280 auto IG = InterleaveR->getInterleaveGroup();
5281 if (IG->getFactor() != IG->getNumMembers())
5282 return std::nullopt;
5283
// Known-minimum register bit width reported by TTI for the given VF.
5284 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5285 TypeSize Size = TTI.getRegisterBitWidth(
5288 assert(Size.isScalable() == VF.isScalable() &&
5289 "if Size is scalable, VF must be scalable and vice versa");
5290 return Size.getKnownMinValue();
5291 };
5292
// Return the first VF whose known-minimum lane count equals the interleave
// factor and whose group size in bits equals the register bit width.
5293 for (ElementCount VF : VFs) {
5294 unsigned MinVal = VF.getKnownMinValue();
5295 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5296 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5297 return {VF};
5298 }
5299 return std::nullopt;
5300}
5301
5302 /// Returns true if \p VPV is already narrow, i.e. it is either a VPIRValue or
/// is defined by a VPReplicateRecipe that is a single scalar.
5303 static bool isAlreadyNarrow(VPValue *VPV) {
5304 if (isa<VPIRValue>(VPV))
5305 return true;
5306 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5307 return RepR && RepR->isSingleScalar();
5308 }
5309
5310 // Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5311 // a narrow variant.
// Returns the value to use in place of \p V. Recipes created here are inserted
// before the recipe they stand in for and are recorded in \p NarrowedOps; the
// recipes they stand in for are not erased here.
5312 static VPValue *
5314 auto *R = V->getDefiningRecipe();
// Values without a defining recipe and values already recorded in
// NarrowedOps are returned unchanged.
5315 if (!R || NarrowedOps.contains(V))
5316 return V;
5317
5318 if (isAlreadyNarrow(V))
5319 return V;
5320
// Keep the recipe itself and narrow each of its operands in place.
5322 auto *WideMember0 = cast<VPSingleDefRecipe>(R);
5323 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5324 WideMember0->setOperand(
5325 Idx,
5326 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5327 return V;
5328 }
5329
5330 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5331 // Narrow interleave group to wide load, as transformed VPlan will only
5332 // process one original iteration.
// The new load is built from the group's insert-position instruction and
// reuses the group's address, mask and debug location.
5333 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5334 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5335 LoadGroup->getMask(), /*Consecutive=*/true,
5336 {}, LoadGroup->getDebugLoc());
5337 L->insertBefore(LoadGroup);
5338 NarrowedOps.insert(L);
5339 return L;
5340 }
5341
// A replicate recipe is only expected here as a single-scalar load; it is
// kept as is and merely recorded.
5342 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5343 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5344 "must be a single scalar load");
5345 NarrowedOps.insert(RepR);
5346 return RepR;
5347 }
5348
// Anything that reaches this point is expected to be a VPWidenLoadRecipe.
5349 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5350 VPValue *PtrOp = WideLoad->getAddr();
// Look through a vector-pointer recipe and use its first operand as the
// address of the scalar load.
5351 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5352 PtrOp = VecPtr->getOperand(0);
5353 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5354 // process one original iteration.
5355 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5356 /*IsUniform*/ true,
5357 /*Mask*/ nullptr, {}, *WideLoad);
5358 N->insertBefore(WideLoad);
5359 NarrowedOps.insert(N);
5360 return N;
5361 }
5362
// Tries to narrow the interleave groups in the vector loop of \p Plan for a
// single VF. Only single-block loops terminated by a branch-on-count are
// considered, and every interleave group in the loop must be narrowable for
// the same VF. On success the store groups are replaced by consecutive wide
// stores, their operand trees are narrowed and the induction step is adjusted.
//
// Returns nullptr if the plan is left untouched. If the transform is applied
// and \p Plan had more than one VF, \p Plan is restricted to the optimized VF
// and a clone holding the remaining VFs is returned. If \p Plan had a single
// VF, the transform is applied in place and the result is also null, so a null
// result alone does not tell whether narrowing happened.
5363 std::unique_ptr<VPlan>
5365 const TargetTransformInfo &TTI) {
5366 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5367
5368 if (!VectorLoop)
5369 return nullptr;
5370
5371 // Only handle single-block loops for now.
5372 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5373 return nullptr;
5374
5375 // Skip plans when we may not be able to properly narrow.
5376 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5377 if (!match(&Exiting->back(), m_BranchOnCount()))
5378 return nullptr;
5379
5380 assert(match(&Exiting->back(),
5382 m_Specific(&Plan.getVectorTripCount()))) &&
5383 "unexpected branch-on-count");
5384
5385 VPTypeAnalysis TypeInfo(Plan);
// The single VF all interleave groups seen so far can be narrowed for; unset
// until the first interleave group has been checked.
5387 std::optional<ElementCount> VFToOptimize;
5388 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5391 continue;
5392
5393 // Bail out on recipes not supported at the moment:
5394 // * phi recipes other than the canonical induction
5395 // * recipes writing to memory except interleave groups
5396 // Only support plans with a canonical induction phi.
5397 if (R.isPhi())
5398 return nullptr;
5399
5400 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5401 if (R.mayWriteToMemory() && !InterleaveR)
5402 return nullptr;
5403
5404 // Bail out if any recipe defines a vector value used outside the
5405 // vector loop region.
5406 if (any_of(R.definedValues(), [&](VPValue *V) {
5407 return any_of(V->users(), [&](VPUser *U) {
5408 auto *UR = cast<VPRecipeBase>(U);
5409 return UR->getParent()->getParent() != VectorLoop;
5410 });
5411 }))
5412 return nullptr;
5413
5414 // All other ops are allowed, but we reject uses that cannot be converted
5415 // when checking all allowed consumers (store interleave groups) below.
5416 if (!InterleaveR)
5417 continue;
5418
5419 // Try to find a single VF, where all interleave groups are consecutive and
5420 // saturate the full vector width. If we already have a candidate VF, check
5421 // if it is applicable for the current InterleaveR, otherwise look for a
5422 // suitable VF across the Plan's VFs.
5424 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5425 : to_vector(Plan.vectorFactors());
5426 std::optional<ElementCount> NarrowedVF =
5427 isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
5428 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5429 return nullptr;
5430 VFToOptimize = NarrowedVF;
5431
5432 // Skip read interleave groups.
5433 if (InterleaveR->getStoredValues().empty())
5434 continue;
5435
5436 // Narrow interleave groups, if all operands are already matching narrow
5437 // ops.
5438 auto *Member0 = InterleaveR->getStoredValues()[0];
5439 if (isAlreadyNarrow(Member0) &&
5440 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5441 StoreGroups.push_back(InterleaveR);
5442 continue;
5443 }
5444
5445 // For now, we only support full interleave groups storing load interleave
5446 // groups.
// Each stored value must be the member with the same index of a full load
// interleave group.
5447 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5448 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5449 if (!DefR)
5450 return false;
5451 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5452 return IR && IR->getInterleaveGroup()->isFull() &&
5453 IR->getVPValue(Op.index()) == Op.value();
5454 })) {
5455 StoreGroups.push_back(InterleaveR);
5456 continue;
5457 }
5458
5459 // Check if all values feeding InterleaveR are matching wide recipes, whose
5460 // operands can be narrowed.
5461 if (!canNarrowOps(InterleaveR->getStoredValues(),
5462 VFToOptimize->isScalable()))
5463 return nullptr;
5464 StoreGroups.push_back(InterleaveR);
5465 }
5466
// Nothing to do without at least one narrowable store group.
5467 if (StoreGroups.empty())
5468 return nullptr;
5469
5470 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
// A middle block whose only successor is the scalar preheader means the
// scalar loop always runs after the vector loop.
5471 bool RequiresScalarEpilogue =
5472 MiddleVPBB->getNumSuccessors() == 1 &&
5473 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5474 // Bail out for tail-folding (middle block with a single successor to exit).
5475 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5476 return nullptr;
5477
5478 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5479 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5480 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5481 // TODO: Handle cases where only some interleave groups can be narrowed.
5482 std::unique_ptr<VPlan> NewPlan;
5483 if (size(Plan.vectorFactors()) != 1) {
5484 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5485 Plan.setVF(*VFToOptimize);
5486 NewPlan->removeVF(*VFToOptimize);
5487 }
5488
5489 // Values recorded by narrowInterleaveGroupOp, shared across all store groups.
5490 SmallPtrSet<VPValue *, 4> NarrowedOps;
5491 // Narrow operation tree rooted at store groups.
5492 for (auto *StoreGroup : StoreGroups) {
// Only the first stored value is narrowed; it becomes the value written by
// the consecutive wide store that replaces the group.
5493 VPValue *Res =
5494 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5495 auto *SI =
5496 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5497 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
5498 /*Consecutive=*/true, {},
5499 StoreGroup->getDebugLoc());
5500 S->insertBefore(StoreGroup);
5501 StoreGroup->eraseFromParent();
5502 }
5503
5504 // Adjust induction to reflect that the transformed plan only processes one
5505 // original iteration.
5507 Type *CanIVTy = VectorLoop->getCanonicalIVType();
5508 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5509 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5510
5511 VPValue *UF = &Plan.getUF();
5512 VPValue *Step;
5513 if (VFToOptimize->isScalable()) {
// Scalable VF: the step becomes vscale * UF and uses of VF become vscale.
5514 VPValue *VScale =
5515 PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
5516 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5517 {true, false});
5518 Plan.getVF().replaceAllUsesWith(VScale);
5519 } else {
// Fixed VF: the step becomes UF and uses of VF become the constant 1.
5520 Step = UF;
5521 Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
5522 }
5523 // Materialize vector trip count with the narrowed step.
5524 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5525 RequiresScalarEpilogue, Step);
5526
5527 CanIVInc->setOperand(1, Step);
5528 Plan.getVFxUF().replaceAllUsesWith(Step);
5529
// Clean up the recipes that became dead by the replacements above.
5530 removeDeadRecipes(Plan);
5531 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5533 "All VPVectorPointerRecipes should have been removed");
5534 return NewPlan;
5535 }
5536
5537 /// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5538 /// BranchOnCond recipe.
/// The weights are {1, VectorStep - 1}, where VectorStep is the concrete UF
/// times the known-minimum value of \p VF, additionally multiplied by
/// \p VScaleForTuning if \p VF is scalable and a tuning value is provided.
5540 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5541 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5542 auto *MiddleTerm =
5544 // Only add branch metadata if there is a (conditional) terminator.
5545 if (!MiddleTerm)
5546 return;
5547
5548 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5549 "must have a BranchOnCond");
5550 // Assume that `TripCount % VectorStep ` is equally distributed.
5551 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5552 if (VF.isScalable() && VScaleForTuning.has_value())
5553 VectorStep *= *VScaleForTuning;
// VectorStep is unsigned, so VectorStep - 1 below would wrap around if it
// were zero.
5554 assert(VectorStep > 0 && "trip count should not be zero");
5555 MDBuilder MDB(Plan.getContext());
5556 MDNode *BranchWeights =
5557 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5558 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5559 }
5560
// For every first-order recurrence phi in the header of the vector loop region
// of the plan, rewrites the exit-phi users of the last-lane extract of the
// recurrence splice in the middle block so that they use the penultimate
// element of the recurrence instead.
5562 VFRange &Range) {
5563 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5564 auto *MiddleVPBB = Plan.getMiddleBlock();
5565 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5566 VPTypeAnalysis TypeInfo(Plan);
5567
5568 auto IsScalableOne = [](ElementCount VF) -> bool {
5569 return VF == ElementCount::getScalable(1);
5570 };
5571
5572 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5573 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5574 if (!FOR)
5575 continue;
5576
5577 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5578 "Cannot handle loops with uncountable early exits");
5579
5580 // Find the existing splice for this FOR, created in
5581 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
5582 // RecurSplice there; only RecurSplice itself still references FOR.
5583 auto *RecurSplice =
5585 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
5586
5587 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5588 // penultimate value of the recurrence. Instead we rely on the existing
5589 // extract of the last element from the result of
5590 // VPInstruction::FirstOrderRecurrenceSplice.
5591 // TODO: Consider vscale_range info and UF.
// Note that this returns from the whole function, so any recurrence phis
// after this one are not processed either.
5592 if (any_of(RecurSplice->users(),
5593 [](VPUser *U) { return !cast<VPRecipeBase>(U)->getRegion(); }) &&
5595 Range))
5596 return;
5597
5598 // This is the second phase of vectorizing first-order recurrences, creating
5599 // extracts for users outside the loop. An overview of the transformation is
5600 // described below. Suppose we have the following loop with some use after
5601 // the loop of the last a[i-1],
5602 //
5603 // for (int i = 0; i < n; ++i) {
5604 // t = a[i - 1];
5605 // b[i] = a[i] - t;
5606 // }
5607 // use t;
5608 //
5609 // There is a first-order recurrence on "a". For this loop, the shorthand
5610 // scalar IR looks like:
5611 //
5612 // scalar.ph:
5613 // s.init = a[-1]
5614 // br scalar.body
5615 //
5616 // scalar.body:
5617 // i = phi [0, scalar.ph], [i+1, scalar.body]
5618 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5619 // s2 = a[i]
5620 // b[i] = s2 - s1
5621 // br cond, scalar.body, exit.block
5622 //
5623 // exit.block:
5624 // use = lcssa.phi [s1, scalar.body]
5625 //
5626 // In this example, s1 is a recurrence because its value depends on the
5627 // previous iteration. In the first phase of vectorization, we created a
5628 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5629 // for users in the scalar preheader and exit block.
5630 //
5631 // vector.ph:
5632 // v_init = vector(..., ..., ..., a[-1])
5633 // br vector.body
5634 //
5635 // vector.body
5636 // i = phi [0, vector.ph], [i+4, vector.body]
5637 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5638 // v2 = a[i, i+1, i+2, i+3]
5639 // v1' = splice(v1(3), v2(0, 1, 2))
5640 // b[i, i+1, i+2, i+3] = v2 - v1'
5641 // br cond, vector.body, middle.block
5642 //
5643 // middle.block:
5644 // vector.recur.extract.for.phi = v2(2)
5645 // vector.recur.extract = v2(3)
5646 // br cond, scalar.ph, exit.block
5647 //
5648 // scalar.ph:
5649 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5650 // [s.init, otherwise]
5651 // br scalar.body
5652 //
5653 // scalar.body:
5654 // i = phi [0, scalar.ph], [i+1, scalar.body]
5655 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5656 // s2 = a[i]
5657 // b[i] = s2 - s1
5658 // br cond, scalar.body, exit.block
5659 //
5660 // exit.block:
5661 // lo = lcssa.phi [s1, scalar.body],
5662 // [vector.recur.extract.for.phi, middle.block]
5663 //
5664 // Update extracts of the splice in the middle block: they extract the
5665 // penultimate element of the recurrence.
5667 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5668 if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(RecurSplice))))
5669 continue;
5670
5671 auto *ExtractR = cast<VPInstruction>(&R);
// The penultimate element is taken from the splice's second operand.
5672 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5673 VPInstruction::ExtractPenultimateElement, RecurSplice->getOperand(1),
5674 {}, "vector.recur.extract.for.phi");
// Only VPIRPhi users of the extract are redirected; its other users keep
// using the original extract.
5675 for (VPUser *ExitU : to_vector(ExtractR->users())) {
5676 if (auto *ExitPhi = dyn_cast<VPIRPhi>(ExitU))
5677 ExitPhi->replaceUsesOfWith(ExtractR, PenultimateElement);
5678 }
5679 }
5680 }
5681 }
5682
5683 /// Check if \p V is a binary expression of a widened IV and a loop-invariant
5684 /// value. Returns the widened IV if found, nullptr otherwise.
/// Integer division and remainder operations are not accepted.
5686 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
5687 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
5688 Instruction::isIntDivRem(BinOp->getOpcode()))
5689 return nullptr;
5690
// The IV may be either operand; if operand 0 is not a widened induction,
// swap the candidates so that operand 1 is checked as the IV instead.
5691 VPValue *WidenIVCandidate = BinOp->getOperand(0);
5692 VPValue *InvariantCandidate = BinOp->getOperand(1);
5693 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
5694 std::swap(WidenIVCandidate, InvariantCandidate);
5695
5696 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
5697 return nullptr;
5698
// Yields nullptr if neither operand is a widened induction.
5699 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
5700 }
5701
5702 /// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
5703 /// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
/// \p BinOp itself is left untouched; the returned recipe is a clone of it.
5707 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
5708 auto *ClonedOp = BinOp->clone();
// Replace whichever of the two operands is the widened IV.
5709 if (ClonedOp->getOperand(0) == WidenIV) {
5710 ClonedOp->setOperand(0, ScalarIV);
5711 } else {
5712 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
5713 ClonedOp->setOperand(1, ScalarIV);
5714 }
// ScalarIV is expected to have a defining recipe, which serves as the
// insertion point.
5715 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
5716 return ClonedOp;
5717 }
5718
// For reduction phis in the header of the vector loop region whose backedge
// value selects between the phi and an expression that is an affine AddRec
// (or a binary operation on a widened IV that can be sunk), replaces the
// reduction with a min/max reduction over the IV values. The "condition was
// never true" case is detected either with a sentinel value outside the IV's
// range or, if no sentinel exists, with an additional boolean AnyOf reduction.
5721 Loop &L) {
5722 ScalarEvolution &SE = *PSE.getSE();
5723 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
5724
5725 // Helper lambda to check if the IV range excludes the sentinel value. Try
5726 // signed first, then unsigned. Return an excluded sentinel if found,
5727 // otherwise return std::nullopt.
// For a max reduction the candidate sentinel is the minimum value of the
// type, for a min reduction the maximum value.
5728 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
5729 bool UseMax) -> std::optional<APSInt> {
5730 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
5731 for (bool Signed : {true, false}) {
5732 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
5733 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
5734
5735 ConstantRange IVRange =
5736 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
5737 if (!IVRange.contains(Sentinel))
5738 return Sentinel;
5739 }
5740 return std::nullopt;
5741 };
5742
5743 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
5744 for (VPRecipeBase &Phi :
5745 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
5746 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
5748 PhiR->getRecurrenceKind()))
5749 continue;
5750
// Pointer and floating-point reductions are not handled.
5751 Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
5752 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
5753 continue;
5754
5755 // If there's a header mask, the backedge select will not be the find-last
5756 // select.
5757 VPValue *BackedgeVal = PhiR->getBackedgeValue();
5758 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
// With a header mask, the backedge value must be
// select(HeaderMask, <find-last select>, PhiR); a successful match is
// expected to rebind FindLastSelect to the inner select.
5759 if (HeaderMask &&
5760 !match(BackedgeVal,
5761 m_Select(m_Specific(HeaderMask),
5762 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
5763 continue;
5764
5765 // Get the find-last expression from the find-last select of the reduction
5766 // phi. The find-last select should be a select between the phi and the
5767 // find-last expression.
5768 VPValue *Cond, *FindLastExpression;
5769 if (!match(FindLastSelect, m_Select(m_VPValue(Cond), m_Specific(PhiR),
5770 m_VPValue(FindLastExpression))) &&
5771 !match(FindLastSelect,
5772 m_Select(m_VPValue(Cond), m_VPValue(FindLastExpression),
5773 m_Specific(PhiR))))
5774 continue;
5775
5776 // Check if FindLastExpression is a simple expression of a widened IV. If
5777 // so, we can track the underlying IV instead and sink the expression.
5778 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
5779 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
5780 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
5781 &L);
5782 const SCEV *Step;
5783 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5784 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
5786 "IVOfExpressionToSink not being an AddRec must imply "
5787 "FindLastExpression not being an AddRec.");
5788 continue;
5789 }
5790
5791 // Determine direction from SCEV step.
5792 if (!SE.isKnownNonZero(Step))
5793 continue;
5794
5795 // Positive step means we need UMax/SMax to find the last IV value, and
5796 // UMin/SMin otherwise.
// NOTE(review): Step is only known to be non-zero here, so a step that is
// not known positive is treated as a decreasing IV — confirm that steps of
// unknown sign cannot reach this point.
5797 bool UseMax = SE.isKnownPositive(Step);
5798 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
5799 bool UseSigned = SentinelVal && SentinelVal->isSigned();
5800
5801 // Sinking an expression will disable epilogue vectorization. Only use it,
5802 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
5803 // also prevent vectorizing using a sentinel (e.g., if the expression is a
5804 // multiply or divide by large constant, respectively), which also makes
5805 // sinking undesirable.
5806 if (IVOfExpressionToSink) {
5807 const SCEV *FindLastExpressionSCEV =
5808 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
// A successful match rebinds Step to the step of FindLastExpressionSCEV,
// even if no sentinel is found for it below.
5809 if (match(FindLastExpressionSCEV,
5810 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5811 bool NewUseMax = SE.isKnownPositive(Step);
5812 if (auto NewSentinel =
5813 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
5814 // The original expression already has a sentinel, so prefer not
5815 // sinking to keep epilogue vectorization possible.
5816 SentinelVal = *NewSentinel;
5817 UseSigned = NewSentinel->isSigned();
5818 UseMax = NewUseMax;
5819 IVSCEV = FindLastExpressionSCEV;
5820 IVOfExpressionToSink = nullptr;
5821 }
5822 }
5823 }
5824
5825 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
5826 // if the condition was ever true. Requires the IV to not wrap, otherwise we
5827 // cannot use min/max.
5828 if (!SentinelVal) {
// IVSCEV matched an affine AddRec above, so it is expected to be a
// SCEVAddRecExpr here.
5829 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
5830 if (AR->hasNoSignedWrap())
5831 UseSigned = true;
5832 else if (AR->hasNoUnsignedWrap())
5833 UseSigned = false;
5834 else
5835 continue;
5836 }
5837
5839 BackedgeVal,
5841
5842 VPValue *NewFindLastSelect = BackedgeVal;
5843 VPValue *SelectCond = Cond;
5844 if (!SentinelVal || IVOfExpressionToSink) {
5845 // When we need to create a new select, normalize the condition so that
5846 // PhiR is the last operand and include the header mask if needed.
5847 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
5848 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
// If PhiR is the select's true operand, invert the condition so that the
// expression is selected when the new condition is true.
5849 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
5850 SelectCond = LoopBuilder.createNot(SelectCond);
5851
5852 // When tail folding, mask the condition with the header mask to prevent
5853 // propagating poison from inactive lanes in the last vector iteration.
5854 if (HeaderMask)
5855 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
5856
// A new select is only needed if the condition changed or if the tracked
// value is the IV rather than the original expression.
5857 if (SelectCond != Cond || IVOfExpressionToSink) {
5858 NewFindLastSelect = LoopBuilder.createSelect(
5859 SelectCond,
5860 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
5861 PhiR, DL);
5862 }
5863 }
5864
5865 // Create the reduction result in the middle block using sentinel directly.
5866 RecurKind MinMaxKind =
5867 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
5868 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
5869 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
5870 FastMathFlags());
5871 DebugLoc ExitDL = RdxResult->getDebugLoc();
5872 VPBuilder MiddleBuilder(RdxResult);
5873 VPValue *ReducedIV =
5875 NewFindLastSelect, Flags, ExitDL);
5876
5877 // If IVOfExpressionToSink is an expression to sink, sink it now.
5878 VPValue *VectorRegionExitingVal = ReducedIV;
5879 if (IVOfExpressionToSink)
5880 VectorRegionExitingVal =
5881 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
5882 ReducedIV, IVOfExpressionToSink);
5883
5884 VPValue *NewRdxResult;
5885 VPValue *StartVPV = PhiR->getStartValue();
5886 if (SentinelVal) {
5887 // Sentinel-based approach: reduce IVs with min/max, compare against
5888 // sentinel to detect if condition was ever true, select accordingly.
5889 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
5890 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
5891 Sentinel, ExitDL);
5892 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
5893 StartVPV, ExitDL);
// The new reduction phi starts at the sentinel; the original start value
// is only used by the select above.
5894 StartVPV = Sentinel;
5895 } else {
5896 // Introduce a boolean AnyOf reduction to track if the condition was ever
5897 // true in the loop. Use it to select the initial start value, if it was
5898 // never true.
5899 auto *AnyOfPhi = new VPReductionPHIRecipe(
5900 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
5901 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
5902 AnyOfPhi->insertAfter(PhiR);
5903
5904 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
5905 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
// Operand 1 was created as a placeholder above; replace it with the
// accumulated condition now that it exists.
5906 AnyOfPhi->setOperand(1, OrVal);
5907
5908 NewRdxResult = MiddleBuilder.createAnyOfReduction(
5909 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
5910
5911 // Initialize the IV reduction phi with the neutral element, not the
5912 // original start value, to ensure correct min/max reduction results.
5913 StartVPV = Plan.getOrAddLiveIn(
5914 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
5915 }
5916 RdxResult->replaceAllUsesWith(NewRdxResult);
5917 RdxResult->eraseFromParent();
5918
// Replace the original phi with a FindIV reduction phi using the new start
// value and select.
5919 auto *NewPhiR = new VPReductionPHIRecipe(
5920 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
5921 *NewFindLastSelect, RdxUnordered{1}, {},
5922 PhiR->hasUsesOutsideReductionChain());
5923 NewPhiR->insertBefore(PhiR);
5924 PhiR->replaceAllUsesWith(NewPhiR);
5925 PhiR->eraseFromParent();
5926 }
5927 }
5928
5929namespace {
5930
5931 using ExtendKind = TTI::PartialReductionExtendKind;
/// Describes one extend feeding a reduction operand: the kind of extend and a
/// source type (presumably the type being extended from — TODO confirm).
/// The defaults (null type, PR_None) describe the absence of an extend.
5932 struct ReductionExtend {
5933 Type *SrcType = nullptr;
5934 ExtendKind Kind = ExtendKind::PR_None;
5935 };
5936
5937 /// Describes the extends used to compute the extended reduction operand.
5938 /// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
5939 /// operation.
5940 struct ExtendedReductionOperand {
5941 /// The recipe that consumes the extends.
5942 VPWidenRecipe *ExtendsUser = nullptr;
5943 /// Extend descriptions (inputs to getPartialReductionCost).
/// A default-constructed ExtendB (PR_None kind) means there is no second
/// extend.
5944 ReductionExtend ExtendA, ExtendB;
5945 };
5946
5947 /// A chain of recipes that form a partial reduction. Matches either
5948 /// reduction_bin_op (extended op, accumulator), or
5949 /// reduction_bin_op (accumulator, extended op).
5950 /// The possible forms of the "extended op" are listed in
5951 /// matchExtendedReductionOperand.
5952 struct VPPartialReductionChain {
5953 /// The top-level binary operation that forms the reduction to a scalar
5954 /// after the loop body.
5955 VPWidenRecipe *ReductionBinOp = nullptr;
5956 /// The user of the extends that is then reduced.
5957 ExtendedReductionOperand ExtendedOp;
5958 /// The recurrence kind for the entire partial reduction chain.
5959 /// This allows distinguishing between Sub and AddWithSub recurrences,
5960 /// when the ReductionBinOp is a Instruction::Sub.
/// Has no default initializer, so it must be set explicitly.
5961 RecurKind RK;
5962 /// The index of the accumulator operand of ReductionBinOp. The extended op
5963 /// is `1 - AccumulatorOpIdx`.
5964 unsigned AccumulatorOpIdx;
/// The VF scale factor of the partial reduction; it is passed as the
/// VFScaleFactor of the reduction recipe created for this chain.
5965 unsigned ScaleFactor;
5966 };
5967
/// Rewrites the extended operand \p Op of a partial reduction into an
/// equivalent form built from extends, using \p TypeInfo to infer scalar
/// types. Three patterns are handled, each described at its branch below.
/// Returns the recipe to use as the reduction operand afterwards: \p Op itself
/// if nothing changed or only one of its operands was replaced, otherwise a
/// newly created recipe.
5968 static VPSingleDefRecipe *
5969 optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op,
5970 VPTypeAnalysis &TypeInfo) {
5971 // reduce.add(mul(ext(A), C))
5972 // -> reduce.add(mul(ext(A), ext(trunc(C))))
5973 const APInt *Const;
5974 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
5975 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
5976 Instruction::CastOps ExtOpc = ExtA->getOpcode();
5977 Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
5978 if (!Op->hasOneUse() ||
5980 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
5981 return Op;
5982
// Replace the constant operand with ext(trunc(C)), using the same extend
// opcode and types as the extend of A.
5983 VPBuilder Builder(Op);
5984 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
5985 Op->getOperand(1), NarrowTy);
5986 Type *WideTy = TypeInfo.inferScalarType(ExtA);
5987 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
5988 return Op;
5989 }
5990
5991 // reduce.add(abs(sub(ext(A), ext(B))))
5992 // -> reduce.add(ext(absolute-difference(A, B)))
5993 VPValue *X, *Y;
5996 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
5997 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
5998 assert(Ext->getOpcode() ==
5999 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
6000 "Expected both the LHS and RHS extends to be the same");
6001 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
6002 VPBuilder Builder(Op);
6003 Type *SrcTy = TypeInfo.inferScalarType(X);
// The absolute difference is computed in the narrow type as
// max(X, Y) - min(X, Y) on frozen inputs, with signed or unsigned min/max
// chosen by the kind of the original extends.
6004 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
6005 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
6006 auto *Max = Builder.insert(
6007 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
6008 {FreezeX, FreezeY}, SrcTy));
6009 auto *Min = Builder.insert(
6010 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
6011 {FreezeX, FreezeY}, SrcTy));
6012 auto *AbsDiff =
6013 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
// The result is zero-extended to Op's type regardless of the kind of the
// original extends.
6014 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
6015 TypeInfo.inferScalarType(Op));
6016 }
6017
6018 // reduce.add(ext(mul(ext(A), ext(B))))
6019 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
6020 // TODO: Support this optimization for float types.
6022 m_ZExtOrSExt(m_VPValue()))))) {
6023 auto *Ext = cast<VPWidenCastRecipe>(Op);
6024 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
6025 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6026 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
// Give up if the multiply has other users or if the extends do not agree:
// both inner extends must have the same opcode, and the outer extend must
// match them unless both multiply operands are the same recipe.
6027 if (!Mul->hasOneUse() ||
6028 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
6029 MulLHS->getOpcode() != MulRHS->getOpcode())
6030 return Op;
6031 VPBuilder Builder(Mul);
// Extend the original narrow operands directly to the outer extend's type;
// a single new extend is shared if both operands were the same recipe.
6032 auto *NewLHS = Builder.createWidenCast(
6033 MulLHS->getOpcode(), MulLHS->getOperand(0), Ext->getScalarType());
6034 auto *NewRHS = MulLHS == MulRHS
6035 ? NewLHS
6036 : Builder.createWidenCast(MulRHS->getOpcode(),
6037 MulRHS->getOperand(0),
6038 Ext->getScalarType());
6039 auto *NewMul = Mul->cloneWithOperands({NewLHS, NewRHS});
6040 Builder.insert(NewMul);
// The outer extend and the old multiply are replaced by the new multiply
// and erased.
6041 Op->replaceAllUsesWith(NewMul);
6042 Op->eraseFromParent();
6043 Mul->eraseFromParent();
6044 return NewMul;
6045 }
6046
6047 return Op;
6048 }
6049
/// Bundles the partial reduction \p Red together with the recipes computing
/// its vector operand into a newly allocated VPExpressionRecipe. The vector
/// operand must have one of the forms listed below; otherwise
/// llvm_unreachable is hit. The returned recipe is not inserted into a block
/// here.
6050 static VPExpressionRecipe *
6051 createPartialReductionExpression(VPReductionRecipe *Red) {
6052 VPValue *VecOp = Red->getVecOp();
6053
6054 // reduce.[f]add(ext(op))
6055 // -> VPExpressionRecipe(op, red)
6056 if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
6057 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
6058
6059 // reduce.[f]add([f]mul(ext(a), ext(b)))
6060 // -> VPExpressionRecipe(a, b, mul, red)
6061 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
6062 match(VecOp,
6064 auto *Mul = cast<VPWidenRecipe>(VecOp);
6065 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6066 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6067 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
6068 }
6069
6070 // reduce.add(neg(mul(ext(a), ext(b))))
6071 // -> VPExpressionRecipe(a, b, mul, sub, red)
6073 m_ZExtOrSExt(m_VPValue()))))) {
// The multiply is read from the second operand of the negating sub.
6074 auto *Sub = cast<VPWidenRecipe>(VecOp);
6075 auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
6076 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6077 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6078 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
6079 }
6080
6081 llvm_unreachable("Unsupported expression");
6082 }
6083
// Overview: rewrites one link (Chain.ReductionBinOp) of the partial-reduction
// chain rooted at RdxPhi. The link is replaced by a VPReductionRecipe built
// from the accumulator operand and the (possibly negated / re-extended) other
// operand, and that recipe is in turn wrapped in a VPExpressionRecipe. When the
// link is the last one in the chain, the PHI and its ReductionStartVector are
// additionally scaled by Chain.ScaleFactor and, for Sub/FSub recurrences, a
// final subtract from the old start value is inserted after the reduction
// result.
// NOTE(review): this listing is missing several source lines (6117, 6131,
// 6141, 6183); the affected spots are marked below.
6084// Helper to transform a partial reduction chain into a partial reduction
6085// recipe. Assumes profitability has been checked.
6086static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6087 VPTypeAnalysis &TypeInfo, VPlan &Plan,
6088 VPReductionPHIRecipe *RdxPhi) {
6089 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6090 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6091
// AccumulatorOpIdx is 0 or 1; `1 - AccumulatorOpIdx` selects the other operand.
6092 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6093 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6094 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6095
6096 // Sub-reductions can be implemented in two ways:
6097 // (1) negate the operand in the vector loop (the default way).
6098 // (2) subtract the reduced value from the init value in the middle block.
6099 // Both ways keep the reduction itself as an 'add' reduction.
6100 //
6101 // The ISD nodes for partial reductions don't support folding the
6102 // sub/negation into its operands because the following is not a valid
6103 // transformation:
6104 // sub(0, mul(ext(a), ext(b)))
6105 // -> mul(ext(a), ext(sub(0, b)))
6106 //
6107 // It's therefore better to choose option (2) such that the partial
6108 // reduction is always positive (starting at '0') and to do a final
6109 // subtract in the middle block.
// This branch is option (1): a Sub link inside a chain whose recurrence kind
// is not Sub gets an explicit `0 - operand` recipe inserted before it.
6110 if (WidenRecipe->getOpcode() == Instruction::Sub &&
6111 Chain.RK != RecurKind::Sub) {
6112 VPBuilder Builder(WidenRecipe);
6113 Type *ElemTy = TypeInfo.inferScalarType(ExtendedOp);
6114 auto *Zero = Plan.getZero(ElemTy);
6115 auto *NegRecipe =
6116 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
// NOTE(review): listing line 6117 (the remaining constructor arguments and the
// end of the statement) is absent from this extract.
6118 Builder.insert(NegRecipe);
6119 ExtendedOp = NegRecipe;
6120 }
6121
6122 assert((Chain.RK != RecurKind::FAddChainWithSubs) &&
6123 "FSub chain reduction isn't supported");
6124
6125 // FIXME: Do these transforms before invoking the cost-model.
6126 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp, TypeInfo);
6127
6128 // Check if WidenRecipe is the final result of the reduction. If so look
6129 // through selects for predicated reductions.
6130 VPValue *Cond = nullptr;
// NOTE(review): listing line 6131, which declares ExitValue and opens the
// call whose arguments follow, is absent. From its uses below, ExitValue is
// null unless the select(Cond, WidenRecipe, RdxPhi) pattern matched — TODO
// confirm against the full file.
6132 WidenRecipe,
6133 m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
6134 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6135 RdxPhi->getBackedgeValue() == ExitValue;
6136 assert((!ExitValue || IsLastInChain) &&
6137 "if we found ExitValue, it must match RdxPhi's backedge value");
6138
6139 Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
6140 RecurKind RdxKind =
// NOTE(review): listing line 6141 (the initializer of RdxKind) is absent.
// Fast-math flags are forwarded only for FAdd reductions; other kinds get
// default-constructed flags.
6142 auto *PartialRed = new VPReductionRecipe(
6143 RdxKind,
6144 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6145 : FastMathFlags(),
6146 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6147 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6148 PartialRed->insertBefore(WidenRecipe);
6149
// Cond is only bound when the select pattern matched, so ExitValue is
// dereferenced only in that case.
6150 if (Cond)
6151 ExitValue->replaceAllUsesWith(PartialRed);
6152 WidenRecipe->replaceAllUsesWith(PartialRed);
6153
6154 // For cost-model purposes, fold this into a VPExpression.
6155 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6156 E->insertBefore(WidenRecipe);
6157 PartialRed->replaceAllUsesWith(E);
6158
6159 // We only need to update the PHI node once, which is when we find the
6160 // last reduction in the chain.
6161 if (!IsLastInChain)
6162 return;
6163
6164 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6165 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6166 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6167
// Operand 2 of the ReductionStartVector instruction is overwritten with the
// scale factor as a 32-bit constant.
6168 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6169 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6170 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6171 StartInst->setOperand(2, NewScaleFactor);
6172
6173 // If this is the last value in a sub-reduction chain, then update the PHI
6174 // node to start at `0` and update the reduction-result to subtract from
6175 // the PHI's start value.
6176 if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
6177 return;
6178
// The old operand 0 is saved and replaced by operand 1 of the start
// instruction (presumably the identity value, per the comment above — TODO
// confirm).
6179 VPValue *OldStartValue = StartInst->getOperand(0);
6180 StartInst->setOperand(0, StartInst->getOperand(1));
6181
6182 // Replace reduction_result by 'sub (startval, reductionresult)'.
// NOTE(review): listing line 6183, which declares RdxResult, is absent.
6184 assert(RdxResult && "Could not find reduction result");
6185
6186 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6187 unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
6188 : Instruction::BinaryOps::Sub;
6189 VPInstruction *NewResult = Builder.createNaryOp(
6190 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6191 RdxPhi->getDebugLoc());
// The predicate skips NewResult itself so that its own RdxResult operand is
// not rewritten.
6192 RdxResult->replaceUsesWithIf(
6193 NewResult,
6194 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6195}
6196
6197/// Returns the cost of a link in a partial-reduction chain for a given VF.
6198static InstructionCost
6199getPartialReductionLinkCost(VPCostContext &CostCtx,
6200 const VPPartialReductionChain &Link,
6201 ElementCount VF) {
6202 Type *RdxType = CostCtx.Types.inferScalarType(Link.ReductionBinOp);
6203 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6204 std::optional<unsigned> BinOpc = std::nullopt;
6205 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6206 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6207 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6208
6209 std::optional<llvm::FastMathFlags> Flags;
6210 if (RdxType->isFloatingPointTy())
6211 Flags = Link.ReductionBinOp->getFastMathFlags();
6212
6213 auto GetLinkOpcode = [&Link]() -> unsigned {
6214 switch (Link.RK) {
6215 case RecurKind::Sub:
6216 return Instruction::Add;
6217 case RecurKind::FSub:
6218 return Instruction::FAdd;
6219 default:
6220 return Link.ReductionBinOp->getOpcode();
6221 }
6222 };
6223
6224 return CostCtx.TTI.getPartialReductionCost(
6225 GetLinkOpcode(), ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType,
6226 RdxType, VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6227 CostCtx.CostKind, Flags);
6228}
6229
// Maps a widened cast recipe to the ExtendKind used by the partial-reduction
// matching and costing code in this file.
// NOTE(review): the function body (listing line 6231) is absent from this
// extract, so the exact mapping cannot be confirmed here.
6230static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6232}
6233
6234/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6235/// operand. This is an operand where the source of the value (e.g. a load) has
6236/// been extended (sext, zext, or fpext) before it is used in the reduction.
6237///
6238/// Possible forms matched by this function:
6239/// - UpdateR(PrevValue, ext(...))
6240/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6241/// - UpdateR(PrevValue, mul(ext(...), Constant))
6242/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6243/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6244/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...)))
6245///
6246/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
// Returns std::nullopt when none of the forms match. On success the result
// records the recipe that uses the extend(s) plus the source type and extend
// kind of each extend; ExtendB is left default-constructed for the
// single-extend forms.
// NOTE(review): this listing is missing source lines 6260-6262, 6281, 6311,
// 6323 and 6333; the affected spots are marked below.
6247static std::optional<ExtendedReductionOperand>
6248matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op,
6249 VPTypeAnalysis &TypeInfo) {
6250 assert(is_contained(UpdateR->operands(), Op) &&
6251 "Op should be operand of UpdateR");
6252
6253 // Try matching an absolute difference operand of the form
6254 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6255 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6256 // difference on a wider type and get the extend for "free" from the partial
6257 // reduction.
6258 VPValue *X, *Y;
6259 if (Op->hasOneUse() &&
// NOTE(review): listing lines 6260-6262 (the rest of this condition) are
// absent; presumably the pattern that binds X and Y to the extend sources —
// TODO confirm.
6263 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6264 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6265 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6266 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6267 Type *LHSInputType = TypeInfo.inferScalarType(X);
6268 Type *RHSInputType = TypeInfo.inferScalarType(Y);
// Both extends must have the same source type and the same cast opcode.
6269 if (LHSInputType != RHSInputType ||
6270 LHSExt->getOpcode() != RHSExt->getOpcode())
6271 return std::nullopt;
6272 // Note: This is essentially the same as matching ext(...) as we will
6273 // rewrite this operand to ext(absolute-difference(A, B)).
6274 return ExtendedReductionOperand{
6275 Sub,
6276 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6277 /*ExtendB=*/{}};
6278 }
6279
6280 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
// NOTE(review): listing line 6281, the condition that opens the block closed
// at 6306, is absent; the body treats Op as a VPWidenCastRecipe.
6282 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6283 VPValue *CastSource = CastRecipe->getOperand(0);
6284 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6285 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6286 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6287 // Match: ext(mul(...))
6288 // Record the outer extend kind and set `Op` to the mul. We can then match
6289 // this as a binary operation. Note: We can optimize out the outer extend
6290 // by widening the inner extends to match it. See
6291 // optimizeExtendsForPartialReduction.
6292 Op = CastSource;
6293 // FIXME: createPartialReductionExpression can't handle sub(ext(mul(...)))
6294 if (UpdateR->getOpcode() == Instruction::Sub)
6295 return std::nullopt;
6296 } else if (UpdateR->getOpcode() == Instruction::Add ||
6297 UpdateR->getOpcode() == Instruction::FAdd) {
6298 // Match: UpdateR(PrevValue, ext(...))
6299 // TODO: Remove the add/fadd restriction (we should be able to handle this
6300 // case for sub reductions too).
6301 return ExtendedReductionOperand{
6302 UpdateR,
6303 /*ExtendA=*/{TypeInfo.inferScalarType(CastSource), *OuterExtKind},
6304 /*ExtendB=*/{}};
6305 }
6306 }
6307
6308 if (!Op->hasOneUse())
6309 return std::nullopt;
6310
// NOTE(review): listing line 6311, which declares MulOp, is absent.
6312 if (!MulOp ||
6313 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6314 return std::nullopt;
6315
6316 // The rest of the matching assumes `Op` is a (possibly extended/negated)
6317 // binary operation.
6318
6319 VPValue *LHS = MulOp->getOperand(0);
6320 VPValue *RHS = MulOp->getOperand(1);
6321
6322 // The LHS of the operation must always be an extend.
// NOTE(review): listing line 6323 (the condition guarding this return) is
// absent.
6324 return std::nullopt;
6325
6326 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6327 Type *LHSInputType = TypeInfo.inferScalarType(LHSCast->getOperand(0));
6328 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6329
6330 // The RHS of the operation can be an extend or a constant integer.
6331 const APInt *RHSConst = nullptr;
6332 VPWidenCastRecipe *RHSCast = nullptr;
// NOTE(review): listing line 6333 (the condition selecting the extend case)
// is absent.
6334 RHSCast = cast<VPWidenCastRecipe>(RHS);
6335 else if (!match(RHS, m_APInt(RHSConst)) ||
6336 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6337 return std::nullopt;
6338
6339 // The outer extend kind must match the inner extends for folding.
// RHSCast stays null when the RHS is a constant, hence the null check on Cast.
6340 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6341 if (Cast && OuterExtKind &&
6342 getPartialReductionExtendKind(Cast) != OuterExtKind)
6343 return std::nullopt;
6344
// A constant RHS is reported with the LHS's source type and extend kind.
6345 Type *RHSInputType = LHSInputType;
6346 ExtendKind RHSExtendKind = LHSExtendKind;
6347 if (RHSCast) {
6348 RHSInputType = TypeInfo.inferScalarType(RHSCast->getOperand(0));
6349 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6350 }
6351
6352 return ExtendedReductionOperand{
6353 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6354}
6355
6356/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6357/// and determines if the target can use a cheaper operation with a wider
6358/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6359/// of operations in the reduction.
// Returns std::nullopt if no reduction result is found for the PHI, if any
// value on the walk from the exit value back to the PHI is not a binary
// VPWidenRecipe with an extended operand, or if the PHI's bit width is not a
// known scalar multiple of the extend's source bit width.
// NOTE(review): \p Range is not referenced in the lines visible here, and the
// declaration of `Chain` (listing line 6373) is absent from this extract.
6360static std::optional<SmallVector<VPPartialReductionChain>>
6361getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6362 VFRange &Range) {
6363 // Get the backedge value from the reduction PHI and find the
6364 // ComputeReductionResult that uses it (directly or through a select for
6365 // predicated reductions).
6366 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6367 if (!RdxResult)
6368 return std::nullopt;
6369 VPValue *ExitValue = RdxResult->getOperand(0);
// If the exit value is a select, ExitValue is rebound to the select's second
// operand; the match result itself is deliberately ignored.
6370 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6371
6372 VPTypeAnalysis &TypeInfo = CostCtx.Types;
// NOTE(review): listing line 6373 is absent here; it presumably declares the
// `Chain` container that is filled and returned below — TODO confirm.
6374 RecurKind RK = RedPhiR->getRecurrenceKind();
6375 Type *PhiType = TypeInfo.inferScalarType(RedPhiR);
6376 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6377
6378 // Work backwards from the ExitValue examining each reduction operation.
6379 VPValue *CurrentValue = ExitValue;
6380 while (CurrentValue != RedPhiR) {
6381 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6382 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6383 return std::nullopt;
6384
6385 VPValue *Op = UpdateR->getOperand(1);
6386 VPValue *PrevValue = UpdateR->getOperand(0);
6387
6388 // Find the extended operand. The other operand (PrevValue) is the next link
6389 // in the reduction chain.
// Operand 1 is tried first; if it does not match, operand 0 is tried and the
// roles of Op and PrevValue are swapped.
6390 std::optional<ExtendedReductionOperand> ExtendedOp =
6391 matchExtendedReductionOperand(UpdateR, Op, TypeInfo);
6392 if (!ExtendedOp) {
6393 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue, TypeInfo);
6394 if (!ExtendedOp)
6395 return std::nullopt;
6396 std::swap(Op, PrevValue);
6397 }
6398
6399 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6400 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6401 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6402 return std::nullopt;
6403
6404 // Check if a partial reduction chain is supported by the target (i.e. does
6405 // not have an invalid cost) for the given VF range. Clamps the range and
6406 // returns true if feasible for any VF.
// The link records the accumulator operand index (0 or 1) and the scale
// factor, i.e. PHI bit width divided by extend-source bit width.
6407 VPPartialReductionChain Link(
6408 {UpdateR, *ExtendedOp, RK,
6409 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
6410 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
6411 Chain.push_back(Link);
6412 CurrentValue = PrevValue;
6413 }
6414
6415 // The chain links were collected by traversing backwards from the exit value.
6416 // Reverse the chains so they are in program order.
6417 std::reverse(Chain.begin(), Chain.end());
6418 return Chain;
6419}
6420} // namespace
6421
6423 VPCostContext &CostCtx,
6424 VFRange &Range) {
6425 // Find all possible valid partial reductions, grouping chains by their PHI.
6426 // This grouping allows invalidating the whole chain, if any link is not a
6427 // valid partial reduction.
6429 ChainsByPhi;
6430 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6431 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6432 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6433 if (!RedPhiR)
6434 continue;
6435
6436 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6437 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6438 }
6439
6440 if (ChainsByPhi.empty())
6441 return;
6442
6443 // Build set of partial reduction operations for extend user validation and
6444 // a map of reduction bin ops to their scale factors for scale validation.
6445 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6446 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6447 for (const auto &[_, Chains] : ChainsByPhi)
6448 for (const VPPartialReductionChain &Chain : Chains) {
6449 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
6450 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6451 }
6452
6453 // A partial reduction is invalid if any of its extends are used by
6454 // something that isn't another partial reduction. This is because the
6455 // extends are intended to be lowered along with the reduction itself.
6456 auto ExtendUsersValid = [&](VPValue *Ext) {
6457 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
6458 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6459 });
6460 };
6461
6462 auto IsProfitablePartialReductionChainForVF =
6463 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
6464 InstructionCost PartialCost = 0, RegularCost = 0;
6465
6466 // The chain is a profitable partial reduction chain if the cost of handling
6467 // the entire chain is cheaper when using partial reductions than when
6468 // handling the entire chain using regular reductions.
6469 for (const VPPartialReductionChain &Link : Chain) {
6470 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6471 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
6472 if (!LinkCost.isValid())
6473 return false;
6474
6475 PartialCost += LinkCost;
6476 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
6477 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6478 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6479 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
6480 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
6481 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
6482 RegularCost += Extend->computeCost(VF, CostCtx);
6483 }
6484 return PartialCost.isValid() && PartialCost < RegularCost;
6485 };
6486
6487 // Validate chains: check that extends are only used by partial reductions,
6488 // and that reduction bin ops are only used by other partial reductions with
6489 // matching scale factors, are outside the loop region or the select
6490 // introduced by tail-folding. Otherwise we would create users of scaled
6491 // reductions where the types of the other operands don't match.
6492 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6493 for (const VPPartialReductionChain &Chain : Chains) {
6494 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
6495 Chains.clear();
6496 break;
6497 }
6498 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6499 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6500 return PhiR == RedPhiR;
6501 auto *R = cast<VPSingleDefRecipe>(U);
6502 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6504 m_Specific(Chain.ReductionBinOp))) ||
6505 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6506 m_Specific(RedPhiR)));
6507 };
6508 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6509 Chains.clear();
6510 break;
6511 }
6512
6513 // Check if the compute-reduction-result is used by a sunk store.
6514 // TODO: Also form partial reductions in those cases.
6515 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6516 if (any_of(RdxResult->users(), [](VPUser *U) {
6517 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6518 return RepR && RepR->getOpcode() == Instruction::Store;
6519 })) {
6520 Chains.clear();
6521 break;
6522 }
6523 }
6524 }
6525
6526 // Clear the chain if it is not profitable.
6528 [&, &Chains = Chains](ElementCount VF) {
6529 return IsProfitablePartialReductionChainForVF(Chains, VF);
6530 },
6531 Range))
6532 Chains.clear();
6533 }
6534
6535 for (auto &[Phi, Chains] : ChainsByPhi)
6536 for (const VPPartialReductionChain &Chain : Chains)
6537 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6538}
6539
6541 VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
6542 // Collect all loads/stores first. We will start with ones having simpler
6543 // decisions followed by more complex ones that are potentially
6544 // guided/dependent on the simpler ones.
6546 for (VPBasicBlock *VPBB :
6549 for (VPRecipeBase &R : *VPBB) {
6550 auto *VPI = dyn_cast<VPInstruction>(&R);
6551 if (VPI && VPI->getUnderlyingValue() &&
6552 is_contained({Instruction::Load, Instruction::Store},
6553 VPI->getOpcode()))
6554 MemOps.push_back(VPI);
6555 }
6556 }
6557
6558 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6559 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6560
6561 for (VPInstruction *VPI : MemOps) {
6562 auto ReplaceWith = [&](VPRecipeBase *New) {
6563 New->insertBefore(VPI);
6564 if (VPI->getOpcode() == Instruction::Load)
6565 VPI->replaceAllUsesWith(New->getVPSingleValue());
6566 VPI->eraseFromParent();
6567 };
6568
6569 // Note: we must do that for scalar VPlan as well.
6570 if (RecipeBuilder.replaceWithFinalIfReductionStore(VPI,
6571 FinalRedStoresBuilder))
6572 continue;
6573
6574 // Filter out scalar VPlan for the remaining memory operations.
6576 [](ElementCount VF) { return VF.isScalar(); }, Range))
6577 continue;
6578
6579 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
6580 ReplaceWith(Histogram);
6581 continue;
6582 }
6583
6584 VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
6585 if (!Recipe)
6586 Recipe = RecipeBuilder.handleReplication(VPI, Range);
6587
6588 ReplaceWith(Recipe);
6589 }
6590}
6591
6594 [&](ElementCount VF) { return VF.isScalar(); }, Range))
6595 return;
6596
6598 Plan.getEntry());
6600 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
6601 auto *VPI = dyn_cast<VPInstruction>(&R);
6602 if (!VPI)
6603 continue;
6604
6605 auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
6606 // Wouldn't be able to create a `VPReplicateRecipe` anyway.
6607 if (!I)
6608 continue;
6609
6610 // If executing other lanes produces side-effects we can't avoid them.
6611 if (VPI->mayHaveSideEffects())
6612 continue;
6613
6614 // We want to drop the mask operand, verify we can safely do that.
6615 if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
6616 continue;
6617
6618 // Avoid rewriting IV increment as that interferes with
6619 // `removeRedundantCanonicalIVs`.
6620 if (VPI->getOpcode() == Instruction::Add &&
6622 continue;
6623
6624 // Other lanes are needed - can't drop them.
6626 continue;
6627
6628 auto *Recipe = new VPReplicateRecipe(
6629 I, VPI->operandsWithoutMask(), /*IsSingleScalar=*/true,
6630 /*Mask=*/nullptr, *VPI, *VPI, VPI->getDebugLoc());
6631 Recipe->insertBefore(VPI);
6632 VPI->replaceAllUsesWith(Recipe);
6633 VPI->eraseFromParent();
6634 }
6635 }
6636}
6637
6638/// Returns true if \p Info's parameter kinds are compatible with \p Args.
6639static bool areVFParamsOk(const VFInfo &Info, ArrayRef<VPValue *> Args,
6640 PredicatedScalarEvolution &PSE, const Loop *L,
6641 VPTypeAnalysis &Types) {
6642 ScalarEvolution *SE = PSE.getSE();
6643 return all_of(Info.Shape.Parameters, [&](VFParameter Param) {
6644 switch (Param.ParamKind) {
6645 case VFParamKind::Vector:
6646 case VFParamKind::GlobalPredicate:
6647 return true;
6648 case VFParamKind::OMP_Uniform:
6649 return SE->isSCEVable(Types.inferScalarType(Args[Param.ParamPos])) &&
6650 SE->isLoopInvariant(
6651 vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
6652 L);
6653 case VFParamKind::OMP_Linear:
6654 return match(vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
6655 m_scev_AffineAddRec(
6656 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
6657 m_SpecificLoop(L)));
6658 default:
6659 return false;
6660 }
6661 });
6662}
6663
6664/// Find a vector variant of \p CI for \p VF, respecting \p MaskRequired.
6665/// Returns the variant function, or nullptr. Masked variants are assumed to
6666/// take the mask as a trailing parameter.
// NOTE(review): listing lines 6667 and 6669 (the start of the signature and
// one parameter line) are absent from this extract. The call site in this file
// passes (CI, Ops, VF, MaskRequired, PSE, L, Types).
6668 ElementCount VF, bool MaskRequired,
6670 const Loop *L, VPTypeAnalysis &Types) {
// Calls marked no-builtin never get a vector variant.
6671 if (CI->isNoBuiltin())
6672 return nullptr;
6673 auto Mappings = VFDatabase::getMappings(*CI);
// Pick the first mapping with the requested VF whose parameter kinds fit the
// arguments; when a mask is required, only masked mappings qualify.
6674 const auto *It = find_if(Mappings, [&](const VFInfo &Info) {
6675 return Info.Shape.VF == VF && (!MaskRequired || Info.isMasked()) &&
6676 areVFParamsOk(Info, Args, PSE, L, Types);
6677 });
6678 if (It == Mappings.end())
6679 return nullptr;
// The variant is looked up by name in the call's module.
6680 return CI->getModule()->getFunction(It->VectorName);
6681}
6682
6683namespace {
6684/// The outcome of choosing how to widen a call at a given VF.
6685struct CallWideningDecision {
6686 using KindTy = VPCostContext::CallWideningKind;
// Deliberately non-explicit: the decision code in this file returns a bare
// KindTy where a CallWideningDecision is expected.
6687 CallWideningDecision(KindTy Kind, Function *Variant = nullptr)
6688 : Kind(Kind), Variant(Variant) {}
6689 KindTy Kind;
6690
6691 /// Set when Kind == VectorVariant.
// NOTE(review): listing line 6692, which declares the `Variant` member
// initialized by the constructor above, is absent from this extract.
6693
// Two decisions are equal only if both the kind and the chosen variant match.
6694 bool operator==(const CallWideningDecision &Other) const {
6695 return Kind == Other.Kind && Variant == Other.Variant;
6696 }
6697};
6698} // namespace
6699
6700/// Pick the cheapest widening for the call \p VPI at \p VF among scalarization,
6701/// vector intrinsic, and vector library variant.
// Cost ties are resolved in favour of the intrinsic first, then the vector
// variant, then scalarization (see the `>=` comparisons below).
// NOTE(review): this listing is missing source lines 6703, 6713, 6715, 6719,
// 6728 and 6735-6736; the affected spots are marked below.
6702static CallWideningDecision decideCallWidening(VPInstruction &VPI,
// NOTE(review): listing line 6703 (a parameter; the body refers to it as
// `Ops`) is absent.
6704 ElementCount VF,
6705 VPCostContext &CostCtx) {
6706 auto *CI = cast<CallInst>(VPI.getUnderlyingInstr());
6707
6708 // Scalar VFs and calls forced or known to scalarize always replicate.
6709 if (VF.isScalar() || CostCtx.willBeScalarized(CI, VF))
6710 return CallWideningDecision::KindTy::Scalarize;
6711
6712 auto *CalledFn = cast<Function>(
// NOTE(review): listing line 6713 (the argument of this cast) is absent.
6714 Type *ResultTy = CostCtx.Types.inferScalarType(&VPI);
// NOTE(review): listing line 6715 is absent; it presumably declares `ID`,
// which is tested below — TODO confirm.
6716 bool MaskRequired = CostCtx.isMaskRequired(CI);
6717
6718 // Pseudo intrinsics (assume, lifetime, ...) are always scalarized.
// NOTE(review): listing line 6719 (the condition guarding this return) is
// absent.
6720 return CallWideningDecision::KindTy::Scalarize;
6721
6722 InstructionCost ScalarCost =
6723 VPReplicateRecipe::computeCallCost(CalledFn, ResultTy, Ops,
6724 /*IsSingleScalar=*/false, VF, CostCtx);
6725
6726 Function *VecFunc = findVectorVariant(CI, Ops, VF, MaskRequired, CostCtx.PSE,
6727 CostCtx.L, CostCtx.Types);
// NOTE(review): listing line 6728, which declares VecCallCost, is absent.
6729 if (VecFunc)
6730 VecCallCost = VPWidenCallRecipe::computeCallCost(VecFunc, CostCtx);
6731
6732 // Prefer the intrinsic if it is at least as cheap as scalarizing and any
6733 // available vector variant.
6734 if (ID) {
// NOTE(review): listing lines 6735-6736, which compute IntrinsicCost, are
// absent.
6737 if (IntrinsicCost.isValid() && ScalarCost >= IntrinsicCost &&
6738 (!VecFunc || VecCallCost >= IntrinsicCost))
6739 return CallWideningDecision::KindTy::Intrinsic;
6740 }
6741
6742 // Otherwise, use a vector library variant when it beats scalarizing.
6743 if (VecFunc && ScalarCost >= VecCallCost)
6744 return {CallWideningDecision::KindTy::VectorVariant, VecFunc};
6745
6746 return CallWideningDecision::KindTy::Scalarize;
6747}
6748
6750 VPRecipeBuilder &RecipeBuilder,
6751 VPCostContext &CostCtx) {
6755 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
6756 auto *VPI = dyn_cast<VPInstruction>(&R);
6757 if (!VPI || !VPI->getUnderlyingValue() ||
6758 VPI->getOpcode() != Instruction::Call)
6759 continue;
6760
6761 auto *CI = cast<CallInst>(VPI->getUnderlyingInstr());
6762 SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
6763 VPI->op_begin() + CI->arg_size());
6764
6765 CallWideningDecision Decision =
6766 decideCallWidening(*VPI, Ops, Range.Start, CostCtx);
6768 [&](ElementCount VF) {
6769 return Decision == decideCallWidening(*VPI, Ops, VF, CostCtx);
6770 },
6771 Range);
6772
6773 VPSingleDefRecipe *Replacement = nullptr;
6774 switch (Decision.Kind) {
6775 case CallWideningDecision::KindTy::Intrinsic: {
6777 Type *ResultTy = CostCtx.Types.inferScalarType(VPI);
6778 Replacement = new VPWidenIntrinsicRecipe(*CI, ID, Ops, ResultTy, *VPI,
6779 *VPI, VPI->getDebugLoc());
6780 break;
6781 }
6782 case CallWideningDecision::KindTy::VectorVariant: {
6783 // Masked variants take the mask as a trailing parameter, so they have
6784 // one more parameter than the original call's arguments.
6785 if (Decision.Variant->arg_size() > Ops.size()) {
6786 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
6787 Ops.push_back(Mask);
6788 }
6789 Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
6790 Replacement = new VPWidenCallRecipe(CI, Decision.Variant, Ops, *VPI,
6791 *VPI, VPI->getDebugLoc());
6792 break;
6793 }
6794 case CallWideningDecision::KindTy::Scalarize:
6795 Replacement = RecipeBuilder.handleReplication(VPI, Range);
6796 break;
6797 }
6798
6800 [&](ElementCount VF) {
6801 Intrinsic::ID IID =
6802 getVectorIntrinsicIDForCall(CI, &CostCtx.TLI);
6804 return true;
6805 auto Legacy = CostCtx.getLegacyCallKind(CI, VF);
6806 return !Legacy || *Legacy == Decision.Kind;
6807 }) &&
6808 "VPlan call widening decision must match legacy decision");
6809
6810 Replacement->insertBefore(VPI);
6811 VPI->replaceAllUsesWith(Replacement);
6812 ToErase.push_back(VPI);
6813 }
6814 }
6815 for (VPInstruction *VPI : ToErase)
6816 VPI->eraseFromParent();
6817}
6818
6821 Loop &L, VPCostContext &Ctx,
6822 VFRange &Range) {
6823 if (Plan.hasScalarVFOnly())
6824 return;
6825
6826 VPTypeAnalysis TypeInfo(Plan);
6827 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
6828 VPValue *I32VF = nullptr;
6830 vp_depth_first_shallow(VectorLoop->getEntry()))) {
6831 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
6832 auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
6833 // TODO: Support strided store.
6834 // TODO: Transform reverse access into strided access with -1 stride.
6835 // TODO: Transform gather/scatter with uniform address into strided access
6836 // with 0 stride.
6837 // TODO: Transform interleave access into multiple strided accesses.
6838 if (!LoadR || LoadR->isConsecutive())
6839 continue;
6840
6841 auto *Ptr = dyn_cast<VPWidenGEPRecipe>(LoadR->getAddr());
6842 if (!Ptr)
6843 continue;
6844
6845 // Check if this is a strided access by analyzing the address SCEV for an
6846 // affine addRec.
6847 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, &L);
6848 const SCEV *Start;
6849 const APInt *Step;
6850 // TODO: Support non-constant loop invariant stride.
6851 if (!match(PtrSCEV, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step),
6852 m_SpecificLoop(&L))))
6853 continue;
6854
6855 Type *LoadTy = TypeInfo.inferScalarType(LoadR);
6856 Align Alignment = LoadR->getAlign();
6857 auto IsProfitable = [&](ElementCount VF) {
6858 Type *DataTy = toVectorTy(LoadTy, VF);
6859 if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
6860 return false;
6861 const InstructionCost CurrentCost = LoadR->computeCost(VF, Ctx);
6862 const InstructionCost StridedLoadStoreCost =
6864 Intrinsic::experimental_vp_strided_load, DataTy,
6865 LoadR->isMasked(), Alignment, Ctx);
6866 return StridedLoadStoreCost < CurrentCost;
6867 };
6868
6870 Range))
6871 continue;
6872
6873 // Invalidate the legacy widening decision so the cost of replaced load is
6874 // not counted during precomputeCosts.
6875 // TODO: Remove once the legacy exit cost computation is retired.
6876 for (ElementCount VF : Range)
6877 Ctx.invalidateWideningDecision(&LoadR->getIngredient(), VF);
6878
6879 // Get VF as i32 for the vector length operand.
6880 if (!I32VF) {
6881 VPBuilder Builder(Plan.getVectorPreheader());
6882 I32VF = Builder.createScalarZExtOrTrunc(
6883 &Plan.getVF(), Type::getInt32Ty(Plan.getContext()),
6884 TypeInfo.inferScalarType(&Plan.getVF()), DebugLoc::getUnknown());
6885 }
6886
6887 VPBuilder Builder(LoadR);
6888 // Create the base pointer of strided access.
6889 VPValue *StartVPV = vputils::getOrCreateVPValueForSCEVExpr(Plan, Start);
6890 VPValue *StrideInBytes =
6891 Plan.getConstantInt(VectorLoop->getCanonicalIVType(),
6892 Step->getSExtValue(), /*IsSigned=*/true);
6893 auto *AddRecPtr = cast<SCEVAddRecExpr>(PtrSCEV);
6894 auto *Offset = Builder.createOverflowingOp(
6895 Instruction::Mul, {VectorLoop->getCanonicalIV(), StrideInBytes},
6896 {AddRecPtr->hasNoUnsignedWrap(), AddRecPtr->hasNoSignedWrap()});
6897 auto *BasePtr = Builder.createNoWrapPtrAdd(
6898 StartVPV, Offset,
6899 AddRecPtr->hasNoUnsignedWrap() ? GEPNoWrapFlags::noUnsignedWrap()
6901
6902 // Create a new vector pointer for strided access.
6903 VPValue *NewPtr = Builder.createVectorPointer(
6904 BasePtr, Type::getInt8Ty(Plan.getContext()), StrideInBytes,
6905 Ptr->getGEPNoWrapFlags(), Ptr->getDebugLoc());
6906
6907 VPValue *Mask = LoadR->getMask();
6908 if (!Mask)
6909 Mask = Plan.getTrue();
6910 auto *StridedLoad = Builder.createWidenMemIntrinsic(
6911 Intrinsic::experimental_vp_strided_load,
6912 {NewPtr, StrideInBytes, Mask, I32VF}, LoadTy, Alignment, *LoadR,
6913 LoadR->getDebugLoc());
6914 LoadR->replaceAllUsesWith(StridedLoad);
6915 }
6916 }
6917}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function's Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static CallWideningDecision decideCallWidening(VPInstruction &VPI, ArrayRef< VPValue * > Ops, ElementCount VF, VPCostContext &CostCtx)
Pick the cheapest widening for the call VPI at VF among scalarization, vector intrinsic,...
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static Function * findVectorVariant(CallInst *CI, ArrayRef< VPValue * > Args, ElementCount VF, bool MaskRequired, PredicatedScalarEvolution &PSE, const Loop *L, VPTypeAnalysis &Types)
Find a vector variant of CI for VF, respecting MaskRequired.
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with an EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV, in a deep-traversal of the v...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static std::optional< Intrinsic::ID > getVPDivRemIntrinsic(Intrinsic::ID IntrID)
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void expandVPDerivedIV(VPDerivedIVRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPDerivedIVRecipe into executable recipes.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static bool areVFParamsOk(const VFInfo &Info, ArrayRef< VPValue * > Args, PredicatedScalarEvolution &PSE, const Loop *L, VPTypeAnalysis &Types)
Returns true if Info's parameter kinds are compatible with Args.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static std::optional< Instruction::BinaryOps > getUnmaskedDivRemOpcode(Intrinsic::ID ID)
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Return true if we do not know how to (mechanically) hoist or sink a non-memory or memory recipe R out...
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
Get the last element.
Definition ArrayRef.h:150
const T & front() const
Get the first element.
Definition ArrayRef.h:144
iterator end() const
Definition ArrayRef.h:130
iterator begin() const
Definition ArrayRef.h:129
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:289
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
size_t arg_size() const
Definition Function.h:901
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags noUnsignedWrap()
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
The group of interleaved loads/stores sharing the same stride and close to each other.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1698
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
bool empty() const
Definition MapVector.h:79
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
@ SK_Broadcast
Broadcast element 0 to all other elements.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3912
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4263
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4338
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4290
iterator end()
Definition VPlan.h:4300
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4298
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4351
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:266
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:582
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:661
const VPRecipeBase & back() const
Definition VPlan.h:4312
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2887
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2925
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2915
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2931
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2911
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:93
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:314
VPRegionBlock * getParent()
Definition VPlan.h:185
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:236
size_t getNumSuccessors() const
Definition VPlan.h:236
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:305
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:221
VPlan * getPlan()
Definition VPlan.cpp:211
const std::string & getName() const
Definition VPlan.h:176
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:324
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:232
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:278
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:226
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:210
static auto blocksAs(T &&Range)
Return an iterator range over Range with each block cast to BlockTy.
Definition VPlanUtils.h:314
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:333
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:223
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:241
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:259
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:295
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:279
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3387
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createFirstActiveLane(ArrayRef< VPValue * > Masks, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1683
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3944
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:559
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:532
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:544
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:554
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4038
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3432
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2395
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2442
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2431
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2133
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4416
Class to record and manage LLVM IR flags.
Definition VPlan.h:696
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1172
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1227
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1461
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1316
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1267
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1312
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1262
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed-width vectors each containing a ...
Definition VPlan.h:1259
@ CanonicalIVIncrementForPart
Definition VPlan.h:1243
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1270
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3026
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3018
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3047
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3099
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:3057
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3574
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:401
VPBasicBlock * getParent()
Definition VPlan.h:476
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:554
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should be widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicateRecipe for VPI.
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:337
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3259
A recipe for handling reduction phis.
Definition VPlan.h:2789
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2840
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2833
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2851
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3150
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4473
const VPBlockBase * getEntry() const
Definition VPlan.h:4517
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4549
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:879
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4534
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4593
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4601
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4585
const VPBlockBase * getExiting() const
Definition VPlan.h:4529
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4542
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3304
bool isSingleScalar() const
Definition VPlan.h:3352
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
bool isPredicated() const
Definition VPlan.h:3354
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3371
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4108
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:610
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:681
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384
operand_range operands()
Definition VPlanValue.h:455
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:428
unsigned getNumOperands() const
Definition VPlanValue.h:422
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:423
void addOperand(VPValue *Operand)
Definition VPlanValue.h:417
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1508
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:208
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1511
unsigned getNumUsers() const
Definition VPlanValue.h:115
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1517
user_range users()
Definition VPlanValue.h:157
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2240
A recipe for widening Call instructions using library calls.
Definition VPlan.h:2067
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1848
Instruction::CastOps getOpcode() const
Definition VPlan.h:1884
A recipe for handling GEP instructions.
Definition VPlan.h:2175
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2466
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2500
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2503
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2523
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2554
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2601
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2605
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2627
A recipe for widening vector intrinsics.
Definition VPlan.h:1895
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
A common mixin class for widening memory operations.
Definition VPlan.h:3610
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
A recipe for widened phis.
Definition VPlan.h:2685
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1790
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1810
unsigned getOpcode() const
Definition VPlan.h:1829
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4621
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4946
bool hasVF(ElementCount VF) const
Definition VPlan.h:4844
const DataLayout & getDataLayout() const
Definition VPlan.h:4826
LLVMContext & getContext() const
Definition VPlan.h:4822
VPBasicBlock * getEntry()
Definition VPlan.h:4717
bool hasScalableVF() const
Definition VPlan.h:4845
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4780
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4801
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4851
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4917
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4820
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4923
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4995
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4949
bool hasUF(unsigned UF) const
Definition VPlan.h:4869
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4770
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4810
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4807
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4894
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4920
void setVF(ElementCount VF)
Definition VPlan.h:4832
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4885
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1098
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4872
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4794
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4746
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4972
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4914
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4722
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4817
bool hasScalarVFOnly() const
Definition VPlan.h:4862
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4760
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4813
void setUF(unsigned UF)
Definition VPlan.h:4877
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:5027
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1254
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4928
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2815
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:190
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignores it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:116
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:137
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:558
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179
void stable_sort(R &&Range)
Definition STLExtras.h:2115
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2077
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:288
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:78
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:83
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vector.
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and returns a new filter_iterator range.
Definition STLExtras.h:551
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the given extension kind.
Definition VPlan.cpp:1882
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FAddChainWithSubs
A chain of fadds and fsubs.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector.reduce.* intrinsic used to generate it.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
Definition STLExtras.h:2011
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:2087
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1408
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2165
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:325
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2145
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:305
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given value type T.
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2771
Holds the VFShape for a specific scalar to vector function mapping.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1978
CallWideningKind
Choice for how to widen a call at a given VF.
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
PredicatedScalarEvolution & PSE
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetLibraryInfo & TLI
std::optional< CallWideningKind > getLegacyCallKind(CallInst *CI, ElementCount VF) const
Returns the legacy call widening decision for CI at VF, or std::nullopt if none was recorded.
const TargetTransformInfo & TTI
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:246
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:147
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:286
Type * getType() const
Returns the scalar type of this symbolic value.
Definition VPlanValue.h:294
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:297
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3721
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3672
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3823
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3770
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decision prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses of the canonical ...
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void convertToStridedAccesses(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L, VPCostContext &Ctx, VFRange &Range)
Transform widen memory recipes into strided access recipes when legal and profitable.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...