LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
75 Phi->getName());
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
83 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
84 NewRecipe = new VPWidenStoreRecipe(
85 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
86 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
87 Ingredient.getDebugLoc());
89 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
90 Ingredient.getDebugLoc());
91 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
92 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
93 if (VectorID == Intrinsic::not_intrinsic)
94 return false;
95
96 // The noalias.scope.decl intrinsic declares a noalias scope that
97 // is valid for a single iteration. Emitting it as a single-scalar
98 // replicate would incorrectly extend the scope across multiple
99 // original iterations packed into one vector iteration.
100 // FIXME: If we want to vectorize this loop, then we have to drop
101 // all the associated !alias.scope and !noalias.
102 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
103 return false;
104
105 // These intrinsics are recognized by getVectorIntrinsicIDForCall
106 // but are not widenable. Emit them as replicate instead of widening.
107 if (VectorID == Intrinsic::assume ||
108 VectorID == Intrinsic::lifetime_end ||
109 VectorID == Intrinsic::lifetime_start ||
110 VectorID == Intrinsic::sideeffect ||
111 VectorID == Intrinsic::pseudoprobe) {
112 // If the operand of llvm.assume holds before vectorization, it will
113 // also hold per lane.
114 // llvm.pseudoprobe requires to be duplicated per lane for accurate
115 // sample count.
116 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
117 VectorID != Intrinsic::pseudoprobe;
118 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
119 /*IsSingleScalar=*/IsSingleScalar,
120 /*Mask=*/nullptr, *VPI, *VPI,
121 Ingredient.getDebugLoc());
122 } else {
123 NewRecipe = new VPWidenIntrinsicRecipe(
124 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
125 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
126 }
127 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
128 NewRecipe = new VPWidenCastRecipe(
129 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
130 VPIRFlags(*CI), VPIRMetadata(*CI));
131 } else {
132 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
133 *VPI, Ingredient.getDebugLoc());
134 }
135 } else {
137 "inductions must be created earlier");
138 continue;
139 }
140
141 NewRecipe->insertBefore(&Ingredient);
142 if (NewRecipe->getNumDefinedValues() == 1)
143 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
144 else
145 assert(NewRecipe->getNumDefinedValues() == 0 &&
146 "Only recpies with zero or one defined values expected");
147 Ingredient.eraseFromParent();
148 }
149 }
150 return true;
151}
152
153/// Helper for extra no-alias checks via known-safe recipe and SCEV.
155 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
156 VPReplicateRecipe &GroupLeader;
158 const Loop &L;
159 VPTypeAnalysis &TypeInfo;
160
161 // Return true if \p A and \p B are known to not alias for all VFs in the
162 // plan, checked via the distance between the accesses
// Only pairs of stores are handled; any other opcode yields false. The check
// succeeds when the difference of the two address SCEVs is a constant whose
// absolute value is at least MaxVF multiplied by the larger of the two store
// sizes. Every failure path returns false, i.e. "not proven".
163 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
164 if (A->getOpcode() != Instruction::Store ||
165 B->getOpcode() != Instruction::Store)
166 return false;
167
// For the stores handled here, operand 1 is used as the address and operand 0
// (see below) as the value whose type determines the store size.
168 VPValue *AddrA = A->getOperand(1);
169 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
170 VPValue *AddrB = B->getOperand(1);
171 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
// NOTE(review): listing line 172 is missing here; the condition guarding the
// following return is not visible (presumably a could-not-compute check on
// SCEVA/SCEVB) - confirm against the upstream file.
173 return false;
174
// Require the address difference to be a compile-time constant.
175 const APInt *Distance;
176 ScalarEvolution &SE = *PSE.getSE();
177 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
178 return false;
179
180 const DataLayout &DL = SE.getDataLayout();
181 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
182 uint64_t SizeA = DL.getTypeStoreSize(TyA);
183 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
184 uint64_t SizeB = DL.getTypeStoreSize(TyB);
185
186 // Use the maximum store size to ensure no overlap from either direction.
187 // Currently only handles fixed sizes, as it is only used for
188 // replicating VPReplicateRecipes.
189 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
190
191 auto VFs = B->getParent()->getPlan()->vectorFactors();
// NOTE(review): listing line 192 is missing here; the definition of MaxVF is
// not visible (presumably the largest element of VFs) - confirm upstream.
// Scalable VFs are rejected because the bound below needs a fixed value.
193 if (MaxVF.isScalable())
194 return false;
195 return Distance->abs().uge(
196 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
197 }
198
199public:
202 const Loop &L, VPTypeAnalysis &TypeInfo)
203 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
204 L(L), TypeInfo(TypeInfo) {}
205
206 /// Return true if \p R should be skipped during alias checking, either
207 /// because it's in the exclude set or because no-alias can be proven via
208 /// SCEV.
209 bool shouldSkip(VPRecipeBase &R) const {
210 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
211 return ExcludeRecipes.contains(&R) ||
212 (Store && isNoAliasViaDistance(Store, &GroupLeader));
213 }
214};
215
216/// Check if a memory operation doesn't alias with memory operations using
217/// scoped noalias metadata, in blocks in the single-successor chain between \p
218/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
219/// write to memory are checked (for load hoisting). Otherwise recipes that both
220/// read and write memory are checked, and SCEV is used to prove no-alias
221/// between the group leader and other replicate recipes (for store sinking).
222static bool
224 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
225 std::optional<SinkStoreInfo> SinkInfo = {}) {
226 bool CheckReads = SinkInfo.has_value();
227 if (!MemLoc.AATags.Scope)
228 return false;
229
230 for (VPBasicBlock *VPBB :
232 for (VPRecipeBase &R : *VPBB) {
233 if (SinkInfo && SinkInfo->shouldSkip(R))
234 continue;
235
236 // Skip recipes that don't need checking.
237 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
238 continue;
239
241 if (!Loc)
242 // Conservatively assume aliasing for memory operations without
243 // location.
244 return false;
245
247 return false;
248 }
249 }
250 return true;
251}
252
253/// Collect either replicated Loads or Stores grouped by their address SCEV, in
254/// a deep-traversal of the vector loop region in \p Plan.
255template <unsigned Opcode>
258 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
259 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
260 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
261 "Only Load and Store opcodes supported");
262 constexpr bool IsLoad = (Opcode == Instruction::Load);
264 RecipesByAddress;
267 for (VPRecipeBase &R : *VPBB) {
268 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
269 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
270 continue;
271
272 // For loads, operand 0 is address; for stores, operand 1 is address.
273 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
274 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
275 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
276 RecipesByAddress[AddrSCEV].push_back(RepR);
277 }
278 }
279 auto Groups = to_vector(RecipesByAddress.values());
280 VPDominatorTree VPDT(Plan);
281 for (auto &Group : Groups) {
282 // Sort mem ops by dominance order, with earliest (most dominating) first.
284 return VPDT.properlyDominates(A, B);
285 });
286 }
287 return Groups;
288}
289
/// Move recipes that define operands of recipes in replicate blocks into the
/// block that uses them. Candidates are gathered in a worklist of
/// (destination block, recipe) pairs; sinking one recipe makes its own
/// operands new candidates. Returns true if at least one recipe was moved.
290static bool sinkScalarOperands(VPlan &Plan) {
291 auto Iter = vp_depth_first_deep(Plan.getEntry());
292 bool ScalarVFOnly = Plan.hasScalarVFOnly();
293 bool Changed = false;
294
// NOTE(review): listing line 295 is missing here; the declaration of WorkList
// (captured by the lambda below and indexed/inserted into later) is not
// visible in this listing.
// Adds (SinkTo, defining recipe of Op) to WorkList unless one of the early
// returns below rejects it.
296 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
297 VPBasicBlock *SinkTo, VPValue *Op) {
298 auto *Candidate =
299 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
300 if (!Candidate)
301 return;
302
303 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
304 // for now.
// NOTE(review): listing line 305 is missing here; the condition guarding the
// following return (presumably the recipe-kind check described above) is not
// visible.
306 return;
307
// Nothing to do if the candidate already lives in SinkTo, or if the helper
// reports that it cannot be sunk.
308 if (Candidate->getParent() == SinkTo ||
309 vputils::cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
310 return;
311
// Single-scalar replicate recipes are only candidates when the plan has
// scalar VFs only.
312 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
313 if (!ScalarVFOnly && RepR->isSingleScalar())
314 return;
315
316 WorkList.insert({SinkTo, Candidate});
317 };
318
319 // First, collect the operands of all recipes in replicate blocks as seeds for
320 // sinking.
// NOTE(review): listing line 321 is missing here; the loop header that
// introduces VPR (presumably iterating regions from Iter) is not visible.
322 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
// Only replicator regions whose entry has exactly two successors are seeded.
323 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
324 continue;
// The first successor of the entry must lead directly to the region's exiting
// block; its recipes' operands become the initial candidates.
325 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
326 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
327 continue;
328 for (auto &Recipe : *VPBB)
329 for (VPValue *Op : Recipe.operands())
330 InsertIfValidSinkCandidate(VPBB, Op);
331 }
332
333 // Try to sink each replicate or scalar IV steps recipe in the worklist.
// Index-based iteration with a re-evaluated size: the loop body appends new
// entries to WorkList via InsertIfValidSinkCandidate.
334 for (unsigned I = 0; I != WorkList.size(); ++I) {
335 VPBasicBlock *SinkTo;
336 VPSingleDefRecipe *SinkCandidate;
337 std::tie(SinkTo, SinkCandidate) = WorkList[I];
338
339 // All recipe users of SinkCandidate must be in the same block SinkTo or all
340 // users outside of SinkTo must only use the first lane of SinkCandidate. In
341 // the latter case, we need to duplicate SinkCandidate.
342 auto UsersOutsideSinkTo =
343 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
344 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
345 });
346 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
347 return !U->usesFirstLaneOnly(SinkCandidate);
348 }))
349 continue;
350 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
351
352 if (NeedsDuplicating) {
// Duplication is not attempted for scalar-VF-only plans; the candidate is
// left in place.
353 if (ScalarVFOnly)
354 continue;
355 VPSingleDefRecipe *Clone;
356 if (auto *SinkCandidateRepR =
357 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
358 // TODO: Handle converting to uniform recipes as separate transform,
359 // then cloning should be sufficient here.
// The copy kept for the outside users is built as a new replicate recipe with
// the third constructor argument set to true and no mask, rather than via
// clone().
360 Instruction *I = SinkCandidate->getUnderlyingInstr();
361 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
362 nullptr /*Mask*/, *SinkCandidateRepR,
363 *SinkCandidateRepR);
364 // TODO: add ".cloned" suffix to name of Clone's VPValue.
365 } else {
366 Clone = SinkCandidate->clone();
367 }
368
// Insert the copy at the original position and redirect every user outside
// SinkTo to it; users inside SinkTo keep using SinkCandidate.
369 Clone->insertBefore(SinkCandidate);
370 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
371 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
372 });
373 }
// Move the candidate to the first non-phi position of SinkTo, then consider
// its operands for sinking into the same block.
374 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
375 for (VPValue *Op : SinkCandidate->operands())
376 InsertIfValidSinkCandidate(SinkTo, Op);
377 Changed = true;
378 }
379 return Changed;
380}
381
382/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
383/// the mask.
385 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
386 if (!EntryBB || EntryBB->size() != 1 ||
387 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
388 return nullptr;
389
390 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
391}
392
393/// If \p R is a triangle region, return the 'then' block of the triangle.
395 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
396 if (EntryBB->getNumSuccessors() != 2)
397 return nullptr;
398
399 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
400 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
401 if (!Succ0 || !Succ1)
402 return nullptr;
403
404 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
405 return nullptr;
406 if (Succ0->getSingleSuccessor() == Succ1)
407 return Succ0;
408 if (Succ1->getSingleSuccessor() == Succ0)
409 return Succ1;
410 return nullptr;
411}
412
413// Merge replicate regions in their successor region, if a replicate region
414// is connected to a successor replicate region with the same predicate by a
415// single, empty VPBasicBlock.
417 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
418
419 // Collect replicate regions followed by an empty block, followed by another
420 // replicate region with matching masks to process front. This is to avoid
421 // iterator invalidation issues while merging regions.
424 vp_depth_first_deep(Plan.getEntry()))) {
425 if (!Region1->isReplicator())
426 continue;
427 auto *MiddleBasicBlock =
428 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
429 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
430 continue;
431
432 auto *Region2 =
433 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
434 if (!Region2 || !Region2->isReplicator())
435 continue;
436
437 VPValue *Mask1 = getPredicatedMask(Region1);
438 VPValue *Mask2 = getPredicatedMask(Region2);
439 if (!Mask1 || Mask1 != Mask2)
440 continue;
441
442 assert(Mask1 && Mask2 && "both region must have conditions");
443 WorkList.push_back(Region1);
444 }
445
446 // Move recipes from Region1 to its successor region, if both are triangles.
447 for (VPRegionBlock *Region1 : WorkList) {
448 if (TransformedRegions.contains(Region1))
449 continue;
450 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
451 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
452
453 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
454 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
455 if (!Then1 || !Then2)
456 continue;
457
458 // Note: No fusion-preventing memory dependencies are expected in either
459 // region. Such dependencies should be rejected during earlier dependence
460 // checks, which guarantee accesses can be re-ordered for vectorization.
461 //
462 // Move recipes to the successor region.
463 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
464 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
465
466 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
467 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
468
469 // Move VPPredInstPHIRecipes from the merge block to the successor region's
470 // merge block. Update all users inside the successor region to use the
471 // original values.
472 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
473 VPValue *PredInst1 =
474 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
475 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
476 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
477 return cast<VPRecipeBase>(&U)->getParent() == Then2;
478 });
479
480 // Remove phi recipes that are unused after merging the regions.
481 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
482 Phi1ToMove.eraseFromParent();
483 continue;
484 }
485 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
486 }
487
488 // Remove the dead recipes in Region1's entry block.
489 for (VPRecipeBase &R :
490 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
491 R.eraseFromParent();
492
493 // Finally, remove the first region.
494 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
495 VPBlockUtils::disconnectBlocks(Pred, Region1);
496 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
497 }
498 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
499 TransformedRegions.insert(Region1);
500 }
501
502 return !TransformedRegions.empty();
503}
504
506 VPRegionBlock *ParentRegion,
507 VPlan &Plan) {
508 Instruction *Instr = PredRecipe->getUnderlyingInstr();
509 // Build the triangular if-then region.
510 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
511 assert(Instr->getParent() && "Predicated instruction not in any basic block");
512 auto *BlockInMask = PredRecipe->getMask();
513 auto *MaskDef = BlockInMask->getDefiningRecipe();
514 auto *BOMRecipe = new VPBranchOnMaskRecipe(
515 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
516 auto *Entry =
517 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
518
519 // Replace predicated replicate recipe with a replicate recipe without a
520 // mask but in the replicate region.
521 auto *RecipeWithoutMask = new VPReplicateRecipe(
522 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
523 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
524 PredRecipe->getDebugLoc());
525 auto *Pred =
526 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
527 auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
529 Plan.createReplicateRegion(Entry, Exiting, RegionName);
530
531 // Note: first set Entry as region entry and then connect successors starting
532 // from it in order, to propagate the "parent" of each VPBasicBlock.
533 Region->setParent(ParentRegion);
534 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
535 VPBlockUtils::connectBlocks(Pred, Exiting);
536
537 if (PredRecipe->getNumUsers() != 0) {
538 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
539 RecipeWithoutMask->getDebugLoc());
540 Exiting->appendRecipe(PHIRecipe);
541 PredRecipe->replaceAllUsesWith(PHIRecipe);
542 }
543 PredRecipe->eraseFromParent();
544 return Region;
545}
546
/// For every predicated VPReplicateRecipe found in \p Plan, split its block at
/// the recipe and call createReplicateRegion for it. Recipes are collected
/// first and transformed in a second loop.
547static void addReplicateRegions(VPlan &Plan) {
// NOTE(review): listing lines 548-549 are missing here; the declaration of
// WorkList and the start of the loop header that introduces VPBB are not
// visible.
550 vp_depth_first_deep(Plan.getEntry()))) {
551 for (VPRecipeBase &R : *VPBB)
552 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
553 if (RepR->isPredicated())
554 WorkList.push_back(RepR);
555 }
556 }
557
// BBNum provides an increasing suffix for the names of the split-off blocks.
558 unsigned BBNum = 0;
559 for (VPReplicateRecipe *RepR : WorkList) {
// Split the containing block at the predicated recipe.
560 VPBasicBlock *CurrentBlock = RepR->getParent();
561 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
562
// Name the split block after the IR block of the underlying instruction plus
// a running number; if that IR block is unnamed, use an empty name (BBNum is
// not incremented in that case).
563 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
564 SplitBlock->setName(
565 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
566 // Record predicated instructions for above packing optimizations.
// NOTE(review): listing lines 567 and 569 are missing around the next
// statement; the declaration receiving the result (used as Region below) and
// the statement that follows the call are not visible.
568 createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
570
// If the block that was split used to be the exiting block of the enclosing
// region, the split-off block takes over that role.
571 VPRegionBlock *ParentRegion = Region->getParent();
572 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
573 ParentRegion->setExiting(SplitBlock);
574 }
575}
576
580 vp_depth_first_deep(Plan.getEntry()))) {
581 // Don't fold the blocks in the skeleton of the Plan into their single
582 // predecessors for now.
583 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
584 if (!VPBB->getParent())
585 continue;
586 auto *PredVPBB =
587 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
588 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
589 isa<VPIRBasicBlock>(PredVPBB))
590 continue;
591 WorkList.push_back(VPBB);
592 }
593
594 for (VPBasicBlock *VPBB : WorkList) {
595 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
596 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
597 R.moveBefore(*PredVPBB, PredVPBB->end());
598 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
599 auto *ParentRegion = VPBB->getParent();
600 if (ParentRegion && ParentRegion->getExiting() == VPBB)
601 ParentRegion->setExiting(PredVPBB);
602 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
603 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
604 }
605 return !WorkList.empty();
606}
607
609 // Convert masked VPReplicateRecipes to if-then region blocks.
611
612 bool ShouldSimplify = true;
613 while (ShouldSimplify) {
614 ShouldSimplify = sinkScalarOperands(Plan);
615 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
616 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
617 }
618}
619
620/// Remove redundant casts of inductions.
621///
622/// Such redundant casts are casts of induction variables that can be ignored,
623/// because we already proved that the casted phi is equal to the uncasted phi
624/// in the vectorized loop. There is no need to vectorize the cast - the same
625/// value can be used for both the phi and casts in the vector loop.
627 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
629 if (!IV || IV->getTruncInst())
630 continue;
631
632 // A sequence of IR Casts has potentially been recorded for IV, which
633 // *must be bypassed* when the IV is vectorized, because the vectorized IV
634 // will produce the desired casted value. This sequence forms a def-use
635 // chain and is provided in reverse order, ending with the cast that uses
636 // the IV phi. Search for the recipe of the last cast in the chain and
637 // replace it with the original IV. Note that only the final cast is
638 // expected to have users outside the cast-chain and the dead casts left
639 // over will be cleaned up later.
640 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
641 VPValue *FindMyCast = IV;
642 for (Instruction *IRCast : reverse(Casts)) {
643 VPSingleDefRecipe *FoundUserCast = nullptr;
644 for (auto *U : FindMyCast->users()) {
645 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
646 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
647 FoundUserCast = UserCast;
648 break;
649 }
650 }
651 // A cast recipe in the chain may have been removed by earlier DCE.
652 if (!FoundUserCast)
653 break;
654 FindMyCast = FoundUserCast;
655 }
656 if (FindMyCast != IV)
657 FindMyCast->replaceAllUsesWith(IV);
658 }
659}
660
663 Instruction::BinaryOps InductionOpcode,
664 FPMathOperator *FPBinOp, Instruction *TruncI,
665 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
666 VPBuilder &Builder) {
667 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
668 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
669 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
670 VPSingleDefRecipe *BaseIV =
671 Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
672
673 // Truncate base induction if needed.
674 VPTypeAnalysis TypeInfo(Plan);
675 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
676 if (TruncI) {
677 Type *TruncTy = TruncI->getType();
678 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
679 "Not truncating.");
680 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
681 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
682 ResultTy = TruncTy;
683 }
684
685 // Truncate step if needed.
686 Type *StepTy = TypeInfo.inferScalarType(Step);
687 if (ResultTy != StepTy) {
688 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
689 "Not truncating.");
690 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
691 auto *VecPreheader =
693 VPBuilder::InsertPointGuard Guard(Builder);
694 Builder.setInsertPoint(VecPreheader);
695 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
696 }
697 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
698 &Plan.getVF(), DL);
699}
700
702 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
704 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
705 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
706 if (!LoopRegion)
707 return;
708
710 LoopRegion->getCanonicalIV());
711 if (!WideCanIV)
712 return;
713
714 Type *CanIVTy = LoopRegion->getCanonicalIVType();
715
716 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
717 // IV.
718 if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(WideCanIV)) {
719 VPBuilder Builder(WideCanIV);
720 WideCanIV->replaceAllUsesWith(createScalarIVSteps(
721 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
722 nullptr, Plan.getZero(CanIVTy), Plan.getConstantInt(CanIVTy, 1),
723 WideCanIV->getDebugLoc(), Builder));
724 WideCanIV->eraseFromParent();
725 return;
726 }
727
728 if (vputils::onlyScalarValuesUsed(WideCanIV))
729 return;
730
731 // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
732 // in the header, reuse it instead of introducing another wide induction phi.
733 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
734 for (VPRecipeBase &Phi : Header->phis()) {
735 auto *WidenIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
736 if (!WidenIV || !WidenIV->isCanonical())
737 continue;
738 // The reused wide IV feeds the header mask, whose lanes may extend past
739 // the trip count; drop flags that only hold inside the scalar loop.
740 WidenIV->dropPoisonGeneratingFlags();
741 WideCanIV->replaceAllUsesWith(WidenIV);
742 WideCanIV->eraseFromParent();
743 return;
744 }
745
746 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable.
747 auto *VecTy = VectorType::get(CanIVTy, VF);
748 InstructionCost BroadcastCost = TTI.getShuffleCost(
750 InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
751 if (PHICost > BroadcastCost)
752 return;
753
754 // Bail out if the additional wide induction phi increase the expected spill
755 // cost.
756 VPRegisterUsage UnrolledBase =
757 calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
758 for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
759 NumUsers *= UF;
760 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
761 VPRegisterUsage Projected = UnrolledBase;
762 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
763 if (Projected.spillCost(TTI, CostKind) >
764 UnrolledBase.spillCost(TTI, CostKind))
765 return;
766
769 VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
770 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
771 /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
772 VPIRFlags::WrapFlagsTy(/*HasNUW=*/LoopRegion->hasCanonicalIVNUW(),
773 /*HasNSW=*/false),
774 WideCanIV->getDebugLoc());
775 NewWideIV->insertBefore(&*Header->getFirstNonPhi());
776 WideCanIV->replaceAllUsesWith(NewWideIV);
777 WideCanIV->eraseFromParent();
778}
779
780/// Returns true if \p R is dead and can be removed.
/// A recipe flagged as a conditional assume is always reported dead. Otherwise
/// a recipe is dead only if it has no side effects and none of the values it
/// defines has any user.
781static bool isDeadRecipe(VPRecipeBase &R) {
782 // Do remove conditional assume instructions as their conditions may be
783 // flattened.
784 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
785 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
// NOTE(review): listing line 786 is missing here; the final operand of the
// conjunction above (presumably the test that the recipe is an assume) is not
// visible.
787 if (IsConditionalAssume)
788 return true;
789
// Recipes with side effects are kept even when their results are unused.
790 if (R.mayHaveSideEffects())
791 return false;
792
793 // Recipe is dead if no user keeps the recipe alive.
794 return all_of(R.definedValues(),
795 [](VPValue *V) { return V->getNumUsers() == 0; });
796}
797
800 Plan.getEntry());
802 // The recipes in the block are processed in reverse order, to catch chains
803 // of dead recipes.
804 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
805 if (isDeadRecipe(R)) {
806 R.eraseFromParent();
807 continue;
808 }
809
810 // Check if R is a dead VPPhi <-> update cycle and remove it.
811 VPValue *Start, *Incoming;
812 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
813 continue;
814 auto *PhiR = cast<VPPhi>(&R);
815 VPUser *PhiUser = PhiR->getSingleUser();
816 if (!PhiUser)
817 continue;
818 if (PhiUser != Incoming->getDefiningRecipe() ||
819 Incoming->getNumUsers() != 1)
820 continue;
821 PhiR->replaceAllUsesWith(Start);
822 PhiR->eraseFromParent();
823 Incoming->getDefiningRecipe()->eraseFromParent();
824 }
825 }
826}
827
830 for (unsigned I = 0; I != Users.size(); ++I) {
832 for (VPValue *V : Cur->definedValues())
833 Users.insert_range(V->users());
834 }
835 return Users.takeVector();
836}
837
838/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
839/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
840/// generates scalar values.
841static VPValue *
843 VPlan &Plan, VPBuilder &Builder) {
845 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
846 VPValue *StepV = PtrIV->getOperand(1);
848 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
849 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
850
851 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
852 PtrIV->getDebugLoc(), "next.gep");
853}
854
855/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
856/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
857/// VPWidenPointerInductionRecipe will generate vectors only. If some users
858/// require vectors while others require scalars, the scalar uses need to extract
859/// the scalars from the generated vectors (Note that this is different to how
860/// int/fp inductions are handled). Legalize extract-from-ends using uniform
861/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
862/// the correct end value is available. Also optimize
863/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
864/// providing them scalar steps built on the canonical scalar IV and update the
865/// original IV's users. This is an optional optimization to reduce the needs of
866/// vector extracts.
869 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
870 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
871 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
872 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
873 if (!PhiR)
874 continue;
875
876 // Try to narrow wide and replicating recipes to uniform recipes, based on
877 // VPlan analysis.
878 // TODO: Apply to all recipes in the future, to replace legacy uniformity
879 // analysis.
880 auto Users = collectUsersRecursively(PhiR);
881 for (VPUser *U : reverse(Users)) {
882 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
883 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
884 // Skip recipes that shouldn't be narrowed.
885 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
886 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
887 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
888 continue;
889
890 // Skip recipes that may have other lanes than their first used.
892 continue;
893
894 // TODO: Support scalarizing ExtractValue.
895 if (match(Def,
897 continue;
898
899 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
900 Def->operands(), /*IsUniform*/ true,
901 /*Mask*/ nullptr, /*Flags*/ *Def);
902 Clone->insertAfter(Def);
903 Def->replaceAllUsesWith(Clone);
904 }
905
906 // Replace wide pointer inductions which have only their scalars used by
907 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
908 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
909 if (!Plan.hasScalarVFOnly() &&
910 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
911 continue;
912
913 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
914 PtrIV->replaceAllUsesWith(PtrAdd);
915 continue;
916 }
917
918 // Replace widened induction with scalar steps for users that only use
919 // scalars.
920 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
921 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
922 return U->usesScalars(WideIV);
923 }))
924 continue;
925
926 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
928 Plan, ID.getKind(), ID.getInductionOpcode(),
929 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
930 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
931 WideIV->getDebugLoc(), Builder);
932
933 // Update scalar users of IV to use Step instead.
934 if (!HasOnlyVectorVFs) {
935 assert(!Plan.hasScalableVF() &&
936 "plans containing a scalar VF cannot also include scalable VFs");
937 WideIV->replaceAllUsesWith(Steps);
938 } else {
939 bool HasScalableVF = Plan.hasScalableVF();
940 WideIV->replaceUsesWithIf(Steps,
941 [WideIV, HasScalableVF](VPUser &U, unsigned) {
942 if (HasScalableVF)
943 return U.usesFirstLaneOnly(WideIV);
944 return U.usesScalars(WideIV);
945 });
946 }
947 }
948}
949
950/// Check if \p VPV is an untruncated wide induction, either before or after the
951/// increment. If so return the header IV (before the increment), otherwise
952/// return null.
/// PSE is only consulted for integer subtractions, where the subtracted value
/// is compared against the negated induction step via SCEV.
955 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
956 if (WideIV) {
957 // VPV itself is a wide induction, separately compute the end value for exit
958 // users if it is not a truncated IV.
959 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
960 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
961 }
962
963 // Check if VPV is an optimizable induction increment.
964 VPRecipeBase *Def = VPV->getDefiningRecipe();
 // Only values defined by a recipe with exactly two operands are considered.
965 if (!Def || Def->getNumOperands() != 2)
966 return nullptr;
 // The wide induction may be either operand; operand 0 is tried first.
967 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
968 if (!WideIV)
969 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
970 if (!WideIV)
971 return nullptr;
972
 // Returns true if VPV updates WideIV by WideIV's own step, using the update
 // form that corresponds to the induction descriptor's opcode.
973 auto IsWideIVInc = [&]() {
974 auto &ID = WideIV->getInductionDescriptor();
975
976 // Check if VPV increments the induction by the induction step.
977 VPValue *IVStep = WideIV->getStepValue();
978 switch (ID.getInductionOpcode()) {
979 case Instruction::Add:
980 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
981 case Instruction::FAdd:
982 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
 // Unlike the add cases, FSub is matched non-commutatively: the IV must be
 // the first operand and the step the second.
983 case Instruction::FSub:
984 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
985 m_Specific(IVStep)));
986 case Instruction::Sub: {
987 // IVStep will be the negated step of the subtraction. Check if Step == -1
988 // * IVStep.
989 VPValue *Step;
990 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
991 return false;
992 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
993 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
994 ScalarEvolution &SE = *PSE.getSE();
 // Both SCEVs must be computable for the comparison to be meaningful.
995 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
996 !isa<SCEVCouldNotCompute>(StepSCEV) &&
997 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
998 }
 // Any other opcode: only pointer inductions qualify, and only when VPV is a
 // GEP of the IV by the IV's step.
999 default:
1000 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
1001 match(VPV, m_GetElementPtr(m_Specific(WideIV),
1002 m_Specific(WideIV->getStepValue())));
1003 }
 // Every path through the switch above returns.
1004 llvm_unreachable("should have been covered by switch above");
1005 };
1006 return IsWideIVInc() ? WideIV : nullptr;
1007}
1008
1009/// Attempts to optimize the induction variable exit values for users in the
1010/// early exit block.
1012 VPTypeAnalysis &TypeInfo,
1013 VPValue *Op,
1015 VPValue *Incoming, *Mask;
1017 m_VPValue(Incoming))))
1018 return nullptr;
1019
1020 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1021 if (!WideIV)
1022 return nullptr;
1023
1024 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1025 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1026 return nullptr;
1027
1028 // Calculate the final index.
1029 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1030 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1031 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1032 auto *ExtractR = cast<VPInstruction>(Op);
1033 VPBuilder B(ExtractR);
1034
1035 DebugLoc DL = ExtractR->getDebugLoc();
1036 VPValue *FirstActiveLane = B.createFirstActiveLane(Mask, DL);
1037 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
1038 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
1039 FirstActiveLaneType, DL);
1040 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1041
1042 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1043 // changed it means the exit is using the incremented value, so we need to
1044 // add the step.
1045 if (Incoming != WideIV) {
1046 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1047 EndValue = B.createAdd(EndValue, One, DL);
1048 }
1049
1050 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1051 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1052 VPIRValue *Start = WideIV->getStartValue();
1053 VPValue *Step = WideIV->getStepValue();
1054 EndValue = B.createDerivedIV(
1055 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1056 Start, EndValue, Step);
1057 }
1058
1059 return EndValue;
1060}
1061
1062/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1063/// VPDerivedIVRecipe for non-canonical inductions.
1065 VPBuilder &VectorPHBuilder,
1066 VPTypeAnalysis &TypeInfo,
1067 VPValue *VectorTC) {
1068 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1069 // Truncated wide inductions resume from the last lane of their vector value
1070 // in the last vector iteration which is handled elsewhere.
1071 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1072 return nullptr;
1073
1074 VPIRValue *Start = WideIV->getStartValue();
1075 VPValue *Step = WideIV->getStepValue();
1077 VPValue *EndValue = VectorTC;
1078 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1079 EndValue = VectorPHBuilder.createDerivedIV(
1080 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1081 Start, VectorTC, Step);
1082 }
1083
1084 // EndValue is derived from the vector trip count (which has the same type as
1085 // the widest induction) and thus may be wider than the induction here.
1086 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1087 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1088 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1089 ScalarTypeOfWideIV,
1090 WideIV->getDebugLoc());
1091 }
1092
1093 return EndValue;
1094}
1095
1096/// Attempts to optimize the induction variable exit values for users in the
1097/// exit block coming from the latch in the original scalar loop.
1099 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPValue *Op,
1101 VPValue *Incoming;
1103 return nullptr;
1104
1105 VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
1106 if (!WideIV)
1107 return nullptr;
1108
1109 VPValue *EndValue = EndValues.lookup(WideIV);
1110 assert(EndValue && "Must have computed the end value up front");
1111
1112 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1113 // changed it means the exit is using the incremented value, so we don't
1114 // need to subtract the step.
1115 if (Incoming != WideIV)
1116 return EndValue;
1117
1118 // Otherwise, subtract the step from the EndValue.
1119 auto *ExtractR = cast<VPInstruction>(Op);
1120 VPBuilder B(ExtractR);
1121 VPValue *Step = WideIV->getStepValue();
1122 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1123 if (ScalarTy->isIntegerTy())
1124 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1125 if (ScalarTy->isPointerTy()) {
1126 Type *StepTy = TypeInfo.inferScalarType(Step);
1127 auto *Zero = Plan.getZero(StepTy);
1128 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1129 DebugLoc::getUnknown(), "ind.escape");
1130 }
1131 if (ScalarTy->isFloatingPointTy()) {
1132 const auto &ID = WideIV->getInductionDescriptor();
1133 return B.createNaryOp(
1134 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1135 ? Instruction::FSub
1136 : Instruction::FAdd,
1137 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1138 }
1139 llvm_unreachable("all possible induction types must be handled");
1140 return nullptr;
1141}
1142
1144 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1145 // Compute end values for all inductions.
1146 VPTypeAnalysis TypeInfo(Plan);
1147 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1148 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1149 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
1151 VPValue *ResumeTC =
1152 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1153 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1154 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1155 if (!WideIV)
1156 continue;
1158 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1159 EndValues[WideIV] = EndValue;
1160 }
1161
1162 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1163 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1164 VPValue *Op;
1165 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1166 continue;
1167 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1168 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1169 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1170 R.eraseFromParent();
1171 }
1172 }
1173
1174 // Then, optimize exit block users.
1175 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1176 for (VPRecipeBase &R : ExitVPBB->phis()) {
1177 auto *ExitIRI = cast<VPIRPhi>(&R);
1178
1179 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1180 VPValue *Escape = nullptr;
1181 if (PredVPBB == MiddleVPBB)
1183 Plan, TypeInfo, ExitIRI->getOperand(Idx), EndValues, PSE);
1184 else
1186 Plan, TypeInfo, ExitIRI->getOperand(Idx), PSE);
1187 if (Escape)
1188 ExitIRI->setOperand(Idx, Escape);
1189 }
1190 }
1191 }
1192}
1193
1194/// Remove redundant VPExpandSCEVRecipes in \p Plan's entry block by replacing
1195/// them with already existing recipes expanding the same SCEV expression.
1198
1199 for (VPRecipeBase &R :
1201 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1202 if (!ExpR)
1203 continue;
1204
1205 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1206 if (Inserted)
1207 continue;
1208 ExpR->replaceAllUsesWith(V->second);
1209 ExpR->eraseFromParent();
1210 }
1211}
1212
1214 SmallVector<VPValue *> WorkList;
1216 WorkList.push_back(V);
1217
1218 while (!WorkList.empty()) {
1219 VPValue *Cur = WorkList.pop_back_val();
1220 if (!Seen.insert(Cur).second)
1221 continue;
1222 VPRecipeBase *R = Cur->getDefiningRecipe();
1223 if (!R)
1224 continue;
1225 if (!isDeadRecipe(*R))
1226 continue;
1227 append_range(WorkList, R->operands());
1228 R->eraseFromParent();
1229 }
1230}
1231
1232/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1233/// Returns an optional pair, where the first element indicates whether it is
1234/// an intrinsic ID.
/// The second element holds the opcode or intrinsic ID itself. std::nullopt is
/// returned for recipe kinds that none of the cases below handle.
1235static std::optional<std::pair<bool, unsigned>>
1237 return TypeSwitch<const VPSingleDefRecipe *,
1238 std::optional<std::pair<bool, unsigned>>>(R)
1241 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
 // Widened intrinsic calls report their vector intrinsic ID; the `true`
 // flag marks the value as an intrinsic ID rather than an opcode.
1242 .Case([](const VPWidenIntrinsicRecipe *I) {
1243 return std::make_pair(true, I->getVectorIntrinsicID());
1244 })
1245 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1246 [](auto *I) {
1247 // For recipes that do not directly map to LLVM IR instructions,
1248 // assign opcodes after the last VPInstruction opcode (which is also
1249 // after the last IR Instruction opcode), based on the VPRecipeID.
1250 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1251 I->getVPRecipeID());
1252 })
 // No opcode or intrinsic ID is available for any other recipe kind.
1253 .Default([](auto *) { return std::nullopt; });
1254}
1255
1256/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1257/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1258/// Operands are foldable live-ins.
/// Returns nullptr when \p R has no opcode or intrinsic ID, when any operand is
/// not a live-in with an underlying IR value, or when the fold yields no value.
1260 ArrayRef<VPValue *> Operands,
1261 const DataLayout &DL,
1262 VPTypeAnalysis &TypeInfo) {
1263 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1264 if (!OpcodeOrIID)
1265 return nullptr;
1266
 // Gather the IR values behind the operands. Give up as soon as one operand
 // is not a live-in or has no underlying IR value.
1268 for (VPValue *Op : Operands) {
1269 if (!match(Op, m_LiveIn()))
1270 return nullptr;
1271 Value *V = Op->getUnderlyingValue();
1272 if (!V)
1273 return nullptr;
1274 Ops.push_back(V);
1275 }
1276
 // Dispatches to the InstSimplifyFolder entry point matching the opcode or
 // intrinsic ID; returns nullptr for anything not handled.
1277 auto FoldToIRValue = [&]() -> Value * {
1278 InstSimplifyFolder Folder(DL);
 // Intrinsic IDs: only recipes with exactly two operands are folded.
1279 if (OpcodeOrIID->first) {
1280 if (R.getNumOperands() != 2)
1281 return nullptr;
1282 unsigned ID = OpcodeOrIID->second;
1283 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1284 TypeInfo.inferScalarType(&R));
1285 }
1286 unsigned Opcode = OpcodeOrIID->second;
1287 if (Instruction::isBinaryOp(Opcode))
1288 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1289 Ops[0], Ops[1]);
 // Casts fold to the scalar type inferred for R's result.
1290 if (Instruction::isCast(Opcode))
1291 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1292 TypeInfo.inferScalarType(R.getVPSingleValue()));
1293 switch (Opcode) {
1295 return Folder.FoldSelect(Ops[0], Ops[1],
1297 case VPInstruction::Not:
1298 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1300 case Instruction::Select:
1301 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1302 case Instruction::ICmp:
1303 case Instruction::FCmp:
1304 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1305 Ops[1]);
1306 case Instruction::GetElementPtr: {
1307 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1308 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1309 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1310 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1311 }
1314 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1315 Ops[0], Ops[1],
1316 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1317 // An extract of a live-in is an extract of a broadcast, so return the
1318 // broadcasted element.
1319 case Instruction::ExtractElement:
1320 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1321 return Ops[0];
1322 }
 // Opcode is not one of the kinds handled above.
1323 return nullptr;
1324 };
1325
 // Expose a successfully folded IR value as a live-in of R's plan.
1326 if (Value *V = FoldToIRValue())
1327 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1328 return nullptr;
1329}
1330
1331/// Try to simplify VPSingleDefRecipe \p Def.
1333 VPlan *Plan = Def->getParent()->getPlan();
1334
1335 // Simplification of live-in IR values for SingleDef recipes using
1336 // InstSimplifyFolder.
1337 const DataLayout &DL = Plan->getDataLayout();
1338 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1339 return Def->replaceAllUsesWith(V);
1340
1341 // Fold PredPHI LiveIn -> LiveIn.
1342 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1343 VPValue *Op = PredPHI->getOperand(0);
1344 if (isa<VPIRValue>(Op))
1345 PredPHI->replaceAllUsesWith(Op);
1346 }
1347
1348 VPBuilder Builder(Def);
1349
1350 // Avoid replacing VPInstructions with underlying values with new
1351 // VPInstructions, as we would fail to create widen/replicate recipes from the
1352 // new VPInstructions without an underlying value, and miss out on some
1353 // transformations that only apply to widened/replicated recipes later, by
1354 // doing so.
1355 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1356 // VPInstructions without underlying values, as those will get skipped during
1357 // cost computation.
1358 bool CanCreateNewRecipe =
1359 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1360
1361 VPValue *A;
1362 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1363 Type *TruncTy = TypeInfo.inferScalarType(Def);
1364 Type *ATy = TypeInfo.inferScalarType(A);
1365 if (TruncTy == ATy) {
1366 Def->replaceAllUsesWith(A);
1367 } else {
1368 // Don't replace a non-widened cast recipe with a widened cast.
1369 if (!isa<VPWidenCastRecipe>(Def))
1370 return;
1371 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1372
1373 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1374 ? Instruction::SExt
1375 : Instruction::ZExt;
1376 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1377 TruncTy);
1378 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1379 // UnderlyingExt has distinct return type, used to retain legacy cost.
1380 Ext->setUnderlyingValue(UnderlyingExt);
1381 }
1382 Def->replaceAllUsesWith(Ext);
1383 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1384 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1385 Def->replaceAllUsesWith(Trunc);
1386 }
1387 }
1388#ifndef NDEBUG
1389 // Verify that the cached type info for both A and its users is still
1390 // accurate by comparing it to freshly computed types.
1391 VPTypeAnalysis TypeInfo2(*Plan);
1392 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1393 for (VPUser *U : A->users()) {
1394 auto *R = cast<VPRecipeBase>(U);
1395 for (VPValue *VPV : R->definedValues())
1396 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1397 }
1398#endif
1399 }
1400
1401 // Simplify (X && Y) | (X && !Y) -> X.
1402 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1403 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1404 // recipes to be visited during simplification.
1405 VPValue *X, *Y, *Z;
1406 if (match(Def,
1409 Def->replaceAllUsesWith(X);
1410 Def->eraseFromParent();
1411 return;
1412 }
1413
1414 // x | AllOnes -> AllOnes
1415 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1416 return Def->replaceAllUsesWith(
1417 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1418
1419 // x | 0 -> x
1420 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1421 return Def->replaceAllUsesWith(X);
1422
1423 // x | !x -> AllOnes
1425 return Def->replaceAllUsesWith(
1426 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1427
1428 // x & 0 -> 0
1429 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1430 return Def->replaceAllUsesWith(
1431 Plan->getZero(TypeInfo.inferScalarType(Def)));
1432
1433 // x & AllOnes -> x
1434 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1435 return Def->replaceAllUsesWith(X);
1436
1437 // x && false -> false
1438 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1439 return Def->replaceAllUsesWith(Plan->getFalse());
1440
1441 // x && true -> x
1442 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1443 return Def->replaceAllUsesWith(X);
1444
1445 // (x && y) | (x && z) -> x && (y | z)
1446 if (CanCreateNewRecipe &&
1449 // Simplify only if one of the operands has one use to avoid creating an
1450 // extra recipe.
1451 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1452 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1453 return Def->replaceAllUsesWith(
1454 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1455
1456 // x && (x && y) -> x && y
1457 if (match(Def, m_LogicalAnd(m_VPValue(X),
1459 return Def->replaceAllUsesWith(Def->getOperand(1));
1460
1461 // x && (y && x) -> x && y
1462 if (match(Def, m_LogicalAnd(m_VPValue(X),
1464 return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1465
1466 // x && !x -> 0
1468 return Def->replaceAllUsesWith(Plan->getFalse());
1469
1470 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1471 return Def->replaceAllUsesWith(X);
1472
1473 // select c, false, true -> not c
1474 VPValue *C;
1475 if (CanCreateNewRecipe &&
1476 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1477 return Def->replaceAllUsesWith(Builder.createNot(C));
1478
1479 // select !c, x, y -> select c, y, x
1480 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1481 Def->setOperand(0, C);
1482 Def->setOperand(1, Y);
1483 Def->setOperand(2, X);
1484 return;
1485 }
1486
1487 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1488 return Def->replaceAllUsesWith(A);
1489
1490 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1491 return Def->replaceAllUsesWith(A);
1492
1493 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1494 return Def->replaceAllUsesWith(
1495 Plan->getZero(TypeInfo.inferScalarType(Def)));
1496
1497 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1498 // Preserve nsw from the Mul on the new Sub.
1500 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1501 return Def->replaceAllUsesWith(
1502 Builder.createSub(Plan->getZero(TypeInfo.inferScalarType(A)), A,
1503 Def->getDebugLoc(), "", NW));
1504 }
1505
1506 if (CanCreateNewRecipe &&
1508 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1509 // new Sub.
1511 false,
1512 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1513 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1514 ->hasNoSignedWrap()};
1515 return Def->replaceAllUsesWith(
1516 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1517 }
1518
1519 const APInt *APC;
1520 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1521 APC->isPowerOf2())
1522 return Def->replaceAllUsesWith(Builder.createNaryOp(
1523 Instruction::Shl,
1524 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1525 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1526
1527 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1528 APC->isPowerOf2())
1529 return Def->replaceAllUsesWith(Builder.createNaryOp(
1530 Instruction::LShr,
1531 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1532 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1533
1534 if (match(Def, m_Not(m_VPValue(A)))) {
1535 if (match(A, m_Not(m_VPValue(A))))
1536 return Def->replaceAllUsesWith(A);
1537
1538 // Try to fold Not into compares by adjusting the predicate in-place.
1539 CmpPredicate Pred;
1540 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1541 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1542 if (all_of(Cmp->users(),
1544 m_Not(m_Specific(Cmp)),
1545 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1546 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1547 for (VPUser *U : to_vector(Cmp->users())) {
1548 auto *R = cast<VPSingleDefRecipe>(U);
1549 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1550 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1551 R->setOperand(1, Y);
1552 R->setOperand(2, X);
1553 } else {
1554 // not (cmp pred) -> cmp inv_pred
1555 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1556 R->replaceAllUsesWith(Cmp);
1557 }
1558 }
1559 // If Cmp doesn't have a debug location, use the one from the negation,
1560 // to preserve the location.
1561 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1562 Cmp->setDebugLoc(Def->getDebugLoc());
1563 }
1564 }
1565 }
1566
1567 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1568 // any-of (fcmp uno %A, %B), ...
1569 if (match(Def, m_AnyOf())) {
1571 VPRecipeBase *UnpairedCmp = nullptr;
1572 for (VPValue *Op : Def->operands()) {
1573 VPValue *X;
1574 if (Op->getNumUsers() > 1 ||
1576 m_Deferred(X)))) {
1577 NewOps.push_back(Op);
1578 } else if (!UnpairedCmp) {
1579 UnpairedCmp = Op->getDefiningRecipe();
1580 } else {
1581 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1582 UnpairedCmp->getOperand(0), X));
1583 UnpairedCmp = nullptr;
1584 }
1585 }
1586
1587 if (UnpairedCmp)
1588 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1589
1590 if (NewOps.size() < Def->getNumOperands()) {
1591 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1592 return Def->replaceAllUsesWith(NewAnyOf);
1593 }
1594 }
1595
1596 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1597 // This is useful for fmax/fmin without fast-math flags, where we need to
1598 // check if any operand is NaN.
1599 if (CanCreateNewRecipe &&
1601 m_Deferred(X)),
1603 m_Deferred(Y))))) {
1604 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1605 return Def->replaceAllUsesWith(NewCmp);
1606 }
1607
1608 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1609 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1610 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1611 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1612 TypeInfo.inferScalarType(Def))
1613 return Def->replaceAllUsesWith(Def->getOperand(1));
1614
1616 m_One()))) {
1617 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1618 if (TypeInfo.inferScalarType(X) != WideStepTy)
1619 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1620 Def->replaceAllUsesWith(X);
1621 return;
1622 }
1623
1624 // For i1 vp.merges produced by AnyOf reductions:
1625 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1627 m_VPValue(X), m_VPValue())) &&
1629 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1630 Def->setOperand(1, Def->getOperand(0));
1631 Def->setOperand(0, Y);
1632 return;
1633 }
1634
1635 // Simplify MaskedCond with no block mask to its single operand.
1637 !cast<VPInstruction>(Def)->isMasked())
1638 return Def->replaceAllUsesWith(Def->getOperand(0));
1639
1640 // Look through ExtractLastLane.
1641 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1642 if (match(A, m_BuildVector())) {
1643 auto *BuildVector = cast<VPInstruction>(A);
1644 Def->replaceAllUsesWith(
1645 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1646 return;
1647 }
1648 if (Plan->hasScalarVFOnly())
1649 return Def->replaceAllUsesWith(A);
1650 }
1651
1652 // Look through ExtractPenultimateElement (BuildVector ....).
1654 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1655 Def->replaceAllUsesWith(
1656 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1657 return;
1658 }
1659
1660 uint64_t Idx;
1662 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1663 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1664 return;
1665 }
1666
1667 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1668 Def->replaceAllUsesWith(
1669 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1670 return;
1671 }
1672
1673 // Look through broadcast of single-scalar when used as select conditions; in
1674 // that case the scalar condition can be used directly.
1675 if (match(Def,
1678 "broadcast operand must be single-scalar");
1679 Def->setOperand(0, C);
1680 return;
1681 }
1682
1683 if (match(Def, m_Broadcast(m_VPValue(X))))
1684 return Def->replaceUsesWithIf(
1685 X, [Def](const VPUser &U, unsigned) { return U.usesScalars(Def); });
1686
1688 if (Def->getNumOperands() == 1) {
1689 Def->replaceAllUsesWith(Def->getOperand(0));
1690 return;
1691 }
1692 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1693 if (all_equal(Phi->incoming_values()))
1694 Phi->replaceAllUsesWith(Phi->getOperand(0));
1695 }
1696 return;
1697 }
1698
1699 VPIRValue *IRV;
1700 if (Def->getNumOperands() == 1 &&
1702 return Def->replaceAllUsesWith(IRV);
1703
1704 // Some simplifications can only be applied after unrolling. Perform them
1705 // below.
1706 if (!Plan->isUnrolled())
1707 return;
1708
1709 // After unrolling, extract-lane may be used to extract values from multiple
1710 // scalar sources. Only simplify when extracting from a single scalar source.
1711 VPValue *LaneToExtract;
1712 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1713 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1715 return Def->replaceAllUsesWith(A);
1716
1717 // Simplify extract-lane with single source to extract-element.
1718 Def->replaceAllUsesWith(Builder.createNaryOp(
1719 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1720 return;
1721 }
1722
1723 // Look for cycles where Def is of the form:
1724 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1725 // IVInc = X + Step ; used by X and Def
1726 // Def = IVInc + Y
1727 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1728 // and if Inc exists, replace it with X.
1729 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1730 isa<VPIRValue>(Y) &&
1731 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1732 auto *Phi = cast<VPPhi>(X);
1733 auto *IVInc = Def->getOperand(0);
1734 if (IVInc->getNumUsers() == 2) {
1735 // If Phi has a second user (besides IVInc's defining recipe), it must
1736 // be Inc = Phi + Y for the fold to apply.
1739 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1740 Def->replaceAllUsesWith(IVInc);
1741 if (Inc)
1742 Inc->replaceAllUsesWith(Phi);
1743 Phi->setOperand(0, Y);
1744 return;
1745 }
1746 }
1747 }
1748
1749 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1750 // just the pointer operand.
1751 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1752 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1753 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1754
1755 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1756 // the start index is zero and only the first lane 0 is demanded.
1757 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1758 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1759 Steps->replaceAllUsesWith(Steps->getOperand(0));
1760 return;
1761 }
1762 }
1763 // Simplify redundant ReductionStartVector recipes after unrolling.
1764 VPValue *StartV;
1766 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1767 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1768 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1769 return PhiR && PhiR->isInLoop();
1770 });
1771 return;
1772 }
1773
1775 Def->replaceAllUsesWith(A);
1776 return;
1777 }
1778
1779 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1780 vputils::isSingleScalar(A) && all_of(A->users(), [Def, A](VPUser *U) {
1781 return U->usesScalars(A) || Def == U;
1782 })) {
1783 return Def->replaceAllUsesWith(A);
1784 }
1785
1786 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1787 return Def->replaceAllUsesWith(A);
1788}
1789
1792 Plan.getEntry());
1793 VPTypeAnalysis TypeInfo(Plan);
1795 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1796 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1797 simplifyRecipe(Def, TypeInfo);
1798 }
1799}
1800
1801/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1802/// header mask to be simplified further when tail folding, e.g. in
1803/// optimizeEVLMasks.
/// Does nothing if no header mask is found in \p Plan.
1804static void reassociateHeaderMask(VPlan &Plan) {
1805 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1806 if (!HeaderMask)
1807 return;
1808
 // Scan the direct users of the header mask for logical-ands whose first
 // operand is the header mask, i.e. (headermask && x).
1809 SmallVector<VPUser *> Worklist;
1810 for (VPUser *U : HeaderMask->users())
1811 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1813
1814 while (!Worklist.empty()) {
 // Only single-def recipes of the form (headermask && x) && y are rewritten;
 // every other worklist entry is dropped.
1815 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1816 VPValue *X, *Y;
1817 if (!R || !match(R, m_LogicalAnd(
1818 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1819 m_VPValue(Y))))
1820 continue;
 // Queue R's users before rewriting: once R's uses are replaced they refer
 // to the reassociated recipe and are checked against the same pattern.
1821 append_range(Worklist, R->users());
 // Create the reassociated logical-ands with a builder positioned at R.
1822 VPBuilder Builder(R);
1823 R->replaceAllUsesWith(
1824 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1825 }
1826}
1827
/// Map a masked integer division/remainder intrinsic ID to the corresponding
/// plain binary opcode (masked_udiv -> UDiv, masked_sdiv -> SDiv,
/// masked_urem -> URem, masked_srem -> SRem). Returns an empty optional for
/// any other ID.
// NOTE(review): the line carrying the function name and parameter list
// (listing line 1829) is missing from this listing; the switch below reads a
// value named `ID`.
1828static std::optional<Instruction::BinaryOps>
1830 switch (ID) {
1831 case Intrinsic::masked_udiv:
1832 return Instruction::UDiv;
1833 case Intrinsic::masked_sdiv:
1834 return Instruction::SDiv;
1835 case Intrinsic::masked_urem:
1836 return Instruction::URem;
1837 case Intrinsic::masked_srem:
1838 return Instruction::SRem;
1839 default:
1840 return {};
1841 }
1842}
1843
// NOTE(review): the signature of this function (listing line 1844), the
// start of the block-traversal loop (1848), the first filter inside the
// recipe loop (1851-1852) and part of the broadcast predicate (1905-1907) are
// missing from this listing. Comments below describe only the visible code.
// Nothing to do for plans that only have a scalar VF.
1845 if (Plan.hasScalarVFOnly())
1846 return;
1847
1849 vp_depth_first_deep(Plan.getEntry()))) {
// Visit the recipes of each block in reverse order; the early-inc range
// advances the iterator before the body runs, and the body may erase R.
1850 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1853 continue;
1854 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
// Replicate recipes that are already single-scalar, or that are predicated,
// are left untouched.
1855 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1856 continue;
1857
1858 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
// Case 1: a replicated store whose operand 1 is a single scalar. Replace it
// with a single-scalar, unmasked replicate whose operand 0 is the last lane
// of the original operand 0 (taken from the last part first when operand 1
// is uniform across VFs and UFs).
// NOTE(review): presumably operand 0 is the stored value and operand 1 the
// address - confirm against VPReplicateRecipe's store operand order.
1859 if (RepR && RepR->getOpcode() == Instruction::Store &&
1860 vputils::isSingleScalar(RepR->getOperand(1))) {
1861 auto *Clone = new VPReplicateRecipe(
1862 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1863 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1864 *RepR /*Metadata*/, RepR->getDebugLoc());
1865 Clone->insertBefore(RepOrWidenR);
// The extracts are created with a builder positioned at Clone.
1866 VPBuilder Builder(Clone);
1867 VPValue *ExtractOp = Clone->getOperand(0);
1868 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1869 ExtractOp =
1870 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1871 ExtractOp =
1872 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1873 Clone->setOperand(0, ExtractOp);
1874 RepR->eraseFromParent();
1875 continue;
1876 }
1877
1878 // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor.
// Case 2: applies only when just the first lane of the intrinsic is used.
// Any other widened intrinsic recipe is skipped entirely.
1879 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(RepOrWidenR)) {
1880 if (!vputils::onlyFirstLaneUsed(IntrR))
1881 continue;
1882 auto Opc = getUnmaskedDivRemOpcode(IntrR->getVectorIntrinsicID());
1883 if (!Opc)
1884 continue;
1885 VPBuilder Builder(IntrR);
// The divisor becomes select(operand 2, operand 1, 1), i.e. the constant 1
// is used whenever operand 2 is false.
1886 VPValue *SafeDivisor = Builder.createSelect(
1887 IntrR->getOperand(2), IntrR->getOperand(1),
1888 Plan.getConstantInt(IntrR->getResultType(), 1));
1889 VPValue *Clone = Builder.createNaryOp(
1890 *Opc, {IntrR->getOperand(0), SafeDivisor},
1891 VPIRFlags::getDefaultFlags(*Opc), IntrR->getDebugLoc());
1892 IntrR->replaceAllUsesWith(Clone);
1893 IntrR->eraseFromParent();
1894 continue;
1895 }
1896
1897 // Skip recipes that aren't single scalars.
1898 if (!vputils::isSingleScalar(RepOrWidenR))
1899 continue;
1900
1901 // Predicate to check if a user of Op introduces extra broadcasts.
// NOTE(review): the condition applied to VPInstruction users (listing lines
// 1905-1907) is missing; the visible part exempts some VPInstruction opcodes
// and otherwise reports a broadcast when the user does not use Op as scalars.
1902 auto IntroducesBCastOf = [](const VPValue *Op) {
1903 return [Op](const VPUser *U) {
1904 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1908 VPI->getOpcode()))
1909 return false;
1910 }
1911 return !U->usesScalars(Op);
1912 };
1913 };
1914
// Skip the recipe when some user would need a broadcast of its result and no
// operand qualifies under the check below. An operand qualifies when none of
// its other users introduces a broadcast of it and it is either a
// non-constant live-in or a single-scalar replicate.
1915 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1916 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1917 if (any_of(
1918 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1919 IntroducesBCastOf(Op)))
1920 return false;
1921 // Non-constant live-ins require broadcasts, while constants do not
1922 // need explicit broadcasts.
1923 auto *IRV = dyn_cast<VPIRValue>(Op);
1924 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1925 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1926 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1927 }))
1928 continue;
1929
// Case 3: replace the recipe with a single-scalar, unmasked replicate of the
// same underlying instruction and operands; the original is erased only if
// isDeadRecipe reports it dead after its uses were redirected.
1930 auto *Clone = new VPReplicateRecipe(
1931 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1932 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1933 Clone->insertBefore(RepOrWidenR);
1934 RepOrWidenR->replaceAllUsesWith(Clone);
1935 if (isDeadRecipe(*RepOrWidenR))
1936 RepOrWidenR->eraseFromParent();
1937 }
1938 }
1939}
1940
1941/// Try to see if all of \p Blend's masks share a common value logically and'ed
1942/// and remove it from the masks.
///
/// Only non-normalized blends are handled. The common value must appear as the
/// first operand of a logical-and in every mask; otherwise nothing changes.
// NOTE(review): the signature line (listing line 1943) is missing from this
// listing; the body operates on a value named `Blend`.
1944 if (Blend->isNormalized())
1945 return;
// The first operand of mask 0's logical-and is the candidate common mask.
1946 VPValue *CommonEdgeMask;
1947 if (!match(Blend->getMask(0),
1948 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1949 return;
// Bail out unless every mask has the form (CommonEdgeMask && something).
1950 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1951 if (!match(Blend->getMask(I),
1952 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1953 return;
// Replace each mask with operand 1 of its defining recipe, i.e. the second
// operand of the logical-and matched above.
1954 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1955 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1956}
1957
1958/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1959/// to make sure the masks are simplified.
///
/// For each blend this (1) strips a mask shared by all incoming values,
/// (2) replaces the blend by its only live incoming value when there is just
/// one, (3) rebuilds non-normalized blends in normalized form, and (4) swaps
/// the operands of a two-value blend whose mask is a Not.
1960static void simplifyBlends(VPlan &Plan) {
// NOTE(review): the header of the outer loop that provides VPBB (listing
// lines 1961-1962) is missing from this listing.
1963 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1964 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1965 if (!Blend)
1966 continue;
1967
1968 removeCommonBlendMask(Blend);
1969
1970 // Try to remove redundant blend recipes.
// Collect the distinct incoming values whose mask is not known false.
// Incoming value 0 always counts for a normalized blend; the check for it
// does not query mask 0 in that case.
1971 SmallPtrSet<VPValue *, 4> UniqueValues;
1972 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1973 UniqueValues.insert(Blend->getIncomingValue(0));
1974 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1975 if (!match(Blend->getMask(I), m_False()))
1976 UniqueValues.insert(Blend->getIncomingValue(I));
1977
// A single remaining value makes the blend redundant.
1978 if (UniqueValues.size() == 1) {
1979 Blend->replaceAllUsesWith(*UniqueValues.begin());
1980 Blend->eraseFromParent();
1981 continue;
1982 }
1983
1984 if (Blend->isNormalized())
1985 continue;
1986
1987 // Normalize the blend so its first incoming value is used as the initial
1988 // value with the others blended into it.
1989
// Pick the first incoming value whose mask has exactly one user and is not
// known false as the start value; fall back to index 0 otherwise.
1990 unsigned StartIndex = 0;
1991 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1992 // If a value's mask is used only by the blend then it can be dead-coded.
1993 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1994 // that's used by multiple blends where it can be removed from them all.
1995 VPValue *Mask = Blend->getMask(I);
1996 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1997 StartIndex = I;
1998 break;
1999 }
2000 }
2001
// New operand list: the start value without a mask, followed by
// (value, mask) pairs for every other incoming value in original order.
2002 SmallVector<VPValue *, 4> OperandsWithMask;
2003 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
2004
2005 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2006 if (I == StartIndex)
2007 continue;
2008 OperandsWithMask.push_back(Blend->getIncomingValue(I));
2009 OperandsWithMask.push_back(Blend->getMask(I));
2010 }
2011
2012 auto *NewBlend =
2013 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
2014 OperandsWithMask, *Blend, Blend->getDebugLoc());
2015 NewBlend->insertBefore(&R);
2016
// Remember the start value's mask before the old blend is erased.
// NOTE(review): the statement that consumes DeadMask (listing line 2020) is
// missing from this listing.
2017 VPValue *DeadMask = Blend->getMask(StartIndex);
2018 Blend->replaceAllUsesWith(NewBlend);
2019 Blend->eraseFromParent();
2021
2022 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
// Three operands means one start value plus a single (value, mask) pair.
2023 VPValue *NewMask;
2024 if (NewBlend->getNumOperands() == 3 &&
2025 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2026 VPValue *Inc0 = NewBlend->getOperand(0);
2027 VPValue *Inc1 = NewBlend->getOperand(1);
2028 VPValue *OldMask = NewBlend->getOperand(2);
2029 NewBlend->setOperand(0, Inc1);
2030 NewBlend->setOperand(1, Inc0);
2031 NewBlend->setOperand(2, NewMask);
// Erase the Not once the blend was its last user.
2032 if (OldMask->getNumUsers() == 0)
2033 cast<VPInstruction>(OldMask)->eraseFromParent();
2034 }
2035 }
2036 }
2037}
2038
2039/// Optimize the width of vector induction variables in \p Plan based on a known
2040/// constant Trip Count, \p BestVF and \p BestUF.
///
/// \returns true if at least one widened induction was narrowed.
// NOTE(review): the first signature line (listing line 2041), the AlignedTC
// computation (2056-2057) and the tail of the compare pattern (2087) are
// missing from this listing.
2042 ElementCount BestVF,
2043 unsigned BestUF) {
2044 // Only proceed if we have not completely removed the vector region.
2045 if (!Plan.getVectorLoopRegion())
2046 return false;
2047
// Requires a fixed VF and a trip count that matches a constant APInt.
2048 const APInt *TC;
2049 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2050 return false;
2051
2052 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2053 // and UF. Returns at least 8.
2054 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2055 APInt AlignedTC =
// The width is derived from the largest value AlignedTC - 1.
2058 APInt MaxVal = AlignedTC - 1;
2059 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2060 };
2061 unsigned NewBitWidth =
2062 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2063
2064 LLVMContext &Ctx = Plan.getContext();
2065 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2066
2067 bool MadeChange = false;
2068
2069 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2070 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2071 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2072
2073 // Currently only handle canonical IVs as it is trivial to replace the start
2074 // and stop values, and we currently only perform the optimization when the
2075 // IV has a single use.
// IVs that already have the new type are skipped as well.
2076 if (!WideIV || !WideIV->isCanonical() ||
2077 WideIV->hasMoreThanOneUniqueUser() ||
2078 NewIVTy == WideIV->getScalarType())
2079 continue;
2080
2081 // Currently only handle cases where the single user is a header-mask
2082 // comparison with the backedge-taken-count.
2083 VPUser *SingleUser = WideIV->getSingleUser();
2084 if (!SingleUser ||
2085 !match(SingleUser,
2086 m_ICmp(m_Specific(WideIV),
2088 continue;
2089
2090 // Update IV operands and comparison bound to use new narrower type.
// The IV restarts at 0 with step 1, both in the new type.
2091 auto *NewStart = Plan.getZero(NewIVTy);
2092 WideIV->setStartValue(NewStart);
2093 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
2094 WideIV->setStepValue(NewStep);
2095
// Truncate the backedge-taken count to the new type at the end of the
// vector preheader and use it as the compare's second operand.
2096 auto *NewBTC = new VPWidenCastRecipe(
2097 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2098 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2099 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2100 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2101 Cmp->setOperand(1, NewBTC);
2102
2103 MadeChange = true;
2104 }
2105
2106 return MadeChange;
2107}
2108
2109/// Return true if \p Cond is known to be true for given \p BestVF and \p
2110/// BestUF.
// NOTE(review): the first signature line (listing line 2111), the remaining
// parameters and the condition guarding the early return below (2113-2114),
// the start of the compare pattern (2121-2122) and the initializer of
// VectorTripCount (2132) are missing from this listing.
2112 ElementCount BestVF, unsigned BestUF,
// Recursive case: true if any operand of Cond's defining recipe is itself
// known to be true.
2115 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2116 &PSE](VPValue *C) {
2117 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2118 });
2119
// Base case: the visible part of the pattern requires
// (CanIV + VFxUF) paired with the plan's vector trip count.
2120 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2123 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2124 m_Specific(&Plan.getVectorTripCount()))))
2125 return false;
2126
2127 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2128 // count is not conveniently available as SCEV so far, so we compare directly
2129 // against the original trip count. This is stricter than necessary, as we
2130 // will only return true if the trip count == vector trip count.
2131 const SCEV *VectorTripCount =
// Fall back to the SCEV of the plan's trip count when the first lookup is
// not computable.
2133 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2134 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2135 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2136 "Trip count SCEV must be computable");
2137 ScalarEvolution &SE = *PSE.getSE();
// The condition is known true when the trip count provably equals VF * UF.
2138 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2139 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2140 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2141}
2142
2143/// Try to replace multiple active lane masks used for control flow with
2144/// a single, wide active lane mask instruction followed by multiple
2145/// extract subvector intrinsics. This applies to the active lane mask
2146/// instructions both in the loop and in the preheader.
2147/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2148/// new extracts from the first active lane mask, which has its last
2149/// operand (multiplier) set to UF.
///
/// \returns true if the plan was changed.
// NOTE(review): the first signature line (listing line 2150), the start of
// the terminator pattern (2160), the declarations of Ops (2171) and Phis
// (2182), the definition of Phi (2184) and parts of the index patterns
// (2189, 2194, 2201) are missing from this listing.
2151 unsigned UF) {
// Only applies when the option is enabled, VF is a vector VF and UF > 1.
2152 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2153 return false;
2154
2155 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2156 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2157 auto *Term = &ExitingVPBB->back();
2158
2159 using namespace llvm::VPlanPatternMatch;
2161 m_VPValue(), m_VPValue(), m_VPValue())))))
2162 return false;
2163
2164 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2165 LLVMContext &Ctx = Plan.getContext();
2166
// Helper: create UF llvm.vector.extract recipes from ALM, one per part, with
// start index VF.getKnownMinValue() * Part, and store them in Extracts[Part].
// Each extract is inserted directly after ALM.
2167 auto ExtractFromALM = [&](VPInstruction *ALM,
2168 SmallVectorImpl<VPValue *> &Extracts) {
2169 DebugLoc DL = ALM->getDebugLoc();
2170 for (unsigned Part = 0; Part < UF; ++Part) {
2172 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2173 auto *Ext =
2174 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2175 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2176 Extracts[Part] = Ext;
2177 Ext->insertAfter(ALM);
2178 }
2179 };
2180
2181 // Create a list of each active lane mask phi, ordered by unroll part.
2183 for (VPRecipeBase &R : Header->phis()) {
2185 if (!Phi)
2186 continue;
// Index is bound by the match on the phi's backedge value.
2187 VPValue *Index = nullptr;
2188 match(Phi->getBackedgeValue(),
2190 assert(Index && "Expected index from ActiveLaneMask instruction");
2191
// The unroll part is read from the constant multiplier in the index
// expression when it has the matched form.
2192 uint64_t Part;
2193 if (match(Index,
2195 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2196 Phis[Part] = Phi;
2197 else {
2198 // Anything other than a CanonicalIVIncrementForPart is part 0
2199 assert(!match(
2200 Index,
2202 Phis[0] = Phi;
2203 }
2204 }
2205
2206 assert(all_of(Phis, not_equal_to(nullptr)) &&
2207 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2208
// The part-0 phi's incoming masks become the wide masks.
2209 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2210 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2211
2212 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2213 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2214 "Expected incoming values of Phi to be ActiveLaneMasks");
2215
2216 // When using wide lane masks, the return type of the get.active.lane.mask
2217 // intrinsic is VF x UF (last operand).
2218 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2219 EntryALM->setOperand(2, ALMMultiplier);
2220 LoopALM->setOperand(2, ALMMultiplier);
2221
2222 // Create UF x extract vectors and insert into preheader.
2223 SmallVector<VPValue *> EntryExtracts(UF);
2224 ExtractFromALM(EntryALM, EntryExtracts);
2225
2226 // Create UF x extract vectors and insert before the loop compare & branch,
2227 // updating the compare to use the first extract.
2228 SmallVector<VPValue *> LoopExtracts(UF);
2229 ExtractFromALM(LoopALM, LoopExtracts);
// The terminator's operand 0 is the Not feeding the branch; it now negates
// the part-0 extract of the wide loop mask.
2230 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2231 Not->setOperand(0, LoopExtracts[0]);
2232
2233 // Update the incoming values of active lane mask phis.
2234 for (unsigned Part = 0; Part < UF; ++Part) {
2235 Phis[Part]->setStartValue(EntryExtracts[Part]);
2236 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2237 }
2238
2239 return true;
2240}
2241
2242/// Try to simplify the branch condition of \p Plan. This may restrict the
2243/// resulting plan to \p BestVF and \p BestUF.
///
/// \returns true if the exiting block's terminator was changed so that the
/// vector loop region exits in its first iteration.
// NOTE(review): the signature lines (listing lines 2244, 2246), the start of
// the second terminator pattern (2257), the initializer of VectorTripCount
// (2262, 2265) and the second alternative of the BranchOnCond case (2274)
// are missing from this listing.
2245 unsigned BestUF,
2247 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2248 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2249 auto *Term = &ExitingVPBB->back();
2250 VPValue *Cond;
2251 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2252 // Check if the branch condition compares the canonical IV increment (for main
2253 // loop), or the canonical IV increment plus an offset (for epilog loop).
2254 if (match(Term, m_BranchOnCount(
2255 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2256 m_VPValue())) ||
2258 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2259 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2260 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2261 const SCEV *VectorTripCount =
2263 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2264 VectorTripCount =
2266 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2267 "Trip count SCEV must be computable");
2268 ScalarEvolution &SE = *PSE.getSE();
2269 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2270 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
// Give up unless the trip count is provably <= VF * UF (unsigned).
2271 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2272 return false;
2273 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2275 // For BranchOnCond, check if we can prove the condition to be true using VF
2276 // and UF.
2277 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2278 return false;
2279 } else {
// Any other terminator shape is left untouched.
2280 return false;
2281 }
2282
2283 // The vector loop region only executes once. Convert terminator of the
2284 // exiting block to exit in the first iteration.
// A BranchOnTwoConds terminator is kept; only its operand 1 is set to true.
2285 if (match(Term, m_BranchOnTwoConds())) {
2286 Term->setOperand(1, Plan.getTrue());
2287 return true;
2288 }
2289
// Otherwise append a BranchOnCond on constant true (reusing the old
// terminator's debug location) and erase the old terminator.
2290 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2291 {}, Term->getDebugLoc());
2292 ExitingVPBB->appendRecipe(BOC);
2293 Term->eraseFromParent();
2294
2295 return true;
2296}
2297
2298/// From the definition of llvm.experimental.get.vector.length,
2299/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
///
/// Returns true after replacing the first EVL recipe whose AVL is provably
/// <= VF; returns false if no such recipe is found.
// NOTE(review): the signature and the start of the block-traversal loop
// (listing lines 2300-2302) and the start of the statement defining `Trunc`
// (2317) are missing from this listing.
2303 vp_depth_first_deep(Plan.getEntry()))) {
2304 for (VPRecipeBase &R : *VPBB) {
2305 VPValue *AVL;
2306 if (!match(&R, m_EVL(m_VPValue(AVL))))
2307 continue;
2308
// Skip EVL recipes whose AVL has no computable SCEV or is not provably
// <= VF (unsigned).
2309 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2310 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2311 continue;
2312 ScalarEvolution &SE = *PSE.getSE();
2313 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2314 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2315 continue;
2316
2318 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2319 R.getDebugLoc());
// When a new recipe was created (Trunc differs from AVL), try to fold it
// via tryToFoldLiveIns and use the folded value if one is returned.
2320 if (Trunc != AVL) {
2321 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2322 const DataLayout &DL = Plan.getDataLayout();
2323 VPTypeAnalysis TypeInfo(Plan);
2324 if (VPValue *Folded =
2325 tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2326 Trunc = Folded;
2327 }
// Replace the EVL with the (possibly converted) AVL and stop: only the
// first matching recipe is handled per call.
2328 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2329 return true;
2330 }
2331 }
2332 return false;
2333}
2334
// NOTE(review): the signature lines of this function (listing lines 2335 and
// 2337) are missing from this listing. The visible body runs the VF/UF
// specific simplifications below and, if any of them changed the plan,
// restricts the plan to BestVF.
2336 unsigned BestUF,
2338 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2339 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2340
// Every transform is invoked unconditionally; |= accumulates the results.
2341 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2342 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2343 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2344
2345 if (MadeChange) {
2346 Plan.setVF(BestVF);
2347 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2348 }
2349}
2350
// NOTE(review): the signature (listing line 2351), the range expression of
// the loop below (2353) and the last clause of the recurrence-kind check
// (2359) are missing from this listing.
// For each matching reduction phi, drop poison-generating flags from every
// flag-carrying recipe returned by collectUsersRecursively for that phi.
2352 for (VPRecipeBase &R :
2354 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2355 if (!PhiR)
2356 continue;
// Only some recurrence kinds are processed: Add, Mul and Sub are visible
// here; the remaining clause of this condition is on the missing line.
2357 RecurKind RK = PhiR->getRecurrenceKind();
2358 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2360 continue;
2361
2362 for (VPUser *U : collectUsersRecursively(PhiR))
2363 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2364 RecWithFlags->dropPoisonGeneratingFlags();
2365 }
2366 }
2367}
2368
2369namespace {
/// DenseMap traits used to key a map on VPSingleDefRecipe pointers by the
/// recipes' contents (kind, opcode, operands, types, ...) instead of by
/// pointer identity.
// NOTE(review): listing lines 2380, 2418, 2432, 2434 and 2437 are missing
// from this listing, so parts of the hash and of the equality check are not
// visible.
2370struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
/// Returns true if \p Def is the empty or the tombstone key.
2371 static bool isSentinel(const VPSingleDefRecipe *Def) {
2372 return Def == getEmptyKey() || Def == getTombstoneKey();
2373 }
2374
2375 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2376 /// return that source element type.
/// Returns nullptr for all other recipes.
2377 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2378 // All VPInstructions that lower to GEPs must have the i8 source element
2379 // type (as they are PtrAdds), so we omit it.
// Replicate recipes: use the underlying GetElementPtrInst, if any.
2381 .Case([](const VPReplicateRecipe *I) -> Type * {
2382 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2383 return GEP->getSourceElementType();
2384 return nullptr;
2385 })
2386 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2387 [](auto *I) { return I->getSourceElementType(); })
2388 .Default([](auto *) { return nullptr; });
2389 }
2390
2391 /// Returns true if recipe \p Def can be safely handed for CSE.
2392 static bool canHandle(const VPSingleDefRecipe *Def) {
2393 // We can extend the list of handled recipes in the future,
2394 // provided we account for the data embedded in them while checking for
2395 // equality or hashing.
2396 auto C = getOpcodeOrIntrinsicID(Def);
2397
2398 // The issue with (Insert|Extract)Value is that the index of the
2399 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2400 // VPlan.
// Recipes without opcode/intrinsic info are rejected as well.
2401 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2402 C->second == Instruction::ExtractValue)))
2403 return false;
2404
2405 // During CSE, we can only handle recipes that don't read from memory: if
2406 // they read from memory, there could be an intervening write to memory
2407 // before the next instance is CSE'd, leading to an incorrect result.
2408 return !Def->mayReadFromMemory();
2409 }
2410
2411 /// Hash the underlying data of \p Def.
2412 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2413 const VPlan *Plan = Def->getParent()->getPlan();
2414 VPTypeAnalysis TypeInfo(*Plan);
// Base hash: recipe ID, opcode/intrinsic ID, GEP source element type and
// inferred scalar type; the remaining arguments are on a missing line.
2415 hash_code Result = hash_combine(
2416 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2417 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
// Mix in the compare predicate or the induction opcode where present.
2419 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2420 if (RFlags->hasPredicate())
2421 return hash_combine(Result, RFlags->getPredicate());
2422 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2423 return hash_combine(Result, SIVSteps->getInductionOpcode());
2424 return Result;
2425 }
2426
2427 /// Check equality of underlying data of \p L and \p R.
2428 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
// Sentinel keys only compare equal to themselves.
2429 if (isSentinel(L) || isSentinel(R))
2430 return L == R;
2431 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2433 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2435 !equal(L->operands(), R->operands()))
2436 return false;
2438 "must have valid opcode info for both recipes");
// Predicates and induction opcodes are not operands, so they are compared
// explicitly.
2439 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2440 if (LFlags->hasPredicate() &&
2441 LFlags->getPredicate() !=
2442 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2443 return false;
2444 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2445 if (LSIV->getInductionOpcode() !=
2446 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2447 return false;
2448 // Recipes in replicate regions implicitly depend on predicate. If either
2449 // recipe is in a replicate region, only consider them equal if both have
2450 // the same parent.
2451 const VPRegionBlock *RegionL = L->getRegion();
2452 const VPRegionBlock *RegionR = R->getRegion();
2453 if (((RegionL && RegionL->isReplicator()) ||
2454 (RegionR && RegionR->isReplicator())) &&
2455 L->getParent() != R->getParent())
2456 return false;
// Finally, the inferred scalar result types must agree.
2457 const VPlan *Plan = L->getParent()->getPlan();
2458 VPTypeAnalysis TypeInfo(*Plan);
2459 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2460 }
2461};
2462} // end anonymous namespace
2463
2464/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2465/// Plan.
///
/// A recipe is replaced by an earlier equivalent one only when the block of
/// the earlier recipe dominates the block of the later one. The replaced
/// recipe is not erased here; only its uses are redirected.
// NOTE(review): the signature (listing line 2466), the declaration of CSEMap
// (2468) and the block-traversal loop header (2470, 2472) are missing from
// this listing.
2467 VPDominatorTree VPDT(Plan);
2469
2471 Plan.getEntry());
2473 for (VPRecipeBase &R : *VPBB) {
2474 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2475 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2476 continue;
2477 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2478 // V must dominate Def for a valid replacement.
// On failure Def is skipped and the map keeps V as the representative.
2479 if (!VPDT.dominates(V->getParent(), VPBB))
2480 continue;
2481 // Only keep flags present on both V and Def.
2482 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2483 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2484 Def->replaceAllUsesWith(V);
2485 continue;
2486 }
// First occurrence: Def becomes the representative for equal recipes.
2487 CSEMap[Def] = Def;
2488 }
2489 }
2490}
2491
2492/// Return true if we do not know how to (mechanically) hoist or sink a
2493/// non-memory or memory recipe \p R out of a loop region.
// NOTE(review): the first signature line (listing line 2494) and the
// statement controlled by the `if` below (2497) are missing from this
// listing.
2495 VPBasicBlock *LastBB) {
// Recipes that are not memory-reading replicate recipes are handled by the
// statement on the missing line.
2496 if (!isa<VPReplicateRecipe>(R) || !R.mayReadFromMemory())
2498
2499 // Check that the load doesn't alias with stores between FirstBB and LastBB.
// A load without a known memory location is treated as not movable.
2500 auto MemLoc = vputils::getMemoryLocation(R);
2501 return !MemLoc || !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB);
2502}
2503
2504/// Move loop-invariant recipes out of the vector loop region in \p Plan.
///
/// Hoists recipes whose operands are all defined outside loop regions into
/// the vector preheader, then sinks recipes whose users all live in one block
/// after the loop region into that block.
// NOTE(review): the loop headers providing VPBB for both phases (listing
// lines 2516, 2536 and 2538) are partly missing from this listing.
2505static void licm(VPlan &Plan) {
2506 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2507
2508 // Hoist any loop invariant recipes from the vector loop region to the
2509 // preheader. Perform a shallow traversal of the vector loop region, to
2510 // exclude recipes in replicate regions. Since the top-level blocks in the
2511 // vector loop region are guaranteed to execute if the vector pre-header is,
2512 // we don't need to check speculation safety.
2513 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2514 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2515 "Expected vector prehader's successor to be the vector loop region");
2517 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2518 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2519 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2520 LoopRegion->getExitingBasicBlock()))
2521 continue;
// Hoist only when every operand is defined outside loop regions.
2522 if (any_of(R.operands(), [](VPValue *Op) {
2523 return !Op->isDefinedOutsideLoopRegions();
2524 }))
2525 continue;
// Hoisted recipes are appended at the end of the preheader.
2526 R.moveBefore(*Preheader, Preheader->end());
2527 }
2528 }
2529
// The dominator tree is only needed for the assertion in the sinking phase.
2530#ifndef NDEBUG
2531 VPDominatorTree VPDT(Plan);
2532#endif
2533 // Sink recipes with no users inside the vector loop region if all users are
2534 // in the same exit block of the region.
2535 // TODO: Extend to sink recipes from inner loops.
2537 LoopRegion->getEntry());
// Recipes are visited bottom-up within each block.
2539 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2540 if (vputils::cannotHoistOrSinkRecipe(R, /*Sinking=*/true))
2541 continue;
2542
2543 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2544 assert(!RepR->isPredicated() &&
2545 "Expected prior transformation of predicated replicates to "
2546 "replicate regions");
2547 // narrowToSingleScalarRecipes should have already maximally narrowed
2548 // replicates to single-scalar replicates.
2549 // TODO: When unrolling, replicateByVF doesn't handle sunk
2550 // non-single-scalar replicates correctly.
2551 if (!RepR->isSingleScalar())
2552 continue;
2553 }
2554
2555 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2556 // support recipes with multiple defined values (e.g., interleaved loads).
2557 auto *Def = cast<VPSingleDefRecipe>(&R);
2558
2559 // Cannot sink the recipe if the user is defined in a loop region or a
2560 // non-successor of the vector loop region. Cannot sink if user is a phi
2561 // either.
// The lambda also records the common user block in SinkBB as a side effect.
2562 VPBasicBlock *SinkBB = nullptr;
2563 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2564 auto *UserR = cast<VPRecipeBase>(U);
2565 VPBasicBlock *Parent = UserR->getParent();
2566 // TODO: Support sinking when users are in multiple blocks.
2567 if (SinkBB && SinkBB != Parent)
2568 return true;
2569 SinkBB = Parent;
2570 // TODO: If the user is a PHI node, we should check the block of
2571 // incoming value. Support PHI node users if needed.
2572 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2573 Parent->getSinglePredecessor() != LoopRegion;
2574 }))
2575 continue;
2576
// A recipe without users is sunk into the region's single successor.
2577 if (!SinkBB)
2578 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2579
2580 // TODO: This will need to be a check instead of an assert after
2581 // conditional branches in vectorized loops are supported.
2582 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2583 "Defining block must dominate sink block");
2584 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2585 // just moving.
// The recipe is placed at the first non-phi position of the sink block.
2586 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2587 }
2588 }
2589}
2590
// NOTE(review): the first signature line (listing line 2591), the
// declaration of ProcessedTruncs (2599), the block-traversal loop header
// (2602-2603), the recipe filters (2605-2606, 2619, 2651) and the tail of the
// OpSizeInBits initializer (2660) are missing from this listing.
// Visible behaviour: for each recipe whose underlying instruction has a
// non-zero entry in MinBWs, zero-extend its result back to the original type
// for existing users and truncate its operands to the narrower width.
2592 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2593 if (Plan.hasScalarVFOnly())
2594 return;
2595 // Keep track of created truncates, so they can be re-used. Note that we
2596 // cannot use RAUW after creating a new truncate, as this could make
2597 // other uses have different types for their operands, making them invalidly
2598 // typed.
2600 VPTypeAnalysis TypeInfo(Plan);
2601 VPBasicBlock *PH = Plan.getVectorPreheader();
2604 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2607 continue;
2608
// Skip recipes for which the MinBWs lookup yields 0.
2609 VPValue *ResultVPV = R.getVPSingleValue();
2610 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2611 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2612 if (!NewResSizeInBits)
2613 continue;
2614
2615 // If the value wasn't vectorized, we must maintain the original scalar
2616 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2617 // skip casts which do not need to be handled explicitly here, as
2618 // redundant casts will be removed during recipe simplification.
// NOTE(review): NumProcessedRecipes is not referenced anywhere in the visible
// code; the remark above may be stale.
2620 continue;
2621
2622 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2623 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2624 assert(OldResTy->isIntegerTy() && "only integer types supported");
2625 (void)OldResSizeInBits;
2626
2627 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2628
2629 // Any wrapping introduced by shrinking this operation shouldn't be
2630 // considered undefined behavior. So, we can't unconditionally copy
2631 // arithmetic wrapping flags to VPW.
2632 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2633 VPW->dropPoisonGeneratingFlags();
2634
2635 if (OldResSizeInBits != NewResSizeInBits &&
2636 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2637 // Extend result to original width.
2638 auto *Ext = new VPWidenCastRecipe(
2639 Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2640 VPIRFlags::getDefaultFlags(Instruction::ZExt));
2641 Ext->insertAfter(&R);
// RAUW also rewrites Ext's own operand to Ext, so operand 0 is reset to
// the narrowed result right afterwards.
2642 ResultVPV->replaceAllUsesWith(Ext);
2643 Ext->setOperand(0, ResultVPV);
2644 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2645 } else {
2646 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2647 "Only ICmps should not need extending the result.");
2648 }
2649
2650 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2652 continue;
2653
2654 // Shrink operands by introducing truncates as needed.
// For selects, operand 0 (the condition) is not truncated.
2655 unsigned StartIdx =
2656 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2657 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2658 auto *Op = R.getOperand(Idx);
2659 unsigned OpSizeInBits =
2661 if (OpSizeInBits == NewResSizeInBits)
2662 continue;
2663 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
// Reuse an existing truncate of Op if one was created earlier.
2664 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2665 if (!IterIsEmpty) {
2666 R.setOperand(Idx, ProcessedIter->second);
2667 continue;
2668 }
2669
// Truncates of VPIRValue operands are created in the vector preheader; all
// other truncates are created at the position of R.
2670 VPBuilder Builder;
2671 if (isa<VPIRValue>(Op))
2672 Builder.setInsertPoint(PH);
2673 else
2674 Builder.setInsertPoint(&R);
2675 VPWidenCastRecipe *NewOp =
2676 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2677 ProcessedIter->second = NewOp;
2678 R.setOperand(Idx, NewOp);
2679 }
2680
2681 }
2682 }
2683}
2684
/// Remove BranchOnCond terminators whose condition is a constant true or
/// false, disconnect the successor that is never taken and clean up blocks
/// that are no longer reachable afterwards.
///
/// \param Plan        The plan to transform.
/// \param OnlyLatches If true, only blocks for which VPBlockUtils::isLatch
///                    holds are considered.
// NOTE(review): the definitions of AllBlocks (listing line 2692) and
// Reachable (2729-2730), the statement following the phi update for
// unreachable blocks (2742) and the range of the DeadBB loop (2745) are
// missing from this listing.
2685void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
// The dominator tree is only built when latch detection is required.
2686 std::optional<VPDominatorTree> VPDT;
2687 if (OnlyLatches)
2688 VPDT.emplace(Plan);
2689
2690 // Collect all blocks before modifying the CFG so we can identify unreachable
2691 // ones after constant branch removal.
2693
2694 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2695 VPValue *Cond;
2696 // Skip blocks that are not terminated by BranchOnCond.
2697 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2698 continue;
2699
2700 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2701 continue;
2702
2703 assert(VPBB->getNumSuccessors() == 2 &&
2704 "Two successors expected for BranchOnCond");
// A true condition removes successor 1, a false condition removes
// successor 0; non-constant conditions are left alone.
2705 unsigned RemovedIdx;
2706 if (match(Cond, m_True()))
2707 RemovedIdx = 1;
2708 else if (match(Cond, m_False()))
2709 RemovedIdx = 0;
2710 else
2711 continue;
2712
2713 VPBasicBlock *RemovedSucc =
2714 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2715 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2716 "There must be a single edge between VPBB and its successor");
2717 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2718 // these recipes.
2719 for (VPRecipeBase &R : RemovedSucc->phis())
2720 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2721
2722 // Disconnect blocks and remove the terminator.
2723 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2724 VPBB->back().eraseFromParent();
2725 }
2726
2727 // Compute which blocks are still reachable from the entry after constant
2728 // branch removal.
2731
2732 // Detach all unreachable blocks from their successors, removing their recipes
2733 // and incoming values from phi recipes.
// Tmp is a placeholder value that stands in for values defined by erased
// recipes.
2734 VPSymbolicValue Tmp(nullptr);
2735 for (VPBlockBase *B : AllBlocks) {
2736 if (Reachable.contains(B))
2737 continue;
// Iterate over a copy of the successor list (to_vector).
2738 for (VPBlockBase *Succ : to_vector(B->successors())) {
2739 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2740 for (VPRecipeBase &R : SuccBB->phis())
2741 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2743 }
2744 for (VPBasicBlock *DeadBB :
2746 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
// Redirect remaining uses to the placeholder before erasing the recipe.
2747 for (VPValue *Def : R.definedValues())
2748 Def->replaceAllUsesWith(&Tmp);
2749 R.eraseFromParent();
2750 }
2751 }
2752 }
2753}
2754
2774
2775// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2776// the loop terminator with a branch-on-cond recipe with the negated
2777// active-lane-mask as operand. Note that this turns the loop into an
2778// uncountable one. Only the existing terminator is replaced, all other existing
2779// recipes/users remain unchanged, except for poison-generating flags being
2780// dropped from the canonical IV increment. Return the created
2781// VPActiveLaneMaskPHIRecipe.
2782//
2783// The function adds the following recipes:
2784//
2785// vector.ph:
2786// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2787// %EntryALM = active-lane-mask %EntryInc, TC
2788//
2789// vector.body:
2790// ...
2791// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2792// ...
2793// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2794// %ALM = active-lane-mask %InLoopInc, TC
2795// %Negated = Not %ALM
2796// branch-on-cond %Negated
2797//
// NOTE(review): the signature lines of this function are not present in this
// listing; the body below starts directly with its first statement.
2800 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2801 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
// The start value is the zero constant of the canonical IV type.
2802 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2803 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2804 // TODO: Check if dropping the flags is needed.
2805 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
// All recipes created below reuse the debug location of the canonical IV
// increment.
2806 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2807 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2808 // we have to take unrolling into account. Each part needs to start at
2809 // Part * VF
2810 auto *VecPreheader = Plan.getVectorPreheader();
2811 VPBuilder Builder(VecPreheader);
2812
2813 // Create the ActiveLaneMask instruction using the correct start values.
2814 VPValue *TC = Plan.getTripCount();
2815 VPValue *VF = &Plan.getVF();
2816
2817 auto *EntryIncrement = Builder.createOverflowingOp(
2818 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2819 DL, "index.part.next");
2820
2821 // Create the active lane mask instruction in the VPlan preheader.
// ALMMultiplier is the constant 1 of the canonical IV type; it is passed as the
// third operand of both ActiveLaneMask instructions created in this function.
2822 VPValue *ALMMultiplier =
2823 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2824 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2825 {EntryIncrement, TC, ALMMultiplier}, DL,
2826 "active.lane.mask.entry");
2827
2828 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2829 // preheader ActiveLaneMask instruction.
2830 auto *LaneMaskPhi =
// The phi is placed at the very start of the region's entry block.
2832 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
2833 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
2834
2835 // Create the active lane mask for the next iteration of the loop before the
2836 // original terminator.
2837 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2838 Builder.setInsertPoint(OriginalTerminator);
2839 auto *InLoopIncrement = Builder.createOverflowingOp(
2841 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2842 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2843 {InLoopIncrement, TC, ALMMultiplier}, DL,
2844 "active.lane.mask.next");
// The in-loop mask is appended as a further operand of the phi once it exists.
2845 LaneMaskPhi->addOperand(ALM);
2846
2847 // Replace the original terminator with BranchOnCond. We have to invert the
2848 // mask here because a true condition means jumping to the exit block.
2849 auto *NotMask = Builder.createNot(ALM, DL);
2850 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2851 OriginalTerminator->eraseFromParent();
2852 return LaneMaskPhi;
2853}
2854
2856 bool UseActiveLaneMaskForControlFlow) {
// Replaces the plan's header mask with an active-lane-mask. The mask is either
// the lane-mask phi created by addVPLaneMaskPhiAndUpdateExitBranch (when
// UseActiveLaneMaskForControlFlow is set) or a new ActiveLaneMask instruction
// inserted right after the widened canonical IV.
// NOTE(review): the first line of this function's signature is not present in
// this listing.
2857 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2858 auto *WideCanonicalIV = vputils::findUserOf<VPWidenCanonicalIVRecipe>(
2859 LoopRegion->getCanonicalIV());
2860 assert(WideCanonicalIV &&
2861 "Must have widened canonical IV when tail folding!");
// NOTE(review): HeaderMask is dereferenced below without a null check, whereas
// fixupVFUsersForEVL tests the result of vputils::findHeaderMask for null.
// Confirm a header mask always exists on this path.
2862 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2863 VPSingleDefRecipe *LaneMask;
2864 if (UseActiveLaneMaskForControlFlow) {
2865 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2866 } else {
2867 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
// Constant 1 of the canonical IV type, used as the third ActiveLaneMask
// operand.
2868 VPValue *ALMMultiplier =
2869 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2870 LaneMask =
2871 B.createNaryOp(VPInstruction::ActiveLaneMask,
2872 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2873 nullptr, "active.lane.mask");
2874 }
2875
2876 // Walk users of WideCanonicalIV and replace the header mask of the form
2877 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2878 // removing the old one to ensure there is always only a single header mask.
2879 HeaderMask->replaceAllUsesWith(LaneMask);
2880 HeaderMask->eraseFromParent();
2881}
2882
/// Pattern matcher that accepts either the mask In itself or a logical-and of
/// In with one further value. When In alone is matched, Out is set to nullptr;
/// otherwise Out is the capture handed to m_VPValue for the other logical-and
/// operand.
2883template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2884 Op0_t In;
// NOTE(review): the declaration of the Out member is not present in this
// listing; the constructor below initializes it from a reference parameter.
2886
2887 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2888
2889 template <typename OpTy> bool match(OpTy *V) const {
// Exact match on the mask: nothing remains, which is reported as nullptr.
2890 if (m_Specific(In).match(V)) {
2891 Out = nullptr;
2892 return true;
2893 }
// Otherwise require (logical-and In, X) and capture X through Out.
2894 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2895 }
2896};
2897
2898/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2899/// Returns the remaining part \p Out if so, or nullptr otherwise.
2900template <typename Op0_t, typename Op1_t>
2901static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2902 Op1_t &Out) {
2903 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2904}
2905
2906static std::optional<Intrinsic::ID> getVPDivRemIntrinsic(Intrinsic::ID IntrID) {
2907 switch (IntrID) {
2908 case Intrinsic::masked_udiv:
2909 return Intrinsic::vp_udiv;
2910 case Intrinsic::masked_sdiv:
2911 return Intrinsic::vp_sdiv;
2912 case Intrinsic::masked_urem:
2913 return Intrinsic::vp_urem;
2914 case Intrinsic::masked_srem:
2915 return Intrinsic::vp_srem;
2916 default:
2917 return std::nullopt;
2918 }
2919}
2920
2921/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2922/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2923/// recipe could be created.
2924/// \p HeaderMask Header Mask.
2925/// \p CurRecipe Recipe to be transformed.
2926/// \p TypeInfo VPlan-based type analysis.
2927/// \p EVL The explicit vector length parameter of vector-predication
2928/// intrinsics.
2930 VPRecipeBase &CurRecipe,
2931 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
// The returned recipe is newly allocated and not yet inserted. Helper recipes
// created along the way (cloned end pointers, vp.reverse, EVL loads) are
// inserted before CurRecipe inside this function.
2932 VPlan *Plan = CurRecipe.getParent()->getPlan();
2933 DebugLoc DL = CurRecipe.getDebugLoc();
2934 VPValue *Addr, *Mask, *EndPtr;
2935
2936 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
// The end pointer is cloned, so the original recipe is left untouched.
2937 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2938 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2939 EVLEndPtr->insertBefore(&CurRecipe);
2940 EVLEndPtr->setOperand(1, &EVL);
2941 return EVLEndPtr;
2942 };
2943
// Wraps a value in an experimental_vp_reverse intrinsic with an all-true mask
// and EVL, inserted before CurRecipe. A null value is passed through as null.
2944 auto GetVPReverse = [&CurRecipe, &EVL, &TypeInfo, Plan,
2946 if (!V)
2947 return nullptr;
2948 auto *Reverse = new VPWidenIntrinsicRecipe(
2949 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
2950 TypeInfo.inferScalarType(V), {}, {}, DL);
2951 Reverse->insertBefore(&CurRecipe);
2952 return Reverse;
2953 };
2954
// Masked load whose mask is the header mask, optionally and-ed with another
// mask: becomes an EVL load carrying only the remaining mask.
2955 if (match(&CurRecipe,
2956 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
2957 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2958 EVL, Mask);
2959
// Reverse of a masked load through a VF-based vector end pointer with a
// reversed mask: rebuild the load on an EVL-based end pointer and reverse the
// loaded value with vp.reverse.
2960 VPValue *ReversedVal;
2961 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
2962 match(ReversedVal,
2963 m_MaskedLoad(m_VPValue(EndPtr),
2964 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
2965 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
2966 Mask = GetVPReverse(Mask);
2967 Addr = AdjustEndPtr(EndPtr);
2968 auto *LoadR = new VPWidenLoadEVLRecipe(
2969 *cast<VPWidenLoadRecipe>(ReversedVal), Addr, EVL, Mask);
2970 LoadR->insertBefore(&CurRecipe);
2971 return new VPWidenIntrinsicRecipe(
2972 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
2973 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
2974 }
2975
// Masked store, same mask handling as the masked load above.
2976 VPValue *StoredVal;
2977 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
2978 m_RemoveMask(HeaderMask, Mask))))
2979 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2980 StoredVal, EVL, Mask);
2981
// Masked store of a reversed value through a VF-based vector end pointer with a
// reversed mask: mask and stored value are both re-reversed with vp.reverse.
2982 if (match(&CurRecipe,
2983 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
2984 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
2985 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
2986 Mask = GetVPReverse(Mask);
2987 Addr = AdjustEndPtr(EndPtr);
2988 StoredVal = GetVPReverse(ReversedVal);
2989 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2990 StoredVal, EVL, Mask);
2991 }
2992
// Conditional reduction whose condition contains the header mask.
2993 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2994 if (Rdx->isConditional() &&
2995 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2996 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2997
// Masked interleave group whose mask contains the header mask.
2998 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2999 if (Interleave->getMask() &&
3000 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3001 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3002
// select(headermask, lhs, rhs) -> vp.merge true, lhs, rhs, evl
3003 VPValue *LHS, *RHS;
3004 if (match(&CurRecipe,
3005 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3006 return new VPWidenIntrinsicRecipe(
3007 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3008 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3009
// select(headermask && mask, lhs, rhs) -> vp.merge mask, lhs, rhs, evl
// A select on the bare header mask already returned above, so Mask is
// presumably never nullptr when this case fires.
3010 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3011 m_VPValue(RHS))))
3012 return new VPWidenIntrinsicRecipe(
3013 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3014 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3015
// last-active-lane(headermask) -> (EVL converted to the result type) - 1
3016 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3017 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3018 VPValue *ZExt = VPBuilder(&CurRecipe)
3020 &EVL, Ty, TypeInfo.inferScalarType(&EVL), DL);
3021 return new VPInstruction(
3022 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3023 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3024 }
3025
3026 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
3027 if (match(&CurRecipe,
3029 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
3030 return new VPWidenIntrinsicRecipe(
3031 Intrinsic::vp_merge, {RHS, Plan->getTrue(), LHS, &EVL},
3032 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3033
// Masked div/rem intrinsics whose operand 2 contains the header mask become the
// vp.* intrinsic chosen by getVPDivRemIntrinsic. If no mask remains after
// removing the header mask, an all-true mask is used.
3034 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(&CurRecipe))
3035 if (auto VPID = getVPDivRemIntrinsic(IntrR->getVectorIntrinsicID()))
3036 if (match(IntrR->getOperand(2), m_RemoveMask(HeaderMask, Mask)))
3037 return new VPWidenIntrinsicRecipe(*VPID,
3038 {IntrR->getOperand(0),
3039 IntrR->getOperand(1),
3040 Mask ? Mask : Plan->getTrue(), &EVL},
3041 IntrR->getResultType(), {}, {}, DL);
3042
// No EVL-based form is known for this recipe.
3043 return nullptr;
3044}
3045
3046/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3047/// The transforms here need to preserve the original semantics.
// NOTE(review): the signature line and the loop header of the search below are
// not present in this listing.
3049 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3050 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3053 m_VPValue(EVL))) &&
3054 match(EVL, m_EVL(m_VPValue()))) {
3055 HeaderMask = R.getVPSingleValue();
3056 break;
3057 }
3058 }
// Nothing to do when the plan has no EVL-based header mask.
3059 if (!HeaderMask)
3060 return;
3061
3062 VPTypeAnalysis TypeInfo(Plan);
// Replaced recipes are only recorded here and erased at the very end.
3063 SmallVector<VPRecipeBase *> OldRecipes;
// First pass: try to turn each transitive user of the header mask into an
// EVL-based recipe. The replacement is inserted before the old recipe and each
// defined value is redirected to its counterpart.
3064 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3066 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3067 NewR->insertBefore(R);
3068 for (auto [Old, New] :
3069 zip_equal(R->definedValues(), NewR->definedValues()))
3070 Old->replaceAllUsesWith(New);
3071 OldRecipes.push_back(R);
3072 }
3073 }
3074
3075 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3076 // False, EVL)
3077 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3078 VPValue *Mask;
3079 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3080 auto *LogicalAnd = cast<VPInstruction>(U);
3081 auto *Merge = new VPWidenIntrinsicRecipe(
3082 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3083 TypeInfo.inferScalarType(Mask), {}, {}, LogicalAnd->getDebugLoc());
3084 Merge->insertBefore(LogicalAnd);
3085 LogicalAnd->replaceAllUsesWith(Merge);
3086 OldRecipes.push_back(LogicalAnd);
3087 }
3088 }
3089
3090 // Erase old recipes at the end so we don't invalidate TypeInfo.
// Recipes are erased in reverse order of recording. The operands of each erased
// recipe are copied first, because the per-operand step below runs after the
// recipe is gone.
3091 for (VPRecipeBase *R : reverse(OldRecipes)) {
3092 SmallVector<VPValue *> PossiblyDead(R->operands());
3093 R->eraseFromParent();
3094 for (VPValue *Op : PossiblyDead)
3096 }
3097}
3098
3099/// After replacing the canonical IV with an EVL-based IV, fix up recipes that
3100/// use VF to use the EVL instead to avoid incorrect updates on the penultimate
3101/// iteration.
3102static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3103 VPTypeAnalysis TypeInfo(Plan);
3104 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3105 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3106
// Step 1: redirect selected users of VF to EVL.
// NOTE(review): the predicates of this assert and of the replaceUsesWithIf
// callback are not present in this listing.
3107 assert(all_of(Plan.getVF().users(),
3110 "User of VF that we can't transform to EVL.");
3111 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3113 });
3114
// Step 2: redirect selected users of VFxUF to EVL.
3115 assert(all_of(Plan.getVFxUF().users(),
3117 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3118 m_Specific(&Plan.getVFxUF())),
3120 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3121 "increment of the canonical induction.");
3122 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3123 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3124 // canonical induction must not be updated.
3126 });
3127
3128 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3129 // contained.
3130 bool ContainsFORs =
3132 if (ContainsFORs) {
3133 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3134 VPValue *MaxEVL = &Plan.getVF();
3135 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3136 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3137 MaxEVL = Builder.createScalarZExtOrTrunc(
3138 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3139 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3140
// prev.evl is a scalar phi with incoming values MaxEVL and the current EVL,
// placed at the first non-phi position of the header.
3141 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3142 VPValue *PrevEVL = Builder.createScalarPhi(
3143 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3144
// Each matched recipe is replaced by an experimental_vp_splice intrinsic that
// takes both the previous and the current EVL.
3147 for (VPRecipeBase &R : *VPBB) {
3148 VPValue *V1, *V2;
3149 if (!match(&R,
3151 m_VPValue(V1), m_VPValue(V2))))
3152 continue;
3153 VPValue *Imm = Plan.getOrAddLiveIn(
3156 Intrinsic::experimental_vp_splice,
3157 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3158 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3159 R.getDebugLoc());
3160 VPSplice->insertBefore(&R);
3161 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3162 }
3163 }
3164 }
3165
// Without a header mask there is nothing left to rewrite.
3166 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3167 if (!HeaderMask)
3168 return;
3169
3170 // Ensure that any reduction that uses a select to mask off tail lanes does so
3171 // in the vector loop, not the middle block, since EVL tail folding can have
3172 // tail elements in the penultimate iteration.
3173 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3174 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3175 m_VPValue(), m_VPValue()))))
3176 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3177 Plan.getVectorLoopRegion();
3178 return true;
3179 }));
3180
3181 // Replace header masks with a mask equivalent to predicating by EVL:
3182 //
3183 // icmp ule widen-canonical-iv backedge-taken-count
3184 // ->
3185 // icmp ult step-vector, EVL
// The new mask is built immediately after the recipe that defines EVL.
3186 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3187 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3188 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3189 VPValue *EVLMask = Builder.createICmp(
3191 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3192 HeaderMask->replaceAllUsesWith(EVLMask);
3193}
3194
3195/// Converts a tail folded vector loop region to step by
3196/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3197/// iteration.
3198///
3199/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3200/// replaces all uses of the canonical IV except for the canonical IV
3201/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3202/// only for loop iterations counting after this transformation.
3203///
3204/// - The header mask is replaced with a header mask based on the EVL.
3205///
3206/// - Plans with FORs have a new phi added to keep track of the EVL of the
3207/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3208/// @llvm.vp.splice.
3209///
3210/// The function uses the following definitions:
3211/// %StartV is the canonical induction start value.
3212///
3213/// The function adds the following recipes:
3214///
3215/// vector.ph:
3216/// ...
3217///
3218/// vector.body:
3219/// ...
3220/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3221/// [ %NextIter, %vector.body ]
3222/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3223/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3224/// ...
3225/// %OpEVL = cast i32 %VPEVL to IVSize
3226/// %NextIter = add IVSize %OpEVL, %CurrentIter
3227/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3228/// ...
3229///
3230/// If MaxSafeElements is provided, the function adds the following recipes:
3231/// vector.ph:
3232/// ...
3233///
3234/// vector.body:
3235/// ...
3236/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3237/// [ %NextIter, %vector.body ]
3238/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3239/// %cmp = cmp ult %AVL, MaxSafeElements
3240/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3241/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3242/// ...
3243/// %OpEVL = cast i32 %VPEVL to IVSize
3244/// %NextIter = add IVSize %OpEVL, %CurrentIter
3245/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3246/// ...
3247///
3249 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
// Plans that only have a scalar VF are left unchanged.
3250 if (Plan.hasScalarVFOnly())
3251 return;
3252 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3253 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3254
3255 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3256 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3257 VPValue *StartV = Plan.getZero(CanIVTy);
3258 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3259
3260 // Create the CurrentIteration recipe in the vector loop.
3261 auto *CurrentIteration =
3263 CurrentIteration->insertBefore(*Header, Header->begin());
3264 VPBuilder Builder(Header, Header->getFirstNonPhi());
3265 // Create the AVL (application vector length), starting from TC -> 0 in steps
3266 // of EVL.
// The phi is created with only the trip count as incoming value; the backedge
// value (avl.next) is appended further down once it has been built.
3267 VPPhi *AVLPhi = Builder.createScalarPhi(
3268 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3269 VPValue *AVL = AVLPhi;
3270
3271 if (MaxSafeElements) {
3272 // Support for MaxSafeDist for correct loop emission.
// safe_avl = AVL <u MaxSafeElements ? AVL : MaxSafeElements
3273 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3274 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3275 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3276 "safe_avl");
3277 }
3278 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3279 DebugLoc::getUnknown(), "evl");
3280
// The IV and AVL updates are emitted right before the canonical IV increment.
3281 Builder.setInsertPoint(CanonicalIVIncrement);
3282 VPValue *OpVPEVL = VPEVL;
3283
// Convert the EVL from i32 to the canonical IV type before using it in the
// updates below.
3284 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3285 OpVPEVL = Builder.createScalarZExtOrTrunc(
3286 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3287
// The new increment reuses the debug location and no-wrap flags of the
// canonical IV increment.
3288 auto *NextIter = Builder.createAdd(
3289 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3290 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3291 CurrentIteration->addOperand(NextIter);
3292
3293 VPValue *NextAVL =
3294 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3295 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3296 AVLPhi->addOperand(NextAVL);
3297
3298 fixupVFUsersForEVL(Plan, *VPEVL);
3299 removeDeadRecipes(Plan);
3300
3301 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3302 // except for the canonical IV increment.
// replaceAllUsesWith also rewrites the increment's use, so operand 0 of the
// increment is set back to the canonical IV right afterwards.
3303 CanonicalIV->replaceAllUsesWith(CurrentIteration);
3304 CanonicalIVIncrement->setOperand(0, CanonicalIV);
3305 // TODO: support unroll factor > 1.
3306 Plan.setUF(1);
3307}
3308
// Replaces the plan's VPCurrentIterationPHIRecipe (if any) with a plain scalar
// phi and substitutes the current-iteration increment for the canonical IV
// increment.
// NOTE(review): the signature line and the block traversal header are not
// present in this listing.
3310 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3311 // There should be only one VPCurrentIteration in the entire plan.
3312 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3313
3316 for (VPRecipeBase &R : VPBB->phis())
3317 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3318 assert(!CurrentIteration &&
3319 "Found multiple CurrentIteration. Only one expected");
3320 CurrentIteration = PhiR;
3321 }
3322
3323 // Early return if it is not variable-length stepping.
3324 if (!CurrentIteration)
3325 return;
3326
// Capture the parent block and the backedge value before the phi is erased.
3327 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3328 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3329
3330 // Convert CurrentIteration to concrete recipe.
// The replacement keeps the same two incoming values (start value and
// increment) and the same debug location.
3331 auto *ScalarR =
3332 VPBuilder(CurrentIteration)
3334 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3335 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3336 CurrentIteration->replaceAllUsesWith(ScalarR);
3337 CurrentIteration->eraseFromParent();
3338
3339 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
// The header's first recipe is taken to be the canonical IV phi. Its increment
// is found as a user that is an add involving VFxUF.
3340 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3341 if (auto *CanIVInc = vputils::findUserOf(
3342 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3343 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3344 CanIVInc->eraseFromParent();
3345 }
3346}
3347
// For an EVL tail-folded loop, rewrites the latch branch condition to compare
// the next AVL value against zero. Returns without changes when the plan has no
// vector loop region, the header is empty, the header does not start with a
// VPCurrentIterationPHIRecipe, the EVL increment pattern is absent, or the
// latch already branches on a constant true.
// NOTE(review): the signature line of this function is not present in this
// listing.
3349 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3350 if (!LoopRegion)
3351 return;
3352 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3353 if (Header->empty())
3354 return;
3355 // The EVL IV is always at the beginning.
3356 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3357 if (!EVLPhi)
3358 return;
3359
3360 // Bail if not an EVL tail folded loop.
// The backedge value must be EVLPhi plus an EVL (possibly zero-extended); the
// operand of that EVL is captured as AVL.
3361 VPValue *AVL;
3362 if (!match(EVLPhi->getBackedgeValue(),
3363 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3364 return;
3365
3366 // The AVL may be capped to a safe distance.
// If so, continue with the uncapped value.
3367 VPValue *SafeAVL, *UnsafeAVL;
3368 if (match(AVL,
3370 m_VPValue(SafeAVL)),
3371 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3372 AVL = UnsafeAVL;
3373
// Find the AVL's backedge value; the pattern also pins the other incoming value
// to the plan's trip count.
3374 VPValue *AVLNext;
3375 [[maybe_unused]] bool FoundAVLNext =
3377 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3378 assert(FoundAVLNext && "Didn't find AVL backedge?");
3379
3380 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3381 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
// A latch that already branches on constant true needs no rewrite.
3382 if (match(LatchBr, m_BranchOnCond(m_True())))
3383 return;
3384
// Sanity-check (asserts only) that the existing exit condition has the expected
// canonical shape before replacing it.
3385 VPValue *CanIVInc;
3386 [[maybe_unused]] bool FoundIncrement = match(
3387 LatchBr,
3389 m_Specific(&Plan.getVectorTripCount()))));
3390 assert(FoundIncrement &&
3391 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3392 m_Specific(&Plan.getVFxUF()))) &&
3393 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3394 "trip count");
3395
// New exit condition: AVLNext == 0, with the zero taken in AVLNext's scalar
// type.
3396 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3397 VPBuilder Builder(LatchBr);
3398 LatchBr->setOperand(
3399 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3400}
3401
3403 VPlan &Plan, PredicatedScalarEvolution &PSE,
3404 const DenseMap<Value *, const SCEV *> &StridesMap) {
3405 // Replace VPValues for known constant strides guaranteed by predicate scalar
3406 // evolution.
// A use may be rewritten if its recipe lies inside a region, or sits in the
// single predecessor block of the vector loop region.
3407 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3408 auto *R = cast<VPRecipeBase>(&U);
3409 return R->getRegion() ||
3410 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3411 };
// Collects, per handled stride value, the SCEV that PSE reports for it. It is
// used to rewrite VPExpandSCEVRecipes in the loop at the end.
3412 ValueToSCEVMapTy RewriteMap;
3413 for (const SCEV *Stride : StridesMap.values()) {
3414 using namespace SCEVPatternMatch;
3415 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3416 const APInt *StrideConst;
3417 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3418 // Only handle constant strides for now.
3419 continue;
3420
// Direct uses of the stride live-in (if the plan has one) get the constant.
3421 auto *CI = Plan.getConstantInt(*StrideConst);
3422 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3423 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3424
3425 // The versioned value may not be used in the loop directly but through a
3426 // sext/zext. Add new live-ins in those cases.
3427 for (Value *U : StrideV->users()) {
3429 continue;
3430 VPValue *StrideVPV = Plan.getLiveIn(U);
3431 if (!StrideVPV)
3432 continue;
// Extend the constant to the user's bit width: sign-extend for a SExtInst user,
// zero-extend otherwise.
3433 unsigned BW = U->getType()->getScalarSizeInBits();
3434 APInt C =
3435 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3436 VPValue *CI = Plan.getConstantInt(C);
3437 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3438 }
3439 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3440 }
3441
// Rewrite expand-SCEV recipes in the plan's entry block whose expression
// changes under RewriteMap.
3442 for (VPRecipeBase &R : *Plan.getEntry()) {
3443 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3444 if (!ExpSCEV)
3445 continue;
3446 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3447 auto *NewSCEV =
3448 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3449 if (NewSCEV != ScevExpr) {
3450 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3451 ExpSCEV->replaceAllUsesWith(NewExp);
// Keep the plan's trip count pointing at the rewritten expansion.
3452 if (Plan.getTripCount() == ExpSCEV)
3453 Plan.resetTripCount(NewExp);
3454 }
3455 }
3456}
3457
// NOTE(review): the signature line and the declarations of Visited and Worklist
// are not present in this listing.
3459 // Collect recipes in the backward slice of `Root` that may generate a poison
3460 // value that is used after vectorization.
3462 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3464 Worklist.push_back(Root);
3465
3466 // Traverse the backward slice of Root through its use-def chain.
3467 while (!Worklist.empty()) {
3468 VPRecipeBase *CurRec = Worklist.pop_back_val();
3469
// Each recipe is processed at most once.
3470 if (!Visited.insert(CurRec).second)
3471 continue;
3472
3473 // Prune search if we find another recipe generating a widen memory
3474 // instruction. Widen memory instructions involved in address computation
3475 // will lead to gather/scatter instructions, which don't need to be
3476 // handled.
3478 VPHeaderPHIRecipe>(CurRec))
3479 continue;
3480
3481 // This recipe contributes to the address computation of a widen
3482 // load/store. If the underlying instruction has poison-generating flags,
3483 // drop them directly.
3484 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3485 VPValue *A, *B;
3486 // Dropping disjoint from an OR may yield incorrect results, as some
3487 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3488 // for dependence analysis). Instead, replace it with an equivalent Add.
3489 // This is possible as all users of the disjoint OR only access lanes
3490 // where the operands are disjoint or poison otherwise.
3491 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3492 RecWithFlags->isDisjoint()) {
3493 VPBuilder Builder(RecWithFlags);
3494 VPInstruction *New =
3495 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3496 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3497 RecWithFlags->replaceAllUsesWith(New);
3498 RecWithFlags->eraseFromParent();
// Continue the traversal from the replacement, since the original recipe has
// just been erased.
3499 CurRec = New;
3500 } else
3501 RecWithFlags->dropPoisonGeneratingFlags();
3502 } else {
3505 (void)Instr;
3506 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3507 "found instruction with poison generating flags not covered by "
3508 "VPRecipeWithIRFlags");
3509 }
3510
3511 // Add new definitions to the worklist.
3512 for (VPValue *Operand : CurRec->operands())
3513 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3514 Worklist.push_back(OpDef);
3515 }
3516 });
3517
3518 // We want to exclude the tail folding case, as we don't need to drop flags
3519 // for operations computing the first lane in this case: the first lane of the
3520 // header mask must always be true.
// True only for a non-null mask that is not the plan's header mask. Unmasked
// recipes (null mask) therefore do not trigger the backward-slice walk.
3521 auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
3522 return Mask && !vputils::isHeaderMask(Mask, Plan);
3523 };
3524
3525 // Traverse all the recipes in the VPlan and collect the poison-generating
3526 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3527 // VPInterleaveRecipe.
3528 auto Iter =
3531 for (VPRecipeBase &Recipe : *VPBB) {
3532 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
// Widened memory recipes are only considered when they are consecutive.
3533 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3534 if (AddrDef && WidenRec->isConsecutive() &&
3535 IsNotHeaderMask(WidenRec->getMask()))
3536 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3537 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3538 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3539 if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
3540 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3541 }
3542 }
3543 }
3544}
3545
/// Replace the widened memory recipes of each interleave group in
/// \p InterleaveGroups with a single VPInterleaveRecipe placed at the group's
/// insert position. Does nothing when \p InterleaveGroups is empty.
/// \p EpilogueAllowed feeds the decision whether the group needs a mask for
/// gaps.
/// NOTE(review): the line holding this function's return type and name is not
/// present in this listing, and neither are the declarations of
/// IRMemberToRecipe and NW used below.
3547 VPlan &Plan,
3549 &InterleaveGroups,
3550 const bool &EpilogueAllowed) {
3551 if (InterleaveGroups.empty())
3552 return;
3553
// Record, for every widened memory recipe, the mapping from its ingredient
// instruction to the recipe.
3555 for (VPBasicBlock *VPBB :
3558 for (VPRecipeBase &R : make_filter_range(*VPBB, [](VPRecipeBase &R) {
3559 return isa<VPWidenMemoryRecipe>(&R);
3560 })) {
3561 auto *MemR = cast<VPWidenMemoryRecipe>(&R);
3562 IRMemberToRecipe[&MemR->getIngredient()] = MemR;
3563 }
3564
3565 // Interleave memory: for each Interleave Group we marked earlier as relevant
3566 // for this VPlan, replace the Recipes widening its memory instructions with a
3567 // single VPInterleaveRecipe at its insertion point.
3568 VPDominatorTree VPDT(Plan);
3569 for (const auto *IG : InterleaveGroups) {
3570 // Skip interleave groups where members don't have recipes. This can happen
3571 // when removeDeadRecipes removes recipes that are part of interleave groups
3572 // but have no users.
3573 if (llvm::any_of(IG->members(), [&IRMemberToRecipe](Instruction *Member) {
3574 return !IRMemberToRecipe.contains(Member);
3575 }))
3576 continue;
3577
// Member 0 seeds the metadata and (for stores) the stored-value list; the
// loop below appends the stored values of the remaining members in index
// order and intersects their metadata into InterleaveMD.
3578 auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
3579 VPIRMetadata InterleaveMD(*Start);
3580 SmallVector<VPValue *, 4> StoredValues;
3581 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start->getAsRecipe()))
3582 StoredValues.push_back(StoreR->getStoredValue());
3583 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3584 Instruction *MemberI = IG->getMember(I);
// A null member is a gap in the group.
3585 if (!MemberI)
3586 continue;
3587 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
3588 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR->getAsRecipe()))
3589 StoredValues.push_back(StoreR->getStoredValue());
3590 InterleaveMD.intersect(*MemoryR);
3591 }
3592
// A gap mask is required either when the group needs a scalar epilogue that
// is not allowed, or when this is a store group (StoredValues non-empty)
// that is not full.
3593 bool NeedsMaskForGaps =
3594 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3595 (!StoredValues.empty() && !IG->isFull());
3596
3597 Instruction *IRInsertPos = IG->getInsertPos();
3598 auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
3599 VPRecipeBase *InsertPosR = InsertPos->getAsRecipe();
3600
// If the insert position's pointer operand (looking through pointer casts)
// is a GEP, take its no-wrap flags with the no-unsigned-wrap flag removed.
3602 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3603 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3604 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3605
3606 // Get or create the start address for the interleave group.
3607 VPValue *Addr = Start->getAddr();
3608 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3609 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPosR)) {
3610 // We cannot re-use the address of member zero because it does not
3611 // dominate the insert position. Instead, use the address of the insert
3612 // position and create a PtrAdd adjusting it to the address of member
3613 // zero.
3614 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3615 // InsertPos or sink loads above zero members to join it.
3616 assert(IG->getIndex(IRInsertPos) != 0 &&
3617 "index of insert position shouldn't be zero");
3618 auto &DL = IRInsertPos->getDataLayout();
// Byte distance from member zero to the insert position: alloc size of the
// accessed type times the member index, as a 32-bit signed constant. It is
// negated below to step back from the insert position's address.
3619 APInt Offset(32,
3620 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3621 IG->getIndex(IRInsertPos),
3622 /*IsSigned=*/true);
3623 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3624 VPBuilder B(InsertPosR);
3625 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3626 }
3627 // If the group is reverse, adjust the index to refer to the last vector
3628 // lane instead of the first. We adjust the index from the first vector
3629 // lane, rather than directly getting the pointer for lane VF - 1, because
3630 // the pointer operand of the interleaved access is supposed to be uniform.
3631 if (IG->isReverse()) {
3632 auto *ReversePtr = new VPVectorEndPointerRecipe(
3633 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3634 -(int64_t)IG->getFactor(), NW, InsertPosR->getDebugLoc());
3635 ReversePtr->insertBefore(InsertPosR);
3636 Addr = ReversePtr;
3637 }
// The interleave recipe takes its mask and debug location from the insert
// position's recipe and is inserted immediately before it.
3638 auto *VPIG = new VPInterleaveRecipe(
3639 IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps,
3640 InterleaveMD, InsertPosR->getDebugLoc());
3641 VPIG->insertBefore(InsertPosR);
3642
// Redirect users of each member to the matching value of the interleave
// recipe and erase the member recipes. J counts only members whose
// instruction has a non-void type, so gaps and void-typed members do not
// consume a result slot.
3643 unsigned J = 0;
3644 for (unsigned i = 0; i < IG->getFactor(); ++i)
3645 if (Instruction *Member = IG->getMember(i)) {
3646 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member)->getAsRecipe();
3647 if (!Member->getType()->isVoidTy()) {
3648 VPValue *OriginalV = MemberR->getVPSingleValue();
3649 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3650 J++;
3651 }
3652 MemberR->eraseFromParent();
3653 }
3654 }
3655}
3656
3657 /// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3658 /// value, phi and backedge value. In the following example:
3659 ///
3660 /// vector.ph:
3661 /// Successor(s): vector loop
3662 ///
3663 /// <x1> vector loop: {
3664 /// vector.body:
3665 /// WIDEN-INDUCTION %i = phi %start, %step, %vf
3666 /// ...
3667 /// EMIT branch-on-count ...
3668 /// No successors
3669 /// }
3670 ///
3671 /// WIDEN-INDUCTION will get expanded to:
3672 ///
3673 /// vector.ph:
3674 /// ...
3675 /// vp<%induction.start> = ...
3676 /// vp<%induction.increment> = ...
3677 ///
3678 /// Successor(s): vector loop
3679 ///
3680 /// <x1> vector loop: {
3681 /// vector.body:
3682 /// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3683 /// ...
3684 /// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3685 /// EMIT branch-on-count ...
3686 /// No successors
3687 /// }
///
/// All uses of the induction recipe are redirected to the new widened phi; the
/// recipe itself is not erased by this function.
/// NOTE(review): the line with the function name and first parameter is not
/// present in this listing, and neither are the declarations of AddOp, MulOp
/// and ExitingBB or the initializer of IVIntTy.
3688 static void
3690 VPTypeAnalysis &TypeInfo) {
3691 VPlan *Plan = WidenIVR->getParent()->getPlan();
3692 VPValue *Start = WidenIVR->getStartValue();
3693 VPValue *Step = WidenIVR->getStepValue();
3694 VPValue *VF = WidenIVR->getVFValue();
3695 DebugLoc DL = WidenIVR->getDebugLoc();
3696
3697 // The value from the original loop to which we are mapping the new induction
3698 // variable.
3699 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3700
3701 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
// The recipe's flags are reused on every arithmetic operation created below.
3704 VPIRFlags Flags = *WidenIVR;
// Integer inductions use add/mul. Otherwise the add opcode comes from the
// induction descriptor and the multiply is an fmul.
3705 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3706 AddOp = Instruction::Add;
3707 MulOp = Instruction::Mul;
3708 } else {
3709 AddOp = ID.getInductionOpcode();
3710 MulOp = Instruction::FMul;
3711 }
3712
3713 // If the phi is truncated, truncate the start and step values.
3714 VPBuilder Builder(Plan->getVectorPreheader());
3715 Type *StepTy = TypeInfo.inferScalarType(Step);
3716 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3717 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3718 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3719 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3720 StepTy = Ty;
3721 }
3722
3723 // Construct the initial value of the vector IV in the vector loop preheader.
3724 Type *IVIntTy =
// The step vector is created with integer type IVIntTy and converted with
// uitofp when the step is floating point.
3726 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3727 if (StepTy->isFloatingPointTy())
3728 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3729
3730 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3731 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3732
// Init = splat(Start) AddOp (step-vector MulOp splat(Step)).
3733 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3734 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3735 DebugLoc::getUnknown(), "induction");
3736
3737 // Create the widened phi of the vector IV.
3738 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3739 Init, WidenIVR->getDebugLoc(), "vec.ind");
3740
3741 // Create the backedge value for the vector IV.
3742 VPValue *Inc;
3743 VPValue *Prev;
3744 // If unrolled, use the increment and prev value from the operands.
3745 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3746 Inc = SplatVF;
3747 Prev = WidenIVR->getLastUnrolledPartOperand();
3748 } else {
// When VF is defined by a recipe, emit the increment computation directly
// after that recipe; otherwise Builder keeps its current insert point.
3749 if (VPRecipeBase *R = VF->getDefiningRecipe())
3750 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3751 // Multiply the vectorization factor by the step using integer or
3752 // floating-point arithmetic as appropriate.
3753 if (StepTy->isFloatingPointTy())
3754 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3755 DL);
3756 else
3757 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3758 TypeInfo.inferScalarType(VF), DL);
3759
// Inc = broadcast(Step MulOp VF); the previous value is the phi itself.
3760 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3761 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3762 Prev = WidePHI;
3763 }
3764
// The backedge value is created right before ExitingBB's terminator and then
// added as the phi's second operand.
3766 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3767 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3768 WidenIVR->getDebugLoc(), "vec.ind.next");
3769
3770 WidePHI->addOperand(Next);
3771
3772 WidenIVR->replaceAllUsesWith(WidePHI);
3773}
3774
3775 /// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3776 /// initial value, phi and backedge value. In the following example:
3777 ///
3778 /// <x1> vector loop: {
3779 /// vector.body:
3780 /// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3781 /// ...
3782 /// EMIT branch-on-count ...
3783 /// }
3784 ///
3785 /// WIDEN-POINTER-INDUCTION will get expanded to:
3786 ///
3787 /// <x1> vector loop: {
3788 /// vector.body:
3789 /// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3790 /// EMIT %mul = mul %stepvector, %step
3791 /// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3792 /// ...
3793 /// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3794 /// EMIT branch-on-count ...
3795 /// }
///
/// All uses of \p R are redirected to the wide pointer add; \p R itself is not
/// erased by this function.
/// NOTE(review): the line with the function name and first parameter is not
/// present in this listing, and neither are the declaration of ExitingBB and
/// the middle line of the first assert.
3797 VPTypeAnalysis &TypeInfo) {
3798 VPlan *Plan = R->getParent()->getPlan();
3799 VPValue *Start = R->getStartValue();
3800 VPValue *Step = R->getStepValue();
3801 VPValue *VF = R->getVFValue();
3802
// Preconditions: a pointer induction with pointer scalar type for which
// vector values are actually needed.
3803 assert(R->getInductionDescriptor().getKind() ==
3805 "Not a pointer induction according to InductionDescriptor!");
3806 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3807 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3808 "Recipe should have been replaced");
3809
3810 VPBuilder Builder(R);
3811 DebugLoc DL = R->getDebugLoc();
3812
3813 // Build a scalar pointer phi.
3814 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3815
3816 // Create actual address geps that use the pointer phi as base and a
3817 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3818 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3819 Type *StepTy = TypeInfo.inferScalarType(Step);
3820 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3821 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3822 VPValue *PtrAdd =
3823 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3824 R->replaceAllUsesWith(PtrAdd);
3825
3826 // Create the backedge value for the scalar pointer phi.
3828 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
// The pointer advance is Step * VF, with VF first zero-extended or truncated
// to the step type.
3829 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3830 DL);
3831 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3832
3833 VPValue *InductionGEP =
3834 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3835 ScalarPtrPhi->addOperand(InductionGEP);
3836}
3837
3838 /// Expand a VPDerivedIVRecipe into executable recipes.
/// The derived value is computed from the recipe's start value, step and index
/// according to its induction kind, and all uses of the recipe are redirected
/// to that value. The recipe itself is not erased here.
/// NOTE(review): the signature line, the tail of the SIToFP cast call and all
/// `case` labels of the switch are not present in this listing; the per-case
/// notes below describe only the statements that are visible.
3840 VPBuilder Builder(R);
3841 VPIRValue *Start = R->getStartValue();
3842 VPValue *Step = R->getStepValue();
3843 VPValue *Index = R->getIndex();
3844 Type *StepTy = TypeInfo.inferScalarType(Step);
3845 Type *IndexTy = TypeInfo.inferScalarType(Index);
// Bring Index to the step type: sign-extend or truncate for integer steps,
// sitofp otherwise.
3846 Index = StepTy->isIntegerTy()
3847 ? Builder.createScalarSExtOrTrunc(
3848 Index, StepTy, IndexTy, DebugLoc::getCompilerGenerated())
3849 : Builder.createScalarCast(Instruction::SIToFP, Index, StepTy,
3851 switch (R->getInductionKind()) {
// Result: Start + Index * Step (index and start types must match).
3853 assert(TypeInfo.inferScalarType(Index) == TypeInfo.inferScalarType(Start) &&
3854 "Index type does not match StartValue type");
3855 return R->replaceAllUsesWith(Builder.createAdd(
3856 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
3857 }
// Result: ptradd Start, Index * Step.
3859 return R->replaceAllUsesWith(Builder.createPtrAdd(
3860 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
// Result: Start <fadd|fsub> (Step fmul Index), using the opcode and the
// fast-math flags of the original FP binary operator.
3862 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
3863 const FPMathOperator *FPBinOp = R->getFPBinOp();
3864 assert(FPBinOp &&
3865 (FPBinOp->getOpcode() == Instruction::FAdd ||
3866 FPBinOp->getOpcode() == Instruction::FSub) &&
3867 "Original BinOp should be defined for FP induction");
3868 FastMathFlags FMF = FPBinOp->getFastMathFlags();
3869 VPValue *FMul = Builder.createNaryOp(Instruction::FMul, {Step, Index}, FMF);
3870 return R->replaceAllUsesWith(
3871 Builder.createNaryOp(FPBinOp->getOpcode(), {Start, FMul}, FMF));
3872 }
3874 return;
3875 }
3876 llvm_unreachable("Unhandled induction kind");
3877}
3878
// NOTE(review): the signature line and the head of the first `for` statement
// are not present in this listing.
3880 // Replace loop regions with explicit CFG.
// Non-replicator regions are gathered into LoopRegions first and dissolved in
// a second pass (presumably so the traversal is not disturbed by the CFG
// changes that dissolving makes; confirm against dissolveToCFGLoop).
3881 SmallVector<VPRegionBlock *> LoopRegions;
3883 vp_depth_first_deep(Plan.getEntry()))) {
3884 if (!R->isReplicator())
3885 LoopRegions.push_back(R);
3886 }
3887 for (VPRegionBlock *R : LoopRegions)
3888 R->dissolveToCFGLoop();
3889}
3890
// NOTE(review): the signature line, the WorkList declaration, the head of the
// collecting loop, and the two builder calls that create the replacement
// branches are not present in this listing.
3893 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3894 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3897 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3898 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3899 }
3900
3901 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3902 // single-condition branches:
3903 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3904 // the first condition is true, and otherwise jumps to a new interim block.
3905 // 2. A branch that ends the interim block, jumps to the second successor if
3906 // the second condition is true, and otherwise jumps to the third
3907 // successor.
3908 for (VPInstruction *Br : WorkList) {
3909 assert(Br->getNumOperands() == 2 &&
3910 "BranchOnTwoConds must have exactly 2 conditions");
3911 DebugLoc DL = Br->getDebugLoc();
3912 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
// Snapshot the successor list before disconnecting, since disconnecting
// edits the block's successors.
3913 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3914 assert(Successors.size() == 3 &&
3915 "BranchOnTwoConds must have exactly 3 successors");
3916
// Detach all three successors; they are re-attached below, two of them via
// the new interim block.
3917 for (VPBlockBase *Succ : Successors)
3918 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3919
3920 VPValue *Cond0 = Br->getOperand(0);
3921 VPValue *Cond1 = Br->getOperand(1);
3922 VPBlockBase *Succ0 = Successors[0];
3923 VPBlockBase *Succ1 = Successors[1];
3924 VPBlockBase *Succ2 = Successors[2];
// None of the involved blocks may still be nested inside a region.
3925 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3926 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3927
3928 VPBasicBlock *InterimBB =
3929 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
3930
// Original block: successors are Succ0, then the interim block.
3931 VPBuilder(BrOnTwoCondsBB)
3933 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
3934 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
3935
// Interim block: successors are Succ1, then Succ2.
3937 VPBlockUtils::connectBlocks(InterimBB, Succ1);
3938 VPBlockUtils::connectBlocks(InterimBB, Succ2);
3939 Br->eraseFromParent();
3940 }
3941}
3942
// Walks the recipes of the plan and replaces abstract recipes with concrete
// ones. Replaced recipes are queued in ToRemove and erased only after the
// whole walk has finished.
// NOTE(review): the signature line, the ToRemove declaration, the head of the
// outer block loop and several condition/declaration lines further down are
// not present in this listing.
3944 VPTypeAnalysis TypeInfo(Plan);
3947 vp_depth_first_deep(Plan.getEntry()))) {
3948 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3949 VPBuilder Builder(&R);
3950 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3951 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3952 ToRemove.push_back(WidenIVR);
3953 continue;
3954 }
3955
3956 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3957 // If the recipe only generates scalars, scalarize it instead of
3958 // expanding it.
3959 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3960 VPValue *PtrAdd =
3961 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3962 WidenIVR->replaceAllUsesWith(PtrAdd);
3963 ToRemove.push_back(WidenIVR);
3964 continue;
3965 }
3966 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3967 ToRemove.push_back(WidenIVR);
3968 continue;
3969 }
3970
3971 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {
3972 expandVPDerivedIV(DerivedIVR, TypeInfo);
3973 ToRemove.push_back(DerivedIVR);
3974 continue;
3975 }
3976
// Expand the widened canonical IV to
// vec.iv = broadcast(CanIV) + (broadcast(Step) + step-vector).
3977 if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(&R)) {
3978 VPValue *CanIV = WideCanIV->getCanonicalIV();
3979 Type *CanIVTy = TypeInfo.inferScalarType(CanIV);
3980 VPValue *Step = WideCanIV->getStepValue();
// A missing step operand is only expected for UF == 1; use zero then.
3981 if (!Step) {
3982 assert(Plan.getConcreteUF() == 1 &&
3983 "Expected unroller to have materialized step for UF != 1");
3984 Step = Plan.getZero(CanIVTy);
3985 }
3986 CanIV = Builder.createNaryOp(VPInstruction::Broadcast, CanIV);
3987 Step = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3988 Step = Builder.createAdd(
3989 Step, Builder.createNaryOp(VPInstruction::StepVector, {}, CanIVTy));
3990 VPValue *CanVecIV =
3991 Builder.createAdd(CanIV, Step, WideCanIV->getDebugLoc(), "vec.iv");
3992 WideCanIV->replaceAllUsesWith(CanVecIV);
3993 ToRemove.push_back(WideCanIV);
3994 continue;
3995 }
3996
3997 // Expand VPBlendRecipe into VPInstruction::Select.
// The chain starts from incoming value 0; each further incoming value I is
// selected under mask I over the result so far. There is no `continue`
// after this case, so control falls through to the checks below.
3998 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3999 VPValue *Select = Blend->getIncomingValue(0);
4000 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4001 Select = Builder.createSelect(Blend->getMask(I),
4002 Blend->getIncomingValue(I), Select,
4003 R.getDebugLoc(), "predphi", *Blend);
4004 Blend->replaceAllUsesWith(Select);
4005 ToRemove.push_back(Blend);
4006 }
4007
// Vector-end-pointer recipes stay in place; only a missing offset is
// materialized (expected solely for UF == 1).
4008 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4009 if (!VEPR->getOffset()) {
4010 assert(Plan.getConcreteUF() == 1 &&
4011 "Expected unroller to have materialized offset for UF != 1");
4012 VEPR->materializeOffset();
4013 }
4014 }
4015
// Expression recipes are decomposed and then queued for removal.
4016 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4017 Expr->decompose();
4018 ToRemove.push_back(Expr);
4019 }
4020
4021 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4022 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4023 if (LastActiveL &&
4024 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4025 // Create Not(Mask) for all operands.
4027 for (VPValue *Op : LastActiveL->operands()) {
4028 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4029 NotMasks.push_back(NotMask);
4030 }
4031
4032 // Create FirstActiveLane on the inverted masks.
4033 VPValue *FirstInactiveLane = Builder.createFirstActiveLane(
4034 NotMasks, LastActiveL->getDebugLoc(), "first.inactive.lane");
4035
4036 // Subtract 1 to get the last active lane.
4037 VPValue *One =
4038 Plan.getConstantInt(TypeInfo.inferScalarType(FirstInactiveLane), 1);
4039 VPValue *LastLane =
4040 Builder.createSub(FirstInactiveLane, One,
4041 LastActiveL->getDebugLoc(), "last.active.lane");
4042
4043 LastActiveL->replaceAllUsesWith(LastLane);
4044 ToRemove.push_back(LastActiveL);
4045 continue;
4046 }
4047
4048 // Lower MaskedCond with block mask to LogicalAnd.
// NOTE(review): the condition line opening this block is not present in
// this listing. The replacement is logical-and(mask, operand 0).
4050 auto *VPI = cast<VPInstruction>(&R);
4051 assert(VPI->isMasked() &&
4052 "Unmasked MaskedCond should be simplified earlier");
4053 VPI->replaceAllUsesWith(Builder.createNaryOp(
4054 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4055 ToRemove.push_back(VPI);
4056 continue;
4057 }
4058
4059 // Lower CanonicalIVIncrementForPart to plain Add.
// The add keeps the original operands, no-wrap flags and debug location.
4060 if (match(
4061 &R,
4063 auto *VPI = cast<VPInstruction>(&R);
4064 VPValue *Add = Builder.createOverflowingOp(
4065 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4066 VPI->getDebugLoc());
4067 VPI->replaceAllUsesWith(Add);
4068 ToRemove.push_back(VPI);
4069 continue;
4070 }
4071
4072 // Lower BranchOnCount to ICmp + BranchOnCond.
4073 VPValue *IV, *TC;
4074 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4075 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4076 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4077 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4078 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4079 ToRemove.push_back(BranchOnCountInst);
4080 continue;
4081 }
4082
// Everything that is not matched by the (partly missing) pattern below is
// left untouched; this is also where the fall-through cases above end up.
4083 VPValue *VectorStep;
4084 VPValue *ScalarStep;
4086 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4087 continue;
4088
4089 // Expand WideIVStep.
// Both step operands are first cast to the instruction's scalar type when
// their types differ; the result is their product.
4090 auto *VPI = cast<VPInstruction>(&R);
4091 Type *IVTy = TypeInfo.inferScalarType(VPI);
4092 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
4094 ? Instruction::UIToFP
4095 : Instruction::Trunc;
4096 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4097 }
4098
4099 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4100 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4101 ScalarStep =
4102 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4103 }
4104
// FP types multiply with fmul and the instruction's fast-math flags;
// otherwise a mul with the default flags for that opcode is used.
4105 VPIRFlags Flags;
4106 unsigned MulOpc;
4107 if (IVTy->isFloatingPointTy()) {
4108 MulOpc = Instruction::FMul;
4109 Flags = VPI->getFastMathFlags();
4110 } else {
4111 MulOpc = Instruction::Mul;
4112 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4113 }
4114
4115 VPInstruction *Mul = Builder.createNaryOp(
4116 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4117 VectorStep = Mul;
4118 VPI->replaceAllUsesWith(VectorStep);
4119 ToRemove.push_back(VPI);
4120 }
4121 }
4122
// Erase the replaced recipes only now that the walk is complete.
4123 for (VPRecipeBase *R : ToRemove)
4124 R->eraseFromParent();
4125}
4126
// Reroutes every early exit of the loop through new vector.early.exit blocks
// and replaces the latch terminator with a BranchOnTwoConds that selects
// between the early-exit dispatch, the middle block and the loop header.
// NOTE(review): the first signature line and the declarations of Exits, RPOT
// and RPOIdx are not present in this listing, nor are the opcode argument of
// the MaskedCond creation and the first line of the assert on Style.
4128 VPBasicBlock *HeaderVPBB,
4129 VPBasicBlock *LatchVPBB,
4130 VPBasicBlock *MiddleVPBB,
4131 UncountableExitStyle Style) {
// One record per early exit: the exiting block inside the loop, the exit
// block it leaves to, and the condition under which that exit is taken.
4132 struct EarlyExitInfo {
4133 VPBasicBlock *EarlyExitingVPBB;
4134 VPIRBasicBlock *EarlyExitVPBB;
4135 VPValue *CondToExit;
4136 };
4137
4138 VPDominatorTree VPDT(Plan);
4139 VPBuilder Builder(LatchVPBB->getTerminator());
4141 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4142 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
// Every predecessor other than the middle block is treated as an early
// exiting block.
4143 if (Pred == MiddleVPBB)
4144 continue;
4145 // Collect condition for this early exit.
4146 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4147 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4148 VPValue *CondOfEarlyExitingVPBB;
4149 [[maybe_unused]] bool Matched =
4150 match(EarlyExitingVPBB->getTerminator(),
4151 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4152 assert(Matched && "Terminator must be BranchOnCond");
4153
4154 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4155 // the correct block mask.
// The condition is normalized so that "true" means the exit is taken: it
// is negated when the exit block is not the branch's first successor.
4156 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4157 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4159 TrueSucc == ExitBlock
4160 ? CondOfEarlyExitingVPBB
4161 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4162 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4163 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4164 VPDT.properlyDominates(
4165 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4166 LatchVPBB)) &&
4167 "exit condition must dominate the latch");
4168 Exits.push_back({
4169 EarlyExitingVPBB,
4170 ExitBlock,
4171 CondToEarlyExit,
4172 });
4173 }
4174 }
4175
4176 assert(!Exits.empty() && "must have at least one early exit");
4177 // Sort exits by RPO order to get correct program order. RPO gives a
4178 // topological ordering of the CFG, ensuring upstream exits are checked
4179 // before downstream exits in the dispatch chain.
4181 HeaderVPBB);
4183 for (const auto &[Num, VPB] : enumerate(RPOT))
4184 RPOIdx[VPB] = Num;
4185 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4186 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4187 });
4188#ifndef NDEBUG
4189 // After RPO sorting, verify that for any pair where one exit dominates
4190 // another, the dominating exit comes first. This is guaranteed by RPO
4191 // (topological order) and is required for the dispatch chain correctness.
4192 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4193 for (unsigned J = I + 1; J < Exits.size(); ++J)
4194 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4195 Exits[I].EarlyExitingVPBB) &&
4196 "RPO sort must place dominating exits before dominated ones");
4197#endif
4198
4199 // Build the AnyOf condition for the latch terminator using logical OR
4200 // to avoid poison propagation from later exit conditions when an earlier
4201 // exit is taken.
4202 VPValue *Combined = Exits[0].CondToExit;
4203 for (const EarlyExitInfo &Info : drop_begin(Exits))
4204 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4205
4206 VPValue *IsAnyExitTaken =
4207 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4208
4210 "Early exit store masking not implemented");
4211
4212 // Create the vector.early.exit blocks.
// With a single exit the block is named "vector.early.exit"; otherwise each
// block gets a ".<index>" suffix.
4213 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4214 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4215 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4216 VPBasicBlock *VectorEarlyExitVPBB =
4217 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4218 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4219 }
4220
4221 // Create the dispatch block (or reuse the single exit block if only one
4222 // exit). The dispatch block computes the first active lane of the combined
4223 // condition and, for multiple exits, chains through conditions to determine
4224 // which exit to take.
4225 VPBasicBlock *DispatchVPBB =
4226 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4227 : Plan.createVPBasicBlock("vector.early.exit.check");
// The latch is the dispatch block's only predecessor.
4228 DispatchVPBB->setPredecessors({LatchVPBB});
4229 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4230 VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(
4231 {Combined}, DebugLoc::getUnknown(), "first.active.lane");
4232
4233 // For each early exit, disconnect the original exiting block
4234 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4235 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4236 // values at the first active lane:
4237 //
4238 // Input:
4239 // early.exiting.I:
4240 // ...
4241 // EMIT branch-on-cond vp<%cond.I>
4242 // Successor(s): in.loop.succ, ir-bb<exit.I>
4243 //
4244 // ir-bb<exit.I>:
4245 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4246 //
4247 // Output:
4248 // early.exiting.I:
4249 // ...
4250 // Successor(s): in.loop.succ
4251 //
4252 // vector.early.exit.I:
4253 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4254 // Successor(s): ir-bb<exit.I>
4255 //
4256 // ir-bb<exit.I>:
4257 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4258 // vector.early.exit.I)
4259 //
4260 for (auto [Exit, VectorEarlyExitVPBB] :
4261 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4262 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4263 // Adjust the phi nodes in EarlyExitVPBB.
4264 // 1. remove incoming values from EarlyExitingVPBB,
4265 // 2. extract the incoming value at FirstActiveLane
4266 // 3. add back the extracts as last operands for the phis
4267 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4268 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4269 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4270 // values from VectorEarlyExitVPBB.
4271 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4272 auto *ExitIRI = cast<VPIRPhi>(&R);
4273 VPValue *IncomingVal =
4274 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4275 VPValue *NewIncoming = IncomingVal;
// VPIRValue incoming values are passed through unchanged; anything else
// is extracted at the first active lane.
4276 if (!isa<VPIRValue>(IncomingVal)) {
4277 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4278 NewIncoming = EarlyExitBuilder.createNaryOp(
4279 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4280 DebugLoc::getUnknown(), "early.exit.value");
4281 }
4282 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4283 ExitIRI->addOperand(NewIncoming);
4284 }
4285
4286 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4287 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4288 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4289 }
4290
4291 // Chain through exits: for each exit, check if its condition is true at
4292 // the first active lane. If so, take that exit; otherwise, try the next.
4293 // The last exit needs no check since it must be taken if all others fail.
4294 //
4295 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4296 //
4297 // latch:
4298 // ...
4299 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4300 // ...
4301 //
4302 // vector.early.exit.check:
4303 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4304 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4305 // EMIT branch-on-cond vp<%at.cond.0>
4306 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4307 //
4308 // vector.early.exit.check.0:
4309 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4310 // EMIT branch-on-cond vp<%at.cond.1>
4311 // Successor(s): vector.early.exit.1, vector.early.exit.2
// With a single exit the range below is empty, so no dispatch branches are
// created.
4312 VPBasicBlock *CurrentBB = DispatchVPBB;
4313 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4314 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4315 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4316 DebugLoc::getUnknown(), "exit.cond.at.lane");
4317
4318 // For the last dispatch, branch directly to the last exit on false;
4319 // otherwise, create a new check block.
4320 bool IsLastDispatch = (I + 2 == Exits.size());
4321 VPBasicBlock *FalseBB =
4322 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4323 : Plan.createVPBasicBlock(
4324 Twine("vector.early.exit.check.") + Twine(I));
4325
4326 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4327 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4328 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4329 FalseBB->setPredecessors({CurrentBB});
4330
// Continue building the chain in the false successor.
4331 CurrentBB = FalseBB;
4332 DispatchBuilder.setInsertPoint(CurrentBB);
4333 }
4334
4335 // Replace the latch terminator with the new branching logic.
4336 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4337 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4338 "Unexpected terminator");
// The counted-exit condition compares the two BranchOnCount operands for
// equality.
4339 auto *IsLatchExitTaken =
4340 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4341 LatchExitingBranch->getOperand(1));
4342
// Save the debug location before erasing the old terminator.
4343 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4344 LatchExitingBranch->eraseFromParent();
4345 Builder.setInsertPoint(LatchVPBB);
4346 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4347 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
// Successor order follows the condition order: the dispatch block for
// IsAnyExitTaken, the middle block for IsLatchExitTaken, and the header
// when neither holds.
4348 LatchVPBB->clearSuccessors();
4349 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4350}
4351
4352 /// This function tries to convert extended in-loop reductions to
4353 /// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4354 /// valid. The created recipe must be decomposed to its constituent
4355 /// recipes before execution.
/// Returns the new VPExpressionRecipe, or nullptr when the pattern does not
/// match or the fused form is not profitable.
/// NOTE(review): the line with the function name and first parameters is not
/// present in this listing, and neither are the call that wraps the per-VF
/// lambda, the declarations of CostKind and ExtRedCost, and the first line of
/// the final match condition.
4356 static VPExpressionRecipe *
4358 VFRange &Range) {
4359 Type *RedTy = Ctx.Types.inferScalarType(Red);
4360 VPValue *VecOp = Red->getVecOp();
4361
4362 assert(!Red->isPartialReduction() &&
4363 "This path does not support partial reductions");
4364
4365 // Clamp the range if using extended-reduction is profitable.
// For a given VF the fused form is accepted when its cost is valid and
// strictly lower than the cost of the separate extend plus the reduction.
4366 auto IsExtendedRedValidAndClampRange =
4367 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4369 [&](ElementCount VF) {
4370 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4372
4374 InstructionCost ExtCost =
4375 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4376 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4377
4378 assert(!RedTy->isFloatingPointTy() &&
4379 "getExtendedReductionCost only supports integer types");
4380 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4381 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4382 Red->getFastMathFlags(), CostKind);
4383 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4384 },
4385 Range);
4386 };
4387
4388 VPValue *A;
4389 // Match reduce(ext(A)).
4391 IsExtendedRedValidAndClampRange(
4392 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4393 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4394 Ctx.Types.inferScalarType(A)))
4395 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4396
4397 return nullptr;
4398}
4399
// NOTE(review): this listing omits original lines 4410, 4423, 4425, 4427 and
// 4489 (the embedded numbering skips them). The declarator naming this
// function, the leading parameters of IsMulAccValidAndClampRange (Mul, Ext0,
// Ext1 are used but not declared in view), the call that receives the inner
// lambda together with Range, the declaration of CostKind and the first half
// of the bail-out condition in ExtendAndReplaceConstantOp are not visible.
4400/// This function tries to convert multiply-accumulate in-loop reductions
4401/// (optionally with extended operands) to a VPExpressionRecipe and clamps the
4402/// \p Range if it is beneficial and valid. The created VPExpressionRecipe must
4403/// be decomposed to its constituent recipes before execution. Patterns of the
4404/// VPExpressionRecipe:
4405/// reduce.add(mul(...)),
4406/// reduce.add(mul(ext(A), ext(B))),
4407/// reduce.add(ext(mul(ext(A), ext(B)))).
4408/// reduce.fadd(fmul(ext(A), ext(B)))
/// Returns nullptr when no pattern matches or bundling is not profitable.
/// NOTE(review): the visible body returns nullptr whenever the reduction type
/// is floating-point, so the fadd/fmul pattern listed above does not appear to
/// be created on this path -- confirm against the full source.
4409static VPExpressionRecipe *
4411 VPCostContext &Ctx, VFRange &Range) {
4412 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4413 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4414 Opcode != Instruction::FAdd)
4415 return nullptr;
4416
4417 assert(!Red->isPartialReduction() &&
4418 "This path does not support partial reductions");
4419 Type *RedTy = Ctx.Types.inferScalarType(Red);
4420
4421 // Clamp the range if using multiply-accumulate-reduction is profitable.
4422 auto IsMulAccValidAndClampRange =
4424 VPWidenCastRecipe *OuterExt) -> bool {
4426 [&](ElementCount VF) {
 // Without an inner extend the source type is the reduction type itself.
4428 Type *SrcTy =
4429 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4430 InstructionCost MulAccCost;
4431
4432 // getMulAccReductionCost for in-loop reductions does not support
4433 // mixed or floating-point extends.
4434 if (Ext0 && Ext1 &&
4435 (Ext0->getOpcode() != Ext1->getOpcode() ||
4436 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4437 return false;
4438
 // With no extend at all the cost query is made with IsZExt == true.
4439 bool IsZExt =
4440 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4441 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4442 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4443 SrcVecTy, CostKind);
4444
 // Cost of the individual recipes that the fused expression would
 // replace; absent extends contribute nothing.
4445 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4446 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4447 InstructionCost ExtCost = 0;
4448 if (Ext0)
4449 ExtCost += Ext0->computeCost(VF, Ctx);
4450 if (Ext1)
4451 ExtCost += Ext1->computeCost(VF, Ctx);
4452 if (OuterExt)
4453 ExtCost += OuterExt->computeCost(VF, Ctx);
4454
 // Profitable only if the fused cost is valid and strictly cheaper.
4455 return MulAccCost.isValid() &&
4456 MulAccCost < ExtCost + MulCost + RedCost;
4457 },
4458 Range);
4459 };
4460
4461 VPValue *VecOp = Red->getVecOp();
4462 VPRecipeBase *Sub = nullptr;
4463 VPValue *A, *B;
4464 VPValue *Tmp = nullptr;
4465
 // Floating-point reductions are not bundled by the code below.
4466 if (RedTy->isFloatingPointTy())
4467 return nullptr;
4468
4469 // Sub reductions could have a sub between the add reduction and vec op.
 // If so, remember the negating recipe and look through it.
4470 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4471 Sub = VecOp->getDefiningRecipe();
4472 VecOp = Tmp;
4473 }
4474
4475 // If ValB is a constant and can be safely extended, truncate it to the same
4476 // type as ExtA's operand, then extend it to the same type as ExtA. This
4477 // creates two uniform extends that can more easily be matched by the rest of
4478 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4479 // replaced with the new extend of the constant.
4480 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4481 VPWidenCastRecipe *&ExtB,
4482 VPValue *&ValB, VPWidenRecipe *Mul) {
 // Only applies when operand A is an extend and operand B is a VPIRValue
 // that is not already an extend.
4483 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4484 return;
4485 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4486 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4487 const APInt *Const;
4488 if (!match(ValB, m_APInt(Const)) ||
4490 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4491 return;
4492 // The truncate ensures that the type of each extended operand is the
4493 // same, and it's been proven that the constant can be extended from
4494 // NarrowTy safely. Necessary since ExtA's extended operand would be
4495 // e.g. an i8, while the const will likely be an i32. This will be
4496 // elided by later optimisations.
4497 VPBuilder Builder(Mul);
4498 auto *Trunc =
4499 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4500 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4501 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4502 Mul->setOperand(1, ExtB);
4503 };
4504
4505 // Try to match reduce.add(mul(...)).
4506 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4507 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4508 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4509 auto *Mul = cast<VPWidenRecipe>(VecOp);
4510
4511 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4512 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4513
4514 // Match reduce.add/sub(mul(ext, ext)).
4515 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4516 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4517 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
 // The negating sub, if one was looked through, is bundled as well.
4518 if (Sub)
4519 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4520 cast<VPWidenRecipe>(Sub), Red);
4521 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4522 }
4523 // TODO: Add an expression type for this variant with a negated mul
4524 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4525 return new VPExpressionRecipe(Mul, Red);
4526 }
4527 // TODO: Add an expression type for negated versions of other expression
4528 // variants.
4529 if (Sub)
4530 return nullptr;
4531
4532 // Match reduce.add(ext(mul(A, B))).
4533 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4534 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4535 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4536 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
4537 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
4538
4539 // reduce.add(ext(mul(ext, const)))
4540 // -> reduce.add(ext(mul(ext, ext(const))))
4541 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4542
4543 // reduce.add(ext(mul(ext(A), ext(B))))
4544 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4545 // The inner extends must either have the same opcode as the outer extend or
4546 // be the same, in which case the multiply can never result in a negative
4547 // value and the outer extend can be folded away by doing wider
4548 // extends for the operands of the mul.
4549 if (Ext0 && Ext1 &&
4550 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4551 Ext0->getOpcode() == Ext1->getOpcode() &&
4552 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
 // New extends go straight from the narrow operand to the outer extend's
 // result type and are placed before the extends they stand in for.
4553 auto *NewExt0 = new VPWidenCastRecipe(
4554 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4555 *Ext0, *Ext0, Ext0->getDebugLoc());
4556 NewExt0->insertBefore(Ext0);
4557
 // When both mul operands are the same extend, reuse the single new one.
4558 VPWidenCastRecipe *NewExt1 = NewExt0;
4559 if (Ext0 != Ext1) {
4560 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4561 Ext->getResultType(), nullptr, *Ext1,
4562 *Ext1, Ext1->getDebugLoc());
4563 NewExt1->insertBefore(Ext1);
4564 }
4565 Mul->setOperand(0, NewExt0);
4566 Mul->setOperand(1, NewExt1);
 // The reduction now consumes the mul directly, bypassing the outer ext.
4567 Red->setOperand(1, Mul);
4568 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4569 }
4570 }
4571 return nullptr;
4572}
4573
// NOTE(review): original line 4576 (the declarator that names this function
// and declares Red) is missing from this listing.
4574/// This function tries to create abstract recipes from the reduction recipe for
4575/// following optimizations and cost estimation.
/// A multiply-accumulate expression is attempted first, then an extended
/// reduction. If either succeeds, the resulting VPExpressionRecipe is inserted
/// right after the position of \p Red and takes over all uses of \p Red;
/// otherwise nothing is changed by the visible code.
4577 VPCostContext &Ctx,
4578 VFRange &Range) {
4579 // Creation of VPExpressions for partial reductions is entirely handled in
4580 // transformToPartialReduction.
4581 assert(!Red->isPartialReduction() &&
4582 "This path does not support partial reductions");
4583
4584 VPExpressionRecipe *AbstractR = nullptr;
 // Insert position and parent block are read from Red before the matchers
 // run.
4585 auto IP = std::next(Red->getIterator());
4586 auto *VPBB = Red->getParent();
4587 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4588 AbstractR = MulAcc;
4589 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4590 AbstractR = ExtRed;
4591 // Cannot create abstract inloop reduction recipes.
4592 if (!AbstractR)
4593 return;
4594
4595 AbstractR->insertBefore(*VPBB, IP);
4596 Red->replaceAllUsesWith(AbstractR);
4597}
4598
4609
// NOTE(review): the declarator of this function (original line 4610) and
// original line 4627 (the first half of the skip condition inside the loop over
// VPValues) are missing from this listing.
//
// Inserts an explicit VPInstruction::Broadcast in the vector preheader for the
// backedge-taken count, the plan's live-ins and the values defined in the entry
// block, and redirects every user that does not use the value as a scalar to
// that broadcast. Does nothing for plans that only have a scalar VF.
4611 if (Plan.hasScalarVFOnly())
4612 return;
4613
 // The dominator tree is only needed by the assertion below.
4614#ifndef NDEBUG
4615 VPDominatorTree VPDT(Plan);
4616#endif
4617
 // Gather candidate values.
4618 SmallVector<VPValue *> VPValues;
4619 if (VPValue *BTC = Plan.getBackedgeTakenCount())
4620 VPValues.push_back(BTC);
4621 append_range(VPValues, Plan.getLiveIns());
4622 for (VPRecipeBase &R : *Plan.getEntry())
4623 append_range(VPValues, R.definedValues());
4624
4625 auto *VectorPreheader = Plan.getVectorPreheader();
4626 for (VPValue *VPV : VPValues) {
 // The visible part of the condition skips live-ins that wrap IR constants.
4628 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4629 continue;
4630
4631 // Add explicit broadcast at the insert point that dominates all users.
 // Default is the end of the preheader; if a vector user lives in the
 // preheader itself, the broadcast goes to the start of the block instead.
4632 VPBasicBlock *HoistBlock = VectorPreheader;
4633 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4634 for (VPUser *User : VPV->users()) {
4635 if (User->usesScalars(VPV))
4636 continue;
4637 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4638 HoistPoint = HoistBlock->begin();
4639 else
4640 assert(VPDT.dominates(VectorPreheader,
4641 cast<VPRecipeBase>(User)->getParent()) &&
4642 "All users must be in the vector preheader or dominated by it");
4643 }
4644
4645 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4646 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
 // Rewrite only vector uses, and never the broadcast's own operand.
4647 VPV->replaceUsesWithIf(Broadcast,
4648 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4649 return Broadcast != &U && !U.usesScalars(VPV);
4650 });
4651 }
4652}
4653
// NOTE(review): original line 4656 (the declarator, which declares Recipes) is
// missing from this listing.
4654// Collect common metadata from a group of replicate recipes by intersecting
4655// metadata from all recipes in the group.
// Starts from the metadata of the first recipe, so Recipes must be non-empty
// (front() is dereferenced unconditionally).
4657 VPIRMetadata CommonMetadata = *Recipes.front();
4658 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4659 CommonMetadata.intersect(*Recipe);
4660 return CommonMetadata;
4661}
4662
// NOTE(review): original lines 4664-4666 (return type, name and leading
// parameters), 4674 (declares AllGroups), 4678 (declares Groups and names the
// helper that produces them) and 4692 (declares Group) are missing from this
// listing.
//
// For predicated replicate loads (Opcode == Instruction::Load) or stores
// (Opcode == Instruction::Store), collects groups of recipes that have the same
// loaded/stored scalar type and of which at least one has a mask that is the
// logical NOT of the group leader's mask (or vice versa). Returns AllGroups.
4663template <unsigned Opcode>
4667 const Loop *L) {
4668 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4669 "Only Load and Store opcodes supported");
4670 constexpr bool IsLoad = (Opcode == Instruction::Load);
4671 VPTypeAnalysis TypeInfo(Plan);
4672
4673 // For each address, collect operations with the same or complementary masks.
 // The value type is the load's result for loads and operand 0 for stores.
4675 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4676 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4677 };
4679 Plan, PSE, L,
4680 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4681 for (auto Recipes : Groups) {
4682 if (Recipes.size() < 2)
4683 continue;
4684
4685 // Collect groups with the same or complementary masks.
 // Entries are nulled out once they have been moved into a group, so each
 // recipe is considered at most once.
4686 for (VPReplicateRecipe *&RecipeI : Recipes) {
4687 if (!RecipeI)
4688 continue;
4689
4690 VPValue *MaskI = RecipeI->getMask();
4691 Type *TypeI = GetLoadStoreValueType(RecipeI);
4693 Group.push_back(RecipeI);
4694 RecipeI = nullptr;
4695
4696 // Find all operations with the same or complementary masks.
4697 bool HasComplementaryMask = false;
4698 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4699 if (!RecipeJ)
4700 continue;
4701
4702 VPValue *MaskJ = RecipeJ->getMask();
4703 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
 // Every remaining recipe of the same type joins the group, whatever
 // its mask is.
4704 if (TypeI == TypeJ) {
4705 // Check if any operation in the group has a complementary mask with
4706 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
 // Only the group leader's mask (MaskI) is compared against here.
4707 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4708 match(MaskJ, m_Not(m_Specific(MaskI)));
4709 Group.push_back(RecipeJ);
4710 RecipeJ = nullptr;
4711 }
4712 }
4713
 // Groups without any complementary pair are dropped; their members have
 // already been nulled and are not revisited.
4714 if (HasComplementaryMask) {
4715 assert(Group.size() >= 2 && "must have at least 2 entries");
4716 AllGroups.push_back(std::move(Group));
4717 }
4718 }
4719 }
4720
4721 return AllGroups;
4722}
4723
// NOTE(review): original line 4727 (function name and the Group parameter) is
// missing from this listing.
4724// Find the recipe with minimum alignment in the group.
// InstType is the IR instruction class each member's underlying instruction is
// cast to in order to read its alignment. The result of min_element is
// dereferenced unconditionally, so Group must be non-empty.
4725template <typename InstType>
4726static VPReplicateRecipe *
4728 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4729 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4730 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4731 });
4732}
4733
// NOTE(review): original lines 4734-4735 (declarator and leading parameters)
// and 4738 (the call that computes Groups) are missing from this listing.
//
// Replaces each collected group of predicated loads by a single unpredicated
// load placed before the first member of the group, provided the alias check
// between the first and last member's blocks succeeds.
4736 const Loop *L) {
4737 auto Groups =
4739 if (Groups.empty())
4740 return;
4741
4742 // Process each group of loads.
4743 for (auto &Group : Groups) {
4744 // Try to use the earliest (most dominating) load to replace all others.
4745 VPReplicateRecipe *EarliestLoad = Group[0];
4746 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4747 VPBasicBlock *LastBB = Group.back()->getParent();
4748
4749 // Check that the load doesn't alias with stores between first and last.
 // Groups without a known memory location are skipped as well.
4750 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4751 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4752 continue;
4753
4754 // Collect common metadata from all loads in the group.
4755 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4756
4757 // Find the load with minimum alignment to use.
4758 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4759
4760 bool IsSingleScalar = EarliestLoad->isSingleScalar();
4761 assert(all_of(Group,
4762 [IsSingleScalar](VPReplicateRecipe *R) {
4763 return R->isSingleScalar() == IsSingleScalar;
4764 }) &&
4765 "all members in group must agree on IsSingleScalar");
4766
4767 // Create an unpredicated version of the earliest load with common
4768 // metadata.
 // The underlying instruction comes from the least-aligned member, while
 // the address operand comes from the earliest load.
4769 auto *UnpredicatedLoad = new VPReplicateRecipe(
4770 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4771 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
4772
4773 UnpredicatedLoad->insertBefore(EarliestLoad);
4774
4775 // Replace all loads in the group with the unpredicated load.
4776 for (VPReplicateRecipe *Load : Group) {
4777 Load->replaceAllUsesWith(UnpredicatedLoad);
4778 Load->eraseFromParent();
4779 }
4780 }
4781}
4782
// NOTE(review): original line 4784 (function name and the StoresToSink
// parameter) is missing from this listing.
//
// Returns true if the stores in StoresToSink pass the no-alias check between
// the block of the first and the block of the last store. Returns false if the
// first store has no known memory location or that location carries no scope
// (AATags.Scope) metadata.
4783static bool
4785 PredicatedScalarEvolution &PSE, const Loop &L,
4786 VPTypeAnalysis &TypeInfo) {
4787 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4788 if (!StoreLoc || !StoreLoc->AATags.Scope)
4789 return false;
4790
4791 // When sinking a group of stores, all members of the group alias each other.
4792 // Skip them during the alias checks.
4793 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4794 StoresToSink.end());
4795
4796 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4797 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4798 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4799 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4800}
4801
// NOTE(review): original lines 4802-4803 (declarator and leading parameters)
// and 4806 (the call that computes Groups) are missing from this listing.
//
// Replaces each collected group of predicated stores by one unconditional
// store placed at the position of the group's last store. The stored value is
// chosen by a chain of selects over the members' masks.
4804 const Loop *L) {
4805 auto Groups =
4807 if (Groups.empty())
4808 return;
4809
4810 VPTypeAnalysis TypeInfo(Plan);
4811
4812 for (auto &Group : Groups) {
4813 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4814 continue;
4815
4816 // Use the last (most dominated) store's location for the unconditional
4817 // store.
4818 VPReplicateRecipe *LastStore = Group.back();
4819 VPBasicBlock *InsertBB = LastStore->getParent();
4820
4821 // Collect common alias metadata from all stores in the group.
4822 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4823
4824 // Build select chain for stored values.
 // Starts with the first store's value; each later member overrides it
 // where that member's mask is set: select(Mask_I, Value_I, previous).
4825 VPValue *SelectedValue = Group[0]->getOperand(0);
4826 VPBuilder Builder(InsertBB, LastStore->getIterator());
4827
4828 bool IsSingleScalar = Group[0]->isSingleScalar();
4829 for (unsigned I = 1; I < Group.size(); ++I) {
4830 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4831 "all members in group must agree on IsSingleScalar");
4832 VPValue *Mask = Group[I]->getMask();
4833 VPValue *Value = Group[I]->getOperand(0);
4834 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4835 Group[I]->getDebugLoc());
4836 }
4837
4838 // Find the store with minimum alignment to use.
4839 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4840
4841 // Create unconditional store with selected value and common metadata.
 // The address operand is taken from the last store.
4842 auto *UnpredicatedStore = new VPReplicateRecipe(
4843 StoreWithMinAlign->getUnderlyingInstr(),
4844 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
4845 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4846 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4847
4848 // Remove all predicated stores from the group.
4849 for (VPReplicateRecipe *Store : Group)
4850 Store->eraseFromParent();
4851 }
4852}
4853
// NOTE(review): original lines 4854 (declarator), 4856 (remaining parameters,
// including PSE) and 4868 (one operand of the bail-out condition below) are
// missing from this listing.
//
// If the trip count is a constant, folds the vector trip count for the chosen
// \p BestVF and \p BestUF to a constant and records it as the underlying value
// of the plan's vector trip count.
4855 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4857 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4858 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4859
 // Nothing to do when the trip count is unused.
4860 VPValue *TC = Plan.getTripCount();
4861 if (TC->getNumUsers() == 0)
4862 return;
4863
4864 // Skip cases for which the trip count may be non-trivial to materialize.
4865 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4866 // tail is required.
4867 if (!Plan.hasScalarTail() ||
4869 Plan.getScalarPreheader() ||
4870 !isa<VPIRValue>(TC))
4871 return;
4872
4873 // Materialize vector trip counts for constants early if it can simply
4874 // be computed as (Original TC / (VF * UF)) * (VF * UF).
4875 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4876 // tail-folded loops.
4877 ScalarEvolution &SE = *PSE.getSE();
4878 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4879 if (!isa<SCEVConstant>(TCScev))
4880 return;
4881 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4882 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
 // Only recorded if the expression folded to a constant.
4883 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4884 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4885}
4886
// NOTE(review): original lines 4887 (declarator and the Plan parameter) and
// 4889 (the declaration of BTC) are missing from this listing.
//
// If BTC has users, computes "trip count - 1" at the start of \p VectorPH and
// replaces all uses of BTC with it.
4888 VPBasicBlock *VectorPH) {
4890 if (BTC->getNumUsers() == 0)
4891 return;
4892
4893 VPBuilder Builder(VectorPH, VectorPH->begin());
4894 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4895 auto *TCMO =
4896 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
4897 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4898 BTC->replaceAllUsesWith(TCMO);
4899}
4900
// NOTE(review): original lines 4901 (declarator), 4908 (argument of the first
// blocksOnly call), 4919 (condition guarding the first "continue"), 4929 (part
// of the VPInstruction condition), 4936-4937 (the two opcode alternatives),
// 4955 (start of the isa<> filter) and 4964 (condition guarding the "continue"
// for single-scalar defs) are missing from this listing.
//
// Makes conversions between scalar and vector values explicit: first inserts
// build-vector VPInstructions after replicating definitions that have vector
// users (or users outside the loop region), then inserts
// VPInstruction::Unpack for definitions inside the loop region that have
// scalar users outside replicate regions. Does nothing for scalar-VF-only
// plans.
4902 if (Plan.hasScalarVFOnly())
4903 return;
4904
4905 VPTypeAnalysis TypeInfo(Plan);
4906 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4907 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4909 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4910 vp_depth_first_shallow(LoopRegion->getEntry()));
4911 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
4912 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
4913 // regions. Those are not materialized explicitly yet.
4914 // TODO: materialize build vectors for replicating recipes in replicating
4915 // regions.
4916 for (VPBasicBlock *VPBB :
4917 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4918 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4920 continue;
4921 auto *DefR = cast<VPSingleDefRecipe>(&R);
 // True for users that need the value as a vector, or whose region is not
 // the vector loop region itself.
4922 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4923 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4924 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4925 };
 // Skip single-scalar replicate recipes, the VPInstructions excluded by
 // the (partly missing) condition, and defs without any such user.
4926 if ((isa<VPReplicateRecipe>(DefR) &&
4927 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4928 (isa<VPInstruction>(DefR) &&
4930 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4931 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4932 continue;
4933
 // The opcode depends on whether the scalar type is a struct type.
4934 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4935 unsigned Opcode = ScalarTy->isStructTy()
4938 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4939 BuildVector->insertAfter(DefR);
4940
 // Redirect the matching users, but not the new recipe's own operand.
4941 DefR->replaceUsesWithIf(
4942 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4943 VPUser &U, unsigned) {
4944 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4945 });
4946 }
4947 }
4948
4949 // Create explicit VPInstructions to convert vectors to scalars. The current
4950 // implementation is conservative - it may miss some cases that may or may not
4951 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4952 // if they are known to operate on scalar values.
4953 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4954 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4956 VPDerivedIVRecipe>(&R))
4957 continue;
4958 for (VPValue *Def : R.definedValues()) {
4959 // Skip recipes that are single-scalar or only have their first lane
4960 // used.
4961 // TODO: The Defs skipped here may or may not be vector values.
4962 // Introduce Unpacks, and remove them later, if they are guaranteed to
4963 // produce scalar values.
4965 continue;
4966
4967 // At the moment, we create unpacks only for scalar users outside
4968 // replicate regions. Recipes inside replicate regions still extract the
4969 // required lanes implicitly.
4970 // TODO: Remove once replicate regions are unrolled completely.
4971 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4972 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4973 return U->usesScalars(Def) &&
4974 (!ParentRegion || !ParentRegion->isReplicator());
4975 };
4976 if (none_of(Def->users(), IsCandidateUnpackUser))
4977 continue;
4978
 // For phi recipes the unpack is placed at the block's first non-phi
 // position; otherwise it directly follows the defining recipe.
4979 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4980 if (R.isPhi())
4981 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4982 else
4983 Unpack->insertAfter(&R);
4984 Def->replaceUsesWithIf(Unpack,
4985 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4986 return IsCandidateUnpackUser(&U);
4987 });
4988 }
4989 }
4990 }
4991}
4992
// NOTE(review): original line 4993 (return type and function name) is missing
// from this listing; the parameter list below is complete as far as visible.
//
// Emits, in \p VectorPHVPBB, the computation of the vector trip count from the
// plan's trip count and \p Step, and replaces all uses of the plan's symbolic
// vector trip count with the result. \p TailByMasking rounds the trip count up
// to a multiple of \p Step first; \p RequiresScalarEpilogue guarantees that at
// least one iteration is left over. \p MaxRuntimeStep, if set, allows a
// constant trip count that it divides evenly to be used directly.
4994 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
4995 bool RequiresScalarEpilogue, VPValue *Step,
4996 std::optional<uint64_t> MaxRuntimeStep) {
4997 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
4998 // There's nothing to do if there are no users of the vector trip count or its
4999 // IR value has already been set.
5000 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5001 return;
5002
5003 VPValue *TC = Plan.getTripCount();
5004 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
5005 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5006 if (auto *StepR = Step->getDefiningRecipe()) {
5007 assert(StepR->getParent() == VectorPHVPBB &&
5008 "Step must be defined in VectorPHVPBB");
5009 // Insert after Step's definition to maintain valid def-use ordering.
5010 InsertPt = std::next(StepR->getIterator());
5011 }
5012 VPBuilder Builder(VectorPHVPBB, InsertPt);
5013
5014 // For scalable steps, if TC is a constant and is divisible by the maximum
5015 // possible runtime step, then TC % Step == 0 for all valid vscale values
5016 // and the vector trip count equals TC directly.
5017 const APInt *TCVal;
5018 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
5019 TCVal->getZExtValue() % *MaxRuntimeStep == 0) {
5020 VectorTC.replaceAllUsesWith(TC);
5021 return;
5022 }
5023
5024 // If the tail is to be folded by masking, round the number of iterations N
5025 // up to a multiple of Step instead of rounding down. This is done by first
5026 // adding Step-1 and then rounding down. Note that it's ok if this addition
5027 // overflows: the vector induction variable will eventually wrap to zero given
5028 // that it starts at zero and its Step is a power of two; the loop will then
5029 // exit, with the last early-exit vector comparison also producing all-true.
5030 if (TailByMasking) {
5031 TC = Builder.createAdd(
5032 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5033 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5034 }
5035
5036 // Now we need to generate the expression for the part of the loop that the
5037 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5038 // iterations are not required for correctness; otherwise it is N - Step when
5039 // N % Step is zero and N - (N % Step) when it is not. Step is equal to the
5040 // vectorization factor (number of SIMD elements) times the unroll factor
 // (number of SIMD instructions).
5041 VPValue *R =
5042 Builder.createNaryOp(Instruction::URem, {TC, Step},
5043 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5044
5045 // There are cases where we *must* run at least one iteration in the remainder
5046 // loop. See the cost model for when this can happen. If the step evenly
5047 // divides the trip count, we set the remainder to be equal to the step. If
5048 // the step does not evenly divide the trip count, no adjustment is necessary
5049 // since there will already be scalar iterations. Note that the minimum
5050 // iterations check ensures that N >= Step.
5051 if (RequiresScalarEpilogue) {
 // NOTE(review): "fail folding" in the assertion message below looks like a
 // typo for "tail folding"; it is a runtime string and is left unchanged.
5052 assert(!TailByMasking &&
5053 "requiring scalar epilogue is not supported with fail folding");
5054 VPValue *IsZero =
5055 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5056 R = Builder.createSelect(IsZero, Step, R);
5057 }
5058
5059 VPValue *Res =
5060 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5061 VectorTC.replaceAllUsesWith(Res);
5062}
5063
// NOTE(review): original lines 5064 (declarator and leading parameters,
// including Plan and VectorPH), 5090 (the condition opening the block that
// creates the broadcast) and 5092 (the call that receives BC and the predicate)
// are missing from this listing.
//
// Emits the runtime values of VF and VF * UF at the start of VectorPH and
// replaces the plan's symbolic VF / VFxUF values with them.
5065 ElementCount VFEC) {
5066 // If VF and VFxUF have already been materialized (no remaining users),
5067 // there's nothing more to do.
5068 if (Plan.getVF().isMaterialized()) {
5069 assert(Plan.getVFxUF().isMaterialized() &&
5070 "VF and VFxUF must be materialized together");
5071 return;
5072 }
5073
5074 VPBuilder Builder(VectorPH, VectorPH->begin());
5075 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5076 VPValue &VF = Plan.getVF();
5077 VPValue &VFxUF = Plan.getVFxUF();
5078 // If there are no users of the runtime VF, compute VFxUF by constant folding
5079 // the multiplication of VF and UF.
5080 if (VF.getNumUsers() == 0) {
5081 VPValue *RuntimeVFxUF =
5082 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5083 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5084 return;
5085 }
5086
5087 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5088 // vscale) * UF.
5089 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
 // Users that do not use VF as a scalar are matched by the predicate below;
 // BC is the broadcast of the runtime VF created for them.
5091 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5093 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5094 }
5095 VF.replaceAllUsesWith(RuntimeVF);
5096
 // NOTE(review): {true, false} is presumably {no-unsigned-wrap,
 // no-signed-wrap} -- confirm against VPBuilder::createOverflowingOp.
5097 VPValue *MulByUF = Builder.createOverflowingOp(
5098 Instruction::Mul,
5099 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5100 {true, false});
5101 VFxUF.replaceAllUsesWith(MulByUF);
5102}
5103
// NOTE(review): original lines 5104-5105 (declarator and parameters, including
// Plan and SE), 5112 (condition guarding the first "continue"), 5127 (start of
// the assertion whose message follows) and 5139 (body of the final loop) are
// missing from this listing.
//
// Expands every leading VPExpandSCEVRecipe of the plan's entry block to IR in
// front of the entry IR block's terminator, replaces the recipe by a live-in
// for the expanded value, and returns the map from SCEV to expanded value.
5106 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5107
5108 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5109 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5110 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5111 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5113 continue;
 // Stop at the first recipe that is not a VPExpandSCEVRecipe.
5114 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5115 if (!ExpSCEV)
5116 break;
5117 const SCEV *Expr = ExpSCEV->getSCEV();
5118 Value *Res =
5119 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5120 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
5121 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5122 ExpSCEV->replaceAllUsesWith(Exp);
 // Keep the plan's trip count pointing at a live value.
5123 if (Plan.getTripCount() == ExpSCEV)
5124 Plan.resetTripCount(Exp);
5125 ExpSCEV->eraseFromParent();
5126 }
5128 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5129 "before any VPIRInstructions");
5130 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5131 // to the VPIRBasicBlock.
 // Walks the IR block (terminator excluded) in step with the recipes; an IR
 // instruction already wrapped by the current VPIRInstruction is skipped.
5132 auto EI = Entry->begin();
5133 for (Instruction &I : drop_end(*EntryBB)) {
5134 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5135 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5136 EI++;
5137 continue;
5138 }
5140 }
5141
5142 return ExpandedSCEVs;
5143}
5144
5145/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5146/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5147/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5148/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5149/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5150/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5151/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5152/// is defined at \p Idx of a load interleave group.
5153static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5154 VPValue *OpV, unsigned Idx, bool IsScalable) {
5155 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5156 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5157 if (!Member0OpR)
5158 return Member0Op == OpV;
5159 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5160 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5161 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5162 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5163 Member0Op == OpV;
5164 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5165 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5166 return false;
5167}
5168
// NOTE(review): original lines 5170, 5175 (condition guarding the first
// "return false" in the loop over Ops) and 5183 (declaration of OpsI) are
// missing from this listing.
//
// Returns true if all values in \p Ops are defined by recipes with the same
// opcode or intrinsic ID as the recipe defining Ops[0], and, for every operand
// position, the operands of those recipes can themselves be narrowed --
// either recursively through this function or, failing that, as loads via
// canNarrowLoad.
5169static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5171 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5172 if (!WideMember0)
5173 return false;
5174 for (VPValue *V : Ops) {
5176 return false;
5177 auto *R = cast<VPSingleDefRecipe>(V);
5178 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5179 return false;
5180 }
5181
5182 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
 // Gather operand Idx of every member's defining recipe.
5184 for (VPValue *Op : Ops)
5185 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5186
5187 if (canNarrowOps(OpsI, IsScalable))
5188 continue;
5189
 // Otherwise every gathered operand must be a narrowable load; the
 // position within OpsI serves as the member index.
5190 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5191 const auto &[OpIdx, OpV] = P;
5192 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5193 }))
5194 return false;
5195 }
5196
5197 return true;
5198}
5199
// NOTE(review): original lines 5204 (leading parameters; InterleaveR and VFs
// are used below but declared there) and 5233-5234 (the arguments passed to
// getRegisterBitWidth) are missing from this listing.
5200/// Returns VF from \p VFs if \p InterleaveR is an unmasked, full interleave
5201/// group with factor and number of members both equal to VF. The interleave
5202/// group must also access the full vector width, and all of its defined values
/// (for loads) or stored values (for stores) must have the same scalar type.
/// Returns std::nullopt otherwise, including when \p InterleaveR is null.
5203static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5205 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5206 if (!InterleaveR || InterleaveR->getMask())
5207 return std::nullopt;
5208
 // Determine the common element type: from the defined values when there are
 // no stored values, from the stored values otherwise.
5209 Type *GroupElementTy = nullptr;
5210 if (InterleaveR->getStoredValues().empty()) {
5211 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
5212 if (!all_of(InterleaveR->definedValues(),
5213 [&TypeInfo, GroupElementTy](VPValue *Op) {
5214 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5215 }))
5216 return std::nullopt;
5217 } else {
5218 GroupElementTy =
5219 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
5220 if (!all_of(InterleaveR->getStoredValues(),
5221 [&TypeInfo, GroupElementTy](VPValue *Op) {
5222 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5223 }))
5224 return std::nullopt;
5225 }
5226
 // The group must have no gaps.
5227 auto IG = InterleaveR->getInterleaveGroup();
5228 if (IG->getFactor() != IG->getNumMembers())
5229 return std::nullopt;
5230
5231 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5232 TypeSize Size = TTI.getRegisterBitWidth(
5235 assert(Size.isScalable() == VF.isScalable() &&
5236 "if Size is scalable, VF must be scalable and vice versa");
5237 return Size.getKnownMinValue();
5238 };
5239
 // Return the first VF whose known-minimum lane count equals the interleave
 // factor and for which the group spans exactly one register.
5240 for (ElementCount VF : VFs) {
5241 unsigned MinVal = VF.getKnownMinValue();
5242 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5243 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5244 return {VF};
5245 }
5246 return std::nullopt;
5247}
5248
5249/// Returns true if \p VPValue is a narrow VPValue.
5250static bool isAlreadyNarrow(VPValue *VPV) {
5251 if (isa<VPIRValue>(VPV))
5252 return true;
5253 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5254 return RepR && RepR->isSingleScalar();
5255}
5256
// NOTE(review): original lines 5260 (function name and parameters; V and
// NarrowedOps are declared there) and 5268 (the condition opening the branch
// that recurses into the operands) are missing from this listing.
5257// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5258// a narrow variant.
// Returns the value to use in place of \p V: \p V itself when nothing needs to
// change (or after its operands have been narrowed in place), otherwise a newly
// created recipe. Newly created or accepted narrow values are recorded in
// NarrowedOps so they are not processed again.
5259static VPValue *
5261 auto *R = V->getDefiningRecipe();
5262 if (!R || NarrowedOps.contains(V))
5263 return V;
5264
5265 if (isAlreadyNarrow(V))
5266 return V;
5267
 // Narrow every operand of the recipe in place and keep the recipe itself.
5269 auto *WideMember0 = cast<VPSingleDefRecipe>(R);
5270 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5271 WideMember0->setOperand(
5272 Idx,
5273 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5274 return V;
5275 }
5276
5277 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5278 // Narrow interleave group to wide load, as transformed VPlan will only
5279 // process one original iteration.
5280 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5281 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5282 LoadGroup->getMask(), /*Consecutive=*/true,
5283 {}, LoadGroup->getDebugLoc());
5284 L->insertBefore(LoadGroup);
5285 NarrowedOps.insert(L);
5286 return L;
5287 }
5288
 // A replicate recipe reaching this point is expected to be a single-scalar
 // load and is kept unchanged.
5289 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5290 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5291 "must be a single scalar load");
5292 NarrowedOps.insert(RepR);
5293 return RepR;
5294 }
5295
 // Everything else must be a wide load. If its address is a
 // VPVectorPointerRecipe, that recipe's operand 0 is used as the address.
5296 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5297 VPValue *PtrOp = WideLoad->getAddr();
5298 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5299 PtrOp = VecPtr->getOperand(0);
5300 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5301 // process one original iteration.
5302 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5303 /*IsUniform*/ true,
5304 /*Mask*/ nullptr, {}, *WideLoad);
5305 N->insertBefore(WideLoad);
5306 NarrowedOps.insert(N);
5307 return N;
5308}
5309
/// Narrow the store interleave groups in the vector loop of \p Plan, together
/// with the operations feeding them, for a single VF that
/// isConsecutiveInterleaveGroup accepts for every interleave group in the loop.
///
/// Returns nullptr when the transform does not apply, and also when it applies
/// to a \p Plan that has exactly one VF. Otherwise \p Plan is restricted to the
/// chosen VF and the returned plan is a duplicate holding the remaining VFs.
///
/// NOTE(review): this listing omits the line carrying the function name and
/// the leading parameters (original line 5311).
5310std::unique_ptr<VPlan>
5312 const TargetTransformInfo &TTI) {
5313 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5314
5315 if (!VectorLoop)
5316 return nullptr;
5317
5318 // Only handle single-block loops for now.
5319 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5320 return nullptr;
5321
5322 // Skip plans when we may not be able to properly narrow.
5323 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5324 if (!match(&Exiting->back(), m_BranchOnCount()))
5325 return nullptr;
5326
 // NOTE(review): the first half of the asserted pattern (original line 5328)
 // is missing from this listing.
5327 assert(match(&Exiting->back(),
5329 m_Specific(&Plan.getVectorTripCount()))) &&
5330 "unexpected branch-on-count");
5331
5332 VPTypeAnalysis TypeInfo(Plan);
 // NOTE(review): the declaration of StoreGroups (original line 5333) is
 // missing from this listing; it collects the store interleave recipes that
 // are narrowed below.
5334 std::optional<ElementCount> VFToOptimize;
5335 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
 // NOTE(review): the condition guarding this `continue` (original lines
 // 5336-5337) is missing from this listing.
5338 continue;
5339
5340 // Bail out on recipes not supported at the moment:
5341 // * phi recipes other than the canonical induction
5342 // * recipes writing to memory except interleave groups
5343 // Only support plans with a canonical induction phi.
5344 if (R.isPhi())
5345 return nullptr;
5346
5347 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5348 if (R.mayWriteToMemory() && !InterleaveR)
5349 return nullptr;
5350
5351 // Bail out if any recipe defines a vector value used outside the
5352 // vector loop region.
5353 if (any_of(R.definedValues(), [&](VPValue *V) {
5354 return any_of(V->users(), [&](VPUser *U) {
5355 auto *UR = cast<VPRecipeBase>(U);
5356 return UR->getParent()->getParent() != VectorLoop;
5357 });
5358 }))
5359 return nullptr;
5360
5361 // All other ops are allowed, but we reject uses that cannot be converted
5362 // when checking all allowed consumers (store interleave groups) below.
5363 if (!InterleaveR)
5364 continue;
5365
5366 // Try to find a single VF, where all interleave groups are consecutive and
5367 // saturate the full vector width. If we already have a candidate VF, check
5368 // if it is applicable for the current InterleaveR, otherwise look for a
5369 // suitable VF across the Plan's VFs.
 // NOTE(review): the declaration of VFs (original line 5370) is missing from
 // this listing; the two lines below are its initializer.
5371 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5372 : to_vector(Plan.vectorFactors());
5373 std::optional<ElementCount> NarrowedVF =
5374 isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
5375 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5376 return nullptr;
5377 VFToOptimize = NarrowedVF;
5378
5379 // Skip read interleave groups.
5380 if (InterleaveR->getStoredValues().empty())
5381 continue;
5382
5383 // Narrow interleave groups, if all operands are already matching narrow
5384 // ops.
5385 auto *Member0 = InterleaveR->getStoredValues()[0];
5386 if (isAlreadyNarrow(Member0) &&
5387 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5388 StoreGroups.push_back(InterleaveR);
5389 continue;
5390 }
5391
5392 // For now, we only support full interleave groups storing load interleave
5393 // groups.
 // Each stored value must be defined by a full load interleave group, and the
 // stored value at index I must be that group's value I.
5394 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5395 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5396 if (!DefR)
5397 return false;
5398 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5399 return IR && IR->getInterleaveGroup()->isFull() &&
5400 IR->getVPValue(Op.index()) == Op.value();
5401 })) {
5402 StoreGroups.push_back(InterleaveR);
5403 continue;
5404 }
5405
5406 // Check if all values feeding InterleaveR are matching wide recipes, with
5407 // operands that can be narrowed.
5408 if (!canNarrowOps(InterleaveR->getStoredValues(),
5409 VFToOptimize->isScalable()))
5410 return nullptr;
5411 StoreGroups.push_back(InterleaveR);
5412 }
5413
 // Without any store group there is nothing to narrow.
5414 if (StoreGroups.empty())
5415 return nullptr;
5416
5417 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
 // A middle block whose only successor is the scalar preheader means the
 // scalar epilogue always runs.
5418 bool RequiresScalarEpilogue =
5419 MiddleVPBB->getNumSuccessors() == 1 &&
5420 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5421 // Bail out for tail-folding (middle block with a single successor to exit).
5422 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5423 return nullptr;
5424
5425 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5426 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5427 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5428 // TODO: Handle cases where only some interleave groups can be narrowed.
5429 std::unique_ptr<VPlan> NewPlan;
5430 if (size(Plan.vectorFactors()) != 1) {
5431 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5432 Plan.setVF(*VFToOptimize);
5433 NewPlan->removeVF(*VFToOptimize);
5434 }
5435
5436 // Values that were already narrowed (or created as narrow), so that operands
 // shared between store groups are not narrowed a second time.
5437 SmallPtrSet<VPValue *, 4> NarrowedOps;
5438 // Narrow operation tree rooted at store groups.
5439 for (auto *StoreGroup : StoreGroups) {
 // Only stored value 0 is narrowed; it becomes the value of the wide store
 // that replaces the whole group.
5440 VPValue *Res =
5441 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5442 auto *SI =
5443 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5444 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
5445 /*Consecutive=*/true, {},
5446 StoreGroup->getDebugLoc());
5447 S->insertBefore(StoreGroup);
5448 StoreGroup->eraseFromParent();
5449 }
5450
5451 // Adjust induction to reflect that the transformed plan only processes one
5452 // original iteration.
 // NOTE(review): the definition of CanIVInc (original line 5453) is missing
 // from this listing.
5454 Type *CanIVTy = VectorLoop->getCanonicalIVType();
5455 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5456 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5457
 // The new step is UF for fixed VFs and vscale * UF for scalable VFs; uses of
 // VF are replaced by 1 and by vscale respectively.
5458 VPValue *UF = &Plan.getUF();
5459 VPValue *Step;
5460 if (VFToOptimize->isScalable()) {
5461 VPValue *VScale =
5462 PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
5463 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5464 {true, false});
5465 Plan.getVF().replaceAllUsesWith(VScale);
5466 } else {
5467 Step = UF;
5468 Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
5469 }
5470 // Materialize vector trip count with the narrowed step.
5471 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5472 RequiresScalarEpilogue, Step);
5473
5474 CanIVInc->setOperand(1, Step);
5475 Plan.getVFxUF().replaceAllUsesWith(Step);
5476
 // The wide recipes that were stood in for above are left in place by
 // narrowInterleaveGroupOp; clean up the ones that are now dead.
5477 removeDeadRecipes(Plan);
 // NOTE(review): the predicate of this assert (original line 5479) is missing
 // from this listing.
5478 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5480 "All VPVectorPointerRecipes should have been removed");
5481 return NewPlan;
5482}
5483
5484/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5485/// BranchOnCond recipe.
///
/// The weights are {1, VectorStep - 1}, where VectorStep is the concrete UF of
/// \p Plan times the known minimum of \p VF, additionally multiplied by
/// \p VScaleForTuning when \p VF is scalable and a tuning value is provided.
///
/// NOTE(review): this listing omits the line carrying the function name
/// (original line 5486).
5487 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5488 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
 // NOTE(review): the initializer of MiddleTerm (original line 5490) is missing
 // from this listing.
5489 auto *MiddleTerm =
5491 // Only add branch metadata if there is a (conditional) terminator.
5492 if (!MiddleTerm)
5493 return;
5494
5495 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5496 "must have a BranchOnCond");
5497 // Assume that `TripCount % VectorStep ` is equally distributed.
5498 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5499 if (VF.isScalable() && VScaleForTuning.has_value())
5500 VectorStep *= *VScaleForTuning;
 // NOTE(review): the message below mentions the trip count, but the value
 // checked is VectorStep; a zero VectorStep would wrap `VectorStep - 1`.
5501 assert(VectorStep > 0 && "trip count should not be zero");
5502 MDBuilder MDB(Plan.getContext());
5503 MDNode *BranchWeights =
5504 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5505 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5506}
5507
5509 VFRange &Range) {
 // NOTE(review): this listing omits the line carrying the function name and
 // the leading parameters (original line 5508).
 //
 // For every first-order recurrence phi in the header of the vector loop
 // region, rewrite the middle-block extracts of the recurrence splice that
 // feed VPIRPhi users so that they use the penultimate element instead.
5510 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5511 auto *MiddleVPBB = Plan.getMiddleBlock();
5512 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5513 VPTypeAnalysis TypeInfo(Plan);
5514
5515 auto IsScalableOne = [](ElementCount VF) -> bool {
5516 return VF == ElementCount::getScalable(1);
5517 };
5518
5519 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5520 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5521 if (!FOR)
5522 continue;
5523
5524 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5525 "Cannot handle loops with uncountable early exits");
5526
5527 // Find the existing splice for this FOR, created in
5528 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
5529 // RecurSplice there; only RecurSplice itself still references FOR.
 // NOTE(review): the initializer of RecurSplice (original line 5531) is
 // missing from this listing.
5530 auto *RecurSplice =
5532 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
5533
5534 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5535 // penultimate value of the recurrence. Instead we rely on the existing
5536 // extract of the last element from the result of
5537 // VPInstruction::FirstOrderRecurrenceSplice.
5538 // TODO: Consider vscale_range info and UF.
 // NOTE(review): the second half of this condition (original line 5541) is
 // missing from this listing. When the condition holds the whole function
 // returns, so later header phis are not processed either.
5539 if (any_of(RecurSplice->users(),
5540 [](VPUser *U) { return !cast<VPRecipeBase>(U)->getRegion(); }) &&
5542 Range))
5543 return;
5544
5545 // This is the second phase of vectorizing first-order recurrences, creating
5546 // extracts for users outside the loop. An overview of the transformation is
5547 // described below. Suppose we have the following loop with some use after
5548 // the loop of the last a[i-1],
5549 //
5550 // for (int i = 0; i < n; ++i) {
5551 // t = a[i - 1];
5552 // b[i] = a[i] - t;
5553 // }
5554 // use t;
5555 //
5556 // There is a first-order recurrence on "a". For this loop, the shorthand
5557 // scalar IR looks like:
5558 //
5559 // scalar.ph:
5560 // s.init = a[-1]
5561 // br scalar.body
5562 //
5563 // scalar.body:
5564 // i = phi [0, scalar.ph], [i+1, scalar.body]
5565 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5566 // s2 = a[i]
5567 // b[i] = s2 - s1
5568 // br cond, scalar.body, exit.block
5569 //
5570 // exit.block:
5571 // use = lcssa.phi [s1, scalar.body]
5572 //
5573 // In this example, s1 is a recurrence because its value depends on the
5574 // previous iteration. In the first phase of vectorization, we created a
5575 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5576 // for users in the scalar preheader and exit block.
5577 //
5578 // vector.ph:
5579 // v_init = vector(..., ..., ..., a[-1])
5580 // br vector.body
5581 //
5582 // vector.body
5583 // i = phi [0, vector.ph], [i+4, vector.body]
5584 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5585 // v2 = a[i, i+1, i+2, i+3]
5586 // v1' = splice(v1(3), v2(0, 1, 2))
5587 // b[i, i+1, i+2, i+3] = v2 - v1'
5588 // br cond, vector.body, middle.block
5589 //
5590 // middle.block:
5591 // vector.recur.extract.for.phi = v2(2)
5592 // vector.recur.extract = v2(3)
5593 // br cond, scalar.ph, exit.block
5594 //
5595 // scalar.ph:
5596 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5597 // [s.init, otherwise]
5598 // br scalar.body
5599 //
5600 // scalar.body:
5601 // i = phi [0, scalar.ph], [i+1, scalar.body]
5602 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5603 // s2 = a[i]
5604 // b[i] = s2 - s1
5605 // br cond, scalar.body, exit.block
5606 //
5607 // exit.block:
5608 // lo = lcssa.phi [s1, scalar.body],
5609 // [vector.recur.extract.for.phi, middle.block]
5610 //
5611 // Update extracts of the splice in the middle block: they extract the
5612 // penultimate element of the recurrence.
 // NOTE(review): the loop header opening this range (original line 5613) is
 // missing from this listing; it iterates the non-phi recipes of the middle
 // block as R.
5614 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5615 if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(RecurSplice))))
5616 continue;
5617
5618 auto *ExtractR = cast<VPInstruction>(&R);
 // The penultimate element is taken from operand 1 of the splice, not from
 // the splice result itself.
5619 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5620 VPInstruction::ExtractPenultimateElement, RecurSplice->getOperand(1),
5621 {}, "vector.recur.extract.for.phi");
 // Only VPIRPhi users are redirected; all other users keep ExtractR. The user
 // list is copied first because it is modified while iterating.
5622 for (VPUser *ExitU : to_vector(ExtractR->users())) {
5623 if (auto *ExitPhi = dyn_cast<VPIRPhi>(ExitU))
5624 ExitPhi->replaceUsesOfWith(ExtractR, PenultimateElement);
5625 }
5626 }
5627 }
5628}
5629
5630/// Check if \p V is a binary expression of a widened IV and a loop-invariant
5631/// value. Returns the widened IV if found, nullptr otherwise.
///
/// \p V must be defined by a VPWidenRecipe with a binary opcode other than an
/// integer division or remainder. The widened IV may be either operand; the
/// other operand must be defined outside of loop regions.
///
/// NOTE(review): this listing omits the signature line (original line 5632);
/// the call in the find-IV optimization below shows the name getExpressionIV.
5633 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
5634 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
5635 Instruction::isIntDivRem(BinOp->getOpcode()))
5636 return nullptr;
5637
5638 VPValue *WidenIVCandidate = BinOp->getOperand(0);
5639 VPValue *InvariantCandidate = BinOp->getOperand(1);
 // If operand 0 is not a widened induction, assume the operands are the other
 // way around; the final dyn_cast rejects the case where neither is one.
5640 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
5641 std::swap(WidenIVCandidate, InvariantCandidate);
5642
5643 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
5644 return nullptr;
5645
5646 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
5647}
5648
5649/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
5650/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
///
/// \p BinOp itself is left untouched: a clone is modified, inserted and
/// returned. Exactly one of the clone's two operands is replaced; operand 0 is
/// checked first.
///
/// NOTE(review): this listing omits the signature and the start of the first
/// assert (original lines 5651-5653).
5654 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
5655 auto *ClonedOp = BinOp->clone();
5656 if (ClonedOp->getOperand(0) == WidenIV) {
5657 ClonedOp->setOperand(0, ScalarIV);
5658 } else {
5659 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
5660 ClonedOp->setOperand(1, ScalarIV);
5661 }
 // NOTE(review): assumes ScalarIV has a defining recipe - TODO confirm that
 // callers never pass a live-in.
5662 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
5663 return ClonedOp;
5664}
5665
5668 Loop &L) {
 // NOTE(review): this listing omits the lines carrying the function name and
 // the leading parameters (original lines 5666-5667).
 //
 // For each candidate reduction phi in the header of the vector loop region
 // whose find-last expression is an affine AddRec with a known non-zero step,
 // replace the reduction by a min/max reduction over the IV values (a
 // RecurKind::FindIV phi). "Condition was never true" is detected either with
 // a sentinel value outside the IV range or with an additional boolean AnyOf
 // reduction.
5669 ScalarEvolution &SE = *PSE.getSE();
5670 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
5671
5672 // Helper lambda to check if the IV range excludes the sentinel value. Try
5673 // signed first, then unsigned. Return an excluded sentinel if found,
5674 // otherwise return std::nullopt.
 // For a max reduction the candidate sentinel is the minimum value of the
 // type, for a min reduction it is the maximum value.
5675 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
5676 bool UseMax) -> std::optional<APSInt> {
5677 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
5678 for (bool Signed : {true, false}) {
5679 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
5680 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
5681
5682 ConstantRange IVRange =
5683 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
5684 if (!IVRange.contains(Sentinel))
5685 return Sentinel;
5686 }
5687 return std::nullopt;
5688 };
5689
5690 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
 // Phis are erased and inserted inside the loop, hence the early-inc range.
5691 for (VPRecipeBase &Phi :
5692 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
5693 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
 // NOTE(review): the start of this filter (original line 5694) is missing
 // from this listing; it skips phis based on PhiR and its recurrence kind.
5695 PhiR->getRecurrenceKind()))
5696 continue;
5697
 // Pointer and floating-point reductions are not handled.
5698 Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
5699 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
5700 continue;
5701
5702 // If there's a header mask, the backedge select will not be the find-last
5703 // select.
 // With a header mask the backedge value must be
 // select(HeaderMask, FindLastSelect, PhiR); the match rebinds
 // FindLastSelect to the inner select.
5704 VPValue *BackedgeVal = PhiR->getBackedgeValue();
5705 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
5706 if (HeaderMask &&
5707 !match(BackedgeVal,
5708 m_Select(m_Specific(HeaderMask),
5709 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
5710 continue;
5711
5712 // Get the find-last expression from the find-last select of the reduction
5713 // phi. The find-last select should be a select between the phi and the
5714 // find-last expression.
 // Both operand orders are accepted; which one matched is re-derived further
 // down from operand 1 of the select.
5715 VPValue *Cond, *FindLastExpression;
5716 if (!match(FindLastSelect, m_Select(m_VPValue(Cond), m_Specific(PhiR),
5717 m_VPValue(FindLastExpression))) &&
5718 !match(FindLastSelect,
5719 m_Select(m_VPValue(Cond), m_VPValue(FindLastExpression),
5720 m_Specific(PhiR))))
5721 continue;
5722
5723 // Check if FindLastExpression is a simple expression of a widened IV. If
5724 // so, we can track the underlying IV instead and sink the expression.
5725 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
5726 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
5727 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
5728 &L);
5729 const SCEV *Step;
5730 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
 // NOTE(review): the pattern argument of this assert (original line 5732) is
 // missing from this listing.
5731 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
5733 "IVOfExpressionToSink not being an AddRec must imply "
5734 "FindLastExpression not being an AddRec.");
5735 continue;
5736 }
5737
5738 // Determine direction from SCEV step.
5739 if (!SE.isKnownNonZero(Step))
5740 continue;
5741
5742 // Positive step means we need UMax/SMax to find the last IV value, and
5743 // UMin/SMin otherwise.
5744 bool UseMax = SE.isKnownPositive(Step);
5745 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
5746 bool UseSigned = SentinelVal && SentinelVal->isSigned();
5747
5748 // Sinking an expression will disable epilogue vectorization. Only use it,
5749 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
5750 // also prevent vectorizing using a sentinel (e.g., if the expression is a
5751 // multiply or divide by large constant, respectively), which also makes
5752 // sinking undesirable.
5753 if (IVOfExpressionToSink) {
5754 const SCEV *FindLastExpressionSCEV =
5755 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
 // Note that this match overwrites Step with the step of the full
 // expression.
5756 if (match(FindLastExpressionSCEV,
5757 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5758 bool NewUseMax = SE.isKnownPositive(Step);
5759 if (auto NewSentinel =
5760 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
5761 // The original expression already has a sentinel, so prefer not
5762 // sinking to keep epilogue vectorization possible.
5763 SentinelVal = *NewSentinel;
5764 UseSigned = NewSentinel->isSigned();
5765 UseMax = NewUseMax;
5766 IVSCEV = FindLastExpressionSCEV;
5767 IVOfExpressionToSink = nullptr;
5768 }
5769 }
5770 }
5771
5772 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
5773 // if the condition was ever true. Requires the IV to not wrap, otherwise we
5774 // cannot use min/max.
5775 if (!SentinelVal) {
5776 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
5777 if (AR->hasNoSignedWrap())
5778 UseSigned = true;
5779 else if (AR->hasNoUnsignedWrap())
5780 UseSigned = false;
5781 else
5782 continue;
5783 }
5784
 // NOTE(review): the definition of RdxResult (original lines 5785 and 5787)
 // is only partially present in this listing; it is looked up from
 // BackedgeVal and is replaced and erased further down.
5786 BackedgeVal,
5788
5789 VPValue *NewFindLastSelect = BackedgeVal;
5790 VPValue *SelectCond = Cond;
5791 if (!SentinelVal || IVOfExpressionToSink) {
5792 // When we need to create a new select, normalize the condition so that
5793 // PhiR is the last operand and include the header mask if needed.
5794 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
5795 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
 // If PhiR is the value selected when Cond is true, invert the condition so
 // that a true condition selects the find-last expression.
5796 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
5797 SelectCond = LoopBuilder.createNot(SelectCond);
5798
5799 // When tail folding, mask the condition with the header mask to prevent
5800 // propagating poison from inactive lanes in the last vector iteration.
5801 if (HeaderMask)
5802 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
5803
 // A new select is only needed if the condition changed or the tracked value
 // is the underlying IV rather than the original expression.
5804 if (SelectCond != Cond || IVOfExpressionToSink) {
5805 NewFindLastSelect = LoopBuilder.createSelect(
5806 SelectCond,
5807 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
5808 PhiR, DL);
5809 }
5810 }
5811
5812 // Create the reduction result in the middle block using sentinel directly.
5813 RecurKind MinMaxKind =
5814 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
5815 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
5816 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
5817 FastMathFlags());
5818 DebugLoc ExitDL = RdxResult->getDebugLoc();
5819 VPBuilder MiddleBuilder(RdxResult);
 // NOTE(review): the builder call creating ReducedIV (original line 5821) is
 // missing from this listing; the line below holds its trailing arguments.
5820 VPValue *ReducedIV =
5822 NewFindLastSelect, Flags, ExitDL);
5823
5824 // If IVOfExpressionToSink is an expression to sink, sink it now.
5825 VPValue *VectorRegionExitingVal = ReducedIV;
5826 if (IVOfExpressionToSink)
5827 VectorRegionExitingVal =
5828 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
5829 ReducedIV, IVOfExpressionToSink);
5830
5831 VPValue *NewRdxResult;
5832 VPValue *StartVPV = PhiR->getStartValue();
5833 if (SentinelVal) {
5834 // Sentinel-based approach: reduce IVs with min/max, compare against
5835 // sentinel to detect if condition was ever true, select accordingly.
 // The comparison uses ReducedIV (the reduced IV itself), while the selected
 // value is VectorRegionExitingVal, which may be the sunk expression.
5836 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
5837 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
5838 Sentinel, ExitDL);
5839 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
5840 StartVPV, ExitDL);
5841 StartVPV = Sentinel;
5842 } else {
5843 // Introduce a boolean AnyOf reduction to track if the condition was ever
5844 // true in the loop. Use it to select the initial start value, if it was
5845 // never true.
5846 auto *AnyOfPhi = new VPReductionPHIRecipe(
5847 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
5848 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
5849 AnyOfPhi->insertAfter(PhiR);
5850
 // The backedge operand of AnyOfPhi is a placeholder (false) until the Or is
 // created and wired in here.
5851 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
5852 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
5853 AnyOfPhi->setOperand(1, OrVal);
5854
5855 NewRdxResult = MiddleBuilder.createAnyOfReduction(
5856 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
5857
5858 // Initialize the IV reduction phi with the neutral element, not the
5859 // original start value, to ensure correct min/max reduction results.
5860 StartVPV = Plan.getOrAddLiveIn(
5861 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
5862 }
5863 RdxResult->replaceAllUsesWith(NewRdxResult);
5864 RdxResult->eraseFromParent();
5865
 // Replace the original phi by a FindIV phi that starts at the sentinel or
 // the identity chosen above and uses the (possibly new) find-last select as
 // its backedge value.
5866 auto *NewPhiR = new VPReductionPHIRecipe(
5867 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
5868 *NewFindLastSelect, RdxUnordered{1}, {},
5869 PhiR->hasUsesOutsideReductionChain());
5870 NewPhiR->insertBefore(PhiR);
5871 PhiR->replaceAllUsesWith(NewPhiR);
5872 PhiR->eraseFromParent();
5873 }
5874}
5875
5876namespace {
5877
/// Shorthand for the extend kinds understood by the partial reduction cost
/// interface.
5878using ExtendKind = TTI::PartialReductionExtendKind;
/// Describes one extend feeding a partial reduction: the type being extended
/// from and the kind of extend. The defaults (null type, PR_None) describe
/// "no extend".
5879struct ReductionExtend {
5880 Type *SrcType = nullptr;
5881 ExtendKind Kind = ExtendKind::PR_None;
5882};
5883
5884/// Describes the extends used to compute the extended reduction operand.
5885/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
5886/// operation.
/// An absent ExtendB is represented by a Kind of ExtendKind::PR_None, which is
/// what getPartialReductionLinkCost tests for.
5887struct ExtendedReductionOperand {
5888 /// The recipe that consumes the extends.
5889 VPWidenRecipe *ExtendsUser = nullptr;
5890 /// Extend descriptions (inputs to getPartialReductionCost).
5891 ReductionExtend ExtendA, ExtendB;
5892};
5893
5894/// A chain of recipes that form a partial reduction. Matches either
5895/// reduction_bin_op (extended op, accumulator), or
5896/// reduction_bin_op (accumulator, extended op).
5897/// The possible forms of the "extended op" are listed in
5898/// matchExtendedReductionOperand.
///
/// Note that RK, AccumulatorOpIdx and ScaleFactor have no default member
/// initializer, so they must be set explicitly by whoever creates a chain.
5899struct VPPartialReductionChain {
5900 /// The top-level binary operation that forms the reduction to a scalar
5901 /// after the loop body.
5902 VPWidenRecipe *ReductionBinOp = nullptr;
5903 /// The user of the extends that is then reduced.
5904 ExtendedReductionOperand ExtendedOp;
5905 /// The recurrence kind for the entire partial reduction chain.
5906 /// This allows distinguishing between Sub and AddWithSub recurrences,
5907 /// when the ReductionBinOp is a Instruction::Sub.
5908 RecurKind RK;
5909 /// The index of the accumulator operand of ReductionBinOp. The extended op
5910 /// is `1 - AccumulatorOpIdx`.
5911 unsigned AccumulatorOpIdx;
 /// The VF scale factor of the partial reduction: transformToPartialReduction
 /// passes it as the VFScaleFactor of the new reduction recipe and sets it on
 /// the reduction phi.
5912 unsigned ScaleFactor;
5913};
5914
/// Rewrite the extended operand \p Op of a partial reduction into a form with
/// extends that is friendlier to partial reductions. Three patterns are
/// tried in order (see the comments in the body). Returns the recipe to use
/// as the extended operand: \p Op itself (possibly with an updated operand),
/// a newly created zext of an absolute difference, or the inner multiply with
/// widened extends.
5915static VPSingleDefRecipe *
5916optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op,
5917 VPTypeAnalysis &TypeInfo) {
5918 // reduce.add(mul(ext(A), C))
5919 // -> reduce.add(mul(ext(A), ext(trunc(C))))
5920 const APInt *Const;
5921 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
5922 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
5923 Instruction::CastOps ExtOpc = ExtA->getOpcode();
5924 Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
 // NOTE(review): the start of the second condition (original line 5926) is
 // missing from this listing; it is a check on Const, NarrowTy and the extend
 // kind. Op is returned unchanged if it has more than one use or that check
 // fails.
5925 if (!Op->hasOneUse() ||
5927 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
5928 return Op;
5929
 // Replace the constant operand by ext(trunc(C)), using the same extend
 // opcode and the same narrow/wide types as the extend of A.
5930 VPBuilder Builder(Op);
5931 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
5932 Op->getOperand(1), NarrowTy);
5933 Type *WideTy = TypeInfo.inferScalarType(ExtA);
5934 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
5935 return Op;
5936 }
5937
5938 // reduce.add(abs(sub(ext(A), ext(B))))
5939 // -> reduce.add(ext(absolute-difference(A, B)))
 // NOTE(review): the match opening this block (original lines 5941-5942) is
 // missing from this listing; it binds X and Y to the unextended inputs.
5940 VPValue *X, *Y;
5943 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
5944 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
5945 assert(Ext->getOpcode() ==
5946 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
5947 "Expected both the LHS and RHS extends to be the same");
5948 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
5949 VPBuilder Builder(Op);
5950 Type *SrcTy = TypeInfo.inferScalarType(X);
 // The absolute difference is computed in the narrow type as
 // max(X, Y) - min(X, Y), with signedness following the original extends.
 // X and Y are frozen first, presumably because each is used twice (by the
 // max and by the min) - TODO confirm.
5951 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
5952 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
5953 auto *Max = Builder.insert(
5954 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
5955 {FreezeX, FreezeY}, SrcTy));
5956 auto *Min = Builder.insert(
5957 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
5958 {FreezeX, FreezeY}, SrcTy));
5959 auto *AbsDiff =
5960 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
 // The result is always zero-extended to the type of Op, regardless of the
 // signedness of the original extends.
5961 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
5962 TypeInfo.inferScalarType(Op));
5963 }
5964
5965 // reduce.add(ext(mul(ext(A), ext(B))))
5966 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5967 // TODO: Support this optimization for float types.
 // NOTE(review): the start of the match (original line 5968) is missing from
 // this listing.
5969 m_ZExtOrSExt(m_VPValue()))))) {
5970 auto *Ext = cast<VPWidenCastRecipe>(Op);
5971 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
5972 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
5973 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
 // Bail out if the multiply has other users, if the outer extend kind differs
 // from the inner one (unless both multiply operands are the same recipe), or
 // if the two inner extends differ in kind.
5974 if (!Mul->hasOneUse() ||
5975 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
5976 MulLHS->getOpcode() != MulRHS->getOpcode())
5977 return Op;
5978 VPBuilder Builder(Mul);
5979 Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
5980 MulLHS->getOperand(0),
5981 Ext->getResultType()));
 // When both operands were the same extend, reuse the new wide extend rather
 // than creating a second one.
5982 Mul->setOperand(1, MulLHS == MulRHS
5983 ? Mul->getOperand(0)
5984 : Builder.createWidenCast(MulRHS->getOpcode(),
5985 MulRHS->getOperand(0),
5986 Ext->getResultType()));
5987 return Mul;
5988 }
5989
5990 return Op;
5991}
5992
/// Bundle the partial reduction \p Red and the recipes computing its vector
/// operand into a new VPExpressionRecipe. The vector operand must have one of
/// the shapes handled below; any other shape hits llvm_unreachable. The
/// returned recipe is newly allocated and not yet inserted anywhere.
5993static VPExpressionRecipe *
5994createPartialReductionExpression(VPReductionRecipe *Red) {
5995 VPValue *VecOp = Red->getVecOp();
5996
5997 // reduce.[f]add(ext(op))
5998 // -> VPExpressionRecipe(op, red)
5999 if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
6000 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
6001
6002 // reduce.[f]add([f]mul(ext(a), ext(b)))
6003 // -> VPExpressionRecipe(a, b, mul, red)
 // NOTE(review): the integer pattern of the second match (original line 6006)
 // is missing from this listing.
6004 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
6005 match(VecOp,
6007 auto *Mul = cast<VPWidenRecipe>(VecOp);
6008 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6009 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6010 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
6011 }
6012
6013 // reduce.add(neg(mul(ext(a), ext(b))))
6014 // -> VPExpressionRecipe(a, b, mul, sub, red)
 // NOTE(review): the start of this match (original line 6015) is missing from
 // this listing. The negation is a Sub whose operand 1 is the multiply.
6016 m_ZExtOrSExt(m_VPValue()))))) {
6017 auto *Sub = cast<VPWidenRecipe>(VecOp);
6018 auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
6019 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6020 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6021 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
6022 }
6023
6024 llvm_unreachable("Unsupported expression");
6025}
6026
6027// Helper to transform a partial reduction chain into a partial reduction
6028// recipe. Assumes profitability has been checked.
//
// The reduction binary operation of \p Chain is replaced by a
// VPReductionRecipe wrapped in a VPExpressionRecipe. If that operation is the
// last link in the chain (it, or a select of it, is the backedge value of
// \p RdxPhi), the phi and its start value are additionally scaled by the
// chain's scale factor, and for Sub/FSub recurrences the final result is
// rewritten to subtract the reduced value from the original start value.
6029static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6030 VPTypeAnalysis &TypeInfo, VPlan &Plan,
6031 VPReductionPHIRecipe *RdxPhi) {
6032 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6033 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6034
6035 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6036 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6037 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6038
6039 // Sub-reductions can be implemented in two ways:
6040 // (1) negate the operand in the vector loop (the default way).
6041 // (2) subtract the reduced value from the init value in the middle block.
6042 // Both ways keep the reduction itself as an 'add' reduction.
6043 //
6044 // The ISD nodes for partial reductions don't support folding the
6045 // sub/negation into its operands because the following is not a valid
6046 // transformation:
6047 // sub(0, mul(ext(a), ext(b)))
6048 // -> mul(ext(a), ext(sub(0, b)))
6049 //
6050 // It's therefore better to choose option (2) such that the partial
6051 // reduction is always positive (starting at '0') and to do a final
6052 // subtract in the middle block.
 //
 // The branch below applies option (1) to a Sub operation inside a chain
 // whose recurrence kind is not RecurKind::Sub. Option (2) is applied at the
 // end of this function for RecurKind::Sub and RecurKind::FSub chains.
6053 if (WidenRecipe->getOpcode() == Instruction::Sub &&
6054 Chain.RK != RecurKind::Sub) {
6055 VPBuilder Builder(WidenRecipe);
6056 Type *ElemTy = TypeInfo.inferScalarType(ExtendedOp);
6057 auto *Zero = Plan.getZero(ElemTy);
 // NOTE(review): the trailing constructor arguments (original line 6060) are
 // missing from this listing.
6058 auto *NegRecipe =
6059 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
6061 Builder.insert(NegRecipe);
6062 ExtendedOp = NegRecipe;
6063 }
6064
6065 assert((Chain.RK != RecurKind::FAddChainWithSubs) &&
6066 "FSub chain reduction isn't supported");
6067
6068 // FIXME: Do these transforms before invoking the cost-model.
6069 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp, TypeInfo);
6070
6071 // Check if WidenRecipe is the final result of the reduction. If so look
6072 // through selects for predicated reductions.
 // NOTE(review): the declaration of ExitValue (original line 6074) is missing
 // from this listing; it searches for a user of WidenRecipe of the form
 // select(Cond, WidenRecipe, RdxPhi) and binds Cond on success.
6073 VPValue *Cond = nullptr;
6075 WidenRecipe,
6076 m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
6077 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6078 RdxPhi->getBackedgeValue() == ExitValue;
6079 assert((!ExitValue || IsLastInChain) &&
6080 "if we found ExitValue, it must match RdxPhi's backedge value");
6081
6082 Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
 // NOTE(review): the initializer of RdxKind (original line 6084) is missing
 // from this listing; PhiType is presumably used there - TODO confirm.
6083 RecurKind RdxKind =
 // Fast-math flags are only carried over for FAdd reductions; Cond (if any)
 // becomes the condition of the new reduction recipe.
6085 auto *PartialRed = new VPReductionRecipe(
6086 RdxKind,
6087 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6088 : FastMathFlags(),
6089 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6090 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6091 PartialRed->insertBefore(WidenRecipe);
6092
 // The select of a predicated reduction is folded into the reduction recipe
 // via Cond, so its users are redirected as well.
6093 if (Cond)
6094 ExitValue->replaceAllUsesWith(PartialRed);
6095 WidenRecipe->replaceAllUsesWith(PartialRed);
6096
6097 // For cost-model purposes, fold this into a VPExpression.
6098 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6099 E->insertBefore(WidenRecipe);
6100 PartialRed->replaceAllUsesWith(E);
6101
6102 // We only need to update the PHI node once, which is when we find the
6103 // last reduction in the chain.
6104 if (!IsLastInChain)
6105 return;
6106
6107 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6108 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6109 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6110
 // Operand 2 of the ReductionStartVector instruction is overwritten with the
 // scale factor (the first argument of getConstantInt is presumably the bit
 // width, i.e. a 32-bit constant - TODO confirm).
6111 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6112 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6113 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6114 StartInst->setOperand(2, NewScaleFactor);
6115
6116 // If this is the last value in a sub-reduction chain, then update the PHI
6117 // node to start at `0` and update the reduction-result to subtract from
6118 // the PHI's start value.
6119 if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
6120 return;
6121
 // Operand 1 of the start instruction replaces operand 0; per the comment
 // above this makes the phi start at `0` (operand 1 is presumably the
 // identity value - TODO confirm).
6122 VPValue *OldStartValue = StartInst->getOperand(0);
6123 StartInst->setOperand(0, StartInst->getOperand(1));
6124
6125 // Replace reduction_result by 'sub (startval, reductionresult)'.
 // NOTE(review): the lookup defining RdxResult (original line 6126) is
 // missing from this listing.
6127 assert(RdxResult && "Could not find reduction result");
6128
6129 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6130 unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
6131 : Instruction::BinaryOps::Sub;
6132 VPInstruction *NewResult = Builder.createNaryOp(
6133 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6134 RdxPhi->getDebugLoc());
 // Redirect every user of RdxResult to NewResult, except NewResult itself,
 // which must keep RdxResult as its operand.
6135 RdxResult->replaceUsesWithIf(
6136 NewResult,
6137 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6138}
6139
6140/// Returns the cost of a link in a partial-reduction chain for a given VF.
6141static InstructionCost
6142getPartialReductionLinkCost(VPCostContext &CostCtx,
6143 const VPPartialReductionChain &Link,
6144 ElementCount VF) {
6145 Type *RdxType = CostCtx.Types.inferScalarType(Link.ReductionBinOp);
6146 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6147 std::optional<unsigned> BinOpc = std::nullopt;
6148 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6149 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6150 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6151
6152 std::optional<llvm::FastMathFlags> Flags;
6153 if (RdxType->isFloatingPointTy())
6154 Flags = Link.ReductionBinOp->getFastMathFlags();
6155
6156 auto GetLinkOpcode = [&Link]() -> unsigned {
6157 switch (Link.RK) {
6158 case RecurKind::Sub:
6159 return Instruction::Add;
6160 case RecurKind::FSub:
6161 return Instruction::FAdd;
6162 default:
6163 return Link.ReductionBinOp->getOpcode();
6164 }
6165 };
6166
6167 return CostCtx.TTI.getPartialReductionCost(
6168 GetLinkOpcode(), ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType,
6169 RdxType, VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6170 CostCtx.CostKind, Flags);
6171}
6172
/// Maps the widened cast recipe \p Cast to an ExtendKind value.
// NOTE(review): the embedded listing numbers jump from 6173 to 6175, so the
// body line (6174) is not visible in this excerpt. Confirm the actual mapping
// against the upstream file before relying on this summary.
6173static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6175}
6176
6177/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6178/// operand. This is an operand where the source of the value (e.g. a load) has
6179/// been extended (sext, zext, or fpext) before it is used in the reduction.
6180///
6181/// Possible forms matched by this function:
6182/// - UpdateR(PrevValue, ext(...))
6183/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6184/// - UpdateR(PrevValue, mul(ext(...), Constant))
6185/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6186/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6187/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...))))
6188///
6189/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
///
/// \returns an ExtendedReductionOperand naming the recipe that uses the
/// extend(s) together with the source type and extend kind of each extended
/// input, or std::nullopt when \p Op does not have one of the forms above.
6190static std::optional<ExtendedReductionOperand>
6191matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op,
6192 VPTypeAnalysis &TypeInfo) {
6193 assert(is_contained(UpdateR->operands(), Op) &&
6194 "Op should be operand of UpdateR");
6195
6196 // Try matching an absolute difference operand of the form
6197 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6198 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6199 // difference on a wider type and get the extend for "free" from the partial
6200 // reduction.
6201 VPValue *X, *Y;
6202 if (Op->hasOneUse() &&
 // NOTE(review): listing lines 6203-6205 (the rest of this condition) are not
 // visible in this excerpt; presumably they match the abs(sub(ext, ext))
 // pattern and bind X and Y, which are read below.
6206 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6207 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6208 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6209 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6210 Type *LHSInputType = TypeInfo.inferScalarType(X);
6211 Type *RHSInputType = TypeInfo.inferScalarType(Y);
 // Both inputs must have the same scalar type and use the same cast opcode.
6212 if (LHSInputType != RHSInputType ||
6213 LHSExt->getOpcode() != RHSExt->getOpcode())
6214 return std::nullopt;
6215 // Note: This is essentially the same as matching ext(...) as we will
6216 // rewrite this operand to ext(absolute-difference(A, B)).
6217 return ExtendedReductionOperand{
6218 Sub,
6219 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6220 /*ExtendB=*/{}};
6221 }
6222
6223 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
 // NOTE(review): listing line 6224 is not visible in this excerpt; presumably
 // it opens the `if` that matches Op as an extend, closed on listing line 6249.
6225 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6226 VPValue *CastSource = CastRecipe->getOperand(0);
6227 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6228 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6229 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6230 // Match: ext(mul(...))
6231 // Record the outer extend kind and set `Op` to the mul. We can then match
6232 // this as a binary operation. Note: We can optimize out the outer extend
6233 // by widening the inner extends to match it. See
6234 // optimizeExtendsForPartialReduction.
6235 Op = CastSource;
6236 // FIXME: createPartialReductionExpression can't handle sub(ext(mul(...)))
6237 if (UpdateR->getOpcode() == Instruction::Sub)
6238 return std::nullopt;
6239 } else if (UpdateR->getOpcode() == Instruction::Add ||
6240 UpdateR->getOpcode() == Instruction::FAdd) {
6241 // Match: UpdateR(PrevValue, ext(...))
6242 // TODO: Remove the add/fadd restriction (we should be able to handle this
6243 // case for sub reductions too).
6244 return ExtendedReductionOperand{
6245 UpdateR,
6246 /*ExtendA=*/{TypeInfo.inferScalarType(CastSource), *OuterExtKind},
6247 /*ExtendB=*/{}};
6248 }
6249 }
6250
6251 if (!Op->hasOneUse())
6252 return std::nullopt;
6253
 // NOTE(review): listing line 6254 is not visible in this excerpt; presumably
 // it defines MulOp from Op (it may be null, given the check below).
6255 if (!MulOp ||
6256 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6257 return std::nullopt;
6258
6259 // The rest of the matching assumes `Op` is a (possibly extended/negated)
6260 // binary operation.
6261
6262 VPValue *LHS = MulOp->getOperand(0);
6263 VPValue *RHS = MulOp->getOperand(1);
6264
6265 // The LHS of the operation must always be an extend.
 // NOTE(review): listing line 6266 (the condition guarding this return) is not
 // visible in this excerpt.
6267 return std::nullopt;
6268
6269 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6270 Type *LHSInputType = TypeInfo.inferScalarType(LHSCast->getOperand(0));
6271 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6272
6273 // The RHS of the operation can be an extend or a constant integer.
6274 const APInt *RHSConst = nullptr;
6275 VPWidenCastRecipe *RHSCast = nullptr;
 // NOTE(review): listing line 6276 (the `if` that accepts RHS as an extend) is
 // not visible in this excerpt.
6277 RHSCast = cast<VPWidenCastRecipe>(RHS);
6278 else if (!match(RHS, m_APInt(RHSConst)) ||
6279 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6280 return std::nullopt;
6281
6282 // The outer extend kind must match the inner extends for folding.
 // RHSCast stays null when the RHS is a constant, hence the null check on Cast.
6283 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6284 if (Cast && OuterExtKind &&
6285 getPartialReductionExtendKind(Cast) != OuterExtKind)
6286 return std::nullopt;
6287
 // A constant RHS is described with the LHS's source type and extend kind;
 // these defaults are overwritten only when the RHS is itself an extend.
6288 Type *RHSInputType = LHSInputType;
6289 ExtendKind RHSExtendKind = LHSExtendKind;
6290 if (RHSCast) {
6291 RHSInputType = TypeInfo.inferScalarType(RHSCast->getOperand(0));
6292 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6293 }
6294
6295 return ExtendedReductionOperand{
6296 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6297}
6298
6299/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6300/// and determines if the target can use a cheaper operation with a wider
6301/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6302/// of operations in the reduction.
///
/// The chain is discovered by walking from the reduction's exit value back to
/// \p RedPhiR. std::nullopt is returned when no compute-reduction-result is
/// found, when a link is not a widened binary operation, when neither operand
/// of a link matches as an extended reduction operand, or when the PHI's size
/// is not a known scalar multiple of the extend's source size.
/// NOTE(review): no line visible in this excerpt reads \p Range or queries a
/// cost inside this function; confirm whether that parameter is still needed.
6303static std::optional<SmallVector<VPPartialReductionChain>>
6304getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6305 VFRange &Range) {
6306 // Get the backedge value from the reduction PHI and find the
6307 // ComputeReductionResult that uses it (directly or through a select for
6308 // predicated reductions).
6309 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6310 if (!RdxResult)
6311 return std::nullopt;
6312 VPValue *ExitValue = RdxResult->getOperand(0);
 // On a successful select match, ExitValue is rebound to the select's second
 // operand; the boolean result of match() is deliberately not checked.
6313 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6314
6315 VPTypeAnalysis &TypeInfo = CostCtx.Types;
 // NOTE(review): listing line 6316 is not visible in this excerpt; presumably
 // it declares `Chain`, the container filled and returned below.
6317 RecurKind RK = RedPhiR->getRecurrenceKind();
6318 Type *PhiType = TypeInfo.inferScalarType(RedPhiR);
6319 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6320
6321 // Work backwards from the ExitValue examining each reduction operation.
6322 VPValue *CurrentValue = ExitValue;
6323 while (CurrentValue != RedPhiR) {
6324 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6325 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6326 return std::nullopt;
6327
6328 VPValue *Op = UpdateR->getOperand(1);
6329 VPValue *PrevValue = UpdateR->getOperand(0);
6330
6331 // Find the extended operand. The other operand (PrevValue) is the next link
6332 // in the reduction chain.
6333 std::optional<ExtendedReductionOperand> ExtendedOp =
6334 matchExtendedReductionOperand(UpdateR, Op, TypeInfo);
6335 if (!ExtendedOp) {
 // Retry with the operand roles swapped: operand 0 may be the extended one.
6336 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue, TypeInfo);
6337 if (!ExtendedOp)
6338 return std::nullopt;
6339 std::swap(Op, PrevValue);
6340 }
6341
6342 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6343 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6344 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6345 return std::nullopt;
6346
6347 // Check if a partial reduction chain is supported by the target (i.e. does
6348 // not have an invalid cost) for the given VF range. Clamps the range and
6349 // returns true if feasible for any VF.
 // NOTE(review): the statements visible below only record the link (which
 // operand index continues the chain, and PHI size / extend source size as the
 // scale factor); no cost check or range clamping is visible here, so the
 // comment above may be stale.
6350 VPPartialReductionChain Link(
6351 {UpdateR, *ExtendedOp, RK,
6352 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
6353 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
6354 Chain.push_back(Link);
6355 CurrentValue = PrevValue;
6356 }
6357
6358 // The chain links were collected by traversing backwards from the exit value.
6359 // Reverse the chains so they are in program order.
6360 std::reverse(Chain.begin(), Chain.end());
6361 return Chain;
6362}
6363} // namespace
6364
// Collects candidate partial-reduction chains for each reduction PHI in the
// header block of the plan's vector loop region, discards the chains that fail
// the validity or profitability checks below, and rewrites the surviving links
// with transformToPartialReduction.
// NOTE(review): listing line 6365 (the first line of this definition's
// signature, including its name) is not visible in this excerpt.
6366 VPCostContext &CostCtx,
6367 VFRange &Range) {
6368 // Find all possible valid partial reductions, grouping chains by their PHI.
6369 // This grouping allows invalidating the whole chain, if any link is not a
6370 // valid partial reduction.
 // NOTE(review): listing line 6371 (the type of ChainsByPhi) is not visible in
 // this excerpt.
6372 ChainsByPhi;
6373 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6374 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6375 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6376 if (!RedPhiR)
6377 continue;
6378
6379 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6380 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6381 }
6382
6383 if (ChainsByPhi.empty())
6384 return;
6385
6386 // Build set of partial reduction operations for extend user validation and
6387 // a map of reduction bin ops to their scale factors for scale validation.
6388 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6389 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6390 for (const auto &[_, Chains] : ChainsByPhi)
6391 for (const VPPartialReductionChain &Chain : Chains) {
6392 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
6393 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6394 }
6395
6396 // A partial reduction is invalid if any of its extends are used by
6397 // something that isn't another partial reduction. This is because the
6398 // extends are intended to be lowered along with the reduction itself.
 // Operands that are not widened casts are accepted unconditionally.
6399 auto ExtendUsersValid = [&](VPValue *Ext) {
6400 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
6401 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6402 });
6403 };
6404
6405 auto IsProfitablePartialReductionChainForVF =
6406 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
6407 InstructionCost PartialCost = 0, RegularCost = 0;
6408
6409 // The chain is a profitable partial reduction chain if the cost of handling
6410 // the entire chain is cheaper when using partial reductions than when
6411 // handling the entire chain using regular reductions.
6412 for (const VPPartialReductionChain &Link : Chain) {
6413 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6414 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
 // A single link with an invalid cost rejects the whole chain for this VF.
6415 if (!LinkCost.isValid())
6416 return false;
6417
6418 PartialCost += LinkCost;
6419 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
6420 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6421 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6422 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
 // The regular (non-partial) lowering also pays for each extend feeding the
 // extends' user.
6423 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
6424 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
6425 RegularCost += Extend->computeCost(VF, CostCtx);
6426 }
6427 return PartialCost.isValid() && PartialCost < RegularCost;
6428 };
6429
6430 // Validate chains: check that extends are only used by partial reductions,
6431 // and that reduction bin ops are only used by other partial reductions with
6432 // matching scale factors, are outside the loop region or the select
6433 // introduced by tail-folding. Otherwise we would create users of scaled
6434 // reductions where the types of the other operands don't match.
6435 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6436 for (const VPPartialReductionChain &Chain : Chains) {
6437 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
6438 Chains.clear();
6439 break;
6440 }
 // A reduction PHI user is only valid when it is this chain's own PHI.
6441 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6442 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6443 return PhiR == RedPhiR;
6444 auto *R = cast<VPSingleDefRecipe>(U);
 // lookup_or(R, 0) yields 0 for users that are not recorded reduction bin ops.
6445 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
 // NOTE(review): listing line 6446 (the start of the second alternative of
 // this disjunction) is not visible in this excerpt.
6447 m_Specific(Chain.ReductionBinOp))) ||
6448 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6449 m_Specific(RedPhiR)));
6450 };
6451 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6452 Chains.clear();
6453 break;
6454 }
6455
6456 // Check if the compute-reduction-result is used by a sunk store.
6457 // TODO: Also form partial reductions in those cases.
6458 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6459 if (any_of(RdxResult->users(), [](VPUser *U) {
6460 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6461 return RepR && RepR->getOpcode() == Instruction::Store;
6462 })) {
6463 Chains.clear();
6464 break;
6465 }
6466 }
6467 }
6468
6469 // Clear the chain if it is not profitable.
 // NOTE(review): listing line 6470 (the start of the `if` whose callback and
 // Range arguments follow) is not visible in this excerpt.
6471 [&, &Chains = Chains](ElementCount VF) {
6472 return IsProfitablePartialReductionChainForVF(Chains, VF);
6473 },
6474 Range))
6475 Chains.clear();
6476 }
6477
 // Chains cleared above are empty, so only the surviving links are rewritten.
6478 for (auto &[Phi, Chains] : ChainsByPhi)
6479 for (const VPPartialReductionChain &Chain : Chains)
6480 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6481}
6482
// Replaces the load/store VPInstructions that have an underlying IR value with
// a concrete recipe: a final reduction store, a histogram recipe, a widened
// memory recipe, or a replicate recipe, tried in that order.
// NOTE(review): listing line 6483 (the first line of this definition's
// signature, including its name) is not visible in this excerpt.
6484 VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
6485 // Collect all loads/stores first. We will start with ones having simpler
6486 // decisions followed by more complex ones that are potentially
6487 // guided/dependent on the simpler ones.
 // NOTE(review): listing lines 6488 and 6490-6491 are not visible in this
 // excerpt; presumably they declare `MemOps` and supply the block range that
 // this loop iterates over.
6489 for (VPBasicBlock *VPBB :
6492 for (VPRecipeBase &R : *VPBB) {
6493 auto *VPI = dyn_cast<VPInstruction>(&R);
6494 if (VPI && VPI->getUnderlyingValue() &&
6495 is_contained({Instruction::Load, Instruction::Store},
6496 VPI->getOpcode()))
6497 MemOps.push_back(VPI);
6498 }
6499 }
6500
6501 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6502 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6503
6504 for (VPInstruction *VPI : MemOps) {
 // Inserts New before VPI, forwards VPI's uses to New's value when VPI is a
 // load, and erases VPI.
6505 auto ReplaceWith = [&](VPRecipeBase *New) {
6506 New->insertBefore(VPI);
6507 if (VPI->getOpcode() == Instruction::Load)
6508 VPI->replaceAllUsesWith(New->getVPSingleValue());
6509 VPI->eraseFromParent();
6510 };
6511
6512 // Note: we must do that for scalar VPlan as well.
6513 if (RecipeBuilder.replaceWithFinalIfReductionStore(VPI,
6514 FinalRedStoresBuilder))
6515 continue;
6516
6517 // Filter out scalar VPlan for the remaining memory operations.
 // NOTE(review): listing line 6518 (the start of the `if` whose callback and
 // Range arguments follow) is not visible in this excerpt.
6519 [](ElementCount VF) { return VF.isScalar(); }, Range))
6520 continue;
6521
6522 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
6523 ReplaceWith(Histogram);
6524 continue;
6525 }
6526
 // Fall back to replication when no widened memory recipe is produced.
6527 VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
6528 if (!Recipe)
6529 Recipe = RecipeBuilder.handleReplication(VPI, Range);
6530
6531 ReplaceWith(Recipe);
6532 }
6533}
6534
// Replaces qualifying VPInstructions with single-scalar, unmasked
// VPReplicateRecipes. Each block's recipes are visited in reverse order, and a
// VPInstruction is skipped when any of the checks below rejects it.
// NOTE(review): listing lines 6535-6536 (the definition's signature and the
// start of this first condition) are not visible in this excerpt; the visible
// part returns early based on a VF.isScalar() query over Range.
6537 [&](ElementCount VF) { return VF.isScalar(); }, Range))
6538 return;
6539
 // NOTE(review): listing lines 6540 and 6542 are not visible in this excerpt;
 // presumably they set up the block traversal starting at Plan.getEntry() and
 // open the loop that binds `VPBB`.
6541 Plan.getEntry());
6543 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
6544 auto *VPI = dyn_cast<VPInstruction>(&R);
6545 if (!VPI)
6546 continue;
6547
6548 auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
6549 // Wouldn't be able to create a `VPReplicateRecipe` anyway.
6550 if (!I)
6551 continue;
6552
6553 // If executing other lanes produces side-effects we can't avoid them.
6554 if (VPI->mayHaveSideEffects())
6555 continue;
6556
6557 // We want to drop the mask operand, verify we can safely do that.
6558 if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
6559 continue;
6560
6561 // Avoid rewriting IV increment as that interferes with
6562 // `removeRedundantCanonicalIVs`.
6563 if (VPI->getOpcode() == Instruction::Add &&
 // NOTE(review): listing line 6564 (the rest of this condition) is not visible
 // in this excerpt.
6565 continue;
6566
6567 // Other lanes are needed - can't drop them.
 // NOTE(review): listing line 6568 (the condition guarding this `continue`) is
 // not visible in this excerpt.
6569 continue;
6570
 // Build the replacement from VPI's non-mask operands, with no mask.
6571 auto *Recipe = new VPReplicateRecipe(
6572 I, VPI->operandsWithoutMask(), /*IsSingleScalar=*/true,
6573 /*Mask=*/nullptr, *VPI, *VPI, VPI->getDebugLoc());
6574 Recipe->insertBefore(VPI);
6575 VPI->replaceAllUsesWith(Recipe);
6576 VPI->eraseFromParent();
6577 }
6578 }
6579}
6580
6581/// Returns true if \p Info's parameter kinds are compatible with \p Args.
6582static bool areVFParamsOk(const VFInfo &Info, ArrayRef<VPValue *> Args,
6583 PredicatedScalarEvolution &PSE, const Loop *L,
6584 VPTypeAnalysis &Types) {
6585 ScalarEvolution *SE = PSE.getSE();
6586 return all_of(Info.Shape.Parameters, [&](VFParameter Param) {
6587 switch (Param.ParamKind) {
6588 case VFParamKind::Vector:
6589 case VFParamKind::GlobalPredicate:
6590 return true;
6591 case VFParamKind::OMP_Uniform:
6592 return SE->isSCEVable(Types.inferScalarType(Args[Param.ParamPos])) &&
6593 SE->isLoopInvariant(
6594 vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
6595 L);
6596 case VFParamKind::OMP_Linear:
6597 return match(vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
6598 m_scev_AffineAddRec(
6599 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
6600 m_SpecificLoop(L)));
6601 default:
6602 return false;
6603 }
6604 });
6605}
6606
6607/// Find a vector variant of \p CI for \p VF, respecting \p MaskRequired.
6608/// Returns the variant function, or nullptr. Masked variants are assumed to
6609/// take the mask as a trailing parameter.
///
/// nullptr is returned immediately when \p CI is marked no-builtin. Otherwise
/// the first mapping from VFDatabase::getMappings is chosen whose VF equals
/// \p VF, which is masked whenever \p MaskRequired is set, and whose parameter
/// kinds are accepted by areVFParamsOk for the given arguments. The result is
/// the function with that mapping's VectorName in the module containing \p CI.
// NOTE(review): listing lines 6610 and 6612 (parts of the signature, presumably
// declaring the CI, Args and PSE parameters used below) are not visible in
// this excerpt.
6611 ElementCount VF, bool MaskRequired,
6613 const Loop *L, VPTypeAnalysis &Types) {
6614 if (CI->isNoBuiltin())
6615 return nullptr;
6616 auto Mappings = VFDatabase::getMappings(*CI);
6617 const auto *It = find_if(Mappings, [&](const VFInfo &Info) {
6618 return Info.Shape.VF == VF && (!MaskRequired || Info.isMasked()) &&
6619 areVFParamsOk(Info, Args, PSE, L, Types);
6620 });
6621 if (It == Mappings.end())
6622 return nullptr;
6623 return CI->getModule()->getFunction(It->VectorName);
6624}
6625
6626namespace {
6627/// The outcome of choosing how to widen a call at a given VF.
6628struct CallWideningDecision {
6629 using KindTy = VPCostContext::CallWideningKind;
 // Not marked explicit, so a bare KindTy value converts to a decision with a
 // null Variant.
6630 CallWideningDecision(KindTy Kind, Function *Variant = nullptr)
6631 : Kind(Kind), Variant(Variant) {}
 /// The chosen widening strategy.
6632 KindTy Kind;
6633
6634 /// Set when Kind == VectorVariant.
 // NOTE(review): listing line 6635 is not visible in this excerpt; presumably
 // it declares the `Variant` member initialized by the constructor above.
6636
 /// Two decisions are equal when both the kind and the variant match.
6637 bool operator==(const CallWideningDecision &Other) const {
6638 return Kind == Other.Kind && Variant == Other.Variant;
6639 }
6640};
6641} // namespace
6642
6643/// Pick the cheapest widening for the call \p VPI at \p VF among scalarization,
6644/// vector intrinsic, and vector library variant.
///
/// Scalarization is chosen outright for scalar VFs and for calls that
/// CostCtx.willBeScalarized reports. Otherwise the intrinsic is chosen when
/// its cost is valid and no higher than the scalarization cost and the cost of
/// any vector variant found; failing that, a vector variant is chosen when it
/// costs no more than scalarizing; failing that, the call is scalarized.
6645static CallWideningDecision decideCallWidening(VPInstruction &VPI,
 // NOTE(review): listing line 6646 (a parameter line, presumably declaring the
 // `Ops` argument list used below) is not visible in this excerpt.
6647 ElementCount VF,
6648 VPCostContext &CostCtx) {
6649 auto *CI = cast<CallInst>(VPI.getUnderlyingInstr());
6650
6651 // Scalar VFs and calls forced or known to scalarize always replicate.
6652 if (VF.isScalar() || CostCtx.willBeScalarized(CI, VF))
6653 return CallWideningDecision::KindTy::Scalarize;
6654
6655 auto *CalledFn = cast<Function>(
 // NOTE(review): listing lines 6656 and 6658 are not visible in this excerpt;
 // presumably they complete the CalledFn initializer and define `ID`, the
 // intrinsic ID tested further down.
6657 Type *ResultTy = CostCtx.Types.inferScalarType(&VPI);
6659 bool MaskRequired = CostCtx.isMaskRequired(CI);
6660
6661 // Pseudo intrinsics (assume, lifetime, ...) are always scalarized.
 // NOTE(review): listing line 6662 (the condition guarding this return) is not
 // visible in this excerpt.
6663 return CallWideningDecision::KindTy::Scalarize;
6664
6665 InstructionCost ScalarCost =
6666 VPReplicateRecipe::computeCallCost(CalledFn, ResultTy, Ops,
6667 /*IsSingleScalar=*/false, VF, CostCtx);
6668
6669 Function *VecFunc = findVectorVariant(CI, Ops, VF, MaskRequired, CostCtx.PSE,
6670 CostCtx.L, CostCtx.Types);
 // NOTE(review): listing line 6671 (the declaration of VecCallCost and its
 // initial value) is not visible in this excerpt.
6672 if (VecFunc)
6673 VecCallCost = VPWidenCallRecipe::computeCallCost(VecFunc, CostCtx);
6674
6675 // Prefer the intrinsic if it is at least as cheap as scalarizing and any
6676 // available vector variant.
6677 if (ID) {
 // NOTE(review): listing lines 6678-6679 (the computation of IntrinsicCost)
 // are not visible in this excerpt.
6680 if (IntrinsicCost.isValid() && ScalarCost >= IntrinsicCost &&
6681 (!VecFunc || VecCallCost >= IntrinsicCost))
6682 return CallWideningDecision::KindTy::Intrinsic;
6683 }
6684
6685 // Otherwise, use a vector library variant when it beats scalarizing.
 // The comparison is `>=`, so a tie with the scalarization cost also selects
 // the vector variant.
6686 if (VecFunc && ScalarCost >= VecCallCost)
6687 return {CallWideningDecision::KindTy::VectorVariant, VecFunc};
6688
6689 return CallWideningDecision::KindTy::Scalarize;
6690}
6691
// Replaces each call VPInstruction that has an underlying IR value with the
// recipe chosen by decideCallWidening: a widened intrinsic, a widened call to
// a vector variant, or a replicate recipe. The replaced instructions are
// collected in ToErase and erased after the traversal.
// NOTE(review): listing line 6692 (the first line of this definition's
// signature, including its name) is not visible in this excerpt.
6693 VPRecipeBuilder &RecipeBuilder,
6694 VPCostContext &CostCtx) {
 // NOTE(review): listing lines 6695-6697 are not visible in this excerpt;
 // presumably they declare `ToErase` and open the loop over the blocks that
 // binds `VPBB`.
6698 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
6699 auto *VPI = dyn_cast<VPInstruction>(&R);
6700 if (!VPI || !VPI->getUnderlyingValue() ||
6701 VPI->getOpcode() != Instruction::Call)
6702 continue;
6703
6704 auto *CI = cast<CallInst>(VPI->getUnderlyingInstr());
 // Only the first CI->arg_size() operands (the call's arguments) are taken.
6705 SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
6706 VPI->op_begin() + CI->arg_size());
6707
 // Decide for the first VF of the range; the callback below compares every
 // other VF's decision against it.
6708 CallWideningDecision Decision =
6709 decideCallWidening(*VPI, Ops, Range.Start, CostCtx);
 // NOTE(review): listing line 6710 (the call that receives this callback and
 // Range) is not visible in this excerpt.
6711 [&](ElementCount VF) {
6712 return Decision == decideCallWidening(*VPI, Ops, VF, CostCtx);
6713 },
6714 Range);
6715
6716 VPSingleDefRecipe *Replacement = nullptr;
6717 switch (Decision.Kind) {
6718 case CallWideningDecision::KindTy::Intrinsic: {
 // NOTE(review): listing line 6719 (presumably the definition of `ID`) is not
 // visible in this excerpt.
6720 Type *ResultTy = CostCtx.Types.inferScalarType(VPI);
6721 Replacement = new VPWidenIntrinsicRecipe(*CI, ID, Ops, ResultTy, *VPI,
6722 *VPI, VPI->getDebugLoc());
6723 break;
6724 }
6725 case CallWideningDecision::KindTy::VectorVariant: {
6726 // Masked variants take the mask as a trailing parameter, so they have
6727 // one more parameter than the original call's arguments.
 // An all-true mask is supplied when VPI itself is not masked.
6728 if (Decision.Variant->arg_size() > Ops.size()) {
6729 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
6730 Ops.push_back(Mask);
6731 }
 // Append VPI's last non-mask operand (presumably the called function
 // operand — TODO confirm).
6732 Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
6733 Replacement = new VPWidenCallRecipe(CI, Decision.Variant, Ops, *VPI,
6734 *VPI, VPI->getDebugLoc());
6735 break;
6736 }
6737 case CallWideningDecision::KindTy::Scalarize:
6738 Replacement = RecipeBuilder.handleReplication(VPI, Range);
6739 break;
6740 }
6741
 // NOTE(review): listing lines 6742 and 6746 are not visible in this excerpt;
 // presumably 6742 opens the assert around this callback and 6746 is the
 // condition guarding the `return true` below.
6743 [&](ElementCount VF) {
6744 Intrinsic::ID IID =
6745 getVectorIntrinsicIDForCall(CI, &CostCtx.TLI);
6747 return true;
6748 auto Legacy = CostCtx.getLegacyCallKind(CI, VF);
6749 return !Legacy || *Legacy == Decision.Kind;
6750 }) &&
6751 "VPlan call widening decision must match legacy decision");
6752
6753 Replacement->insertBefore(VPI);
6754 VPI->replaceAllUsesWith(Replacement);
 // Erasure is deferred until the traversal is finished.
6755 ToErase.push_back(VPI);
6756 }
6757 }
6758 for (VPInstruction *VPI : ToErase)
6759 VPI->eraseFromParent();
6760}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static CallWideningDecision decideCallWidening(VPInstruction &VPI, ArrayRef< VPValue * > Ops, ElementCount VF, VPCostContext &CostCtx)
Pick the cheapest widening for the call VPI at VF among scalarization, vector intrinsic,...
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static Function * findVectorVariant(CallInst *CI, ArrayRef< VPValue * > Args, ElementCount VF, bool MaskRequired, PredicatedScalarEvolution &PSE, const Loop *L, VPTypeAnalysis &Types)
Find a vector variant of CI for VF, respecting MaskRequired.
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with an EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV, in a deep-traversal of the v...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static std::optional< Intrinsic::ID > getVPDivRemIntrinsic(Intrinsic::ID IntrID)
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void expandVPDerivedIV(VPDerivedIVRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPDerivedIVRecipe into executable recipes.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static bool areVFParamsOk(const VFInfo &Info, ArrayRef< VPValue * > Args, PredicatedScalarEvolution &PSE, const Loop *L, VPTypeAnalysis &Types)
Returns true if Info's parameter kinds are compatible with Args.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static std::optional< Instruction::BinaryOps > getUnmaskedDivRemOpcode(Intrinsic::ID ID)
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Return true if we do not know how to (mechanically) hoist or sink a non-memory or memory recipe R out...
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
Get the last element.
Definition ArrayRef.h:150
const T & front() const
Get the first element.
Definition ArrayRef.h:144
iterator end() const
Definition ArrayRef.h:130
iterator begin() const
Definition ArrayRef.h:129
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:333
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
size_t arg_size() const
Definition Function.h:901
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
The group of interleaved loads/stores sharing the same stride and close to each other.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1676
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
bool empty() const
Definition MapVector.h:79
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describe the aliasing of the location (each member is null if that kind of ...
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyzed scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
@ SK_Broadcast
Broadcast element 0 to all other elements.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3794
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4143
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4218
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4170
iterator end()
Definition VPlan.h:4180
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4178
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4231
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:244
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:560
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639
const VPRecipeBase & back() const
Definition VPlan.h:4192
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2773
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2809
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2799
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2815
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2795
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:93
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:314
VPRegionBlock * getParent()
Definition VPlan.h:185
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:214
size_t getNumSuccessors() const
Definition VPlan.h:236
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:305
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:221
VPlan * getPlan()
Definition VPlan.cpp:189
const std::string & getName() const
Definition VPlan.h:176
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:324
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:232
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:194
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:278
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:226
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:210
static auto blocksAs(T &&Range)
Return an iterator range over Range with each block cast to BlockTy.
Definition VPlanUtils.h:314
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:333
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:223
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:241
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:259
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:295
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:279
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3271
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createFirstActiveLane(ArrayRef< VPValue * > Masks, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1661
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3826
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:545
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:518
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:530
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:540
A recipe for converting the input IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3917
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3316
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2293
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2335
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2324
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2038
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4296
Class to record and manage LLVM IR flags.
Definition VPlan.h:685
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1153
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1208
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1434
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1305
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1248
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1299
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1243
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1240
@ CanonicalIVIncrementForPart
Definition VPlan.h:1224
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1251
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2910
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2902
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2931
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2983
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2941
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3458
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:401
VPBasicBlock * getParent()
Definition VPlan.h:475
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:553
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should be widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicateRecipe for VPI.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3143
A recipe for handling reduction phis.
Definition VPlan.h:2679
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2726
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2719
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2737
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3034
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4353
const VPBlockBase * getEntry() const
Definition VPlan.h:4397
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4429
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:857
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4414
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4473
bool hasCanonicalIVNUW() const
Indicates if NUW is set for the canonical IV increment, for loop regions.
Definition VPlan.h:4478
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4481
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4465
const VPBlockBase * getExiting() const
Definition VPlan.h:4409
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4422
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3188
bool isSingleScalar() const
Definition VPlan.h:3236
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
bool isPredicated() const
Definition VPlan.h:3238
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3255
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3988
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:605
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:670
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:373
operand_range operands()
Definition VPlanValue.h:441
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:417
unsigned getNumOperands() const
Definition VPlanValue.h:411
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:412
void addOperand(VPValue *Operand)
Definition VPlanValue.h:406
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1486
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:204
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1489
unsigned getNumUsers() const
Definition VPlanValue.h:115
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1495
user_range users()
Definition VPlanValue.h:157
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2144
A recipe for widening Call instructions using library calls.
Definition VPlan.h:1976
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1818
Instruction::CastOps getOpcode() const
Definition VPlan.h:1856
A recipe for handling GEP instructions.
Definition VPlan.h:2080
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2359
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2387
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2390
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2410
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2441
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2488
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2492
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2519
A recipe for widening vector intrinsics.
Definition VPlan.h:1870
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
A common mixin class for widening memory operations.
Definition VPlan.h:3493
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
A recipe for widened phis.
Definition VPlan.h:2577
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1762
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1782
unsigned getOpcode() const
Definition VPlan.h:1799
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4501
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4826
bool hasVF(ElementCount VF) const
Definition VPlan.h:4724
const DataLayout & getDataLayout() const
Definition VPlan.h:4706
LLVMContext & getContext() const
Definition VPlan.h:4702
VPBasicBlock * getEntry()
Definition VPlan.h:4597
bool hasScalableVF() const
Definition VPlan.h:4725
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4660
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4681
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4731
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4797
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4700
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4803
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4875
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4829
bool hasUF(unsigned UF) const
Definition VPlan.h:4749
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4650
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4690
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4687
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4774
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4800
void setVF(ElementCount VF)
Definition VPlan.h:4712
bool isUnrolled() const
Returns true if the VPlan already has been unrolled.
Definition VPlan.h:4765
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1076
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4752
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4674
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4626
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4852
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4794
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4602
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4697
bool hasScalarVFOnly() const
Definition VPlan.h:4742
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4640
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4693
void setUF(unsigned UF)
Definition VPlan.h:4757
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4907
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1232
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4808
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2815
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:190
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:116
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:137
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:557
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179
void stable_sort(R &&Range)
Definition STLExtras.h:2115
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2077
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:288
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:78
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:83
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:551
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1860
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FAddChainWithSubs
A chain of fadds and fsubs.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2087
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1408
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2165
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:325
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2145
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:305
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2661
Holds the VFShape for a specific scalar to vector function mapping.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1956
CallWideningKind
Choice for how to widen a call at a given VF.
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
PredicatedScalarEvolution & PSE
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetLibraryInfo & TLI
std::optional< CallWideningKind > getLegacyCallKind(CallInst *CI, ElementCount VF) const
Returns the legacy call widening decision for CI at VF, or std::nullopt if none was recorded.
const TargetTransformInfo & TTI
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:242
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:147
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:282
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:293
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3603
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3555
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3704
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3651
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decisions prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replace all uses of the canonical ...
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask by converting them to VP intrinsics.
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...