LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/TypeSwitch.h"
32#include "llvm/Analysis/Loads.h"
39#include "llvm/IR/Intrinsics.h"
40#include "llvm/IR/MDBuilder.h"
41#include "llvm/IR/Metadata.h"
46
47using namespace llvm;
48using namespace VPlanPatternMatch;
49using namespace SCEVPatternMatch;
50
/// Replace the VPPhi / VPInstruction ingredients of the visited blocks by
/// widened, memory, intrinsic, cast or replicating recipes. Returns false as
/// soon as a call is met that has no vector intrinsic ID or that is a
/// noalias.scope.decl; returns true once all visited ingredients are handled.
// NOTE(review): the line holding the return type and function name is not part
// of this excerpt (the embedded numbering skips 51).
52 VPlan &Plan, const TargetLibraryInfo &TLI) {
53
// NOTE(review): embedded lines 54 and 56 are absent; presumably they declare
// the block traversal and open the loop that binds VPBB -- confirm upstream.
55 Plan.getVectorLoopRegion());
57 // Stop the walk at the first block that has no parent (note: break, not
// continue).
58 if (!VPBB->getParent())
59 break;
60 VPRecipeBase *Term = VPBB->getTerminator();
61 auto EndIter = Term ? Term->getIterator() : VPBB->end();
62 // Introduce each ingredient into VPlan.
// The terminator, if present, is excluded from the range. Each handled
// ingredient is erased at the end of its iteration, hence the early-inc range.
63 for (VPRecipeBase &Ingredient :
64 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
65
66 VPValue *VPV = Ingredient.getVPSingleValue();
// Ingredients without an underlying value are left untouched.
67 if (!VPV->getUnderlyingValue())
68 continue;
69
// NOTE(review): embedded line 70 is absent; `Inst`, used below, is presumably
// declared there from VPV's underlying value -- confirm.
71
72 VPRecipeBase *NewRecipe = nullptr;
73 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
// Phis become VPWidenPHIRecipes that keep the operands, debug location and
// the name of the underlying PHINode.
74 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
75 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
76 Phi->getName());
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
84 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
85 NewRecipe = new VPWidenStoreRecipe(
86 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
87 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
88 Ingredient.getDebugLoc());
// NOTE(review): embedded line 89 is absent; presumably the `else if` that
// binds `GEP` for the branch below -- confirm.
90 NewRecipe = new VPWidenGEPRecipe(GEP->getSourceElementType(),
91 Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc(), GEP);
93 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
// Calls are only convertible when they map to a vector intrinsic ID.
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
97
98 // The noalias.scope.decl intrinsic declares a noalias scope that
99 // is valid for a single iteration. Emitting it as a single-scalar
100 // replicate would incorrectly extend the scope across multiple
101 // original iterations packed into one vector iteration.
102 // FIXME: If we want to vectorize this loop, then we have to drop
103 // all the associated !alias.scope and !noalias.
104 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
105 return false;
106
107 // These intrinsics are recognized by getVectorIntrinsicIDForCall
108 // but are not widenable. Emit them as replicate instead of widening.
109 if (VectorID == Intrinsic::assume ||
110 VectorID == Intrinsic::lifetime_end ||
111 VectorID == Intrinsic::lifetime_start ||
112 VectorID == Intrinsic::sideeffect ||
113 VectorID == Intrinsic::pseudoprobe) {
114 // If the operand of llvm.assume holds before vectorization, it will
115 // also hold per lane.
116 // llvm.pseudoprobe requires to be duplicated per lane for accurate
117 // sample count.
// Hence assume and pseudoprobe are replicated per lane; the remaining
// intrinsics of this group are emitted as single-scalar replicates.
118 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
119 VectorID != Intrinsic::pseudoprobe;
120 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
121 /*IsSingleScalar=*/IsSingleScalar,
122 /*Mask=*/nullptr, *VPI, *VPI,
123 Ingredient.getDebugLoc());
124 } else {
// drop_end removes the last ingredient operand before widening
// (presumably the called function -- confirm).
125 NewRecipe = new VPWidenIntrinsicRecipe(
126 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
127 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
128 }
129 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
130 NewRecipe = new VPWidenCastRecipe(
131 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
132 VPIRFlags(*CI), VPIRMetadata(*CI));
133 } else {
// Every other instruction is widened generically.
134 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
135 *VPI, Ingredient.getDebugLoc());
136 }
137 } else {
// NOTE(review): embedded line 138 is absent; it presumably opens the assert
// whose message follows. Such ingredients are kept as they are.
139 "inductions must be created earlier");
140 continue;
141 }
142
// Put the new recipe in place of the ingredient, redirect the users when the
// new recipe defines a value, then drop the ingredient.
143 NewRecipe->insertBefore(&Ingredient);
144 if (NewRecipe->getNumDefinedValues() == 1)
145 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
146 else
147 assert(NewRecipe->getNumDefinedValues() == 0 &&
148 "Only recpies with zero or one defined values expected");
149 Ingredient.eraseFromParent();
150 }
151 }
152 return true;
153}
154
155/// Helper for extra no-alias checks via known-safe recipe and SCEV.
// NOTE(review): embedded lines 156-157 are absent; presumably the class head
// and the declaration of the ExcludeRecipes member used below -- confirm.
158 VPReplicateRecipe &GroupLeader;
// PSE and L stay null when the single-argument constructor is used.
159 PredicatedScalarEvolution *PSE = nullptr;
160 const Loop *L = nullptr;
161
162 // Return true if \p A and \p B are known to not alias for all VFs in the
163 // plan, checked via the distance between the accesses.
164 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
// Only pairs of stores are handled.
165 if (A->getOpcode() != Instruction::Store ||
166 B->getOpcode() != Instruction::Store)
167 return false;
168
// Without SCEV context the only answer that can be given is whether both
// pointers refer to the very same recipe.
169 if (!PSE || !L)
170 return A == B;
171
172 VPValue *AddrA = A->getOperand(1);
173 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, *PSE, L);
174 VPValue *AddrB = B->getOperand(1);
175 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, *PSE, L);
// NOTE(review): embedded line 176 is absent; presumably the condition that
// guards the bail-out below (e.g. a SCEV that could not be computed).
177 return false;
178
// The difference of the two address SCEVs must be a constant.
179 const APInt *Distance;
180 ScalarEvolution &SE = *PSE->getSE();
181 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
182 return false;
183
184 const DataLayout &DL = SE.getDataLayout();
185 Type *TyA = A->getOperand(0)->getScalarType();
186 uint64_t SizeA = DL.getTypeStoreSize(TyA);
187 Type *TyB = B->getOperand(0)->getScalarType();
188 uint64_t SizeB = DL.getTypeStoreSize(TyB);
189
190 // Use the maximum store size to ensure no overlap from either direction.
191 // Currently only handles fixed sizes, as it is only used for
192 // replicating VPReplicateRecipes.
193 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
194
195 auto VFs = B->getParent()->getPlan()->vectorFactors();
// NOTE(review): embedded line 196 is absent; `MaxVF` is presumably derived
// from VFs there -- confirm.
197 if (MaxVF.isScalable())
198 return false;
// No alias when |Distance| >= MaxVF * MaxStoreSize.
199 return Distance->abs().uge(
200 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
201 }
202
203public:
// NOTE(review): embedded lines 204-205 are absent; they presumably start this
// constructor and declare its ExcludeRecipes, GroupLeader and PSE parameters.
206 const Loop &L)
207 : ExcludeRecipes(ExcludeRecipes.begin(), ExcludeRecipes.end()),
208 GroupLeader(GroupLeader), PSE(&PSE), L(&L) {}
209
// Variant without SCEV context: PSE and L keep their null defaults.
210 SinkStoreInfo(VPReplicateRecipe &GroupLeader) : GroupLeader(GroupLeader) {}
211
212 /// Return true if \p R should be skipped during alias checking, either
213 /// because it's in the exclude set or because no-alias can be proven via
214 /// SCEV.
215 bool shouldSkip(VPRecipeBase &R) const {
// Store is null when R is not a VPReplicateRecipe; the distance check is
// only attempted for a non-null Store.
216 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
217 return ExcludeRecipes.contains(Store) ||
218 (Store && isNoAliasViaDistance(Store, &GroupLeader));
219 }
220};
221
222/// Check if a memory operation doesn't alias with memory operations using
223/// scoped noalias metadata, in blocks in the single-successor chain between \p
224/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
225/// write to memory are checked (for load hoisting). Otherwise recipes that may
226/// read or write memory are checked, and SCEV is used to prove no-alias
227/// between the group leader and other replicate recipes (for store sinking).
/// Returns false when \p MemLoc carries no scope metadata or when a checked
/// recipe has no known memory location.
228static bool
// NOTE(review): embedded line 229 is absent; it presumably holds the function
// name and the MemLoc parameter used below.
230 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
231 std::optional<SinkStoreInfo> SinkInfo = {}) {
232 bool CheckReads = SinkInfo.has_value();
// Nothing can be proven without scope metadata on the location.
233 if (!MemLoc.AATags.Scope)
234 return false;
235
236 for (VPBasicBlock *VPBB :
// NOTE(review): embedded line 237 is absent; presumably the range expression
// walking the blocks from FirstBB to LastBB.
238 for (VPRecipeBase &R : *VPBB) {
239 if (SinkInfo && SinkInfo->shouldSkip(R))
240 continue;
241
242 // Skip recipes that don't need checking.
// Writers are always checked; readers only when CheckReads is set.
243 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
244 continue;
245
// NOTE(review): embedded line 246 is absent; `Loc` is presumably the memory
// location queried for R.
247 if (!Loc)
248 // Conservatively assume aliasing for memory operations without
249 // location.
250 return false;
251
// NOTE(review): embedded line 252 is absent; presumably the scoped-noalias
// comparison between MemLoc and Loc that guards this bail-out.
253 return false;
254 }
255 }
256 return true;
257}
258
259/// Get the value type of the replicate load or store. \p IsLoad indicates
260/// whether it is a load.
// NOTE(review): embedded line 261 (the function head declaring R and IsLoad) is
// absent from this excerpt.
// For a load the type is taken from R itself, for a store from operand 0.
262 return (IsLoad ? R : R->getOperand(0))->getScalarType();
263}
264
265/// Collect either replicated Loads or Stores grouped by their address SCEV and
266/// their load-store type, in a deep-traversal of the vector loop region in \p
267/// Plan.
/// Only recipes accepted by \p FilterFn are collected, and recipes whose
/// address SCEV could not be computed are left out. Each returned group is
/// sorted using the dominator tree of \p Plan.
268template <unsigned Opcode>
// NOTE(review): embedded lines 269-270 are absent; they presumably hold the
// return type and the function name.
271 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
272 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
273 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
274 "Only Load and Store opcodes supported");
275 constexpr bool IsLoad = (Opcode == Instruction::Load);
// NOTE(review): embedded lines 276-277 and 279-280 are absent; presumably the
// type of the map declared here and the loop over the region's blocks that
// binds VPBB.
278 RecipesByAddressAndType;
281 for (VPRecipeBase &R : *VPBB) {
282 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
283 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
284 continue;
285
286 // For loads, operand 0 is address; for stores, operand 1 is address.
287 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
288 const Type *LoadStoreTy = getLoadStoreValueType(RepR, IsLoad);
289 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
// The (address SCEV, value type) pair is the grouping key.
290 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
291 RecipesByAddressAndType[{AddrSCEV, LoadStoreTy}].push_back(RepR);
292 }
293 }
294 auto Groups = to_vector(RecipesByAddressAndType.values());
295 VPDominatorTree VPDT(Plan);
296 for (auto &Group : Groups) {
297 // Sort mem ops by dominance order, with earliest (most dominating) first.
// NOTE(review): embedded line 298 is absent; presumably the sort call and the
// head of the comparator lambda taking A and B.
299 return VPDT.properlyDominates(A, B);
300 });
301 }
302 return Groups;
303}
304
/// Sink recipes that define operands of recipes in replicate regions into the
/// block that uses them, following operand chains through a worklist. Returns
/// true if at least one recipe was moved.
305static bool sinkScalarOperands(VPlan &Plan) {
306 auto Iter = vp_depth_first_deep(Plan.getEntry());
307 bool ScalarVFOnly = Plan.hasScalarVFOnly();
308 bool Changed = false;
309
// NOTE(review): embedded line 310 is absent; presumably the declaration of
// WorkList, holding (SinkTo, Candidate) pairs.
// Adds the recipe defining Op to the worklist if it may be sunk into SinkTo.
311 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
312 VPBasicBlock *SinkTo, VPValue *Op) {
313 auto *Candidate =
314 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
315 if (!Candidate)
316 return;
317
318 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
319 // for now.
// NOTE(review): embedded line 320 is absent; presumably the type check that
// guards this return.
321 return;
322
// Nothing to do if the candidate already lives in SinkTo, or if it must
// not be sunk.
323 if (Candidate->getParent() == SinkTo ||
324 vputils::cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
325 return;
326
// Single-scalar replicates are only candidates for scalar-VF-only plans.
327 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
328 if (!ScalarVFOnly && RepR->isSingleScalar())
329 return;
330
331 WorkList.insert({SinkTo, Candidate});
332 };
333
334 // First, collect the operands of all recipes in replicate blocks as seeds for
335 // sinking.
// NOTE(review): embedded line 336 is absent; presumably the loop over the
// regions reachable through Iter that binds VPR.
337 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
338 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
339 continue;
// Only the first successor of the entry is considered, and only if it leads
// straight to the exiting block of the region.
340 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
341 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
342 continue;
343 for (auto &Recipe : *VPBB)
344 for (VPValue *Op : Recipe.operands())
345 InsertIfValidSinkCandidate(VPBB, Op);
346 }
347
348 // Try to sink each replicate or scalar IV steps recipe in the worklist.
// Index-based loop: the worklist may grow while it is being processed.
349 for (unsigned I = 0; I != WorkList.size(); ++I) {
350 VPBasicBlock *SinkTo;
351 VPSingleDefRecipe *SinkCandidate;
352 std::tie(SinkTo, SinkCandidate) = WorkList[I];
353
354 // All recipe users of SinkCandidate must be in the same block SinkTo or all
355 // users outside of SinkTo must only use the first lane of SinkCandidate. In
356 // the latter case, we need to duplicate SinkCandidate.
357 auto UsersOutsideSinkTo =
358 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
359 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
360 });
361 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
362 return !U->usesFirstLaneOnly(SinkCandidate);
363 }))
364 continue;
365 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
366
367 if (NeedsDuplicating) {
// Duplication is not done for scalar-VF-only plans.
368 if (ScalarVFOnly)
369 continue;
370 VPSingleDefRecipe *Clone;
371 if (auto *SinkCandidateRepR =
372 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
373 // TODO: Handle converting to uniform recipes as separate transform,
374 // then cloning should be sufficient here.
// NOTE(review): embedded line 375 is absent; presumably the assignment to
// Clone and the start of the constructor call continued below.
376 SinkCandidateRepR->getOpcode(), SinkCandidate->operands(),
377 /*Mask=*/nullptr, *SinkCandidateRepR, *SinkCandidateRepR,
378 SinkCandidate->getDebugLoc(), SinkCandidate->getUnderlyingInstr());
379 // TODO: add ".cloned" suffix to name of Clone's VPValue.
380 } else {
381 Clone = SinkCandidate->clone();
382 }
383
// The clone stays at the original position and takes over all users that
// are outside SinkTo.
384 Clone->insertBefore(SinkCandidate);
385 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
386 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
387 });
388 }
// Move the candidate and queue its own operands as new candidates.
389 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
390 for (VPValue *Op : SinkCandidate->operands())
391 InsertIfValidSinkCandidate(SinkTo, Op);
392 Changed = true;
393 }
394 return Changed;
395}
396
397/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
398/// the mask.
/// Returns nullptr unless the entry is a VPBasicBlock holding exactly one
/// recipe and that recipe is a VPBranchOnMaskRecipe.
// NOTE(review): embedded line 399 (the function head declaring R) is absent
// from this excerpt.
400 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
401 if (!EntryBB || EntryBB->size() != 1 ||
402 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
403 return nullptr;
404
// The mask is operand 0 of the branch-on-mask recipe.
405 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
406}
407
408/// If \p R is a triangle region, return the 'then' block of the triangle.
/// Returns nullptr if the entry does not have exactly two VPBasicBlock
/// successors, or if they do not form a triangle.
// NOTE(review): embedded line 409 (the function head declaring R) is absent
// from this excerpt.
410 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
411 if (EntryBB->getNumSuccessors() != 2)
412 return nullptr;
413
414 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
415 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
416 if (!Succ0 || !Succ1)
417 return nullptr;
418
// The two successors must have exactly one outgoing edge between them.
419 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
420 return nullptr;
// The 'then' block is the successor that branches to the other one.
421 if (Succ0->getSingleSuccessor() == Succ1)
422 return Succ0;
423 if (Succ1->getSingleSuccessor() == Succ0)
424 return Succ1;
425 return nullptr;
426}
427
428// Merge replicate regions in their successor region, if a replicate region
429// is connected to a successor replicate region with the same predicate by a
430// single, empty VPBasicBlock.
// Returns true if at least one region was merged.
// NOTE(review): embedded line 431 (the function head declaring Plan) is absent
// from this excerpt.
432 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
433
434 // Collect replicate regions followed by an empty block, followed by another
435 // replicate region with matching masks to process upfront. This is to avoid
436 // iterator invalidation issues while merging regions.
// NOTE(review): embedded lines 437-438 are absent; presumably the WorkList
// declaration and the head of the loop that binds Region1.
439 vp_depth_first_deep(Plan.getEntry()))) {
440 if (!Region1->isReplicator())
441 continue;
442 auto *MiddleBasicBlock =
443 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
444 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
445 continue;
446
447 auto *Region2 =
448 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
449 if (!Region2 || !Region2->isReplicator())
450 continue;
451
// Both regions must branch on the very same mask value.
452 VPValue *Mask1 = getPredicatedMask(Region1);
453 VPValue *Mask2 = getPredicatedMask(Region2);
454 if (!Mask1 || Mask1 != Mask2)
455 continue;
456
457 assert(Mask1 && Mask2 && "both region must have conditions");
458 WorkList.push_back(Region1);
459 }
460
461 // Move recipes from Region1 to its successor region, if both are triangles.
462 for (VPRegionBlock *Region1 : WorkList) {
463 if (TransformedRegions.contains(Region1))
464 continue;
465 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
466 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
467
468 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
469 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
470 if (!Then1 || !Then2)
471 continue;
472
473 // Note: No fusion-preventing memory dependencies are expected in either
474 // region. Such dependencies should be rejected during earlier dependence
475 // checks, which guarantee accesses can be re-ordered for vectorization.
476 //
477 // Move recipes to the successor region.
// Walking Then1 in reverse while always inserting at the first non-phi
// position of Then2 keeps the moved recipes in their original order.
478 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
479 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
480
481 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
482 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
483
484 // Move VPPredInstPHIRecipes from the merge block to the successor region's
485 // merge block. Update all users inside the successor region to use the
486 // original values.
487 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
488 VPValue *PredInst1 =
489 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
490 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
491 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
492 return cast<VPRecipeBase>(&U)->getParent() == Then2;
493 });
494
495 // Remove phi recipes that are unused after merging the regions.
496 if (Phi1ToMove.getVPSingleValue()->user_empty()) {
497 Phi1ToMove.eraseFromParent();
498 continue;
499 }
500 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
501 }
502
503 // Remove the dead recipes in Region1's entry block.
504 for (VPRecipeBase &R :
505 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
506 R.eraseFromParent();
507
508 // Finally, remove the first region.
// Its predecessors are rewired to the middle block.
509 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
510 VPBlockUtils::disconnectBlocks(Pred, Region1);
511 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
512 }
513 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
514 TransformedRegions.insert(Region1);
515 }
516
517 return !TransformedRegions.empty();
518}
519
/// Build a replicate region made of an entry block branching on the mask of
/// \p PredRecipe, an ".if" block holding an unmasked copy of the recipe and a
/// ".continue" exiting block. \p PredRecipe is erased; the new region is
/// returned with \p ParentRegion as its parent.
// NOTE(review): embedded line 520 is absent; it presumably holds the return
// type, the function name and the PredRecipe parameter.
521 VPRegionBlock *ParentRegion,
522 VPlan &Plan) {
523 Instruction *Instr = PredRecipe->getUnderlyingInstr();
524 // Build the triangular if-then region.
525 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
526 assert(Instr->getParent() && "Predicated instruction not in any basic block");
527 auto *BlockInMask = PredRecipe->getMask();
// The branch reuses the debug location of the recipe defining the mask, if
// there is one.
528 auto *MaskDef = BlockInMask->getDefiningRecipe();
529 auto *BOMRecipe = new VPBranchOnMaskRecipe(
530 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
531 auto *Entry =
532 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
533
534 // Replace predicated replicate recipe with a replicate recipe without a
535 // mask but in the replicate region.
536 auto *RecipeWithoutMask = new VPReplicateRecipe(
537 PredRecipe->getUnderlyingInstr(), PredRecipe->operandsWithoutMask(),
538 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
539 PredRecipe->getDebugLoc());
540 auto *Pred =
541 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
542 auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
// NOTE(review): embedded line 543 is absent; presumably the declaration of
// `Region` initialised by the call below.
544 Plan.createReplicateRegion(Entry, Exiting, RegionName);
545
546 // Note: first set Entry as region entry and then connect successors starting
547 // from it in order, to propagate the "parent" of each VPBasicBlock.
548 Region->setParent(ParentRegion);
549 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
550 VPBlockUtils::connectBlocks(Pred, Exiting);
551
// A phi in the exiting block is only needed when the predicated value has
// users.
552 if (!PredRecipe->user_empty()) {
553 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
554 RecipeWithoutMask->getDebugLoc());
555 Exiting->appendRecipe(PHIRecipe);
556 PredRecipe->replaceAllUsesWith(PHIRecipe);
557 }
558 PredRecipe->eraseFromParent();
559 return Region;
560}
561
/// Wrap every predicated VPReplicateRecipe of \p Plan into its own replicate
/// region, splitting the containing block at the recipe.
562static void addReplicateRegions(VPlan &Plan) {
// NOTE(review): embedded lines 563-564 are absent; presumably the WorkList
// declaration and the head of the loop that binds VPBB.
565 vp_depth_first_deep(Plan.getEntry()))) {
// Collect first, transform afterwards: the second loop changes the blocks.
566 for (VPRecipeBase &R : *VPBB)
567 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
568 if (RepR->isPredicated())
569 WorkList.push_back(RepR);
570 }
571 }
572
573 unsigned BBNum = 0;
574 for (VPReplicateRecipe *RepR : WorkList) {
575 VPBasicBlock *CurrentBlock = RepR->getParent();
576 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
577
// The split-off block is named after the original IR block plus a running
// number, or left unnamed when the IR block has no name.
578 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
579 SplitBlock->setName(
580 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
581 // Record predicated instructions for above packing optimizations.
// NOTE(review): embedded lines 582 and 584 are absent; presumably the
// declaration of `Region` and the step that places it between CurrentBlock
// and SplitBlock.
583 createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
585
// If the block that was split used to be the exiting block of its region,
// the split-off tail takes over that role.
586 VPRegionBlock *ParentRegion = Region->getParent();
587 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
588 ParentRegion->setExiting(SplitBlock);
589 }
590}
591
/// Fold each VPBasicBlock inside a region into its single predecessor when
/// that predecessor is a VPBasicBlock (not a VPIRBasicBlock) with exactly one
/// successor. Returns true if any block was selected for folding.
// NOTE(review): embedded lines 592-594 are absent; presumably the function
// head, the WorkList declaration and the head of the loop that binds VPBB.
595 vp_depth_first_deep(Plan.getEntry()))) {
596 // Don't fold the blocks in the skeleton of the Plan into their single
597 // predecessors for now.
598 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
599 if (!VPBB->getParent())
600 continue;
601 auto *PredVPBB =
602 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
603 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
604 isa<VPIRBasicBlock>(PredVPBB))
605 continue;
606 WorkList.push_back(VPBB);
607 }
608
609 for (VPBasicBlock *VPBB : WorkList) {
610 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
// Append all recipes of VPBB to the end of the predecessor.
611 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
612 R.moveBefore(*PredVPBB, PredVPBB->end());
613 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
// Keep the region's exiting block up to date if VPBB had that role.
614 auto *ParentRegion = VPBB->getParent();
615 if (ParentRegion && ParentRegion->getExiting() == VPBB)
616 ParentRegion->setExiting(PredVPBB);
617 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
618 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
619 }
620 return !WorkList.empty();
621}
622
// NOTE(review): embedded lines 623 and 625 are absent; presumably the function
// head and the call that performs the conversion described below.
624 // Convert masked VPReplicateRecipes to if-then region blocks.
626
// Repeat the three simplifications until none of them reports a change.
627 bool ShouldSimplify = true;
628 while (ShouldSimplify) {
629 ShouldSimplify = sinkScalarOperands(Plan);
630 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
631 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
632 }
633}
634
635/// Remove redundant casts of inductions.
636///
637/// Such redundant casts are casts of induction variables that can be ignored,
638/// because we already proved that the casted phi is equal to the uncasted phi
639/// in the vectorized loop. There is no need to vectorize the cast - the same
640/// value can be used for both the phi and casts in the vector loop.
// NOTE(review): embedded lines 641 and 643 are absent; presumably the function
// head and the declaration of `IV` derived from Phi.
642 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
// Inductions that carry a truncate instruction are not handled here.
644 if (!IV || IV->getTruncInst())
645 continue;
646
647 // A sequence of IR Casts has potentially been recorded for IV, which
648 // *must be bypassed* when the IV is vectorized, because the vectorized IV
649 // will produce the desired casted value. This sequence forms a def-use
650 // chain and is provided in reverse order, ending with the cast that uses
651 // the IV phi. Search for the recipe of the last cast in the chain and
652 // replace it with the original IV. Note that only the final cast is
653 // expected to have users outside the cast-chain and the dead casts left
654 // over will be cleaned up later.
655 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
656 VPValue *FindMyCast = IV;
657 for (Instruction *IRCast : reverse(Casts)) {
// Look among the users of the current value for the recipe that wraps
// this IR cast.
658 VPSingleDefRecipe *FoundUserCast = nullptr;
659 for (auto *U : FindMyCast->users()) {
660 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
661 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
662 FoundUserCast = UserCast;
663 break;
664 }
665 }
666 // A cast recipe in the chain may have been removed by earlier DCE.
667 if (!FoundUserCast)
668 break;
669 FindMyCast = FoundUserCast;
670 }
// Only rewrite when at least one cast of the chain was found.
671 if (FindMyCast != IV)
672 FindMyCast->replaceAllUsesWith(IV);
673 }
674}
675
/// Create scalar IV steps for an induction: a derived IV is built from the
/// canonical IV of the vector loop region, \p StartV and \p Step, truncated to
/// the type of \p TruncI if that is given, and then fed together with the
/// (possibly truncated) step into a scalar-IV-steps recipe.
// NOTE(review): embedded lines 676-677 are absent; they presumably hold the
// return type, the function name and the Plan and Kind parameters.
678 Instruction::BinaryOps InductionOpcode,
679 FPMathOperator *FPBinOp, Instruction *TruncI,
680 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
681 VPBuilder &Builder) {
682 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
683 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
684 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
685 VPSingleDefRecipe *BaseIV =
686 Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
687
688 // Truncate base induction if needed.
689 Type *ResultTy = BaseIV->getScalarType();
690 if (TruncI) {
691 Type *TruncTy = TruncI->getType();
692 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
693 "Not truncating.");
694 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
695 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
696 ResultTy = TruncTy;
697 }
698
699 // Truncate step if needed.
700 Type *StepTy = Step->getScalarType();
701 if (ResultTy != StepTy) {
702 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
703 "Not truncating.");
704 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
705 auto *VecPreheader =
// NOTE(review): embedded line 706 is absent; presumably the expression that
// yields the vector preheader block.
// The guard restores the builder's insert point when this scope ends, so
// only the step truncation is emitted in the preheader.
707 VPBuilder::InsertPointGuard Guard(Builder);
708 Builder.setInsertPoint(VecPreheader);
709 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
710 }
711 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
712 &Plan.getVF(), DL);
713}
714
/// Replace the wide canonical IV recipe of \p Plan, if there is one, by
/// scalar IV steps, by an existing canonical widened induction, or by a newly
/// created widened induction when the cost and register-pressure checks below
/// allow it.
// NOTE(review): the function name is not visible in this excerpt. Embedded
// line 717 is absent as well; it presumably declares the CostKind, VF and UF
// parameters used in the body.
715 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
718 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
719 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
720 if (!LoopRegion)
721 return;
722
723 auto *WideCanIV =
// NOTE(review): embedded line 724 is absent; presumably the lookup of the wide
// canonical IV recipe.
725 if (!WideCanIV)
726 return;
727
728 Type *CanIVTy = LoopRegion->getCanonicalIVType();
729
730 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
731 // IV.
732 if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(WideCanIV)) {
733 VPBuilder Builder(WideCanIV);
// Integer induction starting at zero with a step of one.
734 WideCanIV->replaceAllUsesWith(createScalarIVSteps(
735 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
736 nullptr, Plan.getZero(CanIVTy), Plan.getConstantInt(CanIVTy, 1),
737 WideCanIV->getDebugLoc(), Builder));
738 WideCanIV->eraseFromParent();
739 return;
740 }
741
// Leave the recipe alone when only scalar values of it are used.
742 if (vputils::onlyScalarValuesUsed(WideCanIV))
743 return;
744
745 // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
746 // in the header, reuse it instead of introducing another wide induction phi.
747 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
748 for (VPRecipeBase &Phi : Header->phis()) {
// NOTE(review): embedded line 749 is absent; presumably the declaration of
// `WidenIV` bound by the matcher below.
750 if (!match(&Phi, m_CanonicalWidenIV(WidenIV)))
751 continue;
752 // The reused wide IV feeds the header mask, whose lanes may extend past
753 // the trip count; drop flags that only hold inside the scalar loop.
754 WidenIV->dropPoisonGeneratingFlags();
755 WideCanIV->replaceAllUsesWith(WidenIV);
756 WideCanIV->eraseFromParent();
757 return;
758 }
759
760 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable.
761 auto *VecTy = VectorType::get(CanIVTy, VF);
762 InstructionCost BroadcastCost = TTI.getShuffleCost(
// NOTE(review): embedded line 763 is absent; presumably the arguments of the
// shuffle-cost query.
764 InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
765 if (PHICost > BroadcastCost)
766 return;
767
768 // Bail out if the additional wide induction phi increases the expected spill
769 // cost.
// The per-class register usage is scaled by UF before one more value of type
// VecTy is added for the projection.
770 VPRegisterUsage UnrolledBase =
771 calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
772 for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
773 NumUsers *= UF;
774 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
775 VPRegisterUsage Projected = UnrolledBase;
776 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
777 if (Projected.spillCost(TTI, CostKind) >
778 UnrolledBase.spillCost(TTI, CostKind))
779 return;
780
// NOTE(review): embedded lines 781-782 are absent; presumably the construction
// of the induction descriptor `ID` used below.
783 VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
784 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
785 /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
786 WideCanIV->getNoWrapFlags(), WideCanIV->getDebugLoc());
787 NewWideIV->insertBefore(&*Header->getFirstNonPhi());
788 WideCanIV->replaceAllUsesWith(NewWideIV);
789 WideCanIV->eraseFromParent();
790}
791
792/// Returns true if \p R is dead and can be removed.
/// Predicated assumes are always reported dead; otherwise a recipe is dead if
/// it has no side effects and none of its defined values has a user.
793static bool isDeadRecipe(VPRecipeBase &R) {
794 // Do remove conditional assume instructions as their conditions may be
795 // flattened.
796 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
797 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
// NOTE(review): embedded line 798 is absent; presumably the test that RepR is
// an llvm.assume call, completing the condition above.
799 if (IsConditionalAssume)
800 return true;
801
802 if (R.mayHaveSideEffects())
803 return false;
804
805 // Recipe is dead if no user keeps the recipe alive.
806 return all_of(R.definedValues(), [](VPValue *V) { return V->user_empty(); });
807}
808
/// Erase dead recipes from the visited blocks, including dead pairs formed by
/// a two-operand VPPhi and the recipe that both uses it and defines its second
/// operand.
// NOTE(review): embedded lines 809-810 and 812 are absent; presumably the
// function head, the start of the traversal declaration and the head of the
// loop that binds VPBB.
811 Plan.getEntry());
813 // The recipes in the block are processed in reverse order, to catch chains
814 // of dead recipes.
815 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
816 if (isDeadRecipe(R)) {
817 R.eraseFromParent();
818 continue;
819 }
820
821 // Check if R is a dead VPPhi <-> update cycle and remove it.
822 VPValue *Start, *Incoming;
823 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
824 continue;
825 auto *PhiR = cast<VPPhi>(&R);
826 VPUser *PhiUser = PhiR->getSingleUser();
827 if (!PhiUser)
828 continue;
// The cycle is dead when the only user of the phi is the recipe defining
// Incoming and the only user of Incoming is (by the count of one) the phi.
829 if (PhiUser != Incoming->getDefiningRecipe() ||
830 Incoming->getNumUsers() != 1)
831 continue;
// Redirect the phi's uses to Start before erasing both recipes.
832 PhiR->replaceAllUsesWith(Start);
833 PhiR->eraseFromParent();
834 Incoming->getDefiningRecipe()->eraseFromParent();
835 }
836 }
837}
838
/// Collect the transitive users reachable from the values defined by the
/// elements of the `Users` worklist and return them as a vector.
// NOTE(review): embedded lines 839-840 and 842 are absent; presumably the
// function head, the declaration and seeding of `Users`, and the declaration
// of `Cur` derived from Users[I].
// Index-based loop: Users grows while it is being walked.
841 for (unsigned I = 0; I != Users.size(); ++I) {
843 for (VPValue *V : Cur->definedValues())
844 Users.insert_range(V->users());
845 }
846 return Users.takeVector();
847}
848
849/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
850/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
851/// generates scalar values.
/// Returns the created PtrAdd; the users of \p PtrIV are not touched here.
852static VPValue *
// NOTE(review): embedded lines 853, 855 and 858 are absent; presumably the
// function name with the PtrIV parameter, the declaration of the induction
// descriptor `ID`, and the declaration of `Steps` initialised by the call
// whose arguments follow.
854 VPlan &Plan, VPBuilder &Builder) {
// The steps start at zero of the step's type and advance by operand 1 of
// PtrIV.
856 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
857 VPValue *StepV = PtrIV->getOperand(1);
859 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
860 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
861
862 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
863 PtrIV->getDebugLoc(), "next.gep");
864}
865
866 /// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
867 /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
868 /// VPWidenPointerInductionRecipe will generate vectors only. If some users
869 /// require vectors while others require scalars, the scalar uses need to extract
870 /// the scalars from the generated vectors (Note that this is different to how
871 /// int/fp inductions are handled). Legalize extract-from-ends using uniform
872 /// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
873 /// the correct end value is available. Also optimize
874 /// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
875 /// providing them scalar steps built on the canonical scalar IV and update the
876 /// original IV's users. This is an optional optimization to reduce the need for
877 /// vector extracts.
// NOTE(review): the declaration of this function and the definition of
// HeaderVPBB (used below) are not visible in this listing.
880 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
// All recipes created through Builder are inserted at the first non-phi
// position of HeaderVPBB.
881 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
882 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
// Only wide induction phis are processed; every other header phi is skipped.
883 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
884 if (!PhiR)
885 continue;
886
887 // Try to narrow wide and replicating recipes to uniform recipes, based on
888 // VPlan analysis.
889 // TODO: Apply to all recipes in the future, to replace legacy uniformity
890 // analysis.
891 auto Users = collectUsersRecursively(PhiR);
// The collected users are visited in reverse of the order they were collected.
892 for (VPUser *U : reverse(Users)) {
893 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
894 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
895 // Skip recipes that shouldn't be narrowed.
896 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
897 Def->user_empty() || !Def->getUnderlyingValue() ||
898 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
899 continue;
900
901 // Skip recipes that may have other lanes than their first used.
903 continue;
904
905 // TODO: Support scalarizing ExtractValue.
906 if (match(Def,
908 continue;
909
// NOTE(review): the start of the statement that creates Clone is not visible
// in this listing; the visible arguments pass a null mask and an unknown
// debug location.
911 Def->getUnderlyingInstr()->getOpcode(), Def->operands(),
912 /*Mask=*/nullptr, *Def, {}, DebugLoc::getUnknown(),
913 Def->getUnderlyingInstr());
// Clone is placed right after Def and takes over all of Def's users. Def is
// not erased here.
914 Clone->insertAfter(Def);
915 Def->replaceAllUsesWith(Clone);
916 }
917
918 // Replace wide pointer inductions which have only their scalars used by
919 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
920 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
// Keep the wide pointer induction unless the plan only has a scalar VF or
// the recipe reports that only scalars are generated for it.
921 if (!Plan.hasScalarVFOnly() &&
922 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
923 continue;
924
925 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
926 PtrIV->replaceAllUsesWith(PtrAdd);
927 continue;
928 }
929
930 // Replace widened induction with scalar steps for users that only use
931 // scalars.
// The cast asserts that any wide induction that is not a pointer induction is
// an int/fp induction.
932 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
// With vector VFs present, nothing needs to be done if no user uses scalars.
933 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
934 return U->usesScalars(WideIV);
935 }))
936 continue;
937
938 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
// NOTE(review): the start of the statement defining Steps is not visible in
// this listing; the visible arguments forward the induction kind, opcode,
// FP binary operator (if any), truncate instruction, start and step.
940 Plan, ID.getKind(), ID.getInductionOpcode(),
941 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
942 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
943 WideIV->getDebugLoc(), Builder);
944
945 // Update scalar users of IV to use Step instead.
946 if (!HasOnlyVectorVFs) {
// Scalar-VF-only plan: every user is redirected to the scalar steps.
947 assert(!Plan.hasScalableVF() &&
948 "plans containing a scalar VF cannot also include scalable VFs");
949 WideIV->replaceAllUsesWith(Steps);
950 } else {
// Otherwise only some users are redirected: with scalable VFs those that use
// only the first lane, and with fixed VFs those that use scalars.
951 bool HasScalableVF = Plan.hasScalableVF();
952 WideIV->replaceUsesWithIf(Steps,
953 [WideIV, HasScalableVF](VPUser &U, unsigned) {
954 if (HasScalableVF)
955 return U.usesFirstLaneOnly(WideIV);
956 return U.usesScalars(WideIV);
957 });
958 }
959 }
960}
961
962 /// Check if \p VPV is an untruncated wide induction, either before or after the
963 /// increment. If so return the header IV (before the increment), otherwise
964 /// return null.
// \p PSE is only consulted for Instruction::Sub inductions, where the SCEVs of
// the subtracted value and of the induction step are compared.
// NOTE(review): the signature of this function is not visible in this listing.
967 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
968 if (WideIV) {
969 // VPV itself is a wide induction, separately compute the end value for exit
970 // users if it is not a truncated IV.
971 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
972 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
973 }
974
975 // Check if VPV is an optimizable induction increment.
// An increment candidate must be defined by a two-operand recipe that has a
// wide induction as its first or second operand.
976 VPRecipeBase *Def = VPV->getDefiningRecipe();
977 if (!Def || Def->getNumOperands() != 2)
978 return nullptr;
979 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
980 if (!WideIV)
981 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
982 if (!WideIV)
983 return nullptr;
984
// Returns true if VPV applies the induction's own opcode to WideIV and its
// step value.
985 auto IsWideIVInc = [&]() {
986 auto &ID = WideIV->getInductionDescriptor();
987
988 // Check if VPV increments the induction by the induction step.
989 VPValue *IVStep = WideIV->getStepValue();
990 switch (ID.getInductionOpcode()) {
991 case Instruction::Add:
992 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
993 case Instruction::FAdd:
994 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
// FSub is matched non-commutatively: the IV must be the first operand.
995 case Instruction::FSub:
996 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
997 m_Specific(IVStep)));
998 case Instruction::Sub: {
999 // IVStep will be the negated step of the subtraction. Check if Step == -1
1000 // * IVStep.
1001 VPValue *Step;
1002 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
1003 return false;
1004 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
1005 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
1006 ScalarEvolution &SE = *PSE.getSE();
// Both SCEVs must be computable, and IVStep must equal the negation of Step.
1007 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
1008 !isa<SCEVCouldNotCompute>(StepSCEV) &&
1009 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
1010 }
// Any other opcode is only accepted for pointer inductions, where VPV must be
// a GEP of the IV by its step.
1011 default:
1012 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
1013 match(VPV, m_GetElementPtr(m_Specific(WideIV),
1014 m_Specific(WideIV->getStepValue())));
1015 }
// Every path through the switch above returns.
1016 llvm_unreachable("should have been covered by switch above");
1017 };
1018 return IsWideIVInc() ? WideIV : nullptr;
1019}
1020
1021 /// Attempts to optimize the induction variable exit values for users in the
1022 /// early exit block.
// Returns nullptr when the exit value is not an optimizable induction;
// otherwise returns a newly built value for the induction at the first active
// lane of the matched mask.
// NOTE(review): the signature and the first line of the match below are not
// visible in this listing; Plan, Op and PSE are introduced there.
1025 VPValue *Incoming, *Mask;
1027 m_VPValue(Incoming))))
1028 return nullptr;
1029
1030 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1031 if (!WideIV)
1032 return nullptr;
1033
// Truncated int/fp inductions are not handled here.
1034 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1035 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1036 return nullptr;
1037
1038 // Calculate the final index.
1039 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1040 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1041 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
// New recipes are inserted at the position of the recipe defining Op.
1042 auto *ExtractR = cast<VPInstruction>(Op);
1043 VPBuilder B(ExtractR);
1044
1045 DebugLoc DL = ExtractR->getDebugLoc();
// EndValue = canonical IV + index of the first active lane of Mask, with the
// lane index converted to the canonical IV type.
1046 VPValue *FirstActiveLane = B.createFirstActiveLane(Mask, DL);
1047 FirstActiveLane = B.createScalarZExtOrTrunc(
1048 FirstActiveLane, CanonicalIVType, FirstActiveLane->getScalarType(), DL);
1049 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1050
1051 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1052 // changed it means the exit is using the incremented value, so we need to
1053 // add the step.
1054 if (Incoming != WideIV) {
1055 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1056 EndValue = B.createAdd(EndValue, One, DL);
1057 }
1058
// For inductions other than the canonical widened IV, turn the index computed
// above into the induction's value using its start and step.
1059 if (!match(WideIV, m_CanonicalWidenIV())) {
1060 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1061 VPIRValue *Start = WideIV->getStartValue();
1062 VPValue *Step = WideIV->getStepValue();
1063 EndValue = B.createDerivedIV(
1064 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1065 Start, EndValue, Step);
1066 }
1067
1068 return EndValue;
1069}
1070
1071 /// Compute the end value for \p WideIV, unless it is truncated. Creates a
1072 /// VPDerivedIVRecipe for non-canonical inductions.
// Returns nullptr for truncated int/fp inductions; otherwise returns the end
// value, with any new recipes created through \p VectorPHBuilder.
// NOTE(review): the first line of the signature is not visible in this
// listing.
1074 VPBuilder &VectorPHBuilder,
1075 VPValue *VectorTC) {
1076 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1077 // Truncated wide inductions resume from the last lane of their vector value
1078 // in the last vector iteration which is handled elsewhere.
1079 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1080 return nullptr;
1081
1082 VPIRValue *Start = WideIV->getStartValue();
1083 VPValue *Step = WideIV->getStepValue();
// NOTE(review): the definition of ID used below is not visible in this
// listing.
// For the canonical widened IV the end value is VectorTC itself.
1085 VPValue *EndValue = VectorTC;
1086 if (!match(WideIV, m_CanonicalWidenIV())) {
1087 EndValue = VectorPHBuilder.createDerivedIV(
1088 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1089 Start, VectorTC, Step);
1090 }
1091
1092 // EndValue is derived from the vector trip count (which has the same type as
1093 // the widest induction) and thus may be wider than the induction here.
1094 Type *ScalarTypeOfWideIV = WideIV->getScalarType();
1095 if (ScalarTypeOfWideIV != EndValue->getScalarType()) {
1096 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1097 ScalarTypeOfWideIV,
1098 WideIV->getDebugLoc());
1099 }
1100
1101 return EndValue;
1102}
1103
1104 /// Attempts to optimize the induction variable exit values for users in the
1105 /// exit block coming from the latch in the original scalar loop.
// Returns nullptr when the exit value is not an optimizable induction;
// otherwise returns either the precomputed end value or a new recipe that
// steps it back by one induction step.
// NOTE(review): the function name, parameter list and the match guarding the
// first early return are not visible in this listing.
1106 static VPValue *
1110 VPValue *Incoming;
1112 return nullptr;
1113
1114 VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
1115 if (!WideIV)
1116 return nullptr;
1117
// The end value is expected to be in EndValues already; a missing entry trips
// the assert.
1118 VPValue *EndValue = EndValues.lookup(WideIV);
1119 assert(EndValue && "Must have computed the end value up front");
1120
1121 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1122 // changed it means the exit is using the incremented value, so we don't
1123 // need to subtract the step.
1124 if (Incoming != WideIV)
1125 return EndValue;
1126
1127 // Otherwise, subtract the step from the EndValue.
1128 auto *ExtractR = cast<VPInstruction>(Op);
1129 VPBuilder B(ExtractR);
1130 VPValue *Step = WideIV->getStepValue();
1131 Type *ScalarTy = WideIV->getScalarType();
// Integer induction: EndValue - Step.
1132 if (ScalarTy->isIntegerTy())
1133 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
// Pointer induction: pointer-add of the negated step (0 - Step).
1134 if (ScalarTy->isPointerTy()) {
1135 Type *StepTy = Step->getScalarType();
1136 auto *Zero = Plan.getZero(StepTy);
1137 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1138 DebugLoc::getUnknown(), "ind.escape");
1139 }
// Floating-point induction: apply the inverse of the induction's binary
// operator (FSub for FAdd, FAdd otherwise), reusing its fast-math flags.
1140 if (ScalarTy->isFloatingPointTy()) {
1141 const auto &ID = WideIV->getInductionDescriptor();
1142 return B.createNaryOp(
1143 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1144 ? Instruction::FSub
1145 : Instruction::FAdd,
1146 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1147 }
1148 llvm_unreachable("all possible induction types must be handled");
1149 return nullptr;
1150}
1151
1153 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
// Precomputes end values for the wide inductions of the vector loop header,
// uses them to replace exiting-IV-value recipes in the middle block, and then
// tries to rewrite the incoming values of exit-block phis.
// NOTE(review): the first line of the signature, the declaration of EndValues
// and the names of the two helpers called in the exit-block loop are not
// visible in this listing.
1154 // Compute end values for all inductions.
1155 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1156 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
// End-value recipes are created at the start of the vector region's single
// predecessor block.
1157 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
// With FoldTail the end values are derived from the plan's trip count,
// otherwise from its vector trip count.
1159 VPValue *ResumeTC =
1160 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1161 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1162 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1163 if (!WideIV)
1164 continue;
// Inductions for which no end value is computed get no entry in EndValues.
1165 if (VPValue *EndValue =
1166 tryToComputeEndValueForInduction(WideIV, VectorPHBuilder, ResumeTC))
1167 EndValues[WideIV] = EndValue;
1168 }
1169
// Replace exiting-IV-value recipes in the middle block with the precomputed
// end value and erase them; recipes without an entry are left untouched.
1170 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1171 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1172 VPValue *Op;
1173 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1174 continue;
1175 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1176 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1177 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1178 R.eraseFromParent();
1179 }
1180 }
1181
1182 // Then, optimize exit block users.
1183 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1184 for (VPRecipeBase &R : ExitVPBB->phis()) {
1185 auto *ExitIRI = cast<VPIRPhi>(&R);
1186
// Incoming values from the middle block take EndValues as an extra argument;
// values from any other predecessor go through a different helper. The phi
// operand is only updated when the helper returns a value.
1187 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1188 VPValue *Escape = nullptr;
1189 if (PredVPBB == MiddleVPBB)
1191 Plan, ExitIRI->getOperand(Idx), EndValues, PSE);
1192 else
1194 Plan, ExitIRI->getOperand(Idx), PSE);
1195 if (Escape)
1196 ExitIRI->setOperand(Idx, Escape);
1197 }
1198 }
1199 }
1200}
1201
1202 /// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1203 /// them with already existing recipes expanding the same SCEV expression.
// NOTE(review): the signature, the declaration of SCEV2VPV and the range
// iterated by the loop below are not visible in this listing.
1206
1207 for (VPRecipeBase &R :
1209 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1210 if (!ExpR)
1211 continue;
1212
// The first recipe seen for a given SCEV is recorded and kept; later recipes
// for the same SCEV are replaced by it.
1213 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1214 if (Inserted)
1215 continue;
1216
1217 ExpR->replaceAllUsesWith(V->second);
// Keep the plan's trip count pointing at a live recipe before erasing ExpR.
1218 if (ExpR == Plan.getTripCount())
1219 Plan.resetTripCount(V->second);
1220
1221 ExpR->eraseFromParent();
1222 }
1223}
1224
// Worklist-driven deletion: starting from V, erase the defining recipe of each
// visited value if isDeadRecipe() accepts it, then visit that recipe's
// operands in turn.
// NOTE(review): the signature and the declaration of Seen are not visible in
// this listing.
1226 SmallVector<VPValue *> WorkList;
1228 WorkList.push_back(V);
1229
1230 while (!WorkList.empty()) {
1231 VPValue *Cur = WorkList.pop_back_val();
// Each value is processed at most once.
1232 if (!Seen.insert(Cur).second)
1233 continue;
// Values without a defining recipe have nothing to erase.
1234 VPRecipeBase *R = Cur->getDefiningRecipe();
1235 if (!R)
1236 continue;
1237 if (!isDeadRecipe(*R))
1238 continue;
// Queue the operands before erasing R, as they may become dead in turn.
1239 append_range(WorkList, R->operands());
1240 R->eraseFromParent();
1241 }
1242}
1243
1244 /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1245 /// Returns an optional pair, where the first element indicates whether it is
1246 /// an intrinsic ID.
// Recipe kinds not covered by any case below yield std::nullopt.
// NOTE(review): the line carrying the function name and parameter, and the
// type list of the first Case, are not visible in this listing.
1247 static std::optional<std::pair<bool, unsigned>>
1249 return TypeSwitch<const VPSingleDefRecipe *,
1250 std::optional<std::pair<bool, unsigned>>>(R)
1253 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1254 .Case([](const VPWidenIntrinsicRecipe *I) {
1255 return std::make_pair(true, I->getVectorIntrinsicID());
1256 })
1257 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1258 [](auto *I) {
1259 // For recipes that do not directly map to LLVM IR instructions,
1260 // assign opcodes after the last VPInstruction opcode (which is also
1261 // after the last IR Instruction opcode), based on the VPRecipeID.
1262 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1263 I->getVPRecipeID());
1264 })
1265 .Default([](auto *) { return std::nullopt; });
1266}
1267
1268 /// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1269 /// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1270 /// Operands are foldable live-ins.
// Returns nullptr if \p R carries no opcode or intrinsic ID, if any operand is
// not a live-in with an underlying IR value, or if the folder produces no
// value.
// NOTE(review): the first line of the signature, the declaration of Ops and a
// few case labels inside the switch are not visible in this listing.
1272 ArrayRef<VPValue *> Operands,
1273 const DataLayout &DL) {
1274 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1275 if (!OpcodeOrIID)
1276 return nullptr;
1277
// Collect the underlying IR value of every operand, looking through a
// broadcast to the value being broadcast.
1279 for (VPValue *Op : Operands) {
1280 VPValue *Candidate = Op;
// On a successful match Candidate is rebound to the broadcast operand;
// otherwise it stays Op.
1281 match(Op, m_Broadcast(m_VPValue(Candidate)));
1282 if (!match(Candidate, m_LiveIn()))
1283 return nullptr;
1284 Value *V = Candidate->getUnderlyingValue();
1285 if (!V)
1286 return nullptr;
1287 Ops.push_back(V);
1288 }
1289
1290 VPlan &Plan = *R.getParent()->getPlan();
// Dispatches to the InstSimplifyFolder entry point matching the opcode or
// intrinsic ID; returns nullptr for opcodes that are not handled.
1291 auto FoldToIRValue = [&]() -> Value * {
1292 InstSimplifyFolder Folder(DL);
// OpcodeOrIID->first is true when the second element is an intrinsic ID.
1293 if (OpcodeOrIID->first) {
1294 auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(&R);
1295 return Folder.FoldIntrinsic(OpcodeOrIID->second, Ops, R.getScalarType(),
1296 RFlags ? RFlags->getFastMathFlagsOrNone()
1297 : FastMathFlags());
1298 }
1299 unsigned Opcode = OpcodeOrIID->second;
1300 if (Instruction::isBinaryOp(Opcode))
1301 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1302 Ops[0], Ops[1]);
1303 if (Instruction::isCast(Opcode))
1304 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1305 R.getVPSingleValue()->getScalarType());
1306 switch (Opcode) {
// Not is folded as an Xor; the second Xor operand is on a line that is not
// visible in this listing.
1307 case VPInstruction::Not:
1308 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1310 case Instruction::Select:
1311 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1312 case Instruction::ICmp:
1313 case Instruction::FCmp:
1314 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1315 Ops[1]);
// GEPs take their source element type from the underlying IR instruction.
1316 case Instruction::GetElementPtr: {
1317 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1318 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1319 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1320 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1321 }
// NOTE(review): the case label(s) for the fold below are not visible in this
// listing; it folds a GEP with an i8 source element type and a single index.
1324 return Folder.FoldGEP(IntegerType::getInt8Ty(Plan.getContext()), Ops[0],
1325 Ops[1],
1326 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1327 // An extract of a live-in is an extract of a broadcast, so return the
1328 // broadcasted element.
1329 case Instruction::ExtractElement:
1330 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1331 return Ops[0];
1332 }
1333 return nullptr;
1334 };
1335
// A folded IR value is registered with the plan as a live-in.
1336 if (Value *V = FoldToIRValue())
1337 return Plan.getOrAddLiveIn(V);
1338 return nullptr;
1339}
1340
1341 /// Try to simplify logical and bitwise recipes in \p Def.
// Returns true if a simplification was applied and false otherwise. Folds that
// need to build new recipes are only attempted when \p CanCreateNewRecipe is
// set. Except for the first fold, Def is not erased here: either its users
// are redirected or its operands are rewritten in place.
// NOTE(review): the first line of the signature and several match patterns
// are not visible in this listing.
1343 bool CanCreateNewRecipe) {
1344 VPlan *Plan = Def->getParent()->getPlan();
1345
1346 // Simplify (X && Y) | (X && !Y) -> X.
1347 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1348 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1349 // recipes to be visited during simplification.
1350 VPValue *X, *Y, *Z;
1351 if (match(Def,
1354 Def->replaceAllUsesWith(X);
1355 Def->eraseFromParent();
1356 return true;
1357 }
1358
1359 // x | AllOnes -> AllOnes
1360 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
1361 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1362 return true;
1363 }
1364
1365 // x | 0 -> x
1366 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt()))) {
1367 Def->replaceAllUsesWith(X);
1368 return true;
1369 }
1370
1371 // x | !x -> AllOnes
1372 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_Not(m_Deferred(X))))) {
1373 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1374 return true;
1375 }
1376
1377 // x & 0 -> 0
1378 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) {
1379 Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1380 return true;
1381 }
1382
1383 // x & AllOnes -> x
1384 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes()))) {
1385 Def->replaceAllUsesWith(X);
1386 return true;
1387 }
1388
1389 // x && false -> false
1390 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False()))) {
1391 Def->replaceAllUsesWith(Plan->getFalse());
1392 return true;
1393 }
1394
1395 // x && true -> x
1396 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True()))) {
1397 Def->replaceAllUsesWith(X);
1398 return true;
1399 }
1400
1401 // (x && y) | (x && z) -> x && (y | z)
1402 if (CanCreateNewRecipe &&
1405 // Simplify only if one of the operands has one use to avoid creating an
1406 // extra recipe.
1407 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1408 !Def->getOperand(1)->hasMoreThanOneUniqueUser())) {
1409 Def->replaceAllUsesWith(
1410 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1411 return true;
1412 }
1413
1414 // x && (x && y) -> x && y
// Users are redirected to the existing inner logical-and (operand 1).
1415 if (match(Def, m_LogicalAnd(m_VPValue(X),
1417 Def->replaceAllUsesWith(Def->getOperand(1));
1418 return true;
1419 }
1420
1421 // x && (y && x) -> x && y
// NOTE(review): this fold builds a new logical-and and, in the visible lines,
// is not guarded by CanCreateNewRecipe.
1422 if (match(Def, m_LogicalAnd(m_VPValue(X),
1424 Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1425 return true;
1426 }
1427
1428 // x && !x -> 0
1429 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) {
1430 Def->replaceAllUsesWith(Plan->getFalse());
1431 return true;
1432 }
1433
// select c, x, x -> x
1434 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) {
1435 Def->replaceAllUsesWith(X);
1436 return true;
1437 }
1438
1439 // select c, false, true -> not c
1440 VPValue *C;
1441 if (CanCreateNewRecipe &&
1442 match(Def, m_Select(m_VPValue(C), m_False(), m_True()))) {
1443 Def->replaceAllUsesWith(Builder.createNot(C));
1444 return true;
1445 }
1446
1447 // select !c, x, y -> select c, y, x
// Performed in place by rewriting Def's operands; no new recipe is created.
1448 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1449 Def->setOperand(0, C);
1450 Def->setOperand(1, Y);
1451 Def->setOperand(2, X);
1452 return true;
1453 }
1454
1455 // select x, (i1 y | z), y -> y | (x && z)
1456 if (CanCreateNewRecipe &&
1457 match(Def, m_Select(m_VPValue(X),
1459 m_Deferred(Y))) &&
1460 Y->getScalarType()->isIntegerTy(1)) {
1461 Def->replaceAllUsesWith(
1462 Builder.createOr(Y, Builder.createLogicalAnd(X, Z)));
1463 return true;
1464 }
1465
1466 return false;
1467}
1468
1469 /// Try to simplify VPSingleDefRecipe \p Def.
// Applies the first matching local simplification. Most folds only redirect
// Def's users (or rewrite its operands in place) and leave Def itself in the
// plan. The folds in the last part of the function are only applied once the
// plan has been unrolled.
// NOTE(review): the signature and a number of match patterns are not visible
// in this listing.
1471 VPlan *Plan = Def->getParent()->getPlan();
1472
1473 // Simplification of live-in IR values for SingleDef recipes using
1474 // InstSimplifyFolder.
1475 const DataLayout &DL = Plan->getDataLayout();
1476 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL))
1477 return Def->replaceAllUsesWith(V);
1478
1479 // Fold PredPHI LiveIn -> LiveIn.
// There is no early return after this fold; the checks below still run.
1480 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1481 VPValue *Op = PredPHI->getOperand(0);
1482 if (isa<VPIRValue>(Op))
1483 PredPHI->replaceAllUsesWith(Op);
1484 }
1485
1486 // Drop the mask of a predicated store masked by the header mask (which is
1487 // guaranteed to be true at least for the first lane) and both the stored
1488 // value and the address are uniform across VF and UF.
1489 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
1490 RepR && RepR->isPredicated() && RepR->getOpcode() == Instruction::Store &&
1491 all_of(RepR->operandsWithoutMask(), vputils::isUniformAcrossVFsAndUFs) &&
1492 vputils::isHeaderMask(RepR->getMask(), *Plan)) {
1493 auto *Unmasked = new VPReplicateRecipe(
1494 RepR->getUnderlyingInstr(), RepR->operandsWithoutMask(),
1495 RepR->isSingleScalar(), /*Mask=*/nullptr, *RepR, *RepR,
1496 RepR->getDebugLoc());
1497 Unmasked->insertBefore(RepR);
1498 RepR->replaceAllUsesWith(Unmasked);
1499 RepR->eraseFromParent();
1500 return;
1501 }
1502
// Recipes created below are inserted at Def's position.
1503 VPBuilder Builder(Def);
1504
1505 // Avoid replacing VPInstructions with underlying values with new
1506 // VPInstructions, as we would fail to create widen/replicate recipes from the
1507 // new VPInstructions without an underlying value, and miss out on some
1508 // transformations that only apply to widened/replicated recipes later, by
1509 // doing so.
1510 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1511 // VPInstructions without underlying values, as those will get skipped during
1512 // cost computation.
1513 bool CanCreateNewRecipe =
1514 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1515
// trunc (zext/sext A): remove or shrink the cast pair depending on how the
// width of A compares with the width of the truncated type. There is no early
// return after a successful rewrite here.
1516 VPValue *A;
1517 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1518 Type *TruncTy = Def->getScalarType();
1519 Type *ATy = A->getScalarType();
// Same type: the two casts cancel out.
1520 if (TruncTy == ATy) {
1521 Def->replaceAllUsesWith(A);
1522 } else {
1523 // Don't replace a non-widened cast recipe with a widened cast.
1524 if (!isa<VPWidenCastRecipe>(Def))
1525 return;
// A is narrower than the result: a single extend of the original kind is
// enough.
1526 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1527
1528 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1529 ? Instruction::SExt
1530 : Instruction::ZExt;
1531 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1532 TruncTy);
1533 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1534 // UnderlyingExt has distinct return type, used to retain legacy cost.
1535 Ext->setUnderlyingValue(UnderlyingExt);
1536 }
1537 Def->replaceAllUsesWith(Ext);
// A is wider than the result: a single trunc of A is enough.
1538 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1539 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1540 Def->replaceAllUsesWith(Trunc);
1541 }
1542 }
1543 }
1544
1545 if (simplifyLogicalRecipe(Def, Builder, CanCreateNewRecipe))
1546 return;
1547
1548 VPValue *X, *Y, *C;
// x + 0 -> x
1549 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1550 return Def->replaceAllUsesWith(A);
1551
// x * 1 -> x
1552 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1553 return Def->replaceAllUsesWith(A);
1554
// x * 0 -> 0
1555 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1556 return Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1557
// x * AllOnes -> 0 - x
1558 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1559 // Preserve nsw from the Mul on the new Sub.
1561 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1562 return Def->replaceAllUsesWith(Builder.createSub(
1563 Plan->getZero(A->getScalarType()), A, Def->getDebugLoc(), "", NW));
1564 }
1565
// NOTE(review): the match pattern for this fold is not visible in this
// listing; the rewrite replaces Def with X - Y.
1566 if (CanCreateNewRecipe &&
1568 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1569 // new Sub.
1571 false,
1572 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1573 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1574 ->hasNoSignedWrap()};
1575 return Def->replaceAllUsesWith(
1576 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1577 }
1578
// x * 2^k -> x << k, reusing Def's IR flags.
1579 const APInt *APC;
1580 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1581 APC->isPowerOf2())
1582 return Def->replaceAllUsesWith(Builder.createNaryOp(
1583 Instruction::Shl,
1584 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1585 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1586
// x udiv 2^k -> x lshr k, reusing Def's IR flags.
1587 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1588 APC->isPowerOf2())
1589 return Def->replaceAllUsesWith(Builder.createNaryOp(
1590 Instruction::LShr,
1591 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1592 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1593
1594 if (match(Def, m_Not(m_VPValue(A)))) {
// not (not x) -> x
1595 if (match(A, m_Not(m_VPValue(A))))
1596 return Def->replaceAllUsesWith(A);
1597
1598 // Try to fold Not into compares by adjusting the predicate in-place.
1599 CmpPredicate Pred;
1600 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1601 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
// The predicate is only inverted if every user of the compare passes the
// check below, whose visible alternatives are a Not of the compare and a
// select using it as condition, so that all users can be updated.
1602 if (all_of(Cmp->users(),
1604 m_Not(m_Specific(Cmp)),
1605 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1606 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
// Iterate over a copy of the user list, since users are modified in the loop.
1607 for (VPUser *U : to_vector(Cmp->users())) {
1608 auto *R = cast<VPSingleDefRecipe>(U);
1609 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1610 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1611 R->setOperand(1, Y);
1612 R->setOperand(2, X);
1613 } else {
1614 // not (cmp pred) -> cmp inv_pred
1615 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1616 R->replaceAllUsesWith(Cmp);
1617 }
1618 }
1619 // If Cmp doesn't have a debug location, use the one from the negation,
1620 // to preserve the location.
1621 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1622 Cmp->setDebugLoc(Def->getDebugLoc());
1623 }
1624 }
1625 }
1626
1627 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1628 // any-of (fcmp uno %A, %B), ...
1629 if (match(Def, m_AnyOf())) {
// NOTE(review): the declaration of NewOps is not visible in this listing.
// Operands with more than one user, or failing the partially visible match
// below, are kept as they are. The remaining operands are combined pairwise
// into a single fcmp uno, and a leftover unpaired one is re-added unchanged.
1631 VPRecipeBase *UnpairedCmp = nullptr;
1632 for (VPValue *Op : Def->operands()) {
1633 VPValue *X;
1634 if (Op->getNumUsers() > 1 ||
1636 m_Deferred(X)))) {
1637 NewOps.push_back(Op);
1638 } else if (!UnpairedCmp) {
1639 UnpairedCmp = Op->getDefiningRecipe();
1640 } else {
1641 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1642 UnpairedCmp->getOperand(0), X));
1643 UnpairedCmp = nullptr;
1644 }
1645 }
1646
1647 if (UnpairedCmp)
1648 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1649
// Only build a new any-of if at least one pair was combined.
1650 if (NewOps.size() < Def->getNumOperands()) {
1651 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1652 return Def->replaceAllUsesWith(NewAnyOf);
1653 }
1654 }
1655
1656 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1657 // This is useful for fmax/fmin without fast-math flags, where we need to
1658 // check if any operand is NaN.
1659 if (CanCreateNewRecipe &&
1661 m_Deferred(X)),
1663 m_Deferred(Y))))) {
1664 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1665 return Def->replaceAllUsesWith(NewCmp);
1666 }
1667
1668 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
// In both cases operand 1 is the replacement, provided its scalar type matches
// Def's.
1669 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1670 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1671 Def->getOperand(1)->getScalarType() == Def->getScalarType())
1672 return Def->replaceAllUsesWith(Def->getOperand(1));
1673
// NOTE(review): the start of this match is not visible in this listing. On a
// match, Def is replaced by X, truncated to Def's scalar type if the types
// differ.
1675 m_One()))) {
1676 Type *WideStepTy = Def->getScalarType();
1677 if (X->getScalarType() != WideStepTy)
1678 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1679 Def->replaceAllUsesWith(X);
1680 return;
1681 }
1682
1683 // For i1 vp.merges produced by AnyOf reductions:
1684 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
// Performed in place: the old operand 0 moves to operand 1 and Y becomes the
// new operand 0.
1686 m_VPValue(X), m_VPValue())) &&
1688 Def->getScalarType()->isIntegerTy(1)) {
1689 Def->setOperand(1, Def->getOperand(0));
1690 Def->setOperand(0, Y);
1691 return;
1692 }
1693
1694 // Simplify MaskedCond with no block mask to its single operand.
1696 !cast<VPInstruction>(Def)->isMasked())
1697 return Def->replaceAllUsesWith(Def->getOperand(0));
1698
1699 // Look through ExtractLastLane.
1700 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
// The last lane of a BuildVector is its last operand.
1701 if (match(A, m_BuildVector())) {
1702 auto *BuildVector = cast<VPInstruction>(A);
1703 Def->replaceAllUsesWith(
1704 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1705 return;
1706 }
1707
// Any lane of a broadcast is the value being broadcast.
1708 if (match(A, m_Broadcast(m_VPValue(X))))
1709 return Def->replaceAllUsesWith(X);
1710
// NOTE(review): the condition guarding the next return is not visible in this
// listing.
1712 return Def->replaceAllUsesWith(A);
1713
// With only a scalar VF, A itself is used as the extracted value.
1714 if (Plan->hasScalarVFOnly())
1715 return Def->replaceAllUsesWith(A);
1716 }
1717
1718 // Look through ExtractPenultimateElement (BuildVector ....).
1720 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1721 Def->replaceAllUsesWith(
1722 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1723 return;
1724 }
1725
// NOTE(review): the match that binds Idx is not visible in this listing; on a
// match, Def is replaced by the BuildVector operand at index Idx.
1726 uint64_t Idx;
1728 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1729 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1730 return;
1731 }
1732
// A BuildVector whose operands are all equal becomes a Broadcast of that
// operand.
1733 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1734 Def->replaceAllUsesWith(
1735 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1736 return;
1737 }
1738
1739 // Look through broadcast of single-scalar when used as select conditions; in
1740 // that case the scalar condition can be used directly.
1741 if (match(Def,
1744 "broadcast operand must be single-scalar");
1745 Def->setOperand(0, C);
1746 return;
1747 }
1748
// Users of a broadcast that only use scalars are given the broadcast operand
// directly; other users keep the broadcast.
1749 if (match(Def, m_Broadcast(m_VPValue(X))))
1750 return Def->replaceUsesWithIf(
1751 X, [Def](const VPUser &U, unsigned) { return U.usesScalars(Def); });
1752
// NOTE(review): the condition opening this block is not visible in this
// listing. Inside it, a single-operand Def is replaced by that operand, and a
// first-order recurrence phi with all-equal incoming values is replaced by
// its first operand.
1754 if (Def->getNumOperands() == 1) {
1755 Def->replaceAllUsesWith(Def->getOperand(0));
1756 return;
1757 }
1758 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1759 if (all_equal(Phi->incoming_values()))
1760 Phi->replaceAllUsesWith(Phi->getOperand(0));
1761 }
1762 return;
1763 }
1764
// NOTE(review): the second half of this condition, which binds IRV, is not
// visible in this listing.
1765 VPIRValue *IRV;
1766 if (Def->getNumOperands() == 1 &&
1768 return Def->replaceAllUsesWith(IRV);
1769
1770 // Some simplifications can only be applied after unrolling. Perform them
1771 // below.
1772 if (!Plan->isUnrolled())
1773 return;
1774
1775 // After unrolling, extract-lane may be used to extract values from multiple
1776 // scalar sources. Only simplify when extracting from a single scalar source.
1777 VPValue *LaneToExtract;
1778 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1779 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1781 return Def->replaceAllUsesWith(A);
1782
1783 // Replace extract-lane(0, canonical-WIDEN-INDUCTION) with the region's
1784 // scalar canonical IV.
// NOTE(review): the declaration of WidenIV is not visible in this listing.
1786 if (match(LaneToExtract, m_ZeroInt()) &&
1787 match(A, m_CanonicalWidenIV(WidenIV)))
1788 return Def->replaceAllUsesWith(WidenIV->getRegion()->getCanonicalIV());
1789
1790 // Simplify extract-lane with single source to extract-element.
1791 Def->replaceAllUsesWith(Builder.createNaryOp(
1792 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1793 return;
1794 }
1795
1796 // Look for cycles where Def is of the form:
1797 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1798 // IVInc = X + Step ; used by X and Def
1799 // Def = IVInc + Y
1800 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1801 // and if Inc exists, replace it with X.
1802 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1803 isa<VPIRValue>(Y) &&
1804 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1805 auto *Phi = cast<VPPhi>(X);
1806 auto *IVInc = Def->getOperand(0);
1807 if (IVInc->getNumUsers() == 2) {
1808 // If Phi has a second user (besides IVInc's defining recipe), it must
1809 // be Inc = Phi + Y for the fold to apply.
// NOTE(review): the start of the statement defining Inc is not visible in
// this listing.
1811 findUserOf(Phi, m_Add(m_Specific(Phi), m_Specific(Y))));
1812 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1813 Def->replaceAllUsesWith(IVInc);
1814 if (Inc)
1815 Inc->replaceAllUsesWith(Phi);
// The phi now starts at Y instead of 0.
1816 Phi->setOperand(0, Y);
1817 return;
1818 }
1819 }
1820 }
1821
1822 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1823 // just the pointer operand.
1824 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1825 if (!VPR->getVFxPart() || match(VPR->getVFxPart(), m_ZeroInt()))
1826 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1827
1828 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1829 // the start index is zero and only the first lane 0 is demanded.
// Here the start index counts as zero when getStartIndex() returns null.
1830 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1831 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1832 Steps->replaceAllUsesWith(Steps->getOperand(0));
1833 return;
1834 }
1835 }
1836 // Simplify redundant ReductionStartVector recipes after unrolling.
// Only uses by in-loop reduction phis are replaced with the start value.
1837 VPValue *StartV;
1839 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1840 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1841 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1842 return PhiR && PhiR->isInLoop();
1843 });
1844 return;
1845 }
1846
// With a concrete UF of 1, extract-last-part is replaced by its operand.
1847 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1848 return Def->replaceAllUsesWith(A);
1849}
1850
1860
// NOTE(review): the signature line (1861) and the lines that set up the block
// traversal and define VPBB (1863-1864) are missing from this listing; confirm
// the function name and iteration order against the upstream file.
1862 VPValue *X;
1865 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
// Fold reverse(reverse(X)) -> X by redirecting all users of the outer reverse
// to the innermost operand X. No recipe is erased in the visible lines.
1866 if (match(&R, m_Reverse(m_Reverse(m_VPValue(X)))))
1867 R.getVPSingleValue()->replaceAllUsesWith(X);
1868}
1869
1870 /// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1871 /// header mask to be simplified further when tail folding, e.g. in
1872 /// optimizeEVLMasks.
/// Does nothing if vputils::findHeaderMask returns null for \p Plan.
1873 static void reassociateHeaderMask(VPlan &Plan) {
1874 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1875 if (!HeaderMask)
1876 return;
1877
// Seed the worklist from the users of the header mask that have the form
// (headermask && x).
// NOTE(review): line 1881 (the body of the `if` below) is missing from this
// listing; presumably it appends users of U to Worklist — confirm upstream.
1878 SmallVector<VPUser *> Worklist;
1879 for (VPUser *U : HeaderMask->users())
1880 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1882
1883 while (!Worklist.empty()) {
1884 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1885 VPValue *X, *Y;
// Only single-def recipes of the exact shape (headermask && X) && Y are
// rewritten; everything else popped from the worklist is dropped.
1886 if (!R || !match(R, m_LogicalAnd(
1887 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1888 m_VPValue(Y))))
1889 continue;
// Queue R's current users before its uses are replaced, so that longer
// chains of logical-ands are visited as well.
1890 append_range(Worklist, R->users());
// The new recipes are created at R's position; R itself is not erased here.
1891 VPBuilder Builder(R);
1892 R->replaceAllUsesWith(
1893 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1894 }
1895}
1896
/// Map the intrinsic ID \p ID of a masked integer division/remainder intrinsic
/// (masked_udiv, masked_sdiv, masked_urem, masked_srem) to the matching plain
/// binary opcode (UDiv, SDiv, URem, SRem). Returns an empty optional for any
/// other ID.
// NOTE(review): line 1898 (function name and parameter list) is missing from
// this listing. The name getUnmaskedDivRemOpcode is taken from the call at
// listing line 1951, which passes getVectorIntrinsicID() — confirm upstream.
1897 static std::optional<Instruction::BinaryOps>
1899 switch (ID) {
1900 case Intrinsic::masked_udiv:
1901 return Instruction::UDiv;
1902 case Intrinsic::masked_sdiv:
1903 return Instruction::SDiv;
1904 case Intrinsic::masked_urem:
1905 return Instruction::URem;
1906 case Intrinsic::masked_srem:
1907 return Instruction::SRem;
1908 default:
1909 return {};
1910 }
1911}
1912
// NOTE(review): the signature line (1913), the start of the block traversal
// (1917) and the recipe filter preceding the first `continue` (1920-1921) are
// missing from this listing; confirm them against the upstream file.
// Nothing to narrow when the plan only has a scalar VF.
1914 if (Plan.hasScalarVFOnly())
1915 return;
1916
1918 vp_depth_first_deep(Plan.getEntry()))) {
// Recipes of each block are visited bottom-up.
1919 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1922 continue;
// Replicate recipes that are already single-scalar, or predicated, are left
// untouched.
1923 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1924 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1925 continue;
1926
1927 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
// A replicated store whose operand 1 is single-scalar is replaced by an
// unmasked single-scalar store. Operand 0 of the clone is rewritten to an
// extract of the last lane, preceded by an extract of the last part when
// operand 1 is uniform across VFs and UFs.
1928 if (RepR && RepR->getOpcode() == Instruction::Store &&
1929 vputils::isSingleScalar(RepR->getOperand(1))) {
1930 auto *Clone = new VPReplicateRecipe(
1931 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1932 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1933 *RepR /*Metadata*/, RepR->getDebugLoc());
1934 Clone->insertBefore(RepOrWidenR);
1935 VPBuilder Builder(Clone);
1936 VPValue *ExtractOp = Clone->getOperand(0);
1937 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1938 ExtractOp =
1939 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1940 ExtractOp =
1941 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1942 Clone->setOperand(0, ExtractOp);
1943 RepR->eraseFromParent();
1944 continue;
1945 }
1946
1947 // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor.
// Only done when just the first lane of the result is used. The divisor
// becomes select(op2, op1, 1); op2 is presumably the mask — confirm against
// the intrinsic's operand order.
1948 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(RepOrWidenR)) {
1949 if (!vputils::onlyFirstLaneUsed(IntrR))
1950 continue;
1951 auto Opc = getUnmaskedDivRemOpcode(IntrR->getVectorIntrinsicID());
1952 if (!Opc)
1953 continue;
1954 VPBuilder Builder(IntrR);
1955 VPValue *SafeDivisor = Builder.createSelect(
1956 IntrR->getOperand(2), IntrR->getOperand(1),
1957 Plan.getConstantInt(IntrR->getScalarType(), 1));
1958 VPValue *Clone = Builder.createNaryOp(
1959 *Opc, {IntrR->getOperand(0), SafeDivisor},
1960 VPIRFlags::getDefaultFlags(*Opc), IntrR->getDebugLoc());
1961 IntrR->replaceAllUsesWith(Clone);
1962 IntrR->eraseFromParent();
1963 continue;
1964 }
1965
1966 // Skip recipes that aren't single scalars.
1967 if (!vputils::isSingleScalar(RepOrWidenR))
1968 continue;
1969
1970 // Predicate to check if a user of Op introduces extra broadcasts.
// NOTE(review): lines 1974-1976 (the start of the VPInstruction opcode check)
// are missing from this listing.
1971 auto IntroducesBCastOf = [](const VPValue *Op) {
1972 return [Op](const VPUser *U) {
1973 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1977 VPI->getOpcode()))
1978 return false;
1979 }
1980 return !U->usesScalars(Op);
1981 };
1982 };
1983
// Do not narrow when some user would need a broadcast of the result and no
// operand qualifies under the inner lambda. An operand qualifies when none of
// its other users introduces a broadcast of it and it is either a
// non-constant IR live-in or a single-scalar replicate.
1984 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1985 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1986 if (any_of(
1987 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1988 IntroducesBCastOf(Op)))
1989 return false;
1990 // Non-constant live-ins require broadcasts, while constants do not
1991 // need explicit broadcasts.
1992 auto *IRV = dyn_cast<VPIRValue>(Op);
1993 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1994 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1995 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1996 }))
1997 continue;
1998
// Replace the recipe with a single-scalar clone that has no mask and an
// unknown debug location; the original is erased only once it is dead.
1999 auto *Clone = VPBuilder::createSingleScalarOp(
2000 getOpcodeOrIntrinsicID(RepOrWidenR)->second, RepOrWidenR->operands(),
2001 /*Mask=*/nullptr, *RepOrWidenR, {}, DebugLoc::getUnknown(),
2002 RepOrWidenR->getUnderlyingInstr());
2003 Clone->insertBefore(RepOrWidenR);
2004 RepOrWidenR->replaceAllUsesWith(Clone);
2005 if (isDeadRecipe(*RepOrWidenR))
2006 RepOrWidenR->eraseFromParent();
2007 }
2008 }
2009}
2010
2011 /// Try to see if all of \p Blend's masks share a common value logically and'ed
2012 /// and remove it from the masks.
/// Normalized blends are left unchanged.
// NOTE(review): the signature line (2013) is missing from this listing; the
// name removeCommonBlendMask is taken from the call at listing line 2038.
2014 if (Blend->isNormalized())
2015 return;
// The candidate common value is the first operand of the logical-and that
// forms mask 0.
2016 VPValue *CommonEdgeMask;
2017 if (!match(Blend->getMask(0),
2018 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
2019 return;
// Bail out unless every mask is a logical-and with that same first operand.
2020 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
2021 if (!match(Blend->getMask(I),
2022 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
2023 return;
// Replace each mask by operand 1 of its defining recipe, i.e. the part that
// is and'ed with the common value.
2024 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
2025 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
2026}
2027
2028 /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
2029 /// to make sure the masks are simplified.
2030 static void simplifyBlends(VPlan &Plan) {
// NOTE(review): lines 2031-2032 (the block traversal that defines VPBB) are
// missing from this listing.
2033 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2034 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
2035 if (!Blend)
2036 continue;
2037
2038 removeCommonBlendMask(Blend);
2039
2040 // Try to remove redundant blend recipes.
// Collect the distinct incoming values whose mask is not known to be false.
// Incoming value 0 is always counted for a normalized blend.
2041 SmallPtrSet<VPValue *, 4> UniqueValues;
2042 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
2043 UniqueValues.insert(Blend->getIncomingValue(0));
2044 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
2045 if (!match(Blend->getMask(I), m_False()))
2046 UniqueValues.insert(Blend->getIncomingValue(I));
2047
// A blend with a single distinct live incoming value is that value.
2048 if (UniqueValues.size() == 1) {
2049 Blend->replaceAllUsesWith(*UniqueValues.begin());
2050 Blend->eraseFromParent();
2051 continue;
2052 }
2053
2054 if (Blend->isNormalized())
2055 continue;
2056
2057 // Normalize the blend so its first incoming value is used as the initial
2058 // value with the others blended into it.
2059
2060 unsigned StartIndex = 0;
2061 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2062 // If a value's mask is used only by the blend then it can be dead-coded.
2063 // TODO: Find the most expensive mask that can be deadcoded, or a mask
2064 // that's used by multiple blends where it can be removed from them all.
2065 VPValue *Mask = Blend->getMask(I);
2066 if (Mask->hasOneUse() && !match(Mask, m_False())) {
2067 StartIndex = I;
2068 break;
2069 }
2070 }
2071
// Operand layout of the normalized blend: the start value first (without a
// mask), followed by (value, mask) pairs for all other incoming values.
2072 SmallVector<VPValue *, 4> OperandsWithMask;
2073 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
2074
2075 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2076 if (I == StartIndex)
2077 continue;
2078 OperandsWithMask.push_back(Blend->getIncomingValue(I));
2079 OperandsWithMask.push_back(Blend->getMask(I));
2080 }
2081
2082 auto *NewBlend =
2083 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
2084 OperandsWithMask, *Blend, Blend->getDebugLoc());
2085 NewBlend->insertBefore(&R);
2086
// NOTE(review): line 2090 is missing from this listing; presumably it cleans
// up DeadMask, which has no other use in the visible lines — confirm upstream.
2087 VPValue *DeadMask = Blend->getMask(StartIndex);
2088 Blend->replaceAllUsesWith(NewBlend);
2089 Blend->eraseFromParent();
2091
2092 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
// Applies only to blends with exactly three operands (two values, one mask).
// The Not is erased once it has no users left.
2093 VPValue *NewMask;
2094 if (NewBlend->getNumOperands() == 3 &&
2095 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2096 VPValue *Inc0 = NewBlend->getOperand(0);
2097 VPValue *Inc1 = NewBlend->getOperand(1);
2098 VPValue *OldMask = NewBlend->getOperand(2);
2099 NewBlend->setOperand(0, Inc1);
2100 NewBlend->setOperand(1, Inc0);
2101 NewBlend->setOperand(2, NewMask);
2102 if (OldMask->user_empty())
2103 cast<VPInstruction>(OldMask)->eraseFromParent();
2104 }
2105 }
2106 }
2107}
2108
2109 /// Optimize the width of vector induction variables in \p Plan based on a known
2110 /// constant Trip Count, \p BestVF and \p BestUF.
/// Returns true if at least one induction was narrowed. Requires a fixed
/// \p BestVF and a trip count that matches a constant APInt.
// NOTE(review): line 2111 (return type, function name and first parameter) is
// missing from this listing; the name is taken from the call at listing line
// 2415.
2112 ElementCount BestVF,
2113 unsigned BestUF) {
2114 // Only proceed if we have not completely removed the vector region.
2115 if (!Plan.getVectorLoopRegion())
2116 return false;
2117
2118 const APInt *TC;
2119 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2120 return false;
2121
2122 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2123 // and UF. Returns at least 8.
// NOTE(review): lines 2126-2127 (the initializer of AlignedTC) are missing
// from this listing; presumably TC is rounded up to a multiple of Align.
2124 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2125 APInt AlignedTC =
2128 APInt MaxVal = AlignedTC - 1;
2129 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2130 };
2131 unsigned NewBitWidth =
2132 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2133
2134 LLVMContext &Ctx = Plan.getContext();
2135 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2136
2137 bool MadeChange = false;
2138
2139 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2140 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2141 // Currently only handle canonical IVs as it is trivial to replace the start
2142 // and stop values, and we currently only perform the optimization when the
2143 // IV has a single use.
// NOTE(review): line 2144 (the declaration of WideIV) is missing from this
// listing.
2145 if (!match(&Phi, m_CanonicalWidenIV(WideIV)))
2146 continue;
// Skip IVs with more than one unique user and IVs that already have the
// computed type.
2147 if (WideIV->hasMoreThanOneUniqueUser() ||
2148 NewIVTy == WideIV->getScalarType())
2149 continue;
2150
2151 // Currently only handle cases where the single user is a header-mask
2152 // comparison with the backedge-taken-count.
// NOTE(review): line 2157 (the second operand pattern of m_ICmp) is missing
// from this listing.
2153 VPUser *SingleUser = WideIV->getSingleUser();
2154 if (!SingleUser ||
2155 !match(SingleUser,
2156 m_ICmp(m_Specific(WideIV),
2158 continue;
2159
2160 // Update IV operands and comparison bound to use new narrower type.
2161 assert(!WideIV->getTruncInst() &&
2162 "canonical IV is not expected to have a truncation");
// The replacement IV starts at zero and steps by one in the new type.
2163 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
2164 WideIV->getPHINode(), Plan.getZero(NewIVTy),
2165 Plan.getConstantInt(NewIVTy, 1), WideIV->getVFValue(),
2166 WideIV->getInductionDescriptor(), *WideIV, WideIV->getDebugLoc());
2167 NewWideIV->insertBefore(WideIV);
2168
// The backedge-taken count is truncated to the new type in the vector
// preheader, and the compare is recreated on the narrow operands with the
// original predicate.
2169 auto *NewBTC = new VPWidenCastRecipe(
2170 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2171 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2172 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2173 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2174 Cmp->replaceAllUsesWith(
2175 VPBuilder(Cmp).createICmp(Cmp->getPredicate(), NewWideIV, NewBTC));
2176
2177 MadeChange = true;
2178 }
2179
2180 return MadeChange;
2181}
2182
2183 /// Return true if \p Cond is known to be true for given \p BestVF and \p
2184 /// BestUF.
// NOTE(review): lines 2185 and 2187-2188 (function name, remaining parameters
// and the condition guarding the recursive any_of below) are missing from this
// listing; the name is taken from the recursive call at listing line 2191.
2186 ElementCount BestVF, unsigned BestUF,
// Recursive case: the condition holds if it holds for any operand of Cond's
// defining recipe.
2189 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2190 &PSE](VPValue *C) {
2191 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2192 });
2193
// NOTE(review): lines 2195-2196 (the start of the match on Cond) are missing
// from this listing.
2194 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2197 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2198 m_Specific(&Plan.getVectorTripCount()))))
2199 return false;
2200
2201 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2202 // count is not conveniently available as SCEV so far, so we compare directly
2203 // against the original trip count. This is stricter than necessary, as we
2204 // will only return true if the trip count == vector trip count.
// NOTE(review): line 2206 (the initializer of VectorTripCount) is missing from
// this listing. If it yields SCEVCouldNotCompute, the SCEV of the plan's trip
// count is used instead.
2205 const SCEV *VectorTripCount =
2207 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2208 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2209 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2210 "Trip count SCEV must be computable");
// True only if SCEV can prove the trip count equals BestVF * BestUF.
2211 ScalarEvolution &SE = *PSE.getSE();
2212 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2213 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2214 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2215}
2216
2217 /// Try to replace multiple active lane masks used for control flow with
2218 /// a single, wide active lane mask instruction followed by multiple
2219 /// extract subvector intrinsics. This applies to the active lane mask
2220 /// instructions both in the loop and in the preheader.
2221 /// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2222 /// new extracts from the first active lane mask, which has its last
2223 /// operand (multiplier) set to UF.
/// Returns true if the plan was changed.
// NOTE(review): line 2224 (return type, function name and leading parameters)
// is missing from this listing; the name is taken from the call at listing
// line 2413.
2225 unsigned UF) {
// Only applies when the EnableWideActiveLaneMask option is set, the VF is a
// vector VF and the loop is unrolled.
2226 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2227 return false;
2228
2229 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2230 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2231 auto *Term = &ExitingVPBB->back();
2232
// NOTE(review): line 2234 (the start of the match on Term) is missing from
// this listing.
2233 using namespace llvm::VPlanPatternMatch;
2235 m_VPValue(), m_VPValue(), m_VPValue())))))
2236 return false;
2237
2238 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2239 LLVMContext &Ctx = Plan.getContext();
2240
// Creates one vector_extract per unroll part from ALM, at element offset
// VF * Part, and stores it in Extracts[Part]. Every extract is inserted
// directly after ALM, so later parts end up ahead of earlier ones.
// NOTE(review): line 2245 (the declaration of Ops) is missing from this
// listing.
2241 auto ExtractFromALM = [&](VPInstruction *ALM,
2242 SmallVectorImpl<VPValue *> &Extracts) {
2243 DebugLoc DL = ALM->getDebugLoc();
2244 for (unsigned Part = 0; Part < UF; ++Part) {
2246 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2247 auto *Ext =
2248 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2249 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2250 Extracts[Part] = Ext;
2251 Ext->insertAfter(ALM);
2252 }
2253 };
2254
2255 // Create a list of each active lane mask phi, ordered by unroll part.
// NOTE(review): lines 2256, 2258, 2263, 2268 and 2275 (the declaration of
// Phis, the cast that defines Phi, and parts of the three pattern matches)
// are missing from this listing.
2257 for (VPRecipeBase &R : Header->phis()) {
2259 if (!Phi)
2260 continue;
2261 VPValue *Index = nullptr;
2262 match(Phi->getBackedgeValue(),
2264 assert(Index && "Expected index from ActiveLaneMask instruction");
2265
2266 uint64_t Part;
2267 if (match(Index,
2269 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2270 Phis[Part] = Phi;
2271 else {
2272 // Anything other than a CanonicalIVIncrementForPart is part 0
2273 assert(!match(
2274 Index,
2276 Phis[0] = Phi;
2277 }
2278 }
2279
2280 assert(all_of(Phis, not_equal_to(nullptr)) &&
2281 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2282
2283 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2284 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2285
2286 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2287 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2288 "Expected incoming values of Phi to be ActiveLaneMasks");
2289
2290 // When using wide lane masks, the return type of the get.active.lane.mask
2291 // intrinsic is VF x UF (last operand).
2292 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2293 EntryALM->setOperand(2, ALMMultiplier);
2294 LoopALM->setOperand(2, ALMMultiplier);
2295
2296 // Create UF x extract vectors and insert into preheader.
2297 SmallVector<VPValue *> EntryExtracts(UF);
2298 ExtractFromALM(EntryALM, EntryExtracts);
2299
2300 // Create UF x extract vectors and insert before the loop compare & branch,
2301 // updating the compare to use the first extract.
2302 SmallVector<VPValue *> LoopExtracts(UF);
2303 ExtractFromALM(LoopALM, LoopExtracts);
2304 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2305 Not->setOperand(0, LoopExtracts[0]);
2306
2307 // Update the incoming values of active lane mask phis.
2308 for (unsigned Part = 0; Part < UF; ++Part) {
2309 Phis[Part]->setStartValue(EntryExtracts[Part]);
2310 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2311 }
2312
2313 return true;
2314}
2315
2316 /// Try to simplify the branch condition of \p Plan. This may restrict the
2317 /// resulting plan to \p BestVF and \p BestUF.
/// Returns true if the latch terminator was changed.
// NOTE(review): lines 2318 and 2320 (function name and the remaining
// parameters, including PSE) are missing from this listing; the name is taken
// from the call at listing line 2414.
2319 unsigned BestUF,
2321 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2322 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2323 auto *Term = &ExitingVPBB->back();
2324 VPValue *Cond;
2325 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2326 // Check if the branch condition compares the canonical IV increment (for main
2327 // loop), or the canonical IV increment plus an offset (for epilog loop).
// NOTE(review): line 2331 (the start of the second alternative of this
// condition) is missing from this listing.
2328 if (match(Term, m_BranchOnCount(
2329 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2330 m_VPValue())) ||
2332 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2333 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2334 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
// NOTE(review): lines 2336 and 2339 (the initializer of VectorTripCount and
// its fallback value) are missing from this listing.
2335 const SCEV *VectorTripCount =
2337 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2338 VectorTripCount =
2340 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2341 "Trip count SCEV must be computable");
2342 ScalarEvolution &SE = *PSE.getSE();
2343 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2344 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2345 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2346 return false;
// NOTE(review): line 2348 (the second alternative of this condition) is
// missing from this listing.
2347 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2349 // For BranchOnCond, check if we can prove the condition to be true using VF
2350 // and UF.
2351 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2352 return false;
2353 } else {
2354 return false;
2355 }
2356
2357 // The vector loop region only executes once. Convert terminator of the
2358 // exiting block to exit in the first iteration.
// For a BranchOnTwoConds terminator only operand 1 is replaced by true; the
// terminator itself is kept.
2359 if (match(Term, m_BranchOnTwoConds())) {
2360 Term->setOperand(1, Plan.getTrue());
2361 return true;
2362 }
2363
// Otherwise the terminator is replaced by a BranchOnCond on true that reuses
// the old terminator's debug location.
2364 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2365 {}, Term->getDebugLoc());
2366 ExitingVPBB->appendRecipe(BOC);
2367 Term->eraseFromParent();
2368
2369 return true;
2370}
2371
2372 /// From the definition of llvm.experimental.get.vector.length,
2373 /// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
/// Returns true after replacing the first such EVL recipe found, false if none
/// was replaced.
// NOTE(review): lines 2374-2376 (the signature and the start of the block
// traversal) are missing from this listing, so the function name and the
// declarations of VF and PSE are not visible here — confirm upstream.
2377 vp_depth_first_deep(Plan.getEntry()))) {
2378 for (VPRecipeBase &R : *VPBB) {
2379 VPValue *AVL;
2380 if (!match(&R, m_EVL(m_VPValue(AVL))))
2381 continue;
2382
// The fold needs a computable SCEV for AVL and a proof that AVL <= VF
// (unsigned).
2383 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2384 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2385 continue;
2386 ScalarEvolution &SE = *PSE.getSE();
2387 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2388 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2389 continue;
2390
// NOTE(review): line 2391 (the declaration of Trunc and the call these
// arguments belong to) is missing from this listing; presumably AVL is
// converted from its SCEV type to i32.
2392 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2393 R.getDebugLoc());
// If a new recipe was created for the conversion, try to fold it with
// tryToFoldLiveIns.
2394 if (Trunc != AVL) {
2395 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2396 const DataLayout &DL = Plan.getDataLayout();
2397 if (VPValue *Folded = tryToFoldLiveIns(*TruncR, TruncR->operands(), DL))
2398 Trunc = Folded;
2399 }
2400 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2401 return true;
2402 }
2403 }
2404 return false;
2405}
2406
// Runs the VF/UF-specific simplifications on Plan. If any of them changed the
// plan, the plan is restricted to BestVF.
// NOTE(review): lines 2407 and 2409 (function name and the remaining
// parameters, including PSE) are missing from this listing — confirm upstream.
2408 unsigned BestUF,
2410 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2411 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2412
// `|=` is used so that every transform runs regardless of earlier results.
2413 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2414 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2415 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2416
2417 if (MadeChange) {
2418 Plan.setVF(BestVF);
2419 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2420 }
2421}
2422
// Drops poison-generating flags from every flag-carrying recipe that
// transitively uses a reduction phi of selected recurrence kinds.
// NOTE(review): lines 2423 (signature), 2425 (the range being iterated) and
// 2431 (the rest of the recurrence-kind filter) are missing from this listing,
// so the function name and the full set of accepted kinds are not visible
// here — confirm upstream.
2424 for (VPRecipeBase &R :
2426 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2427 if (!PhiR)
2428 continue;
2429 RecurKind RK = PhiR->getRecurrenceKind();
2430 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2432 continue;
2433
2434 for (VPUser *U : collectUsersRecursively(PhiR))
2435 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2436 RecWithFlags->dropPoisonGeneratingFlags();
2437 }
2438 }
2439}
2440
2441 namespace {
/// DenseMapInfo specialization that hashes and compares VPSingleDefRecipes by
/// their underlying data rather than by pointer identity.
2442 struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2443 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2444 /// return that source element type.
/// Returns nullptr for all other recipes.
2445 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2446 // All VPInstructions that lower to GEPs must have the i8 source element
2447 // type (as they are PtrAdds), so we omit it.
// NOTE(review): line 2448 (the start of the type-switch expression) is missing
// from this listing.
2449 .Case([](const VPReplicateRecipe *I) -> Type * {
2450 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2451 return GEP->getSourceElementType();
2452 return nullptr;
2453 })
2454 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2455 [](auto *I) { return I->getSourceElementType(); })
2456 .Default([](auto *) { return nullptr; });
2457 }
2458
2459 /// Returns true if recipe \p Def can be safely handled by CSE.
2460 static bool canHandle(const VPSingleDefRecipe *Def) {
2461 // We can extend the list of handled recipes in the future,
2462 // provided we account for the data embedded in them while checking for
2463 // equality or hashing.
2464 auto C = getOpcodeOrIntrinsicID(Def);
2465
2466 // The issue with (Insert|Extract)Value is that the index of the
2467 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2468 // VPlan.
2469 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2470 C->second == Instruction::ExtractValue)))
2471 return false;
2472
2473 // During CSE, we can only handle recipes that don't read from memory: if
2474 // they read from memory, there could be an intervening write to memory
2475 // before the next instance is CSE'd, leading to an incorrect result.
2476 return !Def->mayReadFromMemory();
2477 }
2478
2479 /// Hash the underlying data of \p Def.
// NOTE(review): line 2484 (the last argument(s) of hash_combine) is missing
// from this listing.
2480 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2481 hash_code Result = hash_combine(
2482 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2483 getGEPSourceElementType(Def), Def->getScalarType(),
// Recipes with a predicate return early with the predicate mixed in; the
// induction opcode is only mixed in for VPScalarIVStepsRecipes that reach
// the second check.
2485 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2486 if (RFlags->hasPredicate())
2487 return hash_combine(Result, RFlags->getPredicate());
2488 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2489 return hash_combine(Result, SIVSteps->getInductionOpcode());
2490 return Result;
2491 }
2492
2493 /// Check equality of underlying data of \p L and \p R.
// NOTE(review): lines 2496, 2498 and 2501 (two of the comparisons in the first
// condition and the start of the assert) are missing from this listing.
2494 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2495 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2497 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2499 !equal(L->operands(), R->operands()))
2500 return false;
2502 "must have valid opcode info for both recipes");
// The unchecked casts of R below presumably rely on the recipe IDs having
// compared equal above — confirm.
2503 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2504 if (LFlags->hasPredicate() &&
2505 LFlags->getPredicate() !=
2506 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2507 return false;
2508 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2509 if (LSIV->getInductionOpcode() !=
2510 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2511 return false;
2512 // Recipes in replicate regions implicitly depend on predicate. If either
2513 // recipe is in a replicate region, only consider them equal if both have
2514 // the same parent.
2515 const VPRegionBlock *RegionL = L->getRegion();
2516 const VPRegionBlock *RegionR = R->getRegion();
2517 if (((RegionL && RegionL->isReplicator()) ||
2518 (RegionR && RegionR->isReplicator())) &&
2519 L->getParent() != R->getParent())
2520 return false;
2521 return L->getScalarType() == R->getScalarType();
2522 }
2523};
2524} // end anonymous namespace
2525
2526 /// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2527 /// Plan.
// NOTE(review): lines 2528 (signature), 2530 (the declaration of CSEMap), 2532
// (the start of the traversal object) and 2534 (the block loop defining VPBB)
// are missing from this listing — confirm upstream.
2529 VPDominatorTree VPDT(Plan);
2531
2533 Plan.getEntry());
2535 for (VPRecipeBase &R : *VPBB) {
2536 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2537 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2538 continue;
2539 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2540 // V must dominate Def for a valid replacement.
// When it does not, Def is neither replaced nor recorded in CSEMap.
2541 if (!VPDT.dominates(V->getParent(), VPBB))
2542 continue;
2543 // Only keep flags present on both V and Def.
2544 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2545 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
// Def's uses are redirected to V; Def itself is not erased here.
2546 Def->replaceAllUsesWith(V);
2547 continue;
2548 }
// First occurrence of this expression: record it as the representative.
2549 CSEMap[Def] = Def;
2550 }
2551 }
2552}
2553
2554 /// Return true if we do not know how to (mechanically) hoist or sink a
2555 /// non-memory or memory recipe \p R out of a loop region. When sinking, passing
2556 /// \p Sinking = true ensures that assumes aren't sunk.
/// \p FirstBB and \p LastBB delimit the range of blocks used for the no-alias
/// check on memory-accessing replicate recipes.
// NOTE(review): lines 2557 (function name and leading parameters) and 2561
// (the last operand of the first condition) are missing from this listing; the
// name is taken from the calls at listing lines 2592 and 2613.
2558 VPBasicBlock *LastBB,
2559 bool Sinking = false) {
// Recipes that are not replicates, or that do not access memory, are decided
// by vputils::cannotHoistOrSinkRecipe alone.
2560 if (!isa<VPReplicateRecipe>(R) || !R.mayReadOrWriteMemory() ||
2562 return vputils::cannotHoistOrSinkRecipe(R, Sinking);
2563
2564 // Check that the memory operation doesn't alias between FirstBB and LastBB.
2565 auto MemLoc = vputils::getMemoryLocation(R);
2566
2567 // TODO: Could make use of SinkStoreInfo::isNoAliasViaDistance by collecting
2568 // stores upfront, and constructing a full SinkStoreInfo.
2569 auto SinkInfo =
2570 Sinking ? std::make_optional(SinkStoreInfo(cast<VPReplicateRecipe>(R)))
2571 : std::nullopt;
2572
// A recipe without a known memory location can never be moved.
2573 return !MemLoc ||
2574 !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB, SinkInfo);
2575}
2576
2577 /// Move loop-invariant recipes out of the vector loop region in \p Plan.
/// Recipes whose operands are all defined outside loop regions are hoisted to
/// the vector preheader; recipes whose users are all in one exit block of the
/// region are sunk into that block.
2578 static void licm(VPlan &Plan) {
2579 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2580
2581 // Hoist any loop invariant recipes from the vector loop region to the
2582 // preheader. Perform a shallow traversal of the vector loop region, to
2583 // exclude recipes in replicate regions. Since the top-level blocks in the
2584 // vector loop region are guaranteed to execute if the vector pre-header is,
2585 // we don't need to check speculation safety.
2586 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2587 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2588 "Expected vector prehader's successor to be the vector loop region");
// NOTE(review): line 2589 (the start of the block loop defining VPBB) is
// missing from this listing.
2590 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2591 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2592 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2593 LoopRegion->getExitingBasicBlock()))
2594 continue;
2595 if (any_of(R.operands(), [](VPValue *Op) {
2596 return !Op->isDefinedOutsideLoopRegions();
2597 }))
2598 continue;
// Hoisted recipes are appended at the end of the preheader.
2599 R.moveBefore(*Preheader, Preheader->end());
2600 }
2601 }
2602
// The dominator tree is only needed by the assert further down.
2603 #ifndef NDEBUG
2604 VPDominatorTree VPDT(Plan);
2605 #endif
2606 // Sink recipes with no users inside the vector loop region if all users are
2607 // in the same exit block of the region.
2608 // TODO: Extend to sink recipes from inner loops.
// NOTE(review): lines 2609 and 2611 (the traversal object and the block loop
// defining VPBB) are missing from this listing.
2610 LoopRegion->getEntry());
2612 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2613 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2614 LoopRegion->getExitingBasicBlock(),
2615 /*Sinking=*/true))
2616 continue;
2617
2618 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2619 assert(!RepR->isPredicated() &&
2620 "Expected prior transformation of predicated replicates to "
2621 "replicate regions");
2622 // narrowToSingleScalarRecipes should have already maximally narrowed
2623 // replicates to single-scalar replicates.
2624 // TODO: When unrolling, replicateByVF doesn't handle sunk
2625 // non-single-scalar replicates correctly.
2626 if (!RepR->isSingleScalar())
2627 continue;
2628
2629 // The pointer operand of stores must be loop-invariant.
2630 if (RepR->getOpcode() == Instruction::Store &&
2631 !RepR->getOperand(1)->isDefinedOutsideLoopRegions())
2632 continue;
2633 }
2634
2635 [[maybe_unused]] auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
2636 assert((!R.mayWriteToMemory() ||
2637 (RepR && RepR->getOpcode() == Instruction::Store &&
2638 RepR->getOperand(1)->isDefinedOutsideLoopRegions())) &&
2639 "The only recipes that may write to memory are expected to be "
2640 "stores with invariant pointer-operand");
2641
2642 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2643 // support recipes with multiple defined values (e.g., interleaved loads).
2644 auto *Def = cast<VPSingleDefRecipe>(&R);
2645
2646 // Cannot sink the recipe if the user is defined in a loop region or a
2647 // non-successor of the vector loop region. Cannot sink if user is a phi
2648 // either.
// The lambda also records the common parent block of all users in SinkBB and
// rejects users spread over more than one block.
2649 VPBasicBlock *SinkBB = nullptr;
2650 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2651 auto *UserR = cast<VPRecipeBase>(U);
2652 VPBasicBlock *Parent = UserR->getParent();
2653 // TODO: Support sinking when users are in multiple blocks.
2654 if (SinkBB && SinkBB != Parent)
2655 return true;
2656 SinkBB = Parent;
2657 // TODO: If the user is a PHI node, we should check the block of
2658 // incoming value. Support PHI node users if needed.
2659 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2660 Parent->getSinglePredecessor() != LoopRegion;
2661 }))
2662 continue;
2663
// A recipe without users is sunk into the region's single successor.
2664 if (!SinkBB)
2665 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2666
2667 // TODO: This will need to be a check instead of an assert after
2668 // conditional branches in vectorized loops are supported.
2669 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2670 "Defining block must dominate sink block");
2671 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2672 // just moving.
2673 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2674 }
2675 }
2676}
2677
// Shrinks recipes whose underlying instruction has an entry in MinBWs to that
// bit width: operands are truncated, the recipe is re-created on the narrow
// type, and the result is zero-extended back to the original type for users.
// NOTE(review): line 2678 (return type and function name) is missing from this
// listing, as are 2686 (the declaration of ProcessedTruncs), 2688-2689 (the
// block traversal defining VPBB) and 2691-2692 (the recipe filter preceding
// the first `continue`) — confirm upstream.
2679 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2680 if (Plan.hasScalarVFOnly())
2681 return;
2682 // Keep track of created truncates, so they can be re-used. Note that we
2683 // cannot use RAUW after creating a new truncate, as this could make
2684 // other uses have different types for their operands, making them invalidly
2685 // typed.
2687 VPBasicBlock *PH = Plan.getVectorPreheader();
2690 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2693 continue;
2694
// Recipes without an entry in MinBWs (lookup yields 0) are left alone.
2695 VPValue *ResultVPV = R.getVPSingleValue();
2696 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2697 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2698 if (!NewResSizeInBits)
2699 continue;
2700
2701 // If the value wasn't vectorized, we must maintain the original scalar
2702 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2703 // skip casts which do not need to be handled explicitly here, as
2704 // redundant casts will be removed during recipe simplification.
// NOTE(review): line 2705 (the condition for this `continue`) is missing from
// this listing, and no NumProcessedRecipes counter appears in the visible
// lines — the comment above may be stale.
2706 continue;
2707
2708 Type *OldResTy = ResultVPV->getScalarType();
2709 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2710 assert(OldResTy->isIntegerTy() && "only integer types supported");
2711 (void)OldResSizeInBits;
2712
2713 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2714
2715 // Any wrapping introduced by shrinking this operation shouldn't be
2716 // considered undefined behavior. So, we can't unconditionally copy
2717 // arithmetic wrapping flags to VPW.
2718 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2719 VPW->dropPoisonGeneratingFlags();
2720
2721 assert((OldResSizeInBits != NewResSizeInBits ||
2722 match(&R, m_ICmp(m_VPValue(), m_VPValue()))) &&
2723 "Only ICmps should not need extending the result.");
2724 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2725
2726 // For loads/intrinsics we don't recreate the recipe; just wrap the
2727 // original wide result in a ZExt to OldResTy.
// NOTE(review): lines 2728 and 2730 (the condition opening this block and the
// creation of Ext) are missing from this listing. Operand 0 of Ext is restored
// after the RAUW because the RAUW also rewrites Ext's own use of ResultVPV.
2729 if (OldResSizeInBits != NewResSizeInBits) {
2731 Instruction::ZExt, ResultVPV, OldResTy);
2732 ResultVPV->replaceAllUsesWith(Ext);
2733 Ext->setOperand(0, ResultVPV);
2734 }
2735 continue;
2736 }
2737
2738 // Shrink operands by introducing truncates as needed.
// For a select, operand 0 (the condition) is skipped.
2739 unsigned StartIdx =
2740 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2741 SmallVector<VPValue *> NewOperands(R.operands());
2742 for (VPValue *&Op : drop_begin(NewOperands, StartIdx)) {
2743 unsigned OpSizeInBits = Op->getScalarType()->getScalarSizeInBits();
2744 if (OpSizeInBits == NewResSizeInBits)
2745 continue;
2746 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
// One truncate per operand is cached in ProcessedTruncs. Truncates of IR
// values are placed in the preheader, all others directly before R.
2747 auto [ProcessedIter, Inserted] = ProcessedTruncs.try_emplace(Op);
2748 if (Inserted) {
2749 VPBuilder Builder;
2750 if (isa<VPIRValue>(Op))
2751 Builder.setInsertPoint(PH);
2752 else
2753 Builder.setInsertPoint(&R);
2754 ProcessedIter->second =
2755 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2756 }
2757 Op = ProcessedIter->second;
2758 }
2759
2760 auto *NWR = cast<VPWidenRecipe>(&R)->cloneWithOperands(NewOperands);
2761 NWR->insertBefore(&R);
2762
2763 // Wrap NWR in a ZExt to preserve the original wide type for downstream
2764 // users (unless this is an ICmp, which produces i1 regardless).
// NOTE(review): line 2768 (the builder expression this call chain starts
// from) is missing from this listing.
2765 VPValue *Replacement = NWR->getVPSingleValue();
2766 if (OldResSizeInBits != NewResSizeInBits)
2767 Replacement =
2769 .createWidenCast(Instruction::ZExt, Replacement, OldResTy)
2770 ->getVPSingleValue();
2771 ResultVPV->replaceAllUsesWith(Replacement);
2772 R.eraseFromParent();
2773 }
2774 }
2775}
2776
/// Remove BranchOnCond terminators whose condition matches a constant true or
/// constant false. For each such branch, the successor on the removed edge has
/// the incoming values for that edge dropped from its phi recipes, the two
/// blocks are disconnected and the terminator is erased. Afterwards, blocks
/// that are no longer reachable are detached from their successors and their
/// recipes are erased.
/// When \p OnlyLatches is set, only blocks that VPBlockUtils::isLatch reports
/// as latches are considered.
/// \returns true if the removed successor of any simplified branch had phi
/// recipes.
2777bool VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
// The dominator tree is only constructed when it is needed for the latch
// check below.
2778 std::optional<VPDominatorTree> VPDT;
2779 if (OnlyLatches)
2780 VPDT.emplace(Plan);
2781
2782 // Collect all blocks before modifying the CFG so we can identify unreachable
2783 // ones after constant branch removal.
2785
2786 bool SimplifiedPhi = false;
2787 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2788 VPValue *Cond;
2789 // Skip blocks that are not terminated by BranchOnCond.
2790 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2791 continue;
2792
2793 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2794 continue;
2795
2796 assert(VPBB->getNumSuccessors() == 2 &&
2797 "Two successors expected for BranchOnCond");
// For a constant-true condition the successor at index 1 is dropped; for a
// constant-false condition the successor at index 0 is dropped. Any other
// condition leaves the branch untouched.
2798 unsigned RemovedIdx;
2799 if (match(Cond, m_True()))
2800 RemovedIdx = 1;
2801 else if (match(Cond, m_False()))
2802 RemovedIdx = 0;
2803 else
2804 continue;
2805
2806 VPBasicBlock *RemovedSucc =
2807 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2808 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2809 "There must be a single edge between VPBB and its successor");
2810 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2811 // these recipes.
2812 auto Phis = RemovedSucc->phis();
2813 for (VPRecipeBase &R : Phis)
2814 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
// Record that at least one phi lost an incoming value; this is the value
// reported to the caller.
2815 SimplifiedPhi |= !std::empty(Phis);
2816
2817 // Disconnect blocks and remove the terminator.
2818 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2819 VPBB->back().eraseFromParent();
2820 }
2821
2822 // Compute which blocks are still reachable from the entry after constant
2823 // branch removal.
2826
2827 // Detach all unreachable blocks from their successors, removing their recipes
2828 // and incoming values from phi recipes.
// Tmp is a placeholder that takes over all uses of values defined in dead
// blocks, so the defining recipes can be erased.
2829 VPSymbolicValue Tmp(nullptr);
2830 for (VPBlockBase *B : AllBlocks) {
2831 if (Reachable.contains(B))
2832 continue;
// Iterate over a copy of the successor list (to_vector) rather than the
// list itself.
2833 for (VPBlockBase *Succ : to_vector(B->successors())) {
2834 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2835 for (VPRecipeBase &R : SuccBB->phis())
2836 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2838 }
2839 for (VPBasicBlock *DeadBB :
2841 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2842 for (VPValue *Def : R.definedValues())
2843 Def->replaceAllUsesWith(&Tmp);
2844 R.eraseFromParent();
2845 }
2846 }
2847 }
2848 return SimplifiedPhi;
2849}
2850
2871
2872// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2873// the loop terminator with a branch-on-cond recipe with the negated
2874// active-lane-mask as operand. Note that this turns the loop into an
2875// uncountable one. Only the existing terminator is replaced, all other existing
2876// recipes/users remain unchanged, except for poison-generating flags being
2877// dropped from the canonical IV increment. Return the created
2878// VPActiveLaneMaskPHIRecipe.
2879//
2880// The function adds the following recipes:
2881//
2882// vector.ph:
2883// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2884// %EntryALM = active-lane-mask %EntryInc, TC
2885//
2886// vector.body:
2887// ...
2888// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2889// ...
2890// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2891// %ALM = active-lane-mask %InLoopInc, TC
2892// %Negated = Not %ALM
2893// branch-on-cond %Negated
2894//
2897 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2898 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2899 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2900 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2901 // TODO: Check if dropping the flags is needed.
2902 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
2903 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2904 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2905 // we have to take unrolling into account. Each part needs to start at
2906 // Part * VF
2907 auto *VecPreheader = Plan.getVectorPreheader();
2908 VPBuilder Builder(VecPreheader);
2909
2910 // Create the ActiveLaneMask instruction using the correct start values.
2911 VPValue *TC = Plan.getTripCount();
2912 VPValue *VF = &Plan.getVF();
2913
2914 auto *EntryIncrement = Builder.createOverflowingOp(
2915 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2916 DL, "index.part.next");
2917
2918 // Create the active lane mask instruction in the VPlan preheader.
2919 VPValue *ALMMultiplier =
2920 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2921 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2922 {EntryIncrement, TC, ALMMultiplier}, DL,
2923 "active.lane.mask.entry");
2924
2925 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2926 // preheader ActiveLaneMask instruction.
2927 auto *LaneMaskPhi =
2929 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
2930 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
2931
2932 // Create the active lane mask for the next iteration of the loop before the
2933 // original terminator.
2934 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2935 Builder.setInsertPoint(OriginalTerminator);
2936 auto *InLoopIncrement = Builder.createOverflowingOp(
2938 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2939 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2940 {InLoopIncrement, TC, ALMMultiplier}, DL,
2941 "active.lane.mask.next");
2942 LaneMaskPhi->addBackedgeValue(ALM);
2943
2944 // Replace the original terminator with BranchOnCond. We have to invert the
2945 // mask here because a true condition means jumping to the exit block.
2946 auto *NotMask = Builder.createNot(ALM, DL);
2947 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2948 OriginalTerminator->eraseFromParent();
2949 return LaneMaskPhi;
2950}
2951
2953 bool UseActiveLaneMaskForControlFlow) {
2954 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2955 auto *WideCanonicalIV =
2957 assert(WideCanonicalIV &&
2958 "Must have widened canonical IV when tail folding!");
2959 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2960 VPSingleDefRecipe *LaneMask;
2961 if (UseActiveLaneMaskForControlFlow) {
2962 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2963 } else {
2964 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2965 VPValue *ALMMultiplier =
2966 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2967 LaneMask =
2968 B.createNaryOp(VPInstruction::ActiveLaneMask,
2969 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2970 nullptr, "active.lane.mask");
2971 }
2972
2973 // Walk users of WideCanonicalIV and replace the header mask of the form
2974 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2975 // removing the old one to ensure there is always only a single header mask.
2976 HeaderMask->replaceAllUsesWith(LaneMask);
2977 HeaderMask->eraseFromParent();
2978}
2979
/// Pattern matcher that recognizes the mask \p In either on its own or
/// combined with another value through a logical-and. The constructor stores
/// \p In and binds Out to the caller-provided variable, which match() writes.
2980template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2981 Op0_t In;
2983
2984 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2985
/// Returns true if \p V is exactly In (Out is then set to nullptr), or if
/// \p V matches a logical-and of In and some other value (Out then captures
/// that other value). Returns false otherwise.
2986 template <typename OpTy> bool match(OpTy *V) const {
// V is the mask itself: nothing remains once In is removed.
2987 if (m_Specific(In).match(V)) {
2988 Out = nullptr;
2989 return true;
2990 }
// Otherwise V must be (logical-and In, X); X is captured in Out.
2991 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2992 }
2993};
2994
2995/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2996/// Returns the remaining part \p Out if so, or nullptr otherwise.
2997template <typename Op0_t, typename Op1_t>
2998static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2999 Op1_t &Out) {
3000 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3001}
3002
3003static std::optional<Intrinsic::ID> getVPDivRemIntrinsic(Intrinsic::ID IntrID) {
3004 switch (IntrID) {
3005 case Intrinsic::masked_udiv:
3006 return Intrinsic::vp_udiv;
3007 case Intrinsic::masked_sdiv:
3008 return Intrinsic::vp_sdiv;
3009 case Intrinsic::masked_urem:
3010 return Intrinsic::vp_urem;
3011 case Intrinsic::masked_srem:
3012 return Intrinsic::vp_srem;
3013 default:
3014 return std::nullopt;
3015 }
3016}
3017
3018/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3019/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3020/// recipe could be created.
3021/// \p HeaderMask Header Mask.
3022/// \p CurRecipe Recipe to be transformed.
3023/// \p EVL The explicit vector length parameter of vector-predication
3024/// intrinsics.
3026 VPRecipeBase &CurRecipe, VPValue &EVL) {
3027 VPlan *Plan = CurRecipe.getParent()->getPlan();
3028 DebugLoc DL = CurRecipe.getDebugLoc();
3029 VPValue *Addr, *Mask, *EndPtr;
3030
3031 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3032 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3033 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3034 EVLEndPtr->insertBefore(&CurRecipe);
3035 // Cast EVL (i32) to match the VF operand's type.
3036 VPValue *EVLAsVF = VPBuilder(EVLEndPtr).createScalarZExtOrTrunc(
3037 &EVL, EVLEndPtr->getOperand(1)->getScalarType(), EVL.getScalarType(),
3039 EVLEndPtr->setOperand(1, EVLAsVF);
3040 return EVLEndPtr;
3041 };
3042
3043 auto GetVPReverse = [&CurRecipe, &EVL, Plan,
3045 if (!V)
3046 return nullptr;
3047 auto *Reverse = new VPWidenIntrinsicRecipe(
3048 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
3049 V->getScalarType(), {}, {}, DL);
3050 Reverse->insertBefore(&CurRecipe);
3051 return Reverse;
3052 };
3053
3054 if (match(&CurRecipe,
3055 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
3056 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3057 EVL, Mask);
3058
3059 if (match(&CurRecipe,
3060 m_MaskedLoad(m_VPValue(EndPtr),
3061 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3062 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3063 Mask = GetVPReverse(Mask);
3064 Addr = AdjustEndPtr(EndPtr);
3065 auto *LoadR = new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
3066 Addr, EVL, Mask);
3067 LoadR->insertBefore(&CurRecipe);
3068 VPValue *Poison = Plan->getPoison(LoadR->getScalarType());
3069 return new VPWidenIntrinsicRecipe(Intrinsic::vector_splice_left,
3070 {Poison, LoadR, &EVL},
3071 LoadR->getScalarType(), {}, {}, DL);
3072 }
3073
3074 VPValue *Stride;
3076 m_VPValue(Addr), m_VPValue(Stride),
3077 m_RemoveMask(HeaderMask, Mask),
3078 m_TruncOrSelf(m_Specific(&Plan->getVF()))))) {
3079 if (!Mask)
3080 Mask = Plan->getTrue();
3081 auto *NewLoad = cast<VPWidenMemIntrinsicRecipe>(&CurRecipe)->clone();
3082 NewLoad->setOperand(2, Mask);
3083 NewLoad->setOperand(3, &EVL);
3084 return NewLoad;
3085 }
3086
3087 VPValue *StoredVal;
3088 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3089 m_RemoveMask(HeaderMask, Mask))))
3090 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3091 StoredVal, EVL, Mask);
3092
3093 if (match(&CurRecipe,
3094 m_MaskedStore(m_VPValue(EndPtr), m_VPValue(StoredVal),
3095 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3096 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3097 Mask = GetVPReverse(Mask);
3098 Addr = AdjustEndPtr(EndPtr);
3099 VPValue *Poison = Plan->getPoison(StoredVal->getScalarType());
3100 auto *SpliceR = new VPWidenIntrinsicRecipe(
3101 Intrinsic::vector_splice_right, {StoredVal, Poison, &EVL},
3102 StoredVal->getScalarType(), {}, {}, DL);
3103 SpliceR->insertBefore(&CurRecipe);
3104 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3105 SpliceR, EVL, Mask);
3106 }
3107
3108 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3109 if (Rdx->isConditional() &&
3110 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3111 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3112
3113 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3114 if (Interleave->getMask() &&
3115 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3116 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3117
3118 VPValue *LHS, *RHS;
3119 if (match(&CurRecipe, m_SelectLike(m_RemoveMask(HeaderMask, Mask),
3121 return new VPWidenIntrinsicRecipe(
3122 Intrinsic::vp_merge, {Mask ? Mask : Plan->getTrue(), LHS, RHS, &EVL},
3123 LHS->getScalarType(), {}, {}, DL);
3124
3125 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3126 Type *Ty = CurRecipe.getVPSingleValue()->getScalarType();
3127 VPValue *ZExt =
3128 VPBuilder(&CurRecipe)
3129 .createScalarZExtOrTrunc(&EVL, Ty, EVL.getScalarType(), DL);
3130 return new VPInstruction(
3131 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3132 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3133 }
3134
3135 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
3136 if (match(&CurRecipe,
3138 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
3139 return new VPWidenIntrinsicRecipe(Intrinsic::vp_merge,
3140 {RHS, Plan->getTrue(), LHS, &EVL},
3141 LHS->getScalarType(), {}, {}, DL);
3142
3143 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(&CurRecipe))
3144 if (auto VPID = getVPDivRemIntrinsic(IntrR->getVectorIntrinsicID()))
3145 if (match(IntrR->getOperand(2), m_RemoveMask(HeaderMask, Mask)))
3146 return new VPWidenIntrinsicRecipe(*VPID,
3147 {IntrR->getOperand(0),
3148 IntrR->getOperand(1),
3149 Mask ? Mask : Plan->getTrue(), &EVL},
3150 IntrR->getScalarType(), {}, {}, DL);
3151
3152 return nullptr;
3153}
3154
3155/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3156/// The transforms here need to preserve the original semantics.
3158 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3159 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3162 m_VPValue(EVL))) &&
3163 match(EVL, m_EVL(m_VPValue()))) {
3164 HeaderMask = R.getVPSingleValue();
3165 break;
3166 }
3167 }
3168 if (!HeaderMask)
3169 return;
3170
3171 SmallVector<VPRecipeBase *> OldRecipes;
3172 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3174 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, *EVL)) {
3175 NewR->insertBefore(R);
3176 for (auto [Old, New] :
3177 zip_equal(R->definedValues(), NewR->definedValues()))
3178 Old->replaceAllUsesWith(New);
3179 OldRecipes.push_back(R);
3180 }
3181 }
3182
3183 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3184 // False, EVL)
3185 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3186 VPValue *Mask;
3187 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3188 auto *LogicalAnd = cast<VPInstruction>(U);
3189 auto *Merge = new VPWidenIntrinsicRecipe(
3190 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3191 Mask->getScalarType(), {}, {}, LogicalAnd->getDebugLoc());
3192 Merge->insertBefore(LogicalAnd);
3193 LogicalAnd->replaceAllUsesWith(Merge);
3194 OldRecipes.push_back(LogicalAnd);
3195 }
3196 }
3197
3198 // Fold the following splice patterns:
3199 // splice.right(splice.left(poison, x, evl), poison, evl) -> x
3200 // vector.reverse(splice.left(poison, x, evl)) -> vp.reverse(x, true, evl)
3201 // splice.right(vector.reverse(x), poison, evl) -> vp.reverse(x, true, evl)
3202 for (VPUser *U : collectUsersRecursively(EVL)) {
3203 auto *R = cast<VPRecipeBase>(U);
3204 VPValue *X;
3207 m_Poison(), m_VPValue(X), m_Specific(EVL)),
3208 m_Poison(), m_Specific(EVL)))) {
3209 R->getVPSingleValue()->replaceAllUsesWith(X);
3210 OldRecipes.push_back(R);
3211 continue;
3212 }
3213
3214 if (!match(U,
3217 m_Poison(), m_VPValue(X), m_Specific(EVL))),
3219 m_Reverse(m_VPValue(X)), m_Poison(), m_Specific(EVL)))))
3220 continue;
3221
3222 auto *VPReverse = new VPWidenIntrinsicRecipe(
3223 Intrinsic::experimental_vp_reverse, {X, Plan.getTrue(), EVL},
3224 X->getScalarType(), {}, {}, R->getDebugLoc());
3225 VPReverse->insertBefore(R);
3226 R->getVPSingleValue()->replaceAllUsesWith(VPReverse);
3227 OldRecipes.push_back(R);
3228 }
3229
3230 for (VPRecipeBase *R : reverse(OldRecipes)) {
3231 SmallVector<VPValue *> PossiblyDead(R->operands());
3232 R->eraseFromParent();
3233 for (VPValue *Op : PossiblyDead)
3235 }
3236}
3237
3238/// After replacing the canonical IV with an EVL-based IV, fix up recipes that use
3239/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3240/// iteration.
3241static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3242 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3243 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3244
3245 // EVL is i32 but VF/VFxUF are IdxTy. Convert as needed.
3246 VPValue *EVLAsIdx =
3250
3251 assert(all_of(Plan.getVF().users(),
3252 [&Plan](VPUser *U) {
3253 auto IsAllowedUser =
3254 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
3255 VPWidenIntOrFpInductionRecipe,
3256 VPWidenMemIntrinsicRecipe>;
3257 if (match(U, m_Trunc(m_Specific(&Plan.getVF()))))
3258 return all_of(cast<VPSingleDefRecipe>(U)->users(),
3259 IsAllowedUser);
3260 return IsAllowedUser(U);
3261 }) &&
3262 "User of VF that we can't transform to EVL.");
3263 Plan.getVF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3265 });
3266
3267 assert(all_of(Plan.getVFxUF().users(),
3269 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3270 m_Specific(&Plan.getVFxUF())),
3272 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3273 "increment of the canonical induction.");
3274 Plan.getVFxUF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3275 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3276 // canonical induction must not be updated.
3278 });
3279
3280 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3281 // contained.
3282 bool ContainsFORs =
3284 if (ContainsFORs) {
3285 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3286 VPValue *MaxEVL = &Plan.getVF();
3287 // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
3288 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3289 MaxEVL = Builder.createScalarZExtOrTrunc(
3290 MaxEVL, Type::getInt32Ty(Plan.getContext()), MaxEVL->getScalarType(),
3292
3293 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3294 VPValue *PrevEVL = Builder.createScalarPhi(
3295 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3296
3299 for (VPRecipeBase &R : *VPBB) {
3300 VPValue *V1, *V2;
3301 if (!match(&R,
3303 m_VPValue(V1), m_VPValue(V2))))
3304 continue;
3305 VPValue *Imm = Plan.getOrAddLiveIn(
3308 Intrinsic::experimental_vp_splice,
3309 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3310 R.getVPSingleValue()->getScalarType(), {}, {}, R.getDebugLoc());
3311 VPSplice->insertBefore(&R);
3312 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3313 }
3314 }
3315 }
3316
3317 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3318 if (!HeaderMask)
3319 return;
3320
3321 // Ensure that any reduction that uses a select to mask off tail lanes does so
3322 // in the vector loop, not the middle block, since EVL tail folding can have
3323 // tail elements in the penultimate iteration.
3324 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3325 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3326 m_VPValue(), m_VPValue()))))
3327 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3328 Plan.getVectorLoopRegion();
3329 return true;
3330 }));
3331
3332 // Replace header masks with a mask equivalent to predicating by EVL:
3333 //
3334 // icmp ule widen-canonical-iv backedge-taken-count
3335 // ->
3336 // icmp ult step-vector, EVL
3337 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3338 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3339 Type *EVLType = EVL.getScalarType();
3340 VPValue *EVLMask = Builder.createICmp(
3342 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3343 HeaderMask->replaceAllUsesWith(EVLMask);
3344}
3345
3346/// Converts a tail folded vector loop region to step by
3347/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3348/// iteration.
3349///
3350/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3351/// replace all uses of the canonical IV except for the canonical IV
3352/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3353/// only for loop iterations counting after this transformation.
3354///
3355/// - The header mask is replaced with a header mask based on the EVL.
3356///
3357/// - Plans with FORs have a new phi added to keep track of the EVL of the
3358/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3359/// @llvm.vp.splice.
3360///
3361/// The function uses the following definitions:
3362/// %StartV is the canonical induction start value.
3363///
3364/// The function adds the following recipes:
3365///
3366/// vector.ph:
3367/// ...
3368///
3369/// vector.body:
3370/// ...
3371/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3372/// [ %NextIter, %vector.body ]
3373/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3374/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3375/// ...
3376/// %OpEVL = cast i32 %VPEVL to IVSize
3377/// %NextIter = add IVSize %OpEVL, %CurrentIter
3378/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3379/// ...
3380///
3381/// If MaxSafeElements is provided, the function adds the following recipes:
3382/// vector.ph:
3383/// ...
3384///
3385/// vector.body:
3386/// ...
3387/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3388/// [ %NextIter, %vector.body ]
3389/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3390/// %cmp = cmp ult %AVL, MaxSafeElements
3391/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3392/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3393/// ...
3394/// %OpEVL = cast i32 %VPEVL to IVSize
3395/// %NextIter = add IVSize %OpEVL, %CurrentIter
3396/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3397/// ...
3398///
3400 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3401 if (Plan.hasScalarVFOnly())
3402 return;
3403 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3404 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3405
3406 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3407 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3408 VPValue *StartV = Plan.getZero(CanIVTy);
3409 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3410
3411 // Create the CurrentIteration recipe in the vector loop.
3412 auto *CurrentIteration =
3414 CurrentIteration->insertBefore(*Header, Header->begin());
3415 VPBuilder Builder(Header, Header->getFirstNonPhi());
3416 // Create the AVL (application vector length), starting from TC -> 0 in steps
3417 // of EVL.
3418 VPPhi *AVLPhi = Builder.createScalarPhi(
3419 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3420 VPValue *AVL = AVLPhi;
3421
3422 if (MaxSafeElements) {
3423 // Support for MaxSafeDist for correct loop emission.
3424 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3425 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3426 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3427 "safe_avl");
3428 }
3429 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3430 DebugLoc::getUnknown(), "evl");
3431
3432 Builder.setInsertPoint(CanonicalIVIncrement);
3433 VPValue *OpVPEVL = VPEVL;
3434
3435 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3436 OpVPEVL = Builder.createScalarZExtOrTrunc(
3437 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3438
3439 auto *NextIter = Builder.createAdd(
3440 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3441 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3442 CurrentIteration->addBackedgeValue(NextIter);
3443
3444 VPValue *NextAVL =
3445 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3446 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3447 AVLPhi->addIncoming(NextAVL);
3448
3449 fixupVFUsersForEVL(Plan, *VPEVL);
3450 removeDeadRecipes(Plan);
3451
3452 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3453 // except for the canonical IV increment.
3454 CanonicalIV->replaceAllUsesWith(CurrentIteration);
3455 CanonicalIVIncrement->setOperand(0, CanonicalIV);
3456 // TODO: support unroll factor > 1.
3457 Plan.setUF(1);
3458}
3459
3461 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3462 // There should be only one VPCurrentIteration in the entire plan.
3463 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3464
3467 for (VPRecipeBase &R : VPBB->phis())
3468 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3469 assert(!CurrentIteration &&
3470 "Found multiple CurrentIteration. Only one expected");
3471 CurrentIteration = PhiR;
3472 }
3473
3474 // Early return if it is not variable-length stepping.
3475 if (!CurrentIteration)
3476 return;
3477
3478 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3479 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3480
3481 // Convert CurrentIteration to concrete recipe.
3482 auto *ScalarR =
3483 VPBuilder(CurrentIteration)
3485 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3486 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3487 CurrentIteration->replaceAllUsesWith(ScalarR);
3488 CurrentIteration->eraseFromParent();
3489
3490 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
3491 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3492 if (auto *CanIVInc = findUserOf(
3493 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3494 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3495 CanIVInc->eraseFromParent();
3496 }
3497}
3498
3500 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3501 if (!LoopRegion)
3502 return;
3503 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3504 if (Header->empty())
3505 return;
3506 // The EVL IV is always at the beginning.
3507 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3508 if (!EVLPhi)
3509 return;
3510
3511 // Bail if not an EVL tail folded loop.
3512 VPValue *AVL;
3513 if (!match(EVLPhi->getBackedgeValue(),
3514 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3515 return;
3516
3517 // The AVL may be capped to a safe distance.
3518 VPValue *SafeAVL, *UnsafeAVL;
3519 if (match(AVL,
3521 m_VPValue(SafeAVL)),
3522 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3523 AVL = UnsafeAVL;
3524
3525 VPValue *AVLNext;
3526 [[maybe_unused]] bool FoundAVLNext =
3528 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3529 assert(FoundAVLNext && "Didn't find AVL backedge?");
3530
3531 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3532 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3533 if (match(LatchBr, m_BranchOnCond(m_True())))
3534 return;
3535
3536 VPValue *CanIVInc;
3537 [[maybe_unused]] bool FoundIncrement = match(
3538 LatchBr,
3540 m_Specific(&Plan.getVectorTripCount()))));
3541 assert(FoundIncrement &&
3542 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3543 m_Specific(&Plan.getVFxUF()))) &&
3544 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3545 "trip count");
3546
3547 Type *AVLTy = AVLNext->getScalarType();
3548 VPBuilder Builder(LatchBr);
3549 LatchBr->setOperand(
3550 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3551}
3552
3554 VPlan &Plan, PredicatedScalarEvolution &PSE,
3555 const DenseMap<Value *, const SCEV *> &StridesMap,
3556 const VPDominatorTree &VPDT) {
3557 // Replace VPValues for known constant strides guaranteed by predicated scalar
3558 // evolution that are guaranteed to be guarded by the runtime checks; that is,
3559 // blocks dominated by the vector preheader.
3560 assert(!Plan.getVectorLoopRegion() &&
3561 "expected to run before loop regions are created");
3562 VPBlockBase *Preheader = Plan.getEntry()->getSuccessors()[1];
3563 auto CanUseVersionedStride = [&VPDT, Preheader](VPUser &U, unsigned) {
3564 auto *R = cast<VPRecipeBase>(&U);
3565 VPBlockBase *Parent = R->getParent();
3566 return VPDT.dominates(Preheader, Parent);
3567 };
3568 ValueToSCEVMapTy RewriteMap;
3569 for (const SCEV *Stride : StridesMap.values()) {
3570 using namespace SCEVPatternMatch;
3571 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3572 const APInt *StrideConst;
3573 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3574 // Only handle constant strides for now.
3575 continue;
3576
3577 auto *CI = Plan.getConstantInt(*StrideConst);
3578 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3579 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3580
3581 // The versioned value may not be used in the loop directly but through a
3582 // sext/zext. Add new live-ins in those cases.
3583 for (Value *U : StrideV->users()) {
3585 continue;
3586 VPValue *StrideVPV = Plan.getLiveIn(U);
3587 if (!StrideVPV)
3588 continue;
3589 unsigned BW = U->getType()->getScalarSizeInBits();
3590 APInt C =
3591 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3592 VPValue *CI = Plan.getConstantInt(C);
3593 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3594 }
3595 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3596 }
3597
3598 for (VPRecipeBase &R : *Plan.getEntry()) {
3599 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3600 if (!ExpSCEV)
3601 continue;
3602 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3603 auto *NewSCEV =
3604 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3605 if (NewSCEV != ScevExpr) {
3606 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3607 ExpSCEV->replaceAllUsesWith(NewExp);
3608 if (Plan.getTripCount() == ExpSCEV)
3609 Plan.resetTripCount(NewExp);
3610 }
3611 }
3612}
3613
// NOTE(review): the listing numbers embedded in this block skip 3614, 3617,
// 3619, 3633, 3659-3660 and 3685-3686; the source lines they belonged to
// (presumably including the function signature and the declarations of
// `Visited` and `Worklist`) are absent from this listing.
3615 // Collect recipes in the backward slice of `Root` that may generate a poison
3616 // value that is used after vectorization.
3618 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3620 Worklist.push_back(Root);
3621
3622 // Traverse the backward slice of Root through its use-def chain.
3623 while (!Worklist.empty()) {
3624 VPRecipeBase *CurRec = Worklist.pop_back_val();
3625
// Each recipe is processed at most once.
3626 if (!Visited.insert(CurRec).second)
3627 continue;
3628
3629 // Prune search if we find another recipe generating a widen memory
3630 // instruction. Widen memory instructions involved in address computation
3631 // will lead to gather/scatter instructions, which don't need to be
3632 // handled.
3634 VPHeaderPHIRecipe>(CurRec))
3635 continue;
3636
3637 // This recipe contributes to the address computation of a widen
3638 // load/store. If the underlying instruction has poison-generating flags,
3639 // drop them directly.
3640 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3641 VPValue *A, *B;
3642 // Dropping disjoint from an OR may yield incorrect results, as some
3643 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3644 // for dependence analysis). Instead, replace it with an equivalent Add.
3645 // This is possible as all users of the disjoint OR only access lanes
3646 // where the operands are disjoint or poison otherwise.
3647 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3648 RecWithFlags->isDisjoint()) {
3649 VPBuilder Builder(RecWithFlags);
3650 VPInstruction *New =
3651 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3652 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3653 RecWithFlags->replaceAllUsesWith(New);
3654 RecWithFlags->eraseFromParent();
// The original recipe has just been erased; continue the walk from its
// replacement so that the operand scan below uses a live recipe.
3655 CurRec = New;
3656 } else
3657 RecWithFlags->dropPoisonGeneratingFlags();
3658 } else {
3661 (void)Instr;
3662 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3663 "found instruction with poison generating flags not covered by "
3664 "VPRecipeWithIRFlags");
3665 }
3666
3667 // Add new definitions to the worklist.
3668 for (VPValue *Operand : CurRec->operands())
3669 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3670 Worklist.push_back(OpDef);
3671 }
3672 });
3673
3674 // We want to exclude the tail folding case, as we don't need to drop flags
3675 // for operations computing the first lane in this case: the first lane of the
3676 // header mask must always be true.
3677 auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
3678 return Mask && !vputils::isHeaderMask(Mask, Plan);
3679 };
3680
3681 // Traverse all the recipes in the VPlan and collect the poison-generating
3682 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3683 // VPInterleaveRecipe.
3684 auto Iter =
3687 for (VPRecipeBase &Recipe : *VPBB) {
3688 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3689 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
// Widened memory recipes are only considered when they are consecutive and
// carry a mask that is not the header mask.
3690 if (AddrDef && WidenRec->isConsecutive() &&
3691 IsNotHeaderMask(WidenRec->getMask()))
3692 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3693 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3694 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3695 if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
3696 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3697 }
3698 }
3699 }
3700}
3701
// NOTE(review): the listing numbers embedded in this block skip 3702, 3704,
// 3710, 3712-3713 and 3757; the source lines they belonged to (presumably
// including the start of the signature and the declarations of
// `IRMemberToRecipe` and `NW`) are absent from this listing.
3703 VPlan &Plan,
3705 &InterleaveGroups,
3706 const bool &EpilogueAllowed) {
3707 if (InterleaveGroups.empty())
3708 return;
3709
// Map every widened memory recipe's ingredient to the recipe itself, so group
// members can be looked up by their IR instruction below.
3711 for (VPBasicBlock *VPBB :
3714 for (VPRecipeBase &R : make_filter_range(*VPBB, [](VPRecipeBase &R) {
3715 return isa<VPWidenMemoryRecipe>(&R);
3716 })) {
3717 auto *MemR = cast<VPWidenMemoryRecipe>(&R);
3718 IRMemberToRecipe[&MemR->getIngredient()] = MemR;
3719 }
3720
3721 // Interleave memory: for each Interleave Group we marked earlier as relevant
3722 // for this VPlan, replace the Recipes widening its memory instructions with a
3723 // single VPInterleaveRecipe at its insertion point.
3724 VPDominatorTree VPDT(Plan);
3725 for (const auto *IG : InterleaveGroups) {
3726 // Skip interleave groups where members don't have recipes. This can happen
3727 // when removeDeadRecipes removes recipes that are part of interleave groups
3728 // but have no users.
3729 if (llvm::any_of(IG->members(), [&IRMemberToRecipe](Instruction *Member) {
3730 return !IRMemberToRecipe.contains(Member);
3731 }))
3732 continue;
3733
// Member 0 seeds the metadata and, when it is a store, the stored-value list;
// the remaining members are folded in by the loop below.
3734 auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
3735 VPIRMetadata InterleaveMD(*Start);
3736 SmallVector<VPValue *, 4> StoredValues;
3737 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start->getAsRecipe()))
3738 StoredValues.push_back(StoreR->getStoredValue());
3739 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3740 Instruction *MemberI = IG->getMember(I);
// A null member denotes a gap in the group at this index.
3741 if (!MemberI)
3742 continue;
3743 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
3744 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR->getAsRecipe()))
3745 StoredValues.push_back(StoreR->getStoredValue());
3746 InterleaveMD.intersect(*MemoryR);
3747 }
3748
// A gap mask is requested when the group requires a scalar epilogue that is
// not allowed, or when the group stores values but is not full.
3749 bool NeedsMaskForGaps =
3750 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3751 (!StoredValues.empty() && !IG->isFull());
3752
3753 Instruction *IRInsertPos = IG->getInsertPos();
3754 auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
3755 VPRecipeBase *InsertPosR = InsertPos->getAsRecipe();
3756
// If the insert position's pointer operand (after stripping casts) is a GEP,
// reuse its no-wrap flags with the no-unsigned-wrap flag removed.
3758 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3759 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3760 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3761
3762 // Get or create the start address for the interleave group.
3763 VPValue *Addr = Start->getAddr();
3764 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3765 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPosR)) {
3766 // We cannot re-use the address of member zero because it does not
3767 // dominate the insert position. Instead, use the address of the insert
3768 // position and create a PtrAdd adjusting it to the address of member
3769 // zero.
3770 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3771 // InsertPos or sink loads above zero members to join it.
3772 assert(IG->getIndex(IRInsertPos) != 0 &&
3773 "index of insert position shouldn't be zero");
3774 auto &DL = IRInsertPos->getDataLayout();
// The byte offset of the insert position within the group is its member index
// times the alloc size of its access type; it is negated to step back to
// member zero.
3775 APInt Offset(32,
3776 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3777 IG->getIndex(IRInsertPos),
3778 /*IsSigned=*/true);
3779 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3780 VPBuilder B(InsertPosR);
3781 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3782 }
3783 // If the group is reverse, adjust the index to refer to the last vector
3784 // lane instead of the first. We adjust the index from the first vector
3785 // lane, rather than directly getting the pointer for lane VF - 1, because
3786 // the pointer operand of the interleaved access is supposed to be uniform.
3787 if (IG->isReverse()) {
3788 auto *ReversePtr = new VPVectorEndPointerRecipe(
3789 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3790 -(int64_t)IG->getFactor(), NW, InsertPosR->getDebugLoc());
3791 ReversePtr->insertBefore(InsertPosR);
3792 Addr = ReversePtr;
3793 }
3794 auto *VPIG = new VPInterleaveRecipe(
3795 IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps,
3796 InterleaveMD, InsertPosR->getDebugLoc());
3797 VPIG->insertBefore(InsertPosR);
3798
// Redirect users of each member that produces a value to the matching result
// of the interleave recipe, then erase the member recipe. J advances only for
// members with a non-void type.
3799 unsigned J = 0;
3800 for (unsigned i = 0; i < IG->getFactor(); ++i)
3801 if (Instruction *Member = IG->getMember(i)) {
3802 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member)->getAsRecipe();
3803 if (!Member->getType()->isVoidTy()) {
3804 VPValue *OriginalV = MemberR->getVPSingleValue();
3805 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3806 J++;
3807 }
3808 MemberR->eraseFromParent();
3809 }
3810 }
3811}
3812
3813/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3814/// value, phi and backedge value. In the following example:
3815///
3816/// vector.ph:
3817/// Successor(s): vector loop
3818///
3819/// <x1> vector loop: {
3820/// vector.body:
3821/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3822/// ...
3823/// EMIT branch-on-count ...
3824/// No successors
3825/// }
3826///
3827/// WIDEN-INDUCTION will get expanded to:
3828///
3829/// vector.ph:
3830/// ...
3831/// vp<%induction.start> = ...
3832/// vp<%induction.increment> = ...
3833///
3834/// Successor(s): vector loop
3835///
3836/// <x1> vector loop: {
3837/// vector.body:
3838/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3839/// ...
3840/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3841/// EMIT branch-on-count ...
3842/// No successors
3843/// }
// NOTE(review): the listing numbers embedded in this block skip 3845,
// 3857-3858, 3880 and 3923; the source lines they belonged to (presumably
// including the function name/parameters and the declarations of `AddOp`,
// `MulOp` and `ExitingBB`) are absent from this listing.
3844static void
3846 VPlan *Plan = WidenIVR->getParent()->getPlan();
3847 VPValue *Start = WidenIVR->getStartValue();
3848 VPValue *Step = WidenIVR->getStepValue();
3849 VPValue *VF = WidenIVR->getVFValue();
3850 DebugLoc DL = WidenIVR->getDebugLoc();
3851
3852 // The value from the original loop to which we are mapping the new induction
3853 // variable.
3854 Type *Ty = WidenIVR->getScalarType();
3855
3856 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3859 VPIRFlags Flags = *WidenIVR;
// Integer inductions use Add/Mul; otherwise the add opcode is taken from the
// induction descriptor and the multiply is FMul.
3860 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3861 AddOp = Instruction::Add;
3862 MulOp = Instruction::Mul;
3863 } else {
3864 AddOp = ID.getInductionOpcode();
3865 MulOp = Instruction::FMul;
3866 }
3867
3868 // If the phi is truncated, truncate the start and step values.
3869 VPBuilder Builder(Plan->getVectorPreheader());
3870 Type *StepTy = Step->getScalarType();
3871 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3872 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3873 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3874 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3875 StepTy = Ty;
3876 }
3877
3878 // Construct the initial value of the vector IV in the vector loop preheader.
3879 Type *IVIntTy =
3881 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
// The step vector is created with an integer type and converted when the step
// is floating point.
3882 if (StepTy->isFloatingPointTy())
3883 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3884
3885 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3886 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3887
// Init = splat(Start) AddOp (step-vector MulOp splat(Step)).
3888 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3889 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3890 DebugLoc::getUnknown(), "induction");
3891
3892 // Create the widened phi of the vector IV.
3893 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3894 Init, WidenIVR->getDebugLoc(), "vec.ind");
3895
3896 // Create the backedge value for the vector IV.
3897 VPValue *Inc;
3898 VPValue *Prev;
3899 // If unrolled, use the increment and prev value from the operands.
3900 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3901 Inc = SplatVF;
3902 Prev = WidenIVR->getLastUnrolledPartOperand();
3903 } else {
3904 // Move the insertion point after the VF definition when the VF is defined
3905 // inside a loop, such as for EVL tail-folding.
3906 if (VPRecipeBase *R = VF->getDefiningRecipe())
3907 if (R->getParent()->getEnclosingLoopRegion())
3908 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3909
3910 // Multiply the vectorization factor by the step using integer or
3911 // floating-point arithmetic as appropriate.
3912 if (StepTy->isFloatingPointTy())
3913 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3914 DL);
3915 else
3916 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3917
3918 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3919 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3920 Prev = WidePHI;
3921 }
3922
// The backedge value is emitted immediately before the terminator of
// ExitingBB.
3924 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3925 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3926 WidenIVR->getDebugLoc(), "vec.ind.next");
3927
3928 WidePHI->addIncoming(Next);
3929
// All users of the widened induction now use the new phi; this function does
// not erase WidenIVR itself.
3930 WidenIVR->replaceAllUsesWith(WidePHI);
3931}
3932
3933/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3934/// initial value, phi and backedge value. In the following example:
3935///
3936/// <x1> vector loop: {
3937/// vector.body:
3938/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3939/// ...
3940/// EMIT branch-on-count ...
3941/// }
3942///
3943/// WIDEN-POINTER-INDUCTION will get expanded to:
3944///
3945/// <x1> vector loop: {
3946/// vector.body:
3947/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3948/// EMIT %mul = mul %stepvector, %step
3949/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3950/// ...
3951/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3952/// EMIT branch-on-count ...
3953/// }
// NOTE(review): the listing numbers embedded in this block skip 3954, 3961
// and 3984; the source lines they belonged to (presumably including the
// function signature and the declaration of `ExitingBB`) are absent from this
// listing.
3955 VPlan *Plan = R->getParent()->getPlan();
3956 VPValue *Start = R->getStartValue();
3957 VPValue *Step = R->getStepValue();
3958 VPValue *VF = R->getVFValue();
3959
3960 assert(R->getInductionDescriptor().getKind() ==
3962 "Not a pointer induction according to InductionDescriptor!");
3963 assert(R->getScalarType()->isPointerTy() && "Unexpected type.");
3964 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3965 "Recipe should have been replaced");
3966
3967 VPBuilder Builder(R);
3968 DebugLoc DL = R->getDebugLoc();
3969
3970 // Build a scalar pointer phi.
3971 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3972
3973 // Create actual address geps that use the pointer phi as base and a
3974 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3975 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3976 Type *StepTy = Step->getScalarType();
3977 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3978 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3979 VPValue *PtrAdd =
3980 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
// Users of the pointer induction now use the wide pointer add; this function
// does not erase R itself.
3981 R->replaceAllUsesWith(PtrAdd);
3982
3983 // Create the backedge value for the scalar pointer phi.
3985 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
// The per-iteration increment is Step * VF, with VF first converted to the
// step type.
3986 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3987 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3988
3989 VPValue *InductionGEP =
3990 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3991 ScalarPtrPhi->addIncoming(InductionGEP);
3992}
3993
3994/// Expand a VPDerivedIVRecipe into executable recipes.
// NOTE(review): the listing numbers embedded in this block skip 3995, 4006,
// 4008, 4014, 4017 and 4029; the source lines they belonged to (presumably
// the function signature, the tail of the SIToFP cast call and the `case`
// labels of the switch) are absent from this listing.
3996 VPBuilder Builder(R);
3997 VPIRValue *Start = R->getStartValue();
3998 VPValue *Step = R->getStepValue();
3999 VPValue *Index = R->getIndex();
4000 Type *StepTy = Step->getScalarType();
4001 Type *IndexTy = Index->getScalarType();
// Convert the index to the step type: sign-extend or truncate for integer
// steps, SIToFP otherwise.
4002 Index = StepTy->isIntegerTy()
4003 ? Builder.createScalarSExtOrTrunc(
4004 Index, StepTy, IndexTy, DebugLoc::getCompilerGenerated())
4005 : Builder.createScalarCast(Instruction::SIToFP, Index, StepTy,
4007 switch (R->getInductionKind()) {
// This arm replaces R with Start + Index * Step.
4009 assert(Index->getScalarType() == Start->getScalarType() &&
4010 "Index type does not match StartValue type");
4011 return R->replaceAllUsesWith(Builder.createAdd(
4012 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
4013 }
// This arm replaces R with a pointer add of Start and Index * Step.
4015 return R->replaceAllUsesWith(Builder.createPtrAdd(
4016 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
// This arm replaces R with Start <FAdd|FSub> (Step * Index), reusing the fast
// math flags of the original FP binary operator.
4018 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
4019 const FPMathOperator *FPBinOp = R->getFPBinOp();
4020 assert(FPBinOp &&
4021 (FPBinOp->getOpcode() == Instruction::FAdd ||
4022 FPBinOp->getOpcode() == Instruction::FSub) &&
4023 "Original BinOp should be defined for FP induction");
4024 FastMathFlags FMF = FPBinOp->getFastMathFlags();
4025 VPValue *FMul = Builder.createNaryOp(Instruction::FMul, {Step, Index}, FMF);
4026 return R->replaceAllUsesWith(
4027 Builder.createNaryOp(FPBinOp->getOpcode(), {Start, FMul}, FMF));
4028 }
4030 return;
4031 }
4032 llvm_unreachable("Unhandled induction kind");
4033}
4034
// NOTE(review): the listing numbers embedded in this block skip 4035 and
// 4038; the source lines they belonged to (presumably the function signature
// and the head of the range-for over regions) are absent from this listing.
4036 // Replace loop regions with explicit CFG.
4037 SmallVector<VPRegionBlock *> LoopRegions;
// First pass: collect every region reached by a deep depth-first walk from
// the plan entry that is not a replicator.
4039 vp_depth_first_deep(Plan.getEntry()))) {
4040 if (!R->isReplicator())
4041 LoopRegions.push_back(R);
4042 }
// Second pass: dissolve the collected regions, in collection order.
4043 for (VPRegionBlock *R : LoopRegions)
4044 R->dissolveToCFGLoop();
4045}
4046
// NOTE(review): the listing numbers embedded in this block skip 4047-4048,
// 4051-4052, 4101 and 4105; the source lines they belonged to (presumably the
// function signature, the `WorkList` declaration with the head of the block
// traversal, and the two calls that create the new single-condition
// branches) are absent from this listing.
4049 // The transform runs after dissolving loop regions, so all VPBasicBlocks
4050 // terminated with BranchOnTwoConds are reached via a shallow traversal.
4053 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
4054 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
4055 }
4056
4057 // Expand BranchOnTwoConds instructions into explicit CFG with two new
4058 // single-condition branches:
4059 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
4060 // the first condition is true, and otherwise jumps to a new interim block.
4061 // 2. A branch that ends the interim block, jumps to the second successor if
4062 // the second condition is true, and otherwise jumps to the third
4063 // successor.
4064 for (VPInstruction *Br : WorkList) {
4065 assert(Br->getNumOperands() == 2 &&
4066 "BranchOnTwoConds must have exactly 2 conditions");
4067 DebugLoc DL = Br->getDebugLoc();
4068 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
// Take a copy of the successor list: the edges are disconnected right below,
// and the original successors are still needed afterwards.
4069 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
4070 assert(Successors.size() == 3 &&
4071 "BranchOnTwoConds must have exactly 3 successors");
4072
4073 for (VPBlockBase *Succ : Successors)
4074 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
4075
4076 VPValue *Cond0 = Br->getOperand(0);
4077 VPValue *Cond1 = Br->getOperand(1);
4078 VPBlockBase *Succ0 = Successors[0];
4079 VPBlockBase *Succ1 = Successors[1];
4080 VPBlockBase *Succ2 = Successors[2];
4081
4082 // If the successor block for both conditions is the same, then combine the
4083 // two conditions and plant a single conditional branch.
4084 if (Succ0 == Succ1) {
4085 VPBuilder Builder(Br);
4086 VPValue *Combined = Builder.createOr(Cond0, Cond1, DL);
4087 Builder.createNaryOp(VPInstruction::BranchOnCond, {Combined}, DL);
4088 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4089 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ2);
4090 Br->eraseFromParent();
4091 continue;
4092 }
4093
4094 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
4095 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
4096
4097 VPBasicBlock *InterimBB =
4098 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
4099
// The original block now branches to Succ0 or the interim block; the interim
// block branches to Succ1 or Succ2.
4100 VPBuilder(BrOnTwoCondsBB)
4102 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4103 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4104
4106 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4107 VPBlockUtils::connectBlocks(InterimBB, Succ2);
4108 Br->eraseFromParent();
4109 }
4110}
4111
// NOTE(review): the listing numbers embedded in this block skip 4112-4113,
// 4118, 4133, 4197, 4220, 4233, 4256 and 4264; the source lines they belonged
// to (presumably the function signature with the head of the block traversal,
// the calls that expand the two widened induction kinds, the `NotMasks`
// declaration, and the heads of the MaskedCond, CanonicalIVIncrementForPart
// and WideIVStep matches) are absent from this listing.
// Each recipe kind handled below is replaced by equivalent lower-level recipes
// and then erased; the early-increment range keeps iteration valid while the
// current recipe is erased.
4114 vp_depth_first_deep(Plan.getEntry()))) {
4115 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4116 VPBuilder Builder(&R);
4117 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4119 WidenIVR->eraseFromParent();
4120 continue;
4121 }
4122
4123 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4124 // If the recipe only generates scalars, scalarize it instead of
4125 // expanding it.
4126 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4127 VPValue *PtrAdd =
4128 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4129 WidenIVR->replaceAllUsesWith(PtrAdd);
4130 WidenIVR->eraseFromParent();
4131 continue;
4132 }
4134 WidenIVR->eraseFromParent();
4135 continue;
4136 }
4137
4138 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {
4139 expandVPDerivedIV(DerivedIVR);
4140 DerivedIVR->eraseFromParent();
4141 continue;
4142 }
4143
// Widened canonical IV: broadcast(CanIV) + (broadcast(Step) + step-vector).
4144 if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(&R)) {
4145 VPValue *CanIV = WideCanIV->getCanonicalIV();
4146 Type *CanIVTy = CanIV->getScalarType();
4147 VPValue *Step = WideCanIV->getStepValue();
// Without a step operand a zero step is used; the assert states that this is
// only expected when the concrete UF is 1.
4148 if (!Step) {
4149 assert(Plan.getConcreteUF() == 1 &&
4150 "Expected unroller to have materialized step for UF != 1");
4151 Step = Plan.getZero(CanIVTy);
4152 }
4153 CanIV = Builder.createNaryOp(VPInstruction::Broadcast, CanIV);
4154 Step = Builder.createNaryOp(VPInstruction::Broadcast, Step);
4155 Step = Builder.createAdd(
4156 Step, Builder.createNaryOp(VPInstruction::StepVector, {}, CanIVTy));
4157 VPValue *CanVecIV =
4158 Builder.createAdd(CanIV, Step, WideCanIV->getDebugLoc(), "vec.iv",
4159 WideCanIV->getNoWrapFlags());
4160 WideCanIV->replaceAllUsesWith(CanVecIV);
4161 WideCanIV->eraseFromParent();
4162 continue;
4163 }
4164
4165 // Expand VPBlendRecipe into VPInstruction::Select.
4166 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
// Start from incoming value 0 and wrap it in one select per further incoming
// value, each guarded by that value's mask.
4167 VPValue *Select = Blend->getIncomingValue(0);
4168 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4169 Select = Builder.createSelect(Blend->getMask(I),
4170 Blend->getIncomingValue(I), Select,
4171 R.getDebugLoc(), "predphi", *Blend);
4172 Blend->replaceAllUsesWith(Select);
4173 Blend->eraseFromParent();
4174 continue;
4175 }
4176
// Vector end pointers are kept; only a missing offset is materialized.
4177 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4178 if (!VEPR->getOffset()) {
4179 assert(Plan.getConcreteUF() == 1 &&
4180 "Expected unroller to have materialized offset for UF != 1");
4181 VEPR->materializeOffset();
4182 }
4183 continue;
4184 }
4185
4186 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4187 Expr->decompose();
4188 Expr->eraseFromParent();
4189 continue;
4190 }
4191
4192 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4193 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4194 if (LastActiveL &&
4195 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4196 // Create Not(Mask) for all operands.
4198 for (VPValue *Op : LastActiveL->operands()) {
4199 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4200 NotMasks.push_back(NotMask);
4201 }
4202
4203 // Create FirstActiveLane on the inverted masks.
4204 VPValue *FirstInactiveLane = Builder.createFirstActiveLane(
4205 NotMasks, LastActiveL->getDebugLoc(), "first.inactive.lane");
4206
4207 // Subtract 1 to get the last active lane.
4208 VPValue *One =
4209 Plan.getConstantInt(FirstInactiveLane->getScalarType(), 1);
4210 VPValue *LastLane =
4211 Builder.createSub(FirstInactiveLane, One,
4212 LastActiveL->getDebugLoc(), "last.active.lane");
4213
4214 LastActiveL->replaceAllUsesWith(LastLane);
4215 LastActiveL->eraseFromParent();
4216 continue;
4217 }
4218
4219 // Lower MaskedCond with block mask to LogicalAnd.
4221 auto *VPI = cast<VPInstruction>(&R);
4222 assert(VPI->isMasked() &&
4223 "Unmasked MaskedCond should be simplified earlier");
4224 VPI->replaceAllUsesWith(Builder.createNaryOp(
4225 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4226 VPI->eraseFromParent();
4227 continue;
4228 }
4229
4230 // Lower CanonicalIVIncrementForPart to plain Add.
4231 if (match(
4232 &R,
4234 auto *VPI = cast<VPInstruction>(&R);
4235 VPValue *Add = Builder.createOverflowingOp(
4236 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4237 VPI->getDebugLoc());
4238 VPI->replaceAllUsesWith(Add);
4239 VPI->eraseFromParent();
4240 continue;
4241 }
4242
4243 // Lower BranchOnCount to ICmp + BranchOnCond.
4244 VPValue *IV, *TC;
4245 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4246 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4247 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4248 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4249 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4250 BranchOnCountInst->eraseFromParent();
4251 continue;
4252 }
4253
4254 VPValue *VectorStep;
4255 VPValue *ScalarStep;
4257 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4258 continue;
4259
4260 // Expand WideIVStep.
4261 auto *VPI = cast<VPInstruction>(&R);
4262 Type *IVTy = VPI->getScalarType();
// Bring both operands to the IV type before multiplying them.
4263 if (VectorStep->getScalarType() != IVTy) {
4265 ? Instruction::UIToFP
4266 : Instruction::Trunc;
4267 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4268 }
4269
4270 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4271 if (ScalarStep->getScalarType() != IVTy) {
4272 ScalarStep =
4273 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4274 }
4275
// Floating-point IVs multiply with FMul and the instruction's fast-math
// flags; otherwise Mul with the default flags for that opcode is used.
4276 VPIRFlags Flags;
4277 unsigned MulOpc;
4278 if (IVTy->isFloatingPointTy()) {
4279 MulOpc = Instruction::FMul;
4280 Flags = VPI->getFastMathFlagsOrNone();
4281 } else {
4282 MulOpc = Instruction::Mul;
4283 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4284 }
4285
4286 VPInstruction *Mul = Builder.createNaryOp(
4287 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4288 VectorStep = Mul;
4289 VPI->replaceAllUsesWith(VectorStep);
4290 VPI->eraseFromParent();
4291 }
4292 }
4293}
4294
4295/// Returns the VPValue representing the uncountable exit comparison used by
4296/// AnyOf if the recipes it depends on can be traced back to live-ins and
4297/// the addresses (in GEP/PtrAdd form) of any (non-masked) load used in
4298/// generating the values for the comparison. The recipes are stored in
4299/// \p Recipes.
/// Returns std::nullopt if the latch terminator does not match the expected
/// pattern, if a visited value has more than one user, or if a recipe that is
/// not handled below is encountered.
// NOTE(review): the listing numbers embedded in this block skip 4301, 4359,
// 4386, 4391 and 4402; the source lines they belonged to (presumably the
// function name with its first parameter, the `Worklist` declaration, the
// heads of the GEP and MaskedCond matches, and the body of the final
// predicate) are absent from this listing.
4300static std::optional<VPValue *>
4302 VPBasicBlock *LatchVPBB) {
4303 // Given a plain CFG VPlan loop with countable latch exiting block
4304 // \p LatchVPBB, we're looking to match the recipes contributing to the
4305 // uncountable exit condition comparison (here, vp<%4>) back to either
4306 // live-ins or the address nodes for the load used as part of the uncountable
4307 // exit comparison so that we can either move them within the loop, or copy
4308 // them to the preheader depending on the chosen method for dealing with
4309 // stores in uncountable exit loops.
4310 //
4311 // Currently, the address of the load is restricted to a GEP with 2 operands
4312 // and a live-in base address. This constraint may be relaxed later.
4313 //
4314 // VPlan ' for UF>=1' {
4315 // Live-in vp<%0> = VF * UF
4316 // Live-in vp<%1> = vector-trip-count
4317 // Live-in ir<20> = original trip-count
4318 //
4319 // ir-bb<entry>:
4320 // Successor(s): scalar.ph, vector.ph
4321 //
4322 // vector.ph:
4323 // Successor(s): for.body
4324 //
4325 // for.body:
4326 // EMIT vp<%2> = phi ir<0>, vp<%index.next>
4327 // EMIT-SCALAR ir<%iv> = phi [ ir<0>, vector.ph ], [ ir<%iv.next>, for.inc ]
4328 // EMIT ir<%uncountable.addr> = getelementptr inbounds nuw ir<%pred>,ir<%iv>
4329 // EMIT ir<%uncountable.val> = load ir<%uncountable.addr>
4330 // EMIT ir<%uncountable.cond> = icmp sgt ir<%uncountable.val>, ir<500>
4331 // EMIT vp<%3> = masked-cond ir<%uncountable.cond>
4332 // Successor(s): for.inc
4333 //
4334 // for.inc:
4335 // EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>
4336 // EMIT ir<%countable.cond> = icmp eq ir<%iv.next>, ir<20>
4337 // EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
4338 // EMIT vp<%4> = any-of vp<%3>
4339 // EMIT vp<%5> = icmp eq vp<%index.next>, vp<%1>
4340 // EMIT branch-on-two-conds vp<%4>, vp<%5>
4341 // Successor(s): middle.block, middle.block, for.body
4342 //
4343 // middle.block:
4344 // Successor(s): ir-bb<exit>, scalar.ph
4345 //
4346 // ir-bb<exit>:
4347 // No successors
4348 //
4349 // scalar.ph:
4350 // }
4351
4352 // Find the uncountable loop exit condition.
4353 VPValue *UncountableCondition = nullptr;
4354 if (!match(LatchVPBB->getTerminator(),
4355 m_BranchOnTwoConds(m_AnyOf(m_VPValue(UncountableCondition)),
4356 m_VPValue())))
4357 return std::nullopt;
4358
// Walk the operands of the condition backwards, recording every recipe that
// contributes to it.
4360 Worklist.push_back(UncountableCondition);
4361 while (!Worklist.empty()) {
4362 VPValue *V = Worklist.pop_back_val();
4363
4364 // Any value defined outside the loop does not need to be copied.
4365 if (V->isDefinedOutsideLoopRegions())
4366 continue;
4367
4368 // FIXME: Remove the single user restriction; it's here because we're
4369 // starting with the simplest set of loops we can, and multiple
4370 // users means needing to add PHI nodes in the transform.
4371 if (V->getNumUsers() > 1)
4372 return std::nullopt;
4373
4374 VPValue *Op1, *Op2;
4375 // Walk back through recipes until we find at least one load from memory.
4376 if (match(V, m_ICmp(m_VPValue(Op1), m_VPValue(Op2)))) {
4377 Worklist.push_back(Op1);
4378 Worklist.push_back(Op2);
4379 Recipes.push_back(cast<VPInstruction>(V->getDefiningRecipe()));
4380 } else if (match(V, m_VPInstruction<Instruction::Load>(m_VPValue(Op1)))) {
// NOTE(review): GepR is dereferenced on the next statement without a null
// check, whereas other code in this file guards the result of
// getDefiningRecipe() before use. Confirm the load address always has a
// defining recipe here (e.g. is never a live-in).
4381 VPRecipeBase *GepR = Op1->getDefiningRecipe();
4382 // Only matching base + single offset term for now.
4383 if (GepR->getNumOperands() != 2)
4384 return std::nullopt;
4385 // Matching a GEP with a loop-invariant base ptr.
4387 m_LiveIn(), m_VPValue())))
4388 return std::nullopt;
// The walk stops at the load: the GEP's operands are not pushed onto the
// worklist.
4389 Recipes.push_back(cast<VPInstruction>(V->getDefiningRecipe()));
4390 Recipes.push_back(cast<VPInstruction>(GepR));
4392 m_VPValue(Op1)))) {
4393 Worklist.push_back(Op1);
4394 Recipes.push_back(cast<VPInstruction>(V->getDefiningRecipe()));
4395 } else
4396 return std::nullopt;
4397 }
4398
4399 // If we couldn't match anything, don't return the condition. It may be
4400 // defined outside the loop.
4401 if (Recipes.empty() || none_of(Recipes, [](VPInstruction *I) {
4403 }))
4404 return std::nullopt;
4405
4406 return UncountableCondition;
4407}
4408
4414
4415/// Update \p Plan to mask memory operations in the loop based on whether the
4416/// early exit is taken or not.
4417///
4418/// We're currently expecting to find a loop with properties similar to the
4419/// following:
4420///
4421/// for.body:
4422/// ir<%indvars.iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<%0>
4423/// EMIT ir<%arrayidx> = getelementptr inbounds nuw ir<@c>, ir<%indvars.iv>
4424/// EMIT-SCALAR ir<%0> = load ir<%arrayidx>
4425/// EMIT ir<%cmp1> = icmp sgt ir<%0>, ir<5>
4426/// EMIT vp<%1> = masked-cond ir<%cmp1>
4427/// Successor(s): if.end
4428///
4429/// if.end:
4430/// EMIT ir<%arrayidx3> = getelementptr inbounds nuw ir<@src>, ir<%indvars.iv>
4431/// EMIT-SCALAR ir<%2> = load ir<%arrayidx3>
4432/// EMIT ir<%add> = add nsw ir<%2>, ir<42>
4433/// EMIT ir<%arrayidx5> = getelementptr inbounds nuw ir<@dst>, ir<%indvars.iv>
4434/// EMIT store ir<%add>, ir<%arrayidx5>
4435/// EMIT ir<%indvars.iv.next> = add nuw nsw ir<%indvars.iv>, ir<1>
4436/// EMIT vp<%3> = any-of vp<%1>
4437/// EMIT ir<%exitcond.not> = icmp eq ir<%indvars.iv.next>, ir<10000>
4438/// EMIT branch-on-two-conds vp<%3>, ir<%exitcond.not>
4439/// Successor(s): middle.block, middle.block, for.body
4440///
4441/// We currently expect LoopVectorizationLegality to ensure that:
4442/// * There must also be a counted exit. We will need to support speculative
4443/// or first-faulting loads before we can remove this restriction.
4444/// * Any stores within the loop must not alias with the load used for the
4445/// uncountable exit. We can relax this a bit with runtime aliasing checks.
4446/// * Other memory operations in the loop can take place before or after the
4447/// uncountable exit, but must also be unconditional. We need to support
4448/// combining the conditions in VPlanPredicator.
4449/// * The loop must have a single unconditional load contributing to the
4450/// uncountable exit comparison, and the other term must be loop-invariant.
4451/// Improving upon this requires work in getRecipesForUncountableExit to
4452/// handle more complex recipe graphs.
4455 VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB,
4456 Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT,
4457 AssumptionCache *AC) {
4458
4459 // Disconnect early exiting blocks from successors, remove branches. We
4460 // currently don't support multiple uses for recipes involved in creating
4461 // the uncountable exit condition.
4462 for (auto &Exit : Exits) {
4463 if (Exit.EarlyExitingVPBB == LatchVPBB)
4464 continue;
4465
4466 for (VPRecipeBase &R : Exit.EarlyExitVPBB->phis())
4467 cast<VPIRPhi>(&R)->removeIncomingValueFor(Exit.EarlyExitingVPBB);
4468 Exit.EarlyExitingVPBB->getTerminator()->eraseFromParent();
4469 VPBlockUtils::disconnectBlocks(Exit.EarlyExitingVPBB, Exit.EarlyExitVPBB);
4470 }
4471
4472 VPDominatorTree VPDT(Plan);
4473
4474 // We can abandon a VPlan entirely if we return false here, so we shouldn't
4475 // crash if some earlier assumptions on scalar IR don't hold for the vplan
4476 // version of the loop.
4477 SmallVector<VPInstruction *, 8> ConditionRecipes;
4478
4479 std::optional<VPValue *> Cond =
4480 getRecipesForUncountableExit(ConditionRecipes, LatchVPBB);
4481 if (!Cond)
4482 return false;
4483
4484 // Find load contributing to condition.
4485 // At the moment LoopVectorizationLegality only supports a single
4486 // early-exit expression with a compare and a single load that must
4487 // be unconditional.
4488 // TODO: Support more than one load.
4489 auto *Load =
4490 find_singleton<VPInstruction>(ConditionRecipes, [](auto *I, bool _) {
4492 ? I
4493 : nullptr;
4494 });
4495 assert(Load && "Couldn't find exactly one load");
4496 // TODO: Support conditional loads for uncountable exits.
4497 assert(VPDT.dominates(Load->getParent(), LatchVPBB) &&
4498 "Uncountable exit condition load is conditional.");
4499 VPInstruction *Ptr = cast<VPInstruction>(Load->getOperand(0));
4500
4501 // Ensure that we are guaranteed to be able to dereference the memory used
4502 // for determining the uncountable exit for the maximum possible number of
4503 // scalar iterations of the loop.
4504 //
4505 // TODO: Support first-faulting loads in cases where we don't know whether
4506 // all possible addresses are dereferenceable.
4507 {
4509 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, TheLoop);
4510 const DataLayout &DL = Plan.getDataLayout();
4511 APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getScalarType()),
4512 DL.getTypeStoreSize(Load->getScalarType()).getFixedValue());
4514 PtrSCEV, cast<LoadInst>(Load->getUnderlyingInstr())->getAlign(),
4515 PSE.getSE()->getConstant(EltSize), TheLoop, *PSE.getSE(), DT, AC,
4516 &Predicates))
4517 return false;
4518 }
4519
4520 // Check for a single GEP for the condition load to see if we can link it to
4521 // a widen IV recipe with a step of 1; we're only interested in contiguous
4522 // accesses for the condition load right now.
4523 auto *IV = cast<VPWidenInductionRecipe>(&HeaderVPBB->front());
4524 if (!match(IV->getStartValue(), m_SpecificInt(0)) ||
4525 !match(IV->getStepValue(), m_SpecificInt(1)))
4526 return false;
4528 m_Specific(IV))))
4529 return false;
4530
4531 // We want to guarantee that the uncountable exit condition (and the mask
4532 // we will generate from it) are available for all operations in the loop
4533 // that need to be masked. If the condition recipes are not already the first
4534 // recipes in the header after the last phi, move them there.
4535 auto InsertIt = HeaderVPBB->getFirstNonPhi();
4536 while (InsertIt != HeaderVPBB->end() &&
4537 is_contained(ConditionRecipes, &*InsertIt)) {
4538 erase(ConditionRecipes, &*InsertIt);
4539 InsertIt++;
4540 }
4541 for (auto *Recipe : reverse(ConditionRecipes))
4542 Recipe->moveBefore(*HeaderVPBB, InsertIt);
4543
4544 // Create a mask to represent all lanes that fully execute in the vector loop,
4545 // stopping short of any early exit.
4546 VPBuilder MaskBuilder(HeaderVPBB, InsertIt);
4547 VPValue *FirstActive = MaskBuilder.createFirstActiveLane(*Cond);
4548 Type *IVScalarTy = IV->getScalarType();
4549 Type *FirstActiveTy = FirstActive->getScalarType();
4550 VPValue *ALMMultiplier = Plan.getConstantInt(IVScalarTy, 1);
4551 VPValue *Zero = Plan.getZero(IVScalarTy);
4552 FirstActive = MaskBuilder.createScalarZExtOrTrunc(FirstActive, IVScalarTy,
4553 FirstActiveTy, DebugLoc());
4555 {Zero, FirstActive, ALMMultiplier},
4556 DebugLoc(), "uncountable.exit.mask");
4557
4558 // Convert all other memory operations to use the mask.
4559 for (VPBasicBlock *VPBB : vp_rpo_plain_cfg_loop_body(HeaderVPBB))
4560 for (VPRecipeBase &R : *VPBB)
4561 if (R.mayReadOrWriteMemory() && &R != Load) {
4562 // TODO: Handle conditional memory operations in the loop.
4563 if (!VPDT.dominates(R.getParent(), LatchVPBB))
4564 return false;
4565 cast<VPInstruction>(&R)->addMask(Mask);
4566 }
4567
4568 // Update middle block branch to compare (IV + however many lanes were active)
4569 // against the full trip count, since we may be exiting the vector loop early.
4570 // If we didn't take an early exit, we should get the equivalent of VF from
4571 // the FirstActiveLane.
4572 assert(match(MiddleVPBB->getTerminator(), m_BranchOnCond()) &&
4573 "Expected BranchOnCond terminator for MiddleVPBB");
4574 VPBuilder MiddleBuilder(MiddleVPBB->getTerminator());
4575 VPValue *ScalarIV = MiddleBuilder.createNaryOp(VPInstruction::ExtractLane,
4576 {Zero, IV}, DebugLoc());
4577 VPValue *ExitIV = MiddleBuilder.createAdd(ScalarIV, FirstActive);
4578 VPValue *FullTC =
4579 MiddleBuilder.createICmp(CmpInst::ICMP_EQ, ExitIV, Plan.getTripCount());
4580 MiddleVPBB->getTerminator()->setOperand(0, FullTC);
4581
4582 // Update resume phi in scalar.ph.
4583 VPBasicBlock *ScalarPH = Plan.getScalarPreheader();
4584 auto Phis = ScalarPH->phis();
4585 // TODO: Handle more than one Phi; re-derive from IV.
4586 // TODO: Handle reductions.
4587 if (range_size(Phis) != 1)
4588 return false;
4589 VPPhi *ContinueIV = cast<VPPhi>(Phis.begin());
4590 // Make sure we're referring to the same IV.
4591 assert(
4592 match(ContinueIV->getOperand(0),
4594 "Continuing from different IV");
4595 ContinueIV->setOperand(0, ExitIV);
4596 return true;
4597}
4598
4600 VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB,
4601 VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE,
4603#ifndef NDEBUG
4604 VPDominatorTree VPDT(Plan);
4605#endif
4606 VPBuilder LatchBuilder(LatchVPBB->getTerminator());
4608 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4609 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4610 if (Pred == MiddleVPBB)
4611 continue;
4612 // Collect condition for this early exit.
4613 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4614 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4615 VPValue *CondOfEarlyExitingVPBB;
4616 [[maybe_unused]] bool Matched =
4617 match(EarlyExitingVPBB->getTerminator(),
4618 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4619 assert(Matched && "Terminator must be BranchOnCond");
4620
4621 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4622 // the correct block mask.
4623 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4624 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4626 TrueSucc == ExitBlock
4627 ? CondOfEarlyExitingVPBB
4628 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4629 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4630 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4631 VPDT.properlyDominates(
4632 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4633 LatchVPBB)) &&
4634 "exit condition must dominate the latch");
4635 Exits.push_back({
4636 EarlyExitingVPBB,
4637 ExitBlock,
4638 CondToEarlyExit,
4639 });
4640 }
4641 }
4642
4643 assert(!Exits.empty() && "must have at least one early exit");
4644 // Sort exits by RPO order to get correct program order. RPO gives a
4645 // topological ordering of the CFG, ensuring upstream exits are checked
4646 // before downstream exits in the dispatch chain.
4648 HeaderVPBB);
4650 for (const auto &[Num, VPB] : enumerate(RPOT))
4651 RPOIdx[VPB] = Num;
4652 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4653 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4654 });
4655#ifndef NDEBUG
4656 // After RPO sorting, verify that for any pair where one exit dominates
4657 // another, the dominating exit comes first. This is guaranteed by RPO
4658 // (topological order) and is required for the dispatch chain correctness.
4659 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4660 for (unsigned J = I + 1; J < Exits.size(); ++J)
4661 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4662 Exits[I].EarlyExitingVPBB) &&
4663 "RPO sort must place dominating exits before dominated ones");
4664#endif
4665
4666 // Build the AnyOf condition for the latch terminator using logical OR
4667 // to avoid poison propagation from later exit conditions when an earlier
4668 // exit is taken.
4669 VPValue *Combined = Exits[0].CondToExit;
4670 for (const EarlyExitInfo &Info : drop_begin(Exits))
4671 Combined = LatchBuilder.createLogicalOr(Combined, Info.CondToExit);
4672
4673 VPValue *IsAnyExitTaken =
4674 LatchBuilder.createNaryOp(VPInstruction::AnyOf, {Combined});
4675
4676 // Create a comparison for the latch exit condition and replace the
4677 // BranchOnCond with a BranchOnTwoConds. The original BranchOnCond's condition
4678 // is used as the latch-exit condition; canonical IV recipes have not been
4679 // introduced yet, so there is no BranchOnCount to derive the condition from.
4680 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4681 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCond &&
4682 "Unexpected terminator");
4683 VPValue *IsLatchExitTaken = LatchExitingBranch->getOperand(0);
4684 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4685 LatchExitingBranch->eraseFromParent();
4686 LatchBuilder.setInsertPoint(LatchVPBB);
4688 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4689 LatchVPBB->clearSuccessors();
4690
4692 // If handling the exiting lane in the scalar loop, combine the exit
4693 // conditions into a single BranchOnCond.
4694 LatchVPBB->setSuccessors({MiddleVPBB, MiddleVPBB, HeaderVPBB});
4695 MiddleVPBB->clearPredecessors();
4696 MiddleVPBB->setPredecessors({LatchVPBB, LatchVPBB});
4698 Plan, Exits, HeaderVPBB, LatchVPBB, MiddleVPBB, TheLoop, PSE, DT, AC);
4699 }
4700
4701 // Create the vector.early.exit blocks.
4702 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4703 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4704 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4705 VPBasicBlock *VectorEarlyExitVPBB =
4706 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4707 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4708 }
4709
4710 // Create the dispatch block (or reuse the single exit block if only one
4711 // exit). The dispatch block computes the first active lane of the combined
4712 // condition and, for multiple exits, chains through conditions to determine
4713 // which exit to take.
4714 VPBasicBlock *DispatchVPBB =
4715 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4716 : Plan.createVPBasicBlock("vector.early.exit.check");
4717 DispatchVPBB->setPredecessors({LatchVPBB});
4718 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4719 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4720 VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(
4721 {Combined}, DebugLoc::getUnknown(), "first.active.lane");
4722
4723 // For each early exit, disconnect the original exiting block
4724 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4725 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4726 // values at the first active lane:
4727 //
4728 // Input:
4729 // early.exiting.I:
4730 // ...
4731 // EMIT branch-on-cond vp<%cond.I>
4732 // Successor(s): in.loop.succ, ir-bb<exit.I>
4733 //
4734 // ir-bb<exit.I>:
4735 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4736 //
4737 // Output:
4738 // early.exiting.I:
4739 // ...
4740 // Successor(s): in.loop.succ
4741 //
4742 // vector.early.exit.I:
4743 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4744 // Successor(s): ir-bb<exit.I>
4745 //
4746 // ir-bb<exit.I>:
4747 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4748 // vector.early.exit.I)
4749 //
4750 for (auto [Exit, VectorEarlyExitVPBB] :
4751 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4752 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4753 // Adjust the phi nodes in EarlyExitVPBB.
4754 // 1. remove incoming values from EarlyExitingVPBB,
4755 // 2. extract the incoming value at FirstActiveLane
4756 // 3. add back the extracts as last operands for the phis
4757 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4758 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4759 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4760 // values from VectorEarlyExitVPBB.
4761 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4762 auto *ExitIRI = cast<VPIRPhi>(&R);
4763 VPValue *IncomingVal =
4764 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4765 VPValue *NewIncoming = IncomingVal;
4766 if (!isa<VPIRValue>(IncomingVal)) {
4767 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4768 NewIncoming = EarlyExitBuilder.createNaryOp(
4769 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4770 DebugLoc::getUnknown(), "early.exit.value");
4771 }
4772 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4773 ExitIRI->addIncoming(NewIncoming);
4774 }
4775
4776 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4777 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4778 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4779 }
4780
4781 // Chain through exits: for each exit, check if its condition is true at
4782 // the first active lane. If so, take that exit; otherwise, try the next.
4783 // The last exit needs no check since it must be taken if all others fail.
4784 //
4785 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4786 //
4787 // latch:
4788 // ...
4789 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4790 // ...
4791 //
4792 // vector.early.exit.check:
4793 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4794 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4795 // EMIT branch-on-cond vp<%at.cond.0>
4796 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4797 //
4798 // vector.early.exit.check.0:
4799 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4800 // EMIT branch-on-cond vp<%at.cond.1>
4801 // Successor(s): vector.early.exit.1, vector.early.exit.2
4802 VPBasicBlock *CurrentBB = DispatchVPBB;
4803 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4804 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4805 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4806 DebugLoc::getUnknown(), "exit.cond.at.lane");
4807
4808 // For the last dispatch, branch directly to the last exit on false;
4809 // otherwise, create a new check block.
4810 bool IsLastDispatch = (I + 2 == Exits.size());
4811 VPBasicBlock *FalseBB =
4812 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4813 : Plan.createVPBasicBlock(
4814 Twine("vector.early.exit.check.") + Twine(I));
4815
4816 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4817 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4818 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4819 FalseBB->setPredecessors({CurrentBB});
4820
4821 CurrentBB = FalseBB;
4822 DispatchBuilder.setInsertPoint(CurrentBB);
4823 }
4824
4825 return true;
4826}
4827
4828/// This function tries to convert extended in-loop reductions to
4829/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4830/// valid. The created recipe must be decomposed to its constituent
4831/// recipes before execution.
4832static VPExpressionRecipe *
4834 VFRange &Range) {
4835 Type *RedTy = Red->getScalarType();
4836 VPValue *VecOp = Red->getVecOp();
4837
4838 assert(!Red->isPartialReduction() &&
4839 "This path does not support partial reductions");
4840
4841 // Clamp the range if using extended-reduction is profitable.
4842 auto IsExtendedRedValidAndClampRange =
4843 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4845 [&](ElementCount VF) {
4846 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4848
4850 InstructionCost ExtCost =
4851 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4852 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4853
4854 assert(!RedTy->isFloatingPointTy() &&
4855 "getExtendedReductionCost only supports integer types");
4856 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4857 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4858 Red->getFastMathFlagsOrNone(), CostKind);
4859 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4860 },
4861 Range);
4862 };
4863
4864 VPValue *A;
4865 // Match reduce(ext)).
4867 IsExtendedRedValidAndClampRange(
4868 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4869 cast<VPWidenCastRecipe>(VecOp)->getOpcode(), A->getScalarType()))
4870 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4871
4872 return nullptr;
4873}
4874
4875/// This function tries to convert extended in-loop reductions to
4876/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4877/// and valid. The created VPExpressionRecipe must be decomposed to its
4878/// constituent recipes before execution. Patterns of the
4879/// VPExpressionRecipe:
4880/// reduce.add(mul(...)),
4881/// reduce.add(mul(ext(A), ext(B))),
4882/// reduce.add(ext(mul(ext(A), ext(B)))).
4883/// reduce.fadd(fmul(ext(A), ext(B)))
4884static VPExpressionRecipe *
4886 VPCostContext &Ctx, VFRange &Range) {
4887 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4888 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4889 Opcode != Instruction::FAdd)
4890 return nullptr;
4891
4892 assert(!Red->isPartialReduction() &&
4893 "This path does not support partial reductions");
4894 Type *RedTy = Red->getScalarType();
4895
4896 // Clamp the range if using multiply-accumulate-reduction is profitable.
4897 auto IsMulAccValidAndClampRange =
4899 VPWidenCastRecipe *OuterExt) -> bool {
4901 [&](ElementCount VF) {
4903 Type *SrcTy = Ext0 ? Ext0->getOperand(0)->getScalarType() : RedTy;
4904 InstructionCost MulAccCost;
4905
4906 // getMulAccReductionCost for in-loop reductions does not support
4907 // mixed or floating-point extends.
4908 if (Ext0 && Ext1 &&
4909 (Ext0->getOpcode() != Ext1->getOpcode() ||
4910 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4911 return false;
4912
4913 bool IsZExt =
4914 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4915 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4916 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4917 SrcVecTy, CostKind);
4918
4919 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4920 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4921 InstructionCost ExtCost = 0;
4922 if (Ext0)
4923 ExtCost += Ext0->computeCost(VF, Ctx);
4924 if (Ext1)
4925 ExtCost += Ext1->computeCost(VF, Ctx);
4926 if (OuterExt)
4927 ExtCost += OuterExt->computeCost(VF, Ctx);
4928
4929 return MulAccCost.isValid() &&
4930 MulAccCost < ExtCost + MulCost + RedCost;
4931 },
4932 Range);
4933 };
4934
4935 VPValue *VecOp = Red->getVecOp();
4936 VPRecipeBase *Sub = nullptr;
4937 VPValue *A, *B;
4938 VPValue *Tmp = nullptr;
4939
4940 if (RedTy->isFloatingPointTy())
4941 return nullptr;
4942
4943 // Sub reductions could have a sub between the add reduction and vec op.
4944 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4945 Sub = VecOp->getDefiningRecipe();
4946 VecOp = Tmp;
4947 }
4948
4949 // If ValB is a constant and can be safely extended, truncate it to the same
4950 // type as ExtA's operand, then extend it to the same type as ExtA. This
4951 // creates two uniform extends that can more easily be matched by the rest of
4952 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4953 // replaced with the new extend of the constant.
4954 auto ExtendAndReplaceConstantOp = [](VPWidenCastRecipe *ExtA,
4955 VPWidenCastRecipe *&ExtB, VPValue *&ValB,
4956 VPWidenRecipe *Mul) {
4957 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4958 return;
4959 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
4960 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4961 const APInt *Const;
4962 if (!match(ValB, m_APInt(Const)) ||
4964 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4965 return;
4966 // The truncate ensures that the type of each extended operand is the
4967 // same, and it's been proven that the constant can be extended from
4968 // NarrowTy safely. Necessary since ExtA's extended operand would be
4969 // e.g. an i8, while the const will likely be an i32. This will be
4970 // elided by later optimisations.
4971 VPBuilder Builder(Mul);
4972 auto *Trunc =
4973 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4974 Type *WideTy = ExtA->getScalarType();
4975 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4976 Mul->setOperand(1, ExtB);
4977 };
4978
4979 // Try to match reduce.add(mul(...)).
4980 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4981 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4982 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4983 auto *Mul = cast<VPWidenRecipe>(VecOp);
4984
4985 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4986 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4987
4988 // Match reduce.add/sub(mul(ext, ext)).
4989 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4990 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4991 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4992 if (Sub)
4993 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4994 cast<VPWidenRecipe>(Sub), Red);
4995 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4996 }
4997 // TODO: Add an expression type for this variant with a negated mul
4998 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4999 return new VPExpressionRecipe(Mul, Red);
5000 }
5001 // TODO: Add an expression type for negated versions of other expression
5002 // variants.
5003 if (Sub)
5004 return nullptr;
5005
5006 // Match reduce.add(ext(mul(A, B))).
5007 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
5008 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
5009 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
5010 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
5011 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
5012
5013 // reduce.add(ext(mul(ext, const)))
5014 // -> reduce.add(ext(mul(ext, ext(const))))
5015 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
5016
5017 // reduce.add(ext(mul(ext(A), ext(B))))
5018 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5019 // The inner extends must either have the same opcode as the outer extend or
5020 // be the same, in which case the multiply can never result in a negative
5021 // value and the outer extend can be folded away by doing wider
5022 // extends for the operands of the mul.
5023 if (Ext0 && Ext1 &&
5024 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
5025 Ext0->getOpcode() == Ext1->getOpcode() &&
5026 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
5027 auto *NewExt0 = new VPWidenCastRecipe(
5028 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getScalarType(), nullptr,
5029 *Ext0, *Ext0, Ext0->getDebugLoc());
5030 NewExt0->insertBefore(Ext0);
5031
5032 VPWidenCastRecipe *NewExt1 = NewExt0;
5033 if (Ext0 != Ext1) {
5034 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
5035 Ext->getScalarType(), nullptr, *Ext1,
5036 *Ext1, Ext1->getDebugLoc());
5037 NewExt1->insertBefore(Ext1);
5038 }
5039 auto *NewMul = Mul->cloneWithOperands({NewExt0, NewExt1});
5040 NewMul->insertBefore(Mul);
5041 Ext->replaceAllUsesWith(NewMul);
5042 Ext->eraseFromParent();
5043 Mul->eraseFromParent();
5044 return new VPExpressionRecipe(NewExt0, NewExt1, NewMul, Red);
5045 }
5046 }
5047 return nullptr;
5048}
5049
5050/// This function tries to create abstract recipes from the reduction recipe for
5051/// following optimizations and cost estimation.
///
/// tryToMatchAndCreateMulAccumulateReduction is attempted first;
/// tryToMatchAndCreateExtendedReduction is only attempted when that returns
/// null. If either yields a VPExpressionRecipe, it is inserted at the position
/// that followed \p Red on entry and every use of \p Red is redirected to it.
/// If both return null, nothing is inserted here.
/// NOTE(review): the line declaring this function's name and its \p Red
/// parameter is not present in this listing -- confirm against the full file.
5053 VPCostContext &Ctx,
5054 VFRange &Range) {
5055 // Creation of VPExpressions for partial reductions is entirely handled in
5056 // transformToPartialReduction.
5057 assert(!Red->isPartialReduction() &&
5058 "This path does not support partial reductions");
5059
5060 VPExpressionRecipe *AbstractR = nullptr;
// Record the insertion point (the recipe right after Red) and Red's parent
// block before either matcher runs.
// NOTE(review): presumably done up front because the matchers may insert or
// erase neighbouring recipes -- confirm.
5061 auto IP = std::next(Red->getIterator());
5062 auto *VPBB = Red->getParent();
5063 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
5064 AbstractR = MulAcc;
5065 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
5066 AbstractR = ExtRed;
5067 // Cannot create abstract inloop reduction recipes.
5068 if (!AbstractR)
5069 return;
5070
// Place the new expression recipe at the recorded position and make it the
// value that former users of Red consume.
5071 AbstractR->insertBefore(*VPBB, IP);
5072 Red->replaceAllUsesWith(AbstractR);
5073}
5074
5085
5087 if (Plan.hasScalarVFOnly())
5088 return;
5089
5090#ifndef NDEBUG
5091 VPDominatorTree VPDT(Plan);
5092#endif
5093
5094 SmallVector<VPValue *> VPValues;
5095 if (VPValue *BTC = Plan.getBackedgeTakenCount())
5096 VPValues.push_back(BTC);
5097 append_range(VPValues, Plan.getLiveIns());
5098 for (VPRecipeBase &R : *Plan.getEntry())
5099 append_range(VPValues, R.definedValues());
5100
5101 auto *VectorPreheader = Plan.getVectorPreheader();
5102 for (VPValue *VPV : VPValues) {
5104 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
5105 continue;
5106
5107 // Add explicit broadcast at the insert point that dominates all users.
5108 VPBasicBlock *HoistBlock = VectorPreheader;
5109 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
5110 for (VPUser *User : VPV->users()) {
5111 if (User->usesScalars(VPV))
5112 continue;
5113 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
5114 HoistPoint = HoistBlock->begin();
5115 else
5116 assert(VPDT.dominates(VectorPreheader,
5117 cast<VPRecipeBase>(User)->getParent()) &&
5118 "All users must be in the vector preheader or dominated by it");
5119 }
5120
5121 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
5122 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
5123 VPV->replaceUsesWithIf(Broadcast,
5124 [VPV, Broadcast](VPUser &U, unsigned Idx) {
5125 return Broadcast != &U && !U.usesScalars(VPV);
5126 });
5127 }
5128}
5129
5130// Collect common metadata from a group of replicate recipes by intersecting
5131// metadata from all recipes in the group.
// NOTE(review): the signature line is not present in this listing. The body
// calls Recipes.front() unconditionally, so the group is presumably required
// to be non-empty -- confirm against the callers.
// Seed the result with the first recipe's metadata.
5133 VPIRMetadata CommonMetadata = *Recipes.front();
// Intersect with each remaining recipe (the first one is skipped via
// drop_begin because it already seeded the result).
5134 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
5135 CommonMetadata.intersect(*Recipe);
5136 return CommonMetadata;
5137}
5138
5139template <unsigned Opcode>
5143 const Loop *L) {
5144 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
5145 "Only Load and Store opcodes supported");
5146 [[maybe_unused]] constexpr bool IsLoad = (Opcode == Instruction::Load);
5147
5148 // For each address, collect operations with the same or complementary masks.
5151 Plan, PSE, L,
5152 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
5153 for (auto Recipes : Groups) {
5154 if (Recipes.size() < 2)
5155 continue;
5156
5158 map_range(Recipes, bind_back<getLoadStoreValueType>(IsLoad))) &&
5159 "Expected all recipes in group to have the same load-store type");
5160
5161 // Collect groups with the same or complementary masks.
5162 for (VPReplicateRecipe *&RecipeI : Recipes) {
5163 if (!RecipeI)
5164 continue;
5165
5166 VPValue *MaskI = RecipeI->getMask();
5168 Group.push_back(RecipeI);
5169 RecipeI = nullptr;
5170
5171 // Find all operations with the same or complementary masks.
5172 bool HasComplementaryMask = false;
5173 for (VPReplicateRecipe *&RecipeJ : Recipes) {
5174 if (!RecipeJ)
5175 continue;
5176
5177 VPValue *MaskJ = RecipeJ->getMask();
5178 // Check if any operation in the group has a complementary mask with
5179 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
5180 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
5181 match(MaskJ, m_Not(m_Specific(MaskI)));
5182 Group.push_back(RecipeJ);
5183 RecipeJ = nullptr;
5184 }
5185
5186 if (HasComplementaryMask) {
5187 assert(Group.size() >= 2 && "must have at least 2 entries");
5188 AllGroups.push_back(std::move(Group));
5189 }
5190 }
5191 }
5192
5193 return AllGroups;
5194}
5195
5196// Find the recipe with minimum alignment in the group.
// InstType is the IR instruction class each recipe's underlying instruction is
// cast<> to; it must provide getAlign(). The callers in this file instantiate
// it with LoadInst and StoreInst.
// The iterator returned by min_element is dereferenced unconditionally.
// NOTE(review): Group is therefore presumably required to be non-empty --
// confirm. The line declaring the function name and the Group parameter is
// not present in this listing.
5197template <typename InstType>
5198static VPReplicateRecipe *
5200 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
// Order recipes by the alignment of their underlying IR instruction.
5201 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
5202 cast<InstType>(B->getUnderlyingInstr())->getAlign();
5203 });
5204}
5205
5208 const Loop *L) {
// Replaces each group of loads held in Groups with a single unpredicated load
// inserted before the group's first member.
// NOTE(review): the lines declaring this function's name and leading
// parameters, and the initializer of Groups below, are not present in this
// listing -- confirm against the full file.
5209 auto Groups =
5211 if (Groups.empty())
5212 return;
5213
5214 // Process each group of loads.
5215 for (auto &Group : Groups) {
5216 // Try to use the earliest (most dominating) load to replace all others.
// Group[0] is treated as the earliest member and Group.back() as the last;
// their parent blocks bound the region handed to the alias check below.
5217 VPReplicateRecipe *EarliestLoad = Group[0];
5218 VPBasicBlock *FirstBB = EarliestLoad->getParent();
5219 VPBasicBlock *LastBB = Group.back()->getParent();
5220
5221 // Check that the load doesn't alias with stores between first and last.
// The group is skipped (left untouched) when no memory location can be
// derived for the earliest load or when the no-alias check fails.
5222 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
5223 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
5224 continue;
5225
5226 // Collect common metadata from all loads in the group.
5227 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5228
5229 // Find the load with minimum alignment to use.
// NOTE(review): presumably so the merged load does not claim a stronger
// alignment than some member of the group had -- confirm.
5230 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
5231
5232 bool IsSingleScalar = EarliestLoad->isSingleScalar();
5233 assert(all_of(Group,
5234 [IsSingleScalar](VPReplicateRecipe *R) {
5235 return R->isSingleScalar() == IsSingleScalar;
5236 }) &&
5237 "all members in group must agree on IsSingleScalar");
5238
5239 // Create an unpredicated version of the earliest load with common
5240 // metadata.
// The underlying IR instruction comes from the minimum-alignment member, the
// address operand (operand 0) from the earliest member, and the mask is null.
5241 auto *UnpredicatedLoad = new VPReplicateRecipe(
5242 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
5243 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
5244
// Insert ahead of the earliest member, which is itself erased in the loop
// below.
5245 UnpredicatedLoad->insertBefore(EarliestLoad);
5246
5247 // Replace all loads in the group with the unpredicated load.
5248 for (VPReplicateRecipe *Load : Group) {
5249 Load->replaceAllUsesWith(UnpredicatedLoad);
5250 Load->eraseFromParent();
5251 }
5252 }
5253}
5254
/// Returns the result of canHoistOrSinkWithNoAliasCheck for the memory
/// location of the first store in StoresToSink, over the blocks from the
/// parent of the first store to the parent of the last store.
/// Returns false without running that check when no memory location can be
/// derived for the first store or when its AATags carry no Scope.
/// NOTE(review): the line declaring the function name and the StoresToSink
/// parameter is not present in this listing. front()/back() are called
/// unconditionally, so the group is presumably non-empty -- confirm.
5255static bool
5257 PredicatedScalarEvolution &PSE, const Loop &L) {
5258 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
5259 if (!StoreLoc || !StoreLoc->AATags.Scope)
5260 return false;
5261
5262 // When sinking a group of stores, all members of the group alias each other.
5263 // Skip them during the alias checks.
5264 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
5265 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
// SinkInfo carries the group itself plus its first store, PSE and the loop
// into the alias check.
5266 SinkStoreInfo SinkInfo(StoresToSink, *StoresToSink[0], PSE, L);
5267 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
5268}
5269
5272 const Loop *L) {
// Replaces each group of stores held in Groups with one unpredicated store at
// the position of the group's last member; the stored value is chosen by a
// chain of selects over the members' masks.
// NOTE(review): the lines declaring this function's name and leading
// parameters, and the initializer of Groups below, are not present in this
// listing -- confirm against the full file.
5273 auto Groups =
5275 if (Groups.empty())
5276 return;
5277
5278 for (auto &Group : Groups) {
// Groups that fail the no-alias check are left untouched.
5279 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L))
5280 continue;
5281
5282 // Use the last (most dominated) store's location for the unconditional
5283 // store.
5284 VPReplicateRecipe *LastStore = Group.back();
5285 VPBasicBlock *InsertBB = LastStore->getParent();
5286
5287 // Collect common alias metadata from all stores in the group.
5288 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5289
5290 // Build select chain for stored values.
// The chain starts from the value stored by Group[0]. Each later member's
// value replaces the running value where that member's mask is set, so when
// several masks are set the member latest in Group wins. Group[0]'s own mask
// is never consulted.
// NOTE(review): presumably sound because the masks in a group are
// complementary, so some member always stores -- confirm.
5291 VPValue *SelectedValue = Group[0]->getOperand(0);
// The selects are emitted immediately before LastStore.
5292 VPBuilder Builder(InsertBB, LastStore->getIterator());
5293
5294 bool IsSingleScalar = Group[0]->isSingleScalar();
5295 for (unsigned I = 1; I < Group.size(); ++I) {
5296 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
5297 "all members in group must agree on IsSingleScalar");
5298 VPValue *Mask = Group[I]->getMask();
5299 VPValue *Value = Group[I]->getOperand(0);
5300 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
5301 Group[I]->getDebugLoc());
5302 }
5303
5304 // Find the store with minimum alignment to use.
// NOTE(review): presumably so the merged store does not claim a stronger
// alignment than some member of the group had -- confirm.
5305 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
5306
5307 // Create unconditional store with selected value and common metadata.
// The underlying IR instruction comes from the minimum-alignment member, the
// address operand (operand 1) from the last store, and the mask is null.
5308 auto *UnpredicatedStore = new VPReplicateRecipe(
5309 StoreWithMinAlign->getUnderlyingInstr(),
5310 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
5311 /*Mask=*/nullptr, *LastStore, CommonMetadata);
5312 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
5313
5314 // Remove all predicated stores from the group.
// LastStore is a member of Group, so it is erased here as well; the new store
// was already inserted ahead of it.
5315 for (VPReplicateRecipe *Store : Group)
5316 Store->eraseFromParent();
5317 }
5318}
5319
5321 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
5323 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
5324 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
5325
// Nothing to materialize if the trip count has no users.
5326 VPValue *TC = Plan.getTripCount();
5327 if (TC->user_empty())
5328 return;
5329
5330 // Skip cases for which the trip count may be non-trivial to materialize.
5331 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
5332 // tail is required.
5333 if (!Plan.hasScalarTail() ||
5335 Plan.getScalarPreheader() ||
5336 !isa<VPIRValue>(TC))
5337 return;
5338
5339 // Materialize vector trip counts for constants early if it can simply
5340 // be computed as (Original TC / VF * UF) * VF * UF.
5341 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
5342 // tail-folded loops.
5343 ScalarEvolution &SE = *PSE.getSE();
5344 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
5345 if (!isa<SCEVConstant>(TCScev))
5346 return;
// Build (TC udiv (VF * UF)) * (VF * UF) as a SCEV expression. The vector trip
// count is only set when that expression turns out to be a SCEVConstant.
5347 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
5348 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
5349 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
5350 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5351}
5352
5354 VPBasicBlock *VectorPH) {
// BTC is defined on a line that is missing from this listing. If it has no
// users there is nothing to materialize.
5356 if (BTC->user_empty())
5357 return;
5358
// Compute `trip count - 1` at the very start of VectorPH and redirect all
// users of BTC to it.
5359 VPBuilder Builder(VectorPH, VectorPH->begin());
5360 auto *TCTy = Plan.getTripCount()->getScalarType();
5361 auto *TCMO =
5362 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5363 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5364 BTC->replaceAllUsesWith(TCMO);
5365}
5366
// Plans that only have a scalar VF need neither BuildVector nor Unpack.
5368 if (Plan.hasScalarVFOnly())
5369 return;
5370
5371 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5372 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5374 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5375 vp_depth_first_shallow(LoopRegion->getEntry()));
5376 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5377 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5378 // regions. Those are not materialized explicitly yet.
5379 // TODO: materialize build vectors for replicating recipes in replicating
5380 // regions.
5381 for (VPBasicBlock *VPBB :
5382 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5383 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5385 continue;
5386 auto *DefR = cast<VPSingleDefRecipe>(&R);
// True for users that either do not use DefR as scalars or whose region is
// not the top-level vector loop region.
5387 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5388 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5389 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5390 };
// Skip single-scalar replicate recipes, the VPInstructions excluded by the
// check below, and definitions without any user matching the predicate.
5391 if ((isa<VPReplicateRecipe>(DefR) &&
5392 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5393 (isa<VPInstruction>(DefR) &&
5395 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5396 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5397 continue;
5398
5399 Type *ScalarTy = DefR->getScalarType();
5400 unsigned Opcode = ScalarTy->isStructTy()
5403 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5404 BuildVector->insertAfter(DefR);
5405
// Redirect the matching users to BuildVector, except BuildVector itself,
// which must keep DefR as its operand.
5406 DefR->replaceUsesWithIf(
5407 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5408 VPUser &U, unsigned) {
5409 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5410 });
5411 }
5412 }
5413
5414 // Create explicit VPInstructions to convert vectors to scalars. The current
5415 // implementation is conservative - it may miss some cases that may or may not
5416 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5417 // if they are known to operate on scalar values.
5418 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5419 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5421 VPDerivedIVRecipe>(&R))
5422 continue;
5423 for (VPValue *Def : R.definedValues()) {
5424 // Skip recipes that are single-scalar.
5425 // TODO: The Defs skipped here may or may not be vector values.
5426 // Introduce Unpacks, and remove them later, if they are guaranteed to
5427 // produce scalar values.
5428 if (vputils::isSingleScalar(Def))
5429 continue;
5430
5431 // Only introduce an Unpack if some, but not all, users use the first
5432 // lane only.
5433 unsigned NumFirstLaneUsers = count_if(Def->users(), [&Def](VPUser *U) {
5434 return U->usesFirstLaneOnly(Def);
5435 });
5436 if (!NumFirstLaneUsers || NumFirstLaneUsers == Def->getNumUsers())
5437 continue;
5438
// An Unpack for a phi is placed at the first non-phi position of the block;
// otherwise it is placed directly after the defining recipe.
5439 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
5440 if (R.isPhi())
5441 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5442 else
5443 Unpack->insertAfter(&R);
5444 Def->replaceUsesWithIf(Unpack, [&Def](VPUser &U, unsigned) {
5445 return U.usesFirstLaneOnly(Def);
5446 });
5447 }
5448 }
5449 }
5450}
5451
5453 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
5454 bool RequiresScalarEpilogue, VPValue *Step,
5455 std::optional<uint64_t> MaxRuntimeStep) {
// Overview: replaces all uses of the plan's symbolic vector trip count with
// an explicit computation emitted in VectorPHVPBB:
//   n.vec = N - (N % Step), where N is the trip count, rounded up to a
//   multiple of Step first when TailByMasking is set, and where a zero
//   remainder is replaced by Step when RequiresScalarEpilogue is set.
// If the trip count is a constant divisible by *MaxRuntimeStep and no scalar
// epilogue is required, the trip count itself is used instead.
5456 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5457 // There's nothing to do if there are no users of the vector trip count or its
5458 // IR value has already been set.
5459 if (VectorTC.user_empty() || VectorTC.getUnderlyingValue())
5460 return;
5461
5462 VPValue *TC = Plan.getTripCount();
5463 Type *TCTy = TC->getScalarType();
5464 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5465 if (auto *StepR = Step->getDefiningRecipe()) {
5466 assert(VPDominatorTree(Plan).dominates(StepR->getParent(), VectorPHVPBB) &&
5467 "Step VPBB must dominate VectorPHVPBB");
5468 // Insert after Step's definition to maintain valid def-use ordering.
5469 InsertPt = std::next(StepR->getIterator());
5470 }
5471 VPBuilder Builder(VectorPHVPBB, InsertPt);
5472
5473 // For scalable steps, if TC is a constant and is divisible by the maximum
5474 // possible runtime step, then TC % Step == 0 for all valid vscale values
5475 // and the vector trip count equals TC directly.
5476 const APInt *TCVal;
5477 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
5478 TCVal->urem(*MaxRuntimeStep) == 0) {
5479 VectorTC.replaceAllUsesWith(TC);
5480 return;
5481 }
5482
5483 // If the tail is to be folded by masking, round the number of iterations N
5484 // up to a multiple of Step instead of rounding down. This is done by first
5485 // adding Step-1 and then rounding down. Note that it's ok if this addition
5486 // overflows: the vector induction variable will eventually wrap to zero given
5487 // that it starts at zero and its Step is a power of two; the loop will then
5488 // exit, with the last early-exit vector comparison also producing all-true.
5489 if (TailByMasking) {
5490 TC = Builder.createAdd(
5491 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5492 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5493 }
5494
5495 // Now we need to generate the expression for the part of the loop that the
5496 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5497 // iterations are not required for correctness, or N - Step, otherwise. Step
5498 // is equal to the vectorization factor (number of SIMD elements) times the
5499 // unroll factor (number of SIMD instructions).
5500 VPValue *R =
5501 Builder.createNaryOp(Instruction::URem, {TC, Step},
5502 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5503
5504 // There are cases where we *must* run at least one iteration in the remainder
5505 // loop. See the cost model for when this can happen. If the step evenly
5506 // divides the trip count, we set the remainder to be equal to the step. If
5507 // the step does not evenly divide the trip count, no adjustment is necessary
5508 // since there will already be scalar iterations. Note that the minimum
5509 // iterations check ensures that N >= Step.
5510 if (RequiresScalarEpilogue) {
5511 assert(!TailByMasking &&
5512 "requiring scalar epilogue is not supported with tail folding");
5513 VPValue *IsZero =
5514 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5515 R = Builder.createSelect(IsZero, Step, R);
5516 }
5517
5518 VPValue *Res =
5519 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5520 VectorTC.replaceAllUsesWith(Res);
5521}
5522
5524 ElementCount VFEC) {
5525 // If VF and VFxUF have already been materialized (no remaining users),
5526 // there's nothing more to do.
5527 if (Plan.getVF().isMaterialized()) {
5528 assert(Plan.getVFxUF().isMaterialized() &&
5529 "VF and VFxUF must be materialized together");
5530 return;
5531 }
5532
// All new recipes are created at the start of VectorPH, using the scalar type
// of the trip count.
5533 VPBuilder Builder(VectorPH, VectorPH->begin());
5534 Type *TCTy = Plan.getTripCount()->getScalarType();
5535 VPValue &VF = Plan.getVF();
5536 VPValue &VFxUF = Plan.getVFxUF();
5537 // If there are no users of the runtime VF, compute VFxUF by constant folding
5538 // the multiplication of VF and UF.
5539 if (VF.user_empty()) {
5540 VPValue *RuntimeVFxUF =
5541 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5542 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5543 return;
5544 }
5545
5546 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5547 // vscale) * UF.
5548 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
// Users that do not use VF as scalars are given a broadcast of the runtime
// VF; the remaining users get the scalar RuntimeVF below.
5550 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5552 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5553 }
5554 VF.replaceAllUsesWith(RuntimeVF);
5555
// NOTE(review): {true, false} are presumably the wrap flags (NUW set, NSW
// clear) -- confirm against VPBuilder::createOverflowingOp.
5556 VPValue *MulByUF = Builder.createOverflowingOp(
5557 Instruction::Mul,
5558 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5559 {true, false});
5560 VFxUF.replaceAllUsesWith(MulByUF);
5561}
5562
// Overview: creates an operand-less IncomingAliasMask VPInstruction of type i1
// using a builder on the vector preheader, then rewrites every existing user
// of the header mask to use `HeaderMask & AliasMask` instead.
5564 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
5565 auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
5566 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5567
5568 VPBuilder Builder(Plan.getVectorPreheader());
5569 auto *AliasMask = Builder.createNaryOp(
5570 VPInstruction::IncomingAliasMask, {}, nullptr, {}, {},
5571 DebugLoc::getUnknown(), "incoming.alias.mask", I1Ty);
5572
// The AND must come after the header mask definition. If that definition is a
// phi, insert at the first non-phi position of its block instead.
5573 if (HeaderMaskDef->isPhi())
5574 Builder = VPBuilder(&*HeaderMaskDef->getParent()->getFirstNonPhi());
5575 else
5576 Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);
5577
5578 // Update all existing users of the header mask to "HeaderMask & AliasMask".
// ClampedHeaderMask itself is excluded so that it keeps HeaderMask as operand.
5579 auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
5580 HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
5581 return &U != ClampedHeaderMask;
5582 });
5583}
5584
/// Emits into \p AliasCheckVPBB one loop_dependence_war_mask intrinsic per
/// entry of \p DiffChecks and ANDs the results together. All uses of the plan's
/// incoming alias mask are replaced with the combined mask. Returns the number
/// of active lanes of the combined mask, converted from IndexTy to IVTy.
5585VPValue *
5587 ArrayRef<PointerDiffInfo> DiffChecks) {
5588 VPBuilder Builder(AliasCheckVPBB);
5589 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5590
5591 VPValue *IncomingAliasMask = vputils::findIncomingAliasMask(Plan);
5592 assert(IncomingAliasMask && "Expected an alias mask!");
5593
5594 VPValue *AliasMask = nullptr;
5595 for (const PointerDiffInfo &Check : DiffChecks) {
// Src is defined on a line that is missing from this listing.
5597 VPValue *Sink =
5599 Type *AddrType = Src->getScalarType();
5600
5601 // TODO: Only freeze the required pointer (not both src and sink).
5602 if (Check.NeedsFreeze) {
5603 Src = Builder.createScalarFreeze(Src, AddrType, DebugLoc::getUnknown());
5604 Sink = Builder.createScalarFreeze(Sink, AddrType, DebugLoc::getUnknown());
5605 }
5606
5607 // TODO: Generate loop_dependence_raw_mask when there's a read-after-write
5608 // dependency between the source and the sink. This is not necessary for
5609 // correctness of the mask, but using the "raw" variant prevents loads
5610 // depending on the completion of stores.
5611 VPWidenIntrinsicRecipe *WARMask = Builder.insert(new VPWidenIntrinsicRecipe(
5612 Intrinsic::loop_dependence_war_mask,
5613 {Src, Sink, Plan.getConstantInt(AddrType, Check.AccessSize)}, I1Ty));
5614
// The first mask is used as is; every later mask is ANDed into the result.
5615 if (AliasMask)
5616 AliasMask = Builder.createAnd(AliasMask, WARMask);
5617 else
5618 AliasMask = WARMask;
5619 }
5620
// NOTE(review): AliasMask is still null here if DiffChecks is empty; callers
// presumably guarantee at least one check -- confirm.
// IVTy is defined on a line that is missing from this listing.
5622 Type *IndexTy = Plan.getDataLayout().getIndexType(Plan.getContext(), 0);
5623 VPValue *NumActive = Builder.createNaryOp(
5624 VPInstruction::NumActiveLanes, {AliasMask}, nullptr, {}, {},
5625 DebugLoc::getUnknown(), "num.active.lanes", IndexTy);
5626 VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
5627 NumActive, IVTy, IndexTy, DebugLoc::getCompilerGenerated());
5628
5629 IncomingAliasMask->replaceAllUsesWith(AliasMask);
5630
5631 return ClampedVF;
5632}
5633
5635 VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
// Overview: creates the "vector.clamped.vf.check" block, materializes the
// alias mask and the resulting ClampedVF in it, and attaches the block with a
// condition that is true when ClampedVF <= 1 or when stepping by ClampedVF
// could overflow. Finally VF and VFxUF are replaced with ClampedVF.
5636 VPBasicBlock *ClampedVFCheck =
5637 Plan.createVPBasicBlock("vector.clamped.vf.check");
5638
5639 VPValue *ClampedVF = materializeAliasMask(Plan, ClampedVFCheck, DiffChecks);
5640 VPBuilder Builder(ClampedVFCheck);
// DL is defined on a line that is missing from this listing.
5642 Type *TCTy = Plan.getTripCount()->getScalarType();
5643
5644 // Check the "ClampedVF" from the alias mask is larger than one.
5645 VPValue *IsScalar =
5646 Builder.createICmp(CmpInst::ICMP_ULE, ClampedVF,
5647 Plan.getConstantInt(TCTy, 1), DL, "vf.is.scalar");
5648
5649 VPValue *TripCount = Plan.getTripCount();
5650 VPValue *MaxUIntTripCount =
5652 VPValue *DistanceToMax = Builder.createSub(MaxUIntTripCount, TripCount);
5653
5654 // For tail-folding: Don't execute the vector loop if (UMax - n) < ClampedVF.
5655 // Note: The ClampedVF may not be a power-of-two. This means the loop exit
5656 // condition (index.next == n.vec) may not be correct in the case of an
5657 // overflow. The issue is `n.vec` could be zero due to an overflow, but
5658 // index.next is not guaranteed to overflow to zero as the ClampedVF is not a
5659 // power-of-two.
5660 VPValue *TripCountCheck = Builder.createICmp(
5661 ICmpInst::ICMP_ULT, DistanceToMax, ClampedVF, DL, "vf.step.overflow");
5662
5663 VPValue *Cond = Builder.createOr(IsScalar, TripCountCheck, DL);
5664 attachVPCheckBlock(Plan, Cond, ClampedVFCheck, HasBranchWeights);
5665
5666 // Materialize the trip count early as this will add a use of (VFxUF) that
5667 // needs to be replaced with the ClampedVF.
5669 /*TailByMasking=*/true,
5670 /*RequiresScalarEpilogue=*/false,
5671 &Plan.getVFxUF());
5672
// With a concrete UF of 1, VF and VFxUF are the same value, so both can be
// replaced by ClampedVF.
5673 assert(Plan.getConcreteUF() == 1 &&
5674 "Clamped VF not supported with interleaving");
5675 Plan.getVF().replaceAllUsesWith(ClampedVF);
5676 Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
5677}
5678
5680 ScalarEvolution &SE) {
5681 auto *Entry = Plan.getEntry();
5682 VPBuilder Builder(Entry, Entry->begin());
5684 ->getIRBasicBlock()
5685 ->getTerminator()
5686 ->getDebugLoc();
5687 VPSCEVExpander Expander(Builder, SE, DL);
5688
5689 // Expand VPExpandSCEVRecipes to VPInstructions using VPSCEVExpander. During
5690 // the transition, unsupported VPExpandSCEVRecipes are skipped and left for
5691 // late expansion.
5692 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5693 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
// Recipes without users are left in place and are not expanded.
5694 if (!ExpSCEV || ExpSCEV->user_empty())
5695 continue;
// Emit the expansion right before the recipe it replaces.
5696 Builder.setInsertPoint(ExpSCEV);
5697 VPValue *Expanded = Expander.tryToExpand(ExpSCEV->getSCEV());
// A null result means the expander could not handle this expression.
5698 if (!Expanded)
5699 continue;
5700 ExpSCEV->replaceAllUsesWith(Expanded);
5701 // TripCount should not be used after expansion to VPInstructions. Reset to
5702 // poison to avoid dangling references.
5703 if (Plan.getTripCount() == ExpSCEV)
5704 Plan.resetTripCount(Plan.getPoison(ExpSCEV->getScalarType()));
5705 ExpSCEV->eraseFromParent();
5706 }
5707}
5708
// Overview: expands every VPExpandSCEVRecipe in the plan's entry block to IR
// placed before the terminator of the entry IR basic block, replaces the
// recipe with a live-in for the expanded value, and returns a map from each
// expanded SCEV to its IR value.
5711 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5712
5713 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5714 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5715 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5716 // Expand remaining VPExpandSCEVRecipes to IR instructions using SCEVExpander.
5717 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5718 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5719 if (!ExpSCEV)
5720 continue;
5721 const SCEV *Expr = ExpSCEV->getSCEV();
5722 Value *Res =
5723 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5724 ExpandedSCEVs[Expr] = Res;
5725 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5726 ExpSCEV->replaceAllUsesWith(Exp);
// Keep the plan's trip count pointing at a live value if it was this recipe.
5727 if (Plan.getTripCount() == ExpSCEV)
5728 Plan.resetTripCount(Exp);
5729 ExpSCEV->eraseFromParent();
5730 }
5732 "all VPExpandSCEVRecipes must have been expanded");
5733 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5734 // to the VPIRBasicBlock.
// EI walks the recipes of Entry in step with the IR instructions of EntryBB
// (all but the last one). Instructions already wrapped by the
// VPIRInstruction at EI are skipped.
5735 auto EI = Entry->begin();
5736 for (Instruction &I : drop_end(*EntryBB)) {
5737 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5738 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5739 EI++;
5740 continue;
5741 }
5743 }
5744
5745 return ExpandedSCEVs;
5746}
5747
5748/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that can
5749/// be converted to a narrower recipe. \p OpV is used by a wide recipe that feeds
5750/// a store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5751/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5752/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5753/// must be the same value as the operand at index \p OpIdx of the recipe at
5754/// lane 0, \p WideMember0). A VPInterleaveRecipe can be narrowed to a wide load,
5755/// if \p OpV is defined at \p Idx of a full load interleave group.
5756/// A live-in or recipe defined outside the loop region can be converted, if it
5757/// is the same across all lanes, or we can create a BuildVector for it.
5758static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5759 VPValue *OpV, unsigned Idx, bool IsScalable) {
5760 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5761 if (Member0Op->isDefinedOutsideLoopRegions()) {
5762 // Operand matches Member0, broadcast across all fields for both live-ins
5763 // and recipes.
5764 if (Member0Op == OpV)
5765 return true;
5766 // Otherwise distinct per-field VPValues are assembled into a BuildVector.
// This is only allowed for fixed VFs and for values of the same scalar type
// that are also defined outside the loop regions.
5767 return !IsScalable && OpV->isDefinedOutsideLoopRegions() &&
5768 OpV->getScalarType() == Member0Op->getScalarType();
5769 }
5770 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5771 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5772 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5773 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5774 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5775 Member0Op == OpV;
5776 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5777 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5778 return false;
5779}
5780
/// Returns true if the values in \p Ops are all defined by matching wide
/// recipes (same opcode or intrinsic ID, same scalar type, compatible
/// predicate) and, for every operand index, the operands of all members can
/// either be narrowed recursively or pass canNarrowLoad. \p IsScalable is
/// forwarded to canNarrowLoad.
5781static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5783 auto *WideMember0 = dyn_cast<VPRecipeWithIRFlags>(Ops[0]);
5784 if (!WideMember0)
5785 return false;
// Every member must match the recipe feeding lane 0.
5786 for (VPValue *V : Ops) {
5788 return false;
5789 auto *R = cast<VPRecipeWithIRFlags>(V);
5790 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5791 return false;
5792 if (R->getScalarType() != WideMember0->getScalarType())
5793 return false;
5794 if (R->hasPredicate() && R->getPredicate() != WideMember0->getPredicate())
5795 return false;
5796 }
5797
// For each operand index, gather that operand from every member (OpsI) and
// check it, first recursively and then as a narrowable load.
5798 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5800 for (VPValue *Op : Ops)
5801 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5802
5803 if (canNarrowOps(OpsI, IsScalable))
5804 continue;
5805
// Here Idx is the operand index and OpIdx is the member's position in OpsI.
5806 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5807 const auto &[OpIdx, OpV] = P;
5808 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5809 }))
5810 return false;
5811 }
5812
5813 return true;
5814}
5815
5816/// Returns VF from \p VFs if \p InterleaveR is an unmasked interleave group
5817/// with factor and number of members both equal to VF. All members must have
5818/// the same scalar type and the group must access the full vector width.
5819static std::optional<ElementCount>
5822 const TargetTransformInfo &TTI) {
5823 if (!InterleaveR || InterleaveR->getMask())
5824 return std::nullopt;
5825
// Determine the common element type: from the defined values when there are
// no stored values, otherwise from the stored values.
5826 Type *GroupElementTy = nullptr;
5827 if (InterleaveR->getStoredValues().empty()) {
5828 GroupElementTy = InterleaveR->getVPValue(0)->getScalarType();
5829 if (!all_of(InterleaveR->definedValues(), [GroupElementTy](VPValue *Op) {
5830 return Op->getScalarType() == GroupElementTy;
5831 }))
5832 return std::nullopt;
5833 } else {
5834 GroupElementTy = InterleaveR->getStoredValues()[0]->getScalarType();
5835 if (!all_of(InterleaveR->getStoredValues(), [GroupElementTy](VPValue *Op) {
5836 return Op->getScalarType() == GroupElementTy;
5837 }))
5838 return std::nullopt;
5839 }
5840
// The group must not have gaps: factor equals the number of members.
5841 auto IG = InterleaveR->getInterleaveGroup();
5842 if (IG->getFactor() != IG->getNumMembers())
5843 return std::nullopt;
5844
5845 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5846 TypeSize Size = TTI.getRegisterBitWidth(
5849 assert(Size.isScalable() == VF.isScalable() &&
5850 "if Size is scalable, VF must be scalable and vice versa");
5851 return Size.getKnownMinValue();
5852 };
5853
// Return the first VF whose known minimum value equals the interleave factor
// and for which element size * VF equals the register bit width.
5854 for (ElementCount VF : VFs) {
5855 unsigned MinVal = VF.getKnownMinValue();
5856 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5857 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5858 return {VF};
5859 }
5860 return std::nullopt;
5861}
5862
5863/// Returns true if \p VPV is a narrow VPValue, i.e. it is either a VPIRValue
/// or a single-scalar VPReplicateRecipe.
5864static bool isAlreadyNarrow(VPValue *VPV) {
5865 if (isa<VPIRValue>(VPV))
5866 return true;
5867 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5868 return RepR && RepR->isSingleScalar();
5869}
5870
5871// Convert the wide recipes defining the VPValues in \p Members feeding an
5872// interleave group to a single narrow variant. The first member is reused as
5873// the narrowed recipe. BuildVectors for live-in operands are inserted into \p
5874// Preheader.
// Values that are newly created or accepted as narrow are recorded in
// \p NarrowedOps, so a value reached again through another user is returned
// unchanged.
5876 SmallPtrSetImpl<VPValue *> &NarrowedOps,
5877 VPBasicBlock *Preheader) {
5878 VPValue *V = Members.front();
5879 if (NarrowedOps.contains(V))
5880 return V;
5881
// Values defined outside the loop regions are combined into one BuildVector
// that is appended to the preheader.
5882 if (V->isDefinedOutsideLoopRegions()) {
5883 assert(all_of(Members,
5884 [V](VPValue *M) {
5885 return M->isDefinedOutsideLoopRegions() &&
5886 M->getScalarType() == V->getScalarType();
5887 }) &&
5888 "expected distinct loop-invariant values of matching scalar type");
5889 auto *BV = new VPInstruction(VPInstruction::BuildVector, Members);
5890 Preheader->appendRecipe(BV);
5891 NarrowedOps.insert(BV);
5892 return BV;
5893 }
5894
5895 if (isAlreadyNarrow(V))
5896 return V;
5897
5898 VPRecipeBase *R = V->getDefiningRecipe();
// The condition opening this branch is on a line missing from this listing.
// The branch reuses member 0: its flags are intersected with those of the
// other members and each operand is replaced by its narrowed counterpart.
5900 auto *WideMember0 = cast<VPRecipeWithIRFlags>(R);
5901 for (VPValue *Member : Members.drop_front())
5902 WideMember0->intersectFlags(*cast<VPRecipeWithIRFlags>(Member));
5903 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) {
5905 for (VPValue *Member : Members)
5906 OpsI.push_back(Member->getDefiningRecipe()->getOperand(Idx));
5907 WideMember0->setOperand(
5908 Idx, narrowInterleaveGroupOp(OpsI, NarrowedOps, Preheader));
5909 }
5910 return V;
5911 }
5912
5913 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5914 // Narrow interleave group to wide load, as transformed VPlan will only
5915 // process one original iteration.
5916 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5917 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5918 LoadGroup->getMask(), /*Consecutive=*/true,
5919 *LoadGroup, LoadGroup->getDebugLoc());
5920 L->insertBefore(LoadGroup);
5921 NarrowedOps.insert(L);
5922 return L;
5923 }
5924
5925 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5926 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5927 "must be a single scalar load");
5928 NarrowedOps.insert(RepR);
5929 return RepR;
5930 }
5931
// Remaining case: a wide load. If its address comes from a
// VPVectorPointerRecipe, use that recipe's operand 0 as the pointer instead.
5932 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5933 VPValue *PtrOp = WideLoad->getAddr();
5934 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5935 PtrOp = VecPtr->getOperand(0);
5936 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5937 // process one original iteration.
5938 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5939 /*IsUniform*/ true,
5940 /*Mask*/ nullptr, {}, *WideLoad);
5941 N->insertBefore(WideLoad);
5942 NarrowedOps.insert(N);
5943 return N;
5944}
5945
/// Tries to narrow the interleave groups of \p Plan for a single VF. Returns
/// nullptr if nothing was narrowed or if the plan had only one VF. Otherwise
/// \p Plan is restricted to the narrowed VF and the returned plan is a
/// duplicate of the original that holds the remaining VFs.
5946std::unique_ptr<VPlan>
5948 const TargetTransformInfo &TTI) {
5949 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5950
5951 if (!VectorLoop)
5952 return nullptr;
5953
5954 // Only handle single-block loops for now.
5955 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5956 return nullptr;
5957
5958 // Skip plans when we may not be able to properly narrow.
5959 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5960 if (!match(&Exiting->back(), m_BranchOnCount()))
5961 return nullptr;
5962
5963 assert(match(&Exiting->back(),
5965 m_Specific(&Plan.getVectorTripCount()))) &&
5966 "unexpected branch-on-count");
5967
// StoreGroups (declared on a line missing from this listing) collects the
// store interleave groups to narrow. VFToOptimize is the single VF that all
// interleave groups seen so far agree on.
5969 std::optional<ElementCount> VFToOptimize;
5970 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5973 continue;
5974
5975 // Bail out on recipes not supported at the moment:
5976 // * phi recipes other than the canonical induction
5977 // * recipes writing to memory except interleave groups
5978 // Only support plans with a canonical induction phi.
5979 if (R.isPhi())
5980 return nullptr;
5981
5982 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5983 if (R.mayWriteToMemory() && !InterleaveR)
5984 return nullptr;
5985
5986 // Bail out if any recipe defines a vector value used outside the
5987 // vector loop region.
5988 if (any_of(R.definedValues(), [&](VPValue *V) {
5989 return any_of(V->users(), [&](VPUser *U) {
5990 auto *UR = cast<VPRecipeBase>(U);
5991 return UR->getParent()->getParent() != VectorLoop;
5992 });
5993 }))
5994 return nullptr;
5995
5996 // All other ops are allowed, but we reject uses that cannot be converted
5997 // when checking all allowed consumers (store interleave groups) below.
5998 if (!InterleaveR)
5999 continue;
6000
6001 // Try to find a single VF, where all interleave groups are consecutive and
6002 // saturate the full vector width. If we already have a candidate VF, check
6003 // if it is applicable for the current InterleaveR, otherwise look for a
6004 // suitable VF across the Plan's VFs.
6006 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
6007 : to_vector(Plan.vectorFactors());
6008 std::optional<ElementCount> NarrowedVF =
6009 isConsecutiveInterleaveGroup(InterleaveR, VFs, TTI);
6010 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
6011 return nullptr;
6012 VFToOptimize = NarrowedVF;
6013
6014 // Skip read interleave groups.
6015 if (InterleaveR->getStoredValues().empty())
6016 continue;
6017
6018 // Narrow interleave groups, if all operands are already matching narrow
6019 // ops.
6020 auto *Member0 = InterleaveR->getStoredValues()[0];
6021 if (isAlreadyNarrow(Member0) &&
6022 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
6023 StoreGroups.push_back(InterleaveR);
6024 continue;
6025 }
6026
6027 // For now, we only support full interleave groups storing load interleave
6028 // groups.
6029 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
6030 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
6031 if (!DefR)
6032 return false;
6033 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
6034 return IR && IR->getInterleaveGroup()->isFull() &&
6035 IR->getVPValue(Op.index()) == Op.value();
6036 })) {
6037 StoreGroups.push_back(InterleaveR);
6038 continue;
6039 }
6040
6041 // Check if all values feeding InterleaveR are matching wide recipes, with
6042 // operands that can be narrowed.
6043 if (!canNarrowOps(InterleaveR->getStoredValues(),
6044 VFToOptimize->isScalable()))
6045 return nullptr;
6046 StoreGroups.push_back(InterleaveR);
6047 }
6048
6049 if (StoreGroups.empty())
6050 return nullptr;
6051
// A scalar epilogue is required when the middle block's only successor is the
// scalar preheader.
6052 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6053 bool RequiresScalarEpilogue =
6054 MiddleVPBB->getNumSuccessors() == 1 &&
6055 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
6056 // Bail out for tail-folding (middle block with a single successor to exit).
6057 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
6058 return nullptr;
6059
6060 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
6061 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
6062 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
6063 // TODO: Handle cases where only some interleave groups can be narrowed.
6064 std::unique_ptr<VPlan> NewPlan;
6065 if (size(Plan.vectorFactors()) != 1) {
6066 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
6067 Plan.setVF(*VFToOptimize);
6068 NewPlan->removeVF(*VFToOptimize);
6069 }
6070
6071 // Values that have already been narrowed; shared between all store groups.
6072 SmallPtrSet<VPValue *, 4> NarrowedOps;
6073 VPBasicBlock *Preheader = Plan.getVectorPreheader();
6074 // Narrow operation tree rooted at store groups.
// Each store group is replaced by one consecutive, unmasked wide store of the
// narrowed value.
6075 for (auto *StoreGroup : StoreGroups) {
6076 VPValue *Res = narrowInterleaveGroupOp(StoreGroup->getStoredValues(),
6077 NarrowedOps, Preheader);
6078 auto *SI =
6079 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
6080 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
6081 /*Consecutive=*/true, *StoreGroup,
6082 StoreGroup->getDebugLoc());
6083 S->insertBefore(StoreGroup);
6084 StoreGroup->eraseFromParent();
6085 }
6086
6087 // Adjust induction to reflect that the transformed plan only processes one
6088 // original iteration.
// CanIVInc is defined on a line that is missing from this listing.
6090 Type *CanIVTy = VectorLoop->getCanonicalIVType();
6091 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
6092 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
6093
// New step: vscale * UF for scalable VFs, plain UF otherwise. VF itself is
// replaced by vscale or by the constant 1, respectively.
6094 VPValue *UF = &Plan.getUF();
6095 VPValue *Step;
6096 if (VFToOptimize->isScalable()) {
6097 VPValue *VScale =
6098 PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
6099 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
6100 {true, false});
6101 Plan.getVF().replaceAllUsesWith(VScale);
6102 } else {
6103 Step = UF;
6104 Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
6105 }
6106 // Materialize vector trip count with the narrowed step.
6107 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
6108 RequiresScalarEpilogue, Step);
6109
6110 CanIVInc->setOperand(1, Step);
6111 Plan.getVFxUF().replaceAllUsesWith(Step);
6112
6113 removeDeadRecipes(Plan);
6114 assert(none_of(*VectorLoop->getEntryBasicBlock(),
6116 "All VPVectorPointerRecipes should have been removed");
6117 return NewPlan;
6118}
6119
6120/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
6121/// BranchOnCond recipe.
/// The weights are {1, VectorStep - 1}, where VectorStep is the concrete UF
/// times the known minimum of \p VF, additionally multiplied by
/// \p VScaleForTuning for scalable VFs when it is available.
6123 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
6124 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6125 auto *MiddleTerm =
6127 // Only add branch metadata if there is a (conditional) terminator.
6128 if (!MiddleTerm)
6129 return;
6130
6131 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
6132 "must have a BranchOnCond");
6133 // Assume that `TripCount % VectorStep ` is equally distributed.
6134 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
6135 if (VF.isScalable() && VScaleForTuning.has_value())
6136 VectorStep *= *VScaleForTuning;
// VectorStep must be non-zero because VectorStep - 1 is used as a weight.
6137 assert(VectorStep > 0 && "trip count should not be zero");
6138 MDBuilder MDB(Plan.getContext());
6139 MDNode *BranchWeights =
6140 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
6141 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
6142}
6143
6145 VFRange &Range) {
  // For every first-order recurrence phi in the vector loop header, rewrite
  // the middle-block extracts of the last lane of its splice: users of such an
  // extract that are VPIRPhis are redirected to a newly created
  // ExtractPenultimateElement of the splice's second operand.
6146 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
6147 auto *MiddleVPBB = Plan.getMiddleBlock();
6148 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6149
  // True only for the scalable VF `vscale x 1`.
6150 auto IsScalableOne = [](ElementCount VF) -> bool {
6151 return VF == ElementCount::getScalable(1);
6152 };
6153
6154 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
6155 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
6156 if (!FOR)
6157 continue;
6158
6159 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
6160 "Cannot handle loops with uncountable early exits");
6161
6162 // Find the existing splice for this FOR, created in
6163 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
6164 // RecurSplice there; only RecurSplice itself still references FOR.
6165 auto *RecurSplice =
6167 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
6168
6169 // For VF vscale x 1, if vscale = 1, we are unable to extract the
6170 // penultimate value of the recurrence. Instead we rely on the existing
6171 // extract of the last element from the result of
6172 // VPInstruction::FirstOrderRecurrenceSplice.
6173 // TODO: Consider vscale_range info and UF.
    // Note that this leaves the whole function (it is a `return`, not a
    // `continue`), so any remaining header phis are not processed either.
6174 if (any_of(RecurSplice->users(),
6175 [](VPUser *U) { return !cast<VPRecipeBase>(U)->getRegion(); }) &&
6177 Range))
6178 return;
6179
6180 // This is the second phase of vectorizing first-order recurrences, creating
6181 // extracts for users outside the loop. An overview of the transformation is
6182 // described below. Suppose we have the following loop with some use after
6183 // the loop of the last a[i-1],
6184 //
6185 // for (int i = 0; i < n; ++i) {
6186 // t = a[i - 1];
6187 // b[i] = a[i] - t;
6188 // }
6189 // use t;
6190 //
6191 // There is a first-order recurrence on "a". For this loop, the shorthand
6192 // scalar IR looks like:
6193 //
6194 // scalar.ph:
6195 // s.init = a[-1]
6196 // br scalar.body
6197 //
6198 // scalar.body:
6199 // i = phi [0, scalar.ph], [i+1, scalar.body]
6200 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
6201 // s2 = a[i]
6202 // b[i] = s2 - s1
6203 // br cond, scalar.body, exit.block
6204 //
6205 // exit.block:
6206 // use = lcssa.phi [s1, scalar.body]
6207 //
6208 // In this example, s1 is a recurrence because its value depends on the
6209 // previous iteration. In the first phase of vectorization, we created a
6210 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
6211 // for users in the scalar preheader and exit block.
6212 //
6213 // vector.ph:
6214 // v_init = vector(..., ..., ..., a[-1])
6215 // br vector.body
6216 //
6217 // vector.body
6218 // i = phi [0, vector.ph], [i+4, vector.body]
6219 // v1 = phi [v_init, vector.ph], [v2, vector.body]
6220 // v2 = a[i, i+1, i+2, i+3]
6221 // v1' = splice(v1(3), v2(0, 1, 2))
6222 // b[i, i+1, i+2, i+3] = v2 - v1'
6223 // br cond, vector.body, middle.block
6224 //
6225 // middle.block:
6226 // vector.recur.extract.for.phi = v2(2)
6227 // vector.recur.extract = v2(3)
6228 // br cond, scalar.ph, exit.block
6229 //
6230 // scalar.ph:
6231 // scalar.recur.init = phi [vector.recur.extract, middle.block],
6232 // [s.init, otherwise]
6233 // br scalar.body
6234 //
6235 // scalar.body:
6236 // i = phi [0, scalar.ph], [i+1, scalar.body]
6237 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
6238 // s2 = a[i]
6239 // b[i] = s2 - s1
6240 // br cond, scalar.body, exit.block
6241 //
6242 // exit.block:
6243 // lo = lcssa.phi [s1, scalar.body],
6244 // [vector.recur.extract.for.phi, middle.block]
6245 //
6246 // Update extracts of the splice in the middle block: they extract the
6247 // penultimate element of the recurrence.
6249 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
6250 if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(RecurSplice))))
6251 continue;
6252
6253 auto *ExtractR = cast<VPInstruction>(&R);
      // The penultimate element is taken from the splice's second operand
      // (v2 in the example above).
6254 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
6255 VPInstruction::ExtractPenultimateElement, RecurSplice->getOperand(1),
6256 {}, "vector.recur.extract.for.phi");
      // Iterate over a copy of the user list, since replacing uses modifies
      // it. Only VPIRPhi users are redirected; other users keep ExtractR.
6257 for (VPUser *ExitU : to_vector(ExtractR->users())) {
6258 if (auto *ExitPhi = dyn_cast<VPIRPhi>(ExitU))
6259 ExitPhi->replaceUsesOfWith(ExtractR, PenultimateElement);
6260 }
6261 }
6262 }
6263}
6264
6265/// Check if \p V is a binary expression of a widened IV and a loop-invariant
6266/// value. Returns the widened IV if found, nullptr otherwise.
/// Only VPWidenRecipes with a binary opcode are considered; integer division
/// and remainder opcodes are rejected.
6268 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
6269 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
6270 Instruction::isIntDivRem(BinOp->getOpcode()))
6271 return nullptr;
6272
  // The IV may be either operand: if operand 0 is not a widened induction,
  // treat operand 1 as the IV candidate and operand 0 as the invariant one.
6273 VPValue *WidenIVCandidate = BinOp->getOperand(0);
6274 VPValue *InvariantCandidate = BinOp->getOperand(1);
6275 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
6276 std::swap(WidenIVCandidate, InvariantCandidate);
6277
6278 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
6279 return nullptr;
6280
  // Yields nullptr when neither operand is a widened induction.
6281 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
6282}
6283
6284/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
6285/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
/// \p BinOp itself is left untouched; the rewritten clone is returned.
6289 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
  // Work on a clone so that only the copy's operands are rewritten.
6290 auto *ClonedOp = BinOp->clone();
  // Replace whichever of the two operands is the widened IV.
6291 if (ClonedOp->getOperand(0) == WidenIV) {
6292 ClonedOp->setOperand(0, ScalarIV);
6293 } else {
6294 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
6295 ClonedOp->setOperand(1, ScalarIV);
6296 }
6297 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
6298 return ClonedOp;
6299}
6300
6303 Loop &L) {
  // For each qualifying integer reduction phi in the vector loop header whose
  // find-last expression is (or is a simple binary expression of) an affine
  // AddRec with a known non-zero step, replace the reduction by a FindIV
  // reduction that is reduced with a signed/unsigned min/max in the middle
  // block. "No match" is detected either via a sentinel value outside the IV's
  // range or, failing that, via an extra boolean AnyOf reduction.
6304 ScalarEvolution &SE = *PSE.getSE();
6305 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
6306
6307 // Helper lambda to check if the IV range excludes the sentinel value. Try
6308 // signed first, then unsigned. Return an excluded sentinel if found,
6309 // otherwise return std::nullopt.
  // The sentinel candidate is the minimum value when reducing with a max
  // (UseMax) and the maximum value when reducing with a min.
6310 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
6311 bool UseMax) -> std::optional<APSInt> {
6312 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
6313 for (bool Signed : {true, false}) {
6314 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
6315 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
6316
6317 ConstantRange IVRange =
6318 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
6319 if (!IVRange.contains(Sentinel))
6320 return Sentinel;
6321 }
6322 return std::nullopt;
6323 };
6324
6325 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
  // Early-inc iteration: the phi being visited is erased at the end of a
  // successful transformation.
6326 for (VPRecipeBase &Phi :
6327 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
6328 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
6330 PhiR->getRecurrenceKind()))
6331 continue;
6332
    // Pointer and floating-point reductions are not handled.
6333 Type *PhiTy = PhiR->getScalarType();
6334 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
6335 continue;
6336
6337 // If there's a header mask, the backedge select will not be the find-last
6338 // select.
    // FindLastSelect starts out as the backedge value; with a header mask the
    // pattern below is expected to rebind it to the select's true operand
    // (presumably via m_VPSingleDefRecipe -- confirm).
6339 VPValue *BackedgeVal = PhiR->getBackedgeValue();
6340 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
6341 if (HeaderMask &&
6342 !match(BackedgeVal,
6343 m_Select(m_Specific(HeaderMask),
6344 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
6345 continue;
6346
6347 // Get the find-last expression from the find-last select of the reduction
6348 // phi. The find-last select should be a select between the phi and the
6349 // find-last expression.
    // Both operand orders are accepted: PhiR may be the true or false value.
6350 VPValue *Cond, *FindLastExpression;
6351 if (!match(FindLastSelect, m_SelectLike(m_VPValue(Cond), m_Specific(PhiR),
6352 m_VPValue(FindLastExpression))) &&
6353 !match(FindLastSelect,
6354 m_SelectLike(m_VPValue(Cond), m_VPValue(FindLastExpression),
6355 m_Specific(PhiR))))
6356 continue;
6357
6358 // Check if FindLastExpression is a simple expression of a widened IV. If
6359 // so, we can track the underlying IV instead and sink the expression.
6360 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
6361 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
6362 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
6363 &L);
6364 const SCEV *Step;
6365 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6366 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
6368 "IVOfExpressionToSink not being an AddRec must imply "
6369 "FindLastExpression not being an AddRec.");
6370 continue;
6371 }
6372
6373 // Determine direction from SCEV step.
6374 if (!SE.isKnownNonZero(Step))
6375 continue;
6376
6377 // Positive step means we need UMax/SMax to find the last IV value, and
6378 // UMin/SMin otherwise.
6379 bool UseMax = SE.isKnownPositive(Step);
6380 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
6381 bool UseSigned = SentinelVal && SentinelVal->isSigned();
6382
6383 // Sinking an expression will disable epilogue vectorization. Only use it,
6384 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
6385 // also prevent vectorizing using a sentinel (e.g., if the expression is a
6386 // multiply or divide by large constant, respectively), which also makes
6387 // sinking undesirable.
6388 if (IVOfExpressionToSink) {
6389 const SCEV *FindLastExpressionSCEV =
6390 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
      // Note: a successful match here overwrites Step with the step of the
      // full expression.
6391 if (match(FindLastExpressionSCEV,
6392 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6393 bool NewUseMax = SE.isKnownPositive(Step);
6394 if (auto NewSentinel =
6395 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
6396 // The original expression already has a sentinel, so prefer not
6397 // sinking to keep epilogue vectorization possible.
6398 SentinelVal = *NewSentinel;
6399 UseSigned = NewSentinel->isSigned();
6400 UseMax = NewUseMax;
6401 IVSCEV = FindLastExpressionSCEV;
6402 IVOfExpressionToSink = nullptr;
6403 }
6404 }
6405 }
6406
6407 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
6408 // if the condition was ever true. Requires the IV to not wrap, otherwise we
6409 // cannot use min/max.
    // The signedness of the min/max follows whichever no-wrap flag is set,
    // preferring the signed one.
6410 if (!SentinelVal) {
6411 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
6412 if (AR->hasNoSignedWrap())
6413 UseSigned = true;
6414 else if (AR->hasNoUnsignedWrap())
6415 UseSigned = false;
6416 else
6417 continue;
6418 }
6419
6421 BackedgeVal,
6423
6424 VPValue *NewFindLastSelect = BackedgeVal;
6425 VPValue *SelectCond = Cond;
6426 if (!SentinelVal || IVOfExpressionToSink) {
6427 // When we need to create a new select, normalize the condition so that
6428 // PhiR is the last operand and include the header mask if needed.
6429 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
6430 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
      // If PhiR is currently the true value, invert the condition so that it
      // selects the find-last expression instead.
6431 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
6432 SelectCond = LoopBuilder.createNot(SelectCond);
6433
6434 // When tail folding, mask the condition with the header mask to prevent
6435 // propagating poison from inactive lanes in the last vector iteration.
6436 if (HeaderMask)
6437 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
6438
      // A new select is only needed if the condition changed or the selected
      // value becomes the underlying IV.
6439 if (SelectCond != Cond || IVOfExpressionToSink) {
6440 NewFindLastSelect = LoopBuilder.createSelect(
6441 SelectCond,
6442 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
6443 PhiR, DL);
6444 }
6445 }
6446
6447 // Create the reduction result in the middle block using sentinel directly.
6448 RecurKind MinMaxKind =
6449 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
6450 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
6451 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
6452 FastMathFlags());
6453 DebugLoc ExitDL = RdxResult->getDebugLoc();
6454 VPBuilder MiddleBuilder(RdxResult);
6455 VPValue *ReducedIV =
6457 NewFindLastSelect, Flags, ExitDL);
6458
6459 // If IVOfExpressionToSink is an expression to sink, sink it now.
    // The sunk expression is a clone of FindLastExpression applied to the
    // reduced IV value.
6460 VPValue *VectorRegionExitingVal = ReducedIV;
6461 if (IVOfExpressionToSink)
6462 VectorRegionExitingVal =
6463 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
6464 ReducedIV, IVOfExpressionToSink);
6465
6466 VPValue *NewRdxResult;
6467 VPValue *StartVPV = PhiR->getStartValue();
6468 if (SentinelVal) {
6469 // Sentinel-based approach: reduce IVs with min/max, compare against
6470 // sentinel to detect if condition was ever true, select accordingly.
6471 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
6472 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
6473 Sentinel, ExitDL);
6474 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
6475 StartVPV, ExitDL);
      // From here on StartVPV is the start value of the new phi: the sentinel.
6476 StartVPV = Sentinel;
6477 } else {
6478 // Introduce a boolean AnyOf reduction to track if the condition was ever
6479 // true in the loop. Use it to select the initial start value, if it was
6480 // never true.
      // The phi's backedge operand is a placeholder here and is set to OrVal
      // below, once OrVal has been created.
6481 auto *AnyOfPhi = new VPReductionPHIRecipe(
6482 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
6483 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
6484 AnyOfPhi->insertAfter(PhiR);
6485
6486 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
6487 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
6488 AnyOfPhi->setOperand(1, OrVal);
6489
6490 NewRdxResult = MiddleBuilder.createAnyOfReduction(
6491 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
6492
6493 // Initialize the IV reduction phi with the neutral element, not the
6494 // original start value, to ensure correct min/max reduction results.
6495 StartVPV = Plan.getOrAddLiveIn(
6496 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
6497 }
6498 RdxResult->replaceAllUsesWith(NewRdxResult);
6499 RdxResult->eraseFromParent();
6500
    // Finally swap the original phi for a FindIV reduction phi that starts at
    // StartVPV and whose backedge value is NewFindLastSelect.
6501 auto *NewPhiR = new VPReductionPHIRecipe(
6502 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
6503 *NewFindLastSelect, RdxUnordered{1}, {},
6504 PhiR->hasUsesOutsideReductionChain());
6505 NewPhiR->insertBefore(PhiR);
6506 PhiR->replaceAllUsesWith(NewPhiR);
6507 PhiR->eraseFromParent();
6508 }
6509}
6510
6511namespace {
6512
6513using ExtendKind = TTI::PartialReductionExtendKind;
/// Describes one extend feeding a partial reduction. A default-constructed
/// value (null SrcType, PR_None) stands for "no extend".
6514struct ReductionExtend {
  /// Scalar type of the extend's source operand, as filled in by
  /// matchExtendedReductionOperand.
6515 Type *SrcType = nullptr;
  /// The kind of extend, a TTI::PartialReductionExtendKind.
6516 ExtendKind Kind = ExtendKind::PR_None;
6517};
6518
6519/// Describes the extends used to compute the extended reduction operand.
6520/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
6521/// operation.
/// ExtendB is "absent" when its Kind is PR_None (the default).
6522struct ExtendedReductionOperand {
6523 /// The recipe that consumes the extends.
6524 VPWidenRecipe *ExtendsUser = nullptr;
6525 /// Extend descriptions (inputs to getPartialReductionCost).
6526 ReductionExtend ExtendA, ExtendB;
6527};
6528
6529/// A chain of recipes that form a partial reduction. Matches either
6530/// reduction_bin_op (extended op, accumulator), or
6531/// reduction_bin_op (accumulator, extended op).
6532/// The possible forms of the "extended op" are listed in
6533/// matchExtendedReductionOperand.
6534struct VPPartialReductionChain {
6535 /// The top-level binary operation that forms the reduction to a scalar
6536 /// after the loop body.
6537 VPWidenRecipe *ReductionBinOp = nullptr;
6538 /// The user of the extends that is then reduced.
6539 ExtendedReductionOperand ExtendedOp;
6540 /// The recurrence kind for the entire partial reduction chain.
6541 /// This allows distinguishing between Sub and AddWithSub recurrences,
6542 /// when the ReductionBinOp is an Instruction::Sub.
  /// Note: no default initializer; must be set by whoever builds the chain.
6543 RecurKind RK;
6544 /// The index of the accumulator operand of ReductionBinOp. The extended op
6545 /// is `1 - AccumulatorOpIdx`.
  /// Note: no default initializer; must be set by whoever builds the chain.
6546 unsigned AccumulatorOpIdx;
  /// The VF scale factor of the partial reduction. transformToPartialReduction
  /// passes it as the VFScaleFactor of the reduction recipe it creates and
  /// sets it on the reduction phi. Note: no default initializer.
6547 unsigned ScaleFactor;
6548};
6549
/// Rewrite the extended reduction operand \p Op into an equivalent form whose
/// extends are directly visible to the partial reduction. Three patterns are
/// handled (see the comments in the body). Returns the recipe to use as the
/// extended operand afterwards: \p Op itself (possibly modified in place, or
/// unchanged when no rewrite applies) or a newly created recipe.
6550static VPSingleDefRecipe *
6551optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op) {
6552 // reduce.add(mul(ext(A), C))
6553 // -> reduce.add(mul(ext(A), ext(trunc(C))))
6554 const APInt *Const;
6555 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
6556 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
6557 Instruction::CastOps ExtOpc = ExtA->getOpcode();
6558 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
6559 if (!Op->hasOneUse() ||
6561 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
6562 return Op;
6563
    // Op is updated in place: its constant operand is replaced by the
    // constant truncated to A's type and re-extended with A's extend opcode.
6564 VPBuilder Builder(Op);
6565 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
6566 Op->getOperand(1), NarrowTy);
6567 Type *WideTy = ExtA->getScalarType();
6568 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
6569 return Op;
6570 }
6571
6572 // reduce.add(abs(sub(ext(A), ext(B))))
6573 // -> reduce.add(ext(absolute-difference(A, B)))
6574 VPValue *X, *Y;
6577 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
6578 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6579 assert(Ext->getOpcode() ==
6580 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
6581 "Expected both the LHS and RHS extends to be the same");
6582 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
6583 VPBuilder Builder(Op);
6584 Type *SrcTy = X->getScalarType();
    // X and Y each feed both the max and the min below; they are frozen
    // first, presumably so that both uses observe the same value.
6585 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
6586 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
    // The absolute difference is computed as max(X, Y) - min(X, Y) in the
    // source type, using signed or unsigned min/max to match the extends.
6587 auto *Max = Builder.insert(
6588 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
6589 {FreezeX, FreezeY}, SrcTy));
6590 auto *Min = Builder.insert(
6591 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
6592 {FreezeX, FreezeY}, SrcTy));
6593 auto *AbsDiff =
6594 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
    // The difference is always zero-extended to Op's type, regardless of the
    // signedness of the original extends. Op itself is not erased here.
6595 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
6596 Op->getScalarType());
6597 }
6598
6599 // reduce.add(ext(mul(ext(A), ext(B))))
6600 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
6601 // TODO: Support this optimization for float types.
6603 m_ZExtOrSExt(m_VPValue()))))) {
6604 auto *Ext = cast<VPWidenCastRecipe>(Op);
6605 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
6606 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6607 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
    // Bail out if the mul has other users, if the inner extends differ in
    // kind, or if the outer extend differs from the inner ones (the latter is
    // tolerated when both mul operands are the very same extend).
6608 if (!Mul->hasOneUse() ||
6609 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
6610 MulLHS->getOpcode() != MulRHS->getOpcode())
6611 return Op;
6612 VPBuilder Builder(Mul);
6613 auto *NewLHS = Builder.createWidenCast(
6614 MulLHS->getOpcode(), MulLHS->getOperand(0), Ext->getScalarType());
    // Reuse the single widened extend when both operands were the same cast.
6615 auto *NewRHS = MulLHS == MulRHS
6616 ? NewLHS
6617 : Builder.createWidenCast(MulRHS->getOpcode(),
6618 MulRHS->getOperand(0),
6619 Ext->getScalarType());
6620 auto *NewMul = Mul->cloneWithOperands({NewLHS, NewRHS});
6621 Builder.insert(NewMul);
    // Op (the outer extend) and the old mul are erased; callers must use the
    // returned recipe instead of Op.
6622 Op->replaceAllUsesWith(NewMul);
6623 Op->eraseFromParent();
6624 Mul->eraseFromParent();
6625 return NewMul;
6626 }
6627
6628 return Op;
6629}
6630
/// Bundle the reduction \p Red together with the recipes computing its vector
/// operand (extends, optional mul and optional negation) into a newly
/// allocated VPExpressionRecipe. The vector operand must have one of the
/// shapes listed in the body; any other shape hits llvm_unreachable. The
/// returned recipe is not inserted anywhere by this function.
6631static VPExpressionRecipe *
6632createPartialReductionExpression(VPReductionRecipe *Red) {
6633 VPValue *VecOp = Red->getVecOp();
6634
6635 // reduce.[f]add(ext(op))
6636 // -> VPExpressionRecipe(op, red)
6637 if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
6638 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
6639
6640 // reduce.[f]add(neg(ext(op)))
6641 // -> VPExpressionRecipe(op, sub/neg, red)
6642 if (match(VecOp, m_AnyNeg(m_WidenAnyExtend(m_VPValue())))) {
6643 auto *Neg = cast<VPWidenRecipe>(VecOp);
    // The extend is the last operand of the negation: operand 0 of a unary
    // neg, operand 1 of a `sub 0, x`.
6644 auto *Ext =
6645 cast<VPWidenCastRecipe>(Neg->getOperand(Neg->getNumOperands() - 1));
6646 return new VPExpressionRecipe(Ext, Neg, Red);
6647 }
6648
6649 // reduce.[f]add([f]mul(ext(a), ext(b)))
6650 // -> VPExpressionRecipe(a, b, mul, red)
6651 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
6652 match(VecOp,
6654 auto *Mul = cast<VPWidenRecipe>(VecOp);
6655 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6656 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6657 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
6658 }
6659
6660 // reduce.fadd(fneg(fmul(fpext(a), fpext(b))))
6661 // -> VPExpressionRecipe(a, b, fmul, fsub, red)
6662 if (match(VecOp,
6664 auto *FNeg = cast<VPWidenRecipe>(VecOp);
6665 auto *FMul = cast<VPWidenRecipe>(FNeg->getOperand(0));
6666 auto *ExtA = cast<VPWidenCastRecipe>(FMul->getOperand(0));
6667 auto *ExtB = cast<VPWidenCastRecipe>(FMul->getOperand(1));
6668 return new VPExpressionRecipe(ExtA, ExtB, FMul, FNeg, Red);
6669 }
6670
6671 // reduce.add(neg(mul(ext(a), ext(b))))
6672 // -> VPExpressionRecipe(a, b, mul, sub, red)
6674 m_ZExtOrSExt(m_VPValue()))))) {
6675 auto *Sub = cast<VPWidenRecipe>(VecOp);
    // For the integer negation `sub 0, mul`, the mul is operand 1.
6676 auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
6677 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6678 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6679 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
6680 }
6681
6682 llvm_unreachable("Unsupported expression");
6683}
6684
6685// Helper to transform a partial reduction chain into a partial reduction
6686// recipe. Assumes profitability has been checked.
// \p Chain describes one link of the chain and \p RdxPhi is the reduction phi
// the chain belongs to. The phi and its start value are only updated when the
// link is the last one in the chain.
6687static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6688 VPlan &Plan,
6689 VPReductionPHIRecipe *RdxPhi) {
6690 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6691 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6692
  // The two operands of the reduction bin-op are the accumulator and the
  // extended operand; AccumulatorOpIdx says which one is which.
6693 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6694 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6695 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6696
6697 // FIXME: Do these transforms before invoking the cost-model.
6698 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp);
6699
6700 // Sub-reductions can be implemented in two ways:
6701 // (1) negate the operand in the vector loop (the default way).
6702 // (2) subtract the reduced value from the init value in the middle block.
6703 // Both ways keep the reduction itself as an 'add' reduction.
6704 //
6705 // The ISD nodes for partial reductions don't support folding the
6706 // sub/negation into its operands because the following is not a valid
6707 // transformation:
6708 // sub(0, mul(ext(a), ext(b)))
6709 // -> mul(ext(a), ext(sub(0, b)))
6710 //
6711 // It's therefore better to choose option (2) such that the partial
6712 // reduction is always positive (starting at '0') and to do a final
6713 // subtract in the middle block.
  // The block below implements option (1): it only applies to a [f]sub link
  // whose chain recurrence kind is not Sub/FSub. Sub/FSub chains are handled
  // via option (2) at the end of this function.
6714 if ((WidenRecipe->getOpcode() == Instruction::Sub &&
6715 Chain.RK != RecurKind::Sub) ||
6716 (WidenRecipe->getOpcode() == Instruction::FSub &&
6717 Chain.RK != RecurKind::FSub)) {
6718 VPBuilder Builder(WidenRecipe);
6719 Type *ElemTy = ExtendedOp->getScalarType();
6720 VPWidenRecipe *NegRecipe;
6721 if (WidenRecipe->getOpcode() == Instruction::FSub) {
6722 NegRecipe =
6723 new VPWidenRecipe(Instruction::FNeg, {ExtendedOp}, VPIRFlags(),
6725 } else {
6726 auto *Zero = Plan.getZero(ElemTy);
6727 NegRecipe =
6728 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
6730 }
6731 Builder.insert(NegRecipe);
6732 ExtendedOp = NegRecipe;
6733 }
6734
6735 // Check if WidenRecipe is the final result of the reduction. If so look
6736 // through selects for predicated reductions.
  // Cond stays null unless such a select is found; it then becomes the
  // condition of the new reduction recipe.
6737 VPValue *Cond = nullptr;
6739 findUserOf(WidenRecipe, m_Select(m_VPValue(Cond), m_Specific(WidenRecipe),
6740 m_Specific(RdxPhi))));
6741 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6742 RdxPhi->getBackedgeValue() == ExitValue;
6743 assert((!ExitValue || IsLastInChain) &&
6744 "if we found ExitValue, it must match RdxPhi's backedge value");
6745
6746 Type *PhiType = RdxPhi->getScalarType();
6747 RecurKind RdxKind =
  // Fast-math flags are only carried over for FAdd reductions.
6749 auto *PartialRed = new VPReductionRecipe(
6750 RdxKind,
6751 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlagsOrNone()
6752 : FastMathFlags(),
6753 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6754 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6755 PartialRed->insertBefore(WidenRecipe);
6756
  // Redirect users of the predicating select (if any) and of the original
  // bin-op to the new partial reduction.
6757 if (Cond)
6758 ExitValue->replaceAllUsesWith(PartialRed);
6759 WidenRecipe->replaceAllUsesWith(PartialRed);
6760
6761 // For cost-model purposes, fold this into a VPExpression.
6762 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6763 E->insertBefore(WidenRecipe);
6764 PartialRed->replaceAllUsesWith(E);
6765
6766 // We only need to update the PHI node once, which is when we find the
6767 // last reduction in the chain.
6768 if (!IsLastInChain)
6769 return;
6770
6771 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6772 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6773 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6774
  // Operand 2 of the ReductionStartVector instruction holds the scale factor;
  // it is replaced by a 32-bit constant with the chain's scale factor.
6775 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6776 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6777 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6778 StartInst->setOperand(2, NewScaleFactor);
6779
6780 // If this is the last value in a sub-reduction chain, then update the PHI
6781 // node to start at `0` and update the reduction-result to subtract from
6782 // the PHI's start value.
6783 if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
6784 return;
6785
  // Remember the original start value and make the start vector begin at its
  // operand 1 instead (presumably the reduction's identity value -- confirm).
6786 VPValue *OldStartValue = StartInst->getOperand(0);
6787 StartInst->setOperand(0, StartInst->getOperand(1));
6788
6789 // Replace reduction_result by 'sub (startval, reductionresult)'.
6791 assert(RdxResult && "Could not find reduction result");
6792
6793 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6794 unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
6795 : Instruction::BinaryOps::Sub;
6796 VPInstruction *NewResult = Builder.createNaryOp(
6797 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6798 RdxPhi->getDebugLoc());
  // Replace all uses of RdxResult except the one in NewResult itself, which
  // must keep reading the original reduction result.
6799 RdxResult->replaceUsesWithIf(
6800 NewResult,
6801 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6802}
6803
6804/// Returns the cost of a link in a partial-reduction chain for a given VF.
6805static InstructionCost
6806getPartialReductionLinkCost(VPCostContext &CostCtx,
6807 const VPPartialReductionChain &Link,
6808 ElementCount VF) {
6809 Type *RdxType = Link.ReductionBinOp->getScalarType();
6810 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6811 std::optional<unsigned> BinOpc = std::nullopt;
6812 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6813 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6814 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6815
6816 std::optional<llvm::FastMathFlags> Flags;
6817 if (RdxType->isFloatingPointTy())
6818 Flags = Link.ReductionBinOp->getFastMathFlagsOrNone();
6819
6820 auto GetLinkOpcode = [&Link]() -> unsigned {
6821 switch (Link.RK) {
6822 case RecurKind::Sub:
6823 return Instruction::Add;
6824 case RecurKind::FSub:
6825 return Instruction::FAdd;
6826 default:
6827 return Link.ReductionBinOp->getOpcode();
6828 }
6829 };
6830
6831 return CostCtx.TTI.getPartialReductionCost(
6832 GetLinkOpcode(), ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType,
6833 RdxType, VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6834 CostCtx.CostKind, Flags);
6835}
6836
/// Returns the partial-reduction extend kind for the cast recipe \p Cast.
/// NOTE(review): the body is not visible here; presumably it maps the cast's
/// opcode via TTI::getPartialReductionExtendKind -- confirm.
6837static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6839}
6840
6841/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6842/// operand. This is an operand where the source of the value (e.g. a load) has
6843/// been extended (sext, zext, or fpext) before it is used in the reduction.
6844///
6845/// Possible forms matched by this function:
6846/// - UpdateR(PrevValue, ext(...))
6847/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6848/// - UpdateR(PrevValue, mul(ext(...), Constant))
6849/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6850/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6851/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...))))
6852///
6853/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
///
/// Returns a description of the matched extends and the recipe consuming
/// them, or std::nullopt if \p Op has none of the forms above.
6854static std::optional<ExtendedReductionOperand>
6855matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
6856 assert(is_contained(UpdateR->operands(), Op) &&
6857 "Op should be operand of UpdateR");
6858
6859 // Try matching an absolute difference operand of the form
6860 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6861 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6862 // difference on a wider type and get the extend for "free" from the partial
6863 // reduction.
6864 VPValue *X, *Y;
6865 if (Op->hasOneUse() &&
6869 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6870 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6871 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6872 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6873 Type *LHSInputType = X->getScalarType();
6874 Type *RHSInputType = Y->getScalarType();
    // Both inputs must have the same source type and the same kind of extend.
6875 if (LHSInputType != RHSInputType ||
6876 LHSExt->getOpcode() != RHSExt->getOpcode())
6877 return std::nullopt;
6878 // Note: This is essentially the same as matching ext(...) as we will
6879 // rewrite this operand to ext(absolute-difference(A, B)).
6880 return ExtendedReductionOperand{
6881 Sub,
6882 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6883 /*ExtendB=*/{}};
6884 }
6885
  // OuterExtKind is only set when Op itself is an extend; it stays empty when
  // Op is matched directly as a mul below.
6886 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6888 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6889 VPValue *CastSource = CastRecipe->getOperand(0);
6890 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6891 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6892 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6893 // Match: ext(mul(...))
6894 // Record the outer extend kind and set `Op` to the mul. We can then match
6895 // this as a binary operation. Note: We can optimize out the outer extend
6896 // by widening the inner extends to match it. See
6897 // optimizeExtendsForPartialReduction.
6898 Op = CastSource;
6899 } else {
      // Plain ext(...): the reduction update itself consumes the extend.
6900 return ExtendedReductionOperand{
6901 UpdateR,
6902 /*ExtendA=*/{CastSource->getScalarType(), *OuterExtKind},
6903 /*ExtendB=*/{}};
6904 }
6905 }
6906
6907 if (!Op->hasOneUse())
6908 return std::nullopt;
6909
6911 if (!MulOp ||
6912 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6913 return std::nullopt;
6914
6915 // The rest of the matching assumes `Op` is a (possibly extended) mul
6916 // operation.
6917
6918 VPValue *LHS = MulOp->getOperand(0);
6919 VPValue *RHS = MulOp->getOperand(1);
6920
6921 // The LHS of the operation must always be an extend.
6923 return std::nullopt;
6924
6925 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6926 Type *LHSInputType = LHSCast->getOperand(0)->getScalarType();
6927 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6928
6929 // The RHS of the operation can be an extend or a constant integer.
  // A constant is only accepted if canConstantBeExtended holds for it with
  // the LHS source type and extend kind.
6930 const APInt *RHSConst = nullptr;
6931 VPWidenCastRecipe *RHSCast = nullptr;
6933 RHSCast = cast<VPWidenCastRecipe>(RHS);
6934 else if (!match(RHS, m_APInt(RHSConst)) ||
6935 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6936 return std::nullopt;
6937
6938 // The outer extend kind must match the inner extends for folding.
  // RHSCast is null for a constant RHS and is skipped by the null check.
6939 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6940 if (Cast && OuterExtKind &&
6941 getPartialReductionExtendKind(Cast) != OuterExtKind)
6942 return std::nullopt;
6943
  // For a constant RHS, describe it with the LHS's source type and extend
  // kind; an actual RHS extend overrides both.
6944 Type *RHSInputType = LHSInputType;
6945 ExtendKind RHSExtendKind = LHSExtendKind;
6946 if (RHSCast) {
6947 RHSInputType = RHSCast->getOperand(0)->getScalarType();
6948 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6949 }
6950
6951 return ExtendedReductionOperand{
6952 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6953}
6954
6955/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6956/// and determines if the target can use a cheaper operation with a wider
6957/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6958/// of operations in the reduction.
6959static std::optional<SmallVector<VPPartialReductionChain>>
6960getScaledReductions(VPReductionPHIRecipe *RedPhiR) {
6961 // Get the backedge value from the reduction PHI and find the
6962 // ComputeReductionResult that uses it (directly or through a select for
6963 // predicated reductions).
6964 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6965 if (!RdxResult)
6966 return std::nullopt;
6967 VPValue *ExitValue = RdxResult->getOperand(0);
6968 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6969
6971 RecurKind RK = RedPhiR->getRecurrenceKind();
6972 Type *PhiType = RedPhiR->getScalarType();
6973 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6974
6975 // Work backwards from the ExitValue examining each reduction operation.
6976 VPValue *CurrentValue = ExitValue;
6977 while (CurrentValue != RedPhiR) {
6978 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6979 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6980 return std::nullopt;
6981
6982 VPValue *Op = UpdateR->getOperand(1);
6983 VPValue *PrevValue = UpdateR->getOperand(0);
6984
6985 // Find the extended operand. The other operand (PrevValue) is the next link
6986 // in the reduction chain.
6987 std::optional<ExtendedReductionOperand> ExtendedOp =
6988 matchExtendedReductionOperand(UpdateR, Op);
6989 if (!ExtendedOp) {
6990 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue);
6991 if (!ExtendedOp)
6992 return std::nullopt;
6993 std::swap(Op, PrevValue);
6994 }
6995
6996 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6997 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6998 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6999 return std::nullopt;
7000
7001 VPPartialReductionChain Link(
7002 {UpdateR, *ExtendedOp, RK,
7003 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
7004 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
7005 Chain.push_back(Link);
7006 CurrentValue = PrevValue;
7007 }
7008
7009 // The chain links were collected by traversing backwards from the exit value.
7010 // Reverse the chains so they are in program order.
7011 std::reverse(Chain.begin(), Chain.end());
7012 return Chain;
7013}
7014} // namespace
7015
// Forms partial reductions for the reduction PHIs found in the header block of
// Plan's vector loop region: collects one candidate chain per PHI (see
// getScaledReductions), clears chains that fail the validity or profitability
// checks below, and rewrites every link of the surviving chains through
// transformToPartialReduction.
// NOTE(review): the opening line of this definition (listing line 7016: return
// type, name and first parameter) is missing from this extract. The body uses
// `Plan`, so that parameter is presumably declared on the missing line.
7017 VPCostContext &CostCtx,
7018 VFRange &Range) {
7019 // Find all possible valid partial reductions, grouping chains by their PHI.
7020 // This grouping allows invalidating the whole chain, if any link is not a
7021 // valid partial reduction.
// NOTE(review): listing line 7022 (the type of ChainsByPhi) is missing. From
// its uses below it is a map-like container from a reduction PHI to that PHI's
// list of VPPartialReductionChain -- TODO confirm the exact type.
7023 ChainsByPhi;
7024 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7025 for (VPRecipeBase &R : HeaderVPBB->phis()) {
7026 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
7027 if (!RedPhiR)
7028 continue;
7029
7030 if (auto Chains = getScaledReductions(RedPhiR))
7031 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
7032 }
7033
7034 if (ChainsByPhi.empty())
7035 return;
7036
7037 // Build set of partial reduction operations for extend user validation and
7038 // a map of reduction bin ops to their scale factors for scale validation.
7039 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
7040 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
7041 for (const auto &[_, Chains] : ChainsByPhi)
7042 for (const VPPartialReductionChain &Chain : Chains) {
7043 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
7044 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
7045 }
7046
7047 // A partial reduction is invalid if any of its extends are used by
7048 // something that isn't another partial reduction. This is because the
7049 // extends are intended to be lowered along with the reduction itself.
// Operands that are not VPWidenCastRecipes are accepted unconditionally.
7050 auto ExtendUsersValid = [&](VPValue *Ext) {
7051 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
7052 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
7053 });
7054 };
7055
7056 auto IsProfitablePartialReductionChainForVF =
7057 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
7058 InstructionCost PartialCost = 0, RegularCost = 0;
7059
7060 // The chain is a profitable partial reduction chain if the cost of handling
7061 // the entire chain is cheaper when using partial reductions than when
7062 // handling the entire chain using regular reductions.
7063 for (const VPPartialReductionChain &Link : Chain) {
7064 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
7065 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
// A single link without a valid partial-reduction cost rejects the chain.
7066 if (!LinkCost.isValid())
7067 return false;
7068
7069 PartialCost += LinkCost;
7070 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
7071 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
7072 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
7073 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
// The regular form also pays for every widened cast feeding ExtendsUser.
7074 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
7075 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
7076 RegularCost += Extend->computeCost(VF, CostCtx);
7077 }
// Strict comparison: on a tie the regular reduction is kept.
7078 return PartialCost.isValid() && PartialCost < RegularCost;
7079 };
7080
7081 // Validate chains: check that extends are only used by partial reductions,
7082 // and that reduction bin ops are only used by other partial reductions with
7083 // matching scale factors, are outside the loop region or the select
7084 // introduced by tail-folding. Otherwise we would create users of scaled
7085 // reductions where the types of the other operands don't match.
7086 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
7087 for (const VPPartialReductionChain &Chain : Chains) {
7088 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
7089 Chains.clear();
7090 break;
7091 }
// A reduction-PHI user is acceptable only if it is this chain's own PHI.
7092 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
7093 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
7094 return PhiR == RedPhiR;
7095 auto *R = cast<VPSingleDefRecipe>(U);
7096 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
// NOTE(review): listing line 7097 is missing. The dangling m_Specific(...)
// below is presumably the tail of a `match(R, ...)` call whose matcher is not
// visible in this extract.
7098 m_Specific(Chain.ReductionBinOp))) ||
7099 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
7100 m_Specific(RedPhiR)));
7101 };
7102 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
7103 Chains.clear();
7104 break;
7105 }
7106
7107 // Check if the compute-reduction-result is used by a sunk store.
7108 // TODO: Also form partial reductions in those cases.
7109 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
7110 if (any_of(RdxResult->users(), [](VPUser *U) {
7111 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
7112 return RepR && RepR->getOpcode() == Instruction::Store;
7113 })) {
7114 Chains.clear();
7115 break;
7116 }
7117 }
7118 }
7119
7120 // Clear the chain if it is not profitable.
// NOTE(review): listing line 7121 (the start of the `if (...)` whose condition
// continues below) is missing. Only the profitability lambda and `Range`
// arguments of the call are visible.
7122 [&, &Chains = Chains](ElementCount VF) {
7123 return IsProfitablePartialReductionChainForVF(Chains, VF);
7124 },
7125 Range))
7126 Chains.clear();
7127 }
7128
// Rewrite every link of each chain that was not cleared above.
7129 for (auto &[Phi, Chains] : ChainsByPhi)
7130 for (const VPPartialReductionChain &Chain : Chains)
7131 transformToPartialReduction(Chain, Plan, Phi);
7132}
7133
7134/// If the pointer operand \p Addr of a memory access is an affine AddRec
7135/// w.r.t. \p L with a constant stride, return the stride in units of
7136/// \p AccessTy. Otherwise return std::nullopt.
7137static std::optional<int64_t> getConstantStride(VPValue *Addr, Type *AccessTy,
7139 const Loop *L) {
7140 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
7141 auto *AddRec = dyn_cast<SCEVAddRecExpr>(AddrSCEV);
7142 if (!AddRec)
7143 return {};
7144
7145 return getStrideFromAddRec(AddRec, L, AccessTy, /*Ptr=*/nullptr, PSE);
7146}
7147
// Decides how each load/store VPInstruction that has an underlying IR value is
// lowered. The operations are collected once and then offered to a sequence of
// named sub-passes; each sub-pass either replaces an operation (removing it
// from the worklist) or leaves it for the following ones. The final sub-pass
// handles everything that is left: tryToWidenMemory, else scalarization.
// NOTE(review): the opening line of this definition (listing line 7148: return
// type, name and leading parameters) is missing from this extract. The body
// uses `Plan` and `Range`, which are presumably declared there.
7149 VPRecipeBuilder &RecipeBuilder,
7150 VPCostContext &CostCtx) {
7151 // Collect all loads/stores first. We will start with ones having simpler
7152 // decisions followed by more complex ones that are potentially
7153 // guided/dependent on the simpler ones.
// NOTE(review): listing lines 7154 (declaration of MemOps) and 7156-7157 (the
// block range being iterated) are missing; the set and order of blocks
// visited cannot be told from here.
7155 for (VPBasicBlock *VPBB :
7158 for (VPRecipeBase &R : *VPBB) {
7159 auto *VPI = dyn_cast<VPInstruction>(&R);
7160 if (VPI && VPI->getUnderlyingValue() &&
7161 is_contained({Instruction::Load, Instruction::Store},
7162 VPI->getOpcode()))
7163 MemOps.push_back(VPI);
7164 }
7165 }
7166
7167 // Few helpers to process different kinds of memory operations.
7168
7169 // Passed as the callable to `VPlanTransforms::runPass` together with an
7170 // explicit pass name; hence the unused leading `VPlan &` parameter.
// ProcessVPInst returns true for operations it handled; all others are kept
// in MemOps for later sub-passes.
7171 auto ProcessSubset = [&](VPlan &, auto ProcessVPInst) {
7172 SmallVector<VPInstruction *> RemainingMemOps;
7173 for (VPInstruction *VPI : MemOps) {
7174 if (!ProcessVPInst(VPI))
7175 RemainingMemOps.push_back(VPI);
7176 }
7177
7178 MemOps.clear();
7179 std::swap(MemOps, RemainingMemOps);
7180 };
7181
// Inserts New before VPI and erases VPI. Uses are redirected to New only when
// VPI is a load; for stores VPI is simply erased.
7182 auto ReplaceWith = [&](VPInstruction *VPI, VPRecipeBase *New) {
7183 New->insertBefore(VPI);
7184 if (VPI->getOpcode() == Instruction::Load)
7185 VPI->replaceAllUsesWith(New->getVPSingleValue());
7186 VPI->eraseFromParent();
7187
7188 // VPI has been processed.
7189 return true;
7190 };
7191
7192 auto Scalarize = [&](VPInstruction *VPI) {
7193 return ReplaceWith(VPI, RecipeBuilder.handleReplication(VPI, Range));
7194 };
7195
7196 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
7197 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
// NOTE(review): listing line 7198 (the head of this call, presumably
// `VPlanTransforms::runPass(` as in the last sub-pass of this function) is
// missing.
7199 "lowerMemoryIdioms", ProcessSubset, Plan, [&](VPInstruction *VPI) {
7200 if (RecipeBuilder.replaceWithFinalIfReductionStore(
7201 VPI, FinalRedStoresBuilder))
7202 return true;
7203
7204 // Filter out scalar VPlan for the remaining idioms.
// NOTE(review): listing line 7205 (the start of the `if (...)` taking the
// predicate and `Range` below) is missing.
7206 [](ElementCount VF) { return VF.isScalar(); }, Range))
7207 return false;
7208
7209 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI))
7210 return ReplaceWith(VPI, Histogram);
7211
7212 return false;
7213 });
7214
7215 // Filter out scalar VPlan for the remaining memory operations.
// NOTE(review): listing line 7216 (the start of the `if (...)`) is missing.
7217 [](ElementCount VF) { return VF.isScalar(); }, Range))
7218 return;
7219
7220 // If the instruction's allocated size doesn't equal its type size, it
7221 // requires padding and will be scalarized.
// NOTE(review): listing lines 7222 (head of the call) and 7225 (declaration of
// `I`, presumably the underlying IR instruction of VPI) are missing.
7223 "scalarizeMemOpsWithIrregularTypes", ProcessSubset, Plan,
7224 [&](VPInstruction *VPI) {
7226 if (hasIrregularType(getLoadStoreType(I), I->getDataLayout()))
7227 return Scalarize(VPI);
7228
7229 return false;
7230 });
7231
7232 if (!RecipeBuilder.prefersVectorizedAddressing()) {
// NOTE(review): listing lines 7233 (head of the call), 7235 (declaration of
// `I`) and 7238 (the last clause of the condition below) are missing, so the
// full set of operations skipped here is not visible.
7234 "makeVPlanMemOpDecision", ProcessSubset, Plan, [&](VPInstruction *VPI) {
7236 bool IsLoad = VPI->getOpcode() == Instruction::Load;
7237 if (RecipeBuilder.isPredicatedInst(I) || !IsLoad ||
7239 return false;
7240
7241 // Scalarize loads used as addresses, matching the legacy CM. The load
7242 // is single-scalar if the pointer is loop-invariant, otherwise it is
7243 // replicated per-lane. No mask is needed as the load is not
7244 // predicated.
7245 VPValue *Ptr = VPI->getOperand(0);
7246 const SCEV *PtrSCEV =
7247 vputils::getSCEVExprForVPValue(Ptr, CostCtx.PSE, CostCtx.L);
// The SCEV lookup may yield SCEVCouldNotCompute; rule that out before asking
// for loop invariance.
7248 bool IsSingleScalarLoad =
7249 !isa<SCEVCouldNotCompute>(PtrSCEV) &&
7250 CostCtx.PSE.getSE()->isLoopInvariant(PtrSCEV, CostCtx.L);
7251
// NOTE(review): listing line 7253 (the expression creating the replacement
// recipe whose arguments follow) is missing.
7252 ReplaceWith(VPI,
7254 I, Ptr, /*IsSingleScalar=*/IsSingleScalarLoad,
7255 /*Mask=*/nullptr, *VPI, *VPI, VPI->getDebugLoc()));
7256 return true;
7257 });
7258 }
7259
7260 // Widen unmasked unit-stride consecutive accesses, matching the legacy CM.
// NOTE(review): listing lines 7261 (head of the call) and 7263 (declaration of
// `I`) are missing.
7262 "widenConsecutiveMemOps", ProcessSubset, Plan, [&](VPInstruction *VPI) {
7264 if (RecipeBuilder.isPredicatedInst(I))
7265 return false;
7266
7267 bool IsLoad = VPI->getOpcode() == Instruction::Load;
// Loads take the pointer as operand 0; stores take the stored value as
// operand 0 and the pointer as operand 1.
7268 VPValue *Ptr = VPI->getOperand(!IsLoad);
7269 Type *ScalarTy =
7270 IsLoad ? VPI->getScalarType() : VPI->getOperand(0)->getScalarType();
// Only a stride of exactly one element qualifies; std::nullopt (no constant
// stride) compares unequal to 1 as well.
7271 if (getConstantStride(Ptr, ScalarTy, CostCtx.PSE, CostCtx.L) != 1)
7272 return false;
7273
// NOTE(review): listing line 7275 (the initializer of StrideTy) is missing.
7274 Type *StrideTy =
7276 VPValue *StrideOne = Plan.getConstantInt(StrideTy, 1);
7277 auto *VectorPtr = new VPVectorPointerRecipe(
7278 Ptr, ScalarTy, StrideOne, vputils::getGEPFlagsForPtr(Ptr),
7279 VPI->getDebugLoc());
7280 VectorPtr->insertBefore(VPI);
7281 VPRecipeBase *WidenedR;
7282 if (IsLoad)
7283 WidenedR = new VPWidenLoadRecipe(*cast<LoadInst>(I), VectorPtr,
7284 /*Mask=*/nullptr,
7285 /*Consecutive=*/true, *VPI,
7286 VPI->getDebugLoc());
7287 else
7288 WidenedR = new VPWidenStoreRecipe(
7289 *cast<StoreInst>(I), VectorPtr, VPI->getOperand(0),
7290 /*Mask=*/nullptr, /*Consecutive=*/true, *VPI, VPI->getDebugLoc());
7291 return ReplaceWith(VPI, WidenedR);
7292 });
7293
// Catch-all: every remaining operation is handled here, either widened via
// tryToWidenMemory or scalarized, so the lambda always reports success.
7294 VPlanTransforms::runPass("delegateMemOpWideningToLegacyCM", ProcessSubset,
7295 Plan, [&](VPInstruction *VPI) {
7296 if (VPRecipeBase *Recipe =
7297 RecipeBuilder.tryToWidenMemory(VPI, Range))
7298 return ReplaceWith(VPI, Recipe);
7299
7300 return Scalarize(VPI);
7301 });
7302}
7303
// Replaces eligible VPInstructions with single-scalar operations built by
// VPBuilder::createSingleScalarOp. A candidate must have an underlying IR
// instruction, must not have side effects, and, if masked, must be safe to
// execute speculatively because the mask is dropped.
// NOTE(review): the opening lines of this definition (listing lines 7304-7305:
// return type, name, parameters and the start of the `if (...)` below) are
// missing from this extract, so the function's name and signature are not
// visible. The body uses `Plan` and `Range`.
7306 [&](ElementCount VF) { return VF.isScalar(); }, Range))
7307 return;
7308
// NOTE(review): listing lines 7309 and 7311 (the traversal object built from
// Plan.getEntry() and the `for` header over its blocks) are missing; the
// block order and any block filtering cannot be told from here.
7310 Plan.getEntry());
// Recipes are visited bottom-up; early-inc iteration allows erasing the
// current recipe inside the loop.
7312 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
7313 auto *VPI = dyn_cast<VPInstruction>(&R);
7314 if (!VPI)
7315 continue;
7316
7317 auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
7318 // Wouldn't be able to create a `VPReplicateRecipe` anyway.
7319 if (!I)
7320 continue;
7321
7322 // If executing other lanes produces side-effects we can't avoid them.
7323 if (VPI->mayHaveSideEffects())
7324 continue;
7325
7326 // We want to drop the mask operand, verify we can safely do that.
7327 if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
7328 continue;
7329
7330 // Avoid rewriting IV increment as that interferes with
7331 // `removeRedundantCanonicalIVs`.
// NOTE(review): listing line 7333 (the second half of this condition, i.e.
// how an IV increment is recognised) is missing.
7332 if (VPI->getOpcode() == Instruction::Add &&
7334 continue;
7335
7336 // Other lanes are needed - can't drop them.
// NOTE(review): listing line 7337 (the `if (...)` deciding whether other
// lanes are needed) is missing.
7338 continue;
7339
// The replacement is created without the mask operand (see the speculation
// check above) and takes over all uses of VPI.
7340 auto *Recipe = VPBuilder::createSingleScalarOp(
7341 VPI->getOpcode(), VPI->operandsWithoutMask(), /*Mask=*/nullptr, *VPI,
7342 *VPI, VPI->getDebugLoc(), I);
7343 Recipe->insertBefore(VPI);
7344 VPI->replaceAllUsesWith(Recipe);
7345 VPI->eraseFromParent();
7346 }
7347 }
7348}
7349
7350/// Returns true if \p Info's parameter kinds are compatible with \p Args.
7351static bool areVFParamsOk(const VFInfo &Info, ArrayRef<VPValue *> Args,
7352 PredicatedScalarEvolution &PSE, const Loop *L) {
7353 ScalarEvolution *SE = PSE.getSE();
7354 return all_of(Info.Shape.Parameters, [&](VFParameter Param) {
7355 switch (Param.ParamKind) {
7356 case VFParamKind::Vector:
7357 case VFParamKind::GlobalPredicate:
7358 return true;
7359 case VFParamKind::OMP_Uniform:
7360 return SE->isSCEVable(Args[Param.ParamPos]->getScalarType()) &&
7361 SE->isLoopInvariant(
7362 vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
7363 L);
7364 case VFParamKind::OMP_Linear:
7365 return match(vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
7366 m_scev_AffineAddRec(
7367 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
7368 m_SpecificLoop(L)));
7369 default:
7370 return false;
7371 }
7372 });
7373}
7374
7375/// Find a vector variant of \p CI for \p VF, respecting \p MaskRequired.
7376/// Returns the variant function, or nullptr. Masked variants are assumed to
7377/// take the mask as a trailing parameter.
7379 ElementCount VF, bool MaskRequired,
7381 const Loop *L) {
7382 if (CI->isNoBuiltin())
7383 return nullptr;
7384 auto Mappings = VFDatabase::getMappings(*CI);
7385 const auto *It = find_if(Mappings, [&](const VFInfo &Info) {
7386 return Info.Shape.VF == VF && (!MaskRequired || Info.isMasked()) &&
7387 areVFParamsOk(Info, Args, PSE, L);
7388 });
7389 if (It == Mappings.end())
7390 return nullptr;
7391 return CI->getModule()->getFunction(It->VectorName);
7392}
7393
7394namespace {
7395/// The outcome of choosing how to widen a call at a given VF.
7396struct CallWideningDecision {
7397 enum class KindTy { Scalarize, Intrinsic, VectorVariant };
7398 CallWideningDecision(KindTy Kind, Function *Variant = nullptr)
7399 : Kind(Kind), Variant(Variant) {}
7400 KindTy Kind;
7401
7402 /// Set when Kind == VectorVariant.
7404
7405 bool operator==(const CallWideningDecision &Other) const {
7406 return Kind == Other.Kind && Variant == Other.Variant;
7407 }
7408};
7409} // namespace
7410
7411/// Pick the cheapest widening for the call \p VPI at \p VF among scalarization,
7412/// vector intrinsic, and vector library variant.
// Order of preference visible below: the intrinsic wins if its cost is valid
// and no higher than both scalarization and any available vector variant;
// otherwise a vector variant wins if it costs no more than scalarization;
// otherwise the call is scalarized.
// NOTE(review): listing line 7414 is missing here; the signature shown in the
// cross-reference index at the end of this extract puts the
// `ArrayRef<VPValue *> Ops` parameter at that position.
7413static CallWideningDecision decideCallWidening(VPInstruction &VPI,
7415 ElementCount VF,
7416 VPCostContext &CostCtx) {
7417 auto *CI = cast<CallInst>(VPI.getUnderlyingInstr());
7418
7419 // Scalar VFs and calls forced or known to scalarize always replicate.
7420 if (VF.isScalar() || CostCtx.willBeScalarized(CI, VF))
7421 return CallWideningDecision::KindTy::Scalarize;
7422
// NOTE(review): listing lines 7424 (the expression cast to Function) and 7426
// (presumably the declaration of `ID`, which is tested further down) are
// missing.
7423 auto *CalledFn = cast<Function>(
7425 Type *ResultTy = VPI.getScalarType();
7427 bool MaskRequired = CostCtx.isMaskRequired(CI);
7428
7429 // Pseudo intrinsics (assume, lifetime, ...) are always scalarized.
// NOTE(review): listing line 7430 (the `if (...)` guarding this return) is
// missing.
7431 return CallWideningDecision::KindTy::Scalarize;
7432
7433 InstructionCost ScalarCost =
7434 VPReplicateRecipe::computeCallCost(CalledFn, ResultTy, Ops,
7435 /*IsSingleScalar=*/false, VF, CostCtx);
7436
7437 Function *VecFunc =
7438 findVectorVariant(CI, Ops, VF, MaskRequired, CostCtx.PSE, CostCtx.L);
// NOTE(review): listing line 7439 (the declaration and initial value of
// VecCallCost) is missing.
7440 if (VecFunc)
7441 VecCallCost = VPWidenCallRecipe::computeCallCost(VecFunc, CostCtx);
7442
7443 // Prefer the intrinsic if it is at least as cheap as scalarizing and any
7444 // available vector variant.
7445 if (ID) {
// NOTE(review): listing lines 7446-7447 (the computation of IntrinsicCost) are
// missing.
7448 if (IntrinsicCost.isValid() && ScalarCost >= IntrinsicCost &&
7449 (!VecFunc || VecCallCost >= IntrinsicCost))
7450 return CallWideningDecision::KindTy::Intrinsic;
7451 }
7452
7453 // Otherwise, use a vector library variant when it beats scalarizing.
// `>=` means a tie with scalarization still selects the vector variant.
7454 if (VecFunc && ScalarCost >= VecCallCost)
7455 return {CallWideningDecision::KindTy::VectorVariant, VecFunc};
7456
7457 return CallWideningDecision::KindTy::Scalarize;
7458}
7459
// Replaces every call VPInstruction that has an underlying IR value with the
// recipe chosen by decideCallWidening: a widened intrinsic, a widened call to
// a vector variant, or a replicated (scalarized) call.
// NOTE(review): the opening line of this definition (listing line 7460: return
// type, name and leading parameters) and the two loop headers on listing lines
// 7463-7464 are missing from this extract. The body uses `Plan`, `Range` and a
// block `VPBB`; which blocks are visited is not visible.
7461 VPRecipeBuilder &RecipeBuilder,
7462 VPCostContext &CostCtx) {
7465 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7466 auto *VPI = dyn_cast<VPInstruction>(&R);
7467 if (!VPI || !VPI->getUnderlyingValue() ||
7468 VPI->getOpcode() != Instruction::Call)
7469 continue;
7470
7471 auto *CI = cast<CallInst>(VPI->getUnderlyingInstr());
// Only the first arg_size() operands are the call's arguments.
7472 SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
7473 VPI->op_begin() + CI->arg_size());
7474
// The decision is taken for the first VF of the range.
7475 CallWideningDecision Decision =
7476 decideCallWidening(*VPI, Ops, Range.Start, CostCtx);
// NOTE(review): listing line 7477 (the head of the call that receives this
// predicate and `Range`) is missing. The predicate holds for VFs whose
// decision equals the one made for Range.Start.
7478 [&](ElementCount VF) {
7479 return Decision == decideCallWidening(*VPI, Ops, VF, CostCtx);
7480 },
7481 Range);
7482
7483 VPSingleDefRecipe *Replacement = nullptr;
7484 switch (Decision.Kind) {
7485 case CallWideningDecision::KindTy::Intrinsic: {
// NOTE(review): listing line 7486 (presumably the declaration of `ID` used
// below) is missing.
7487 Type *ResultTy = VPI->getScalarType();
7488 Replacement = new VPWidenIntrinsicRecipe(*CI, ID, Ops, ResultTy, *VPI,
7489 *VPI, VPI->getDebugLoc());
7490 break;
7491 }
7492 case CallWideningDecision::KindTy::VectorVariant: {
7493 // Masked variants take the mask as a trailing parameter, so they have
7494 // one more parameter than the original call's arguments.
// An all-true mask is supplied when the instruction itself is unmasked.
7495 if (Decision.Variant->arg_size() > Ops.size()) {
7496 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
7497 Ops.push_back(Mask);
7498 }
// Append the last non-mask operand of the original instruction (presumably
// the called function operand -- TODO confirm).
7499 Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
7500 Replacement = new VPWidenCallRecipe(CI, Decision.Variant, Ops, *VPI,
7501 *VPI, VPI->getDebugLoc());
7502 break;
7503 }
7504 case CallWideningDecision::KindTy::Scalarize:
7505 Replacement = RecipeBuilder.handleReplication(VPI, Range);
7506 break;
7507 }
7508
// All three cases assign Replacement, so it is non-null here.
7509 Replacement->insertBefore(VPI);
7510 VPI->replaceAllUsesWith(Replacement);
7511 VPI->eraseFromParent();
7512 }
7513 }
7514}
7515
// Rewrites non-consecutive widened loads whose address is a VPWidenGEPRecipe
// with an AddRec SCEV in L and a constant step into
// `experimental_vp_strided_load` intrinsic calls, when the target reports the
// strided form as legal and cheaper than the current load.
// NOTE(review): the opening lines of this definition (listing lines 7516-7517:
// return type, name and leading parameters) are missing from this extract. The
// body uses `Plan` and `PSE`, which are presumably declared there.
7518 Loop &L, VPCostContext &Ctx,
7519 VFRange &Range) {
7520 if (Plan.hasScalarVFOnly())
7521 return;
7522
7523 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
// Created lazily on the first conversion and shared by all strided loads.
7524 VPValue *I32VF = nullptr;
// NOTE(review): listing line 7525 (the head of the `for` over the blocks
// reached from the region entry) is missing.
7526 vp_depth_first_shallow(VectorLoop->getEntry()))) {
7527 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7528 auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
7529 // TODO: Support strided store.
7530 // TODO: Transform reverse access into strided access with -1 stride.
7531 // TODO: Transform gather/scatter with uniform address into strided access
7532 // with 0 stride.
7533 // TODO: Transform interleave access into multiple strided accesses.
7534 if (!LoadR || LoadR->isConsecutive())
7535 continue;
7536
7537 auto *Ptr = dyn_cast<VPWidenGEPRecipe>(LoadR->getAddr());
7538 if (!Ptr)
7539 continue;
7540
7541 // Check if this is a strided access by analyzing the address SCEV for an
7542 // affine addRec.
7543 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, &L);
7544 const SCEV *Start;
7545 const SCEVConstant *Step;
7546 // TODO: Support non-constant loop invariant stride.
// NOTE(review): listing line 7548 (the matcher that binds Start and Step) is
// missing.
7547 if (!match(PtrSCEV,
7549 m_SpecificLoop(&L))))
7550 continue;
7551
7552 Type *LoadTy = LoadR->getScalarType();
7553 Align Alignment = LoadR->getAlign();
// Profitable for a VF when the strided form is legal for the vector type and
// strictly cheaper than the existing load.
7554 auto IsProfitable = [&](ElementCount VF) {
7555 Type *DataTy = toVectorTy(LoadTy, VF);
7556 if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
7557 return false;
7558 const InstructionCost CurrentCost = LoadR->computeCost(VF, Ctx);
// NOTE(review): listing line 7560 (the cost function called with the
// arguments below) is missing.
7559 const InstructionCost StridedLoadStoreCost =
7561 Intrinsic::experimental_vp_strided_load, DataTy,
7562 LoadR->isMasked(), Alignment, Ctx);
7563 return StridedLoadStoreCost < CurrentCost;
7564 };
7565
// NOTE(review): listing line 7566 (the start of the `if (...)` that evaluates
// IsProfitable over `Range`) is missing.
7567 Range))
7568 continue;
7569
7570 // Invalidate the legacy widening decision so the cost of replaced load is
7571 // not counted during precomputeCosts.
7572 // TODO: Remove once the legacy exit cost computation is retired.
7573 for (ElementCount VF : Range)
7574 Ctx.invalidateWideningDecision(&LoadR->getIngredient(), VF);
7575
7576 // Get VF as i32 for the vector length operand.
7577 if (!I32VF) {
7578 VPBuilder Builder(Plan.getVectorPreheader());
// NOTE(review): listing line 7581 (the remaining arguments of this call) is
// missing.
7579 I32VF = Builder.createScalarZExtOrTrunc(
7580 &Plan.getVF(), Type::getInt32Ty(Plan.getContext()),
7582 }
7583
7584 VPBuilder Builder(LoadR);
7585 // Create the base pointer of strided access.
7586 // TODO: reuse VPDerivedIVRecipe for base pointer computation when it
7587 // supports a general VPValue as the start value.
7588 VPValue *StartVPV = vputils::getOrCreateVPValueForSCEVExpr(Plan, Start);
7589 VPValue *StrideInBytes = Plan.getOrAddLiveIn(Step->getValue());
7590 Type *IndexTy = Plan.getDataLayout().getIndexType(Ptr->getScalarType());
7591 assert(IndexTy == StrideInBytes->getScalarType() &&
7592 "Stride type from SCEV must match the index type");
7593 VPValue *CanIV = Builder.createScalarSExtOrTrunc(
7594 VectorLoop->getCanonicalIV(), IndexTy,
7595 VectorLoop->getCanonicalIVType(), DebugLoc::getUnknown());
// The match above succeeded, so PtrSCEV is an AddRec; its no-wrap flags are
// carried over to the offset multiply and the pointer add.
7596 auto *AddRecPtr = cast<SCEVAddRecExpr>(PtrSCEV);
7597 auto *Offset = Builder.createOverflowingOp(
7598 Instruction::Mul, {CanIV, StrideInBytes},
7599 {AddRecPtr->hasNoUnsignedWrap(), AddRecPtr->hasNoSignedWrap()});
// NOTE(review): listing line 7603 (the alternative of this conditional and
// the end of the call) is missing.
7600 auto *BasePtr = Builder.createNoWrapPtrAdd(
7601 StartVPV, Offset,
7602 AddRecPtr->hasNoUnsignedWrap() ? GEPNoWrapFlags::noUnsignedWrap()
7604
7605 // Create a new vector pointer for strided access.
7606 VPValue *NewPtr = Builder.createVectorPointer(
7607 BasePtr, Type::getInt8Ty(Plan.getContext()), StrideInBytes,
7608 Ptr->getGEPNoWrapFlags(), Ptr->getDebugLoc());
7609
// An unmasked load gets an all-true mask for the intrinsic's mask operand.
7610 VPValue *Mask = LoadR->getMask();
7611 if (!Mask)
7612 Mask = Plan.getTrue();
7613 auto *StridedLoad = Builder.createWidenMemIntrinsic(
7614 Intrinsic::experimental_vp_strided_load,
7615 {NewPtr, StrideInBytes, Mask, I32VF}, LoadTy, Alignment, *LoadR,
7616 LoadR->getDebugLoc());
// NOTE(review): the original load is not erased in the visible code;
// presumably it is left for a later dead-recipe cleanup -- TODO confirm.
7617 LoadR->replaceAllUsesWith(StridedLoad);
7618 }
7619 }
7620}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static bool dominates(InstrPosIndexes &PosIndexes, const MachineInstr &A, const MachineInstr &B)
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool handleUncountableExitsWithSideEffects(VPlan &Plan, SmallVectorImpl< EarlyExitInfo > &Exits, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC)
Update Plan to mask memory operations in the loop based on whether the early exit is taken or not.
static CallWideningDecision decideCallWidening(VPInstruction &VPI, ArrayRef< VPValue * > Ops, ElementCount VF, VPCostContext &CostCtx)
Pick the cheapest widening for the call VPI at VF among scalarization, vector intrinsic,...
static bool areVFParamsOk(const VFInfo &Info, ArrayRef< VPValue * > Args, PredicatedScalarEvolution &PSE, const Loop *L)
Returns true if Info's parameter kinds are compatible with Args.
static std::optional< VPValue * > getRecipesForUncountableExit(SmallVectorImpl< VPInstruction * > &Recipes, VPBasicBlock *LatchVPBB)
Returns the VPValue representing the uncountable exit comparison used by AnyOf if the recipes it depe...
static bool simplifyLogicalRecipe(VPSingleDefRecipe *Def, VPBuilder &Builder, bool CanCreateNewRecipe)
Try to simplify logical and bitwise recipes in Def.
static bool sinkScalarOperands(VPlan &Plan)
static std::optional< int64_t > getConstantStride(VPValue *Addr, Type *AccessTy, PredicatedScalarEvolution &PSE, const Loop *L)
If the pointer operand Addr of a memory access is an affine AddRec w.r.t.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static Type * getLoadStoreValueType(VPReplicateRecipe *R, bool IsLoad)
Get the value type of the replicate load or store.
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static Function * findVectorVariant(CallInst *CI, ArrayRef< VPValue * > Args, ElementCount VF, bool MaskRequired, PredicatedScalarEvolution &PSE, const Loop *L)
Find a vector variant of CI for VF, respecting MaskRequired.
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with an EVL-based IV, fix up recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void simplifyRecipe(VPSingleDefRecipe *Def)
Try to simplify VPSingleDefRecipe Def.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV and their load-store type,...
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static std::optional< Intrinsic::ID > getVPDivRemIntrinsic(Intrinsic::ID IntrID)
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPValue * narrowInterleaveGroupOp(ArrayRef< VPValue * > Members, SmallPtrSetImpl< VPValue * > &NarrowedOps, VPBasicBlock *Preheader)
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL)
Try to fold R using InstSimplifyFolder.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static void expandVPDerivedIV(VPDerivedIVRecipe *R)
Expand a VPDerivedIVRecipe into executable recipes.
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink a non-memory or memory recipe R out...
static std::optional< Instruction::BinaryOps > getUnmaskedDivRemOpcode(Intrinsic::ID ID)
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L)
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(ArrayRef< VPReplicateRecipe * > ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L)
SinkStoreInfo(VPReplicateRecipe &GroupLeader)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1692
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
Get the last element.
Definition ArrayRef.h:150
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:194
const T & front() const
Get the first element.
Definition ArrayRef.h:144
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
A debug info location.
Definition DebugLoc.h:126
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:154
static DebugLoc getUnknown()
Definition DebugLoc.h:153
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:252
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:262
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
size_t arg_size() const
Definition Function.h:875
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags noUnsignedWrap()
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:348
The group of interleaved loads/stores sharing the same stride and close to each other.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1644
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1069
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
bool empty() const
Definition MapVector.h:79
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describe the aliasing of the location (each member is null if that kind of ...
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class represents a constant integer value.
ConstantInt * getValue() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
@ SK_Broadcast
Broadcast element 0 to all other elements.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:4042
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4387
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4462
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4414
iterator end()
Definition VPlan.h:4424
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4422
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4475
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:266
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:560
const VPRecipeBase & front() const
Definition VPlan.h:4434
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639
const VPRecipeBase & back() const
Definition VPlan.h:4436
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2944
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2994
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2984
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:3000
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2980
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:94
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:315
VPRegionBlock * getParent()
Definition VPlan.h:186
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:236
size_t getNumSuccessors() const
Definition VPlan.h:237
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:306
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:222
VPlan * getPlan()
Definition VPlan.cpp:211
const std::string & getName() const
Definition VPlan.h:177
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:325
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:233
void clearPredecessors()
Remove all the predecessor of this block.
Definition VPlan.h:322
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:279
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:227
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:211
static auto blocksAs(T &&Range)
Return an iterator range over Range with each block cast to BlockTy.
Definition VPlanUtils.h:331
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:350
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:240
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:258
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:276
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:312
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:296
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3489
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createFirstActiveLane(ArrayRef< VPValue * > Masks, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAdd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false})
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPInstruction * createLogicalOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1631
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={}, Type *ResultTy=nullptr)
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
static VPSingleDefRecipe * createSingleScalarOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPValue *Mask, const VPIRFlags &Flags, const VPIRMetadata &Metadata, DebugLoc DL, Instruction *UV)
Create a single-scalar recipe with Opcode and Operands without inserting it.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:4074
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:562
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:535
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:547
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:557
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4175
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B) const
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3534
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2436
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2483
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2472
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2163
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4540
Class to record and manage LLVM IR flags.
Definition VPlan.h:695
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlagsOrNone() const
void dropPoisonGeneratingFlags()
Drop all poison-generating flags.
Definition VPlan.h:892
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1171
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1226
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1473
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1319
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1315
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1261
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1272
unsigned getOpcode() const
Definition VPlan.h:1417
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3096
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3088
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3117
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3169
VPInterleaveRecipe is a recipe for transforming an interleave group of loads or stores into one wide l...
Definition VPlan.h:3127
void addIncoming(VPValue *IncomingV)
Append IncomingV as an incoming value to the phi-like recipe.
Definition VPlan.h:1665
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3700
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:402
VPBasicBlock * getParent()
Definition VPlan.h:477
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
bool prefersVectorizedAddressing() const
Returns true if the target prefers vectorized addressing.
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should be widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPSingleDefRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a replicating or single-scalar recipe for VPI.
bool isPredicatedInst(Instruction *I) const
Returns true if I needs to be predicated (i.e.
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:338
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3340
A recipe for handling reduction phis.
Definition VPlan.h:2851
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2902
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2895
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2908
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3220
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4597
const VPBlockBase * getEntry() const
Definition VPlan.h:4641
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4673
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:857
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4658
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4717
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4725
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4709
const VPBlockBase * getExiting() const
Definition VPlan.h:4653
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4666
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3385
bool isSingleScalar() const
Definition VPlan.h:3443
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
operand_range operandsWithoutMask()
Return the recipe's operands, excluding the mask of a predicated recipe.
Definition VPlan.h:3468
bool isPredicated() const
Definition VPlan.h:3445
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3462
Lightweight SCEV-to-VPlan expander.
Definition VPlanUtils.h:178
VPValue * tryToExpand(const SCEV *S)
Try to expand S into recipes and live-ins using the builder.
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4235
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:609
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:680
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:385
operand_range operands()
Definition VPlanValue.h:458
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:431
unsigned getNumOperands() const
Definition VPlanValue.h:425
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:426
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1456
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:164
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
bool user_empty() const
Definition VPlanValue.h:161
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:209
VPUser * getSingleUser()
Return the single user of this value, or nullptr if there is not exactly one user.
Definition VPlanValue.h:179
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1459
unsigned getNumUsers() const
Definition VPlanValue.h:115
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1465
user_range users()
Definition VPlanValue.h:157
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2266
A recipe to compute the pointers for widened memory accesses of SourceElementTy, with the Stride expr...
Definition VPlan.h:2348
A recipe for widening Call instructions using library calls.
Definition VPlan.h:2097
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1878
Instruction::CastOps getOpcode() const
Definition VPlan.h:1914
A recipe for handling GEP instructions.
Definition VPlan.h:2206
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2510
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2558
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2576
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2561
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2581
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2610
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2657
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2661
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2672
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2683
A recipe for widening vector intrinsics.
Definition VPlan.h:1925
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
A common mixin class for widening memory operations.
Definition VPlan.h:3736
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
A recipe for widened phis.
Definition VPlan.h:2741
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1817
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1838
unsigned getOpcode() const
Definition VPlan.h:1857
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4745
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:5071
bool hasVF(ElementCount VF) const
Definition VPlan.h:4964
const DataLayout & getDataLayout() const
Definition VPlan.h:4946
LLVMContext & getContext() const
Definition VPlan.h:4942
VPBasicBlock * getEntry()
Definition VPlan.h:4841
bool hasScalableVF() const
Definition VPlan.h:4965
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4900
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4921
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4971
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:5037
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4940
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:5043
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:5120
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:5074
bool hasUF(unsigned UF) const
Definition VPlan.h:4989
VPIRValue * getPoison(Type *Ty)
Return a VPIRValue wrapping a poison value of type Ty.
Definition VPlan.h:5065
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4894
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4930
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4927
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:5014
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:5040
void setVF(ElementCount VF)
Definition VPlan.h:4952
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:5005
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1053
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4992
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4914
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4870
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:5097
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:5034
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4846
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4937
bool hasScalarVFOnly() const
Definition VPlan.h:4982
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4884
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4933
void setUF(unsigned UF)
Definition VPlan.h:4997
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:5152
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1209
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:5048
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2798
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:190
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignores it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
SelectLike_match< CondTy, LTy, RTy > m_SelectLike(const CondTy &C, const LTy &TrueC, const RTy &FalseC)
Matches a value that behaves like a boolean-controlled select, i.e.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
canonical_widen_iv_match m_CanonicalWidenIV()
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
auto m_AnyNeg(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPValue * findIncomingAliasMask(const VPlan &Plan)
Finds the incoming alias-mask within the vector preheader.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:128
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) Note: If ...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
bool isUsedByLoadStoreAddress(const VPValue *V)
Returns true if V is used as part of the address of another load or store.
GEPNoWrapFlags getGEPFlagsForPtr(VPValue *Ptr)
Returns the GEP nowrap flags for Ptr, looking through pointer casts mirroring Value::stripPointerCast...
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
SmallVector< VPBasicBlock * > vp_rpo_plain_cfg_loop_body(VPBasicBlock *Header)
Returns the VPBasicBlocks forming the loop body of a plain (pre-region) VPlan in reverse post-order s...
Definition VPlanCFG.h:265
@ Offset
Definition DWP.cpp:573
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
constexpr auto bind_back(FnT &&Fn, BindArgsT &&...BindArgs)
C++23 bind_back.
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:288
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2200
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
constexpr size_t range_size(R &&Range)
Returns the size of the Range, i.e., the number of elements.
Definition STLExtras.h:1694
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:79
@ MaskedHandleExitInScalarLoop
All memory operations other than the load(s) required to determine whether an uncountable exit occurr...
Definition VPlan.h:89
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:551
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1828
T * find_singleton(R &&Range, Predicate P, bool AllowRepeats=false)
Return the single value in Range that satisfies P(<member of Range> *, AllowRepeats)->T * returning n...
Definition STLExtras.h:1837
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1409
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Next
Definition InstrProf.h:147
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:305
LLVM_ABI std::optional< int64_t > getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, Value *Ptr, PredicatedScalarEvolution &PSE)
If AR is an affine AddRec for Lp with a constant step, return the step in units of AccessTy's allocat...
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT, AssumptionCache *AC=nullptr, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Return true if we can prove that the given load (which is assumed to be within the specified loop) wo...
Definition Loads.cpp:304
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:285
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
VPBasicBlock * EarlyExitingVPBB
VPIRBasicBlock * EarlyExitVPBB
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2833
Holds the VFShape for a specific scalar to vector function mapping.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1924
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
PredicatedScalarEvolution & PSE
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
TargetTransformInfo::TargetCostKind CostKind
const TargetLibraryInfo & TLI
const TargetTransformInfo & TTI
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:247
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:147
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:287
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:298
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3850
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3800
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3953
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3899
static VPValue * materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheckVPBB, ArrayRef< PointerDiffInfo > DiffChecks)
Materializes the alias mask within the AliasCheckVPBB block.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static decltype(auto) runPass(StringRef PassName, PassTy &&Pass, VPlan &Plan, ArgsTy &&...Args)
Helper to run a VPlan pass Pass on VPlan, forwarding extra arguments to the pass.
static void expandSCEVsToVPInstructions(VPlan &Plan, ScalarEvolution &SE)
Try to expand VPExpandSCEVRecipes in Plan's entry block to VPInstructions.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static void materializeAliasMaskCheckBlock(VPlan &Plan, ArrayRef< PointerDiffInfo > DiffChecks, bool HasBranchWeights)
Materializes the alias mask within a check block before the loop.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand remaining VPExpandSCEVRecipes in Plan's entry block using SCEVExpander.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void attachAliasMaskToHeaderMask(VPlan &Plan)
Attaches the alias-mask to the existing header-mask.
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decisions prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses of the canonical ...
static void simplifyReverses(VPlan &Plan)
Cancel out redundant reverses in Plan, e.g. reverse(reverse(x)) -> x.
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap, const VPDominatorTree &VPDT)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void convertToStridedAccesses(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L, VPCostContext &Ctx, VFRange &Range)
Transform widen memory recipes into strided access recipes when legal and profitable.
static bool handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void attachVPCheckBlock(VPlan &Plan, VPValue *Cond, VPBasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...