LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
33#include "llvm/Analysis/Loads.h"
39#include "llvm/IR/Intrinsics.h"
40#include "llvm/IR/MDBuilder.h"
41#include "llvm/IR/Metadata.h"
46
47using namespace llvm;
48using namespace VPlanPatternMatch;
49using namespace SCEVPatternMatch;
50
52 VPlan &Plan, const TargetLibraryInfo &TLI) {
// NOTE(review): the line carrying the function name and return type is not
// part of this listing. From the visible body: every recipe in front of a
// block's terminator that has an underlying IR value is replaced by a widened
// (or replicating) recipe. The function returns false as soon as a call cannot
// be handled, and true after all visited blocks were converted. On a false
// return, recipes visited earlier have already been replaced.
53
55 Plan.getVectorLoopRegion());
57 // Skip blocks outside region
// Note: `break` ends the whole traversal at the first block without a parent;
// it does not merely skip that single block.
58 if (!VPBB->getParent())
59 break;
60 VPRecipeBase *Term = VPBB->getTerminator();
// Convert everything up to, but excluding, the terminator; without a
// terminator the whole block is converted.
61 auto EndIter = Term ? Term->getIterator() : VPBB->end();
62 // Introduce each ingredient into VPlan.
// The early-increment range is needed because the current ingredient is erased
// at the end of each iteration.
63 for (VPRecipeBase &Ingredient :
64 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
65
66 VPValue *VPV = Ingredient.getVPSingleValue();
// Ingredients without an underlying IR value are left untouched.
67 if (!VPV->getUnderlyingValue())
68 continue;
69
// NOTE(review): the definition of `Inst`, which is used below, is on a line
// that is not visible in this listing.
71
72 VPRecipeBase *NewRecipe = nullptr;
73 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
74 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
75 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
76 Phi->getName());
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
// Loads take operand 0 as address; stores take operand 1 as address and
// operand 0 as stored value. Both are created unmasked and non-consecutive.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
84 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
85 NewRecipe = new VPWidenStoreRecipe(
86 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
87 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
88 Ingredient.getDebugLoc());
// NOTE(review): the branch header that introduces `GEP` is not visible in this
// listing.
90 NewRecipe = new VPWidenGEPRecipe(GEP->getSourceElementType(),
91 Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc(), GEP);
93 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
// Calls without a vector intrinsic ID cannot be converted here.
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
97
98 // The noalias.scope.decl intrinsic declares a noalias scope that
99 // is valid for a single iteration. Emitting it as a single-scalar
100 // replicate would incorrectly extend the scope across multiple
101 // original iterations packed into one vector iteration.
102 // FIXME: If we want to vectorize this loop, then we have to drop
103 // all the associated !alias.scope and !noalias.
104 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
105 return false;
106
107 // These intrinsics are recognized by getVectorIntrinsicIDForCall
108 // but are not widenable. Emit them as replicate instead of widening.
109 if (VectorID == Intrinsic::assume ||
110 VectorID == Intrinsic::lifetime_end ||
111 VectorID == Intrinsic::lifetime_start ||
112 VectorID == Intrinsic::sideeffect ||
113 VectorID == Intrinsic::pseudoprobe) {
114 // If the operand of llvm.assume holds before vectorization, it will
115 // also hold per lane.
116 // llvm.pseudoprobe requires to be duplicated per lane for accurate
117 // sample count.
// Hence only the remaining intrinsics of this group (lifetime_start,
// lifetime_end, sideeffect) are emitted as single-scalar replicates.
118 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
119 VectorID != Intrinsic::pseudoprobe;
120 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
121 /*IsSingleScalar=*/IsSingleScalar,
122 /*Mask=*/nullptr, *VPI, *VPI,
123 Ingredient.getDebugLoc());
124 } else {
// drop_end removes the trailing operand before widening (presumably the
// called function -- confirm against the VPInstruction operand layout).
125 NewRecipe = new VPWidenIntrinsicRecipe(
126 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
127 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
128 }
129 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
130 NewRecipe = new VPWidenCastRecipe(
131 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
132 VPIRFlags(*CI), VPIRMetadata(*CI));
133 } else {
// Every other instruction is widened generically.
134 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
135 *VPI, Ingredient.getDebugLoc());
136 }
137 } else {
// NOTE(review): the start of this assertion is not visible in this listing.
// Ingredients reaching this branch are kept as they are.
139 "inductions must be created earlier");
140 continue;
141 }
142
// Put the replacement in front of the ingredient and redirect the users of
// the ingredient's value when the replacement defines exactly one value.
143 NewRecipe->insertBefore(&Ingredient);
144 if (NewRecipe->getNumDefinedValues() == 1)
145 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
146 else
147 assert(NewRecipe->getNumDefinedValues() == 0 &&
148 "Only recpies with zero or one defined values expected");
149 Ingredient.eraseFromParent();
150 }
151 }
152 return true;
153}
154
155/// Helper for extra no-alias checks via known-safe recipe and SCEV.
// NOTE(review): the class header, the `PSE` member and the start of the
// constructor fall on lines that are not visible in this listing.
// Recipes that shouldSkip() always reports as skippable.
157 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
// Recipe that every other replicate recipe is compared against in
// shouldSkip().
158 VPReplicateRecipe &GroupLeader;
// Loop handed to getSCEVExprForVPValue when computing address SCEVs.
160 const Loop &L;
161
162 // Return true if \p A and \p B are known to not alias for all VFs in the
163 // plan, checked via the distance between the accesses
164 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
// Only store/store pairs are handled.
165 if (A->getOpcode() != Instruction::Store ||
166 B->getOpcode() != Instruction::Store)
167 return false;
168
// Operand 1 of a replicated store is its address.
169 VPValue *AddrA = A->getOperand(1);
170 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
171 VPValue *AddrB = B->getOperand(1);
172 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
// NOTE(review): the condition guarding the following return is not visible in
// this listing.
174 return false;
175
// The difference of both addresses has to be a compile-time constant.
176 const APInt *Distance;
177 ScalarEvolution &SE = *PSE.getSE();
178 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
179 return false;
180
// Operand 0 of a replicated store is the stored value; its scalar type
// determines the store size.
181 const DataLayout &DL = SE.getDataLayout();
182 Type *TyA = A->getOperand(0)->getScalarType();
183 uint64_t SizeA = DL.getTypeStoreSize(TyA);
184 Type *TyB = B->getOperand(0)->getScalarType();
185 uint64_t SizeB = DL.getTypeStoreSize(TyB);
186
187 // Use the maximum store size to ensure no overlap from either direction.
188 // Currently only handles fixed sizes, as it is only used for
189 // replicating VPReplicateRecipes.
190 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
191
192 auto VFs = B->getParent()->getPlan()->vectorFactors();
// NOTE(review): the definition of `MaxVF` is not visible in this listing.
// Scalable factors are rejected because a fixed value is required below.
194 if (MaxVF.isScalable())
195 return false;
// No alias when the absolute distance is at least MaxVF * MaxStoreSize.
196 return Distance->abs().uge(
197 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
198 }
199
200public:
203 const Loop &L)
204 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
205 L(L) {}
206
207 /// Return true if \p R should be skipped during alias checking, either
208 /// because it's in the exclude set or because no-alias can be proven via
209 /// SCEV.
210 bool shouldSkip(VPRecipeBase &R) const {
// The distance check is only attempted for replicate recipes; membership in
// the exclude set is tested first.
211 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
212 return ExcludeRecipes.contains(&R) ||
213 (Store && isNoAliasViaDistance(Store, &GroupLeader));
214 }
215};
216
217/// Check if a memory operation doesn't alias with memory operations using
218/// scoped noalias metadata, in blocks in the single-successor chain between \p
219/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
220/// write to memory are checked (for load hoisting). Otherwise recipes that may
221/// read from or write to memory are checked, and SCEV is used to prove no-alias
222/// between the group leader and other replicate recipes (for store sinking).
/// Returns false when the location carries no scope metadata or when a checked
/// recipe cannot be shown not to alias; returns true otherwise.
// NOTE(review): the line carrying the function name and its first parameter is
// not visible in this listing.
223static bool
225 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
226 std::optional<SinkStoreInfo> SinkInfo = {}) {
// Reads only need to be considered when sink information was supplied.
227 bool CheckReads = SinkInfo.has_value();
// Without scope metadata on the location nothing can be proven here.
228 if (!MemLoc.AATags.Scope)
229 return false;
230
// NOTE(review): the range expression of this loop is not visible in this
// listing.
231 for (VPBasicBlock *VPBB :
233 for (VPRecipeBase &R : *VPBB) {
// Recipes that are excluded or proven disjoint via SCEV need no further check.
234 if (SinkInfo && SinkInfo->shouldSkip(R))
235 continue;
236
237 // Skip recipes that don't need checking.
238 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
239 continue;
240
// NOTE(review): the definition of `Loc` is not visible in this listing.
242 if (!Loc)
243 // Conservatively assume aliasing for memory operations without
244 // location.
245 return false;
246
// NOTE(review): the condition guarding the following return is not visible in
// this listing.
248 return false;
249 }
250 }
// No checked recipe was found to potentially alias.
251 return true;
252}
253
254/// Get the value type of the replicate load or store. \p IsLoad indicates
255/// whether it is a load.
// NOTE(review): the signature line is not visible in this listing.
// For a load the recipe itself is the loaded value; for a store the stored
// value is operand 0.
257 return (IsLoad ? R : R->getOperand(0))->getScalarType();
258}
259
260/// Collect either replicated Loads or Stores grouped by their address SCEV and
261/// their load-store type, in a deep-traversal of the vector loop region in \p
262/// Plan.
/// Only recipes accepted by \p FilterFn are considered, and recipes whose
/// address SCEV cannot be computed are left out. Each returned group is sorted
/// by dominance.
// NOTE(review): the line carrying the return type and function name is not
// visible in this listing.
263template <unsigned Opcode>
266 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
267 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
268 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
269 "Only Load and Store opcodes supported");
270 constexpr bool IsLoad = (Opcode == Instruction::Load);
// NOTE(review): the type of the following container and the header of the
// block loop are not visible in this listing. The container is indexed by an
// (address SCEV, value type) pair below.
273 RecipesByAddressAndType;
276 for (VPRecipeBase &R : *VPBB) {
277 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
// Keep only replicate recipes with the requested opcode that pass the filter.
278 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
279 continue;
280
281 // For loads, operand 0 is address; for stores, operand 1 is address.
282 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
283 const Type *LoadStoreTy = getLoadStoreValueType(RepR, IsLoad);
284 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
// Recipes without a computable address SCEV are not grouped.
285 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
286 RecipesByAddressAndType[{AddrSCEV, LoadStoreTy}].push_back(RepR);
287 }
288 }
289 auto Groups = to_vector(RecipesByAddressAndType.values());
290 VPDominatorTree VPDT(Plan);
291 for (auto &Group : Groups) {
292 // Sort mem ops by dominance order, with earliest (most dominating) first.
// NOTE(review): the sort call and the head of its comparator are not visible
// in this listing.
294 return VPDT.properlyDominates(A, B);
295 });
296 }
297 return Groups;
298}
299
/// Sink replicate and scalar-IV-steps recipes into the block of a replicate
/// region that uses them. A candidate with users outside the target block is
/// duplicated when all of those users only need its first lane; otherwise it
/// is left in place. Returns true if at least one recipe was moved.
300static bool sinkScalarOperands(VPlan &Plan) {
301 auto Iter = vp_depth_first_deep(Plan.getEntry());
302 bool ScalarVFOnly = Plan.hasScalarVFOnly();
303 bool Changed = false;
304
// NOTE(review): the declaration of `WorkList` is not visible in this listing.
// The lambda below adds (SinkTo, recipe) pairs to it when the recipe defining
// Op qualifies for sinking.
306 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
307 VPBasicBlock *SinkTo, VPValue *Op) {
308 auto *Candidate =
309 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
310 if (!Candidate)
311 return;
312
313 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
314 // for now.
// NOTE(review): the condition guarding the following return is not visible in
// this listing.
316 return;
317
// Nothing to do if the candidate already lives in the target block or must
// not be sunk.
318 if (Candidate->getParent() == SinkTo ||
319 vputils::cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
320 return;
321
// Single-scalar replicate recipes are only sunk in scalar-VF-only plans.
322 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
323 if (!ScalarVFOnly && RepR->isSingleScalar())
324 return;
325
326 WorkList.insert({SinkTo, Candidate});
327 };
328
329 // First, collect the operands of all recipes in replicate blocks as seeds for
330 // sinking.
// NOTE(review): the header of the loop over regions is not visible in this
// listing.
332 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
// Only replicator regions whose entry block has exactly two successors are
// considered.
333 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
334 continue;
// The first successor has to lead straight to the region's exiting block.
335 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
336 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
337 continue;
338 for (auto &Recipe : *VPBB)
339 for (VPValue *Op : Recipe.operands())
340 InsertIfValidSinkCandidate(VPBB, Op);
341 }
342
343 // Try to sink each replicate or scalar IV steps recipe in the worklist.
// Index-based iteration: the worklist grows while it is being processed.
344 for (unsigned I = 0; I != WorkList.size(); ++I) {
345 VPBasicBlock *SinkTo;
346 VPSingleDefRecipe *SinkCandidate;
347 std::tie(SinkTo, SinkCandidate) = WorkList[I];
348
349 // All recipe users of SinkCandidate must be in the same block SinkTo or all
350 // users outside of SinkTo must only use the first lane of SinkCandidate. In
351 // the latter case, we need to duplicate SinkCandidate.
352 auto UsersOutsideSinkTo =
353 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
354 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
355 });
356 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
357 return !U->usesFirstLaneOnly(SinkCandidate);
358 }))
359 continue;
360 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
361
362 if (NeedsDuplicating) {
// Candidates that would need duplicating are not sunk in scalar-VF-only
// plans.
363 if (ScalarVFOnly)
364 continue;
365 VPSingleDefRecipe *Clone;
366 if (auto *SinkCandidateRepR =
367 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
368 // TODO: Handle converting to uniform recipes as separate transform,
369 // then cloning should be sufficient here.
// The duplicate of a replicate recipe is created as single-scalar and
// without mask.
370 Instruction *I = SinkCandidate->getUnderlyingInstr();
371 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
372 nullptr /*Mask*/, *SinkCandidateRepR,
373 *SinkCandidateRepR);
374 // TODO: add ".cloned" suffix to name of Clone's VPValue.
375 } else {
376 Clone = SinkCandidate->clone();
377 }
378
// The clone stays at the original position and takes over all users outside
// SinkTo.
379 Clone->insertBefore(SinkCandidate);
380 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
381 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
382 });
383 }
// Move the candidate and offer its own operands as further candidates.
384 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
385 for (VPValue *Op : SinkCandidate->operands())
386 InsertIfValidSinkCandidate(SinkTo, Op);
387 Changed = true;
388 }
389 return Changed;
390}
391
392/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
393/// the mask.
/// Returns nullptr if the entry is not a VPBasicBlock, or if the entry block
/// does not consist of exactly one VPBranchOnMaskRecipe.
// NOTE(review): the signature line is not visible in this listing.
395 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
396 if (!EntryBB || EntryBB->size() != 1 ||
397 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
398 return nullptr;
399
// The mask is operand 0 of the branch-on-mask recipe.
400 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
401}
402
403/// If \p R is a triangle region, return the 'then' block of the triangle.
/// Returns nullptr if the region does not have the expected triangle shape.
// NOTE(review): the signature line is not visible in this listing.
405 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
// The entry block has to branch to exactly two blocks.
406 if (EntryBB->getNumSuccessors() != 2)
407 return nullptr;
408
409 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
410 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
411 if (!Succ0 || !Succ1)
412 return nullptr;
413
// Exactly one of the two successors may have a successor of its own, and it
// must be a single one.
414 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
415 return nullptr;
// The 'then' block is the successor that falls through to the other one.
416 if (Succ0->getSingleSuccessor() == Succ1)
417 return Succ0;
418 if (Succ1->getSingleSuccessor() == Succ0)
419 return Succ1;
420 return nullptr;
421}
422
423// Merge replicate regions into their successor region, if a replicate region
424// is connected to a successor replicate region with the same predicate by a
425// single, empty VPBasicBlock.
// Returns true if at least one region was merged.
// NOTE(review): the signature line is not visible in this listing.
427 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
428
429 // Collect replicate regions followed by an empty block, followed by another
430 // replicate region with matching masks to process up front. This is to avoid
431 // iterator invalidation issues while merging regions.
// NOTE(review): the declaration of `WorkList` and the start of the loop header
// are not visible in this listing.
434 vp_depth_first_deep(Plan.getEntry()))) {
435 if (!Region1->isReplicator())
436 continue;
437 auto *MiddleBasicBlock =
438 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
439 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
440 continue;
441
442 auto *Region2 =
443 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
444 if (!Region2 || !Region2->isReplicator())
445 continue;
446
// Both regions have to be guarded by the very same mask value.
447 VPValue *Mask1 = getPredicatedMask(Region1);
448 VPValue *Mask2 = getPredicatedMask(Region2);
449 if (!Mask1 || Mask1 != Mask2)
450 continue;
451
452 assert(Mask1 && Mask2 && "both region must have conditions");
453 WorkList.push_back(Region1);
454 }
455
456 // Move recipes from Region1 to its successor region, if both are triangles.
457 for (VPRegionBlock *Region1 : WorkList) {
458 if (TransformedRegions.contains(Region1))
459 continue;
460 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
461 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
462
463 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
464 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
465 if (!Then1 || !Then2)
466 continue;
467
468 // Note: No fusion-preventing memory dependencies are expected in either
469 // region. Such dependencies should be rejected during earlier dependence
470 // checks, which guarantee accesses can be re-ordered for vectorization.
471 //
472 // Move recipes to the successor region.
// Walking Then1 backwards while always inserting at Then2's first non-phi
// position presumably keeps the moved recipes in their original order, ahead
// of the recipes already in Then2.
473 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
474 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
475
476 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
477 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
478
479 // Move VPPredInstPHIRecipes from the merge block to the successor region's
480 // merge block. Update all users inside the successor region to use the
481 // original values.
482 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
483 VPValue *PredInst1 =
484 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
485 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
// Users in Then2 now share a block with the predicated value and use it
// directly instead of going through the phi.
486 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
487 return cast<VPRecipeBase>(&U)->getParent() == Then2;
488 });
489
490 // Remove phi recipes that are unused after merging the regions.
491 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
492 Phi1ToMove.eraseFromParent();
493 continue;
494 }
495 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
496 }
497
498 // Remove the dead recipes in Region1's entry block.
499 for (VPRecipeBase &R :
500 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
501 R.eraseFromParent();
502
503 // Finally, remove the first region.
// Its predecessors are reconnected to MiddleBasicBlock and Region1 is
// detached from the CFG.
504 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
505 VPBlockUtils::disconnectBlocks(Pred, Region1);
506 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
507 }
508 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
509 TransformedRegions.insert(Region1);
510 }
511
512 return !TransformedRegions.empty();
513}
514
516 VPRegionBlock *ParentRegion,
517 VPlan &Plan) {
// NOTE(review): the first line of the signature is not visible in this
// listing. From the visible body: builds the blocks "<name>.entry" (holding a
// branch-on-mask), "<name>.if" (holding an unmasked copy of PredRecipe) and
// "<name>.continue", places them in a new replicate region whose parent is
// ParentRegion, erases PredRecipe and returns the region.
518 Instruction *Instr = PredRecipe->getUnderlyingInstr();
519 // Build the triangular if-then region.
520 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
521 assert(Instr->getParent() && "Predicated instruction not in any basic block");
522 auto *BlockInMask = PredRecipe->getMask();
// The branch takes the debug location of the recipe defining the mask, or an
// unknown location if the mask has no defining recipe.
523 auto *MaskDef = BlockInMask->getDefiningRecipe();
524 auto *BOMRecipe = new VPBranchOnMaskRecipe(
525 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
526 auto *Entry =
527 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
528
529 // Replace predicated replicate recipe with a replicate recipe without a
530 // mask but in the replicate region.
531 auto *RecipeWithoutMask = new VPReplicateRecipe(
532 PredRecipe->getUnderlyingInstr(), PredRecipe->operandsWithoutMask(),
533 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
534 PredRecipe->getDebugLoc());
535 auto *Pred =
536 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
537 auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
// NOTE(review): the declaration of `Region` that the following line
// initializes is not visible in this listing.
539 Plan.createReplicateRegion(Entry, Exiting, RegionName);
540
541 // Note: first set Entry as region entry and then connect successors starting
542 // from it in order, to propagate the "parent" of each VPBasicBlock.
543 Region->setParent(ParentRegion);
544 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
545 VPBlockUtils::connectBlocks(Pred, Exiting);
546
// A phi in the exiting block is only needed when the predicated value has
// users; those users are redirected to the phi.
547 if (PredRecipe->getNumUsers() != 0) {
548 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
549 RecipeWithoutMask->getDebugLoc());
550 Exiting->appendRecipe(PHIRecipe);
551 PredRecipe->replaceAllUsesWith(PHIRecipe);
552 }
553 PredRecipe->eraseFromParent();
554 return Region;
555}
556
/// Wrap every predicated VPReplicateRecipe of \p Plan in a replicate region of
/// its own. The predicated recipes are collected first and processed in a
/// second loop.
557static void addReplicateRegions(VPlan &Plan) {
// NOTE(review): the declaration of `WorkList` and the start of the loop header
// are not visible in this listing.
560 vp_depth_first_deep(Plan.getEntry()))) {
561 for (VPRecipeBase &R : *VPBB)
562 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
563 if (RepR->isPredicated())
564 WorkList.push_back(RepR);
565 }
566 }
567
568 unsigned BBNum = 0;
569 for (VPReplicateRecipe *RepR : WorkList) {
// Split the containing block at the predicated recipe.
570 VPBasicBlock *CurrentBlock = RepR->getParent();
571 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
572
// The new block is named "<original IR block name>.<running number>". If the
// IR block is unnamed, the name is empty and the counter is not advanced.
573 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
574 SplitBlock->setName(
575 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
576 // Record predicated instructions for above packing optimizations.
// NOTE(review): the declaration of `Region` that the following call
// initializes, and the statement after it, are not visible in this listing.
578 createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
580
// If the block that was split used to be the exiting block of the enclosing
// region, the split-off block takes over that role.
581 VPRegionBlock *ParentRegion = Region->getParent();
582 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
583 ParentRegion->setExiting(SplitBlock);
584 }
585}
586
590 vp_depth_first_deep(Plan.getEntry()))) {
// NOTE(review): the function header, the declaration of `WorkList` and the
// start of this loop header are not visible in this listing. From the visible
// body: blocks with a single VPBasicBlock predecessor that has no other
// successor are folded into that predecessor; the result is true if at least
// one block was folded.
591 // Don't fold the blocks in the skeleton of the Plan into their single
592 // predecessors for now.
593 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
594 if (!VPBB->getParent())
595 continue;
596 auto *PredVPBB =
597 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
// The predecessor must be a plain VPBasicBlock (not a VPIRBasicBlock) with
// exactly one successor.
598 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
599 isa<VPIRBasicBlock>(PredVPBB))
600 continue;
601 WorkList.push_back(VPBB);
602 }
603
604 for (VPBasicBlock *VPBB : WorkList) {
605 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
// Append all recipes of VPBB to the end of its predecessor.
606 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
607 R.moveBefore(*PredVPBB, PredVPBB->end());
608 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
// Keep the exiting block of the enclosing region up to date.
609 auto *ParentRegion = VPBB->getParent();
610 if (ParentRegion && ParentRegion->getExiting() == VPBB)
611 ParentRegion->setExiting(PredVPBB);
612 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
613 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
614 }
615 return !WorkList.empty();
616}
617
// NOTE(review): the function header and the statement following the next
// comment are not visible in this listing.
619 // Convert masked VPReplicateRecipes to if-then region blocks.
621
// Repeat the three clean-ups until none of them reports a change.
622 bool ShouldSimplify = true;
623 while (ShouldSimplify) {
624 ShouldSimplify = sinkScalarOperands(Plan);
625 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
626 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
627 }
628}
629
630/// Remove redundant casts of inductions.
631///
632/// Such redundant casts are casts of induction variables that can be ignored,
633/// because we already proved that the casted phi is equal to the uncasted phi
634/// in the vectorized loop. There is no need to vectorize the cast - the same
635/// value can be used for both the phi and casts in the vector loop.
// NOTE(review): the signature line and the definition of `IV` are not visible
// in this listing.
637 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
// Phis that yield no `IV`, and inductions with a truncate instruction, are
// skipped.
639 if (!IV || IV->getTruncInst())
640 continue;
641
642 // A sequence of IR Casts has potentially been recorded for IV, which
643 // *must be bypassed* when the IV is vectorized, because the vectorized IV
644 // will produce the desired casted value. This sequence forms a def-use
645 // chain and is provided in reverse order, ending with the cast that uses
646 // the IV phi. Search for the recipe of the last cast in the chain and
647 // replace it with the original IV. Note that only the final cast is
648 // expected to have users outside the cast-chain and the dead casts left
649 // over will be cleaned up later.
650 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
651 VPValue *FindMyCast = IV;
// Follow the chain starting at the IV: in every step look for the user recipe
// whose underlying value is the next IR cast.
652 for (Instruction *IRCast : reverse(Casts)) {
653 VPSingleDefRecipe *FoundUserCast = nullptr;
654 for (auto *U : FindMyCast->users()) {
655 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
656 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
657 FoundUserCast = UserCast;
658 break;
659 }
660 }
661 // A cast recipe in the chain may have been removed by earlier DCE.
662 if (!FoundUserCast)
663 break;
664 FindMyCast = FoundUserCast;
665 }
// Only rewrite when at least one cast recipe of the chain was found.
666 if (FindMyCast != IV)
667 FindMyCast->replaceAllUsesWith(IV);
668 }
669}
670
673 Instruction::BinaryOps InductionOpcode,
674 FPMathOperator *FPBinOp, Instruction *TruncI,
675 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
676 VPBuilder &Builder) {
// NOTE(review): the first lines of the signature are not visible in this
// listing. From the visible body: derives an IV from the canonical IV of the
// vector loop region, truncates it and/or the step where required, and returns
// the scalar IV steps created through Builder.
677 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
678 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
679 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
680 VPSingleDefRecipe *BaseIV =
681 Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
682
683 // Truncate base induction if needed.
684 Type *ResultTy = BaseIV->getScalarType();
685 if (TruncI) {
686 Type *TruncTy = TruncI->getType();
687 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
688 "Not truncating.");
689 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
690 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
691 ResultTy = TruncTy;
692 }
693
694 // Truncate step if needed.
695 Type *StepTy = Step->getScalarType();
696 if (ResultTy != StepTy) {
697 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
698 "Not truncating.");
699 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
// NOTE(review): the initializer of `VecPreheader` is not visible in this
// listing. The truncated step is created at that block; the guard presumably
// restores Builder's previous insert point when the scope ends -- confirm.
700 auto *VecPreheader =
702 VPBuilder::InsertPointGuard Guard(Builder);
703 Builder.setInsertPoint(VecPreheader);
704 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
705 }
706 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
707 &Plan.getVF(), DL);
708}
709
711 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
713 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
// NOTE(review): the function name and part of the parameter list are not
// visible in this listing. From the visible body: a wide canonical IV recipe
// (`WideCanIV`) is replaced by scalar IV steps, by an existing canonical
// widened induction, or by a newly created widened induction, depending on how
// it is used and on the cost checks below.
714 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
715 if (!LoopRegion)
716 return;
717
// NOTE(review): the initializer of `WideCanIV` is not visible in this listing.
718 auto *WideCanIV =
720 if (!WideCanIV)
721 return;
722
723 Type *CanIVTy = LoopRegion->getCanonicalIVType();
724
725 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
726 // IV.
// The steps start at zero and advance by one, both in the canonical IV type.
727 if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(WideCanIV)) {
728 VPBuilder Builder(WideCanIV);
729 WideCanIV->replaceAllUsesWith(createScalarIVSteps(
730 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
731 nullptr, Plan.getZero(CanIVTy), Plan.getConstantInt(CanIVTy, 1),
732 WideCanIV->getDebugLoc(), Builder));
733 WideCanIV->eraseFromParent();
734 return;
735 }
736
// The recipe is left unchanged when only scalar values of it are used.
737 if (vputils::onlyScalarValuesUsed(WideCanIV))
738 return;
739
740 // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
741 // in the header, reuse it instead of introducing another wide induction phi.
742 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
743 for (VPRecipeBase &Phi : Header->phis()) {
// NOTE(review): the declaration of `WidenIV` is not visible in this listing.
745 if (!match(&Phi, m_CanonicalWidenIV(WidenIV)))
746 continue;
747 // The reused wide IV feeds the header mask, whose lanes may extend past
748 // the trip count; drop flags that only hold inside the scalar loop.
749 WidenIV->dropPoisonGeneratingFlags();
750 WideCanIV->replaceAllUsesWith(WidenIV);
751 WideCanIV->eraseFromParent();
752 return;
753 }
754
755 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable.
756 auto *VecTy = VectorType::get(CanIVTy, VF);
// NOTE(review): the arguments of the shuffle-cost query are not visible in
// this listing.
757 InstructionCost BroadcastCost = TTI.getShuffleCost(
759 InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
// Not treated as profitable when the phi costs more than the broadcast.
760 if (PHICost > BroadcastCost)
761 return;
762
763 // Bail out if the additional wide induction phi increases the expected spill
764 // cost.
// The per-class register usage is scaled by UF; the projection adds the
// registers required for one more value of type VecTy.
765 VPRegisterUsage UnrolledBase =
766 calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
767 for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
768 NumUsers *= UF;
769 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
770 VPRegisterUsage Projected = UnrolledBase;
771 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
772 if (Projected.spillCost(TTI, CostKind) >
773 UnrolledBase.spillCost(TTI, CostKind))
774 return;
775
// NOTE(review): the definition of `ID` used below is not visible in this
// listing. The new induction starts at zero with step one and is placed at the
// first non-phi position of the header.
778 VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
779 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
780 /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
781 WideCanIV->getNoWrapFlags(), WideCanIV->getDebugLoc());
782 NewWideIV->insertBefore(&*Header->getFirstNonPhi());
783 WideCanIV->replaceAllUsesWith(NewWideIV);
784 WideCanIV->eraseFromParent();
785}
786
787/// Returns true if \p R is dead and can be removed.
788static bool isDeadRecipe(VPRecipeBase &R) {
789 // Do remove conditional assume instructions as their conditions may be
790 // flattened.
791 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
// NOTE(review): the final operand of the following `&&` expression is not
// visible in this listing.
792 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
794 if (IsConditionalAssume)
795 return true;
796
// Recipes with side effects are never considered dead.
797 if (R.mayHaveSideEffects())
798 return false;
799
800 // Recipe is dead if no user keeps the recipe alive.
801 return all_of(R.definedValues(),
802 [](VPValue *V) { return V->getNumUsers() == 0; });
803}
804
// NOTE(review): the function header and the header of the loop over blocks are
// not visible in this listing. The visible body erases dead recipes as well as
// phi/update pairs that only keep each other alive.
807 Plan.getEntry());
809 // The recipes in the block are processed in reverse order, to catch chains
810 // of dead recipes.
811 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
812 if (isDeadRecipe(R)) {
813 R.eraseFromParent();
814 continue;
815 }
816
817 // Check if R is a dead VPPhi <-> update cycle and remove it.
818 VPValue *Start, *Incoming;
819 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
820 continue;
821 auto *PhiR = cast<VPPhi>(&R);
822 VPUser *PhiUser = PhiR->getSingleUser();
823 if (!PhiUser)
824 continue;
// The only user of the phi has to be the recipe that defines the incoming
// value, and that value must have exactly one user.
825 if (PhiUser != Incoming->getDefiningRecipe() ||
826 Incoming->getNumUsers() != 1)
827 continue;
// Redirect the phi's uses to the start value, then erase the phi and the
// recipe defining the incoming value.
828 PhiR->replaceAllUsesWith(Start);
829 PhiR->eraseFromParent();
830 Incoming->getDefiningRecipe()->eraseFromParent();
831 }
832 }
833}
834
// NOTE(review): the function header, the initialization of `Users` and the
// definition of `Cur` are not visible in this listing.
// `Users` grows while it is traversed, so its size is re-read in every
// iteration; the users of each value defined by `Cur` are added in turn.
837 for (unsigned I = 0; I != Users.size(); ++I) {
839 for (VPValue *V : Cur->definedValues())
840 Users.insert_range(V->users());
841 }
842 return Users.takeVector();
843}
844
845/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
846/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
847/// generates scalar values.
/// Returns the created PtrAdd; the users of \p PtrIV are not changed here.
// NOTE(review): the line with the function name and first parameter, the
// definition of `ID` and the start of the `Steps` initializer are not visible
// in this listing.
848static VPValue *
850 VPlan &Plan, VPBuilder &Builder) {
// The scalar steps start at zero in the type of the induction step; the step
// value is operand 1 of the pointer induction.
852 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
853 VPValue *StepV = PtrIV->getOperand(1);
855 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
856 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
857
858 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
859 PtrIV->getDebugLoc(), "next.gep");
860}
861
862/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
863/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
864/// VPWidenPointerInductionRecipe will generate vectors only. If some users
865/// require vectors while others require scalars, the scalar uses need to extract
866/// the scalars from the generated vectors (Note that this is different to how
867/// int/fp inductions are handled). Legalize extract-from-ends using uniform
868/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
869/// the correct end value is available. Also optimize
870/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
871/// providing them scalar steps built on the canonical scalar IV and update the
872/// original IV's users. This is an optional optimization to reduce the needs of
873/// vector extracts.
// NOTE(review): the function header and the definition of `HeaderVPBB` are not
// visible in this listing.
876 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
// New recipes are created at the first non-phi position of the header.
877 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
878 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
879 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
880 if (!PhiR)
881 continue;
882
883 // Try to narrow wide and replicating recipes to uniform recipes, based on
884 // VPlan analysis.
885 // TODO: Apply to all recipes in the future, to replace legacy uniformity
886 // analysis.
887 auto Users = collectUsersRecursively(PhiR);
888 for (VPUser *U : reverse(Users)) {
889 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
890 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
891 // Skip recipes that shouldn't be narrowed.
// That is: anything other than a replicate or widen recipe, recipes without
// users or without an underlying value, and replicate recipes that are already
// single-scalar or predicated.
892 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
893 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
894 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
895 continue;
896
897 // Skip recipes that may have other lanes than their first used.
// NOTE(review): the condition guarding the following `continue` is not visible
// in this listing.
899 continue;
900
901 // TODO: Support scalarizing ExtractValue.
// NOTE(review): the pattern passed to `match` is not visible in this listing.
902 if (match(Def,
904 continue;
905
// The single-scalar, unmasked clone takes over all users of Def. Def itself
// is not erased here.
906 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
907 Def->operands(), /*IsUniform*/ true,
908 /*Mask*/ nullptr, /*Flags*/ *Def);
909 Clone->insertAfter(Def);
910 Def->replaceAllUsesWith(Clone);
911 }
912
913 // Replace wide pointer inductions which have only their scalars used by
914 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
915 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
// With vector VFs in the plan, the wide pointer induction is kept unless it
// only generates scalars.
916 if (!Plan.hasScalarVFOnly() &&
917 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
918 continue;
919
920 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
921 PtrIV->replaceAllUsesWith(PtrAdd);
922 continue;
923 }
924
925 // Replace widened induction with scalar steps for users that only use
926 // scalars.
927 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
// With only vector VFs there is nothing to do if no user uses scalars.
928 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
929 return U->usesScalars(WideIV);
930 }))
931 continue;
932
933 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
// NOTE(review): the declaration of `Steps`, which the following arguments
// initialize, is not visible in this listing.
935 Plan, ID.getKind(), ID.getInductionOpcode(),
936 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
937 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
938 WideIV->getDebugLoc(), Builder);
939
940 // Update scalar users of IV to use Step instead.
// If the plan includes a scalar VF, every use is replaced. Otherwise only
// users that use scalars are redirected, and with a scalable VF only users
// that use the first lane.
941 if (!HasOnlyVectorVFs) {
942 assert(!Plan.hasScalableVF() &&
943 "plans containing a scalar VF cannot also include scalable VFs");
944 WideIV->replaceAllUsesWith(Steps);
945 } else {
946 bool HasScalableVF = Plan.hasScalableVF();
947 WideIV->replaceUsesWithIf(Steps,
948 [WideIV, HasScalableVF](VPUser &U, unsigned) {
949 if (HasScalableVF)
950 return U.usesFirstLaneOnly(WideIV);
951 return U.usesScalars(WideIV);
952 });
953 }
954 }
955}
956
957/// Check if \p VPV is an untruncated wide induction, either before or after the
958/// increment. If so return the header IV (before the increment), otherwise
959/// return null.
///
/// Two shapes are accepted: \p VPV is itself a wide induction recipe, or \p VPV
/// is defined by a two-operand recipe that adds/subtracts/GEPs the induction by
/// its own step.
///
/// NOTE(review): the signature lines of this function are absent from this
/// listing. The body reads a value named VPV and a handle named PSE (used for
/// SCEV queries); callers in this file spell the name getOptimizableIVOf --
/// confirm the exact declaration against the upstream file.
962 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
963 if (WideIV) {
964 // VPV itself is a wide induction, separately compute the end value for exit
965 // users if it is not a truncated IV.
 // Only int/FP inductions can carry a truncate instruction; any other wide
 // induction kind makes IntOrFpIV null and is returned unchanged.
966 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
967 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
968 }
969
970 // Check if VPV is an optimizable induction increment.
 // Values without a defining recipe, or defined by a recipe that does not
 // have exactly two operands, cannot be an increment.
971 VPRecipeBase *Def = VPV->getDefiningRecipe();
972 if (!Def || Def->getNumOperands() != 2)
973 return nullptr;
 // The induction may be either operand: try operand 0 first, then operand 1.
974 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
975 if (!WideIV)
976 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
977 if (!WideIV)
978 return nullptr;
979
 // Returns true if VPV applies the induction's own step to WideIV, using the
 // opcode recorded in the induction descriptor.
980 auto IsWideIVInc = [&]() {
981 auto &ID = WideIV->getInductionDescriptor();
982
983 // Check if VPV increments the induction by the induction step.
984 VPValue *IVStep = WideIV->getStepValue();
985 switch (ID.getInductionOpcode()) {
986 case Instruction::Add:
987 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
988 case Instruction::FAdd:
989 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
990 case Instruction::FSub:
 // FSub is matched with WideIV as the first operand and IVStep as the
 // second, in that order only.
991 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
992 m_Specific(IVStep)));
993 case Instruction::Sub: {
994 // IVStep will be the negated step of the subtraction. Check if Step == -1
995 // * IVStep.
996 VPValue *Step;
997 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
998 return false;
 // Compare via SCEV: both expressions must be computable and the IV step
 // must equal the negation of the subtracted value.
999 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
1000 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
1001 ScalarEvolution &SE = *PSE.getSE();
1002 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
1003 !isa<SCEVCouldNotCompute>(StepSCEV) &&
1004 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
1005 }
1006 default:
 // Any other opcode qualifies only for pointer inductions, where the
 // increment must be a GEP of WideIV by its step value.
1007 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
1008 match(VPV, m_GetElementPtr(m_Specific(WideIV),
1009 m_Specific(WideIV->getStepValue())));
1010 }
 // Every switch case above returns, so control is not expected to get here.
1011 llvm_unreachable("should have been covered by switch above");
1012 };
1013 return IsWideIVInc() ? WideIV : nullptr;
1014}
1015
1016/// Attempts to optimize the induction variable exit values for users in the
1017/// early exit block.
1020 VPValue *Incoming, *Mask;
1022 m_VPValue(Incoming))))
1023 return nullptr;
1024
1025 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1026 if (!WideIV)
1027 return nullptr;
1028
1029 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1030 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1031 return nullptr;
1032
1033 // Calculate the final index.
1034 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1035 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1036 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1037 auto *ExtractR = cast<VPInstruction>(Op);
1038 VPBuilder B(ExtractR);
1039
1040 DebugLoc DL = ExtractR->getDebugLoc();
1041 VPValue *FirstActiveLane = B.createFirstActiveLane(Mask, DL);
1042 FirstActiveLane = B.createScalarZExtOrTrunc(
1043 FirstActiveLane, CanonicalIVType, FirstActiveLane->getScalarType(), DL);
1044 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1045
1046 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1047 // changed it means the exit is using the incremented value, so we need to
1048 // add the step.
1049 if (Incoming != WideIV) {
1050 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1051 EndValue = B.createAdd(EndValue, One, DL);
1052 }
1053
1054 if (!match(WideIV, m_CanonicalWidenIV())) {
1055 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1056 VPIRValue *Start = WideIV->getStartValue();
1057 VPValue *Step = WideIV->getStepValue();
1058 EndValue = B.createDerivedIV(
1059 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1060 Start, EndValue, Step);
1061 }
1062
1063 return EndValue;
1064}
1065
1066/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1067/// VPDerivedIVRecipe for non-canonical inductions.
///
/// \p VectorPHBuilder is the builder used to emit any new derived-IV or cast.
/// \p VectorTC is the count the end value is derived from; for a canonical
/// widened IV it is returned as the end value directly.
/// Returns nullptr for a truncated int/FP induction; otherwise the end value,
/// truncated to the scalar type of \p WideIV when the two types differ.
///
/// NOTE(review): the first signature line (function name and the WideIV
/// parameter) is absent from this listing; callers in this file spell the name
/// tryToComputeEndValueForInduction -- confirm against the upstream file.
1069 VPBuilder &VectorPHBuilder,
1070 VPValue *VectorTC) {
1071 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1072 // Truncated wide inductions resume from the last lane of their vector value
1073 // in the last vector iteration which is handled elsewhere.
1074 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1075 return nullptr;
1076
1077 VPIRValue *Start = WideIV->getStartValue();
1078 VPValue *Step = WideIV->getStepValue();
 // NOTE(review): the declaration of `ID` used below is missing from this
 // listing; presumably it is WideIV's induction descriptor -- confirm against
 // the upstream file.
 // The canonical widened IV ends exactly at VectorTC; every other induction
 // gets a derived IV built from (Start, VectorTC, Step).
1080 VPValue *EndValue = VectorTC;
1081 if (!match(WideIV, m_CanonicalWidenIV())) {
1082 EndValue = VectorPHBuilder.createDerivedIV(
1083 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1084 Start, VectorTC, Step);
1085 }
1086
1087 // EndValue is derived from the vector trip count (which has the same type as
1088 // the widest induction) and thus may be wider than the induction here.
 // Only a Trunc is emitted on a type mismatch; no extension is attempted.
1089 Type *ScalarTypeOfWideIV = WideIV->getScalarType();
1090 if (ScalarTypeOfWideIV != EndValue->getScalarType()) {
1091 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1092 ScalarTypeOfWideIV,
1093 WideIV->getDebugLoc());
1094 }
1095
1096 return EndValue;
1097}
1098
1099/// Attempts to optimize the induction variable exit values for users in the
1100/// exit block coming from the latch in the original scalar loop.
1101static VPValue *
1105 VPValue *Incoming;
1107 return nullptr;
1108
1109 VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
1110 if (!WideIV)
1111 return nullptr;
1112
1113 VPValue *EndValue = EndValues.lookup(WideIV);
1114 assert(EndValue && "Must have computed the end value up front");
1115
1116 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1117 // changed it means the exit is using the incremented value, so we don't
1118 // need to subtract the step.
1119 if (Incoming != WideIV)
1120 return EndValue;
1121
1122 // Otherwise, subtract the step from the EndValue.
1123 auto *ExtractR = cast<VPInstruction>(Op);
1124 VPBuilder B(ExtractR);
1125 VPValue *Step = WideIV->getStepValue();
1126 Type *ScalarTy = WideIV->getScalarType();
1127 if (ScalarTy->isIntegerTy())
1128 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1129 if (ScalarTy->isPointerTy()) {
1130 Type *StepTy = Step->getScalarType();
1131 auto *Zero = Plan.getZero(StepTy);
1132 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1133 DebugLoc::getUnknown(), "ind.escape");
1134 }
1135 if (ScalarTy->isFloatingPointTy()) {
1136 const auto &ID = WideIV->getInductionDescriptor();
1137 return B.createNaryOp(
1138 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1139 ? Instruction::FSub
1140 : Instruction::FAdd,
1141 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1142 }
1143 llvm_unreachable("all possible induction types must be handled");
1144 return nullptr;
1145}
1146
1148 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1149 // Compute end values for all inductions.
1150 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1151 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1152 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
1154 VPValue *ResumeTC =
1155 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1156 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1157 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1158 if (!WideIV)
1159 continue;
1160 if (VPValue *EndValue =
1161 tryToComputeEndValueForInduction(WideIV, VectorPHBuilder, ResumeTC))
1162 EndValues[WideIV] = EndValue;
1163 }
1164
1165 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1166 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1167 VPValue *Op;
1168 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1169 continue;
1170 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1171 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1172 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1173 R.eraseFromParent();
1174 }
1175 }
1176
1177 // Then, optimize exit block users.
1178 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1179 for (VPRecipeBase &R : ExitVPBB->phis()) {
1180 auto *ExitIRI = cast<VPIRPhi>(&R);
1181
1182 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1183 VPValue *Escape = nullptr;
1184 if (PredVPBB == MiddleVPBB)
1186 Plan, ExitIRI->getOperand(Idx), EndValues, PSE);
1187 else
1189 Plan, ExitIRI->getOperand(Idx), PSE);
1190 if (Escape)
1191 ExitIRI->setOperand(Idx, Escape);
1192 }
1193 }
1194 }
1195}
1196
1197/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1198/// them with already existing recipes expanding the same SCEV expression.
1201
1202 for (VPRecipeBase &R :
1204 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1205 if (!ExpR)
1206 continue;
1207
1208 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1209 if (Inserted)
1210 continue;
1211
1212 ExpR->replaceAllUsesWith(V->second);
1213 if (ExpR == Plan.getTripCount())
1214 Plan.resetTripCount(V->second);
1215
1216 ExpR->eraseFromParent();
1217 }
1218}
1219
// Worklist-driven deletion: starting from V, erase the defining recipe of each
// visited value when isDeadRecipe() reports it dead, then visit that recipe's
// operands, since erasing a user may have made them dead as well.
// NOTE(review): the signature line and the declaration of `Seen` are missing
// from this listing; `Seen` is used below as a visited set keyed by VPValue
// pointers -- confirm both against the upstream file.
1221 SmallVector<VPValue *> WorkList;
1223 WorkList.push_back(V);
1224
1225 while (!WorkList.empty()) {
1226 VPValue *Cur = WorkList.pop_back_val();
 // Process each value at most once.
1227 if (!Seen.insert(Cur).second)
1228 continue;
 // Values without a defining recipe have nothing to erase.
1229 VPRecipeBase *R = Cur->getDefiningRecipe();
1230 if (!R)
1231 continue;
1232 if (!isDeadRecipe(*R))
1233 continue;
 // Queue the operands before erasing R, while R's operand list is still
 // accessible.
1234 append_range(WorkList, R->operands());
1235 R->eraseFromParent();
1236 }
1237}
1238
1239/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1240/// Returns an optional pair, where the first element indicates whether it is
1241/// an intrinsic ID.
1242static std::optional<std::pair<bool, unsigned>>
1244 return TypeSwitch<const VPSingleDefRecipe *,
1245 std::optional<std::pair<bool, unsigned>>>(R)
1248 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1249 .Case([](const VPWidenIntrinsicRecipe *I) {
1250 return std::make_pair(true, I->getVectorIntrinsicID());
1251 })
1252 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1253 [](auto *I) {
1254 // For recipes that do not directly map to LLVM IR instructions,
1255 // assign opcodes after the last VPInstruction opcode (which is also
1256 // after the last IR Instruction opcode), based on the VPRecipeID.
1257 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1258 I->getVPRecipeID());
1259 })
1260 .Default([](auto *) { return std::nullopt; });
1261}
1262
1263/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1264/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1265/// Operands are foldable live-ins.
1267 ArrayRef<VPValue *> Operands,
1268 const DataLayout &DL) {
1269 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1270 if (!OpcodeOrIID)
1271 return nullptr;
1272
1274 for (VPValue *Op : Operands) {
1275 VPValue *Candidate = Op;
1276 match(Op, m_Broadcast(m_VPValue(Candidate)));
1277 if (!match(Candidate, m_LiveIn()))
1278 return nullptr;
1279 Value *V = Candidate->getUnderlyingValue();
1280 if (!V)
1281 return nullptr;
1282 Ops.push_back(V);
1283 }
1284
1285 VPlan &Plan = *R.getParent()->getPlan();
1286 auto FoldToIRValue = [&]() -> Value * {
1287 InstSimplifyFolder Folder(DL);
1288 if (OpcodeOrIID->first) {
1289 if (R.getNumOperands() != 2)
1290 return nullptr;
1291 unsigned ID = OpcodeOrIID->second;
1292 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1], R.getScalarType());
1293 }
1294 unsigned Opcode = OpcodeOrIID->second;
1295 if (Instruction::isBinaryOp(Opcode))
1296 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1297 Ops[0], Ops[1]);
1298 if (Instruction::isCast(Opcode))
1299 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1300 R.getVPSingleValue()->getScalarType());
1301 switch (Opcode) {
1303 return Folder.FoldSelect(Ops[0], Ops[1],
1305 case VPInstruction::Not:
1306 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1308 case Instruction::Select:
1309 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1310 case Instruction::ICmp:
1311 case Instruction::FCmp:
1312 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1313 Ops[1]);
1314 case Instruction::GetElementPtr: {
1315 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1316 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1317 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1318 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1319 }
1322 return Folder.FoldGEP(IntegerType::getInt8Ty(Plan.getContext()), Ops[0],
1323 Ops[1],
1324 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1325 // An extract of a live-in is an extract of a broadcast, so return the
1326 // broadcasted element.
1327 case Instruction::ExtractElement:
1328 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1329 return Ops[0];
1330 }
1331 return nullptr;
1332 };
1333
1334 if (Value *V = FoldToIRValue())
1335 return Plan.getOrAddLiveIn(V);
1336 return nullptr;
1337}
1338
1339/// Try to simplify logical and bitwise recipes in \p Def.
1341 bool CanCreateNewRecipe) {
1342 VPlan *Plan = Def->getParent()->getPlan();
1343
1344 // Simplify (X && Y) | (X && !Y) -> X.
1345 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1346 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1347 // recipes to be visited during simplification.
1348 VPValue *X, *Y, *Z;
1349 if (match(Def,
1352 Def->replaceAllUsesWith(X);
1353 Def->eraseFromParent();
1354 return true;
1355 }
1356
1357 // x | AllOnes -> AllOnes
1358 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
1359 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1360 return true;
1361 }
1362
1363 // x | 0 -> x
1364 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt()))) {
1365 Def->replaceAllUsesWith(X);
1366 return true;
1367 }
1368
1369 // x | !x -> AllOnes
1370 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_Not(m_Deferred(X))))) {
1371 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1372 return true;
1373 }
1374
1375 // x & 0 -> 0
1376 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) {
1377 Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1378 return true;
1379 }
1380
1381 // x & AllOnes -> x
1382 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes()))) {
1383 Def->replaceAllUsesWith(X);
1384 return true;
1385 }
1386
1387 // x && false -> false
1388 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False()))) {
1389 Def->replaceAllUsesWith(Plan->getFalse());
1390 return true;
1391 }
1392
1393 // x && true -> x
1394 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True()))) {
1395 Def->replaceAllUsesWith(X);
1396 return true;
1397 }
1398
1399 // (x && y) | (x && z) -> x && (y | z)
1400 if (CanCreateNewRecipe &&
1403 // Simplify only if one of the operands has one use to avoid creating an
1404 // extra recipe.
1405 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1406 !Def->getOperand(1)->hasMoreThanOneUniqueUser())) {
1407 Def->replaceAllUsesWith(
1408 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1409 return true;
1410 }
1411
1412 // x && (x && y) -> x && y
1413 if (match(Def, m_LogicalAnd(m_VPValue(X),
1415 Def->replaceAllUsesWith(Def->getOperand(1));
1416 return true;
1417 }
1418
1419 // x && (y && x) -> x && y
1420 if (match(Def, m_LogicalAnd(m_VPValue(X),
1422 Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1423 return true;
1424 }
1425
1426 // x && !x -> 0
1427 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) {
1428 Def->replaceAllUsesWith(Plan->getFalse());
1429 return true;
1430 }
1431
1432 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) {
1433 Def->replaceAllUsesWith(X);
1434 return true;
1435 }
1436
1437 // select c, false, true -> not c
1438 VPValue *C;
1439 if (CanCreateNewRecipe &&
1440 match(Def, m_Select(m_VPValue(C), m_False(), m_True()))) {
1441 Def->replaceAllUsesWith(Builder.createNot(C));
1442 return true;
1443 }
1444
1445 // select !c, x, y -> select c, y, x
1446 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1447 Def->setOperand(0, C);
1448 Def->setOperand(1, Y);
1449 Def->setOperand(2, X);
1450 return true;
1451 }
1452
1453 // select x, (i1 y | z), y -> y | (x && z)
1454 if (CanCreateNewRecipe &&
1455 match(Def, m_Select(m_VPValue(X),
1457 m_Deferred(Y))) &&
1458 Y->getScalarType()->isIntegerTy(1)) {
1459 Def->replaceAllUsesWith(
1460 Builder.createOr(Y, Builder.createLogicalAnd(X, Z)));
1461 return true;
1462 }
1463
1464 return false;
1465}
1466
1467/// Try to simplify VPSingleDefRecipe \p Def.
1469 VPlan *Plan = Def->getParent()->getPlan();
1470
1471 // Simplification of live-in IR values for SingleDef recipes using
1472 // InstSimplifyFolder.
1473 const DataLayout &DL = Plan->getDataLayout();
1474 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL))
1475 return Def->replaceAllUsesWith(V);
1476
1477 // Fold PredPHI LiveIn -> LiveIn.
1478 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1479 VPValue *Op = PredPHI->getOperand(0);
1480 if (isa<VPIRValue>(Op))
1481 PredPHI->replaceAllUsesWith(Op);
1482 }
1483
1484 VPBuilder Builder(Def);
1485
1486 // Avoid replacing VPInstructions with underlying values with new
1487 // VPInstructions, as we would fail to create widen/replicate recipes from the
1488 // new VPInstructions without an underlying value, and miss out on some
1489 // transformations that only apply to widened/replicated recipes later, by
1490 // doing so.
1491 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1492 // VPInstructions without underlying values, as those will get skipped during
1493 // cost computation.
1494 bool CanCreateNewRecipe =
1495 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1496
1497 VPValue *A;
1498 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1499 Type *TruncTy = Def->getScalarType();
1500 Type *ATy = A->getScalarType();
1501 if (TruncTy == ATy) {
1502 Def->replaceAllUsesWith(A);
1503 } else {
1504 // Don't replace a non-widened cast recipe with a widened cast.
1505 if (!isa<VPWidenCastRecipe>(Def))
1506 return;
1507 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1508
1509 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1510 ? Instruction::SExt
1511 : Instruction::ZExt;
1512 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1513 TruncTy);
1514 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1515 // UnderlyingExt has distinct return type, used to retain legacy cost.
1516 Ext->setUnderlyingValue(UnderlyingExt);
1517 }
1518 Def->replaceAllUsesWith(Ext);
1519 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1520 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1521 Def->replaceAllUsesWith(Trunc);
1522 }
1523 }
1524 }
1525
1526 if (simplifyLogicalRecipe(Def, Builder, CanCreateNewRecipe))
1527 return;
1528
1529 VPValue *X, *Y, *C;
1530 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1531 return Def->replaceAllUsesWith(A);
1532
1533 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1534 return Def->replaceAllUsesWith(A);
1535
1536 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1537 return Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1538
1539 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1540 // Preserve nsw from the Mul on the new Sub.
1542 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1543 return Def->replaceAllUsesWith(Builder.createSub(
1544 Plan->getZero(A->getScalarType()), A, Def->getDebugLoc(), "", NW));
1545 }
1546
1547 if (CanCreateNewRecipe &&
1549 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1550 // new Sub.
1552 false,
1553 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1554 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1555 ->hasNoSignedWrap()};
1556 return Def->replaceAllUsesWith(
1557 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1558 }
1559
1560 const APInt *APC;
1561 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1562 APC->isPowerOf2())
1563 return Def->replaceAllUsesWith(Builder.createNaryOp(
1564 Instruction::Shl,
1565 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1566 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1567
1568 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1569 APC->isPowerOf2())
1570 return Def->replaceAllUsesWith(Builder.createNaryOp(
1571 Instruction::LShr,
1572 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1573 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1574
1575 if (match(Def, m_Not(m_VPValue(A)))) {
1576 if (match(A, m_Not(m_VPValue(A))))
1577 return Def->replaceAllUsesWith(A);
1578
1579 // Try to fold Not into compares by adjusting the predicate in-place.
1580 CmpPredicate Pred;
1581 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1582 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1583 if (all_of(Cmp->users(),
1585 m_Not(m_Specific(Cmp)),
1586 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1587 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1588 for (VPUser *U : to_vector(Cmp->users())) {
1589 auto *R = cast<VPSingleDefRecipe>(U);
1590 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1591 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1592 R->setOperand(1, Y);
1593 R->setOperand(2, X);
1594 } else {
1595 // not (cmp pred) -> cmp inv_pred
1596 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1597 R->replaceAllUsesWith(Cmp);
1598 }
1599 }
1600 // If Cmp doesn't have a debug location, use the one from the negation,
1601 // to preserve the location.
1602 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1603 Cmp->setDebugLoc(Def->getDebugLoc());
1604 }
1605 }
1606 }
1607
1608 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1609 // any-of (fcmp uno %A, %B), ...
1610 if (match(Def, m_AnyOf())) {
1612 VPRecipeBase *UnpairedCmp = nullptr;
1613 for (VPValue *Op : Def->operands()) {
1614 VPValue *X;
1615 if (Op->getNumUsers() > 1 ||
1617 m_Deferred(X)))) {
1618 NewOps.push_back(Op);
1619 } else if (!UnpairedCmp) {
1620 UnpairedCmp = Op->getDefiningRecipe();
1621 } else {
1622 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1623 UnpairedCmp->getOperand(0), X));
1624 UnpairedCmp = nullptr;
1625 }
1626 }
1627
1628 if (UnpairedCmp)
1629 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1630
1631 if (NewOps.size() < Def->getNumOperands()) {
1632 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1633 return Def->replaceAllUsesWith(NewAnyOf);
1634 }
1635 }
1636
1637 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1638 // This is useful for fmax/fmin without fast-math flags, where we need to
1639 // check if any operand is NaN.
1640 if (CanCreateNewRecipe &&
1642 m_Deferred(X)),
1644 m_Deferred(Y))))) {
1645 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1646 return Def->replaceAllUsesWith(NewCmp);
1647 }
1648
1649 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1650 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1651 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1652 Def->getOperand(1)->getScalarType() == Def->getScalarType())
1653 return Def->replaceAllUsesWith(Def->getOperand(1));
1654
1656 m_One()))) {
1657 Type *WideStepTy = Def->getScalarType();
1658 if (X->getScalarType() != WideStepTy)
1659 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1660 Def->replaceAllUsesWith(X);
1661 return;
1662 }
1663
1664 // For i1 vp.merges produced by AnyOf reductions:
1665 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1667 m_VPValue(X), m_VPValue())) &&
1669 Def->getScalarType()->isIntegerTy(1)) {
1670 Def->setOperand(1, Def->getOperand(0));
1671 Def->setOperand(0, Y);
1672 return;
1673 }
1674
1675 // Simplify MaskedCond with no block mask to its single operand.
1677 !cast<VPInstruction>(Def)->isMasked())
1678 return Def->replaceAllUsesWith(Def->getOperand(0));
1679
1680 // Look through ExtractLastLane.
1681 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1682 if (match(A, m_BuildVector())) {
1683 auto *BuildVector = cast<VPInstruction>(A);
1684 Def->replaceAllUsesWith(
1685 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1686 return;
1687 }
1688
1689 if (match(A, m_Broadcast(m_VPValue(X))))
1690 return Def->replaceAllUsesWith(X);
1691
1693 return Def->replaceAllUsesWith(A);
1694
1695 if (Plan->hasScalarVFOnly())
1696 return Def->replaceAllUsesWith(A);
1697 }
1698
1699 // Look through ExtractPenultimateElement (BuildVector ....).
1701 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1702 Def->replaceAllUsesWith(
1703 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1704 return;
1705 }
1706
1707 uint64_t Idx;
1709 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1710 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1711 return;
1712 }
1713
1714 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1715 Def->replaceAllUsesWith(
1716 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1717 return;
1718 }
1719
1720 // Look through broadcast of single-scalar when used as select conditions; in
1721 // that case the scalar condition can be used directly.
1722 if (match(Def,
1725 "broadcast operand must be single-scalar");
1726 Def->setOperand(0, C);
1727 return;
1728 }
1729
1730 if (match(Def, m_Broadcast(m_VPValue(X))))
1731 return Def->replaceUsesWithIf(
1732 X, [Def](const VPUser &U, unsigned) { return U.usesScalars(Def); });
1733
1735 if (Def->getNumOperands() == 1) {
1736 Def->replaceAllUsesWith(Def->getOperand(0));
1737 return;
1738 }
1739 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1740 if (all_equal(Phi->incoming_values()))
1741 Phi->replaceAllUsesWith(Phi->getOperand(0));
1742 }
1743 return;
1744 }
1745
1746 VPIRValue *IRV;
1747 if (Def->getNumOperands() == 1 &&
1749 return Def->replaceAllUsesWith(IRV);
1750
1751 // Some simplifications can only be applied after unrolling. Perform them
1752 // below.
1753 if (!Plan->isUnrolled())
1754 return;
1755
1756 // After unrolling, extract-lane may be used to extract values from multiple
1757 // scalar sources. Only simplify when extracting from a single scalar source.
1758 VPValue *LaneToExtract;
1759 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1760 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1762 return Def->replaceAllUsesWith(A);
1763
1764 // Replace extract-lane(0, canonical-WIDEN-INDUCTION) with the region's
1765 // scalar canonical IV.
1767 if (match(LaneToExtract, m_ZeroInt()) &&
1768 match(A, m_CanonicalWidenIV(WidenIV)))
1769 return Def->replaceAllUsesWith(WidenIV->getRegion()->getCanonicalIV());
1770
1771 // Simplify extract-lane with single source to extract-element.
1772 Def->replaceAllUsesWith(Builder.createNaryOp(
1773 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1774 return;
1775 }
1776
1777 // Look for cycles where Def is of the form:
1778 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1779 // IVInc = X + Step ; used by X and Def
1780 // Def = IVInc + Y
1781 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1782 // and if Inc exists, replace it with X.
1783 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1784 isa<VPIRValue>(Y) &&
1785 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1786 auto *Phi = cast<VPPhi>(X);
1787 auto *IVInc = Def->getOperand(0);
1788 if (IVInc->getNumUsers() == 2) {
1789 // If Phi has a second user (besides IVInc's defining recipe), it must
1790 // be Inc = Phi + Y for the fold to apply.
1792 findUserOf(Phi, m_Add(m_Specific(Phi), m_Specific(Y))));
1793 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1794 Def->replaceAllUsesWith(IVInc);
1795 if (Inc)
1796 Inc->replaceAllUsesWith(Phi);
1797 Phi->setOperand(0, Y);
1798 return;
1799 }
1800 }
1801 }
1802
1803 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1804 // just the pointer operand.
1805 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1806 if (!VPR->getVFxPart() || match(VPR->getVFxPart(), m_ZeroInt()))
1807 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1808
1809 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1810 // the start index is zero and only the first lane 0 is demanded.
1811 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1812 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1813 Steps->replaceAllUsesWith(Steps->getOperand(0));
1814 return;
1815 }
1816 }
1817 // Simplify redundant ReductionStartVector recipes after unrolling.
1818 VPValue *StartV;
1820 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1821 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1822 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1823 return PhiR && PhiR->isInLoop();
1824 });
1825 return;
1826 }
1827
1828 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1829 return Def->replaceAllUsesWith(A);
1830}
1831
1841
1842/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1843/// header mask to be simplified further when tail folding, e.g. in
1844/// optimizeEVLMasks.
///
/// Does nothing when \p Plan has no header mask. Matching recipes have their
/// uses redirected to the newly built logical-and; the old recipe is not
/// erased by this function.
1845static void reassociateHeaderMask(VPlan &Plan) {
1846 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1847 if (!HeaderMask)
1848 return;
1849
 // Seed the worklist from users of the form (headermask && <anything>).
1850 SmallVector<VPUser *> Worklist;
1851 for (VPUser *U : HeaderMask->users())
1852 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
 // NOTE(review): the statement guarded by this `if` is missing from this
 // listing; presumably it appends U to Worklist -- confirm against the
 // upstream file.
1854
1855 while (!Worklist.empty()) {
 // Only single-def recipes of the exact shape (headermask && X) && Y are
 // rewritten; everything else popped from the worklist is ignored.
1856 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1857 VPValue *X, *Y;
1858 if (!R || !match(R, m_LogicalAnd(
1859 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1860 m_VPValue(Y))))
1861 continue;
 // Queue R's users before redirecting them: once they refer to the new
 // headermask && (x && y) value they may match the pattern themselves.
1862 append_range(Worklist, R->users());
 // New recipes are created through a builder positioned at R.
1863 VPBuilder Builder(R);
1864 R->replaceAllUsesWith(
1865 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1866 }
1867}
1868
1869static std::optional<Instruction::BinaryOps>
1871 switch (ID) {
1872 case Intrinsic::masked_udiv:
1873 return Instruction::UDiv;
1874 case Intrinsic::masked_sdiv:
1875 return Instruction::SDiv;
1876 case Intrinsic::masked_urem:
1877 return Instruction::URem;
1878 case Intrinsic::masked_srem:
1879 return Instruction::SRem;
1880 default:
1881 return {};
1882 }
1883}
1884
// NOTE(review): the line(s) declaring this function are missing from this
// listing (the embedded numbering jumps from 1884 to 1886). A comment further
// down the file refers to it as narrowToSingleScalarRecipes; the visible body
// operates on a VPlan named Plan.
// Nothing to narrow when the plan only has a scalar VF.
1886 if (Plan.hasScalarVFOnly())
1887 return;
1888
// NOTE(review): the head of the outer loop that introduces VPBB (numbered
// 1889) is missing; only the tail of its range expression is visible.
1890 vp_depth_first_deep(Plan.getEntry()))) {
// Recipes are visited in reverse order; the early-inc range permits erasing R
// during the walk.
1891 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
// NOTE(review): the condition guarding this `continue` (numbered 1892-1893) is
// missing from the listing.
1894 continue;
1895 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
// Replicates that are already single-scalar, or that are predicated, are left
// alone.
1896 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1897 continue;
1898
1899 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
// Replicated store whose operand 1 is a single scalar (presumably the address,
// following IR store operand order): replace it with one single-scalar store
// of the last lane of operand 0.
1900 if (RepR && RepR->getOpcode() == Instruction::Store &&
1901 vputils::isSingleScalar(RepR->getOperand(1))) {
1902 auto *Clone = new VPReplicateRecipe(
1903 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1904 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1905 *RepR /*Metadata*/, RepR->getDebugLoc());
1906 Clone->insertBefore(RepOrWidenR);
1907 VPBuilder Builder(Clone);
1908 VPValue *ExtractOp = Clone->getOperand(0);
// When operand 1 is also uniform across VFs and UFs, the last part is
// extracted first, then the last lane of it.
1909 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1910 ExtractOp =
1911 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1912 ExtractOp =
1913 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1914 Clone->setOperand(0, ExtractOp);
1915 RepR->eraseFromParent();
1916 continue;
1917 }
1918
1919 // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor.
1920 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(RepOrWidenR)) {
// Only applies when users demand just the first lane of the result.
1921 if (!vputils::onlyFirstLaneUsed(IntrR))
1922 continue;
1923 auto Opc = getUnmaskedDivRemOpcode(IntrR->getVectorIntrinsicID());
1924 if (!Opc)
1925 continue;
1926 VPBuilder Builder(IntrR);
// SafeDivisor = select(operand 2, operand 1, 1): operand 2 (presumably the
// mask) chooses between the original divisor and the constant 1.
1927 VPValue *SafeDivisor = Builder.createSelect(
1928 IntrR->getOperand(2), IntrR->getOperand(1),
1929 Plan.getConstantInt(IntrR->getScalarType(), 1));
1930 VPValue *Clone = Builder.createNaryOp(
1931 *Opc, {IntrR->getOperand(0), SafeDivisor},
1932 VPIRFlags::getDefaultFlags(*Opc), IntrR->getDebugLoc());
1933 IntrR->replaceAllUsesWith(Clone);
1934 IntrR->eraseFromParent();
1935 continue;
1936 }
1937
1938 // Skip recipes that aren't single scalars.
1939 if (!vputils::isSingleScalar(RepOrWidenR))
1940 continue;
1941
1942 // Predicate to check if a user of Op introduces extra broadcasts.
// IntroducesBCastOf(Op) yields a predicate over users U of Op; apart from the
// VPInstruction special case below it is true when U does not use Op as a
// scalar.
1943 auto IntroducesBCastOf = [](const VPValue *Op) {
1944 return [Op](const VPUser *U) {
1945 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
// NOTE(review): the start of this condition (numbered 1946-1948) is missing
// from the listing; it tests VPI's opcode.
1949 VPI->getOpcode()))
1950 return false;
1951 }
1952 return !U->usesScalars(Op);
1953 };
1954 };
1955
// Skip narrowing when some user of the recipe would need a broadcast of it and
// no operand satisfies the lambda below.
1956 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1957 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
// An operand with another user that already introduces a broadcast of it does
// not count.
1958 if (any_of(
1959 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1960 IntroducesBCastOf(Op)))
1961 return false;
1962 // Non-constant live-ins require broadcasts, while constants do not
1963 // need explicit broadcasts.
1964 auto *IRV = dyn_cast<VPIRValue>(Op);
1965 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1966 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1967 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1968 }))
1969 continue;
1970
// Replace the recipe with a single-scalar replicate of the same underlying
// instruction and operands, carrying over its IR flags.
1971 auto *Clone = new VPReplicateRecipe(
1972 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1973 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1974 Clone->insertBefore(RepOrWidenR);
1975 RepOrWidenR->replaceAllUsesWith(Clone);
1976 if (isDeadRecipe(*RepOrWidenR))
1977 RepOrWidenR->eraseFromParent();
1978 }
1979 }
1980}
1981
1982/// Try to see if all of \p Blend's masks share a common value logically and'ed
1983/// and remove it from the masks.
1985 if (Blend->isNormalized())
1986 return;
1987 VPValue *CommonEdgeMask;
1988 if (!match(Blend->getMask(0),
1989 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1990 return;
1991 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1992 if (!match(Blend->getMask(I),
1993 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1994 return;
1995 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1996 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1997}
1998
1999/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
2000/// to make sure the masks are simplified.
2001static void simplifyBlends(VPlan &Plan) {
// NOTE(review): the loop header that introduces VPBB (numbered 2002-2003) is
// missing from this listing.
2004 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2005 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
2006 if (!Blend)
2007 continue;
2008
2009 removeCommonBlendMask(Blend);
2010
2011 // Try to remove redundant blend recipes.
// Collect the distinct incoming values whose mask is not known false. Incoming
// value 0 is always counted when the blend is normalized.
2012 SmallPtrSet<VPValue *, 4> UniqueValues;
2013 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
2014 UniqueValues.insert(Blend->getIncomingValue(0));
2015 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
2016 if (!match(Blend->getMask(I), m_False()))
2017 UniqueValues.insert(Blend->getIncomingValue(I));
2018
// A single distinct value makes the blend redundant: forward that value.
2019 if (UniqueValues.size() == 1) {
2020 Blend->replaceAllUsesWith(*UniqueValues.begin());
2021 Blend->eraseFromParent();
2022 continue;
2023 }
2024
2025 if (Blend->isNormalized())
2026 continue;
2027
2028 // Normalize the blend so its first incoming value is used as the initial
2029 // value with the others blended into it.
2030
// StartIndex selects the incoming value that becomes the mask-less initial
// value; it stays 0 unless a better candidate is found below.
2031 unsigned StartIndex = 0;
2032 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2033 // If a value's mask is used only by the blend then it can be dead-coded.
2034 // TODO: Find the most expensive mask that can be deadcoded, or a mask
2035 // that's used by multiple blends where it can be removed from them all.
2036 VPValue *Mask = Blend->getMask(I);
2037 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
2038 StartIndex = I;
2039 break;
2040 }
2041 }
2042
// New operand list: the start value first, then (value, mask) pairs for every
// other incoming value in their original order.
2043 SmallVector<VPValue *, 4> OperandsWithMask;
2044 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
2045
2046 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2047 if (I == StartIndex)
2048 continue;
2049 OperandsWithMask.push_back(Blend->getIncomingValue(I));
2050 OperandsWithMask.push_back(Blend->getMask(I));
2051 }
2052
2053 auto *NewBlend =
2054 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
2055 OperandsWithMask, *Blend, Blend->getDebugLoc());
2056 NewBlend->insertBefore(&R);
2057
// The mask of the start value is no longer referenced by the new blend.
// NOTE(review): the line numbered 2061 is missing from this listing; DeadMask
// has no visible use, so presumably that line cleans it up. Confirm upstream.
2058 VPValue *DeadMask = Blend->getMask(StartIndex);
2059 Blend->replaceAllUsesWith(NewBlend);
2060 Blend->eraseFromParent();
2062
2063 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
2064 VPValue *NewMask;
2065 if (NewBlend->getNumOperands() == 3 &&
2066 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2067 VPValue *Inc0 = NewBlend->getOperand(0);
2068 VPValue *Inc1 = NewBlend->getOperand(1);
2069 VPValue *OldMask = NewBlend->getOperand(2);
2070 NewBlend->setOperand(0, Inc1);
2071 NewBlend->setOperand(1, Inc0);
2072 NewBlend->setOperand(2, NewMask);
// Erase the Not once nothing uses it any more.
2073 if (OldMask->getNumUsers() == 0)
2074 cast<VPInstruction>(OldMask)->eraseFromParent();
2075 }
2076 }
2077 }
2078}
2079
2080/// Optimize the width of vector induction variables in \p Plan based on a known
2081/// constant Trip Count, \p BestVF and \p BestUF.
/// Returns true if at least one induction was rewritten.
// NOTE(review): the first signature line (numbered 2082) is missing from this
// listing; the call site further down names this function
// optimizeVectorInductionWidthForTCAndVFUF and passes (Plan, BestVF, BestUF).
2083 ElementCount BestVF,
2084 unsigned BestUF) {
2085 // Only proceed if we have not completely removed the vector region.
2086 if (!Plan.getVectorLoopRegion())
2087 return false;
2088
// Requires a fixed VF and a trip count that matches as a constant APInt.
2089 const APInt *TC;
2090 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2091 return false;
2092
2093 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2094 // and UF. Returns at least 8.
2095 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
// NOTE(review): the initializer of AlignedTC (numbered 2097-2098) is missing
// from this listing.
2096 APInt AlignedTC =
2099 APInt MaxVal = AlignedTC - 1;
2100 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2101 };
2102 unsigned NewBitWidth =
2103 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2104
2105 LLVMContext &Ctx = Plan.getContext();
2106 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2107
2108 bool MadeChange = false;
2109
2110 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2111 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2112 // Currently only handle canonical IVs as it is trivial to replace the start
2113 // and stop values, and we currently only perform the optimization when the
2114 // IV has a single use.
// NOTE(review): the declaration of WideIV (numbered 2115) is missing from this
// listing.
2116 if (!match(&Phi, m_CanonicalWidenIV(WideIV)))
2117 continue;
// Skip IVs with several distinct users, and IVs that already have the target
// type.
2118 if (WideIV->hasMoreThanOneUniqueUser() ||
2119 NewIVTy == WideIV->getScalarType())
2120 continue;
2121
2122 // Currently only handle cases where the single user is a header-mask
2123 // comparison with the backedge-taken-count.
2124 VPUser *SingleUser = WideIV->getSingleUser();
// NOTE(review): the second operand of the m_ICmp pattern (numbered 2128) is
// missing from this listing.
2125 if (!SingleUser ||
2126 !match(SingleUser,
2127 m_ICmp(m_Specific(WideIV),
2129 continue;
2130
2131 // Update IV operands and comparison bound to use new narrower type.
2132 assert(!WideIV->getTruncInst() &&
2133 "canonical IV is not expected to have a truncation");
// The narrow IV starts at zero with step one in NewIVTy and reuses the PHI
// node, VF value, induction descriptor, flags and debug location of WideIV.
2134 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
2135 WideIV->getPHINode(), Plan.getZero(NewIVTy),
2136 Plan.getConstantInt(NewIVTy, 1), WideIV->getVFValue(),
2137 WideIV->getInductionDescriptor(), *WideIV, WideIV->getDebugLoc());
2138 NewWideIV->insertBefore(WideIV);
2139
// The backedge-taken count is truncated to NewIVTy in the vector preheader.
2140 auto *NewBTC = new VPWidenCastRecipe(
2141 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2142 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2143 Plan.getVectorPreheader()->appendRecipe(NewBTC);
// Re-create the compare on the narrow operands with the original predicate.
2144 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2145 Cmp->replaceAllUsesWith(
2146 VPBuilder(Cmp).createICmp(Cmp->getPredicate(), NewWideIV, NewBTC));
2147
2148 MadeChange = true;
2149 }
2150
2151 return MadeChange;
2152}
2153
2154/// Return true if \p Cond is known to be true for given \p BestVF and \p
2155/// BestUF.
// NOTE(review): several lines of this function are missing from this listing
// (numbered 2156, 2158-2159, 2166-2167 and 2177): the start of the signature,
// the condition guarding the recursive any_of below, the head of the match on
// the compare, and the initializer of VectorTripCount. The recursive call shows
// the parameters are (Cond, Plan, BestVF, BestUF, PSE).
2157 ElementCount BestVF, unsigned BestUF,
// Recurse into the operands of Cond's defining recipe: one operand known to be
// true is sufficient on this path.
2160 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2161 &PSE](VPValue *C) {
2162 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2163 });
2164
2165 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
// The visible operands of the pattern are (CanIV + VFxUF), with the add
// commutative, and the vector trip count.
2168 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2169 m_Specific(&Plan.getVectorTripCount()))))
2170 return false;
2171
2172 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2173 // count is not conveniently available as SCEV so far, so we compare directly
2174 // against the original trip count. This is stricter than necessary, as we
2175 // will only return true if the trip count == vector trip count.
2176 const SCEV *VectorTripCount =
// Fall back to the SCEV of the original trip count.
2178 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2179 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2180 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2181 "Trip count SCEV must be computable");
2182 ScalarEvolution &SE = *PSE.getSE();
// C is BestVF * BestUF expressed in the type of the trip count.
2183 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2184 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2185 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2186}
2187
2188/// Try to replace multiple active lane masks used for control flow with
2189/// a single, wide active lane mask instruction followed by multiple
2190/// extract subvector intrinsics. This applies to the active lane mask
2191/// instructions both in the loop and in the preheader.
2192/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2193/// new extracts from the first active lane mask, which has its last
2194/// operand (multiplier) set to UF.
/// Returns true if the plan was changed.
// NOTE(review): the first signature line (numbered 2195) is missing from this
// listing; the call site further down names this function
// tryToReplaceALMWithWideALM and passes (Plan, BestVF, BestUF).
2196 unsigned UF) {
// Only applies when the option is enabled, VF is a vector VF and UF != 1.
2197 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2198 return false;
2199
2200 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2201 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2202 auto *Term = &ExitingVPBB->back();
2203
2204 using namespace llvm::VPlanPatternMatch;
// NOTE(review): the head of this terminator match (numbered 2205) is missing
// from this listing; only a three-operand sub-pattern is visible.
2206 m_VPValue(), m_VPValue(), m_VPValue())))))
2207 return false;
2208
2209 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2210 LLVMContext &Ctx = Plan.getContext();
2211
// Creates one vector_extract of ALM per unroll part, at element offset
// VF.getKnownMinValue() * Part, and stores it in Extracts[Part].
2212 auto ExtractFromALM = [&](VPInstruction *ALM,
2213 SmallVectorImpl<VPValue *> &Extracts) {
2214 DebugLoc DL = ALM->getDebugLoc();
2215 for (unsigned Part = 0; Part < UF; ++Part) {
// NOTE(review): the declaration of Ops (numbered 2216) is missing from this
// listing.
2217 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2218 auto *Ext =
2219 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2220 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2221 Extracts[Part] = Ext;
2222 Ext->insertAfter(ALM);
2223 }
2224 };
2225
2226 // Create a list of each active lane mask phi, ordered by unroll part.
// NOTE(review): the declaration of Phis (2227), the cast that defines Phi
// (2229) and parts of the patterns below (2234, 2239, 2246) are missing from
// this listing.
2228 for (VPRecipeBase &R : Header->phis()) {
2230 if (!Phi)
2231 continue;
2232 VPValue *Index = nullptr;
2233 match(Phi->getBackedgeValue(),
2235 assert(Index && "Expected index from ActiveLaneMask instruction");
2236
// Part is bound from the constant factor of the multiply in the index pattern
// and selects the slot in Phis.
2237 uint64_t Part;
2238 if (match(Index,
2240 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2241 Phis[Part] = Phi;
2242 else {
2243 // Anything other than a CanonicalIVIncrementForPart is part 0
2244 assert(!match(
2245 Index,
2247 Phis[0] = Phi;
2248 }
2249 }
2250
2251 assert(all_of(Phis, not_equal_to(nullptr)) &&
2252 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2253
// The part-0 phi supplies the two masks that get widened: its start value and
// its backedge value.
2254 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2255 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2256
2257 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2258 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2259 "Expected incoming values of Phi to be ActiveLaneMasks");
2260
2261 // When using wide lane masks, the return type of the get.active.lane.mask
2262 // intrinsic is VF x UF (last operand).
2263 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2264 EntryALM->setOperand(2, ALMMultiplier);
2265 LoopALM->setOperand(2, ALMMultiplier);
2266
2267 // Create UF x extract vectors and insert into preheader.
2268 SmallVector<VPValue *> EntryExtracts(UF);
2269 ExtractFromALM(EntryALM, EntryExtracts);
2270
2271 // Create UF x extract vectors and insert before the loop compare & branch,
2272 // updating the compare to use the first extract.
2273 SmallVector<VPValue *> LoopExtracts(UF);
2274 ExtractFromALM(LoopALM, LoopExtracts);
2275 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2276 Not->setOperand(0, LoopExtracts[0]);
2277
2278 // Update the incoming values of active lane mask phis.
2279 for (unsigned Part = 0; Part < UF; ++Part) {
2280 Phis[Part]->setStartValue(EntryExtracts[Part]);
2281 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2282 }
2283
2284 return true;
2285}
2286
2287/// Try to simplify the branch condition of \p Plan. This may restrict the
2288/// resulting plan to \p BestVF and \p BestUF.
/// Returns true if the terminator of the exiting block was changed.
// NOTE(review): several lines of this function are missing from this listing
// (numbered 2289, 2291, 2302, 2307, 2310 and 2319): parts of the signature, the
// head of the second terminator pattern, the initializer and fallback value of
// VectorTripCount, and the second alternative of the `else if` condition. The
// call site further down passes (Plan, BestVF, BestUF, PSE).
2290 unsigned BestUF,
2292 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2293 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2294 auto *Term = &ExitingVPBB->back();
2295 VPValue *Cond;
2296 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2297 // Check if the branch condition compares the canonical IV increment (for main
2298 // loop), or the canonical IV increment plus an offset (for epilog loop).
2299 if (match(Term, m_BranchOnCount(
2300 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2301 m_VPValue())) ||
2303 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2304 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2305 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2306 const SCEV *VectorTripCount =
2308 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2309 VectorTripCount =
2311 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2312 "Trip count SCEV must be computable");
2313 ScalarEvolution &SE = *PSE.getSE();
// C is BestVF * BestUF in the trip count's type; give up unless the trip count
// is provably unsigned-less-or-equal to it.
2314 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2315 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2316 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2317 return false;
2318 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2320 // For BranchOnCond, check if we can prove the condition to be true using VF
2321 // and UF.
2322 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2323 return false;
2324 } else {
// Any other terminator is left untouched.
2325 return false;
2326 }
2327
2328 // The vector loop region only executes once. Convert terminator of the
2329 // exiting block to exit in the first iteration.
// A BranchOnTwoConds terminator is kept; only its operand 1 is set to true.
2330 if (match(Term, m_BranchOnTwoConds())) {
2331 Term->setOperand(1, Plan.getTrue());
2332 return true;
2333 }
2334
// Otherwise the terminator is replaced by BranchOnCond(true), keeping the
// debug location.
2335 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2336 {}, Term->getDebugLoc());
2337 ExitingVPBB->appendRecipe(BOC);
2338 Term->eraseFromParent();
2339
2340 return true;
2341}
2342
2343/// From the definition of llvm.experimental.get.vector.length,
2344/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
/// Returns true after rewriting the first EVL recipe that qualifies, false if
/// none does.
// NOTE(review): the signature and the head of the outer block loop (numbered
// 2345-2347) are missing from this listing; the body reads Plan, VF and PSE.
2348 vp_depth_first_deep(Plan.getEntry()))) {
2349 for (VPRecipeBase &R : *VPBB) {
2350 VPValue *AVL;
2351 if (!match(&R, m_EVL(m_VPValue(AVL))))
2352 continue;
2353
2354 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2355 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2356 continue;
2357 ScalarEvolution &SE = *PSE.getSE();
// The rewrite is only done when AVL <= VF (unsigned) is provable.
2358 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2359 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2360 continue;
2361
// NOTE(review): the line that declares Trunc and starts this call (numbered
// 2362) is missing from this listing; the visible arguments are AVL, i32, the
// AVL SCEV type and R's debug location.
2363 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2364 R.getDebugLoc());
// If a new recipe was created, try to fold it using the plan's data layout.
2365 if (Trunc != AVL) {
2366 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2367 const DataLayout &DL = Plan.getDataLayout();
2368 if (VPValue *Folded = tryToFoldLiveIns(*TruncR, TruncR->operands(), DL))
2369 Trunc = Folded;
2370 }
2371 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2372 return true;
2373 }
2374 }
2375 return false;
2376}
2377
// Runs the VF/UF-specific simplifications visible below on Plan and, if any of
// them changed the plan, sets the plan's VF to BestVF.
// NOTE(review): the first and last signature lines (numbered 2378 and 2380)
// are missing from this listing; the body uses Plan, BestVF, BestUF and PSE.
2379 unsigned BestUF,
2381 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2382 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2383
// All three transforms are always attempted; their results are OR-ed together.
2384 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2385 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2386 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2387
2388 if (MadeChange) {
2389 Plan.setVF(BestVF);
2390 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2391 }
2392}
2393
// Drops poison-generating flags from every flag-carrying recipe that
// transitively uses a reduction phi of one of the recurrence kinds checked
// below.
// NOTE(review): the signature (numbered 2394), the range expression of the
// loop (2396) and the end of the recurrence-kind condition (2402) are missing
// from this listing, so the function name and the full set of accepted kinds
// cannot be read from here.
2395 for (VPRecipeBase &R :
2397 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2398 if (!PhiR)
2399 continue;
2400 RecurKind RK = PhiR->getRecurrenceKind();
2401 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2403 continue;
2404
2405 for (VPUser *U : collectUsersRecursively(PhiR))
2406 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2407 RecWithFlags->dropPoisonGeneratingFlags();
2408 }
2409 }
2410}
2411
2412namespace {
// DenseMapInfo specialization that hashes and compares VPSingleDefRecipes by
// their underlying data rather than by pointer identity.
// NOTE(review): lines numbered 2419, 2455, 2467, 2469 and 2472 are missing
// from this listing; the affected expressions are marked below.
2413struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2414 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2415 /// return that source element type.
/// Returns nullptr for every other recipe.
2416 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2417 // All VPInstructions that lower to GEPs must have the i8 source element
2418 // type (as they are PtrAdds), so we omit it.
// NOTE(review): the head of this type-switch expression (2419) is missing.
2420 .Case([](const VPReplicateRecipe *I) -> Type * {
2421 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2422 return GEP->getSourceElementType();
2423 return nullptr;
2424 })
2425 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2426 [](auto *I) { return I->getSourceElementType(); })
2427 .Default([](auto *) { return nullptr; });
2428 }
2429
2430 /// Returns true if recipe \p Def can be safely handed for CSE.
2431 static bool canHandle(const VPSingleDefRecipe *Def) {
2432 // We can extend the list of handled recipes in the future,
2433 // provided we account for the data embedded in them while checking for
2434 // equality or hashing.
2435 auto C = getOpcodeOrIntrinsicID(Def);
2436
2437 // The issue with (Insert|Extract)Value is that the index of the
2438 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2439 // VPlan.
2440 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2441 C->second == Instruction::ExtractValue)))
2442 return false;
2443
2444 // During CSE, we can only handle recipes that don't read from memory: if
2445 // they read from memory, there could be an intervening write to memory
2446 // before the next instance is CSE'd, leading to an incorrect result.
2447 return !Def->mayReadFromMemory();
2448 }
2449
2450 /// Hash the underlying data of \p Def.
2451 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
// NOTE(review): the last argument(s) of this hash_combine call (2455) are
// missing.
2452 hash_code Result = hash_combine(
2453 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2454 getGEPSourceElementType(Def), Def->getScalarType(),
// A predicate, when present, is mixed into the hash; otherwise the induction
// opcode of scalar IV steps is.
2456 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2457 if (RFlags->hasPredicate())
2458 return hash_combine(Result, RFlags->getPredicate());
2459 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2460 return hash_combine(Result, SIVSteps->getInductionOpcode());
2461 return Result;
2462 }
2463
2464 /// Check equality of underlying data of \p L and \p R.
2465 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
// NOTE(review): two clauses of this condition (2467 and 2469) and the head of
// the assert that follows it (2472) are missing.
2466 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2468 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2470 !equal(L->operands(), R->operands()))
2471 return false;
2473 "must have valid opcode info for both recipes");
2474 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2475 if (LFlags->hasPredicate() &&
2476 LFlags->getPredicate() !=
2477 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2478 return false;
2479 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2480 if (LSIV->getInductionOpcode() !=
2481 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2482 return false;
2483 // Recipes in replicate regions implicitly depend on predicate. If either
2484 // recipe is in a replicate region, only consider them equal if both have
2485 // the same parent.
2486 const VPRegionBlock *RegionL = L->getRegion();
2487 const VPRegionBlock *RegionR = R->getRegion();
2488 if (((RegionL && RegionL->isReplicator()) ||
2489 (RegionR && RegionR->isReplicator())) &&
2490 L->getParent() != R->getParent())
2491 return false;
// Finally the scalar result types must agree.
2492 return L->getScalarType() == R->getScalarType();
2493 }
2494};
2495} // end anonymous namespace
2496
2497/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2498/// Plan.
// NOTE(review): the signature (numbered 2499), the declaration of CSEMap
// (2501) and the head of the block traversal (2503, 2505) are missing from
// this listing.
2500 VPDominatorTree VPDT(Plan);
2502
2504 Plan.getEntry());
2506 for (VPRecipeBase &R : *VPBB) {
2507 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2508 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2509 continue;
// An equivalent recipe V was recorded earlier: reuse it if it dominates Def.
2510 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2511 // V must dominate Def for a valid replacement.
2512 if (!VPDT.dominates(V->getParent(), VPBB))
2513 continue;
2514 // Only keep flags present on both V and Def.
2515 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2516 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2517 Def->replaceAllUsesWith(V);
2518 continue;
2519 }
// First occurrence: record Def as the representative.
2520 CSEMap[Def] = Def;
2521 }
2522 }
2523}
2524
2525/// Return true if we do not know how to (mechanically) hoist or sink a
2526/// non-memory or memory recipe \p R out of a loop region.
// NOTE(review): the first signature line (numbered 2527) and the statement
// controlled by the `if` below (2530) are missing from this listing. A call in
// this file passes (R, FirstBB, LastBB).
2528 VPBasicBlock *LastBB) {
// Recipes that are not replicates, or that do not read memory, take the
// (missing) early path.
2529 if (!isa<VPReplicateRecipe>(R) || !R.mayReadFromMemory())
2531
2532 // Check that the load doesn't alias with stores between FirstBB and LastBB.
// A load without a known memory location cannot be moved.
2533 auto MemLoc = vputils::getMemoryLocation(R);
2534 return !MemLoc || !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB);
2535}
2536
2537/// Move loop-invariant recipes out of the vector loop region in \p Plan.
/// Recipes are first hoisted into the vector preheader where possible; then
/// recipes whose users all live in one exit block are sunk into that block.
2538static void licm(VPlan &Plan) {
2539 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2540
2541 // Hoist any loop invariant recipes from the vector loop region to the
2542 // preheader. Perform a shallow traversal of the vector loop region, to
2543 // exclude recipes in replicate regions. Since the top-level blocks in the
2544 // vector loop region are guaranteed to execute if the vector pre-header is,
2545 // we don't need to check speculation safety.
2546 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2547 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2548 "Expected vector prehader's successor to be the vector loop region");
// NOTE(review): the head of the hoisting loop that introduces VPBB (numbered
// 2549) is missing from this listing.
2550 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2551 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2552 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2553 LoopRegion->getExitingBasicBlock()))
2554 continue;
// Every operand must be defined outside loop regions for R to be invariant.
2555 if (any_of(R.operands(), [](VPValue *Op) {
2556 return !Op->isDefinedOutsideLoopRegions();
2557 }))
2558 continue;
2559 R.moveBefore(*Preheader, Preheader->end());
2560 }
2561 }
2562
// The dominator tree is only needed by the assert in the sinking loop.
2563#ifndef NDEBUG
2564 VPDominatorTree VPDT(Plan);
2565#endif
2566 // Sink recipes with no users inside the vector loop region if all users are
2567 // in the same exit block of the region.
2568 // TODO: Extend to sink recipes from inner loops.
// NOTE(review): the head of the sinking traversal that introduces VPBB
// (numbered 2569 and 2571) is missing from this listing.
2570 LoopRegion->getEntry());
2572 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2573 if (vputils::cannotHoistOrSinkRecipe(R, /*Sinking=*/true))
2574 continue;
2575
2576 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2577 assert(!RepR->isPredicated() &&
2578 "Expected prior transformation of predicated replicates to "
2579 "replicate regions");
2580 // narrowToSingleScalarRecipes should have already maximally narrowed
2581 // replicates to single-scalar replicates.
2582 // TODO: When unrolling, replicateByVF doesn't handle sunk
2583 // non-single-scalar replicates correctly.
2584 if (!RepR->isSingleScalar())
2585 continue;
2586 }
2587
2588 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2589 // support recipes with multiple defined values (e.g., interleaved loads).
2590 auto *Def = cast<VPSingleDefRecipe>(&R);
2591
2592 // Cannot sink the recipe if the user is defined in a loop region or a
2593 // non-successor of the vector loop region. Cannot sink if user is a phi
2594 // either.
// The lambda also records the users' common block in SinkBB as a side effect.
2595 VPBasicBlock *SinkBB = nullptr;
2596 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2597 auto *UserR = cast<VPRecipeBase>(U);
2598 VPBasicBlock *Parent = UserR->getParent();
2599 // TODO: Support sinking when users are in multiple blocks.
2600 if (SinkBB && SinkBB != Parent)
2601 return true;
2602 SinkBB = Parent;
2603 // TODO: If the user is a PHI node, we should check the block of
2604 // incoming value. Support PHI node users if needed.
2605 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2606 Parent->getSinglePredecessor() != LoopRegion;
2607 }))
2608 continue;
2609
// With no users at all, fall back to the region's single successor.
2610 if (!SinkBB)
2611 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2612
2613 // TODO: This will need to be a check instead of an assert after
2614 // conditional branches in vectorized loops are supported.
2615 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2616 "Defining block must dominate sink block");
2617 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2618 // just moving.
2619 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2620 }
2621 }
2622}
2623
// Narrows recipes to the bit widths recorded in MinBWs for their underlying
// instructions: operands are truncated and results zero-extended back to the
// original type.
// NOTE(review): many lines of this function are missing from this listing
// (numbered 2624, 2632, 2634-2635, 2637-2638, 2651, 2674, 2676 and 2714): the
// start of the signature, the declaration of ProcessedTruncs, the head of the
// block loop, three conditions, and the builder expressions that create the
// zero-extends.
2625 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2626 if (Plan.hasScalarVFOnly())
2627 return;
2628 // Keep track of created truncates, so they can be re-used. Note that we
2629 // cannot use RAUW after creating a new truncate, as this could make
2630 // other uses have different types for their operands, making them invalidly
2631 // typed.
2633 VPBasicBlock *PH = Plan.getVectorPreheader();
2636 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2639 continue;
2640
// A lookup result of zero means there is no narrower width for this recipe.
2641 VPValue *ResultVPV = R.getVPSingleValue();
2642 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2643 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2644 if (!NewResSizeInBits)
2645 continue;
2646
2647 // If the value wasn't vectorized, we must maintain the original scalar
2648 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2649 // skip casts which do not need to be handled explicitly here, as
2650 // redundant casts will be removed during recipe simplification.
2652 continue;
2653
2654 Type *OldResTy = ResultVPV->getScalarType();
2655 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2656 assert(OldResTy->isIntegerTy() && "only integer types supported");
2657 (void)OldResSizeInBits;
2658
2659 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2660
2661 // Any wrapping introduced by shrinking this operation shouldn't be
2662 // considered undefined behavior. So, we can't unconditionally copy
2663 // arithmetic wrapping flags to VPW.
2664 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2665 VPW->dropPoisonGeneratingFlags();
2666
2667 assert((OldResSizeInBits != NewResSizeInBits ||
2668 match(&R, m_ICmp(m_VPValue(), m_VPValue()))) &&
2669 "Only ICmps should not need extending the result.");
2670 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2671
2672 // For loads/intrinsics we don't recreate the recipe; just wrap the
2673 // original wide result in a ZExt to OldResTy.
2675 if (OldResSizeInBits != NewResSizeInBits) {
2677 Instruction::ZExt, ResultVPV, OldResTy);
// RAUW also rewrote the extend's own operand; point it back at the result.
2678 ResultVPV->replaceAllUsesWith(Ext);
2679 Ext->setOperand(0, ResultVPV);
2680 }
2681 continue;
2682 }
2683
2684 // Shrink operands by introducing truncates as needed.
// For a select, operand 0 is left out of the truncation.
2685 unsigned StartIdx =
2686 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2687 SmallVector<VPValue *> NewOperands(R.operands());
2688 for (VPValue *&Op : drop_begin(NewOperands, StartIdx)) {
2689 unsigned OpSizeInBits = Op->getScalarType()->getScalarSizeInBits();
2690 if (OpSizeInBits == NewResSizeInBits)
2691 continue;
2692 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
// One truncate per operand value is created and then reused.
2693 auto [ProcessedIter, Inserted] = ProcessedTruncs.try_emplace(Op);
2694 if (Inserted) {
// Truncates of VPIRValue operands go into the vector preheader; all others
// are placed right before R.
2695 VPBuilder Builder;
2696 if (isa<VPIRValue>(Op))
2697 Builder.setInsertPoint(PH);
2698 else
2699 Builder.setInsertPoint(&R);
2700 ProcessedIter->second =
2701 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2702 }
2703 Op = ProcessedIter->second;
2704 }
2705
2706 auto *NWR = cast<VPWidenRecipe>(&R)->cloneWithOperands(NewOperands);
2707 NWR->insertBefore(&R);
2708
2709 // Wrap NWR in a ZExt to preserve the original wide type for downstream
2710 // users (unless this is an ICmp, which produces i1 regardless).
2711 VPValue *Replacement = NWR->getVPSingleValue();
2712 if (OldResSizeInBits != NewResSizeInBits)
2713 Replacement =
2715 .createWidenCast(Instruction::ZExt, Replacement, OldResTy)
2716 ->getVPSingleValue();
2717 ResultVPV->replaceAllUsesWith(Replacement);
2718 R.eraseFromParent();
2719 }
2720 }
2721}
2722
// Removes BranchOnCond terminators whose condition is a constant true or
// false, disconnects the successor that is never taken, and then clears out
// blocks that are no longer reachable. With OnlyLatches set, only blocks that
// VPBlockUtils::isLatch accepts are considered.
// NOTE(review): lines numbered 2730, 2767-2768, 2780 and 2783 are missing from
// this listing: the definitions of AllBlocks and Reachable, one statement in
// the successor loop, and the range of the DeadBB loop.
2723void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
// The dominator tree is only built when it is needed for the latch check.
2724 std::optional<VPDominatorTree> VPDT;
2725 if (OnlyLatches)
2726 VPDT.emplace(Plan);
2727
2728 // Collect all blocks before modifying the CFG so we can identify unreachable
2729 // ones after constant branch removal.
2731
2732 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2733 VPValue *Cond;
2734 // Skip blocks that are not terminated by BranchOnCond.
2735 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2736 continue;
2737
2738 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2739 continue;
2740
2741 assert(VPBB->getNumSuccessors() == 2 &&
2742 "Two successors expected for BranchOnCond");
// A true condition removes successor 1, a false condition removes successor 0;
// non-constant conditions are left alone.
2743 unsigned RemovedIdx;
2744 if (match(Cond, m_True()))
2745 RemovedIdx = 1;
2746 else if (match(Cond, m_False()))
2747 RemovedIdx = 0;
2748 else
2749 continue;
2750
2751 VPBasicBlock *RemovedSucc =
2752 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2753 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2754 "There must be a single edge between VPBB and its successor");
2755 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2756 // these recipes.
2757 for (VPRecipeBase &R : RemovedSucc->phis())
2758 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2759
2760 // Disconnect blocks and remove the terminator.
2761 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2762 VPBB->back().eraseFromParent();
2763 }
2764
2765 // Compute which blocks are still reachable from the entry after constant
2766 // branch removal.
2769
2770 // Detach all unreachable blocks from their successors, removing their recipes
2771 // and incoming values from phi recipes.
// Tmp stands in for values defined in dead blocks while their recipes are
// erased.
2772 VPSymbolicValue Tmp(nullptr);
2773 for (VPBlockBase *B : AllBlocks) {
2774 if (Reachable.contains(B))
2775 continue;
// Iterate over a copy of the successor list (to_vector).
2776 for (VPBlockBase *Succ : to_vector(B->successors())) {
2777 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2778 for (VPRecipeBase &R : SuccBB->phis())
2779 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2781 }
2782 for (VPBasicBlock *DeadBB :
2784 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2785 for (VPValue *Def : R.definedValues())
2786 Def->replaceAllUsesWith(&Tmp);
2787 R.eraseFromParent();
2788 }
2789 }
2790 }
2791}
2792
2812
2813// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2814// the loop terminator with a branch-on-cond recipe with the negated
2815// active-lane-mask as operand. Note that this turns the loop into an
2816// uncountable one. Only the existing terminator is replaced, all other existing
2817// recipes/users remain unchanged, except for poison-generating flags being
2818// dropped from the canonical IV increment. Return the created
2819// VPActiveLaneMaskPHIRecipe.
2820//
2821// The function adds the following recipes:
2822//
2823// vector.ph:
2824// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2825// %EntryALM = active-lane-mask %EntryInc, TC
2826//
2827// vector.body:
2828// ...
2829// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2830// ...
2831// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2832// %ALM = active-lane-mask %InLoopInc, TC
2833// %Negated = Not %ALM
2834// branch-on-cond %Negated
2835//
2838 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2839 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2840 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2841 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2842 // TODO: Check if dropping the flags is needed.
2843 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
2844 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2845 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2846 // we have to take unrolling into account. Each part needs to start at
2847 // Part * VF
2848 auto *VecPreheader = Plan.getVectorPreheader();
2849 VPBuilder Builder(VecPreheader);
2850
2851 // Create the ActiveLaneMask instruction using the correct start values.
2852 VPValue *TC = Plan.getTripCount();
2853 VPValue *VF = &Plan.getVF();
2854
2855 auto *EntryIncrement = Builder.createOverflowingOp(
2856 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2857 DL, "index.part.next");
2858
2859 // Create the active lane mask instruction in the VPlan preheader.
2860 VPValue *ALMMultiplier =
2861 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2862 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2863 {EntryIncrement, TC, ALMMultiplier}, DL,
2864 "active.lane.mask.entry");
2865
2866 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2867 // preheader ActiveLaneMask instruction.
2868 auto *LaneMaskPhi =
2870 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
2871 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
2872
2873 // Create the active lane mask for the next iteration of the loop before the
2874 // original terminator.
2875 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2876 Builder.setInsertPoint(OriginalTerminator);
2877 auto *InLoopIncrement = Builder.createOverflowingOp(
2879 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2880 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2881 {InLoopIncrement, TC, ALMMultiplier}, DL,
2882 "active.lane.mask.next");
2883 LaneMaskPhi->addBackedgeValue(ALM);
2884
2885 // Replace the original terminator with BranchOnCond. We have to invert the
2886 // mask here because a true condition means jumping to the exit block.
2887 auto *NotMask = Builder.createNot(ALM, DL);
2888 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2889 OriginalTerminator->eraseFromParent();
2890 return LaneMaskPhi;
2891}
2892
2894 bool UseActiveLaneMaskForControlFlow) {
2895 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2896 auto *WideCanonicalIV =
2898 assert(WideCanonicalIV &&
2899 "Must have widened canonical IV when tail folding!");
2900 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2901 VPSingleDefRecipe *LaneMask;
2902 if (UseActiveLaneMaskForControlFlow) {
2903 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2904 } else {
2905 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2906 VPValue *ALMMultiplier =
2907 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2908 LaneMask =
2909 B.createNaryOp(VPInstruction::ActiveLaneMask,
2910 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2911 nullptr, "active.lane.mask");
2912 }
2913
2914 // Walk users of WideCanonicalIV and replace the header mask of the form
2915 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2916 // removing the old one to ensure there is always only a single header mask.
2917 HeaderMask->replaceAllUsesWith(LaneMask);
2918 HeaderMask->eraseFromParent();
2919}
2920
/// Pattern matcher that recognizes the mask \p In either on its own or as the
/// first operand of a logical-and pattern (m_LogicalAnd(In, X)). On a
/// successful match, \p Out is set to nullptr when the matched value is exactly
/// \p In, and is bound to X otherwise.
template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
  Op0_t In;
  // Held by reference: match() is const, yet it must write the remaining mask
  // back into the caller's variable.
  Op1_t &Out;

  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}

  template <typename OpTy> bool match(OpTy *V) const {
    // The value is exactly the mask, so nothing remains once it is removed.
    if (m_Specific(In).match(V)) {
      Out = nullptr;
      return true;
    }
    // Otherwise require (logical-and In, X) and capture X in Out.
    return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
  }
};
2935
2936/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2937/// Returns the remaining part \p Out if so, or nullptr otherwise.
2938template <typename Op0_t, typename Op1_t>
2939static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2940 Op1_t &Out) {
2941 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2942}
2943
2944static std::optional<Intrinsic::ID> getVPDivRemIntrinsic(Intrinsic::ID IntrID) {
2945 switch (IntrID) {
2946 case Intrinsic::masked_udiv:
2947 return Intrinsic::vp_udiv;
2948 case Intrinsic::masked_sdiv:
2949 return Intrinsic::vp_sdiv;
2950 case Intrinsic::masked_urem:
2951 return Intrinsic::vp_urem;
2952 case Intrinsic::masked_srem:
2953 return Intrinsic::vp_srem;
2954 default:
2955 return std::nullopt;
2956 }
2957}
2958
2959/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2960/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2961/// recipe could be created.
2962/// \p HeaderMask Header Mask.
2963/// \p CurRecipe Recipe to be transformed.
2964/// \p EVL The explicit vector length parameter of vector-predication
2965/// intrinsics.
2967 VPRecipeBase &CurRecipe, VPValue &EVL) {
2968 VPlan *Plan = CurRecipe.getParent()->getPlan();
2969 DebugLoc DL = CurRecipe.getDebugLoc();
2970 VPValue *Addr, *Mask, *EndPtr;
2971
2972 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
2973 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2974 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2975 EVLEndPtr->insertBefore(&CurRecipe);
2976 // Cast EVL (i32) to match the VF operand's type.
2977 VPValue *EVLAsVF = VPBuilder(EVLEndPtr).createScalarZExtOrTrunc(
2978 &EVL, EVLEndPtr->getOperand(1)->getScalarType(), EVL.getScalarType(),
2980 EVLEndPtr->setOperand(1, EVLAsVF);
2981 return EVLEndPtr;
2982 };
2983
2984 auto GetVPReverse = [&CurRecipe, &EVL, Plan,
2986 if (!V)
2987 return nullptr;
2988 auto *Reverse = new VPWidenIntrinsicRecipe(
2989 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
2990 V->getScalarType(), {}, {}, DL);
2991 Reverse->insertBefore(&CurRecipe);
2992 return Reverse;
2993 };
2994
2995 if (match(&CurRecipe,
2996 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
2997 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2998 EVL, Mask);
2999
3000 if (match(&CurRecipe,
3001 m_MaskedLoad(m_VPValue(EndPtr),
3002 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3003 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3004 Mask = GetVPReverse(Mask);
3005 Addr = AdjustEndPtr(EndPtr);
3006 auto *LoadR = new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
3007 Addr, EVL, Mask);
3008 LoadR->insertBefore(&CurRecipe);
3009 VPValue *Poison =
3010 Plan->getOrAddLiveIn(PoisonValue::get(LoadR->getScalarType()));
3011 return new VPWidenIntrinsicRecipe(Intrinsic::vector_splice_left,
3012 {Poison, LoadR, &EVL},
3013 LoadR->getScalarType(), {}, {}, DL);
3014 }
3015
3016 VPValue *Stride;
3018 m_VPValue(Addr), m_VPValue(Stride),
3019 m_RemoveMask(HeaderMask, Mask),
3020 m_TruncOrSelf(m_Specific(&Plan->getVF()))))) {
3021 if (!Mask)
3022 Mask = Plan->getTrue();
3023 auto *NewLoad = cast<VPWidenMemIntrinsicRecipe>(&CurRecipe)->clone();
3024 NewLoad->setOperand(2, Mask);
3025 NewLoad->setOperand(3, &EVL);
3026 return NewLoad;
3027 }
3028
3029 VPValue *StoredVal;
3030 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3031 m_RemoveMask(HeaderMask, Mask))))
3032 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3033 StoredVal, EVL, Mask);
3034
3035 if (match(&CurRecipe,
3036 m_MaskedStore(m_VPValue(EndPtr), m_VPValue(StoredVal),
3037 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3038 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3039 Mask = GetVPReverse(Mask);
3040 Addr = AdjustEndPtr(EndPtr);
3041 VPValue *Poison =
3042 Plan->getOrAddLiveIn(PoisonValue::get(StoredVal->getScalarType()));
3043 auto *SpliceR = new VPWidenIntrinsicRecipe(
3044 Intrinsic::vector_splice_right, {StoredVal, Poison, &EVL},
3045 StoredVal->getScalarType(), {}, {}, DL);
3046 SpliceR->insertBefore(&CurRecipe);
3047 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3048 SpliceR, EVL, Mask);
3049 }
3050
3051 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3052 if (Rdx->isConditional() &&
3053 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3054 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3055
3056 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3057 if (Interleave->getMask() &&
3058 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3059 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3060
3061 VPValue *LHS, *RHS;
3062 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3063 m_VPValue(RHS))))
3064 return new VPWidenIntrinsicRecipe(
3065 Intrinsic::vp_merge, {Mask ? Mask : Plan->getTrue(), LHS, RHS, &EVL},
3066 LHS->getScalarType(), {}, {}, DL);
3067
3068 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3069 Type *Ty = CurRecipe.getVPSingleValue()->getScalarType();
3070 VPValue *ZExt =
3071 VPBuilder(&CurRecipe)
3072 .createScalarZExtOrTrunc(&EVL, Ty, EVL.getScalarType(), DL);
3073 return new VPInstruction(
3074 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3075 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3076 }
3077
3078 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
3079 if (match(&CurRecipe,
3081 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
3082 return new VPWidenIntrinsicRecipe(Intrinsic::vp_merge,
3083 {RHS, Plan->getTrue(), LHS, &EVL},
3084 LHS->getScalarType(), {}, {}, DL);
3085
3086 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(&CurRecipe))
3087 if (auto VPID = getVPDivRemIntrinsic(IntrR->getVectorIntrinsicID()))
3088 if (match(IntrR->getOperand(2), m_RemoveMask(HeaderMask, Mask)))
3089 return new VPWidenIntrinsicRecipe(*VPID,
3090 {IntrR->getOperand(0),
3091 IntrR->getOperand(1),
3092 Mask ? Mask : Plan->getTrue(), &EVL},
3093 IntrR->getScalarType(), {}, {}, DL);
3094
3095 return nullptr;
3096}
3097
3098/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3099/// The transforms here need to preserve the original semantics.
3101 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3102 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3105 m_VPValue(EVL))) &&
3106 match(EVL, m_EVL(m_VPValue()))) {
3107 HeaderMask = R.getVPSingleValue();
3108 break;
3109 }
3110 }
3111 if (!HeaderMask)
3112 return;
3113
3114 SmallVector<VPRecipeBase *> OldRecipes;
3115 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3117 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, *EVL)) {
3118 NewR->insertBefore(R);
3119 for (auto [Old, New] :
3120 zip_equal(R->definedValues(), NewR->definedValues()))
3121 Old->replaceAllUsesWith(New);
3122 OldRecipes.push_back(R);
3123 }
3124 }
3125
3126 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3127 // False, EVL)
3128 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3129 VPValue *Mask;
3130 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3131 auto *LogicalAnd = cast<VPInstruction>(U);
3132 auto *Merge = new VPWidenIntrinsicRecipe(
3133 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3134 Mask->getScalarType(), {}, {}, LogicalAnd->getDebugLoc());
3135 Merge->insertBefore(LogicalAnd);
3136 LogicalAnd->replaceAllUsesWith(Merge);
3137 OldRecipes.push_back(LogicalAnd);
3138 }
3139 }
3140
3141 // Fold the following splice patterns into vp.reverse for reverse accesses:
3142 // vector.reverse(splice.left(poison, x, evl)) -> vp.reverse(x, true, evl)
3143 // splice.right(vector.reverse(x), poison, evl) -> vp.reverse(x, true, evl)
3144 for (VPUser *U : collectUsersRecursively(EVL)) {
3145 VPValue *X;
3146 if (!match(U,
3149 m_Poison(), m_VPValue(X), m_Specific(EVL))),
3151 m_Reverse(m_VPValue(X)), m_Poison(), m_Specific(EVL)))))
3152 continue;
3153
3154 auto *Def = cast<VPSingleDefRecipe>(U);
3155 auto *VPReverse = new VPWidenIntrinsicRecipe(
3156 Intrinsic::experimental_vp_reverse, {X, Plan.getTrue(), EVL},
3157 X->getScalarType(), {}, {}, Def->getDebugLoc());
3158 VPReverse->insertBefore(Def);
3159 Def->replaceAllUsesWith(VPReverse);
3160 OldRecipes.push_back(Def);
3161 }
3162
3163 for (VPRecipeBase *R : reverse(OldRecipes)) {
3164 SmallVector<VPValue *> PossiblyDead(R->operands());
3165 R->eraseFromParent();
3166 for (VPValue *Op : PossiblyDead)
3168 }
3169}
3170
3171/// After replacing the canonical IV with an EVL-based IV, fix up recipes that use
3172/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3173/// iteration.
3174static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3175 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3176 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3177
3178 // EVL is i32 but VF/VFxUF are IdxTy. Convert as needed.
3179 VPValue *EVLAsIdx =
3183
3184 assert(all_of(Plan.getVF().users(),
3185 [&Plan](VPUser *U) {
3186 auto IsAllowedUser =
3187 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
3188 VPWidenIntOrFpInductionRecipe,
3189 VPWidenMemIntrinsicRecipe>;
3190 if (match(U, m_Trunc(m_Specific(&Plan.getVF()))))
3191 return all_of(cast<VPSingleDefRecipe>(U)->users(),
3192 IsAllowedUser);
3193 return IsAllowedUser(U);
3194 }) &&
3195 "User of VF that we can't transform to EVL.");
3196 Plan.getVF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3198 });
3199
3200 assert(all_of(Plan.getVFxUF().users(),
3202 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3203 m_Specific(&Plan.getVFxUF())),
3205 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3206 "increment of the canonical induction.");
3207 Plan.getVFxUF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3208 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3209 // canonical induction must not be updated.
3211 });
3212
3213 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3214 // contained.
3215 bool ContainsFORs =
3217 if (ContainsFORs) {
3218 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3219 VPValue *MaxEVL = &Plan.getVF();
3220 // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
3221 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3222 MaxEVL = Builder.createScalarZExtOrTrunc(
3223 MaxEVL, Type::getInt32Ty(Plan.getContext()), MaxEVL->getScalarType(),
3225
3226 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3227 VPValue *PrevEVL = Builder.createScalarPhi(
3228 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3229
3232 for (VPRecipeBase &R : *VPBB) {
3233 VPValue *V1, *V2;
3234 if (!match(&R,
3236 m_VPValue(V1), m_VPValue(V2))))
3237 continue;
3238 VPValue *Imm = Plan.getOrAddLiveIn(
3241 Intrinsic::experimental_vp_splice,
3242 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3243 R.getVPSingleValue()->getScalarType(), {}, {}, R.getDebugLoc());
3244 VPSplice->insertBefore(&R);
3245 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3246 }
3247 }
3248 }
3249
3250 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3251 if (!HeaderMask)
3252 return;
3253
3254 // Ensure that any reduction that uses a select to mask off tail lanes does so
3255 // in the vector loop, not the middle block, since EVL tail folding can have
3256 // tail elements in the penultimate iteration.
3257 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3258 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3259 m_VPValue(), m_VPValue()))))
3260 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3261 Plan.getVectorLoopRegion();
3262 return true;
3263 }));
3264
3265 // Replace header masks with a mask equivalent to predicating by EVL:
3266 //
3267 // icmp ule widen-canonical-iv backedge-taken-count
3268 // ->
3269 // icmp ult step-vector, EVL
3270 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3271 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3272 Type *EVLType = EVL.getScalarType();
3273 VPValue *EVLMask = Builder.createICmp(
3275 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3276 HeaderMask->replaceAllUsesWith(EVLMask);
3277}
3278
3279/// Converts a tail folded vector loop region to step by
3280/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3281/// iteration.
3282///
3283/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3284/// replace all uses of the canonical IV except for the canonical IV
3285/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3286/// only for loop iterations counting after this transformation.
3287///
3288/// - The header mask is replaced with a header mask based on the EVL.
3289///
3290/// - Plans with FORs have a new phi added to keep track of the EVL of the
3291/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3292/// @llvm.vp.splice.
3293///
3294/// The function uses the following definitions:
3295/// %StartV is the canonical induction start value.
3296///
3297/// The function adds the following recipes:
3298///
3299/// vector.ph:
3300/// ...
3301///
3302/// vector.body:
3303/// ...
3304/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3305/// [ %NextIter, %vector.body ]
3306/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3307/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3308/// ...
3309/// %OpEVL = cast i32 %VPEVL to IVSize
3310/// %NextIter = add IVSize %OpEVL, %CurrentIter
3311/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3312/// ...
3313///
3314/// If MaxSafeElements is provided, the function adds the following recipes:
3315/// vector.ph:
3316/// ...
3317///
3318/// vector.body:
3319/// ...
3320/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3321/// [ %NextIter, %vector.body ]
3322/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3323/// %cmp = cmp ult %AVL, MaxSafeElements
3324/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3325/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3326/// ...
3327/// %OpEVL = cast i32 %VPEVL to IVSize
3328/// %NextIter = add IVSize %OpEVL, %CurrentIter
3329/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3330/// ...
3331///
3333 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3334 if (Plan.hasScalarVFOnly())
3335 return;
3336 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3337 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3338
3339 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3340 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3341 VPValue *StartV = Plan.getZero(CanIVTy);
3342 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3343
3344 // Create the CurrentIteration recipe in the vector loop.
3345 auto *CurrentIteration =
3347 CurrentIteration->insertBefore(*Header, Header->begin());
3348 VPBuilder Builder(Header, Header->getFirstNonPhi());
3349 // Create the AVL (application vector length), starting from TC -> 0 in steps
3350 // of EVL.
3351 VPPhi *AVLPhi = Builder.createScalarPhi(
3352 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3353 VPValue *AVL = AVLPhi;
3354
3355 if (MaxSafeElements) {
3356 // Support for MaxSafeDist for correct loop emission.
3357 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3358 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3359 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3360 "safe_avl");
3361 }
3362 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3363 DebugLoc::getUnknown(), "evl");
3364
3365 Builder.setInsertPoint(CanonicalIVIncrement);
3366 VPValue *OpVPEVL = VPEVL;
3367
3368 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3369 OpVPEVL = Builder.createScalarZExtOrTrunc(
3370 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3371
3372 auto *NextIter = Builder.createAdd(
3373 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3374 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3375 CurrentIteration->addBackedgeValue(NextIter);
3376
3377 VPValue *NextAVL =
3378 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3379 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3380 AVLPhi->addIncoming(NextAVL);
3381
3382 fixupVFUsersForEVL(Plan, *VPEVL);
3383 removeDeadRecipes(Plan);
3384
3385 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3386 // except for the canonical IV increment.
3387 CanonicalIV->replaceAllUsesWith(CurrentIteration);
3388 CanonicalIVIncrement->setOperand(0, CanonicalIV);
3389 // TODO: support unroll factor > 1.
3390 Plan.setUF(1);
3391}
3392
3394 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3395 // There should be only one VPCurrentIteration in the entire plan.
3396 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3397
3400 for (VPRecipeBase &R : VPBB->phis())
3401 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3402 assert(!CurrentIteration &&
3403 "Found multiple CurrentIteration. Only one expected");
3404 CurrentIteration = PhiR;
3405 }
3406
3407 // Early return if it is not variable-length stepping.
3408 if (!CurrentIteration)
3409 return;
3410
3411 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3412 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3413
3414 // Convert CurrentIteration to concrete recipe.
3415 auto *ScalarR =
3416 VPBuilder(CurrentIteration)
3418 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3419 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3420 CurrentIteration->replaceAllUsesWith(ScalarR);
3421 CurrentIteration->eraseFromParent();
3422
3423 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
3424 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3425 if (auto *CanIVInc = findUserOf(
3426 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3427 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3428 CanIVInc->eraseFromParent();
3429 }
3430}
3431
3433 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3434 if (!LoopRegion)
3435 return;
3436 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3437 if (Header->empty())
3438 return;
3439 // The EVL IV is always at the beginning.
3440 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3441 if (!EVLPhi)
3442 return;
3443
3444 // Bail if not an EVL tail folded loop.
3445 VPValue *AVL;
3446 if (!match(EVLPhi->getBackedgeValue(),
3447 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3448 return;
3449
3450 // The AVL may be capped to a safe distance.
3451 VPValue *SafeAVL, *UnsafeAVL;
3452 if (match(AVL,
3454 m_VPValue(SafeAVL)),
3455 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3456 AVL = UnsafeAVL;
3457
3458 VPValue *AVLNext;
3459 [[maybe_unused]] bool FoundAVLNext =
3461 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3462 assert(FoundAVLNext && "Didn't find AVL backedge?");
3463
3464 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3465 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3466 if (match(LatchBr, m_BranchOnCond(m_True())))
3467 return;
3468
3469 VPValue *CanIVInc;
3470 [[maybe_unused]] bool FoundIncrement = match(
3471 LatchBr,
3473 m_Specific(&Plan.getVectorTripCount()))));
3474 assert(FoundIncrement &&
3475 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3476 m_Specific(&Plan.getVFxUF()))) &&
3477 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3478 "trip count");
3479
3480 Type *AVLTy = AVLNext->getScalarType();
3481 VPBuilder Builder(LatchBr);
3482 LatchBr->setOperand(
3483 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3484}
3485
3487 VPlan &Plan, PredicatedScalarEvolution &PSE,
3488 const DenseMap<Value *, const SCEV *> &StridesMap) {
3489 // Replace VPValues for known constant strides guaranteed by predicated scalar
3490 // evolution that are guaranteed to be guarded by the runtime checks; that is,
3491 // blocks dominated by the vector preheader.
3492 assert(!Plan.getVectorLoopRegion() &&
3493 "expected to run before loop regions are created");
3494 VPDominatorTree VPDT(Plan);
3495 VPBlockBase *Preheader = Plan.getEntry()->getSuccessors()[1];
3496 auto CanUseVersionedStride = [&VPDT, Preheader](VPUser &U, unsigned) {
3497 auto *R = cast<VPRecipeBase>(&U);
3498 VPBlockBase *Parent = R->getParent();
3499 return VPDT.dominates(Preheader, Parent);
3500 };
3501 ValueToSCEVMapTy RewriteMap;
3502 for (const SCEV *Stride : StridesMap.values()) {
3503 using namespace SCEVPatternMatch;
3504 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3505 const APInt *StrideConst;
3506 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3507 // Only handle constant strides for now.
3508 continue;
3509
3510 auto *CI = Plan.getConstantInt(*StrideConst);
3511 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3512 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3513
3514 // The versioned value may not be used in the loop directly but through a
3515 // sext/zext. Add new live-ins in those cases.
3516 for (Value *U : StrideV->users()) {
3518 continue;
3519 VPValue *StrideVPV = Plan.getLiveIn(U);
3520 if (!StrideVPV)
3521 continue;
3522 unsigned BW = U->getType()->getScalarSizeInBits();
3523 APInt C =
3524 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3525 VPValue *CI = Plan.getConstantInt(C);
3526 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3527 }
3528 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3529 }
3530
3531 for (VPRecipeBase &R : *Plan.getEntry()) {
3532 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3533 if (!ExpSCEV)
3534 continue;
3535 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3536 auto *NewSCEV =
3537 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3538 if (NewSCEV != ScevExpr) {
3539 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3540 ExpSCEV->replaceAllUsesWith(NewExp);
3541 if (Plan.getTripCount() == ExpSCEV)
3542 Plan.resetTripCount(NewExp);
3543 }
3544 }
3545}
3546
3548 // Collect recipes in the backward slice of `Root` that may generate a poison
3549 // value that is used after vectorization.
3551 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3553 Worklist.push_back(Root);
3554
3555 // Traverse the backward slice of Root through its use-def chain.
3556 while (!Worklist.empty()) {
3557 VPRecipeBase *CurRec = Worklist.pop_back_val();
3558
3559 if (!Visited.insert(CurRec).second)
3560 continue;
3561
3562 // Prune search if we find another recipe generating a widen memory
3563 // instruction. Widen memory instructions involved in address computation
3564 // will lead to gather/scatter instructions, which don't need to be
3565 // handled.
3567 VPHeaderPHIRecipe>(CurRec))
3568 continue;
3569
3570 // This recipe contributes to the address computation of a widen
3571 // load/store. If the underlying instruction has poison-generating flags,
3572 // drop them directly.
3573 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3574 VPValue *A, *B;
3575 // Dropping disjoint from an OR may yield incorrect results, as some
3576 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3577 // for dependence analysis). Instead, replace it with an equivalent Add.
3578 // This is possible as all users of the disjoint OR only access lanes
3579 // where the operands are disjoint or poison otherwise.
3580 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3581 RecWithFlags->isDisjoint()) {
3582 VPBuilder Builder(RecWithFlags);
3583 VPInstruction *New =
3584 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3585 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3586 RecWithFlags->replaceAllUsesWith(New);
3587 RecWithFlags->eraseFromParent();
3588 CurRec = New;
3589 } else
3590 RecWithFlags->dropPoisonGeneratingFlags();
3591 } else {
3594 (void)Instr;
3595 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3596 "found instruction with poison generating flags not covered by "
3597 "VPRecipeWithIRFlags");
3598 }
3599
3600 // Add new definitions to the worklist.
3601 for (VPValue *Operand : CurRec->operands())
3602 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3603 Worklist.push_back(OpDef);
3604 }
3605 });
3606
3607 // We want to exclude the tail folding case, as we don't need to drop flags
3608 // for operations computing the first lane in this case: the first lane of the
3609 // header mask must always be true.
3610 auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
3611 return Mask && !vputils::isHeaderMask(Mask, Plan);
3612 };
3613
3614 // Traverse all the recipes in the VPlan and collect the poison-generating
3615 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3616 // VPInterleaveRecipe.
3617 auto Iter =
3620 for (VPRecipeBase &Recipe : *VPBB) {
3621 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3622 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3623 if (AddrDef && WidenRec->isConsecutive() &&
3624 IsNotHeaderMask(WidenRec->getMask()))
3625 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3626 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3627 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3628 if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
3629 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3630 }
3631 }
3632 }
3633}
3634
// Replaces the widened memory recipes of each interleave group in
// \p InterleaveGroups with a single VPInterleaveRecipe per group, inserted
// before the recipe of the group's insert position. Does nothing when no
// groups are given. \p EpilogueAllowed only feeds the decision whether a
// group needs a mask for gaps.
// NOTE(review): the declarator line and a few interior lines (e.g. the
// declarations of IRMemberToRecipe and NW) are not visible in this listing.
3636 VPlan &Plan,
3638 &InterleaveGroups,
3639 const bool &EpilogueAllowed) {
3640 if (InterleaveGroups.empty())
3641 return;
3642
     // Record, for every widened memory recipe visited, the IR instruction it
     // was created from so group members can be mapped back to recipes.
3644 for (VPBasicBlock *VPBB :
3647 for (VPRecipeBase &R : make_filter_range(*VPBB, [](VPRecipeBase &R) {
3648 return isa<VPWidenMemoryRecipe>(&R);
3649 })) {
3650 auto *MemR = cast<VPWidenMemoryRecipe>(&R);
3651 IRMemberToRecipe[&MemR->getIngredient()] = MemR;
3652 }
3653
3654 // Interleave memory: for each Interleave Group we marked earlier as relevant
3655 // for this VPlan, replace the Recipes widening its memory instructions with a
3656 // single VPInterleaveRecipe at its insertion point.
3657 VPDominatorTree VPDT(Plan);
3658 for (const auto *IG : InterleaveGroups) {
3659 // Skip interleave groups where members don't have recipes. This can happen
3660 // when removeDeadRecipes removes recipes that are part of interleave groups
3661 // but have no users.
3662 if (llvm::any_of(IG->members(), [&IRMemberToRecipe](Instruction *Member) {
3663 return !IRMemberToRecipe.contains(Member);
3664 }))
3665 continue;
3666
     // Seed the group's metadata from member 0, then intersect it with the
     // metadata of every other present member. Stored values are collected in
     // member order; index slots without a member (gaps) are skipped.
3667 auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
3668 VPIRMetadata InterleaveMD(*Start);
3669 SmallVector<VPValue *, 4> StoredValues;
3670 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start->getAsRecipe()))
3671 StoredValues.push_back(StoreR->getStoredValue());
3672 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3673 Instruction *MemberI = IG->getMember(I);
3674 if (!MemberI)
3675 continue;
3676 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
3677 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR->getAsRecipe()))
3678 StoredValues.push_back(StoreR->getStoredValue());
3679 InterleaveMD.intersect(*MemoryR);
3680 }
3681
     // A gap mask is requested when the group requires a scalar epilogue that
     // is not allowed, or when the group stores values but is not full.
3682 bool NeedsMaskForGaps =
3683 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3684 (!StoredValues.empty() && !IG->isFull());
3685
3686 Instruction *IRInsertPos = IG->getInsertPos();
3687 auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
3688 VPRecipeBase *InsertPosR = InsertPos->getAsRecipe();
3689
     // If the insert position's pointer (ignoring pointer casts) is a GEP, take
     // its no-wrap flags with the no-unsigned-wrap flag removed.
3691 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3692 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3693 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3694
3695 // Get or create the start address for the interleave group.
3696 VPValue *Addr = Start->getAddr();
3697 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3698 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPosR)) {
3699 // We cannot re-use the address of member zero because it does not
3700 // dominate the insert position. Instead, use the address of the insert
3701 // position and create a PtrAdd adjusting it to the address of member
3702 // zero.
3703 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3704 // InsertPos or sink loads above zero members to join it.
3705 assert(IG->getIndex(IRInsertPos) != 0 &&
3706 "index of insert position shouldn't be zero");
3707 auto &DL = IRInsertPos->getDataLayout();
     // Byte distance from member zero to the insert position: the alloc size of
     // the accessed type times the member index, held as a 32-bit signed APInt
     // and negated below to step back to member zero.
3708 APInt Offset(32,
3709 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3710 IG->getIndex(IRInsertPos),
3711 /*IsSigned=*/true);
3712 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3713 VPBuilder B(InsertPosR);
3714 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3715 }
3716 // If the group is reverse, adjust the index to refer to the last vector
3717 // lane instead of the first. We adjust the index from the first vector
3718 // lane, rather than directly getting the pointer for lane VF - 1, because
3719 // the pointer operand of the interleaved access is supposed to be uniform.
3720 if (IG->isReverse()) {
3721 auto *ReversePtr = new VPVectorEndPointerRecipe(
3722 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3723 -(int64_t)IG->getFactor(), NW, InsertPosR->getDebugLoc());
3724 ReversePtr->insertBefore(InsertPosR);
3725 Addr = ReversePtr;
3726 }
     // The interleave recipe reuses the mask and debug location of the insert
     // position's recipe.
3727 auto *VPIG = new VPInterleaveRecipe(
3728 IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps,
3729 InterleaveMD, InsertPosR->getDebugLoc());
3730 VPIG->insertBefore(InsertPosR);
3731
     // Rewire users of the member recipes to the values defined by the
     // interleave recipe. Only members with a non-void IR type consume a result
     // index J; every present member's recipe is erased afterwards.
3732 unsigned J = 0;
3733 for (unsigned i = 0; i < IG->getFactor(); ++i)
3734 if (Instruction *Member = IG->getMember(i)) {
3735 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member)->getAsRecipe();
3736 if (!Member->getType()->isVoidTy()) {
3737 VPValue *OriginalV = MemberR->getVPSingleValue();
3738 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3739 J++;
3740 }
3741 MemberR->eraseFromParent();
3742 }
3743 }
3744}
3745
3746/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3747/// value, phi and backedge value. In the following example:
3748///
3749/// vector.ph:
3750/// Successor(s): vector loop
3751///
3752/// <x1> vector loop: {
3753/// vector.body:
3754/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3755/// ...
3756/// EMIT branch-on-count ...
3757/// No successors
3758/// }
3759///
3760/// WIDEN-INDUCTION will get expanded to:
3761///
3762/// vector.ph:
3763/// ...
3764/// vp<%induction.start> = ...
3765/// vp<%induction.increment> = ...
3766///
3767/// Successor(s): vector loop
3768///
3769/// <x1> vector loop: {
3770/// vector.body:
3771/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3772/// ...
3773/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3774/// EMIT branch-on-count ...
3775/// No successors
3776/// }
3777static void
// NOTE(review): the line carrying the function name and parameter list is not
// visible in this listing; the body below operates on a recipe named WidenIVR.
3779 VPlan *Plan = WidenIVR->getParent()->getPlan();
3780 VPValue *Start = WidenIVR->getStartValue();
3781 VPValue *Step = WidenIVR->getStepValue();
3782 VPValue *VF = WidenIVR->getVFValue();
3783 DebugLoc DL = WidenIVR->getDebugLoc();
3784
3785 // The value from the original loop to which we are mapping the new induction
3786 // variable.
3787 Type *Ty = WidenIVR->getScalarType();
3788
3789 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
     // Pick the opcodes used to build the IV: Add/Mul for integer inductions;
     // otherwise the add opcode comes from the induction descriptor and the
     // multiply is FMul. The IR flags of WidenIVR are copied onto the new ops.
     // NOTE(review): the declarations of AddOp and MulOp are not visible here.
3792 VPIRFlags Flags = *WidenIVR;
3793 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3794 AddOp = Instruction::Add;
3795 MulOp = Instruction::Mul;
3796 } else {
3797 AddOp = ID.getInductionOpcode();
3798 MulOp = Instruction::FMul;
3799 }
3800
3801 // If the phi is truncated, truncate the start and step values.
3802 VPBuilder Builder(Plan->getVectorPreheader());
3803 Type *StepTy = Step->getScalarType();
3804 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3805 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3806 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3807 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3808 StepTy = Ty;
3809 }
3810
3811 // Construct the initial value of the vector IV in the vector loop preheader.
     // The initial value is splat(Start) AddOp (stepvector MulOp splat(Step));
     // the step vector is converted with UIToFP first when the step is
     // floating-point.
     // NOTE(review): the initializer of IVIntTy is not visible in this listing.
3812 Type *IVIntTy =
3814 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3815 if (StepTy->isFloatingPointTy())
3816 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3817
3818 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3819 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3820
3821 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3822 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3823 DebugLoc::getUnknown(), "induction");
3824
3825 // Create the widened phi of the vector IV.
     // The phi is created at WidenIVR's position with only its start value; the
     // backedge value is added at the end of this function.
3826 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3827 Init, WidenIVR->getDebugLoc(), "vec.ind");
3828
3829 // Create the backedge value for the vector IV.
3830 VPValue *Inc;
3831 VPValue *Prev;
3832 // If unrolled, use the increment and prev value from the operands.
3833 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3834 Inc = SplatVF;
3835 Prev = WidenIVR->getLastUnrolledPartOperand();
3836 } else {
3837 // Move the insertion point after the VF definition when the VF is defined
3838 // inside a loop, such as for EVL tail-folding.
3839 if (VPRecipeBase *R = VF->getDefiningRecipe())
3840 if (R->getParent()->getEnclosingLoopRegion())
3841 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3842
3843 // Multiply the vectorization factor by the step using integer or
3844 // floating-point arithmetic as appropriate.
3845 if (StepTy->isFloatingPointTy())
3846 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3847 DL);
3848 else
3849 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3850
     // The scalar product Step * VF is computed once and then broadcast.
3851 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3852 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3853 Prev = WidePHI;
3854 }
3855
     // Emit the increment right before the terminator of ExitingBB and feed it
     // back into the phi.
     // NOTE(review): the declaration of ExitingBB is not visible in this listing.
3857 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3858 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3859 WidenIVR->getDebugLoc(), "vec.ind.next");
3860
3861 WidePHI->addIncoming(Next);
3862
     // Users are redirected to the new phi; WidenIVR itself is not erased here.
3863 WidenIVR->replaceAllUsesWith(WidePHI);
3864}
3865
3866/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3867/// initial value, phi and backedge value. In the following example:
3868///
3869/// <x1> vector loop: {
3870/// vector.body:
3871/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3872/// ...
3873/// EMIT branch-on-count ...
3874/// }
3875///
3876/// WIDEN-POINTER-INDUCTION will get expanded to:
3877///
3878/// <x1> vector loop: {
3879/// vector.body:
3880/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3881/// EMIT %mul = mul %stepvector, %step
3882/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3883/// ...
3884/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3885/// EMIT branch-on-count ...
3886/// }
// Body of the pointer-induction expansion: builds a scalar pointer phi, a wide
// pointer-add producing the per-lane addresses, and the phi's backedge value.
// NOTE(review): the declarator line is not visible in this listing; the body
// operates on a recipe named R.
3888 VPlan *Plan = R->getParent()->getPlan();
3889 VPValue *Start = R->getStartValue();
3890 VPValue *Step = R->getStepValue();
3891 VPValue *VF = R->getVFValue();
3892
     // NOTE(review): the right-hand side of the first assertion's comparison is
     // not visible in this listing.
3893 assert(R->getInductionDescriptor().getKind() ==
3895 "Not a pointer induction according to InductionDescriptor!");
3896 assert(R->getScalarType()->isPointerTy() && "Unexpected type.");
3897 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3898 "Recipe should have been replaced");
3899
3900 VPBuilder Builder(R);
3901 DebugLoc DL = R->getDebugLoc();
3902
3903 // Build a scalar pointer phi.
3904 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3905
3906 // Create actual address geps that use the pointer phi as base and a
3907 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
     // These recipes are inserted at the first non-phi position of R's block.
3908 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3909 Type *StepTy = Step->getScalarType();
3910 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3911 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3912 VPValue *PtrAdd =
3913 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
     // Users of R now use the wide pointer-add; R itself is not erased here.
3914 R->replaceAllUsesWith(PtrAdd);
3915
3916 // Create the backedge value for the scalar pointer phi.
     // The phi advances by Step * VF, with VF first zero-extended or truncated to
     // the step type. The increment is placed before ExitingBB's terminator.
     // NOTE(review): the declaration of ExitingBB is not visible in this listing.
3918 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3919 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3920 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3921
3922 VPValue *InductionGEP =
3923 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3924 ScalarPtrPhi->addIncoming(InductionGEP);
3925}
3926
3927/// Expand a VPDerivedIVRecipe into executable recipes.
/// The index is first converted to the step's type (sign-extend/truncate for
/// an integer step, SIToFP otherwise). All uses of the recipe are then replaced
/// by a value computed from the start, the converted index and the step,
/// depending on the induction kind. The recipe itself is not erased here.
/// NOTE(review): the declarator line and the `case` labels of the switch are
/// not visible in this listing.
3929 VPBuilder Builder(R);
3930 VPIRValue *Start = R->getStartValue();
3931 VPValue *Step = R->getStepValue();
3932 VPValue *Index = R->getIndex();
3933 Type *StepTy = Step->getScalarType();
3934 Type *IndexTy = Index->getScalarType();
3935 Index = StepTy->isIntegerTy()
3936 ? Builder.createScalarSExtOrTrunc(
3937 Index, StepTy, IndexTy, DebugLoc::getCompilerGenerated())
3938 : Builder.createScalarCast(Instruction::SIToFP, Index, StepTy,
3940 switch (R->getInductionKind()) {
     // Presumably the integer-induction case: Start + Index * Step.
3942 assert(Index->getScalarType() == Start->getScalarType() &&
3943 "Index type does not match StartValue type");
3944 return R->replaceAllUsesWith(Builder.createAdd(
3945 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
3946 }
     // Presumably the pointer-induction case: ptradd(Start, Index * Step).
3948 return R->replaceAllUsesWith(Builder.createPtrAdd(
3949 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
     // Presumably the floating-point case: Start (fadd|fsub) (Step fmul Index),
     // with both operations carrying the fast-math flags of the original
     // binary operator.
3951 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
3952 const FPMathOperator *FPBinOp = R->getFPBinOp();
3953 assert(FPBinOp &&
3954 (FPBinOp->getOpcode() == Instruction::FAdd ||
3955 FPBinOp->getOpcode() == Instruction::FSub) &&
3956 "Original BinOp should be defined for FP induction");
3957 FastMathFlags FMF = FPBinOp->getFastMathFlags();
3958 VPValue *FMul = Builder.createNaryOp(Instruction::FMul, {Step, Index}, FMF);
3959 return R->replaceAllUsesWith(
3960 Builder.createNaryOp(FPBinOp->getOpcode(), {Start, FMul}, FMF));
3961 }
     // Remaining case (label not visible): returns without replacing any uses.
3963 return;
3964 }
3965 llvm_unreachable("Unhandled induction kind");
3966}
3967
3969 // Replace loop regions with explicit CFG.
     // The regions are collected first and dissolved in a second loop, so no
     // region is dissolved while the traversal from the plan entry is running.
     // Replicator regions are left untouched.
     // NOTE(review): the declarator line and the head of the collecting loop are
     // not visible in this listing.
3970 SmallVector<VPRegionBlock *> LoopRegions;
3972 vp_depth_first_deep(Plan.getEntry()))) {
3973 if (!R->isReplicator())
3974 LoopRegions.push_back(R);
3975 }
3976 for (VPRegionBlock *R : LoopRegions)
3977 R->dissolveToCFGLoop();
3978}
3979
// Lowers every BranchOnTwoConds terminator into single-condition branches.
// The terminators are collected first and rewritten afterwards, because the
// rewrite changes the CFG.
// NOTE(review): the declarator line, the declaration of WorkList and the head
// of the collecting loop are not visible in this listing.
3982 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3983 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3986 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3987 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3988 }
3989
3990 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3991 // single-condition branches:
3992 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3993 // the first condition is true, and otherwise jumps to a new interim block.
3994 // 2. A branch that ends the interim block, jumps to the second successor if
3995 // the second condition is true, and otherwise jumps to the third
3996 // successor.
3997 for (VPInstruction *Br : WorkList) {
3998 assert(Br->getNumOperands() == 2 &&
3999 "BranchOnTwoConds must have exactly 2 conditions");
4000 DebugLoc DL = Br->getDebugLoc();
4001 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
     // Copy the successor list before disconnecting, since disconnecting
     // changes the block's own successor list.
4002 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
4003 assert(Successors.size() == 3 &&
4004 "BranchOnTwoConds must have exactly 3 successors");
4005
4006 for (VPBlockBase *Succ : Successors)
4007 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
4008
4009 VPValue *Cond0 = Br->getOperand(0);
4010 VPValue *Cond1 = Br->getOperand(1);
4011 VPBlockBase *Succ0 = Successors[0];
4012 VPBlockBase *Succ1 = Successors[1];
4013 VPBlockBase *Succ2 = Successors[2];
4014
4015 // If the successor block for both conditions is the same, then combine the
4016 // two conditions and plant a single conditional branch.
4017 if (Succ0 == Succ1) {
4018 VPBuilder Builder(Br);
4019 VPValue *Combined = Builder.createOr(Cond0, Cond1, DL);
4020 Builder.createNaryOp(VPInstruction::BranchOnCond, {Combined}, DL);
4021 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4022 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ2);
4023 Br->eraseFromParent();
4024 continue;
4025 }
4026
4027 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
4028 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
4029
4030 VPBasicBlock *InterimBB =
4031 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
4032
     // NOTE(review): the lines that create the two new branches (presumably on
     // Cond0 in BrOnTwoCondsBB and on Cond1 in InterimBB) are only partially
     // visible in this listing. The successor order set up below is
     // {Succ0, InterimBB} for the original block and {Succ1, Succ2} for the
     // interim block.
4033 VPBuilder(BrOnTwoCondsBB)
4035 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4036 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4037
4039 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4040 VPBlockUtils::connectBlocks(InterimBB, Succ2);
4041 Br->eraseFromParent();
4042 }
4043}
4044
// Walks the recipes of the blocks reached by a deep depth-first traversal from
// the plan entry and lowers abstract recipes into concrete ones: widened
// int/FP and pointer inductions, derived IVs, the widened canonical IV,
// blends, vector-end-pointer offsets, expression recipes, LastActiveLane,
// MaskedCond, CanonicalIVIncrementForPart, BranchOnCount and WideIVStep.
// Each recipe is visited through make_early_inc_range so it can be erased
// while iterating; new recipes are built right before the recipe being lowered.
// NOTE(review): the declarator line, the head of the outer loop and several
// single lines (mostly calls or match patterns) are not visible in this
// listing.
4047 vp_depth_first_deep(Plan.getEntry()))) {
4048 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4049 VPBuilder Builder(&R);
4050 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
     // NOTE(review): the call that expands WidenIVR is not visible here.
4052 WidenIVR->eraseFromParent();
4053 continue;
4054 }
4055
4056 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4057 // If the recipe only generates scalars, scalarize it instead of
4058 // expanding it.
4059 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4060 VPValue *PtrAdd =
4061 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4062 WidenIVR->replaceAllUsesWith(PtrAdd);
4063 WidenIVR->eraseFromParent();
4064 continue;
4065 }
     // NOTE(review): the call that expands WidenIVR is not visible here.
4067 WidenIVR->eraseFromParent();
4068 continue;
4069 }
4070
4071 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {
4072 expandVPDerivedIV(DerivedIVR);
4073 DerivedIVR->eraseFromParent();
4074 continue;
4075 }
4076
     // Widened canonical IV:
     //   vec.iv = broadcast(CanIV) + (broadcast(Step) + stepvector).
     // A missing step is replaced by zero, which is only expected for UF == 1.
4077 if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(&R)) {
4078 VPValue *CanIV = WideCanIV->getCanonicalIV();
4079 Type *CanIVTy = CanIV->getScalarType();
4080 VPValue *Step = WideCanIV->getStepValue();
4081 if (!Step) {
4082 assert(Plan.getConcreteUF() == 1 &&
4083 "Expected unroller to have materialized step for UF != 1");
4084 Step = Plan.getZero(CanIVTy);
4085 }
4086 CanIV = Builder.createNaryOp(VPInstruction::Broadcast, CanIV);
4087 Step = Builder.createNaryOp(VPInstruction::Broadcast, Step);
4088 Step = Builder.createAdd(
4089 Step, Builder.createNaryOp(VPInstruction::StepVector, {}, CanIVTy));
4090 VPValue *CanVecIV =
4091 Builder.createAdd(CanIV, Step, WideCanIV->getDebugLoc(), "vec.iv",
4092 WideCanIV->getNoWrapFlags());
4093 WideCanIV->replaceAllUsesWith(CanVecIV);
4094 WideCanIV->eraseFromParent();
4095 continue;
4096 }
4097
4098 // Expand VPBlendRecipe into VPInstruction::Select.
     // The selects form a chain seeded with incoming value 0: for each further
     // incoming value I, the result is value I where mask I is set and the
     // chain built so far otherwise.
4099 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
4100 VPValue *Select = Blend->getIncomingValue(0);
4101 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4102 Select = Builder.createSelect(Blend->getMask(I),
4103 Blend->getIncomingValue(I), Select,
4104 R.getDebugLoc(), "predphi", *Blend);
4105 Blend->replaceAllUsesWith(Select);
4106 Blend->eraseFromParent();
4107 continue;
4108 }
4109
     // Vector-end-pointer recipes are kept; only a missing offset is
     // materialized.
4110 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4111 if (!VEPR->getOffset()) {
4112 assert(Plan.getConcreteUF() == 1 &&
4113 "Expected unroller to have materialized offset for UF != 1");
4114 VEPR->materializeOffset();
4115 }
4116 continue;
4117 }
4118
4119 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4120 Expr->decompose();
4121 Expr->eraseFromParent();
4122 continue;
4123 }
4124
4125 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4126 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4127 if (LastActiveL &&
4128 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4129 // Create Not(Mask) for all operands.
     // NOTE(review): the declaration of NotMasks is not visible here.
4131 for (VPValue *Op : LastActiveL->operands()) {
4132 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4133 NotMasks.push_back(NotMask);
4134 }
4135
4136 // Create FirstActiveLane on the inverted masks.
4137 VPValue *FirstInactiveLane = Builder.createFirstActiveLane(
4138 NotMasks, LastActiveL->getDebugLoc(), "first.inactive.lane");
4139
4140 // Subtract 1 to get the last active lane.
4141 VPValue *One =
4142 Plan.getConstantInt(FirstInactiveLane->getScalarType(), 1);
4143 VPValue *LastLane =
4144 Builder.createSub(FirstInactiveLane, One,
4145 LastActiveL->getDebugLoc(), "last.active.lane");
4146
4147 LastActiveL->replaceAllUsesWith(LastLane);
4148 LastActiveL->eraseFromParent();
4149 continue;
4150 }
4151
4152 // Lower MaskedCond with block mask to LogicalAnd.
     // NOTE(review): the `if` that opens this block is not visible here.
4154 auto *VPI = cast<VPInstruction>(&R);
4155 assert(VPI->isMasked() &&
4156 "Unmasked MaskedCond should be simplified earlier");
4157 VPI->replaceAllUsesWith(Builder.createNaryOp(
4158 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4159 VPI->eraseFromParent();
4160 continue;
4161 }
4162
4163 // Lower CanonicalIVIncrementForPart to plain Add.
     // The add takes over the operands, no-wrap flags and debug location of the
     // original instruction.
     // NOTE(review): the match pattern is not visible here.
4164 if (match(
4165 &R,
4167 auto *VPI = cast<VPInstruction>(&R);
4168 VPValue *Add = Builder.createOverflowingOp(
4169 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4170 VPI->getDebugLoc());
4171 VPI->replaceAllUsesWith(Add);
4172 VPI->eraseFromParent();
4173 continue;
4174 }
4175
4176 // Lower BranchOnCount to ICmp + BranchOnCond.
4177 VPValue *IV, *TC;
4178 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4179 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4180 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4181 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4182 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4183 BranchOnCountInst->eraseFromParent();
4184 continue;
4185 }
4186
     // Everything that is not a WideIVStep is left unchanged from here on.
     // NOTE(review): the first line of the match below is not visible here.
4187 VPValue *VectorStep;
4188 VPValue *ScalarStep;
4190 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4191 continue;
4192
4193 // Expand WideIVStep.
     // Both operands are first cast to the instruction's scalar type when their
     // types differ, then multiplied.
     // NOTE(review): the condition choosing between UIToFP and Trunc for the
     // vector step is not visible here.
4194 auto *VPI = cast<VPInstruction>(&R);
4195 Type *IVTy = VPI->getScalarType();
4196 if (VectorStep->getScalarType() != IVTy) {
4198 ? Instruction::UIToFP
4199 : Instruction::Trunc;
4200 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4201 }
4202
4203 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4204 if (ScalarStep->getScalarType() != IVTy) {
4205 ScalarStep =
4206 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4207 }
4208
     // FP steps use FMul with the instruction's fast-math flags (if any);
     // integer steps use Mul with the default flags for that opcode.
4209 VPIRFlags Flags;
4210 unsigned MulOpc;
4211 if (IVTy->isFloatingPointTy()) {
4212 MulOpc = Instruction::FMul;
4213 Flags = VPI->getFastMathFlagsOrNone();
4214 } else {
4215 MulOpc = Instruction::Mul;
4216 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4217 }
4218
4219 VPInstruction *Mul = Builder.createNaryOp(
4220 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4221 VectorStep = Mul;
4222 VPI->replaceAllUsesWith(VectorStep);
4223 VPI->eraseFromParent();
4224 }
4225 }
4226}
4227
4233
4234/// Update \p Plan to mask memory operations in the loop based on whether the
4235/// early exit is taken or not.
/// Returns false when the loop does not match the supported shape; the plan
/// may already have been modified at that point. Returns true once the
/// middle-block branch and the single resume phi in the scalar preheader have
/// been updated.
/// NOTE(review): the declarator line(s) and a few interior lines are not
/// visible in this listing.
4238 VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB,
4239 Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT,
4240 AssumptionCache *AC, VPDominatorTree &VPDT) {
4241
4242 // Disconnect early exiting blocks from successors, remove branches. We
4243 // currently don't support multiple uses for recipes involved in creating
4244 // the uncountable exit condition.
     // Exits whose exiting block is the latch are left alone. For the others the
     // exit block's phis lose their incoming value for the exiting block before
     // the terminator is erased and the edge is removed.
4245 for (auto &Exit : Exits) {
4246 if (Exit.EarlyExitingVPBB == LatchVPBB)
4247 continue;
4248
4249 for (VPRecipeBase &R : Exit.EarlyExitVPBB->phis())
4250 cast<VPIRPhi>(&R)->removeIncomingValueFor(Exit.EarlyExitingVPBB);
4251 Exit.EarlyExitingVPBB->getTerminator()->eraseFromParent();
4252 VPBlockUtils::disconnectBlocks(Exit.EarlyExitingVPBB, Exit.EarlyExitVPBB);
4253 }
4254
4255 // We can abandon a VPlan entirely if we return false here, so we shouldn't
4256 // crash if some earlier assumptions on scalar IR don't hold for the vplan
4257 // version of the loop.
     // NOTE(review): the declaration of GEPs is not visible in this listing.
4259 SmallVector<VPInstruction *, 8> ConditionRecipes;
4260
4261 std::optional<VPValue *> Cond =
4262 vputils::getRecipesForUncountableExit(ConditionRecipes, GEPs, LatchVPBB);
4263 if (!Cond)
4264 return false;
4265
4266 // Find load contributing to condition.
     // NOTE(review): the condition that identifies a load is not visible here.
4267 VPRecipeBase *CondLoad = nullptr;
4268 for (auto *Recipe : ConditionRecipes) {
4270 // TODO: Support more than one load. Needs legality updates too.
4271 assert(CondLoad == nullptr && "Too many condition loads");
4272 CondLoad = Recipe;
4273 }
4274 }
4275 assert(CondLoad && "Couldn't find load");
4276
4277 // Ensure that we are guaranteed to be able to dereference the memory used
4278 // for determining the uncountable exit for the maximum possible number of
4279 // scalar iterations of the loop.
4280 //
4281 // TODO: Support first-faulting loads in cases where we don't know whether
4282 // all possible addresses are dereferenceable.
4283 {
     // NOTE(review): the declarations of Load and Predicates and the name of the
     // dereferenceability query called below are not visible in this listing.
     // The query is given the pointer's SCEV, the load's alignment and the
     // element size as a SCEV constant of index-type width.
4286 VPValue *Ptr = Load->getOperand(0);
4287 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, TheLoop);
4288 const DataLayout &DL = Plan.getDataLayout();
4289 APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getScalarType()),
4290 DL.getTypeStoreSize(Load->getScalarType()).getFixedValue());
4292 PtrSCEV, cast<LoadInst>(Load->getUnderlyingInstr())->getAlign(),
4293 PSE.getSE()->getConstant(EltSize), TheLoop, *PSE.getSE(), DT, AC,
4294 &Predicates))
4295 return false;
4296 }
4297
4298 // Check GEPs to see if we can link them to a widen IV recipe with a step of
4299 // 1; we're only interested in contiguous accesses for the condition load
4300 // right now.
     // Besides the unit step, the widened IV must also start at 0.
     // NOTE(review): the first line of the match below is not visible here.
4301 for (auto *GEP : GEPs) {
4302 VPValue *MaybeIV = nullptr;
4304 m_LiveIn(), m_VPValue(MaybeIV))))
4305 return false;
4306
4307 auto *WIV = dyn_cast<VPWidenInductionRecipe>(MaybeIV);
4308 if (!WIV)
4309 return false;
4310
4311 if (!match(WIV->getStartValue(), m_SpecificInt(0)) ||
4312 !match(WIV->getStepValue(), m_SpecificInt(1)))
4313 return false;
4314 }
4315
4316 // Find an insertion point. Default to the end of the header but override
4317 // if we find a memory op that needs masking before the condition load.
     // Only the first memory-accessing recipe of the header other than CondLoad
     // is inspected; the scan stops there.
4318 auto InsertIt = HeaderVPBB->end();
4319 VPRecipeBase *CondR = (*Cond)->getDefiningRecipe();
4320 bool CondMoveNeeded = CondR->getParent() != HeaderVPBB;
4321 for (VPRecipeBase &R : *HeaderVPBB) {
4322 if (&R == CondLoad)
4323 continue;
4324
4325 if (R.mayReadOrWriteMemory()) {
4326 if (!VPDT.properlyDominates(CondR, &R)) {
4327 CondMoveNeeded = true;
4328 InsertIt = R.getIterator();
4329 }
4330 break;
4331 }
4332 }
4333
4334 // If another memory operation would take place before the comparison to
4335 // determine whether to exit early or the comparison doesn't take place in
4336 // the header, move the comparison (and supporting recipes).
     // The recipes are moved in reverse order of ConditionRecipes.
4337 if (CondMoveNeeded)
4338 for (auto *Recipe : reverse(ConditionRecipes))
4339 Recipe->moveBefore(*HeaderVPBB, InsertIt);
4340
4341 // Create a mask to represent all lanes that fully execute in the vector loop,
4342 // stopping short of any early exit.
     // The IV is taken to be the first recipe of the header. The index of the
     // first active lane of the exit condition is zero-extended or truncated to
     // the IV's scalar type.
     // NOTE(review): the line that declares Mask and names the operation taking
     // {Zero, FirstActive, ALMMultiplier} is not visible in this listing.
4343 VPBuilder MaskBuilder(HeaderVPBB, InsertIt);
4344 VPValue *FirstActive = MaskBuilder.createFirstActiveLane(*Cond);
4345 VPValue *IV = cast<VPSingleDefRecipe>(&HeaderVPBB->front());
4346 Type *IVScalarTy = IV->getScalarType();
4347 Type *FirstActiveTy = FirstActive->getScalarType();
4348 VPValue *ALMMultiplier = Plan.getConstantInt(IVScalarTy, 1);
4349 VPValue *Zero = Plan.getZero(IVScalarTy);
4350 FirstActive = MaskBuilder.createScalarZExtOrTrunc(FirstActive, IVScalarTy,
4351 FirstActiveTy, DebugLoc());
4353 {Zero, FirstActive, ALMMultiplier},
4354 DebugLoc(), "uncountable.exit.mask");
4355
4356 // Convert all other memory operations to use the mask.
     // Every memory-accessing recipe other than CondLoad is cast to
     // VPInstruction to receive the mask.
4357 for (VPBasicBlock *VPBB : vp_rpo_plain_cfg_loop_body(HeaderVPBB))
4358 for (VPRecipeBase &R : *VPBB)
4359 if (R.mayReadOrWriteMemory() && &R != CondLoad) {
4360 // TODO: Handle conditional memory operations in the loop.
4361 if (!VPDT.dominates(R.getParent(), LatchVPBB))
4362 return false;
4363 cast<VPInstruction>(&R)->addMask(Mask);
4364 }
4365
4366 // Update middle block branch to compare (IV + however many lanes were active)
4367 // against the full trip count, since we may be exiting the vector loop early.
4368 // If we didn't take an early exit, we should get the equivalent of VF from
4369 // the FirstActiveLane.
     // ExitIV is lane 0 of the IV plus the first-active-lane index.
4370 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->end());
4371 VPValue *ScalarIV = MiddleBuilder.createNaryOp(VPInstruction::ExtractLane,
4372 {Zero, IV}, DebugLoc());
4373 VPValue *ExitIV = MiddleBuilder.createAdd(ScalarIV, FirstActive);
4374 VPValue *FullTC =
4375 MiddleBuilder.createICmp(CmpInst::ICMP_EQ, ExitIV, Plan.getTripCount());
4376 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {FullTC});
4377
4378 // Update resume phi in scalar.ph.
     // Exactly one phi is supported; its operand 0 is redirected to ExitIV.
4379 VPBasicBlock *ScalarPH = Plan.getScalarPreheader();
4380 auto Phis = ScalarPH->phis();
4381 // TODO: Handle more than one Phi; re-derive from IV.
4382 // TODO: Handle reductions.
4383 if (range_size(Phis) != 1)
4384 return false;
4385 VPPhi *ContinueIV = cast<VPPhi>(Phis.begin());
4386 ContinueIV->setOperand(0, ExitIV);
4387 return true;
4388}
4389
4391 VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB,
4392 VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE,
4394 VPDominatorTree VPDT(Plan);
4395 VPBuilder LatchBuilder(LatchVPBB->getTerminator());
4397 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4398 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4399 if (Pred == MiddleVPBB)
4400 continue;
4401 // Collect condition for this early exit.
4402 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4403 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4404 VPValue *CondOfEarlyExitingVPBB;
4405 [[maybe_unused]] bool Matched =
4406 match(EarlyExitingVPBB->getTerminator(),
4407 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4408 assert(Matched && "Terminator must be BranchOnCond");
4409
4410 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4411 // the correct block mask.
4412 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4413 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4415 TrueSucc == ExitBlock
4416 ? CondOfEarlyExitingVPBB
4417 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4418 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4419 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4420 VPDT.properlyDominates(
4421 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4422 LatchVPBB)) &&
4423 "exit condition must dominate the latch");
4424 Exits.push_back({
4425 EarlyExitingVPBB,
4426 ExitBlock,
4427 CondToEarlyExit,
4428 });
4429 }
4430 }
4431
4432 assert(!Exits.empty() && "must have at least one early exit");
4433 // Sort exits by RPO order to get correct program order. RPO gives a
4434 // topological ordering of the CFG, ensuring upstream exits are checked
4435 // before downstream exits in the dispatch chain.
4437 HeaderVPBB);
4439 for (const auto &[Num, VPB] : enumerate(RPOT))
4440 RPOIdx[VPB] = Num;
4441 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4442 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4443 });
4444#ifndef NDEBUG
4445 // After RPO sorting, verify that for any pair where one exit dominates
4446 // another, the dominating exit comes first. This is guaranteed by RPO
4447 // (topological order) and is required for the dispatch chain correctness.
4448 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4449 for (unsigned J = I + 1; J < Exits.size(); ++J)
4450 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4451 Exits[I].EarlyExitingVPBB) &&
4452 "RPO sort must place dominating exits before dominated ones");
4453#endif
4454
4455 // Build the AnyOf condition for the latch terminator using logical OR
4456 // to avoid poison propagation from later exit conditions when an earlier
4457 // exit is taken.
4458 VPValue *Combined = Exits[0].CondToExit;
4459 for (const EarlyExitInfo &Info : drop_begin(Exits))
4460 Combined = LatchBuilder.createLogicalOr(Combined, Info.CondToExit);
4461
4462 VPValue *IsAnyExitTaken =
4463 LatchBuilder.createNaryOp(VPInstruction::AnyOf, {Combined});
4464
4465 // Create a comparison for the latch exit condition and replace the
4466 // BranchOnCond with a BranchOnTwoConds. The original BranchOnCond's condition
4467 // is used as the latch-exit condition; canonical IV recipes have not been
4468 // introduced yet, so there is no BranchOnCount to derive the condition from.
4469 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4470 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCond &&
4471 "Unexpected terminator");
4472 VPValue *IsLatchExitTaken = LatchExitingBranch->getOperand(0);
4473 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4474 LatchExitingBranch->eraseFromParent();
4475 LatchBuilder.setInsertPoint(LatchVPBB);
4477 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4478 LatchVPBB->clearSuccessors();
4479
4481 // If handling the exiting lane in the scalar loop, combine the exit
4482 // conditions into a single BranchOnCond.
4483 LatchVPBB->setSuccessors({MiddleVPBB, MiddleVPBB, HeaderVPBB});
4484 MiddleVPBB->clearPredecessors();
4485 MiddleVPBB->setPredecessors({LatchVPBB, LatchVPBB});
4486 return handleUncountableExitsWithSideEffects(Plan, Exits, HeaderVPBB,
4487 LatchVPBB, MiddleVPBB, TheLoop,
4488 PSE, DT, AC, VPDT);
4489 }
4490
4491 // Create the vector.early.exit blocks.
4492 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4493 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4494 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4495 VPBasicBlock *VectorEarlyExitVPBB =
4496 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4497 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4498 }
4499
4500 // Create the dispatch block (or reuse the single exit block if only one
4501 // exit). The dispatch block computes the first active lane of the combined
4502 // condition and, for multiple exits, chains through conditions to determine
4503 // which exit to take.
4504 VPBasicBlock *DispatchVPBB =
4505 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4506 : Plan.createVPBasicBlock("vector.early.exit.check");
4507 DispatchVPBB->setPredecessors({LatchVPBB});
4508 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4509 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4510 VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(
4511 {Combined}, DebugLoc::getUnknown(), "first.active.lane");
4512
4513 // For each early exit, disconnect the original exiting block
4514 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4515 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4516 // values at the first active lane:
4517 //
4518 // Input:
4519 // early.exiting.I:
4520 // ...
4521 // EMIT branch-on-cond vp<%cond.I>
4522 // Successor(s): in.loop.succ, ir-bb<exit.I>
4523 //
4524 // ir-bb<exit.I>:
4525 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4526 //
4527 // Output:
4528 // early.exiting.I:
4529 // ...
4530 // Successor(s): in.loop.succ
4531 //
4532 // vector.early.exit.I:
4533 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4534 // Successor(s): ir-bb<exit.I>
4535 //
4536 // ir-bb<exit.I>:
4537 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4538 // vector.early.exit.I)
4539 //
4540 for (auto [Exit, VectorEarlyExitVPBB] :
4541 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4542 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4543 // Adjust the phi nodes in EarlyExitVPBB.
4544 // 1. remove incoming values from EarlyExitingVPBB,
4545 // 2. extract the incoming value at FirstActiveLane
4546 // 3. add back the extracts as last operands for the phis
4547 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4548 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4549 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4550 // values from VectorEarlyExitVPBB.
4551 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4552 auto *ExitIRI = cast<VPIRPhi>(&R);
4553 VPValue *IncomingVal =
4554 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4555 VPValue *NewIncoming = IncomingVal;
4556 if (!isa<VPIRValue>(IncomingVal)) {
4557 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4558 NewIncoming = EarlyExitBuilder.createNaryOp(
4559 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4560 DebugLoc::getUnknown(), "early.exit.value");
4561 }
4562 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4563 ExitIRI->addIncoming(NewIncoming);
4564 }
4565
4566 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4567 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4568 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4569 }
4570
4571 // Chain through exits: for each exit, check if its condition is true at
4572 // the first active lane. If so, take that exit; otherwise, try the next.
4573 // The last exit needs no check since it must be taken if all others fail.
4574 //
4575 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4576 //
4577 // latch:
4578 // ...
4579 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4580 // ...
4581 //
4582 // vector.early.exit.check:
4583 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4584 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4585 // EMIT branch-on-cond vp<%at.cond.0>
4586 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4587 //
4588 // vector.early.exit.check.0:
4589 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4590 // EMIT branch-on-cond vp<%at.cond.1>
4591 // Successor(s): vector.early.exit.1, vector.early.exit.2
4592 VPBasicBlock *CurrentBB = DispatchVPBB;
4593 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4594 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4595 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4596 DebugLoc::getUnknown(), "exit.cond.at.lane");
4597
4598 // For the last dispatch, branch directly to the last exit on false;
4599 // otherwise, create a new check block.
4600 bool IsLastDispatch = (I + 2 == Exits.size());
4601 VPBasicBlock *FalseBB =
4602 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4603 : Plan.createVPBasicBlock(
4604 Twine("vector.early.exit.check.") + Twine(I));
4605
4606 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4607 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4608 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4609 FalseBB->setPredecessors({CurrentBB});
4610
4611 CurrentBB = FalseBB;
4612 DispatchBuilder.setInsertPoint(CurrentBB);
4613 }
4614
4615 return true;
4616}
4617
4618/// This function tries to convert extended in-loop reductions to
4619/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4620/// valid. The created recipe must be decomposed to its constituent
4621/// recipes before execution.
/// Returns a newly allocated VPExpressionRecipe bundling the widened cast that
/// feeds the reduction together with the reduction itself, or nullptr when the
/// pattern does not match or the bundled form is not cheaper.
// NOTE(review): the listing line that carries the function name and leading
// parameters (4623) is missing here, as are 4634, 4637, 4639 and 4656; the
// comments below describe only what the remaining lines show.
4622static VPExpressionRecipe *
4624 VFRange &Range) {
4625 Type *RedTy = Red->getScalarType();
4626 VPValue *VecOp = Red->getVecOp();
4627
4628 assert(!Red->isPartialReduction() &&
4629 "This path does not support partial reductions");
4630
4631 // Clamp the range if using extended-reduction is profitable.
// The inner per-VF predicate compares the target's fused extended-reduction
// cost against the sum of the separate extend and reduction costs.
4632 auto IsExtendedRedValidAndClampRange =
4633 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4635 [&](ElementCount VF) {
4636 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4638
// Costs of the two recipes when they are kept separate.
4640 InstructionCost ExtCost =
4641 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4642 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4643
4644 assert(!RedTy->isFloatingPointTy() &&
4645 "getExtendedReductionCost only supports integer types");
// The second argument tells the cost query whether the extend is unsigned.
4646 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4647 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4648 Red->getFastMathFlagsOrNone(), CostKind);
// Only bundle when the fused cost is valid and strictly cheaper.
4649 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4650 },
4651 Range);
4652 };
4653
4654 VPValue *A;
4655 // Match reduce(ext(A)); the scalar type of A is used as the source type for
// the cost query.
4657 IsExtendedRedValidAndClampRange(
4658 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4659 cast<VPWidenCastRecipe>(VecOp)->getOpcode(), A->getScalarType()))
4660 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4661
4662 return nullptr;
4663}
4664
4665/// This function tries to convert extended in-loop reductions to
4666/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4667/// and valid. The created VPExpressionRecipe must be decomposed to its
4668/// constituent recipes before execution. Patterns of the
4669/// VPExpressionRecipe:
4670/// reduce.add(mul(...)),
4671/// reduce.add(mul(ext(A), ext(B))),
4672/// reduce.add(ext(mul(ext(A), ext(B)))).
4673/// reduce.fadd(fmul(ext(A), ext(B)))
/// Returns a newly allocated VPExpressionRecipe, or nullptr when the reduction
/// opcode is unsupported, no pattern matches, or bundling is not cheaper.
/// Matching may itself modify the plan: a constant multiplicand can be
/// rewritten as trunc+ext, and the ext(mul(ext, ext)) form replaces the
/// multiply and its extends with wider ones.
// NOTE(review): the body below returns nullptr whenever RedTy is a
// floating-point type, so the reduce.fadd pattern listed above does not appear
// to be produced by the code visible here - confirm whether the list or the
// FAdd opcode check is stale.
// NOTE(review): listing lines 4675, 4688, 4690, 4692 and 4753 are missing.
4674static VPExpressionRecipe *
4676 VPCostContext &Ctx, VFRange &Range) {
4677 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4678 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4679 Opcode != Instruction::FAdd)
4680 return nullptr;
4681
4682 assert(!Red->isPartialReduction() &&
4683 "This path does not support partial reductions");
4684 Type *RedTy = Red->getScalarType();
4685
4686 // Clamp the range if using multiply-accumulate-reduction is profitable.
// Ext0/Ext1 are the (optional) extends of the multiply operands and OuterExt
// the (optional) extend applied to the multiply result.
4687 auto IsMulAccValidAndClampRange =
4689 VPWidenCastRecipe *OuterExt) -> bool {
4691 [&](ElementCount VF) {
// Without an inner extend the multiply already operates on RedTy.
4693 Type *SrcTy = Ext0 ? Ext0->getOperand(0)->getScalarType() : RedTy;
4694 InstructionCost MulAccCost;
4695
4696 // getMulAccReductionCost for in-loop reductions does not support
4697 // mixed or floating-point extends.
4698 if (Ext0 && Ext1 &&
4699 (Ext0->getOpcode() != Ext1->getOpcode() ||
4700 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4701 return false;
4702
// Treated as unsigned when there is no extend at all.
4703 bool IsZExt =
4704 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4705 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4706 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4707 SrcVecTy, CostKind);
4708
// Sum the costs of all recipes the fused form would replace.
4709 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4710 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4711 InstructionCost ExtCost = 0;
4712 if (Ext0)
4713 ExtCost += Ext0->computeCost(VF, Ctx);
4714 if (Ext1)
4715 ExtCost += Ext1->computeCost(VF, Ctx);
4716 if (OuterExt)
4717 ExtCost += OuterExt->computeCost(VF, Ctx);
4718
// Only bundle when the fused cost is valid and strictly cheaper.
4719 return MulAccCost.isValid() &&
4720 MulAccCost < ExtCost + MulCost + RedCost;
4721 },
4722 Range);
4723 };
4724
4725 VPValue *VecOp = Red->getVecOp();
4726 VPRecipeBase *Sub = nullptr;
4727 VPValue *A, *B;
4728 VPValue *Tmp = nullptr;
4729
// Floating-point reductions are not bundled by the code below.
4730 if (RedTy->isFloatingPointTy())
4731 return nullptr;
4732
4733 // Sub reductions could have a sub between the add reduction and vec op.
// If so, remember the negation (0 - Tmp) and continue matching on Tmp.
4734 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4735 Sub = VecOp->getDefiningRecipe();
4736 VecOp = Tmp;
4737 }
4738
4739 // If ValB is a constant and can be safely extended, truncate it to the same
4740 // type as ExtA's operand, then extend it to the same type as ExtA. This
4741 // creates two uniform extends that can more easily be matched by the rest of
4742 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4743 // replaced with the new extend of the constant.
4744 auto ExtendAndReplaceConstantOp = [](VPWidenCastRecipe *ExtA,
4745 VPWidenCastRecipe *&ExtB, VPValue *&ValB,
4746 VPWidenRecipe *Mul) {
// Only applies when exactly the first operand is an extend and the second is
// a live-in IR value.
4747 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4748 return;
4749 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
4750 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4751 const APInt *Const;
4752 if (!match(ValB, m_APInt(Const)) ||
4754 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4755 return;
4756 // The truncate ensures that the type of each extended operand is the
4757 // same, and it's been proven that the constant can be extended from
4758 // NarrowTy safely. Necessary since ExtA's extended operand would be
4759 // e.g. an i8, while the const will likely be an i32. This will be
4760 // elided by later optimisations.
4761 VPBuilder Builder(Mul);
4762 auto *Trunc =
4763 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4764 Type *WideTy = ExtA->getScalarType();
4765 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4766 Mul->setOperand(1, ExtB);
4767 };
4768
4769 // Try to match reduce.add(mul(...)).
4770 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4771 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4772 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4773 auto *Mul = cast<VPWidenRecipe>(VecOp);
4774
4775 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4776 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4777
4778 // Match reduce.add/sub(mul(ext, ext)).
4779 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4780 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4781 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
// The negation, when present, becomes part of the bundled expression.
4782 if (Sub)
4783 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4784 cast<VPWidenRecipe>(Sub), Red);
4785 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4786 }
4787 // TODO: Add an expression type for this variant with a negated mul
4788 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4789 return new VPExpressionRecipe(Mul, Red);
4790 }
4791 // TODO: Add an expression type for negated versions of other expression
4792 // variants.
4793 if (Sub)
4794 return nullptr;
4795
4796 // Match reduce.add(ext(mul(A, B))).
4797 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4798 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4799 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4800 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
4801 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
4802
4803 // reduce.add(ext(mul(ext, const)))
4804 // -> reduce.add(ext(mul(ext, ext(const))))
4805 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4806
4807 // reduce.add(ext(mul(ext(A), ext(B))))
4808 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4809 // The inner extends must either have the same opcode as the outer extend or
4810 // be the same, in which case the multiply can never result in a negative
4811 // value and the outer extend can be folded away by doing wider
4812 // extends for the operands of the mul.
// NOTE(review): IsMulAccValidAndClampRange may clamp Range before
// Mul->hasOneUse() is evaluated, so Range can be narrowed even when this
// branch is not taken - confirm this ordering is intended.
4813 if (Ext0 && Ext1 &&
4814 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4815 Ext0->getOpcode() == Ext1->getOpcode() &&
4816 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
// Re-create the first inner extend so that it widens directly to the outer
// extend's result type.
4817 auto *NewExt0 = new VPWidenCastRecipe(
4818 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getScalarType(), nullptr,
4819 *Ext0, *Ext0, Ext0->getDebugLoc());
4820 NewExt0->insertBefore(Ext0);
4821
// When both operands are the same extend, share the single new recipe.
4822 VPWidenCastRecipe *NewExt1 = NewExt0;
4823 if (Ext0 != Ext1) {
4824 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4825 Ext->getScalarType(), nullptr, *Ext1,
4826 *Ext1, Ext1->getDebugLoc());
4827 NewExt1->insertBefore(Ext1);
4828 }
// Replace ext(mul) by a multiply of the wider extends and drop the old
// outer extend and multiply.
4829 auto *NewMul = Mul->cloneWithOperands({NewExt0, NewExt1});
4830 NewMul->insertBefore(Mul);
4831 Ext->replaceAllUsesWith(NewMul);
4832 Ext->eraseFromParent();
4833 Mul->eraseFromParent();
4834 return new VPExpressionRecipe(NewExt0, NewExt1, NewMul, Red);
4835 }
4836 }
4837 return nullptr;
4838}
4839
4840/// This function tries to create abstract recipes from the reduction recipe for
4841/// following optimizations and cost estimation.
/// A multiply-accumulate bundle is tried first, then an extended-reduction
/// bundle. When one is created it is inserted right after the position the
/// reduction recipe had on entry, and all uses of the reduction are redirected
/// to it. Nothing changes when neither pattern applies.
// NOTE(review): the listing line with the function name and first parameter
// (4842) is missing here.
4843 VPCostContext &Ctx,
4844 VFRange &Range) {
4845 // Creation of VPExpressions for partial reductions is entirely handled in
4846 // transformToPartialReduction.
4847 assert(!Red->isPartialReduction() &&
4848 "This path does not support partial reductions");
4849
4850 VPExpressionRecipe *AbstractR = nullptr;
// Capture the insertion point and parent block before attempting any match.
4851 auto IP = std::next(Red->getIterator());
4852 auto *VPBB = Red->getParent();
4853 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4854 AbstractR = MulAcc;
4855 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4856 AbstractR = ExtRed;
4857 // Cannot create abstract inloop reduction recipes.
4858 if (!AbstractR)
4859 return;
4860
4861 AbstractR->insertBefore(*VPBB, IP);
4862 Red->replaceAllUsesWith(AbstractR);
4863}
4864
4875
// Body of the transform that inserts explicit Broadcast VPInstructions for
// values defined outside the vector loop (backedge-taken count, live-ins and
// values defined in the plan's entry block) that have vector users.
// NOTE(review): the listing lines with the function signature (4876) and the
// first half of the skip condition (4893) are missing here.
4877 if (Plan.hasScalarVFOnly())
4878 return;
4879
// The dominator tree is only needed by the assertion further down.
4880#ifndef NDEBUG
4881 VPDominatorTree VPDT(Plan);
4882#endif
4883
// Collect the candidate values up front.
4884 SmallVector<VPValue *> VPValues;
4885 if (VPValue *BTC = Plan.getBackedgeTakenCount())
4886 VPValues.push_back(BTC);
4887 append_range(VPValues, Plan.getLiveIns());
4888 for (VPRecipeBase &R : *Plan.getEntry())
4889 append_range(VPValues, R.definedValues());
4890
4891 auto *VectorPreheader = Plan.getVectorPreheader();
4892 for (VPValue *VPV : VPValues) {
// Among other cases, constant IR live-ins are skipped.
4894 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4895 continue;
4896
4897 // Add explicit broadcast at the insert point that dominates all users.
// By default the broadcast goes at the end of the vector preheader; it is
// moved to the start of that block if a vector user lives in the preheader.
4898 VPBasicBlock *HoistBlock = VectorPreheader;
4899 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4900 for (VPUser *User : VPV->users()) {
// Users that only need scalars keep using VPV directly.
4901 if (User->usesScalars(VPV))
4902 continue;
4903 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4904 HoistPoint = HoistBlock->begin();
4905 else
4906 assert(VPDT.dominates(VectorPreheader,
4907 cast<VPRecipeBase>(User)->getParent()) &&
4908 "All users must be in the vector preheader or dominated by it");
4909 }
4910
4911 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4912 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
// Redirect only vector users; the broadcast itself must keep VPV as operand.
4913 VPV->replaceUsesWithIf(Broadcast,
4914 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4915 return Broadcast != &U && !U.usesScalars(VPV);
4916 });
4917 }
4918}
4919
4920// Collect common metadata from a group of replicate recipes by intersecting
4921// metadata from all recipes in the group.
// Starts from the first recipe's metadata and intersects it with each of the
// remaining recipes; the group must therefore be non-empty.
// NOTE(review): the listing line with the signature (4922) is missing here.
4923 VPIRMetadata CommonMetadata = *Recipes.front();
4924 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4925 CommonMetadata.intersect(*Recipe);
4926 return CommonMetadata;
4927}
4928
// Collects groups of predicated load or store replicate recipes (selected by
// the Opcode template argument) in which at least one member's mask is the
// logical negation of the mask of the group's first member. Returns the
// accepted groups.
// NOTE(review): listing lines 4930-4932 (signature), 4939-4940, 4947 and 4957
// are missing here; the declarations of AllGroups, Groups and Group are among
// them.
4929template <unsigned Opcode>
4933 const Loop *L) {
4934 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4935 "Only Load and Store opcodes supported");
4936 [[maybe_unused]] constexpr bool IsLoad = (Opcode == Instruction::Load);
4937
4938 // For each address, collect operations with the same or complementary masks.
// Only predicated replicate recipes are considered.
4941 Plan, PSE, L,
4942 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4943 for (auto Recipes : Groups) {
// A single recipe cannot form a complementary pair.
4944 if (Recipes.size() < 2)
4945 continue;
4946
4948 map_range(Recipes, bind_back<getLoadStoreValueType>(IsLoad))) &&
4949 "Expected all recipes in group to have the same load-store type");
4950
4951 // Collect groups with the same or complementary masks.
// Entries are nulled out once they have been assigned to a group.
4952 for (VPReplicateRecipe *&RecipeI : Recipes) {
4953 if (!RecipeI)
4954 continue;
4955
4956 VPValue *MaskI = RecipeI->getMask();
4958 Group.push_back(RecipeI);
4959 RecipeI = nullptr;
4960
4961 // Find all operations with the same or complementary masks.
// Every remaining recipe is added to the group; HasComplementaryMask
// records whether any of them has a mask complementary to MaskI.
4962 bool HasComplementaryMask = false;
4963 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4964 if (!RecipeJ)
4965 continue;
4966
4967 VPValue *MaskJ = RecipeJ->getMask();
4968 // Check if any operation in the group has a complementary mask with
4969 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4970 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4971 match(MaskJ, m_Not(m_Specific(MaskI)));
4972 Group.push_back(RecipeJ);
4973 RecipeJ = nullptr;
4974 }
4975
// Groups without any complementary pair are discarded.
4976 if (HasComplementaryMask) {
4977 assert(Group.size() >= 2 && "must have at least 2 entries");
4978 AllGroups.push_back(std::move(Group));
4979 }
4980 }
4981 }
4982
4983 return AllGroups;
4984}
4985
4986// Find the recipe with minimum alignment in the group.
// InstType is the IR instruction class the recipes' underlying instructions
// are cast to in order to query getAlign(). The group must be non-empty, since
// the result of min_element is dereferenced unconditionally.
// NOTE(review): the listing line with the function name and parameter (4989)
// is missing here.
4987template <typename InstType>
4988static VPReplicateRecipe *
4990 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4991 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4992 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4993 });
4994}
4995
// Replaces each collected group of predicated loads with one unpredicated
// load placed before the first load of the group.
// NOTE(review): the listing lines with the start of the signature (4996-4997)
// and the call that fills Groups (5000) are missing here.
4998 const Loop *L) {
4999 auto Groups =
5001 if (Groups.empty())
5002 return;
5003
5004 // Process each group of loads.
5005 for (auto &Group : Groups) {
5006 // Try to use the earliest (most dominating) load to replace all others.
5007 VPReplicateRecipe *EarliestLoad = Group[0];
5008 VPBasicBlock *FirstBB = EarliestLoad->getParent();
5009 VPBasicBlock *LastBB = Group.back()->getParent();
5010
5011 // Check that the load doesn't alias with stores between first and last.
// Groups whose memory location is unknown are skipped as well.
5012 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
5013 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
5014 continue;
5015
5016 // Collect common metadata from all loads in the group.
5017 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5018
5019 // Find the load with minimum alignment to use.
// Its underlying instruction is the one attached to the merged load.
5020 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
5021
5022 bool IsSingleScalar = EarliestLoad->isSingleScalar();
5023 assert(all_of(Group,
5024 [IsSingleScalar](VPReplicateRecipe *R) {
5025 return R->isSingleScalar() == IsSingleScalar;
5026 }) &&
5027 "all members in group must agree on IsSingleScalar");
5028
5029 // Create an unpredicated version of the earliest load with common
5030 // metadata.
// The address operand is taken from the earliest load.
5031 auto *UnpredicatedLoad = new VPReplicateRecipe(
5032 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
5033 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
5034
5035 UnpredicatedLoad->insertBefore(EarliestLoad);
5036
5037 // Replace all loads in the group with the unpredicated load.
5038 for (VPReplicateRecipe *Load : Group) {
5039 Load->replaceAllUsesWith(UnpredicatedLoad);
5040 Load->eraseFromParent();
5041 }
5042 }
5043}
5044
// Returns true if the stores in StoresToSink can be sunk to the position of
// the last store of the group, as decided by canHoistOrSinkWithNoAliasCheck
// over the blocks from the first to the last store. Returns false when the
// first store's memory location is unknown or carries no alias-scope metadata.
// NOTE(review): the listing line with the function name and first parameter
// (5046) is missing here.
5045static bool
5047 PredicatedScalarEvolution &PSE, const Loop &L) {
5048 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
5049 if (!StoreLoc || !StoreLoc->AATags.Scope)
5050 return false;
5051
5052 // When sinking a group of stores, all members of the group alias each other.
5053 // Skip them during the alias checks.
5054 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
5055 StoresToSink.end());
5056
5057 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
5058 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
5059 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L);
5060 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
5061}
5062
// Replaces each collected group of predicated stores with one unpredicated
// store at the position of the group's last store; the stored value is chosen
// by a chain of selects over the members' masks.
// NOTE(review): the listing lines with the start of the signature (5063-5064)
// and the call that fills Groups (5067) are missing here.
5065 const Loop *L) {
5066 auto Groups =
5068 if (Groups.empty())
5069 return;
5070
5071 for (auto &Group : Groups) {
5072 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L))
5073 continue;
5074
5075 // Use the last (most dominated) store's location for the unconditional
5076 // store.
5077 VPReplicateRecipe *LastStore = Group.back();
5078 VPBasicBlock *InsertBB = LastStore->getParent();
5079
5080 // Collect common alias metadata from all stores in the group.
5081 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5082
5083 // Build select chain for stored values.
// Start with the first store's value; each later store's value replaces the
// running value where that store's mask is set.
5084 VPValue *SelectedValue = Group[0]->getOperand(0);
5085 VPBuilder Builder(InsertBB, LastStore->getIterator());
5086
5087 bool IsSingleScalar = Group[0]->isSingleScalar();
5088 for (unsigned I = 1; I < Group.size(); ++I) {
5089 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
5090 "all members in group must agree on IsSingleScalar");
5091 VPValue *Mask = Group[I]->getMask();
5092 VPValue *Value = Group[I]->getOperand(0);
5093 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
5094 Group[I]->getDebugLoc());
5095 }
5096
5097 // Find the store with minimum alignment to use.
5098 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
5099
5100 // Create unconditional store with selected value and common metadata.
// The address operand is taken from the last store.
5101 auto *UnpredicatedStore = new VPReplicateRecipe(
5102 StoreWithMinAlign->getUnderlyingInstr(),
5103 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
5104 /*Mask=*/nullptr, *LastStore, CommonMetadata);
5105 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
5106
5107 // Remove all predicated stores from the group.
5108 for (VPReplicateRecipe *Store : Group)
5109 Store->eraseFromParent();
5110 }
5111}
5112
// For a plan whose trip count is a constant live-in, computes the vector trip
// count with SCEV for the chosen BestVF and BestUF and, if the result folds to
// a constant, records it as the underlying value of the plan's vector trip
// count.
// NOTE(review): listing lines 5113 and 5115 (parts of the signature) and 5127
// (part of the bail-out condition) are missing here.
5114 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
5116 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
5117 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
5118
// Nothing to do when the trip count is unused.
5119 VPValue *TC = Plan.getTripCount();
5120 if (TC->getNumUsers() == 0)
5121 return;
5122
5123 // Skip cases for which the trip count may be non-trivial to materialize.
5124 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
5125 // tail is required.
// Trip counts that are not live-in IR values are skipped as well.
5126 if (!Plan.hasScalarTail() ||
5128 Plan.getScalarPreheader() ||
5129 !isa<VPIRValue>(TC))
5130 return;
5131
5132 // Materialize vector trip counts for constants early if it can simply
5133 // be computed as (Original TC / VF * UF) * VF * UF.
5134 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
5135 // tail-folded loops.
5136 ScalarEvolution &SE = *PSE.getSE();
5137 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
5138 if (!isa<SCEVConstant>(TCScev))
5139 return;
// Round the constant trip count down to a multiple of VF * UF.
5140 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
5141 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
// The product may not fold to a constant; in that case nothing is recorded.
5142 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
5143 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5144}
5145
// Replaces all uses of the backedge-taken count (BTC) with an explicit
// "trip count - 1" computation emitted at the start of VectorPH. Does nothing
// when BTC has no users.
// NOTE(review): the listing lines with the start of the signature (5146) and
// the definition of BTC (5148) are missing here.
5147 VPBasicBlock *VectorPH) {
5149 if (BTC->getNumUsers() == 0)
5150 return;
5151
5152 VPBuilder Builder(VectorPH, VectorPH->begin());
// The constant 1 is created in the trip count's scalar type.
5153 auto *TCTy = Plan.getTripCount()->getScalarType();
5154 auto *TCMO =
5155 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5156 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5157 BTC->replaceAllUsesWith(TCMO);
5158}
5159
// Body of the transform that makes scalar<->vector conversions explicit: it
// inserts build-vector VPInstructions after replicating definitions that have
// vector users, and Unpack VPInstructions after definitions that have scalar
// users outside replicate regions.
// NOTE(review): listing lines 5160 (signature), 5166, 5177, 5187, 5194-5195,
// 5213 and 5222 are missing here, including the filters at the top of both
// recipe loops and the choice of the build-vector opcode.
5161 if (Plan.hasScalarVFOnly())
5162 return;
5163
5164 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5165 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5167 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5168 vp_depth_first_shallow(LoopRegion->getEntry()));
5169 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5170 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5171 // regions. Those are not materialized explicitly yet.
5172 // TODO: materialize build vectors for replicating recipes in replicating
5173 // regions.
5174 for (VPBasicBlock *VPBB :
5175 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5176 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5178 continue;
5179 auto *DefR = cast<VPSingleDefRecipe>(&R);
// True for users that need a vector value, or that live in a region other
// than the top-level loop region.
5180 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5181 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5182 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5183 };
// Skip single-scalar definitions and definitions without such users.
5184 if ((isa<VPReplicateRecipe>(DefR) &&
5185 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5186 (isa<VPInstruction>(DefR) &&
5188 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5189 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5190 continue;
5191
// The opcode depends on whether the scalar type is a struct type.
5192 Type *ScalarTy = DefR->getScalarType();
5193 unsigned Opcode = ScalarTy->isStructTy()
5196 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5197 BuildVector->insertAfter(DefR);
5198
// Redirect the matching users; the build-vector itself keeps DefR.
5199 DefR->replaceUsesWithIf(
5200 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5201 VPUser &U, unsigned) {
5202 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5203 });
5204 }
5205 }
5206
5207 // Create explicit VPInstructions to convert vectors to scalars. The current
5208 // implementation is conservative - it may miss some cases that may or may not
5209 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5210 // if they are known to operate on scalar values.
5211 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5212 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5214 VPDerivedIVRecipe>(&R))
5215 continue;
5216 for (VPValue *Def : R.definedValues()) {
5217 // Skip recipes that are single-scalar or only have their first lane
5218 // used.
5219 // TODO: The Defs skipped here may or may not be vector values.
5220 // Introduce Unpacks, and remove them later, if they are guaranteed to
5221 // produce scalar values.
5223 continue;
5224
5225 // At the moment, we create unpacks only for scalar users outside
5226 // replicate regions. Recipes inside replicate regions still extract the
5227 // required lanes implicitly.
5228 // TODO: Remove once replicate regions are unrolled completely.
5229 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5230 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5231 return U->usesScalars(Def) &&
5232 (!ParentRegion || !ParentRegion->isReplicator());
5233 };
5234 if (none_of(Def->users(), IsCandidateUnpackUser))
5235 continue;
5236
// For phi recipes the Unpack is placed at the block's first non-phi
// position; otherwise it is placed directly after the defining recipe.
5237 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
5238 if (R.isPhi())
5239 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5240 else
5241 Unpack->insertAfter(&R);
5242 Def->replaceUsesWithIf(Unpack,
5243 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5244 return IsCandidateUnpackUser(&U);
5245 });
5246 }
5247 }
5248 }
5249}
5250
// Emits the computation of the vector trip count into VectorPHVPBB and
// replaces all uses of the plan's symbolic vector trip count with it.
//  - TailByMasking: round the trip count up to a multiple of Step first.
//  - RequiresScalarEpilogue: leave at least one iteration for the scalar
//    loop by using Step as the remainder when Step divides the trip count.
//  - Step: the number of elements processed per vector iteration.
//  - MaxRuntimeStep: optional upper bound on the runtime value of Step; if a
//    constant trip count is a multiple of it, the trip count is used directly.
// Does nothing when the vector trip count has no users or already has an
// underlying value.
// NOTE(review): the listing line with the start of the signature (5251) is
// missing here.
5252 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
5253 bool RequiresScalarEpilogue, VPValue *Step,
5254 std::optional<uint64_t> MaxRuntimeStep) {
5255 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5256 // There's nothing to do if there are no users of the vector trip count or its
5257 // IR value has already been set.
5258 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5259 return;
5260
5261 VPValue *TC = Plan.getTripCount();
5262 Type *TCTy = TC->getScalarType();
5263 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5264 if (auto *StepR = Step->getDefiningRecipe()) {
5265 assert(VPDominatorTree(Plan).dominates(StepR->getParent(), VectorPHVPBB) &&
5266 "Step VPBB must dominate VectorPHVPBB");
5267 // Insert after Step's definition to maintain valid def-use ordering.
5268 InsertPt = std::next(StepR->getIterator());
5269 }
5270 VPBuilder Builder(VectorPHVPBB, InsertPt);
5271
5272 // For scalable steps, if TC is a constant and is divisible by the maximum
5273 // possible runtime step, then TC % Step == 0 for all valid vscale values
5274 // and the vector trip count equals TC directly.
5275 const APInt *TCVal;
5276 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
5277 TCVal->getZExtValue() % *MaxRuntimeStep == 0) {
5278 VectorTC.replaceAllUsesWith(TC);
5279 return;
5280 }
5281
5282 // If the tail is to be folded by masking, round the number of iterations N
5283 // up to a multiple of Step instead of rounding down. This is done by first
5284 // adding Step-1 and then rounding down. Note that it's ok if this addition
5285 // overflows: the vector induction variable will eventually wrap to zero given
5286 // that it starts at zero and its Step is a power of two; the loop will then
5287 // exit, with the last early-exit vector comparison also producing all-true.
5288 if (TailByMasking) {
5289 TC = Builder.createAdd(
5290 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5291 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5292 }
5293
5294 // Now we need to generate the expression for the part of the loop that the
5295 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5296 // iterations are not required for correctness, or N - Step, otherwise. Step
5297 // is equal to the vectorization factor (number of SIMD elements) times the
5298 // unroll factor (number of SIMD instructions).
5299 VPValue *R =
5300 Builder.createNaryOp(Instruction::URem, {TC, Step},
5301 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5302
5303 // There are cases where we *must* run at least one iteration in the remainder
5304 // loop. See the cost model for when this can happen. If the step evenly
5305 // divides the trip count, we set the remainder to be equal to the step. If
5306 // the step does not evenly divide the trip count, no adjustment is necessary
5307 // since there will already be scalar iterations. Note that the minimum
5308 // iterations check ensures that N >= Step.
5309 if (RequiresScalarEpilogue) {
5310 assert(!TailByMasking &&
5311 "requiring scalar epilogue is not supported with tail folding");
5312 VPValue *IsZero =
5313 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5314 R = Builder.createSelect(IsZero, Step, R);
5315 }
5316
// Vector trip count = (possibly rounded-up) trip count minus the remainder.
5317 VPValue *Res =
5318 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5319 VectorTC.replaceAllUsesWith(Res);
5320}
5321
5323 ElementCount VFEC) {
// Replaces the plan's symbolic VF and VFxUF values with concrete values that
// are built at the start of VectorPH from the element count VFEC and the
// plan's concrete UF.
// NOTE(review): the first signature line (listing line 5322) is absent from
// this extraction; parameter roles are inferred from their uses below.
5324 // If VF and VFxUF have already been materialized (no remaining users),
5325 // there's nothing more to do.
5326 if (Plan.getVF().isMaterialized()) {
5327 assert(Plan.getVFxUF().isMaterialized() &&
5328 "VF and VFxUF must be materialized together");
5329 return;
5330 }
5331
5332 VPBuilder Builder(VectorPH, VectorPH->begin());
// All factors are created in the scalar type of the trip count.
5333 Type *TCTy = Plan.getTripCount()->getScalarType();
5334 VPValue &VF = Plan.getVF();
5335 VPValue &VFxUF = Plan.getVFxUF();
5336 // If there are no users of the runtime VF, compute VFxUF by constant folding
5337 // the multiplication of VF and UF.
5338 if (VF.getNumUsers() == 0) {
5339 VPValue *RuntimeVFxUF =
5340 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5341 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5342 return;
5343 }
5344
5345 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5346 // vscale) * UF.
5347 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
// NOTE(review): listing lines 5348 and 5350 are absent from this extraction.
// Presumably they open a conditional and a replaceUsesWithIf-style call on VF;
// the visible predicate selects the users that do not use VF as a scalar, and
// those receive the broadcast BC instead of the scalar RuntimeVF. Confirm
// against upstream.
5349 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5351 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5352 }
// Every remaining user of VF gets the scalar runtime VF.
5353 VF.replaceAllUsesWith(RuntimeVF);
5354
// The {true, false} pair is presumably the {NUW, NSW} wrap flags of the
// multiply - confirm against VPBuilder::createOverflowingOp.
5355 VPValue *MulByUF = Builder.createOverflowingOp(
5356 Instruction::Mul,
5357 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5358 {true, false});
5359 VFxUF.replaceAllUsesWith(MulByUF);
5360}
5361
// Combines the loop's header mask with a placeholder alias mask so that every
// existing user of the header mask observes "HeaderMask & AliasMask".
// NOTE(review): the signature line (listing line 5362) is absent from this
// extraction; this block starts at the first statement of the body.
5363 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
// NOTE(review): HeaderMask is dereferenced without a null check; presumably
// callers only invoke this on plans that have a header mask - confirm.
5364 auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
5365 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5366
// Create the operand-less IncomingAliasMask placeholder (of i1 scalar type) in
// the vector preheader. Presumably it is replaced by the real alias mask once
// that has been materialized - confirm against the users of
// vputils::findIncomingAliasMask.
5367 VPBuilder Builder(Plan.getVectorPreheader());
5368 auto *AliasMask = Builder.createNaryOp(
5369 VPInstruction::IncomingAliasMask, {}, nullptr, {}, {},
5370 DebugLoc::getUnknown(), "incoming.alias.mask", I1Ty);
5371
// The AND has to be placed after the header mask's definition. If the header
// mask is a phi, insert at the first non-phi recipe of its block; otherwise
// insert directly after the defining recipe.
5372 if (HeaderMaskDef->isPhi())
5373 Builder = VPBuilder(&*HeaderMaskDef->getParent()->getFirstNonPhi());
5374 else
5375 Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);
5376
5377 // Update all existing users of the header mask to "HeaderMask & AliasMask".
5378 auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
// Skip the AND itself: it must keep the original header mask as its operand.
5379 HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
5380 return &U != ClampedHeaderMask;
5381 });
5382}
5383
5384VPValue *
5386 ArrayRef<PointerDiffInfo> DiffChecks) {
// Builds the alias mask for all pointer-difference checks in DiffChecks inside
// AliasCheckVPBB, replaces the plan's incoming-alias-mask placeholder with it,
// and returns the number of active lanes of that mask (the "clamped VF").
// NOTE(review): listing line 5385 (function name and leading parameters) is
// absent from this extraction; the name is taken from a call elsewhere in the
// file.
5387 VPBuilder Builder(AliasCheckVPBB);
5388 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5389
5390 VPValue *IncomingAliasMask = vputils::findIncomingAliasMask(Plan);
5391 assert(IncomingAliasMask && "Expected an alias mask!");
5392
// AND together one write-after-read dependence mask per check.
5393 VPValue *AliasMask = nullptr;
5394 for (const PointerDiffInfo &Check : DiffChecks) {
// NOTE(review): listing lines 5395 and 5397 are absent from this extraction;
// presumably they define Src and the initializer of Sink from Check - confirm
// against upstream.
5396 VPValue *Sink =
5398 Type *AddrType = Src->getScalarType();
5399
5400 // TODO: Only freeze the required pointer (not both src and sink).
5401 if (Check.NeedsFreeze) {
5402 Src = Builder.createScalarFreeze(Src, AddrType, DebugLoc::getUnknown());
5403 Sink = Builder.createScalarFreeze(Sink, AddrType, DebugLoc::getUnknown());
5404 }
5405
5406 // TODO: Generate loop_dependence_raw_mask when there's a read-after-write
5407 // dependency between the source and the sink. This is not necessary for
5408 // correctness of the mask, but using the "raw" variant prevents loads
5409 // depending on the completion of stores.
5410 VPWidenIntrinsicRecipe *WARMask = Builder.insert(new VPWidenIntrinsicRecipe(
5411 Intrinsic::loop_dependence_war_mask,
5412 {Src, Sink, Plan.getConstantInt(AddrType, Check.AccessSize)}, I1Ty));
5413
5414 if (AliasMask)
5415 AliasMask = Builder.createAnd(AliasMask, WARMask);
5416 else
5417 AliasMask = WARMask;
5418 }
5419
// NOTE(review): AliasMask is still null here if DiffChecks is empty;
// presumably callers guarantee at least one check - confirm.
// NOTE(review): listing line 5420 is absent from this extraction; presumably
// it defines IVTy, which is used below - confirm against upstream.
5421 Type *IndexTy = Plan.getDataLayout().getIndexType(Plan.getContext(), 0);
// Count the active lanes of the mask in the index type, then zero-extend or
// truncate the count to IVTy.
5422 VPValue *NumActive = Builder.createNaryOp(
5423 VPInstruction::NumActiveLanes, {AliasMask}, nullptr, {}, {},
5424 DebugLoc::getUnknown(), "num.active.lanes", IndexTy);
5425 VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
5426 NumActive, IVTy, IndexTy, DebugLoc::getCompilerGenerated());
5427
// Replace the placeholder with the mask that was just built.
5428 IncomingAliasMask->replaceAllUsesWith(AliasMask);
5429
5430 return ClampedVF;
5431}
5432
5434 VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
// Creates the "vector.clamped.vf.check" block: materializes the alias mask and
// its clamped VF there, attaches a check built from that clamped VF via
// attachVPCheckBlock, and finally replaces VF and VFxUF with the clamped VF.
// NOTE(review): the first signature line (listing line 5433) is absent from
// this extraction.
5435 VPBasicBlock *ClampedVFCheck =
5436 Plan.createVPBasicBlock("vector.clamped.vf.check");
5437
5438 VPValue *ClampedVF = materializeAliasMask(Plan, ClampedVFCheck, DiffChecks);
5439 VPBuilder Builder(ClampedVFCheck);
// NOTE(review): listing line 5440 is absent from this extraction; presumably
// it defines the debug location DL used below - confirm against upstream.
5441 Type *TCTy = Plan.getTripCount()->getScalarType();
5442
5443 // Detect a "ClampedVF" from the alias mask that is not larger than one.
5444 VPValue *IsScalar =
5445 Builder.createICmp(CmpInst::ICMP_ULE, ClampedVF,
5446 Plan.getConstantInt(TCTy, 1), DL, "vf.is.scalar");
5447
5448 VPValue *TripCount = Plan.getTripCount();
// NOTE(review): listing line 5450 (the initializer of MaxUIntTripCount) is
// absent from this extraction; going by the name and the comment below it is
// presumably the maximum unsigned value of the trip count type - confirm.
5449 VPValue *MaxUIntTripCount =
5451 VPValue *DistanceToMax = Builder.createSub(MaxUIntTripCount, TripCount);
5452
5453 // For tail-folding: Don't execute the vector loop if (UMax - n) < ClampedVF.
5454 // Note: The ClampedVF may not be a power-of-two. This means the loop exit
5455 // condition (index.next == n.vec) may not be correct in the case of an
5456 // overflow. The issue is `n.vec` could be zero due to an overflow, but
5457 // index.next is not guaranteed to overflow to zero (as the ClampedVF is not
5458 // a power-of-two).
5459 VPValue *TripCountCheck = Builder.createICmp(
5460 ICmpInst::ICMP_ULT, DistanceToMax, ClampedVF, DL, "vf.step.overflow");
5461
// The check condition is true if either of the two conditions above holds.
5462 VPValue *Cond = Builder.createOr(IsScalar, TripCountCheck, DL);
5463 attachVPCheckBlock(Plan, Cond, ClampedVFCheck, HasBranchWeights);
5464
5465 // Materialize the trip count early as this will add a use of (VFxUF) that
5466 // needs to be replaced with the ClampedVF.
// NOTE(review): listing line 5467 (the start of this call) is absent from this
// extraction; going by the comment above and the argument names it is
// presumably a materializeVectorTripCount call - confirm.
5468 /*TailByMasking=*/true,
5469 /*RequiresScalarEpilogue=*/false,
5470 &Plan.getVFxUF());
5471
// With UF == 1, VF and VFxUF are the same quantity, so both are replaced by
// the clamped VF.
5472 assert(Plan.getConcreteUF() == 1 &&
5473 "Clamped VF not supported with interleaving");
5474 Plan.getVF().replaceAllUsesWith(ClampedVF);
5475 Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
5476}
5477
5479 ScalarEvolution &SE) {
// Expands the VPExpandSCEVRecipes in the plan's entry block that still have
// users into VPInstructions, erasing each recipe that was expanded.
// NOTE(review): the first signature line (listing line 5478) is absent from
// this extraction.
5480 auto *Entry = Plan.getEntry();
5481 VPBuilder Builder(Entry, Entry->begin());
// NOTE(review): listing line 5482 is absent from this extraction; it
// presumably starts the definition of DL, which the chain below completes with
// the debug location of an IR basic block's terminator - confirm.
5483 ->getIRBasicBlock()
5484 ->getTerminator()
5485 ->getDebugLoc();
5486 VPSCEVExpander Expander(Builder, SE, DL);
5487
5488 // Expand VPExpandSCEVRecipes to VPInstructions using VPSCEVExpander. During
5489 // the transition, unsupported VPExpandSCEVRecipes are skipped and left for
5490 // late expansion.
// Early-inc iteration is required because expanded recipes are erased inside
// the loop.
5491 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5492 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5493 if (!ExpSCEV || ExpSCEV->getNumUsers() == 0)
5494 continue;
// Emit the expansion right before the recipe it replaces.
5495 Builder.setInsertPoint(ExpSCEV);
5496 VPValue *Expanded = Expander.tryToExpand(ExpSCEV->getSCEV());
// A null result means the expression is not supported; keep the recipe.
5497 if (!Expanded)
5498 continue;
5499 ExpSCEV->replaceAllUsesWith(Expanded);
5500 // TripCount should not be used after expansion to VPInstructions. Reset to
5501 // poison to avoid dangling references.
5502 if (Plan.getTripCount() == ExpSCEV)
5503 Plan.resetTripCount(
5504 Plan.getOrAddLiveIn(PoisonValue::get(ExpSCEV->getScalarType())));
5505 ExpSCEV->eraseFromParent();
5506 }
5507}
5508
// Expands every VPExpandSCEVRecipe left in the plan's entry block to IR
// instructions in front of the entry IR block's terminator, and returns a map
// from each expanded SCEV to the IR value generated for it.
// NOTE(review): the signature (listing lines 5509-5510) is absent from this
// extraction; this block starts at the first statement of the body.
5511 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5512
5513 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5514 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5515 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5516 // Expand remaining VPExpandSCEVRecipes to IR instructions using SCEVExpander.
5517 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5518 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5519 if (!ExpSCEV)
5520 continue;
5521 const SCEV *Expr = ExpSCEV->getSCEV();
5522 Value *Res =
5523 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5524 ExpandedSCEVs[Expr] = Res;
// The expanded IR value enters the plan as a live-in and takes over all uses
// of the recipe, including the plan's trip count if the recipe defined it.
5525 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5526 ExpSCEV->replaceAllUsesWith(Exp);
5527 if (Plan.getTripCount() == ExpSCEV)
5528 Plan.resetTripCount(Exp);
5529 ExpSCEV->eraseFromParent();
5530 }
// NOTE(review): listing line 5531 (the start of this assert) is absent from
// this extraction.
5532 "all VPExpandSCEVRecipes must have been expanded");
5533 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5534 // to the VPIRBasicBlock.
// Walk the IR instructions (all but the last one) and the recipes in lockstep:
// an instruction that matches the VPIRInstruction at the cursor EI is already
// modeled and is skipped.
5535 auto EI = Entry->begin();
5536 for (Instruction &I : drop_end(*EntryBB)) {
5537 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5538 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5539 EI++;
5540 continue;
5541 }
// NOTE(review): listing line 5542 is absent from this extraction; presumably
// it wraps I in a new VPIRInstruction inserted at EI - confirm.
5543 }
5544
5545 return ExpandedSCEVs;
5546}
5547
5548/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5549/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5550/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5551/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5552/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5553/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5554/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5555/// is defined at \p Idx of a load interleave group.
5556static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5557 VPValue *OpV, unsigned Idx, bool IsScalable) {
5558 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5559 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5560 if (!Member0OpR)
5561 return Member0Op == OpV;
5562 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5563 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5564 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5565 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5566 Member0Op == OpV;
5567 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5568 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5569 return false;
5570}
5571
// Returns true if the values in \p Ops, one per member of a store interleave
// group, are defined by recipes with the same opcode or intrinsic ID whose
// operands can in turn be narrowed, either recursively through this function
// or via canNarrowLoad.
5572static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
// NOTE(review): listing line 5573 is absent from this extraction. Ops[0] is
// accessed unconditionally below; presumably the missing line guards against
// an empty Ops - confirm against upstream.
5574 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5575 if (!WideMember0)
5576 return false;
// Every member must match member 0's opcode or intrinsic ID.
5577 for (VPValue *V : Ops) {
// NOTE(review): listing line 5578 (the condition guarding this early return)
// is absent from this extraction; presumably it rejects values that are not
// defined by a supported wide recipe - confirm.
5579 return false;
5580 auto *R = cast<VPSingleDefRecipe>(V);
5581 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5582 return false;
5583 }
5584
// Check operand position Idx across all members at once.
5585 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
// NOTE(review): listing line 5586 is absent from this extraction; presumably
// it declares the container OpsI that is filled below - confirm.
5587 for (VPValue *Op : Ops)
5588 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5589
// First try to narrow the operands as a tree of matching wide recipes.
5590 if (canNarrowOps(OpsI, IsScalable))
5591 continue;
5592
// Otherwise every operand must be a narrowable load. Note the argument order:
// this loop's Idx is canNarrowLoad's operand index, and the enumeration index
// is the member index.
5593 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5594 const auto &[OpIdx, OpV] = P;
5595 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5596 }))
5597 return false;
5598 }
5599
5600 return true;
5601}
5602
5603/// Returns the first VF from \p VFs for which \p InterleaveR is an unmasked,
5604/// full interleave group with factor and number of members both equal to VF,
5605/// all members have the same scalar type, and the group accesses the full
/// vector register width. Returns std::nullopt if \p InterleaveR is null or no
/// VF qualifies.
5606static std::optional<ElementCount>
// NOTE(review): listing lines 5607-5608 (function name and leading parameters)
// are absent from this extraction; the name is taken from a call elsewhere in
// the file.
5609 const TargetTransformInfo &TTI) {
5610 if (!InterleaveR || InterleaveR->getMask())
5611 return std::nullopt;
5612
// Determine the common element type of the group: from the defined values if
// nothing is stored (a load group), otherwise from the stored values. Bail out
// if the members disagree.
5613 Type *GroupElementTy = nullptr;
5614 if (InterleaveR->getStoredValues().empty()) {
5615 GroupElementTy = InterleaveR->getVPValue(0)->getScalarType();
5616 if (!all_of(InterleaveR->definedValues(), [GroupElementTy](VPValue *Op) {
5617 return Op->getScalarType() == GroupElementTy;
5618 }))
5619 return std::nullopt;
5620 } else {
5621 GroupElementTy = InterleaveR->getStoredValues()[0]->getScalarType();
5622 if (!all_of(InterleaveR->getStoredValues(), [GroupElementTy](VPValue *Op) {
5623 return Op->getScalarType() == GroupElementTy;
5624 }))
5625 return std::nullopt;
5626 }
5627
// The group must have no gaps: as many members as its factor.
5628 auto IG = InterleaveR->getInterleaveGroup();
5629 if (IG->getFactor() != IG->getNumMembers())
5630 return std::nullopt;
5631
// Returns the known-minimum register bit width the target reports for VF.
5632 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
// NOTE(review): listing lines 5634-5635 (the arguments of getRegisterBitWidth)
// are absent from this extraction; presumably they select the fixed-width or
// scalable vector register kind depending on VF - confirm.
5633 TypeSize Size = TTI.getRegisterBitWidth(
5636 assert(Size.isScalable() == VF.isScalable() &&
5637 "if Size is scalable, VF must be scalable and vice versa");
5638 return Size.getKnownMinValue();
5639 };
5640
// A VF qualifies if the factor equals its known-minimum lane count and
// factor * element size fills the register exactly.
5641 for (ElementCount VF : VFs) {
5642 unsigned MinVal = VF.getKnownMinValue();
5643 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5644 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5645 return {VF};
5646 }
5647 return std::nullopt;
5648}
5649
5650/// Returns true if \p VPValue is a narrow VPValue.
5651static bool isAlreadyNarrow(VPValue *VPV) {
5652 if (isa<VPIRValue>(VPV))
5653 return true;
5654 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5655 return RepR && RepR->isSingleScalar();
5656}
5657
5658// Convert the wide recipes defining the VPValues in \p Members feeding an
5659// interleave group to a single narrow variant. The first member is reused as
5660// the narrowed recipe.
// Values created here are recorded in \p NarrowedOps; a first member that is
// already in that set, has no defining recipe, or is already narrow is
// returned unchanged.
5661static VPValue *
// NOTE(review): listing line 5662 (function name and first parameter) is
// absent from this extraction; the name is taken from the recursive call
// below.
5663 SmallPtrSetImpl<VPValue *> &NarrowedOps) {
5664 VPValue *V = Members.front();
5665 auto *R = V->getDefiningRecipe();
5666 if (!R || NarrowedOps.contains(V))
5667 return V;
5668
5669 if (isAlreadyNarrow(V))
5670 return V;
5671
// NOTE(review): listing line 5672 is absent from this extraction; presumably
// it opens an `if` selecting the wide (non-load) recipe kinds, which the brace
// after `return V;` below closes - confirm.
5673 auto *WideMember0 = cast<VPRecipeWithIRFlags>(R);
// Member 0 stands in for all members, so it may only keep the IR flags that
// every member carries.
5674 for (VPValue *Member : Members.drop_front())
5675 WideMember0->intersectFlags(*cast<VPRecipeWithIRFlags>(Member));
// Recursively narrow each operand position, gathering that position's operand
// from every member.
5676 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) {
// NOTE(review): listing line 5677 is absent from this extraction; presumably
// it declares the container OpsI that is filled below - confirm.
5678 for (VPValue *Member : Members)
5679 OpsI.push_back(Member->getDefiningRecipe()->getOperand(Idx));
5680 WideMember0->setOperand(Idx, narrowInterleaveGroupOp(OpsI, NarrowedOps));
5681 }
5682 return V;
5683 }
5684
5685 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5686 // Narrow interleave group to wide load, as transformed VPlan will only
5687 // process one original iteration.
5688 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5689 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5690 LoadGroup->getMask(), /*Consecutive=*/true,
5691 *LoadGroup, LoadGroup->getDebugLoc());
5692 L->insertBefore(LoadGroup);
5693 NarrowedOps.insert(L);
5694 return L;
5695 }
5696
// A replicate recipe reaching this point is expected to be a single-scalar
// load; it is recorded and reused as is.
5697 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5698 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5699 "must be a single scalar load");
5700 NarrowedOps.insert(RepR);
5701 return RepR;
5702 }
5703
// The only remaining case is a wide load. Look through a vector-pointer recipe
// to its first operand, which is used as the address of the scalar load.
5704 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5705 VPValue *PtrOp = WideLoad->getAddr();
5706 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5707 PtrOp = VecPtr->getOperand(0);
5708 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5709 // process one original iteration.
5710 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5711 /*IsUniform*/ true,
5712 /*Mask*/ nullptr, {}, *WideLoad);
5713 N->insertBefore(WideLoad);
5714 NarrowedOps.insert(N);
5715 return N;
5716}
5717
5718std::unique_ptr<VPlan>
// NOTE(review): listing line 5719 (function name and leading parameters) is
// absent from this extraction.
5720 const TargetTransformInfo &TTI) {
// Tries to find a single VF for which all interleave groups in the vector loop
// of Plan can be narrowed, and if so rewrites Plan for that VF. Returns
// nullptr if nothing was changed, or if Plan had only that one VF; otherwise
// returns a duplicate of the original plan that keeps all other VFs.
5721 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5722
5723 if (!VectorLoop)
5724 return nullptr;
5725
5726 // Only handle single-block loops for now.
5727 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5728 return nullptr;
5729
5730 // Skip plans when we may not be able to properly narrow.
5731 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5732 if (!match(&Exiting->back(), m_BranchOnCount()))
5733 return nullptr;
5734
// NOTE(review): listing line 5736 (the middle of this assert's pattern) is
// absent from this extraction.
5735 assert(match(&Exiting->back(),
5737 m_Specific(&Plan.getVectorTripCount()))) &&
5738 "unexpected branch-on-count");
5739
// NOTE(review): listing line 5740 is absent from this extraction; presumably
// it declares the StoreGroups container used below - confirm.
5741 std::optional<ElementCount> VFToOptimize;
5742 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
// NOTE(review): listing lines 5743-5744 (the condition guarding this
// `continue`) are absent from this extraction; presumably they skip recipes
// that need no checking, such as the canonical IV - confirm.
5745 continue;
5746
5747 // Bail out on recipes not supported at the moment:
5748 // * phi recipes other than the canonical induction
5749 // * recipes writing to memory except interleave groups
5750 // Only support plans with a canonical induction phi.
5751 if (R.isPhi())
5752 return nullptr;
5753
5754 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5755 if (R.mayWriteToMemory() && !InterleaveR)
5756 return nullptr;
5757
5758 // Bail out if any recipe defines a vector value used outside the
5759 // vector loop region.
5760 if (any_of(R.definedValues(), [&](VPValue *V) {
5761 return any_of(V->users(), [&](VPUser *U) {
5762 auto *UR = cast<VPRecipeBase>(U);
5763 return UR->getParent()->getParent() != VectorLoop;
5764 });
5765 }))
5766 return nullptr;
5767
5768 // All other ops are allowed, but we reject uses that cannot be converted
5769 // when checking all allowed consumers (store interleave groups) below.
5770 if (!InterleaveR)
5771 continue;
5772
5773 // Try to find a single VF, where all interleave groups are consecutive and
5774 // saturate the full vector width. If we already have a candidate VF, check
5775 // if it is applicable for the current InterleaveR, otherwise look for a
5776 // suitable VF across the Plan's VFs.
// NOTE(review): listing line 5777 is absent from this extraction; presumably
// it declares VFs, which the conditional expression below initializes.
5778 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5779 : to_vector(Plan.vectorFactors());
5780 std::optional<ElementCount> NarrowedVF =
5781 isConsecutiveInterleaveGroup(InterleaveR, VFs, TTI);
5782 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5783 return nullptr;
5784 VFToOptimize = NarrowedVF;
5785
5786 // Skip read interleave groups.
5787 if (InterleaveR->getStoredValues().empty())
5788 continue;
5789
5790 // Narrow interleave groups, if all operands are already matching narrow
5791 // ops.
5792 auto *Member0 = InterleaveR->getStoredValues()[0];
5793 if (isAlreadyNarrow(Member0) &&
5794 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5795 StoreGroups.push_back(InterleaveR);
5796 continue;
5797 }
5798
5799 // For now, we only support full interleave groups storing load interleave
5800 // groups.
// The stored value at position i must be exactly the value defined at position
// i of a full load interleave group.
5801 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5802 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5803 if (!DefR)
5804 return false;
5805 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5806 return IR && IR->getInterleaveGroup()->isFull() &&
5807 IR->getVPValue(Op.index()) == Op.value();
5808 })) {
5809 StoreGroups.push_back(InterleaveR);
5810 continue;
5811 }
5812
5813 // Check if all values feeding InterleaveR are matching wide recipes whose
5814 // operands can be narrowed.
5815 if (!canNarrowOps(InterleaveR->getStoredValues(),
5816 VFToOptimize->isScalable()))
5817 return nullptr;
5818 StoreGroups.push_back(InterleaveR);
5819 }
5820
5821 if (StoreGroups.empty())
5822 return nullptr;
5823
// A scalar epilogue is required if the middle block's only successor is the
// scalar preheader.
5824 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5825 bool RequiresScalarEpilogue =
5826 MiddleVPBB->getNumSuccessors() == 1 &&
5827 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5828 // Bail out for tail-folding (middle block with a single successor to exit).
5829 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5830 return nullptr;
5831
5832 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5833 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5834 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5835 // TODO: Handle cases where only some interleave groups can be narrowed.
5836 std::unique_ptr<VPlan> NewPlan;
5837 if (size(Plan.vectorFactors()) != 1) {
5838 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5839 Plan.setVF(*VFToOptimize);
5840 NewPlan->removeVF(*VFToOptimize);
5841 }
5842
5843 // Values that have already been narrowed, shared across all store groups.
5844 SmallPtrSet<VPValue *, 4> NarrowedOps;
5845 // Narrow operation tree rooted at store groups.
5846 for (auto *StoreGroup : StoreGroups) {
5847 VPValue *Res =
5848 narrowInterleaveGroupOp(StoreGroup->getStoredValues(), NarrowedOps);
// Replace the store group with one unmasked, consecutive wide store of the
// narrowed value.
5849 auto *SI =
5850 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5851 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
5852 /*Consecutive=*/true, *StoreGroup,
5853 StoreGroup->getDebugLoc());
5854 S->insertBefore(StoreGroup);
5855 StoreGroup->eraseFromParent();
5856 }
5857
5858 // Adjust induction to reflect that the transformed plan only processes one
5859 // original iteration.
// NOTE(review): listing line 5860 is absent from this extraction; presumably
// it defines CanIVInc, the canonical IV increment updated below - confirm.
5861 Type *CanIVTy = VectorLoop->getCanonicalIVType();
5862 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5863 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5864
// The narrowed step is UF for fixed-width VFs and vscale * UF for scalable
// VFs; VF itself is replaced by 1 or vscale respectively.
5865 VPValue *UF = &Plan.getUF();
5866 VPValue *Step;
5867 if (VFToOptimize->isScalable()) {
5868 VPValue *VScale =
5869 PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
5870 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5871 {true, false});
5872 Plan.getVF().replaceAllUsesWith(VScale);
5873 } else {
5874 Step = UF;
5875 Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
5876 }
5877 // Materialize vector trip count with the narrowed step.
5878 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5879 RequiresScalarEpilogue, Step);
5880
5881 CanIVInc->setOperand(1, Step);
5882 Plan.getVFxUF().replaceAllUsesWith(Step);
5883
5884 removeDeadRecipes(Plan);
// NOTE(review): listing line 5886 (the predicate of this assert) is absent
// from this extraction.
5885 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5887 "All VPVectorPointerRecipes should have been removed");
5888 return NewPlan;
5889}
5890
5891/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5892/// BranchOnCond recipe.
/// The weights are {1, VectorStep - 1}, where VectorStep is UF times the
/// known-minimum \p VF, further multiplied by \p VScaleForTuning when \p VF is
/// scalable and a tuning value is provided.
// NOTE(review): listing line 5893 (the start of the signature) is absent from
// this extraction.
5894 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5895 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
// NOTE(review): listing line 5897 (the initializer of MiddleTerm) is absent
// from this extraction; presumably it casts the middle block's terminator to a
// VPInstruction, yielding null when there is none - confirm.
5896 auto *MiddleTerm =
5898 // Only add branch metadata if there is a (conditional) terminator.
5899 if (!MiddleTerm)
5900 return;
5901
5902 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5903 "must have a BranchOnCond");
5904 // Assume that `TripCount % VectorStep ` is equally distributed.
5905 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5906 if (VF.isScalable() && VScaleForTuning.has_value())
5907 VectorStep *= *VScaleForTuning;
// VectorStep - 1 below must not wrap around, hence the non-zero requirement.
5908 assert(VectorStep > 0 && "trip count should not be zero");
5909 MDBuilder MDB(Plan.getContext());
5910 MDNode *BranchWeights =
5911 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5912 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5913}
5914
5916 VFRange &Range) {
// For every first-order recurrence phi in the vector loop header, rewrites the
// exit-phi users of the middle block's last-lane extract of the recurrence
// splice so that they use the penultimate element of the recurrence instead.
// NOTE(review): the first signature line (listing line 5915) is absent from
// this extraction.
5917 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5918 auto *MiddleVPBB = Plan.getMiddleBlock();
5919 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5920
5921 auto IsScalableOne = [](ElementCount VF) -> bool {
5922 return VF == ElementCount::getScalable(1);
5923 };
5924
5925 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5926 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5927 if (!FOR)
5928 continue;
5929
5930 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5931 "Cannot handle loops with uncountable early exits");
5932
5933 // Find the existing splice for this FOR, created in
5934 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
5935 // RecurSplice there; only RecurSplice itself still references FOR.
// NOTE(review): listing line 5937 (the initializer of RecurSplice) is absent
// from this extraction.
5936 auto *RecurSplice =
5938 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
5939
5940 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5941 // penultimate value of the recurrence. Instead we rely on the existing
5942 // extract of the last element from the result of
5943 // VPInstruction::FirstOrderRecurrenceSplice.
5944 // TODO: Consider vscale_range info and UF.
// The first half of the condition holds if the splice has a user outside any
// region. NOTE(review): listing line 5947 (the start of the second half) is
// absent from this extraction; presumably it evaluates IsScalableOne over
// Range - confirm. Note that this returns from the whole function, not only
// from the current recurrence.
5945 if (any_of(RecurSplice->users(),
5946 [](VPUser *U) { return !cast<VPRecipeBase>(U)->getRegion(); }) &&
5948 Range))
5949 return;
5950
5951 // This is the second phase of vectorizing first-order recurrences, creating
5952 // extracts for users outside the loop. An overview of the transformation is
5953 // described below. Suppose we have the following loop with some use after
5954 // the loop of the last a[i-1],
5955 //
5956 // for (int i = 0; i < n; ++i) {
5957 // t = a[i - 1];
5958 // b[i] = a[i] - t;
5959 // }
5960 // use t;
5961 //
5962 // There is a first-order recurrence on "a". For this loop, the shorthand
5963 // scalar IR looks like:
5964 //
5965 // scalar.ph:
5966 // s.init = a[-1]
5967 // br scalar.body
5968 //
5969 // scalar.body:
5970 // i = phi [0, scalar.ph], [i+1, scalar.body]
5971 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5972 // s2 = a[i]
5973 // b[i] = s2 - s1
5974 // br cond, scalar.body, exit.block
5975 //
5976 // exit.block:
5977 // use = lcssa.phi [s1, scalar.body]
5978 //
5979 // In this example, s1 is a recurrence because its value depends on the
5980 // previous iteration. In the first phase of vectorization, we created a
5981 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5982 // for users in the scalar preheader and exit block.
5983 //
5984 // vector.ph:
5985 // v_init = vector(..., ..., ..., a[-1])
5986 // br vector.body
5987 //
5988 // vector.body
5989 // i = phi [0, vector.ph], [i+4, vector.body]
5990 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5991 // v2 = a[i, i+1, i+2, i+3]
5992 // v1' = splice(v1(3), v2(0, 1, 2))
5993 // b[i, i+1, i+2, i+3] = v2 - v1'
5994 // br cond, vector.body, middle.block
5995 //
5996 // middle.block:
5997 // vector.recur.extract.for.phi = v2(2)
5998 // vector.recur.extract = v2(3)
5999 // br cond, scalar.ph, exit.block
6000 //
6001 // scalar.ph:
6002 // scalar.recur.init = phi [vector.recur.extract, middle.block],
6003 // [s.init, otherwise]
6004 // br scalar.body
6005 //
6006 // scalar.body:
6007 // i = phi [0, scalar.ph], [i+1, scalar.body]
6008 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
6009 // s2 = a[i]
6010 // b[i] = s2 - s1
6011 // br cond, scalar.body, exit.block
6012 //
6013 // exit.block:
6014 // lo = lcssa.phi [s1, scalar.body],
6015 // [vector.recur.extract.for.phi, middle.block]
6016 //
6017 // Update extracts of the splice in the middle block: they extract the
6018 // penultimate element of the recurrence.
// NOTE(review): listing line 6019 (the start of this loop header) is absent
// from this extraction; it iterates over the non-phi recipes of the middle
// block as R.
6020 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
6021 if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(RecurSplice))))
6022 continue;
6023
// The penultimate element is extracted from the splice's second operand.
6024 auto *ExtractR = cast<VPInstruction>(&R);
6025 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
6026 VPInstruction::ExtractPenultimateElement, RecurSplice->getOperand(1),
6027 {}, "vector.recur.extract.for.phi");
// Only VPIRPhi users are redirected; all other users keep the last-lane
// extract. The user list is copied first because it is modified in the loop.
6028 for (VPUser *ExitU : to_vector(ExtractR->users())) {
6029 if (auto *ExitPhi = dyn_cast<VPIRPhi>(ExitU))
6030 ExitPhi->replaceUsesOfWith(ExtractR, PenultimateElement);
6031 }
6032 }
6033 }
6034}
6035
6036/// Check if \p V is a binary expression of a widened IV and a loop-invariant
6037/// value. Returns the widened IV if found, nullptr otherwise.
/// Integer division and remainder are not accepted as the binary operation.
// NOTE(review): the signature line (listing line 6038) is absent from this
// extraction; the name is taken from a call elsewhere in the file.
6039 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
6040 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
6041 Instruction::isIntDivRem(BinOp->getOpcode()))
6042 return nullptr;
6043
// Accept the IV in either operand position: if operand 0 is not a widened
// induction, try the operands in swapped roles.
6044 VPValue *WidenIVCandidate = BinOp->getOperand(0);
6045 VPValue *InvariantCandidate = BinOp->getOperand(1);
6046 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
6047 std::swap(WidenIVCandidate, InvariantCandidate);
6048
6049 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
6050 return nullptr;
6051
// Yields nullptr if neither operand is a widened induction.
6052 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
6053}
6054
6055/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
6056/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
/// \p BinOp itself is left untouched; the returned recipe is a clone of it.
// NOTE(review): listing lines 6057-6059 (the signature and the start of the
// assert completed by the next line) are absent from this extraction.
6060 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
6061 auto *ClonedOp = BinOp->clone();
// Replace whichever operand of the clone is the widened IV.
6062 if (ClonedOp->getOperand(0) == WidenIV) {
6063 ClonedOp->setOperand(0, ScalarIV);
6064 } else {
6065 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
6066 ClonedOp->setOperand(1, ScalarIV);
6067 }
6068 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
6069 return ClonedOp;
6070}
6071
6074 Loop &L) {
6075 ScalarEvolution &SE = *PSE.getSE();
6076 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
6077
6078 // Helper lambda to check if the IV range excludes the sentinel value. Try
6079 // signed first, then unsigned. Return an excluded sentinel if found,
6080 // otherwise return std::nullopt.
6081 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
6082 bool UseMax) -> std::optional<APSInt> {
6083 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
6084 for (bool Signed : {true, false}) {
6085 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
6086 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
6087
6088 ConstantRange IVRange =
6089 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
6090 if (!IVRange.contains(Sentinel))
6091 return Sentinel;
6092 }
6093 return std::nullopt;
6094 };
6095
6096 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
6097 for (VPRecipeBase &Phi :
6098 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
6099 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
6101 PhiR->getRecurrenceKind()))
6102 continue;
6103
6104 Type *PhiTy = PhiR->getScalarType();
6105 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
6106 continue;
6107
6108 // If there's a header mask, the backedge select will not be the find-last
6109 // select.
6110 VPValue *BackedgeVal = PhiR->getBackedgeValue();
6111 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
6112 if (HeaderMask &&
6113 !match(BackedgeVal,
6114 m_Select(m_Specific(HeaderMask),
6115 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
6116 continue;
6117
6118 // Get the find-last expression from the find-last select of the reduction
6119 // phi. The find-last select should be a select between the phi and the
6120 // find-last expression.
6121 VPValue *Cond, *FindLastExpression;
6122 if (!match(FindLastSelect, m_Select(m_VPValue(Cond), m_Specific(PhiR),
6123 m_VPValue(FindLastExpression))) &&
6124 !match(FindLastSelect,
6125 m_Select(m_VPValue(Cond), m_VPValue(FindLastExpression),
6126 m_Specific(PhiR))))
6127 continue;
6128
6129 // Check if FindLastExpression is a simple expression of a widened IV. If
6130 // so, we can track the underlying IV instead and sink the expression.
6131 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
6132 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
6133 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
6134 &L);
6135 const SCEV *Step;
6136 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6137 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
6139 "IVOfExpressionToSink not being an AddRec must imply "
6140 "FindLastExpression not being an AddRec.");
6141 continue;
6142 }
6143
6144 // Determine direction from SCEV step.
6145 if (!SE.isKnownNonZero(Step))
6146 continue;
6147
6148 // Positive step means we need UMax/SMax to find the last IV value, and
6149 // UMin/SMin otherwise.
6150 bool UseMax = SE.isKnownPositive(Step);
6151 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
6152 bool UseSigned = SentinelVal && SentinelVal->isSigned();
6153
6154 // Sinking an expression will disable epilogue vectorization. Only use it,
6155 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
6156 // also prevent vectorizing using a sentinel (e.g., if the expression is a
6157 // multiply or divide by large constant, respectively), which also makes
6158 // sinking undesirable.
6159 if (IVOfExpressionToSink) {
6160 const SCEV *FindLastExpressionSCEV =
6161 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
6162 if (match(FindLastExpressionSCEV,
6163 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6164 bool NewUseMax = SE.isKnownPositive(Step);
6165 if (auto NewSentinel =
6166 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
6167 // The original expression already has a sentinel, so prefer not
6168 // sinking to keep epilogue vectorization possible.
6169 SentinelVal = *NewSentinel;
6170 UseSigned = NewSentinel->isSigned();
6171 UseMax = NewUseMax;
6172 IVSCEV = FindLastExpressionSCEV;
6173 IVOfExpressionToSink = nullptr;
6174 }
6175 }
6176 }
6177
6178 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
6179 // if the condition was ever true. Requires the IV to not wrap, otherwise we
6180 // cannot use min/max.
6181 if (!SentinelVal) {
6182 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
6183 if (AR->hasNoSignedWrap())
6184 UseSigned = true;
6185 else if (AR->hasNoUnsignedWrap())
6186 UseSigned = false;
6187 else
6188 continue;
6189 }
6190
6192 BackedgeVal,
6194
6195 VPValue *NewFindLastSelect = BackedgeVal;
6196 VPValue *SelectCond = Cond;
6197 if (!SentinelVal || IVOfExpressionToSink) {
6198 // When we need to create a new select, normalize the condition so that
6199 // PhiR is the last operand and include the header mask if needed.
6200 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
6201 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
6202 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
6203 SelectCond = LoopBuilder.createNot(SelectCond);
6204
6205 // When tail folding, mask the condition with the header mask to prevent
6206 // propagating poison from inactive lanes in the last vector iteration.
6207 if (HeaderMask)
6208 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
6209
6210 if (SelectCond != Cond || IVOfExpressionToSink) {
6211 NewFindLastSelect = LoopBuilder.createSelect(
6212 SelectCond,
6213 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
6214 PhiR, DL);
6215 }
6216 }
6217
6218 // Create the reduction result in the middle block using sentinel directly.
6219 RecurKind MinMaxKind =
6220 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
6221 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
6222 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
6223 FastMathFlags());
6224 DebugLoc ExitDL = RdxResult->getDebugLoc();
6225 VPBuilder MiddleBuilder(RdxResult);
6226 VPValue *ReducedIV =
6228 NewFindLastSelect, Flags, ExitDL);
6229
6230 // If IVOfExpressionToSink is an expression to sink, sink it now.
6231 VPValue *VectorRegionExitingVal = ReducedIV;
6232 if (IVOfExpressionToSink)
6233 VectorRegionExitingVal =
6234 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
6235 ReducedIV, IVOfExpressionToSink);
6236
6237 VPValue *NewRdxResult;
6238 VPValue *StartVPV = PhiR->getStartValue();
6239 if (SentinelVal) {
6240 // Sentinel-based approach: reduce IVs with min/max, compare against
6241 // sentinel to detect if condition was ever true, select accordingly.
6242 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
6243 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
6244 Sentinel, ExitDL);
6245 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
6246 StartVPV, ExitDL);
6247 StartVPV = Sentinel;
6248 } else {
6249 // Introduce a boolean AnyOf reduction to track if the condition was ever
6250 // true in the loop. Use it to select the initial start value, if it was
6251 // never true.
6252 auto *AnyOfPhi = new VPReductionPHIRecipe(
6253 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
6254 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
6255 AnyOfPhi->insertAfter(PhiR);
6256
6257 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
6258 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
6259 AnyOfPhi->setOperand(1, OrVal);
6260
6261 NewRdxResult = MiddleBuilder.createAnyOfReduction(
6262 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
6263
6264 // Initialize the IV reduction phi with the neutral element, not the
6265 // original start value, to ensure correct min/max reduction results.
6266 StartVPV = Plan.getOrAddLiveIn(
6267 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
6268 }
6269 RdxResult->replaceAllUsesWith(NewRdxResult);
6270 RdxResult->eraseFromParent();
6271
6272 auto *NewPhiR = new VPReductionPHIRecipe(
6273 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
6274 *NewFindLastSelect, RdxUnordered{1}, {},
6275 PhiR->hasUsesOutsideReductionChain());
6276 NewPhiR->insertBefore(PhiR);
6277 PhiR->replaceAllUsesWith(NewPhiR);
6278 PhiR->eraseFromParent();
6279 }
6280}
6281
6282namespace {
6283
6284using ExtendKind = TTI::PartialReductionExtendKind;
6285struct ReductionExtend {
6286 Type *SrcType = nullptr;
6287 ExtendKind Kind = ExtendKind::PR_None;
6288};
6289
6290/// Describes the extends used to compute the extended reduction operand.
6291/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
6292/// operation.
6293struct ExtendedReductionOperand {
6294 /// The recipe that consumes the extends.
6295 VPWidenRecipe *ExtendsUser = nullptr;
6296 /// Extend descriptions (inputs to getPartialReductionCost).
6297 ReductionExtend ExtendA, ExtendB;
6298};
6299
6300/// A chain of recipes that form a partial reduction. Matches either
6301/// reduction_bin_op (extended op, accumulator), or
6302/// reduction_bin_op (accumulator, extended op).
6303/// The possible forms of the "extended op" are listed in
6304/// matchExtendedReductionOperand.
6305struct VPPartialReductionChain {
6306 /// The top-level binary operation that forms the reduction to a scalar
6307 /// after the loop body.
6308 VPWidenRecipe *ReductionBinOp = nullptr;
6309 /// The user of the extends that is then reduced.
6310 ExtendedReductionOperand ExtendedOp;
6311 /// The recurrence kind for the entire partial reduction chain.
6312 /// This allows distinguishing between Sub and AddWithSub recurrences,
6313 /// when the ReductionBinOp is a Instruction::Sub.
6314 RecurKind RK;
6315 /// The index of the accumulator operand of ReductionBinOp. The extended op
6316 /// is `1 - AccumulatorOpIdx`.
6317 unsigned AccumulatorOpIdx;
6318 unsigned ScaleFactor;
6319};
6320
/// Rewrites the extended operand \p Op of a partial reduction so that the
/// extends sit directly on the inputs, and returns the recipe to use in its
/// place. Three shapes are tried in order:
///  - mul(ext(A), C): operand 1 of \p Op is replaced in place by
///    ext(trunc(C)); \p Op is returned.
///  - abs(sub(ext(A), ext(B))): a new ext(absolute-difference(A, B)) is
///    created with a builder positioned at \p Op and returned. \p Op itself is
///    neither replaced nor erased here.
///  - ext(mul(ext(A), ext(B))): a mul of wider extends is created, all uses of
///    \p Op are redirected to it, and \p Op and the old mul are erased.
/// \returns \p Op unchanged when no shape matches or a bail-out check fails.
6321static VPSingleDefRecipe *
6322optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op) {
6323 // reduce.add(mul(ext(A), C))
6324 // -> reduce.add(mul(ext(A), ext(trunc(C))))
6325 const APInt *Const;
6326 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
6327 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
6328 Instruction::CastOps ExtOpc = ExtA->getOpcode();
6329 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
// NOTE(review): the callee receiving (Const, NarrowTy, extend kind) below is
// not visible in this listing - confirm against the full source.
6330 if (!Op->hasOneUse() ||
6332 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
6333 return Op;
6334
// Narrow the constant to the extend's source type, then re-extend it with
// the same opcode as the LHS so both mul operands are extends.
6335 VPBuilder Builder(Op);
6336 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
6337 Op->getOperand(1), NarrowTy);
6338 Type *WideTy = ExtA->getScalarType();
6339 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
6340 return Op;
6341 }
6342
6343 // reduce.add(abs(sub(ext(A), ext(B))))
6344 // -> reduce.add(ext(absolute-difference(A, B)))
6345 VPValue *X, *Y;
// NOTE(review): the condition opening this branch (which binds X and Y) is
// not visible in this listing - confirm against the full source.
6348 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
6349 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6350 assert(Ext->getOpcode() ==
6351 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
6352 "Expected both the LHS and RHS extends to be the same");
// Signed extends select smax/smin, anything else umax/umin. The absolute
// difference is computed as max - min on the frozen narrow inputs and is
// always zero-extended to Op's scalar type.
6353 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
6354 VPBuilder Builder(Op);
6355 Type *SrcTy = X->getScalarType();
6356 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
6357 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
6358 auto *Max = Builder.insert(
6359 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
6360 {FreezeX, FreezeY}, SrcTy));
6361 auto *Min = Builder.insert(
6362 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
6363 {FreezeX, FreezeY}, SrcTy));
6364 auto *AbsDiff =
6365 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
6366 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
6367 Op->getScalarType());
6368 }
6369
6370 // reduce.add(ext(mul(ext(A), ext(B))))
6371 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
6372 // TODO: Support this optimization for float types.
// NOTE(review): the first line of the condition opening this branch is not
// visible in this listing - confirm against the full source.
6374 m_ZExtOrSExt(m_VPValue()))))) {
6375 auto *Ext = cast<VPWidenCastRecipe>(Op);
6376 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
6377 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6378 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
// Bail out if the mul has other users, if the outer extend's opcode differs
// from the inner one (unless both mul operands are the same recipe), or if
// the two inner extends have different opcodes.
6379 if (!Mul->hasOneUse() ||
6380 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
6381 MulLHS->getOpcode() != MulRHS->getOpcode())
6382 return Op;
6383 VPBuilder Builder(Mul);
6384 auto *NewLHS = Builder.createWidenCast(
6385 MulLHS->getOpcode(), MulLHS->getOperand(0), Ext->getScalarType());
// Reuse the single widened extend when both mul operands are the same
// recipe.
6386 auto *NewRHS = MulLHS == MulRHS
6387 ? NewLHS
6388 : Builder.createWidenCast(MulRHS->getOpcode(),
6389 MulRHS->getOperand(0),
6390 Ext->getScalarType());
6391 auto *NewMul = Mul->cloneWithOperands({NewLHS, NewRHS});
6392 Builder.insert(NewMul);
6393 Op->replaceAllUsesWith(NewMul);
6394 Op->eraseFromParent();
6395 Mul->eraseFromParent();
6396 return NewMul;
6397 }
6398
6399 return Op;
6400}
6401
/// Bundles the partial reduction \p Red with the recipes that compute its
/// vector operand into a newly allocated VPExpressionRecipe. The supported
/// operand shapes are tried in the order written below; if none matches,
/// llvm_unreachable is hit.
/// \returns the new expression recipe. It is not inserted into a block here.
6402static VPExpressionRecipe *
6403createPartialReductionExpression(VPReductionRecipe *Red) {
6404 VPValue *VecOp = Red->getVecOp();
6405
6406 // reduce.[f]add(ext(op))
6407 // -> VPExpressionRecipe(op, red)
6408 if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
6409 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
6410
6411 // reduce.[f]add(neg(ext(op)))
6412 // -> VPExpressionRecipe(op, sub/neg, red)
6413 if (match(VecOp, m_AnyNeg(m_WidenAnyExtend(m_VPValue())))) {
6414 auto *Neg = cast<VPWidenRecipe>(VecOp);
// The extend is taken from the last operand of the negation.
6415 auto *Ext =
6416 cast<VPWidenCastRecipe>(Neg->getOperand(Neg->getNumOperands() - 1));
6417 return new VPExpressionRecipe(Ext, Neg, Red);
6418 }
6419
6420 // reduce.[f]add([f]mul(ext(a), ext(b)))
6421 // -> VPExpressionRecipe(a, b, mul, red)
// NOTE(review): the pattern of the second match (presumably the integer mul
// form) is not visible in this listing - confirm against the full source.
6422 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
6423 match(VecOp,
6425 auto *Mul = cast<VPWidenRecipe>(VecOp);
6426 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6427 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6428 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
6429 }
6430
6431 // reduce.fadd(fneg(fmul(fpext(a), fpext(b))))
6432 // -> VPExpressionRecipe(a, b, fmul, fsub, red)
// NOTE(review): the pattern of this match is not visible in this listing -
// confirm against the full source.
6433 if (match(VecOp,
6435 auto *FNeg = cast<VPWidenRecipe>(VecOp);
6436 auto *FMul = cast<VPWidenRecipe>(FNeg->getOperand(0));
6437 auto *ExtA = cast<VPWidenCastRecipe>(FMul->getOperand(0));
6438 auto *ExtB = cast<VPWidenCastRecipe>(FMul->getOperand(1));
6439 return new VPExpressionRecipe(ExtA, ExtB, FMul, FNeg, Red);
6440 }
6441
6442 // reduce.add(neg(mul(ext(a), ext(b))))
6443 // -> VPExpressionRecipe(a, b, mul, sub, red)
// NOTE(review): the first line of the condition opening this branch is not
// visible in this listing - confirm against the full source.
6445 m_ZExtOrSExt(m_VPValue()))))) {
6446 auto *Sub = cast<VPWidenRecipe>(VecOp);
// The mul is read from operand 1 of the sub.
6447 auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
6448 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6449 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6450 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
6451 }
6452
6453 llvm_unreachable("Unsupported expression");
6454}
6455
6456// Helper to transform a partial reduction chain into a partial reduction
6457// recipe. Assumes profitability has been checked.
//
// \p Chain is the link to transform, \p Plan supplies the constants created
// here, and \p RdxPhi is the reduction phi the link belongs to. A
// VPReductionRecipe with Chain.ScaleFactor as its VF scale factor is inserted
// before the link's bin-op and wrapped in a VPExpressionRecipe; uses of the
// bin-op are redirected, but the bin-op itself is not erased here. The phi
// and its start value are only updated when the link is the last in the chain.
6458static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6459 VPlan &Plan,
6460 VPReductionPHIRecipe *RdxPhi) {
6461 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6462 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6463
6464 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6465 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6466 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6467
6468 // FIXME: Do these transforms before invoking the cost-model.
6469 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp);
6470
6471 // Sub-reductions can be implemented in two ways:
6472 // (1) negate the operand in the vector loop (the default way).
6473 // (2) subtract the reduced value from the init value in the middle block.
6474 // Both ways keep the reduction itself as an 'add' reduction.
6475 //
6476 // The ISD nodes for partial reductions don't support folding the
6477 // sub/negation into its operands because the following is not a valid
6478 // transformation:
6479 // sub(0, mul(ext(a), ext(b)))
6480 // -> mul(ext(a), ext(sub(0, b)))
6481 //
6482 // It's therefore better to choose option (2) such that the partial
6483 // reduction is always positive (starting at '0') and to do a final
6484 // subtract in the middle block.
// The branch below implements option (1): it is taken for a [f]sub bin-op
// whose recurrence kind is not [F]Sub. Recurrences of kind [F]Sub are handled
// at the end of this function via option (2).
6485 if ((WidenRecipe->getOpcode() == Instruction::Sub &&
6486 Chain.RK != RecurKind::Sub) ||
6487 (WidenRecipe->getOpcode() == Instruction::FSub &&
6488 Chain.RK != RecurKind::FSub)) {
6489 VPBuilder Builder(WidenRecipe);
6490 Type *ElemTy = ExtendedOp->getScalarType();
6491 VPWidenRecipe *NegRecipe;
// NOTE(review): the trailing constructor arguments of both negation recipes
// are not visible in this listing - confirm against the full source.
6492 if (WidenRecipe->getOpcode() == Instruction::FSub) {
6493 NegRecipe =
6494 new VPWidenRecipe(Instruction::FNeg, {ExtendedOp}, VPIRFlags(),
6496 } else {
6497 auto *Zero = Plan.getZero(ElemTy);
6498 NegRecipe =
6499 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
6501 }
6502 Builder.insert(NegRecipe);
6503 ExtendedOp = NegRecipe;
6504 }
6505
6506 // Check if WidenRecipe is the final result of the reduction. If so look
6507 // through selects for predicated reductions.
6508 VPValue *Cond = nullptr;
// NOTE(review): the declaration of ExitValue, initialized from the
// findUserOf call below, is not visible in this listing.
6510 findUserOf(WidenRecipe, m_Select(m_VPValue(Cond), m_Specific(WidenRecipe),
6511 m_Specific(RdxPhi))));
6512 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6513 RdxPhi->getBackedgeValue() == ExitValue;
6514 assert((!ExitValue || IsLastInChain) &&
6515 "if we found ExitValue, it must match RdxPhi's backedge value");
6516
6517 Type *PhiType = RdxPhi->getScalarType();
// NOTE(review): the initializer of RdxKind is not visible in this listing.
6518 RecurKind RdxKind =
// Fast-math flags are only carried over for FAdd reductions.
6520 auto *PartialRed = new VPReductionRecipe(
6521 RdxKind,
6522 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlagsOrNone()
6523 : FastMathFlags(),
6524 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6525 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6526 PartialRed->insertBefore(WidenRecipe);
6527
// Cond is only set when the select user was matched above, so ExitValue is
// expected to be non-null here.
6528 if (Cond)
6529 ExitValue->replaceAllUsesWith(PartialRed);
6530 WidenRecipe->replaceAllUsesWith(PartialRed);
6531
6532 // For cost-model purposes, fold this into a VPExpression.
6533 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6534 E->insertBefore(WidenRecipe);
6535 PartialRed->replaceAllUsesWith(E);
6536
6537 // We only need to update the PHI node once, which is when we find the
6538 // last reduction in the chain.
6539 if (!IsLastInChain)
6540 return;
6541
6542 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6543 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6544 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6545
6546 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6547 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
// Operand 2 of the start-vector instruction is overwritten with the scale
// factor as a 32-bit constant.
6548 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6549 StartInst->setOperand(2, NewScaleFactor);
6550
6551 // If this is the last value in a sub-reduction chain, then update the PHI
6552 // node to start at `0` and update the reduction-result to subtract from
6553 // the PHI's start value.
6554 if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
6555 return;
6556
// Remember the original start value and replace operand 0 of the
// start-vector instruction with its operand 1.
6557 VPValue *OldStartValue = StartInst->getOperand(0);
6558 StartInst->setOperand(0, StartInst->getOperand(1));
6559
6560 // Replace reduction_result by 'sub (startval, reductionresult)'.
// NOTE(review): the definition of RdxResult is not visible in this listing.
6562 assert(RdxResult && "Could not find reduction result");
6563
6564 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6565 unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
6566 : Instruction::BinaryOps::Sub;
6567 VPInstruction *NewResult = Builder.createNaryOp(
6568 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6569 RdxPhi->getDebugLoc());
// Redirect every user of RdxResult except the new subtract itself, which
// must keep using the original result.
6570 RdxResult->replaceUsesWithIf(
6571 NewResult,
6572 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6573}
6574
6575/// Returns the cost of a link in a partial-reduction chain for a given VF.
6576static InstructionCost
6577getPartialReductionLinkCost(VPCostContext &CostCtx,
6578 const VPPartialReductionChain &Link,
6579 ElementCount VF) {
6580 Type *RdxType = Link.ReductionBinOp->getScalarType();
6581 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6582 std::optional<unsigned> BinOpc = std::nullopt;
6583 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6584 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6585 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6586
6587 std::optional<llvm::FastMathFlags> Flags;
6588 if (RdxType->isFloatingPointTy())
6589 Flags = Link.ReductionBinOp->getFastMathFlagsOrNone();
6590
6591 auto GetLinkOpcode = [&Link]() -> unsigned {
6592 switch (Link.RK) {
6593 case RecurKind::Sub:
6594 return Instruction::Add;
6595 case RecurKind::FSub:
6596 return Instruction::FAdd;
6597 default:
6598 return Link.ReductionBinOp->getOpcode();
6599 }
6600 };
6601
6602 return CostCtx.TTI.getPartialReductionCost(
6603 GetLinkOpcode(), ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType,
6604 RdxType, VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6605 CostCtx.CostKind, Flags);
6606}
6607
6608static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6610}
6611
6612/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6613/// operand. This is an operand where the source of the value (e.g. a load) has
6614/// been extended (sext, zext, or fpext) before it is used in the reduction.
6615///
6616/// Possible forms matched by this function:
6617/// - UpdateR(PrevValue, ext(...))
6618/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6619/// - UpdateR(PrevValue, mul(ext(...), Constant))
6620/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6621/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6622/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...))))
6623///
6624/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
///
/// \returns the recipe consuming the extends together with a description of
/// each extend, or std::nullopt if \p Op has none of the forms above or one
/// of the use-count, type or extend-kind checks fails.
6625static std::optional<ExtendedReductionOperand>
6626matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
6627 assert(is_contained(UpdateR->operands(), Op) &&
6628 "Op should be operand of UpdateR");
6629
6630 // Try matching an absolute difference operand of the form
6631 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6632 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6633 // difference on a wider type and get the extend for "free" from the partial
6634 // reduction.
6635 VPValue *X, *Y;
// NOTE(review): the pattern match completing this condition (which binds X
// and Y) is not visible in this listing - confirm against the full source.
6636 if (Op->hasOneUse() &&
6640 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6641 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6642 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6643 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6644 Type *LHSInputType = X->getScalarType();
6645 Type *RHSInputType = Y->getScalarType();
// Both extends must have the same source type and the same opcode.
6646 if (LHSInputType != RHSInputType ||
6647 LHSExt->getOpcode() != RHSExt->getOpcode())
6648 return std::nullopt;
6649 // Note: This is essentially the same as matching ext(...) as we will
6650 // rewrite this operand to ext(absolute-difference(A, B)).
6651 return ExtendedReductionOperand{
6652 Sub,
6653 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6654 /*ExtendB=*/{}};
6655 }
6656
6657 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
// NOTE(review): the condition opening the block below (presumably a check
// that Op is itself an extend) is not visible in this listing.
6659 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6660 VPValue *CastSource = CastRecipe->getOperand(0);
6661 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6662 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6663 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6664 // Match: ext(mul(...))
6665 // Record the outer extend kind and set `Op` to the mul. We can then match
6666 // this as a binary operation. Note: We can optimize out the outer extend
6667 // by widening the inner extends to match it. See
6668 // optimizeExtendsForPartialReduction.
6669 Op = CastSource;
6670 } else {
// Plain ext(...): UpdateR itself is recorded as the user of the extend.
6671 return ExtendedReductionOperand{
6672 UpdateR,
6673 /*ExtendA=*/{CastSource->getScalarType(), *OuterExtKind},
6674 /*ExtendB=*/{}};
6675 }
6676 }
6677
6678 if (!Op->hasOneUse())
6679 return std::nullopt;
6680
// NOTE(review): the definition of MulOp is not visible in this listing.
6682 if (!MulOp ||
6683 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6684 return std::nullopt;
6685
6686 // The rest of the matching assumes `Op` is a (possibly extended) mul
6687 // operation.
6688
6689 VPValue *LHS = MulOp->getOperand(0);
6690 VPValue *RHS = MulOp->getOperand(1);
6691
6692 // The LHS of the operation must always be an extend.
// NOTE(review): the condition guarding this return is not visible in this
// listing.
6694 return std::nullopt;
6695
6696 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6697 Type *LHSInputType = LHSCast->getOperand(0)->getScalarType();
6698 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6699
6700 // The RHS of the operation can be an extend or a constant integer.
6701 const APInt *RHSConst = nullptr;
6702 VPWidenCastRecipe *RHSCast = nullptr;
// NOTE(review): the condition selecting the extend case is not visible in
// this listing. In the constant case, the constant must be extendable from
// the LHS source type with the LHS extend kind.
6704 RHSCast = cast<VPWidenCastRecipe>(RHS);
6705 else if (!match(RHS, m_APInt(RHSConst)) ||
6706 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6707 return std::nullopt;
6708
6709 // The outer extend kind must match the inner extends for folding.
6710 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6711 if (Cast && OuterExtKind &&
6712 getPartialReductionExtendKind(Cast) != OuterExtKind)
6713 return std::nullopt;
6714
// For a constant RHS, the RHS is described with the LHS's source type and
// extend kind.
6715 Type *RHSInputType = LHSInputType;
6716 ExtendKind RHSExtendKind = LHSExtendKind;
6717 if (RHSCast) {
6718 RHSInputType = RHSCast->getOperand(0)->getScalarType();
6719 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6720 }
6721
6722 return ExtendedReductionOperand{
6723 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6724}
6725
6726/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6727/// and determines if the target can use a cheaper operation with a wider
6728/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6729/// of operations in the reduction.
///
/// \returns the links in program order, or std::nullopt if no
/// compute-reduction-result is found for \p RedPhiR, if a value on the chain
/// is not a binary VPWidenRecipe, if neither operand of an update is an
/// extended reduction operand, or if the phi's bit width has no known scalar
/// factor relative to the extend source's bit width.
/// NOTE(review): neither \p CostCtx nor \p Range is referenced in the visible
/// body - confirm whether the cost check described above happens here.
6730static std::optional<SmallVector<VPPartialReductionChain>>
6731getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6732 VFRange &Range) {
6733 // Get the backedge value from the reduction PHI and find the
6734 // ComputeReductionResult that uses it (directly or through a select for
6735 // predicated reductions).
6736 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6737 if (!RdxResult)
6738 return std::nullopt;
6739 VPValue *ExitValue = RdxResult->getOperand(0);
// If the exit value is a select, continue from its second operand; the
// result of the match is deliberately ignored.
6740 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6741
// NOTE(review): the declaration of Chain, the container filled below, is not
// visible in this listing.
6743 RecurKind RK = RedPhiR->getRecurrenceKind();
6744 Type *PhiType = RedPhiR->getScalarType();
6745 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6746
6747 // Work backwards from the ExitValue examining each reduction operation.
6748 VPValue *CurrentValue = ExitValue;
6749 while (CurrentValue != RedPhiR) {
6750 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6751 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6752 return std::nullopt;
6753
6754 VPValue *Op = UpdateR->getOperand(1);
6755 VPValue *PrevValue = UpdateR->getOperand(0);
6756
6757 // Find the extended operand. The other operand (PrevValue) is the next link
6758 // in the reduction chain.
// Operand 1 is tried first; if it does not match, operand 0 is tried and
// the roles of Op and PrevValue are swapped.
6759 std::optional<ExtendedReductionOperand> ExtendedOp =
6760 matchExtendedReductionOperand(UpdateR, Op);
6761 if (!ExtendedOp) {
6762 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue);
6763 if (!ExtendedOp)
6764 return std::nullopt;
6765 std::swap(Op, PrevValue);
6766 }
6767
6768 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6769 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6770 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6771 return std::nullopt;
6772
6773 // Check if a partial reduction chain is supported by the target (i.e. does
6774 // not have an invalid cost) for the given VF range. Clamps the range and
6775 // returns true if feasible for any VF.
// NOTE(review): no cost query or range clamping is visible at this point;
// the link is recorded unconditionally, so the comment above may be stale.
// The accumulator index is 0 when PrevValue is operand 0 of UpdateR and 1
// otherwise; the scale factor is the phi size divided by the extend source
// size.
6776 VPPartialReductionChain Link(
6777 {UpdateR, *ExtendedOp, RK,
6778 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
6779 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
6780 Chain.push_back(Link);
6781 CurrentValue = PrevValue;
6782 }
6783
6784 // The chain links were collected by traversing backwards from the exit value.
6785 // Reverse the chains so they are in program order.
6786 std::reverse(Chain.begin(), Chain.end());
6787 return Chain;
6788}
6789} // namespace
6790
6792 VPCostContext &CostCtx,
6793 VFRange &Range) {
6794 // Find all possible valid partial reductions, grouping chains by their PHI.
6795 // This grouping allows invalidating the whole chain, if any link is not a
6796 // valid partial reduction.
6798 ChainsByPhi;
6799 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6800 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6801 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6802 if (!RedPhiR)
6803 continue;
6804
6805 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6806 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6807 }
6808
6809 if (ChainsByPhi.empty())
6810 return;
6811
6812 // Build set of partial reduction operations for extend user validation and
6813 // a map of reduction bin ops to their scale factors for scale validation.
6814 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6815 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6816 for (const auto &[_, Chains] : ChainsByPhi)
6817 for (const VPPartialReductionChain &Chain : Chains) {
6818 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
6819 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6820 }
6821
6822 // A partial reduction is invalid if any of its extends are used by
6823 // something that isn't another partial reduction. This is because the
6824 // extends are intended to be lowered along with the reduction itself.
6825 auto ExtendUsersValid = [&](VPValue *Ext) {
6826 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
6827 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6828 });
6829 };
6830
6831 auto IsProfitablePartialReductionChainForVF =
6832 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
6833 InstructionCost PartialCost = 0, RegularCost = 0;
6834
6835 // The chain is a profitable partial reduction chain if the cost of handling
6836 // the entire chain is cheaper when using partial reductions than when
6837 // handling the entire chain using regular reductions.
6838 for (const VPPartialReductionChain &Link : Chain) {
6839 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6840 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
6841 if (!LinkCost.isValid())
6842 return false;
6843
6844 PartialCost += LinkCost;
6845 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
6846 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6847 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6848 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
6849 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
6850 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
6851 RegularCost += Extend->computeCost(VF, CostCtx);
6852 }
6853 return PartialCost.isValid() && PartialCost < RegularCost;
6854 };
6855
6856 // Validate chains: check that extends are only used by partial reductions,
6857 // and that reduction bin ops are only used by other partial reductions with
6858 // matching scale factors, are outside the loop region or the select
6859 // introduced by tail-folding. Otherwise we would create users of scaled
6860 // reductions where the types of the other operands don't match.
6861 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6862 for (const VPPartialReductionChain &Chain : Chains) {
6863 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
6864 Chains.clear();
6865 break;
6866 }
6867 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6868 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6869 return PhiR == RedPhiR;
6870 auto *R = cast<VPSingleDefRecipe>(U);
6871 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6873 m_Specific(Chain.ReductionBinOp))) ||
6874 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6875 m_Specific(RedPhiR)));
6876 };
6877 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6878 Chains.clear();
6879 break;
6880 }
6881
6882 // Check if the compute-reduction-result is used by a sunk store.
6883 // TODO: Also form partial reductions in those cases.
6884 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6885 if (any_of(RdxResult->users(), [](VPUser *U) {
6886 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6887 return RepR && RepR->getOpcode() == Instruction::Store;
6888 })) {
6889 Chains.clear();
6890 break;
6891 }
6892 }
6893 }
6894
6895 // Clear the chain if it is not profitable.
6897 [&, &Chains = Chains](ElementCount VF) {
6898 return IsProfitablePartialReductionChainForVF(Chains, VF);
6899 },
6900 Range))
6901 Chains.clear();
6902 }
6903
6904 for (auto &[Phi, Chains] : ChainsByPhi)
6905 for (const VPPartialReductionChain &Chain : Chains)
6906 transformToPartialReduction(Chain, Plan, Phi);
6907}
6908
6910 VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
6911 // Collect all loads/stores first. We will start with ones having simpler
6912 // decisions followed by more complex ones that are potentially
6913 // guided/dependent on the simpler ones.
6915 for (VPBasicBlock *VPBB :
6918 for (VPRecipeBase &R : *VPBB) {
6919 auto *VPI = dyn_cast<VPInstruction>(&R);
6920 if (VPI && VPI->getUnderlyingValue() &&
6921 is_contained({Instruction::Load, Instruction::Store},
6922 VPI->getOpcode()))
6923 MemOps.push_back(VPI);
6924 }
6925 }
6926
6927 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6928 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6929
6930 for (VPInstruction *VPI : MemOps) {
6931 auto ReplaceWith = [&](VPRecipeBase *New) {
6932 New->insertBefore(VPI);
6933 if (VPI->getOpcode() == Instruction::Load)
6934 VPI->replaceAllUsesWith(New->getVPSingleValue());
6935 VPI->eraseFromParent();
6936 };
6937
6938 // Note: we must do that for scalar VPlan as well.
6939 if (RecipeBuilder.replaceWithFinalIfReductionStore(VPI,
6940 FinalRedStoresBuilder))
6941 continue;
6942
6943 // Filter out scalar VPlan for the remaining memory operations.
6945 [](ElementCount VF) { return VF.isScalar(); }, Range))
6946 continue;
6947
6948 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
6949 ReplaceWith(Histogram);
6950 continue;
6951 }
6952
6953 VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
6954 if (!Recipe)
6955 Recipe = RecipeBuilder.handleReplication(VPI, Range);
6956
6957 ReplaceWith(Recipe);
6958 }
6959}
6960
6963 [&](ElementCount VF) { return VF.isScalar(); }, Range))
6964 return;
// NOTE(review): the embedded listing numbers skip 6961-6962, 6966, 6968, 6990
// and 6994, so the signature, the traversal setup and the second half of two
// bail-out conditions below are not visible in this chunk.
// The visible code walks each block's recipes in reverse order and replaces
// every VPInstruction that passes all checks with an unmasked, single-scalar
// VPReplicateRecipe built from the same underlying instruction.
6965
6967 Plan.getEntry());
6969 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
6970 auto *VPI = dyn_cast<VPInstruction>(&R);
6971 if (!VPI)
6972 continue;
6973
6974 auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
6975 // Wouldn't be able to create a `VPReplicateRecipe` anyway.
6976 if (!I)
6977 continue;
6978
6979 // If executing other lanes produces side-effects we can't avoid them.
6980 if (VPI->mayHaveSideEffects())
6981 continue;
6982
6983 // We want to drop the mask operand, verify we can safely do that.
6984 if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
6985 continue;
6986
6987 // Avoid rewriting IV increment as that interferes with
6988 // `removeRedundantCanonicalIVs`.
6989 if (VPI->getOpcode() == Instruction::Add &&
6991 continue;
6992
6993 // Other lanes are needed - can't drop them.
6995 continue;
6996
// Build the replacement from VPI's operands without the mask and pass no
// mask to the new recipe; it then takes over VPI's position and users.
6997 auto *Recipe = new VPReplicateRecipe(
6998 I, VPI->operandsWithoutMask(), /*IsSingleScalar=*/true,
6999 /*Mask=*/nullptr, *VPI, *VPI, VPI->getDebugLoc());
7000 Recipe->insertBefore(VPI);
7001 VPI->replaceAllUsesWith(Recipe);
7002 VPI->eraseFromParent();
7003 }
7004 }
7005}
7006
7007/// Returns true if \p Info's parameter kinds are compatible with \p Args.
7008static bool areVFParamsOk(const VFInfo &Info, ArrayRef<VPValue *> Args,
7009 PredicatedScalarEvolution &PSE, const Loop *L) {
7010 ScalarEvolution *SE = PSE.getSE();
7011 return all_of(Info.Shape.Parameters, [&](VFParameter Param) {
7012 switch (Param.ParamKind) {
7013 case VFParamKind::Vector:
7014 case VFParamKind::GlobalPredicate:
7015 return true;
7016 case VFParamKind::OMP_Uniform:
7017 return SE->isSCEVable(Args[Param.ParamPos]->getScalarType()) &&
7018 SE->isLoopInvariant(
7019 vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
7020 L);
7021 case VFParamKind::OMP_Linear:
7022 return match(vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
7023 m_scev_AffineAddRec(
7024 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
7025 m_SpecificLoop(L)));
7026 default:
7027 return false;
7028 }
7029 });
7030}
7031
7032/// Find a vector variant of \p CI for \p VF, respecting \p MaskRequired.
7033/// Returns the variant function, or nullptr. Masked variants are assumed to
7034/// take the mask as a trailing parameter.
7036 ElementCount VF, bool MaskRequired,
7038 const Loop *L) {
7039 if (CI->isNoBuiltin())
7040 return nullptr;
7041 auto Mappings = VFDatabase::getMappings(*CI);
7042 const auto *It = find_if(Mappings, [&](const VFInfo &Info) {
7043 return Info.Shape.VF == VF && (!MaskRequired || Info.isMasked()) &&
7044 areVFParamsOk(Info, Args, PSE, L);
7045 });
7046 if (It == Mappings.end())
7047 return nullptr;
7048 return CI->getModule()->getFunction(It->VectorName);
7049}
7050
7051namespace {
7052/// The outcome of choosing how to widen a call at a given VF.
7053struct CallWideningDecision {
7054 enum class KindTy { Scalarize, Intrinsic, VectorVariant };
7055 CallWideningDecision(KindTy Kind, Function *Variant = nullptr)
7056 : Kind(Kind), Variant(Variant) {}
7057 KindTy Kind;
7058
7059 /// Set when Kind == VectorVariant.
7061
7062 bool operator==(const CallWideningDecision &Other) const {
7063 return Kind == Other.Kind && Variant == Other.Variant;
7064 }
7065};
7066} // namespace
7067
7068/// Pick the cheapest widening for the call \p VPI at \p VF among scalarization,
7069/// vector intrinsic, and vector library variant.
/// The checks run in a fixed order: forced/known scalarization first, then the
/// vector intrinsic, then a vector library variant, with scalarization as the
/// final fallback.
/// NOTE(review): the embedded listing numbers skip 7071, 7081, 7083, 7087,
/// 7096 and 7103-7104, so the Ops parameter and the definitions of ID,
/// VecCallCost and IntrinsicCost are not visible in this chunk.
7070static CallWideningDecision decideCallWidening(VPInstruction &VPI,
7072 ElementCount VF,
7073 VPCostContext &CostCtx) {
7074 auto *CI = cast<CallInst>(VPI.getUnderlyingInstr());
7075
7076 // Scalar VFs and calls forced or known to scalarize always replicate.
7077 if (VF.isScalar() || CostCtx.willBeScalarized(CI, VF))
7078 return CallWideningDecision::KindTy::Scalarize;
7079
7080 auto *CalledFn = cast<Function>(
7082 Type *ResultTy = VPI.getScalarType();
7084 bool MaskRequired = CostCtx.isMaskRequired(CI);
7085
7086 // Pseudo intrinsics (assume, lifetime, ...) are always scalarized.
7088 return CallWideningDecision::KindTy::Scalarize;
7089
// Cost of replicating the call; passed IsSingleScalar=false.
7090 InstructionCost ScalarCost =
7091 VPReplicateRecipe::computeCallCost(CalledFn, ResultTy, Ops,
7092 /*IsSingleScalar=*/false, VF, CostCtx);
7093
// VecCallCost is only recomputed when a variant was found.
7094 Function *VecFunc =
7095 findVectorVariant(CI, Ops, VF, MaskRequired, CostCtx.PSE, CostCtx.L);
7097 if (VecFunc)
7098 VecCallCost = VPWidenCallRecipe::computeCallCost(VecFunc, CostCtx);
7099
7100 // Prefer the intrinsic if it is at least as cheap as scalarizing and any
7101 // available vector variant.
7102 if (ID) {
// An invalid IntrinsicCost never selects the intrinsic. Both comparisons use
// `>=`, so a cost tie goes to the intrinsic.
7105 if (IntrinsicCost.isValid() && ScalarCost >= IntrinsicCost &&
7106 (!VecFunc || VecCallCost >= IntrinsicCost))
7107 return CallWideningDecision::KindTy::Intrinsic;
7108 }
7109
7110 // Otherwise, use a vector library variant when it beats scalarizing.
// The comparison is `>=`: a variant that merely ties with the scalar cost is
// still chosen.
7111 if (VecFunc && ScalarCost >= VecCallCost)
7112 return {CallWideningDecision::KindTy::VectorVariant, VecFunc};
7113
7114 return CallWideningDecision::KindTy::Scalarize;
7115}
7116
7118 VPRecipeBuilder &RecipeBuilder,
7119 VPCostContext &CostCtx) {
// Replaces every call VPInstruction that has an underlying value with the
// recipe selected by decideCallWidening: a widened intrinsic, a widened call
// to a vector variant, or a replicating recipe.
// NOTE(review): the embedded listing numbers skip 7117, 7120-7121, 7134 and
// 7143, so the signature head, the outer block loop, the call that receives
// the lambda + Range below, and the definition of ID are not visible here.
7122 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7123 auto *VPI = dyn_cast<VPInstruction>(&R);
7124 if (!VPI || !VPI->getUnderlyingValue() ||
7125 VPI->getOpcode() != Instruction::Call)
7126 continue;
7127
// Ops holds the first CI->arg_size() operands of VPI, i.e. one VPValue per
// argument of the underlying call.
7128 auto *CI = cast<CallInst>(VPI->getUnderlyingInstr());
7129 SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
7130 VPI->op_begin() + CI->arg_size());
7131
// The decision is taken at Range.Start; the lambda reports, per VF, whether
// that VF yields the same decision.
7132 CallWideningDecision Decision =
7133 decideCallWidening(*VPI, Ops, Range.Start, CostCtx);
7135 [&](ElementCount VF) {
7136 return Decision == decideCallWidening(*VPI, Ops, VF, CostCtx);
7137 },
7138 Range);
7139
7140 VPSingleDefRecipe *Replacement = nullptr;
7141 switch (Decision.Kind) {
7142 case CallWideningDecision::KindTy::Intrinsic: {
7144 Type *ResultTy = VPI->getScalarType();
7145 Replacement = new VPWidenIntrinsicRecipe(*CI, ID, Ops, ResultTy, *VPI,
7146 *VPI, VPI->getDebugLoc());
7147 break;
7148 }
7149 case CallWideningDecision::KindTy::VectorVariant: {
7150 // Masked variants take the mask as a trailing parameter, so they have
7151 // one more parameter than the original call's arguments.
7152 if (Decision.Variant->arg_size() > Ops.size()) {
// An unmasked VPI calling a masked variant gets an all-true mask.
7153 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
7154 Ops.push_back(Mask);
7155 }
// Append VPI's last non-mask operand after the arguments (and mask, if any).
// NOTE(review): presumably this operand is the called function - confirm.
7156 Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
7157 Replacement = new VPWidenCallRecipe(CI, Decision.Variant, Ops, *VPI,
7158 *VPI, VPI->getDebugLoc());
7159 break;
7160 }
7161 case CallWideningDecision::KindTy::Scalarize:
7162 Replacement = RecipeBuilder.handleReplication(VPI, Range);
7163 break;
7164 }
7165
// The replacement takes over VPI's position and all of its users.
7166 Replacement->insertBefore(VPI);
7167 VPI->replaceAllUsesWith(Replacement);
7168 VPI->eraseFromParent();
7169 }
7170 }
7171}
7172
7175 Loop &L, VPCostContext &Ctx,
7176 VFRange &Range) {
// Rewrites non-consecutive widened loads whose address is an affine AddRec of
// L into experimental_vp_strided_load intrinsics, when the target reports the
// strided form as legal and strictly cheaper than the current recipe.
// NOTE(review): the embedded listing numbers skip 7173-7174, 7182, 7205,
// 7217, 7223, 7238 and 7260, so the signature head, the outer loop header and
// parts of several calls below are not visible in this chunk.
7177 if (Plan.hasScalarVFOnly())
7178 return;
7179
7180 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
// Created lazily on the first converted load and then shared by all others.
7181 VPValue *I32VF = nullptr;
7183 vp_depth_first_shallow(VectorLoop->getEntry()))) {
7184 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7185 auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
7186 // TODO: Support strided store.
7187 // TODO: Transform reverse access into strided access with -1 stride.
7188 // TODO: Transform gather/scatter with uniform address into strided access
7189 // with 0 stride.
7190 // TODO: Transform interleave access into multiple strided accesses.
7191 if (!LoadR || LoadR->isConsecutive())
7192 continue;
7193
// Only addresses produced by a VPWidenGEPRecipe are considered.
7194 auto *Ptr = dyn_cast<VPWidenGEPRecipe>(LoadR->getAddr());
7195 if (!Ptr)
7196 continue;
7197
7198 // Check if this is a strided access by analyzing the address SCEV for an
7199 // affine addRec.
7200 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, &L);
7201 const SCEV *Start;
7202 const SCEVConstant *Step;
7203 // TODO: Support non-constant loop invariant stride.
7204 if (!match(PtrSCEV,
7206 m_SpecificLoop(&L))))
7207 continue;
7208
7209 Type *LoadTy = LoadR->getScalarType();
7210 Align Alignment = LoadR->getAlign();
// Per-VF predicate: the strided form must be legal for the vector type and
// strictly cheaper than the existing recipe's cost.
7211 auto IsProfitable = [&](ElementCount VF) {
7212 Type *DataTy = toVectorTy(LoadTy, VF);
7213 if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
7214 return false;
7215 const InstructionCost CurrentCost = LoadR->computeCost(VF, Ctx);
7216 const InstructionCost StridedLoadStoreCost =
7218 Intrinsic::experimental_vp_strided_load, DataTy,
7219 LoadR->isMasked(), Alignment, Ctx);
7220 return StridedLoadStoreCost < CurrentCost;
7221 };
7222
7224 Range))
7225 continue;
7226
7227 // Invalidate the legacy widening decision so the cost of replaced load is
7228 // not counted during precomputeCosts.
7229 // TODO: Remove once the legacy exit cost computation is retired.
7230 for (ElementCount VF : Range)
7231 Ctx.invalidateWideningDecision(&LoadR->getIngredient(), VF);
7232
7233 // Get VF as i32 for the vector length operand.
7234 if (!I32VF) {
// Emitted in the vector preheader rather than next to the load.
7235 VPBuilder Builder(Plan.getVectorPreheader());
7236 I32VF = Builder.createScalarZExtOrTrunc(
7237 &Plan.getVF(), Type::getInt32Ty(Plan.getContext()),
7239 }
7240
// All new recipes below are built through a builder constructed from LoadR.
7241 VPBuilder Builder(LoadR);
7242 // Create the base pointer of strided access.
7243 // TODO: reuse VPDerivedIVRecipe for base pointer computation when it
7244 // supports a general VPValue as the start value.
7245 VPValue *StartVPV = vputils::getOrCreateVPValueForSCEVExpr(Plan, Start);
7246 VPValue *StrideInBytes = Plan.getOrAddLiveIn(Step->getValue());
7247 Type *IndexTy = Plan.getDataLayout().getIndexType(Ptr->getScalarType());
7248 assert(IndexTy == StrideInBytes->getScalarType() &&
7249 "Stride type from SCEV must match the index type");
// Offset = canonical IV (converted to the index type) * stride; the multiply
// carries the AddRec's NUW/NSW flags.
7250 VPValue *CanIV = Builder.createScalarSExtOrTrunc(
7251 VectorLoop->getCanonicalIV(), IndexTy,
7252 VectorLoop->getCanonicalIVType(), DebugLoc::getUnknown());
7253 auto *AddRecPtr = cast<SCEVAddRecExpr>(PtrSCEV);
7254 auto *Offset = Builder.createOverflowingOp(
7255 Instruction::Mul, {CanIV, StrideInBytes},
7256 {AddRecPtr->hasNoUnsignedWrap(), AddRecPtr->hasNoSignedWrap()});
7257 auto *BasePtr = Builder.createNoWrapPtrAdd(
7258 StartVPV, Offset,
7259 AddRecPtr->hasNoUnsignedWrap() ? GEPNoWrapFlags::noUnsignedWrap()
7261
7262 // Create a new vector pointer for strided access.
7263 VPValue *NewPtr = Builder.createVectorPointer(
7264 BasePtr, Type::getInt8Ty(Plan.getContext()), StrideInBytes,
7265 Ptr->getGEPNoWrapFlags(), Ptr->getDebugLoc());
7266
// An unmasked load is given an all-true mask for the intrinsic's mask operand.
7267 VPValue *Mask = LoadR->getMask();
7268 if (!Mask)
7269 Mask = Plan.getTrue();
7270 auto *StridedLoad = Builder.createWidenMemIntrinsic(
7271 Intrinsic::experimental_vp_strided_load,
7272 {NewPtr, StrideInBytes, Mask, I32VF}, LoadTy, Alignment, *LoadR,
7273 LoadR->getDebugLoc());
// Only the users are redirected; LoadR itself is not erased here.
7274 LoadR->replaceAllUsesWith(StridedLoad);
7275 }
7276 }
7277}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function's Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static bool dominates(InstrPosIndexes &PosIndexes, const MachineInstr &A, const MachineInstr &B)
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static CallWideningDecision decideCallWidening(VPInstruction &VPI, ArrayRef< VPValue * > Ops, ElementCount VF, VPCostContext &CostCtx)
Pick the cheapest widening for the call VPI at VF among scalarization, vector intrinsic,...
static bool areVFParamsOk(const VFInfo &Info, ArrayRef< VPValue * > Args, PredicatedScalarEvolution &PSE, const Loop *L)
Returns true if Info's parameter kinds are compatible with Args.
static bool simplifyLogicalRecipe(VPSingleDefRecipe *Def, VPBuilder &Builder, bool CanCreateNewRecipe)
Try to simplify logical and bitwise recipes in Def.
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static Type * getLoadStoreValueType(VPReplicateRecipe *R, bool IsLoad)
Get the value type of the replicate load or store.
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static Function * findVectorVariant(CallInst *CI, ArrayRef< VPValue * > Args, ElementCount VF, bool MaskRequired, PredicatedScalarEvolution &PSE, const Loop *L)
Find a vector variant of CI for VF, respecting MaskRequired.
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with an EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool handleUncountableExitsWithSideEffects(VPlan &Plan, SmallVectorImpl< EarlyExitInfo > &Exits, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC, VPDominatorTree &VPDT)
Update Plan to mask memory operations in the loop based on whether the early exit is taken or not.
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void simplifyRecipe(VPSingleDefRecipe *Def)
Try to simplify VPSingleDefRecipe Def.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV and their load-store type,...
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static std::optional< Intrinsic::ID > getVPDivRemIntrinsic(Intrinsic::ID IntrID)
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static VPValue * narrowInterleaveGroupOp(ArrayRef< VPValue * > Members, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL)
Try to fold R using InstSimplifyFolder.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static void expandVPDerivedIV(VPDerivedIVRecipe *R)
Expand a VPDerivedIVRecipe into executable recipes.
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static std::optional< Instruction::BinaryOps > getUnmaskedDivRemOpcode(Intrinsic::ID ID)
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Return true if we do not know how to (mechanically) hoist or sink a non-memory or memory recipe R out...
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L)
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
Get the last element.
Definition ArrayRef.h:150
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:194
const T & front() const
Get the first element.
Definition ArrayRef.h:144
iterator end() const
Definition ArrayRef.h:130
iterator begin() const
Definition ArrayRef.h:129
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:152
static DebugLoc getUnknown()
Definition DebugLoc.h:151
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:252
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:262
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
size_t arg_size() const
Definition Function.h:901
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags noUnsignedWrap()
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
The group of interleaved loads/stores sharing the same stride and close to each other.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1666
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1069
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
bool empty() const
Definition MapVector.h:79
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describe the aliasing of the location (each member is null if that kind of ...
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class represents a constant integer value.
ConstantInt * getValue() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
@ SK_Broadcast
Broadcast element 0 to all other elements.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:4044
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4399
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4474
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4426
iterator end()
Definition VPlan.h:4436
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4434
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4487
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:266
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:560
const VPRecipeBase & front() const
Definition VPlan.h:4446
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639
const VPRecipeBase & back() const
Definition VPlan.h:4448
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2956
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:3006
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2996
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:3012
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2992
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:94
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:315
VPRegionBlock * getParent()
Definition VPlan.h:186
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:236
size_t getNumSuccessors() const
Definition VPlan.h:237
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:306
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:222
VPlan * getPlan()
Definition VPlan.cpp:211
const std::string & getName() const
Definition VPlan.h:177
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:325
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:233
void clearPredecessors()
Remove all the predecessors of this block.
Definition VPlan.h:322
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:279
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:227
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:211
static auto blocksAs(T &&Range)
Return an iterator range over Range with each block cast to BlockTy.
Definition VPlanUtils.h:343
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:362
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:252
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:270
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:288
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:324
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:308
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3493
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createFirstActiveLane(ArrayRef< VPValue * > Masks, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAdd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false})
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPInstruction * createLogicalOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1653
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={}, Type *ResultTy=nullptr)
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:4076
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:561
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:534
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:546
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:556
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4177
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3538
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2430
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2477
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2466
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2157
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4552
Class to record and manage LLVM IR flags.
Definition VPlan.h:695
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlagsOrNone() const
void dropPoisonGeneratingFlags()
Drop all poison-generating flags.
Definition VPlan.h:892
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1171
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1226
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1473
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1319
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1315
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1261
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1272
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3108
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3100
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3129
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3181
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:3139
void addIncoming(VPValue *IncomingV)
Append IncomingV as an incoming value to the phi-like recipe.
Definition VPlan.h:1659
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3702
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:402
VPBasicBlock * getParent()
Definition VPlan.h:477
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should be widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicationRecipe for VPI.
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:337
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3352
A recipe for handling reduction phis.
Definition VPlan.h:2858
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2909
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2902
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2920
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3232
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4609
const VPBlockBase * getEntry() const
Definition VPlan.h:4653
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4685
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:857
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4670
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4729
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4737
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4721
const VPBlockBase * getExiting() const
Definition VPlan.h:4665
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4678
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3397
bool isSingleScalar() const
Definition VPlan.h:3453
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
operand_range operandsWithoutMask()
Return the recipe's operands, excluding the mask of a predicated recipe.
Definition VPlan.h:3478
bool isPredicated() const
Definition VPlan.h:3455
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3472
Lightweight SCEV-to-VPlan expander.
Definition VPlanUtils.h:190
VPValue * tryToExpand(const SCEV *S)
Try to expand S into recipes and live-ins using the builder.
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4244
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:609
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:680
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384
operand_range operands()
Definition VPlanValue.h:457
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:430
unsigned getNumOperands() const
Definition VPlanValue.h:424
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:425
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1478
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:163
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:208
VPUser * getSingleUser()
Return the single user of this value, or nullptr if there is not exactly one user.
Definition VPlanValue.h:178
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1481
unsigned getNumUsers() const
Definition VPlanValue.h:115
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1487
user_range users()
Definition VPlanValue.h:157
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2260
A recipe for widening Call instructions using library calls.
Definition VPlan.h:2091
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1872
Instruction::CastOps getOpcode() const
Definition VPlan.h:1908
A recipe for handling GEP instructions.
Definition VPlan.h:2200
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2510
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2558
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2576
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2561
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2581
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2617
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2664
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2668
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2679
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2690
A recipe for widening vector intrinsics.
Definition VPlan.h:1919
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
A common mixin class for widening memory operations.
Definition VPlan.h:3738
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
A recipe for widened phis.
Definition VPlan.h:2748
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1811
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1832
unsigned getOpcode() const
Definition VPlan.h:1851
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4757
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:5082
bool hasVF(ElementCount VF) const
Definition VPlan.h:4980
const DataLayout & getDataLayout() const
Definition VPlan.h:4962
LLVMContext & getContext() const
Definition VPlan.h:4958
VPBasicBlock * getEntry()
Definition VPlan.h:4853
bool hasScalableVF() const
Definition VPlan.h:4981
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4916
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4937
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4987
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:5053
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4956
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:5059
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:5131
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:5085
bool hasUF(unsigned UF) const
Definition VPlan.h:5005
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4906
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4946
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4943
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:5030
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:5056
void setVF(ElementCount VF)
Definition VPlan.h:4968
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:5021
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1068
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:5008
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4930
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4882
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:5108
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:5050
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4858
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4953
bool hasScalarVFOnly() const
Definition VPlan.h:4998
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4896
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4949
void setUF(unsigned UF)
Definition VPlan.h:5013
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:5163
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1224
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:5064
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2815
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:190
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignores it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
canonical_widen_iv_match m_CanonicalWidenIV()
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
auto m_AnyNeg(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPValue * findIncomingAliasMask(const VPlan &Plan)
Finds the incoming alias-mask within the vector preheader.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:140
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) Note: If ...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
LLVM_ABI_FOR_TEST std::optional< VPValue * > getRecipesForUncountableExit(SmallVectorImpl< VPInstruction * > &Recipes, SmallVectorImpl< VPInstruction * > &GEPs, VPBasicBlock *LatchVPBB)
Returns the VPValue representing the uncountable exit comparison used by AnyOf if the recipes it depe...
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
SmallVector< VPBasicBlock * > vp_rpo_plain_cfg_loop_body(VPBasicBlock *Header)
Returns the VPBasicBlocks forming the loop body of a plain (pre-region) VPlan in reverse post-order s...
Definition VPlanCFG.h:265
@ Offset
Definition DWP.cpp:558
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179
void stable_sort(R &&Range)
Definition STLExtras.h:2115
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2077
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
constexpr auto bind_back(FnT &&Fn, BindArgsT &&...BindArgs)
C++23 bind_back.
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:288
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
constexpr size_t range_size(R &&Range)
Returns the size of the Range, i.e., the number of elements.
Definition STLExtras.h:1693
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:79
@ MaskedHandleExitInScalarLoop
All memory operations other than the load(s) required to determine whether an uncountable exit occurr...
Definition VPlan.h:89
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:551
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1850
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2087
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1408
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2165
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:305
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2145
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT, AssumptionCache *AC=nullptr, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Return true if we can prove that the given load (which is assumed to be within the specified loop) wo...
Definition Loads.cpp:300
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:285
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
VPBasicBlock * EarlyExitingVPBB
VPIRBasicBlock * EarlyExitVPBB
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2840
Holds the VFShape for a specific scalar to vector function mapping.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1946
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
PredicatedScalarEvolution & PSE
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
TargetTransformInfo::TargetCostKind CostKind
const TargetLibraryInfo & TLI
const TargetTransformInfo & TTI
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:246
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:147
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:286
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:297
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3852
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3802
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3955
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3901
static VPValue * materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheckVPBB, ArrayRef< PointerDiffInfo > DiffChecks)
Materializes the alias mask within the AliasCheckVPBB block.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void expandSCEVsToVPInstructions(VPlan &Plan, ScalarEvolution &SE)
Try to expand VPExpandSCEVRecipes in Plan's entry block to VPInstructions.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static void materializeAliasMaskCheckBlock(VPlan &Plan, ArrayRef< PointerDiffInfo > DiffChecks, bool HasBranchWeights)
Materializes the alias mask within a check block before the loop.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand remaining VPExpandSCEVRecipes in Plan's entry block using SCEVExpander.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void attachAliasMaskToHeaderMask(VPlan &Plan)
Attaches the alias-mask to the existing header-mask.
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decision prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses of the canonical ...
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void convertToStridedAccesses(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L, VPCostContext &Ctx, VFRange &Range)
Transform widen memory recipes into strided access recipes when legal and profitable.
static bool handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void attachVPCheckBlock(VPlan &Plan, VPValue *Cond, VPBasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...