LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/TypeSwitch.h"
32#include "llvm/Analysis/Loads.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
 // NOTE(review): this listing is incomplete here: the first signature line,
 // the loop(s) that bind VPBB, and the definitions of Inst and GEP are not
 // shown. The comments below describe only what the visible lines do.
 //
 // Visible behavior: every non-terminator recipe that has an underlying value
 // is replaced by a freshly created recipe (widen phi/load/store/GEP/
 // intrinsic/cast/generic widen, or a replicate recipe for a few intrinsics).
 // Returns false on the first call that has no vector intrinsic ID or that is
 // llvm.experimental.noalias.scope.decl; recipes converted before that point
 // are not rolled back. Returns true otherwise.
52
54 Plan.getVectorLoopRegion());
56 // Blocks without a parent are outside the region. Note that this is a
 // `break`, not a `continue`: the enclosing loop is left at the first such
 // block.
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
 // The terminator (if any) is excluded from the range processed below.
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
 // early_inc_range: Ingredient is erased at the end of each iteration.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
 // Recipes without an underlying value are left untouched.
66 if (!VPV->getUnderlyingValue())
67 continue;
68
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
75 Phi->getName());
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
 // Loads take operand 0 as address; stores take operand 1 as address and
 // operand 0 as the stored value. Both are created unmasked and marked
 // non-consecutive.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
83 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
84 NewRecipe = new VPWidenStoreRecipe(
85 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
86 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
87 Ingredient.getDebugLoc());
 // NOTE(review): the `else if` that binds GEP is not shown in this listing.
89 NewRecipe = new VPWidenGEPRecipe(GEP->getSourceElementType(),
90 Ingredient.operands(), *VPI,
91 Ingredient.getDebugLoc(), GEP);
92 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
93 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
 // Calls that do not map to a vector intrinsic make the conversion fail.
94 if (VectorID == Intrinsic::not_intrinsic)
95 return false;
96
97 // The noalias.scope.decl intrinsic declares a noalias scope that
98 // is valid for a single iteration. Emitting it as a single-scalar
99 // replicate would incorrectly extend the scope across multiple
100 // original iterations packed into one vector iteration.
101 // FIXME: If we want to vectorize this loop, then we have to drop
102 // all the associated !alias.scope and !noalias.
103 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
104 return false;
105
106 // These intrinsics are recognized by getVectorIntrinsicIDForCall
107 // but are not widenable. Emit them as replicate instead of widening.
108 if (VectorID == Intrinsic::assume ||
109 VectorID == Intrinsic::lifetime_end ||
110 VectorID == Intrinsic::lifetime_start ||
111 VectorID == Intrinsic::sideeffect ||
112 VectorID == Intrinsic::pseudoprobe) {
113 // If the operand of llvm.assume holds before vectorization, it will
114 // also hold per lane.
115 // llvm.pseudoprobe requires to be duplicated per lane for accurate
116 // sample count.
 // Hence only lifetime_start/lifetime_end/sideeffect become
 // single-scalar replicates; assume and pseudoprobe are replicated.
117 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
118 VectorID != Intrinsic::pseudoprobe;
119 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
120 /*IsSingleScalar=*/IsSingleScalar,
121 /*Mask=*/nullptr, *VPI, *VPI,
122 Ingredient.getDebugLoc());
123 } else {
 // drop_end: the last operand is not forwarded to the intrinsic recipe
 // (presumably it is the called function operand -- TODO confirm).
124 NewRecipe = new VPWidenIntrinsicRecipe(
125 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
126 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
127 }
128 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
129 NewRecipe = new VPWidenCastRecipe(
130 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
131 VPIRFlags(*CI), VPIRMetadata(*CI));
132 } else {
 // Everything else is widened generically.
133 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
134 *VPI, Ingredient.getDebugLoc());
135 }
136 } else {
 // NOTE(review): the start of the assertion whose message follows is not
 // shown in this listing. Such recipes are kept as they are.
138 "inductions must be created earlier");
139 continue;
140 }
141
 // Place the replacement in front of the original, redirect the users of the
 // original's value (if the replacement defines one), then delete the
 // original.
142 NewRecipe->insertBefore(&Ingredient);
143 if (NewRecipe->getNumDefinedValues() == 1)
144 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
145 else
146 assert(NewRecipe->getNumDefinedValues() == 0 &&
147 "Only recpies with zero or one defined values expected");
148 Ingredient.eraseFromParent();
149 }
150 }
151 return true;
152}
153
154/// Helper for extra no-alias checks via known-safe recipe and SCEV.
/// NOTE(review): the class header, the declaration of the PSE member and the
/// leading constructor parameters are not shown in this listing.
156 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
157 VPReplicateRecipe &GroupLeader;
159 const Loop &L;
160
161 // Return true if \p A and \p B are known to not alias for all VFs in the
162 // plan, checked via the distance between the accesses
 // Only pairs of stores are handled; anything else yields false.
163 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
164 if (A->getOpcode() != Instruction::Store ||
165 B->getOpcode() != Instruction::Store)
166 return false;
167
 // For these store recipes operand 1 is used as the address and operand 0
 // provides the type whose store size is measured below.
168 VPValue *AddrA = A->getOperand(1);
169 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
170 VPValue *AddrB = B->getOperand(1);
171 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
 // NOTE(review): the condition guarding the following return is not shown in
 // this listing.
173 return false;
174
 // The difference of the two address expressions must fold to a constant.
175 const APInt *Distance;
176 ScalarEvolution &SE = *PSE.getSE();
177 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
178 return false;
179
180 const DataLayout &DL = SE.getDataLayout();
181 Type *TyA = A->getOperand(0)->getScalarType();
182 uint64_t SizeA = DL.getTypeStoreSize(TyA);
183 Type *TyB = B->getOperand(0)->getScalarType();
184 uint64_t SizeB = DL.getTypeStoreSize(TyB);
185
186 // Use the maximum store size to ensure no overlap from either direction.
187 // Currently only handles fixed sizes, as it is only used for
188 // replicating VPReplicateRecipes.
189 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
190
191 auto VFs = B->getParent()->getPlan()->vectorFactors();
 // NOTE(review): the definition of MaxVF is not shown in this listing;
 // presumably it is the largest element of VFs -- TODO confirm.
193 if (MaxVF.isScalable())
194 return false;
 // No alias if |Distance| >= MaxVF * MaxStoreSize.
195 return Distance->abs().uge(
196 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
197 }
198
199public:
 // The constructor only stores its arguments in the members above.
202 const Loop &L)
203 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
204 L(L) {}
205
206 /// Return true if \p R should be skipped during alias checking, either
207 /// because it's in the exclude set or because no-alias can be proven via
208 /// SCEV.
 /// The SCEV check compares \p R (if it is a VPReplicateRecipe) against the
 /// group leader.
209 bool shouldSkip(VPRecipeBase &R) const {
210 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
211 return ExcludeRecipes.contains(&R) ||
212 (Store && isNoAliasViaDistance(Store, &GroupLeader));
213 }
214};
215
216/// Check if a memory operation doesn't alias with memory operations using
217/// scoped noalias metadata, in blocks in the single-successor chain between \p
218/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
219/// write to memory are checked (for load hoisting). Otherwise recipes that both
220/// read and write memory are checked, and SCEV is used to prove no-alias
221/// between the group leader and other replicate recipes (for store sinking).
/// Returns true only if no checked recipe was found that may alias; any
/// uncertainty yields false.
/// NOTE(review): the line carrying the function name and the MemLoc parameter,
/// the block-range expression of the outer loop, the definition of Loc and the
/// final alias test are not shown in this listing.
222static bool
224 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
225 std::optional<SinkStoreInfo> SinkInfo = {}) {
 // Reads are only checked when sink information was supplied.
226 bool CheckReads = SinkInfo.has_value();
 // Without scope metadata on MemLoc nothing can be proven here.
227 if (!MemLoc.AATags.Scope)
228 return false;
229
230 for (VPBasicBlock *VPBB :
232 for (VPRecipeBase &R : *VPBB) {
 // Excluded recipes and stores proven disjoint by SinkInfo are ignored.
233 if (SinkInfo && SinkInfo->shouldSkip(R))
234 continue;
235
236 // Skip recipes that don't need checking.
 // I.e. recipes that neither write memory nor (when reads are checked)
 // read memory.
237 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
238 continue;
239
241 if (!Loc)
242 // Conservatively assume aliasing for memory operations without
243 // location.
244 return false;
245
 // NOTE(review): the condition guarding the following return is not shown in
 // this listing.
247 return false;
248 }
249 }
250 return true;
251}
252
253/// Get the value type of the replicate load or store. \p IsLoad indicates
254/// whether it is a load.
/// For a load the scalar type of the recipe \p R itself is returned; for a
/// store the scalar type of operand 0 of \p R.
/// NOTE(review): the signature line is not shown in this listing.
256 return (IsLoad ? R : R->getOperand(0))->getScalarType();
257}
258
259/// Collect either replicated Loads or Stores grouped by their address SCEV and
260/// their load-store type, in a deep-traversal of the vector loop region in \p
261/// Plan.
/// Only replicate recipes with opcode \p Opcode that are accepted by
/// \p FilterFn and whose address SCEV could be computed are collected. Each
/// returned group is ordered by dominance.
/// NOTE(review): the return type/name line, the declaration of the map type,
/// the block traversal loop header and the head of the sort call are not
/// shown in this listing.
262template <unsigned Opcode>
265 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
266 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
267 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
268 "Only Load and Store opcodes supported");
269 constexpr bool IsLoad = (Opcode == Instruction::Load);
272 RecipesByAddressAndType;
275 for (VPRecipeBase &R : *VPBB) {
276 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
277 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
278 continue;
279
280 // For loads, operand 0 is address; for stores, operand 1 is address.
281 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
282 const Type *LoadStoreTy = getLoadStoreValueType(RepR, IsLoad);
283 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
 // Recipes whose address has no computable SCEV are dropped silently.
284 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
285 RecipesByAddressAndType[{AddrSCEV, LoadStoreTy}].push_back(RepR);
286 }
287 }
 // Copy the groups out of the map before ordering each of them.
288 auto Groups = to_vector(RecipesByAddressAndType.values());
289 VPDominatorTree VPDT(Plan);
290 for (auto &Group : Groups) {
291 // Sort mem ops by dominance order, with earliest (most dominating) first.
293 return VPDT.properlyDominates(A, B);
294 });
295 }
296 return Groups;
297}
298
 // Sink recipes that define operands of recipes in the 'then' block of
 // replicate regions into that block, processing a worklist of
 // (target block, candidate) pairs. Returns true if any recipe was moved.
 // NOTE(review): the declaration of WorkList, the recipe-kind filter inside
 // the lambda, the header of the seeding loop (which binds VPR) and the head of
 // the clone construction are not shown in this listing.
299static bool sinkScalarOperands(VPlan &Plan) {
300 auto Iter = vp_depth_first_deep(Plan.getEntry());
301 bool ScalarVFOnly = Plan.hasScalarVFOnly();
302 bool Changed = false;
303
 // Queue the recipe defining Op for sinking into SinkTo, unless Op has no
 // defining single-def recipe, the recipe already lives in SinkTo, it cannot
 // be sunk, or it is a single-scalar replicate while vector VFs exist.
305 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
306 VPBasicBlock *SinkTo, VPValue *Op) {
307 auto *Candidate =
308 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
309 if (!Candidate)
310 return;
311
312 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
313 // for now.
315 return;
316
317 if (Candidate->getParent() == SinkTo ||
318 vputils::cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
319 return;
320
321 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
322 if (!ScalarVFOnly && RepR->isSingleScalar())
323 return;
324
325 WorkList.insert({SinkTo, Candidate});
326 };
327
328 // First, collect the operands of all recipes in replicate blocks as seeds for
329 // sinking.
331 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
 // Only replicate regions whose entry has two successors, the first of which
 // leads directly to the region's exiting block, are considered.
332 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
333 continue;
334 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
335 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
336 continue;
337 for (auto &Recipe : *VPBB)
338 for (VPValue *Op : Recipe.operands())
339 InsertIfValidSinkCandidate(VPBB, Op);
340 }
341
342 // Try to sink each replicate or scalar IV steps recipe in the worklist.
 // The size is re-read on every iteration because sinking a candidate may
 // append its own operands to the worklist.
343 for (unsigned I = 0; I != WorkList.size(); ++I) {
344 VPBasicBlock *SinkTo;
345 VPSingleDefRecipe *SinkCandidate;
346 std::tie(SinkTo, SinkCandidate) = WorkList[I];
347
348 // All recipe users of SinkCandidate must be in the same block SinkTo or all
349 // users outside of SinkTo must only use the first lane of SinkCandidate. In
350 // the latter case, we need to duplicate SinkCandidate.
351 auto UsersOutsideSinkTo =
352 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
353 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
354 });
355 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
356 return !U->usesFirstLaneOnly(SinkCandidate);
357 }))
358 continue;
359 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
360
361 if (NeedsDuplicating) {
 // With only a scalar VF no duplicate is created; the candidate stays put.
362 if (ScalarVFOnly)
363 continue;
364 VPSingleDefRecipe *Clone;
365 if (auto *SinkCandidateRepR =
366 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
367 // TODO: Handle converting to uniform recipes as separate transform,
368 // then cloning should be sufficient here.
370 SinkCandidateRepR->getOpcode(), SinkCandidate->operands(),
371 /*Mask=*/nullptr, *SinkCandidateRepR, *SinkCandidateRepR,
372 SinkCandidate->getDebugLoc(), SinkCandidate->getUnderlyingInstr());
373 // TODO: add ".cloned" suffix to name of Clone's VPValue.
374 } else {
375 Clone = SinkCandidate->clone();
376 }
377
 // The clone stays at the original position and takes over all users
 // outside SinkTo.
378 Clone->insertBefore(SinkCandidate);
379 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
380 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
381 });
382 }
383 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
 // The operands of the sunk recipe become new candidates.
384 for (VPValue *Op : SinkCandidate->operands())
385 InsertIfValidSinkCandidate(SinkTo, Op);
386 Changed = true;
387 }
388 return Changed;
389}
390
391/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
392/// the mask.
/// Returns nullptr unless the entry is a VPBasicBlock containing exactly one
/// recipe and that recipe is a VPBranchOnMaskRecipe; the mask is that
/// recipe's operand 0.
/// NOTE(review): the signature line is not shown in this listing.
394 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
395 if (!EntryBB || EntryBB->size() != 1 ||
396 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
397 return nullptr;
398
399 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
400}
401
402/// If \p R is a triangle region, return the 'then' block of the triangle.
/// Returns nullptr if the entry does not have two VPBasicBlock successors or
/// if they do not form a triangle.
/// NOTE(review): the signature line is not shown in this listing.
404 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
405 if (EntryBB->getNumSuccessors() != 2)
406 return nullptr;
407
408 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
409 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
410 if (!Succ0 || !Succ1)
411 return nullptr;
412
 // The two successors must have exactly one outgoing edge between them.
413 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
414 return nullptr;
 // The 'then' block is the successor that branches to the other one.
415 if (Succ0->getSingleSuccessor() == Succ1)
416 return Succ0;
417 if (Succ1->getSingleSuccessor() == Succ0)
418 return Succ1;
419 return nullptr;
420}
421
422// Merge replicate regions in their successor region, if a replicate region
423// is connected to a successor replicate region with the same predicate by a
424// single, empty VPBasicBlock.
// Returns true if at least one region was merged.
// NOTE(review): the signature line, the declaration of WorkList and the head
// of the collection loop (which binds Region1) are not shown in this listing.
426 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
427
428 // Collect replicate regions followed by an empty block, followed by another
429 // replicate region with matching masks to process up front. This is to avoid
430 // iterator invalidation issues while merging regions.
433 vp_depth_first_deep(Plan.getEntry()))) {
434 if (!Region1->isReplicator())
435 continue;
436 auto *MiddleBasicBlock =
437 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
438 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
439 continue;
440
441 auto *Region2 =
442 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
443 if (!Region2 || !Region2->isReplicator())
444 continue;
445
 // Both regions must branch on the very same mask value.
446 VPValue *Mask1 = getPredicatedMask(Region1);
447 VPValue *Mask2 = getPredicatedMask(Region2);
448 if (!Mask1 || Mask1 != Mask2)
449 continue;
450
451 assert(Mask1 && Mask2 && "both region must have conditions");
452 WorkList.push_back(Region1);
453 }
454
455 // Move recipes from Region1 to its successor region, if both are triangles.
456 for (VPRegionBlock *Region1 : WorkList) {
457 if (TransformedRegions.contains(Region1))
458 continue;
459 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
460 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
461
462 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
463 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
464 if (!Then1 || !Then2)
465 continue;
466
467 // Note: No fusion-preventing memory dependencies are expected in either
468 // region. Such dependencies should be rejected during earlier dependence
469 // checks, which guarantee accesses can be re-ordered for vectorization.
470 //
471 // Move recipes to the successor region.
 // Walking Then1 in reverse while inserting at the first non-phi position of
 // Then2 keeps the moved recipes in their original relative order.
472 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
473 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
474
475 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
476 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
477
478 // Move VPPredInstPHIRecipes from the merge block to the successor region's
479 // merge block. Update all users inside the successor region to use the
480 // original values.
481 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
482 VPValue *PredInst1 =
483 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
484 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
485 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
486 return cast<VPRecipeBase>(&U)->getParent() == Then2;
487 });
488
489 // Remove phi recipes that are unused after merging the regions.
490 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
491 Phi1ToMove.eraseFromParent();
492 continue;
493 }
494 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
495 }
496
497 // Remove the dead recipes in Region1's entry block.
498 for (VPRecipeBase &R :
499 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
500 R.eraseFromParent();
501
502 // Finally, remove the first region.
 // Its predecessors are rewired to the (empty) middle block.
503 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
504 VPBlockUtils::disconnectBlocks(Pred, Region1);
505 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
506 }
507 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
508 TransformedRegions.insert(Region1);
509 }
510
511 return !TransformedRegions.empty();
512}
513
515 VPRegionBlock *ParentRegion,
516 VPlan &Plan) {
 // Build a replicate region "pred.<opcode>" for the masked recipe PredRecipe:
 // an ".entry" block branching on the mask, an ".if" block holding an unmasked
 // copy of the recipe, and a ".continue" block that receives a
 // VPPredInstPHIRecipe when PredRecipe has users. PredRecipe itself is erased.
 // NOTE(review): the first signature line and the declaration of Region are
 // not shown in this listing.
517 Instruction *Instr = PredRecipe->getUnderlyingInstr();
518 // Build the triangular if-then region.
519 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
520 assert(Instr->getParent() && "Predicated instruction not in any basic block");
521 auto *BlockInMask = PredRecipe->getMask();
522 auto *MaskDef = BlockInMask->getDefiningRecipe();
 // The branch reuses the debug location of the mask's defining recipe, if any.
523 auto *BOMRecipe = new VPBranchOnMaskRecipe(
524 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
525 auto *Entry =
526 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
527
528 // Replace predicated replicate recipe with a replicate recipe without a
529 // mask but in the replicate region.
530 auto *RecipeWithoutMask = new VPReplicateRecipe(
531 PredRecipe->getUnderlyingInstr(), PredRecipe->operandsWithoutMask(),
532 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
533 PredRecipe->getDebugLoc());
534 auto *Pred =
535 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
536 auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
538 Plan.createReplicateRegion(Entry, Exiting, RegionName);
539
540 // Note: first set Entry as region entry and then connect successors starting
541 // from it in order, to propagate the "parent" of each VPBasicBlock.
542 Region->setParent(ParentRegion);
543 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
544 VPBlockUtils::connectBlocks(Pred, Exiting);
545
 // A phi in the exiting block is only needed if the predicated value is used.
546 if (PredRecipe->getNumUsers() != 0) {
547 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
548 RecipeWithoutMask->getDebugLoc());
549 Exiting->appendRecipe(PHIRecipe);
550 PredRecipe->replaceAllUsesWith(PHIRecipe);
551 }
552 PredRecipe->eraseFromParent();
553 return Region;
554}
555
 // Wrap every predicated VPReplicateRecipe of the plan in its own replicate
 // region, splitting the containing block at the recipe.
 // NOTE(review): the declaration of WorkList, the head of the collection loop
 // (which binds VPBB), the declaration of Region and the statement that links
 // the new region into the CFG are not shown in this listing.
556static void addReplicateRegions(VPlan &Plan) {
559 vp_depth_first_deep(Plan.getEntry()))) {
560 for (VPRecipeBase &R : *VPBB)
561 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
562 if (RepR->isPredicated())
563 WorkList.push_back(RepR);
564 }
565 }
566
 // Counter used to give the split-off blocks distinct names.
567 unsigned BBNum = 0;
568 for (VPReplicateRecipe *RepR : WorkList) {
569 VPBasicBlock *CurrentBlock = RepR->getParent();
570 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
571
 // Name the split block after the IR block of the underlying instruction, or
 // leave it unnamed if that block has no name.
572 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
573 SplitBlock->setName(
574 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
575 // Record predicated instructions for above packing optimizations.
577 createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
579
 // If the split block was the exiting block of its parent region, the tail
 // produced by the split takes over that role.
580 VPRegionBlock *ParentRegion = Region->getParent();
581 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
582 ParentRegion->setExiting(SplitBlock);
583 }
584}
585
589 vp_depth_first_deep(Plan.getEntry()))) {
 // NOTE(review): the signature, the declaration of WorkList and the head of
 // this loop (which binds VPBB) are not shown in this listing.
 // Visible behavior: collect blocks that have a parent region and a single
 // VPBasicBlock predecessor (not a VPIRBasicBlock) with exactly one successor,
 // then fold each of them into that predecessor. Returns true if any block was
 // collected.
590 // Don't fold the blocks in the skeleton of the Plan into their single
591 // predecessors for now.
592 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
593 if (!VPBB->getParent())
594 continue;
595 auto *PredVPBB =
596 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
597 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
598 isa<VPIRBasicBlock>(PredVPBB))
599 continue;
600 WorkList.push_back(VPBB);
601 }
602
603 for (VPBasicBlock *VPBB : WorkList) {
604 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
 // Append all recipes of VPBB to the end of the predecessor.
605 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
606 R.moveBefore(*PredVPBB, PredVPBB->end());
607 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
 // Keep the parent region's exiting block valid if VPBB had that role.
608 auto *ParentRegion = VPBB->getParent();
609 if (ParentRegion && ParentRegion->getExiting() == VPBB)
610 ParentRegion->setExiting(PredVPBB);
611 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
612 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
613 }
614 return !WorkList.empty();
615}
616
618 // Convert masked VPReplicateRecipes to if-then region blocks.
 // NOTE(review): the signature and the statement this comment refers to are
 // not shown in this listing.
620
 // Re-run sinking, region merging and block merging until a full round
 // reports no change from any of the three.
621 bool ShouldSimplify = true;
622 while (ShouldSimplify) {
623 ShouldSimplify = sinkScalarOperands(Plan);
624 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
625 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
626 }
627}
628
629/// Remove redundant casts of inductions.
630///
631/// Such redundant casts are casts of induction variables that can be ignored,
632/// because we already proved that the casted phi is equal to the uncasted phi
633/// in the vectorized loop. There is no need to vectorize the cast - the same
634/// value can be used for both the phi and casts in the vector loop.
/// NOTE(review): the signature line and the statement that binds IV from Phi
/// are not shown in this listing.
636 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
 // Phis for which IV is null, and IVs that carry a trunc instruction, are
 // skipped.
638 if (!IV || IV->getTruncInst())
639 continue;
640
641 // A sequence of IR Casts has potentially been recorded for IV, which
642 // *must be bypassed* when the IV is vectorized, because the vectorized IV
643 // will produce the desired casted value. This sequence forms a def-use
644 // chain and is provided in reverse order, ending with the cast that uses
645 // the IV phi. Search for the recipe of the last cast in the chain and
646 // replace it with the original IV. Note that only the final cast is
647 // expected to have users outside the cast-chain and the dead casts left
648 // over will be cleaned up later.
649 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
650 VPValue *FindMyCast = IV;
651 for (Instruction *IRCast : reverse(Casts)) {
 // Look among the users of the current chain value for the recipe whose
 // underlying value is this IR cast.
652 VPSingleDefRecipe *FoundUserCast = nullptr;
653 for (auto *U : FindMyCast->users()) {
654 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
655 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
656 FoundUserCast = UserCast;
657 break;
658 }
659 }
660 // A cast recipe in the chain may have been removed by earlier DCE.
661 if (!FoundUserCast)
662 break;
663 FindMyCast = FoundUserCast;
664 }
 // If at least one cast recipe was found, the last one found is replaced by
 // the IV itself.
665 if (FindMyCast != IV)
666 FindMyCast->replaceAllUsesWith(IV);
667 }
668}
669
672 Instruction::BinaryOps InductionOpcode,
673 FPMathOperator *FPBinOp, Instruction *TruncI,
674 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
675 VPBuilder &Builder) {
 // Build scalar IV steps: a derived IV over the loop region's canonical IV
 // (from StartV and Step), optionally truncated to TruncI's type, followed by
 // a scalar-IV-steps recipe. Returns the result of
 // Builder.createScalarIVSteps.
 // NOTE(review): the leading signature lines (return type, name, Plan and Kind
 // parameters) and the initializer of VecPreheader are not shown in this
 // listing.
676 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
677 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
678 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
679 VPSingleDefRecipe *BaseIV =
680 Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
681
682 // Truncate base induction if needed.
683 Type *ResultTy = BaseIV->getScalarType();
684 if (TruncI) {
685 Type *TruncTy = TruncI->getType();
686 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
687 "Not truncating.");
688 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
689 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
690 ResultTy = TruncTy;
691 }
692
693 // Truncate step if needed.
694 Type *StepTy = Step->getScalarType();
695 if (ResultTy != StepTy) {
696 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
697 "Not truncating.");
698 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
699 auto *VecPreheader =
 // The guard restores Builder's insert point at the end of this scope, so
 // only the step truncation is emitted in VecPreheader.
701 VPBuilder::InsertPointGuard Guard(Builder);
702 Builder.setInsertPoint(VecPreheader);
703 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
704 }
705 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
706 &Plan.getVF(), DL);
707}
708
710 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
712 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
 // NOTE(review): the line with the function name, the parameter line that
 // declares VF, UF and CostKind, the initializer of WideCanIV, the declaration
 // of WidenIV, part of the shuffle-cost call and the definition of ID are not
 // shown in this listing.
 //
 // Visible behavior, in order:
 //  1. No loop region or no WideCanIV: nothing to do.
 //  2. Scalar VF only, or only the first lane of WideCanIV is used: replace it
 //     by scalar IV steps starting at 0 with step 1.
 //  3. Only scalar values of WideCanIV are used: leave it alone.
 //  4. A header phi matching m_CanonicalWidenIV exists: reuse it.
 //  5. Otherwise introduce a new VPWidenIntOrFpInductionRecipe, unless the
 //     cost or register-pressure checks reject it.
713 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
714 if (!LoopRegion)
715 return;
716
717 auto *WideCanIV =
719 if (!WideCanIV)
720 return;
721
722 Type *CanIVTy = LoopRegion->getCanonicalIVType();
723
724 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
725 // IV.
726 if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(WideCanIV)) {
727 VPBuilder Builder(WideCanIV);
728 WideCanIV->replaceAllUsesWith(createScalarIVSteps(
729 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
730 nullptr, Plan.getZero(CanIVTy), Plan.getConstantInt(CanIVTy, 1),
731 WideCanIV->getDebugLoc(), Builder));
732 WideCanIV->eraseFromParent();
733 return;
734 }
735
736 if (vputils::onlyScalarValuesUsed(WideCanIV))
737 return;
738
739 // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
740 // in the header, reuse it instead of introducing another wide induction phi.
741 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
742 for (VPRecipeBase &Phi : Header->phis()) {
744 if (!match(&Phi, m_CanonicalWidenIV(WidenIV)))
745 continue;
746 // The reused wide IV feeds the header mask, whose lanes may extend past
747 // the trip count; drop flags that only hold inside the scalar loop.
748 WidenIV->dropPoisonGeneratingFlags();
749 WideCanIV->replaceAllUsesWith(WidenIV);
750 WideCanIV->eraseFromParent();
751 return;
752 }
753
754 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable.
 // Profitability check: bail out if a PHI costs more than the broadcast.
755 auto *VecTy = VectorType::get(CanIVTy, VF);
756 InstructionCost BroadcastCost = TTI.getShuffleCost(
758 InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
759 if (PHICost > BroadcastCost)
760 return;
761
762 // Bail out if the additional wide induction phi increases the expected spill
763 // cost.
 // The per-class maximum register usage of the plan is scaled by UF, then the
 // registers needed for one more value of type VecTy are added to its class.
764 VPRegisterUsage UnrolledBase =
765 calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
766 for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
767 NumUsers *= UF;
768 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
769 VPRegisterUsage Projected = UnrolledBase;
770 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
771 if (Projected.spillCost(TTI, CostKind) >
772 UnrolledBase.spillCost(TTI, CostKind))
773 return;
774
 // New wide IV: start 0, step 1, no underlying IR phi, carrying the no-wrap
 // flags and debug location of WideCanIV. It is inserted at the first non-phi
 // position of the header.
777 VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
778 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
779 /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
780 WideCanIV->getNoWrapFlags(), WideCanIV->getDebugLoc());
781 NewWideIV->insertBefore(&*Header->getFirstNonPhi());
782 WideCanIV->replaceAllUsesWith(NewWideIV);
783 WideCanIV->eraseFromParent();
784}
785
786/// Returns true if \p R is dead and can be removed.
/// A recipe is reported dead if it is a conditional assume, or if it has no
/// side effects and none of its defined values has users.
787static bool isDeadRecipe(VPRecipeBase &R) {
788 // Do remove conditional assume instructions as their conditions may be
789 // flattened.
 // NOTE(review): the second half of the IsConditionalAssume condition is not
 // shown in this listing.
790 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
791 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
793 if (IsConditionalAssume)
794 return true;
795
 // Recipes with side effects are never considered dead.
796 if (R.mayHaveSideEffects())
797 return false;
798
799 // Recipe is dead if no user keeps the recipe alive.
800 return all_of(R.definedValues(),
801 [](VPValue *V) { return V->getNumUsers() == 0; });
802}
803
806 Plan.getEntry());
 // NOTE(review): the signature, the start of the traversal declaration and the
 // loop header that binds VPBB are not shown in this listing.
 // Visible behavior: erase every recipe for which isDeadRecipe() holds, and
 // additionally remove two-operand VPPhis whose only user is the recipe
 // defining their second operand, when that operand has no other user.
808 // The recipes in the block are processed in reverse order, to catch chains
809 // of dead recipes.
810 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
811 if (isDeadRecipe(R)) {
812 R.eraseFromParent();
813 continue;
814 }
815
816 // Check if R is a dead VPPhi <-> update cycle and remove it.
817 VPValue *Start, *Incoming;
818 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
819 continue;
820 auto *PhiR = cast<VPPhi>(&R);
821 VPUser *PhiUser = PhiR->getSingleUser();
822 if (!PhiUser)
823 continue;
 // The phi's single user must be the recipe defining Incoming, and Incoming
 // must have exactly one user, so nothing outside the cycle depends on it.
824 if (PhiUser != Incoming->getDefiningRecipe() ||
825 Incoming->getNumUsers() != 1)
826 continue;
 // Break the cycle first so that both recipes can be erased.
827 PhiR->replaceAllUsesWith(Start);
828 PhiR->eraseFromParent();
829 Incoming->getDefiningRecipe()->eraseFromParent();
830 }
831 }
832}
833
836 for (unsigned I = 0; I != Users.size(); ++I) {
 // NOTE(review): the signature, the declaration/seeding of Users and the
 // statement that binds Cur are not shown in this listing.
 // Users grows while it is being indexed, so the users of every value defined
 // by an already collected entry are collected as well.
838 for (VPValue *V : Cur->definedValues())
839 Users.insert_range(V->users());
840 }
841 return Users.takeVector();
842}
843
844/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
845/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
846/// generates scalar values.
/// Returns the created PtrAdd; the caller is responsible for replacing the
/// uses of \p PtrIV.
/// NOTE(review): the line with the function name and the PtrIV parameter, the
/// definition of ID and the start of the Steps initializer are not shown in
/// this listing.
847static VPValue *
849 VPlan &Plan, VPBuilder &Builder) {
 // The steps start at zero of the induction step's type and advance by
 // operand 1 of PtrIV.
851 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
852 VPValue *StepV = PtrIV->getOperand(1);
854 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
855 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
856
857 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
858 PtrIV->getDebugLoc(), "next.gep");
859}
860
861/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
862/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
863/// VPWidenPointerInductionRecipe will generate vectors only. If some users
864/// require vectors while others require scalars, the scalar uses need to extract
865/// the scalars from the generated vectors (Note that this is different to how
866/// int/fp inductions are handled). Legalize extract-from-ends using uniform
867/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
868/// the correct end value is available. Also optimize
869/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
870/// providing them scalar steps built on the canonical scalar IV and update the
871/// original IV's users. This is an optional optimization to reduce the needs of
872/// vector extracts.
/// NOTE(review): the signature, the definition of HeaderVPBB, two skip
/// conditions in the narrowing loop, the head of the Clone construction and
/// the start of the Steps initializer are not shown in this listing.
875 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
 // All new recipes are inserted at the first non-phi position of the header.
876 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
877 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
878 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
879 if (!PhiR)
880 continue;
881
882 // Try to narrow wide and replicating recipes to uniform recipes, based on
883 // VPlan analysis.
884 // TODO: Apply to all recipes in the future, to replace legacy uniformity
885 // analysis.
886 auto Users = collectUsersRecursively(PhiR);
887 for (VPUser *U : reverse(Users)) {
888 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
889 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
890 // Skip recipes that shouldn't be narrowed.
 // I.e. anything that is not a VPReplicateRecipe/VPWidenRecipe, has no
 // users or no underlying value, or is a replicate recipe that is already
 // single-scalar or is predicated.
891 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
892 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
893 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
894 continue;
895
896 // Skip recipes that may have other lanes than their first used.
898 continue;
899
900 // TODO: Support scalarizing ExtractValue.
901 if (match(Def,
903 continue;
904
 // The replacement is placed right after Def and takes over all its users;
 // Def itself is not erased here.
906 Def->getUnderlyingInstr()->getOpcode(), Def->operands(),
907 /*Mask=*/nullptr, *Def, {}, DebugLoc::getUnknown(),
908 Def->getUnderlyingInstr());
909 Clone->insertAfter(Def);
910 Def->replaceAllUsesWith(Clone);
911 }
912
913 // Replace wide pointer inductions which have only their scalars used by
914 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
915 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
 // With vector VFs present, only scalarize if the recipe reports that only
 // scalars are generated.
916 if (!Plan.hasScalarVFOnly() &&
917 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
918 continue;
919
920 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
921 PtrIV->replaceAllUsesWith(PtrAdd);
922 continue;
923 }
924
925 // Replace widened induction with scalar steps for users that only use
926 // scalars.
927 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
 // With only vector VFs, scalar steps are pointless unless some user uses
 // scalars of the wide IV.
928 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
929 return U->usesScalars(WideIV);
930 }))
931 continue;
932
933 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
935 Plan, ID.getKind(), ID.getInductionOpcode(),
936 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
937 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
938 WideIV->getDebugLoc(), Builder);
939
940 // Update scalar users of IV to use Step instead.
 // If the plan includes a scalar VF, every user is redirected. Otherwise
 // only users that use scalars are redirected, or only users of the first
 // lane when a scalable VF is present.
941 if (!HasOnlyVectorVFs) {
942 assert(!Plan.hasScalableVF() &&
943 "plans containing a scalar VF cannot also include scalable VFs");
944 WideIV->replaceAllUsesWith(Steps);
945 } else {
946 bool HasScalableVF = Plan.hasScalableVF();
947 WideIV->replaceUsesWithIf(Steps,
948 [WideIV, HasScalableVF](VPUser &U, unsigned) {
949 if (HasScalableVF)
950 return U.usesFirstLaneOnly(WideIV);
951 return U.usesScalars(WideIV);
952 });
953 }
954 }
955}
956
957/// Check if \p VPV is an untruncated wide induction, either before or after the
958/// increment. If so return the header IV (before the increment), otherwise
959/// return null.
///
/// An increment is recognized only if it is defined by a two-operand recipe
/// with the wide induction as one operand, and it advances the induction by
/// exactly its step (see the per-opcode checks below).
962 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
963 if (WideIV) {
964 // VPV itself is a wide induction, separately compute the end value for exit
965 // users if it is not a truncated IV.
966 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
967 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
968 }
969
970 // Check if VPV is an optimizable induction increment.
971 VPRecipeBase *Def = VPV->getDefiningRecipe();
972 if (!Def || Def->getNumOperands() != 2)
973 return nullptr;
     // The wide induction may be either operand of the candidate increment.
974 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
975 if (!WideIV)
976 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
977 if (!WideIV)
978 return nullptr;
979
980 auto IsWideIVInc = [&]() {
981 auto &ID = WideIV->getInductionDescriptor();
982
983 // Check if VPV increments the induction by the induction step.
984 VPValue *IVStep = WideIV->getStepValue();
985 switch (ID.getInductionOpcode()) {
986 case Instruction::Add:
987 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
988 case Instruction::FAdd:
989 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
990 case Instruction::FSub:
     // FSub is matched with fixed operand order: WideIV - IVStep.
991 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
992 m_Specific(IVStep)));
993 case Instruction::Sub: {
994 // IVStep will be the negated step of the subtraction. Check if Step == -1
995 // * IVStep.
996 VPValue *Step;
997 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
998 return false;
999 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
1000 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
1001 ScalarEvolution &SE = *PSE.getSE();
      // Both SCEVs must be computable for the comparison to be meaningful.
1002 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
1003 !isa<SCEVCouldNotCompute>(StepSCEV) &&
1004 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
1005 }
1006 default:
      // Any other opcode only qualifies for a pointer induction that is
      // advanced by a GEP of WideIV by its step.
1007 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
1008 match(VPV, m_GetElementPtr(m_Specific(WideIV),
1009 m_Specific(WideIV->getStepValue())));
1010 }
      // Every switch case above returns.
1011 llvm_unreachable("should have been covered by switch above");
1012 };
1013 return IsWideIVInc() ? WideIV : nullptr;
1014}
1015
1016/// Attempts to optimize the induction variable exit values for users in the
1017/// early exit block.
///
/// Returns the newly computed end value, or nullptr if the operand does not
/// match the expected pattern, is not an optimizable wide induction (or its
/// increment), or is a truncated int/FP induction.
1020 VPValue *Incoming, *Mask;
1022 m_VPValue(Incoming))))
1023 return nullptr;
1024
1025 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1026 if (!WideIV)
1027 return nullptr;
1028
      // Truncated int/FP inductions are not handled here.
1029 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1030 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1031 return nullptr;
1032
1033 // Calculate the final index.
1034 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1035 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1036 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
      // New recipes are built at the position of the extract recipe, reusing
      // its debug location.
1037 auto *ExtractR = cast<VPInstruction>(Op);
1038 VPBuilder B(ExtractR);
1039
1040 DebugLoc DL = ExtractR->getDebugLoc();
      // Index = canonical IV + first active lane of Mask, with the lane index
      // zero-extended or truncated to the canonical IV type.
1041 VPValue *FirstActiveLane = B.createFirstActiveLane(Mask, DL);
1042 FirstActiveLane = B.createScalarZExtOrTrunc(
1043 FirstActiveLane, CanonicalIVType, FirstActiveLane->getScalarType(), DL);
1044 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1045
1046 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1047 // changed it means the exit is using the incremented value, so we need to
1048 // add the step.
1049 if (Incoming != WideIV) {
1050 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1051 EndValue = B.createAdd(EndValue, One, DL);
1052 }
1053
      // For anything but the canonical wide IV, turn the index into the
      // induction's value via a derived IV built from its start and step.
1054 if (!match(WideIV, m_CanonicalWidenIV())) {
1055 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1056 VPIRValue *Start = WideIV->getStartValue();
1057 VPValue *Step = WideIV->getStepValue();
1058 EndValue = B.createDerivedIV(
1059 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1060 Start, EndValue, Step);
1061 }
1062
1063 return EndValue;
1064}
1065
1066/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1067/// VPDerivedIVRecipe for non-canonical inductions.
///
/// New recipes are created with \p VectorPHBuilder. Returns nullptr for a
/// truncated int/FP induction; otherwise returns the end value, truncated to
/// the scalar type of \p WideIV if the two types differ.
1069 VPBuilder &VectorPHBuilder,
1070 VPValue *VectorTC) {
1071 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1072 // Truncated wide inductions resume from the last lane of their vector value
1073 // in the last vector iteration which is handled elsewhere.
1074 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1075 return nullptr;
1076
1077 VPIRValue *Start = WideIV->getStartValue();
1078 VPValue *Step = WideIV->getStepValue();
      // For the canonical wide IV the end value is VectorTC itself; any other
      // induction derives it from its start, VectorTC and its step.
1080 VPValue *EndValue = VectorTC;
1081 if (!match(WideIV, m_CanonicalWidenIV())) {
1082 EndValue = VectorPHBuilder.createDerivedIV(
1083 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1084 Start, VectorTC, Step);
1085 }
1086
1087 // EndValue is derived from the vector trip count (which has the same type as
1088 // the widest induction) and thus may be wider than the induction here.
1089 Type *ScalarTypeOfWideIV = WideIV->getScalarType();
1090 if (ScalarTypeOfWideIV != EndValue->getScalarType()) {
1091 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1092 ScalarTypeOfWideIV,
1093 WideIV->getDebugLoc());
1094 }
1095
1096 return EndValue;
1097}
1098
1099/// Attempts to optimize the induction variable exit values for users in the
1100/// exit block coming from the latch in the original scalar loop.
///
/// Returns the value to use for the exit user, or nullptr if the operand is
/// not an optimizable wide induction (or its increment). The end value of the
/// induction must already be present in the EndValues map.
1101static VPValue *
1105 VPValue *Incoming;
1107 return nullptr;
1108
1109 VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
1110 if (!WideIV)
1111 return nullptr;
1112
1113 VPValue *EndValue = EndValues.lookup(WideIV);
1114 assert(EndValue && "Must have computed the end value up front");
1115
1116 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1117 // changed it means the exit is using the incremented value, so we don't
1118 // need to subtract the step.
1119 if (Incoming != WideIV)
1120 return EndValue;
1121
1122 // Otherwise, subtract the step from the EndValue.
1123 auto *ExtractR = cast<VPInstruction>(Op);
1124 VPBuilder B(ExtractR);
1125 VPValue *Step = WideIV->getStepValue();
1126 Type *ScalarTy = WideIV->getScalarType();
      // Integer induction: EndValue - Step.
1127 if (ScalarTy->isIntegerTy())
1128 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
      // Pointer induction: PtrAdd of EndValue and the negated step (0 - Step).
1129 if (ScalarTy->isPointerTy()) {
1130 Type *StepTy = Step->getScalarType();
1131 auto *Zero = Plan.getZero(StepTy);
1132 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1133 DebugLoc::getUnknown(), "ind.escape");
1134 }
      // FP induction: apply the inverse of the induction's binary operator
      // (FSub for FAdd, FAdd otherwise), reusing its fast-math flags.
1135 if (ScalarTy->isFloatingPointTy()) {
1136 const auto &ID = WideIV->getInductionDescriptor();
1137 return B.createNaryOp(
1138 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1139 ? Instruction::FSub
1140 : Instruction::FAdd,
1141 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1142 }
1143 llvm_unreachable("all possible induction types must be handled");
1144 return nullptr;
1145}
1146
1148 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1149 // Compute end values for all inductions.
1150 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1151 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
      // End values are materialized at the start of the vector preheader.
1152 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
      // With tail folding the trip count is used to compute the end values,
      // otherwise the vector trip count.
1154 VPValue *ResumeTC =
1155 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1156 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1157 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1158 if (!WideIV)
1159 continue;
      // Inductions for which no end value could be computed are simply not
      // recorded in EndValues.
1160 if (VPValue *EndValue =
1161 tryToComputeEndValueForInduction(WideIV, VectorPHBuilder, ResumeTC))
1162 EndValues[WideIV] = EndValue;
1163 }
1164
      // Replace exiting-IV-value recipes in the middle block by the end value
      // computed above, when there is one, and erase them.
1165 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1166 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1167 VPValue *Op;
1168 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1169 continue;
1170 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1171 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1172 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1173 R.eraseFromParent();
1174 }
1175 }
1176
1177 // Then, optimize exit block users.
1178 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1179 for (VPRecipeBase &R : ExitVPBB->phis()) {
1180 auto *ExitIRI = cast<VPIRPhi>(&R);
1181
      // Each incoming value is handled according to its predecessor: the
      // middle block is treated differently from any other predecessor.
1182 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1183 VPValue *Escape = nullptr;
1184 if (PredVPBB == MiddleVPBB)
1186 Plan, ExitIRI->getOperand(Idx), EndValues, PSE);
1187 else
1189 Plan, ExitIRI->getOperand(Idx), PSE);
      // Only rewrite the operand if an optimized value was produced.
1190 if (Escape)
1191 ExitIRI->setOperand(Idx, Escape);
1192 }
1193 }
1194 }
1195}
1196
1197/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1198/// them with already existing recipes expanding the same SCEV expression.
/// If a removed recipe was the plan's trip count, the trip count is reset to
/// the recipe that replaces it.
1201
1202 for (VPRecipeBase &R :
1204 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1205 if (!ExpR)
1206 continue;
1207
      // The first recipe seen for a SCEV is recorded and kept; later recipes
      // expanding the same SCEV are replaced by it.
1208 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1209 if (Inserted)
1210 continue;
1211
1212 ExpR->replaceAllUsesWith(V->second);
      // Update the plan's trip count before the recipe it refers to is erased.
1213 if (ExpR == Plan.getTripCount())
1214 Plan.resetTripCount(V->second);
1215
1216 ExpR->eraseFromParent();
1217 }
1218}
1219
      // Worklist-driven deletion: starting from V, erase the defining recipe of
      // each visited value if it is dead, then visit that recipe's operands.
1221 SmallVector<VPValue *> WorkList;
1223 WorkList.push_back(V);
1224
1225 while (!WorkList.empty()) {
1226 VPValue *Cur = WorkList.pop_back_val();
      // Process each value at most once.
1227 if (!Seen.insert(Cur).second)
1228 continue;
      // Values without a defining recipe have nothing to erase.
1229 VPRecipeBase *R = Cur->getDefiningRecipe();
1230 if (!R)
1231 continue;
1232 if (!isDeadRecipe(*R))
1233 continue;
      // Queue the operands before erasing R.
1234 append_range(WorkList, R->operands());
1235 R->eraseFromParent();
1236 }
1237}
1238
1239/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1240/// Returns an optional pair, where the first element indicates whether it is
1241/// an intrinsic ID.
/// The second element is the opcode or intrinsic ID itself. Returns
/// std::nullopt for recipe kinds not handled below.
1242static std::optional<std::pair<bool, unsigned>>
1244 return TypeSwitch<const VPSingleDefRecipe *,
1245 std::optional<std::pair<bool, unsigned>>>(R)
1248 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1249 .Case([](const VPWidenIntrinsicRecipe *I) {
1250 return std::make_pair(true, I->getVectorIntrinsicID());
1251 })
1252 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1253 [](auto *I) {
1254 // For recipes that do not directly map to LLVM IR instructions,
1255 // assign opcodes after the last VPInstruction opcode (which is also
1256 // after the last IR Instruction opcode), based on the VPRecipeID.
1257 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1258 I->getVPRecipeID());
1259 })
      // Any other recipe kind carries no opcode or intrinsic ID.
1260 .Default([](auto *) { return std::nullopt; });
1261}
1262
1263/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1264/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1265/// Operands are foldable live-ins.
/// On success the folded IR value is added to the plan as a live-in and that
/// live-in is returned; \p R itself is not modified here.
1267 ArrayRef<VPValue *> Operands,
1268 const DataLayout &DL) {
1269 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1270 if (!OpcodeOrIID)
1271 return nullptr;
1272
      // Collect the underlying IR value of every operand; bail out as soon as
      // an operand is not a live-in with an underlying value.
1274 for (VPValue *Op : Operands) {
1275 VPValue *Candidate = Op;
      // Look through a broadcast to the broadcasted value. NOTE(review):
      // presumably Candidate is left as Op when the match fails - confirm.
1276 match(Op, m_Broadcast(m_VPValue(Candidate)));
1277 if (!match(Candidate, m_LiveIn()))
1278 return nullptr;
1279 Value *V = Candidate->getUnderlyingValue();
1280 if (!V)
1281 return nullptr;
1282 Ops.push_back(V);
1283 }
1284
1285 VPlan &Plan = *R.getParent()->getPlan();
1286 auto FoldToIRValue = [&]() -> Value * {
1287 InstSimplifyFolder Folder(DL);
      // Intrinsics: only those with exactly two operands are folded.
1288 if (OpcodeOrIID->first) {
1289 if (R.getNumOperands() != 2)
1290 return nullptr;
1291 unsigned ID = OpcodeOrIID->second;
1292 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1], R.getScalarType());
1293 }
1294 unsigned Opcode = OpcodeOrIID->second;
1295 if (Instruction::isBinaryOp(Opcode))
1296 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1297 Ops[0], Ops[1]);
1298 if (Instruction::isCast(Opcode))
1299 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1300 R.getVPSingleValue()->getScalarType());
1301 switch (Opcode) {
1302 case VPInstruction::Not:
1303 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1305 case Instruction::Select:
1306 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1307 case Instruction::ICmp:
1308 case Instruction::FCmp:
1309 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1310 Ops[1]);
1311 case Instruction::GetElementPtr: {
      // The source element type comes from the underlying IR GEP, the
      // no-wrap flags from the recipe.
1312 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1313 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1314 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1315 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1316 }
      // Folded as an i8 GEP of Ops[0] by Ops[1], i.e. a byte offset.
1319 return Folder.FoldGEP(IntegerType::getInt8Ty(Plan.getContext()), Ops[0],
1320 Ops[1],
1321 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1322 // An extract of a live-in is an extract of a broadcast, so return the
1323 // broadcasted element.
1324 case Instruction::ExtractElement:
1325 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1326 return Ops[0];
1327 }
      // Opcode not handled by any case above.
1328 return nullptr;
1329 };
1330
1331 if (Value *V = FoldToIRValue())
1332 return Plan.getOrAddLiveIn(V);
1333 return nullptr;
1334}
1335
1336/// Try to simplify logical and bitwise recipes in \p Def.
/// Folds that need to create a new recipe via the builder are only attempted
/// when \p CanCreateNewRecipe is set, with the exception of the
/// `x && (y && x)` fold. Returns true if one of the folds below was applied
/// (users of \p Def were redirected or its operands rewritten), false
/// otherwise.
1338 bool CanCreateNewRecipe) {
1339 VPlan *Plan = Def->getParent()->getPlan();
1340
1341 // Simplify (X && Y) | (X && !Y) -> X.
1342 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1343 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1344 // recipes to be visited during simplification.
1345 VPValue *X, *Y, *Z;
1346 if (match(Def,
1349 Def->replaceAllUsesWith(X);
1350 Def->eraseFromParent();
1351 return true;
1352 }
1353
1354 // x | AllOnes -> AllOnes
1355 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
1356 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1357 return true;
1358 }
1359
1360 // x | 0 -> x
1361 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt()))) {
1362 Def->replaceAllUsesWith(X);
1363 return true;
1364 }
1365
1366 // x | !x -> AllOnes
1367 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_Not(m_Deferred(X))))) {
1368 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1369 return true;
1370 }
1371
1372 // x & 0 -> 0
1373 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) {
1374 Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1375 return true;
1376 }
1377
1378 // x & AllOnes -> x
1379 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes()))) {
1380 Def->replaceAllUsesWith(X);
1381 return true;
1382 }
1383
1384 // x && false -> false
1385 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False()))) {
1386 Def->replaceAllUsesWith(Plan->getFalse());
1387 return true;
1388 }
1389
1390 // x && true -> x
1391 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True()))) {
1392 Def->replaceAllUsesWith(X);
1393 return true;
1394 }
1395
1396 // (x && y) | (x && z) -> x && (y | z)
1397 if (CanCreateNewRecipe &&
1400 // Simplify only if one of the operands has one use to avoid creating an
1401 // extra recipe.
1402 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1403 !Def->getOperand(1)->hasMoreThanOneUniqueUser())) {
1404 Def->replaceAllUsesWith(
1405 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1406 return true;
1407 }
1408
1409 // x && (x && y) -> x && y
      // The inner logical-and (operand 1) already is the simplified value.
1410 if (match(Def, m_LogicalAnd(m_VPValue(X),
1412 Def->replaceAllUsesWith(Def->getOperand(1));
1413 return true;
1414 }
1415
1416 // x && (y && x) -> x && y
1417 if (match(Def, m_LogicalAnd(m_VPValue(X),
1419 Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1420 return true;
1421 }
1422
1423 // x && !x -> 0
1424 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) {
1425 Def->replaceAllUsesWith(Plan->getFalse());
1426 return true;
1427 }
1428
      // select c, x, x -> x
1429 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) {
1430 Def->replaceAllUsesWith(X);
1431 return true;
1432 }
1433
1434 // select c, false, true -> not c
1435 VPValue *C;
1436 if (CanCreateNewRecipe &&
1437 match(Def, m_Select(m_VPValue(C), m_False(), m_True()))) {
1438 Def->replaceAllUsesWith(Builder.createNot(C));
1439 return true;
1440 }
1441
1442 // select !c, x, y -> select c, y, x
      // Done in place by rewriting the operands of Def.
1443 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1444 Def->setOperand(0, C);
1445 Def->setOperand(1, Y);
1446 Def->setOperand(2, X);
1447 return true;
1448 }
1449
1450 // select x, (i1 y | z), y -> y | (x && z)
1451 if (CanCreateNewRecipe &&
1452 match(Def, m_Select(m_VPValue(X),
1454 m_Deferred(Y))) &&
1455 Y->getScalarType()->isIntegerTy(1)) {
1456 Def->replaceAllUsesWith(
1457 Builder.createOr(Y, Builder.createLogicalAnd(X, Z)));
1458 return true;
1459 }
1460
1461 return false;
1462}
1463
1464/// Try to simplify VPSingleDefRecipe \p Def.
/// Applies at most a handful of local folds: constant folding of live-in
/// operands, cast/logical/arithmetic simplifications, and look-throughs of
/// extract/broadcast/build-vector recipes. Folds placed after the
/// Plan->isUnrolled() check are only applied to unrolled plans.
1466 VPlan *Plan = Def->getParent()->getPlan();
1467
1468 // Simplification of live-in IR values for SingleDef recipes using
1469 // InstSimplifyFolder.
1470 const DataLayout &DL = Plan->getDataLayout();
1471 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL))
1472 return Def->replaceAllUsesWith(V);
1473
1474 // Fold PredPHI LiveIn -> LiveIn.
1475 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1476 VPValue *Op = PredPHI->getOperand(0);
1477 if (isa<VPIRValue>(Op))
1478 PredPHI->replaceAllUsesWith(Op);
1479 }
1480
1481 VPBuilder Builder(Def);
1482
1483 // Avoid replacing VPInstructions with underlying values with new
1484 // VPInstructions, as we would fail to create widen/replicate recipes from the
1485 // new VPInstructions without an underlying value, and miss out on some
1486 // transformations that only apply to widened/replicated recipes later, by
1487 // doing so.
1488 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1489 // VPInstructions without underlying values, as those will get skipped during
1490 // cost computation.
1491 bool CanCreateNewRecipe =
1492 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1493
      // trunc (zext/sext A): drop or narrow the cast pair, depending on how the
      // type of A compares with the truncated type.
1494 VPValue *A;
1495 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1496 Type *TruncTy = Def->getScalarType();
1497 Type *ATy = A->getScalarType();
1498 if (TruncTy == ATy) {
      // Same type: the casts cancel out, use A directly.
1499 Def->replaceAllUsesWith(A);
1500 } else {
1501 // Don't replace a non-widened cast recipe with a widened cast.
1502 if (!isa<VPWidenCastRecipe>(Def))
1503 return;
1504 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1505
      // A is narrower than the result: extend A directly, keeping the kind
      // (sext or zext) of the original extend.
1506 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1507 ? Instruction::SExt
1508 : Instruction::ZExt;
1509 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1510 TruncTy);
1511 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1512 // UnderlyingExt has distinct return type, used to retain legacy cost.
1513 Ext->setUnderlyingValue(UnderlyingExt);
1514 }
1515 Def->replaceAllUsesWith(Ext);
1516 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
      // A is wider than the result: truncate A directly.
1517 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1518 Def->replaceAllUsesWith(Trunc);
1519 }
1520 }
1521 }
1522
1523 if (simplifyLogicalRecipe(Def, Builder, CanCreateNewRecipe))
1524 return;
1525
1526 VPValue *X, *Y, *C;
      // x + 0 -> x
1527 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1528 return Def->replaceAllUsesWith(A);
1529
      // x * 1 -> x
1530 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1531 return Def->replaceAllUsesWith(A);
1532
      // x * 0 -> 0
1533 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1534 return Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1535
      // x * AllOnes -> 0 - x
1536 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1537 // Preserve nsw from the Mul on the new Sub.
1539 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1540 return Def->replaceAllUsesWith(Builder.createSub(
1541 Plan->getZero(A->getScalarType()), A, Def->getDebugLoc(), "", NW));
1542 }
1543
1544 if (CanCreateNewRecipe &&
1546 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1547 // new Sub.
1549 false,
1550 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1551 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1552 ->hasNoSignedWrap()};
1553 return Def->replaceAllUsesWith(
1554 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1555 }
1556
      // x * 2^n -> x << n, keeping the flags and debug location of Def.
1557 const APInt *APC;
1558 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1559 APC->isPowerOf2())
1560 return Def->replaceAllUsesWith(Builder.createNaryOp(
1561 Instruction::Shl,
1562 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1563 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1564
      // x udiv 2^n -> x lshr n, keeping the flags and debug location of Def.
1565 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1566 APC->isPowerOf2())
1567 return Def->replaceAllUsesWith(Builder.createNaryOp(
1568 Instruction::LShr,
1569 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1570 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1571
1572 if (match(Def, m_Not(m_VPValue(A)))) {
      // not (not x) -> x
1573 if (match(A, m_Not(m_VPValue(A))))
1574 return Def->replaceAllUsesWith(A);
1575
1576 // Try to fold Not into compares by adjusting the predicate in-place.
1577 CmpPredicate Pred;
1578 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1579 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
      // Only done if every user of the compare is either a negation of it or
      // a select using it as condition, so all users can be adjusted.
1580 if (all_of(Cmp->users(),
1582 m_Not(m_Specific(Cmp)),
1583 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1584 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
      // Iterate over a copy of the users, as the loop body changes uses.
1585 for (VPUser *U : to_vector(Cmp->users())) {
1586 auto *R = cast<VPSingleDefRecipe>(U);
1587 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1588 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1589 R->setOperand(1, Y);
1590 R->setOperand(2, X);
1591 } else {
1592 // not (cmp pred) -> cmp inv_pred
1593 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1594 R->replaceAllUsesWith(Cmp);
1595 }
1596 }
1597 // If Cmp doesn't have a debug location, use the one from the negation,
1598 // to preserve the location.
1599 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1600 Cmp->setDebugLoc(Def->getDebugLoc());
1601 }
1602 }
1603 }
1604
1605 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1606 // any-of (fcmp uno %A, %B), ...
1607 if (match(Def, m_AnyOf())) {
1609 VPRecipeBase *UnpairedCmp = nullptr;
1610 for (VPValue *Op : Def->operands()) {
1611 VPValue *X;
      // Operands with more than one user, or not matching the pattern, are
      // kept as they are; matching operands are combined in pairs.
1612 if (Op->getNumUsers() > 1 ||
1614 m_Deferred(X)))) {
1615 NewOps.push_back(Op);
1616 } else if (!UnpairedCmp) {
1617 UnpairedCmp = Op->getDefiningRecipe();
1618 } else {
1619 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1620 UnpairedCmp->getOperand(0), X));
1621 UnpairedCmp = nullptr;
1622 }
1623 }
1624
      // A leftover compare without a partner is kept unchanged.
1625 if (UnpairedCmp)
1626 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1627
      // Only create a new any-of if at least one pair was combined.
1628 if (NewOps.size() < Def->getNumOperands()) {
1629 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1630 return Def->replaceAllUsesWith(NewAnyOf);
1631 }
1632 }
1633
1634 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1635 // This is useful for fmax/fmin without fast-math flags, where we need to
1636 // check if any operand is NaN.
1637 if (CanCreateNewRecipe &&
1639 m_Deferred(X)),
1641 m_Deferred(Y))))) {
1642 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1643 return Def->replaceAllUsesWith(NewCmp);
1644 }
1645
1646 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
      // Only done if operand 1 already has the scalar type of Def.
1647 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1648 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1649 Def->getOperand(1)->getScalarType() == Def->getScalarType())
1650 return Def->replaceAllUsesWith(Def->getOperand(1));
1651
1653 m_One()))) {
      // Replace Def by X, truncated to the scalar type of Def if the types
      // differ.
1654 Type *WideStepTy = Def->getScalarType();
1655 if (X->getScalarType() != WideStepTy)
1656 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1657 Def->replaceAllUsesWith(X);
1658 return;
1659 }
1660
1661 // For i1 vp.merges produced by AnyOf reductions:
1662 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1664 m_VPValue(X), m_VPValue())) &&
1666 Def->getScalarType()->isIntegerTy(1)) {
      // Done in place: the old operand 0 moves to operand 1 and Y becomes the
      // new operand 0.
1667 Def->setOperand(1, Def->getOperand(0));
1668 Def->setOperand(0, Y);
1669 return;
1670 }
1671
1672 // Simplify MaskedCond with no block mask to its single operand.
1674 !cast<VPInstruction>(Def)->isMasked())
1675 return Def->replaceAllUsesWith(Def->getOperand(0));
1676
1677 // Look through ExtractLastLane.
1678 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
      // The last lane of a build-vector is its last operand.
1679 if (match(A, m_BuildVector())) {
1680 auto *BuildVector = cast<VPInstruction>(A);
1681 Def->replaceAllUsesWith(
1682 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1683 return;
1684 }
1685
      // The last lane of a broadcast is the broadcasted value.
1686 if (match(A, m_Broadcast(m_VPValue(X))))
1687 return Def->replaceAllUsesWith(X);
1688
1690 return Def->replaceAllUsesWith(A);
1691
      // With only a scalar VF, the extract is replaced by its operand.
1692 if (Plan->hasScalarVFOnly())
1693 return Def->replaceAllUsesWith(A);
1694 }
1695
1696 // Look through ExtractPenultimateElement (BuildVector ....).
1698 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1699 Def->replaceAllUsesWith(
1700 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1701 return;
1702 }
1703
1704 uint64_t Idx;
1706 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1707 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1708 return;
1709 }
1710
      // build-vector of identical operands -> broadcast of that operand.
1711 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1712 Def->replaceAllUsesWith(
1713 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1714 return;
1715 }
1716
1717 // Look through broadcast of single-scalar when used as select conditions; in
1718 // that case the scalar condition can be used directly.
1719 if (match(Def,
1722 "broadcast operand must be single-scalar");
1723 Def->setOperand(0, C);
1724 return;
1725 }
1726
      // Users of a broadcast that only use its scalars can use the broadcasted
      // value directly.
1727 if (match(Def, m_Broadcast(m_VPValue(X))))
1728 return Def->replaceUsesWithIf(
1729 X, [Def](const VPUser &U, unsigned) { return U.usesScalars(Def); });
1730
      // A recipe with a single operand is replaced by that operand.
1732 if (Def->getNumOperands() == 1) {
1733 Def->replaceAllUsesWith(Def->getOperand(0));
1734 return;
1735 }
      // A first-order recurrence phi whose incoming values are all equal is
      // replaced by its first operand.
1736 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1737 if (all_equal(Phi->incoming_values()))
1738 Phi->replaceAllUsesWith(Phi->getOperand(0));
1739 }
1740 return;
1741 }
1742
1743 VPIRValue *IRV;
1744 if (Def->getNumOperands() == 1 &&
1746 return Def->replaceAllUsesWith(IRV);
1747
1748 // Some simplifications can only be applied after unrolling. Perform them
1749 // below.
1750 if (!Plan->isUnrolled())
1751 return;
1752
1753 // After unrolling, extract-lane may be used to extract values from multiple
1754 // scalar sources. Only simplify when extracting from a single scalar source.
1755 VPValue *LaneToExtract;
1756 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1757 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1759 return Def->replaceAllUsesWith(A);
1760
1761 // Replace extract-lane(0, canonical-WIDEN-INDUCTION) with the region's
1762 // scalar canonical IV.
1764 if (match(LaneToExtract, m_ZeroInt()) &&
1765 match(A, m_CanonicalWidenIV(WidenIV)))
1766 return Def->replaceAllUsesWith(WidenIV->getRegion()->getCanonicalIV());
1767
1768 // Simplify extract-lane with single source to extract-element.
1769 Def->replaceAllUsesWith(Builder.createNaryOp(
1770 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1771 return;
1772 }
1773
1774 // Look for cycles where Def is of the form:
1775 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1776 // IVInc = X + Step ; used by X and Def
1777 // Def = IVInc + Y
1778 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1779 // and if Inc exists, replace it with X.
1780 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1781 isa<VPIRValue>(Y) &&
1782 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1783 auto *Phi = cast<VPPhi>(X);
1784 auto *IVInc = Def->getOperand(0);
1785 if (IVInc->getNumUsers() == 2) {
1786 // If Phi has a second user (besides IVInc's defining recipe), it must
1787 // be Inc = Phi + Y for the fold to apply.
1789 findUserOf(Phi, m_Add(m_Specific(Phi), m_Specific(Y))));
1790 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1791 Def->replaceAllUsesWith(IVInc);
1792 if (Inc)
1793 Inc->replaceAllUsesWith(Phi);
1794 Phi->setOperand(0, Y);
1795 return;
1796 }
1797 }
1798 }
1799
1800 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1801 // just the pointer operand.
1802 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1803 if (!VPR->getVFxPart() || match(VPR->getVFxPart(), m_ZeroInt()))
1804 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1805
1806 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1807 // the start index is zero and only the first lane 0 is demanded.
1808 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1809 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1810 Steps->replaceAllUsesWith(Steps->getOperand(0));
1811 return;
1812 }
1813 }
1814 // Simplify redundant ReductionStartVector recipes after unrolling.
1815 VPValue *StartV;
1817 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
      // Only in-loop reduction phis are redirected to the start value.
1818 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1819 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1820 return PhiR && PhiR->isInLoop();
1821 });
1822 return;
1823 }
1824
      // With a concrete UF of 1, extract-last-part is replaced by its operand.
1825 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1826 return Def->replaceAllUsesWith(A);
1827}
1828
1838
1839/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1840/// header mask to be simplified further when tail folding, e.g. in
1841/// optimizeEVLMasks.
/// Does nothing if no header mask is found in \p Plan.
1842static void reassociateHeaderMask(VPlan &Plan) {
1843 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1844 if (!HeaderMask)
1845 return;
1846
      // Start from the users of the header mask of the form `headermask && x`.
1847 SmallVector<VPUser *> Worklist;
1848 for (VPUser *U : HeaderMask->users())
1849 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1851
1852 while (!Worklist.empty()) {
1853 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1854 VPValue *X, *Y;
      // Skip worklist entries that are not `(headermask && x) && y`.
1855 if (!R || !match(R, m_LogicalAnd(
1856 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1857 m_VPValue(Y))))
1858 continue;
      // Queue the users of R before its uses are replaced below.
1859 append_range(Worklist, R->users());
1860 VPBuilder Builder(R);
1861 R->replaceAllUsesWith(
1862 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1863 }
1864}
1865
// Map a masked integer division/remainder intrinsic ID to the binary opcode of
// the corresponding unmasked operation. Any other ID yields an empty optional.
// NOTE(review): the line carrying the function name and parameter list (listing
// line 1867) is missing from this extract; the call site later in this file
// uses the name getUnmaskedDivRemOpcode with an intrinsic ID argument --
// confirm against upstream.
1866static std::optional<Instruction::BinaryOps>
1868 switch (ID) {
1869 case Intrinsic::masked_udiv:
1870 return Instruction::UDiv;
1871 case Intrinsic::masked_sdiv:
1872 return Instruction::SDiv;
1873 case Intrinsic::masked_urem:
1874 return Instruction::URem;
1875 case Intrinsic::masked_srem:
1876 return Instruction::SRem;
1877 default:
// Not one of the four masked div/rem intrinsics.
1878 return {};
1879 }
1880}
1881
// Narrow recipes that only need to produce a single scalar value: stores of a
// value to a single-scalar operand 1, masked div/rem intrinsics of which only
// the first lane is used, and other single-scalar recipes.
// NOTE(review): the signature (listing line 1882) is missing from this extract;
// a comment in licm() refers to narrowToSingleScalarRecipes, which presumably
// names this function -- confirm against upstream.
// Nothing to do for plans that only have a scalar VF.
1883 if (Plan.hasScalarVFOnly())
1884 return;
1885
// NOTE(review): the start of the block-loop header (listing line 1886) is
// missing; the visible tail shows a deep depth-first walk from the plan entry.
1887 vp_depth_first_deep(Plan.getEntry()))) {
// Recipes are visited bottom-up; early-inc iteration allows erasing R.
1888 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
// NOTE(review): the condition guarding this `continue` (listing lines
// 1889-1890) is missing; it filters which recipe kinds are considered.
1891 continue;
// Replicate recipes that are already single-scalar, or that are predicated,
// are left alone.
1892 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1893 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1894 continue;
1895
1896 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
// Replicated store whose operand 1 is a single scalar (presumably the address
// -- confirm): replace it by one single-scalar, unmasked store.
1897 if (RepR && RepR->getOpcode() == Instruction::Store &&
1898 vputils::isSingleScalar(RepR->getOperand(1))) {
1899 auto *Clone = new VPReplicateRecipe(
1900 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1901 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1902 *RepR /*Metadata*/, RepR->getDebugLoc());
1903 Clone->insertBefore(RepOrWidenR);
// Operand 0 of the clone becomes the last lane of the original operand 0; when
// operand 1 is uniform across VFs and UFs, the last part is extracted first.
1904 VPBuilder Builder(Clone);
1905 VPValue *ExtractOp = Clone->getOperand(0);
1906 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1907 ExtractOp =
1908 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1909 ExtractOp =
1910 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1911 Clone->setOperand(0, ExtractOp);
1912 RepR->eraseFromParent();
1913 continue;
1914 }
1915
1916 // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor.
// Widened intrinsic recipes never fall through to the generic narrowing below:
// every path in this branch ends in `continue`.
1917 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(RepOrWidenR)) {
1918 if (!vputils::onlyFirstLaneUsed(IntrR))
1919 continue;
1920 auto Opc = getUnmaskedDivRemOpcode(IntrR->getVectorIntrinsicID());
1921 if (!Opc)
1922 continue;
// The divisor becomes select(operand 2, operand 1, 1), i.e. the constant 1 is
// used wherever operand 2 (presumably the mask -- confirm) is false.
1923 VPBuilder Builder(IntrR);
1924 VPValue *SafeDivisor = Builder.createSelect(
1925 IntrR->getOperand(2), IntrR->getOperand(1),
1926 Plan.getConstantInt(IntrR->getScalarType(), 1));
1927 VPValue *Clone = Builder.createNaryOp(
1928 *Opc, {IntrR->getOperand(0), SafeDivisor},
1929 VPIRFlags::getDefaultFlags(*Opc), IntrR->getDebugLoc());
1930 IntrR->replaceAllUsesWith(Clone);
1931 IntrR->eraseFromParent();
1932 continue;
1933 }
1934
1935 // Skip recipes that aren't single scalars.
1936 if (!vputils::isSingleScalar(RepOrWidenR))
1937 continue;
1938
1939 // Predicate to check if a user of Op introduces extra broadcasts.
// A user counts as introducing a broadcast when it does not use scalars of Op.
// VPInstruction users with certain opcodes are exempt.
1940 auto IntroducesBCastOf = [](const VPValue *Op) {
1941 return [Op](const VPUser *U) {
1942 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
// NOTE(review): the start of this condition (listing lines 1943-1945, the
// list of exempt opcodes) is missing from this extract.
1946 VPI->getOpcode()))
1947 return false;
1948 }
1949 return !U->usesScalars(Op);
1950 };
1951 };
1952
// Do not narrow when some user would need a broadcast of the result and no
// operand qualifies below. An operand qualifies when none of its other users
// introduces a broadcast of it and it is either a non-constant live-in or a
// single-scalar replicate recipe.
1953 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1954 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1955 if (any_of(
1956 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1957 IntroducesBCastOf(Op)))
1958 return false;
1959 // Non-constant live-ins require broadcasts, while constants do not
1960 // need explicit broadcasts.
1961 auto *IRV = dyn_cast<VPIRValue>(Op);
1962 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1963 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1964 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1965 }))
1966 continue;
1967
// Create the single-scalar replacement with the same operands and no mask,
// redirect all uses, and erase the original only if it became dead.
1968 auto *Clone = VPBuilder::createSingleScalarOp(
1969 getOpcodeOrIntrinsicID(RepOrWidenR)->second, RepOrWidenR->operands(),
1970 /*Mask=*/nullptr, *RepOrWidenR, {}, DebugLoc::getUnknown(),
1971 RepOrWidenR->getUnderlyingInstr());
1972 Clone->insertBefore(RepOrWidenR);
1973 RepOrWidenR->replaceAllUsesWith(Clone);
1974 if (isDeadRecipe(*RepOrWidenR))
1975 RepOrWidenR->eraseFromParent();
1976 }
1977 }
1978}
1979
1980/// Try to see if all of \p Blend's masks share a common value logically and'ed
1981/// and remove it from the masks.
// NOTE(review): the signature (listing line 1982) is missing from this extract;
// the caller in simplifyBlends() invokes removeCommonBlendMask(Blend).
// Normalized blends are left untouched.
1983 if (Blend->isNormalized())
1984 return;
// The candidate common value is the first operand of the logical-and forming
// mask 0; bail out if mask 0 is not a logical-and at all.
1985 VPValue *CommonEdgeMask;
1986 if (!match(Blend->getMask(0),
1987 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1988 return;
// Every mask must be of the form (CommonEdgeMask && <something>), with the
// common value as the first operand.
1989 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1990 if (!match(Blend->getMask(I),
1991 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1992 return;
// Replace each mask by the second operand of its logical-and, dropping the
// common value.
1993 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1994 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1995}
1996
1997/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1998/// to make sure the masks are simplified.
1999static void simplifyBlends(VPlan &Plan) {
// NOTE(review): the header of the outer loop over basic blocks (listing lines
// 2000-2001) is missing from this extract; it defines VPBB used below.
2002 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2003 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
2004 if (!Blend)
2005 continue;
2006
2007 removeCommonBlendMask(Blend);
2008
2009 // Try to remove redundant blend recipes.
// Collect the distinct incoming values whose mask is not known-false. For a
// normalized blend, incoming value 0 is always counted and its mask is not
// queried.
2010 SmallPtrSet<VPValue *, 4> UniqueValues;
2011 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
2012 UniqueValues.insert(Blend->getIncomingValue(0));
2013 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
2014 if (!match(Blend->getMask(I), m_False()))
2015 UniqueValues.insert(Blend->getIncomingValue(I));
2016
// A single remaining value makes the blend redundant.
2017 if (UniqueValues.size() == 1) {
2018 Blend->replaceAllUsesWith(*UniqueValues.begin());
2019 Blend->eraseFromParent();
2020 continue;
2021 }
2022
2023 if (Blend->isNormalized())
2024 continue;
2025
2026 // Normalize the blend so its first incoming value is used as the initial
2027 // value with the others blended into it.
2028
// Pick as initial value the first incoming value whose mask has exactly one
// user and is not known-false; default to index 0.
2029 unsigned StartIndex = 0;
2030 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2031 // If a value's mask is used only by the blend then it can be deadcoded.
2032 // TODO: Find the most expensive mask that can be deadcoded, or a mask
2033 // that's used by multiple blends where it can be removed from them all.
2034 VPValue *Mask = Blend->getMask(I);
2035 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
2036 StartIndex = I;
2037 break;
2038 }
2039 }
2040
// Operand layout of the new blend: the start value without a mask, followed by
// (value, mask) pairs for all other incoming values in their original order.
2041 SmallVector<VPValue *, 4> OperandsWithMask;
2042 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
2043
2044 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2045 if (I == StartIndex)
2046 continue;
2047 OperandsWithMask.push_back(Blend->getIncomingValue(I));
2048 OperandsWithMask.push_back(Blend->getMask(I));
2049 }
2050
2051 auto *NewBlend =
2052 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
2053 OperandsWithMask, *Blend, Blend->getDebugLoc());
2054 NewBlend->insertBefore(&R);
2055
// The mask of the start value is no longer referenced by the new blend; it is
// remembered before the old blend is erased.
2056 VPValue *DeadMask = Blend->getMask(StartIndex);
2057 Blend->replaceAllUsesWith(NewBlend);
2058 Blend->eraseFromParent();
// NOTE(review): listing line 2059 is missing from this extract; it presumably
// consumes DeadMask (e.g. to clean it up) -- confirm against upstream.
2060
2061 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
// Applies only to blends with exactly three operands (two values, one mask).
2062 VPValue *NewMask;
2063 if (NewBlend->getNumOperands() == 3 &&
2064 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2065 VPValue *Inc0 = NewBlend->getOperand(0);
2066 VPValue *Inc1 = NewBlend->getOperand(1);
2067 VPValue *OldMask = NewBlend->getOperand(2);
2068 NewBlend->setOperand(0, Inc1);
2069 NewBlend->setOperand(1, Inc0);
2070 NewBlend->setOperand(2, NewMask);
// Erase the negation if the blend was its last user.
2071 if (OldMask->getNumUsers() == 0)
2072 cast<VPInstruction>(OldMask)->eraseFromParent();
2073 }
2074 }
2075 }
2076}
2077
2078/// Optimize the width of vector induction variables in \p Plan based on a known
2079/// constant Trip Count, \p BestVF and \p BestUF.
// Returns true if at least one induction was narrowed.
// NOTE(review): the first signature line (listing line 2080: return type, name
// and the Plan parameter) is missing from this extract; the caller in this file
// invokes optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF).
2081 ElementCount BestVF,
2082 unsigned BestUF) {
2083 // Only proceed if we have not completely removed the vector region.
2084 if (!Plan.getVectorLoopRegion())
2085 return false;
2086
// Requires a fixed VF and a trip count that matches a constant APInt.
2087 const APInt *TC;
2088 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2089 return false;
2090
2091 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2092 // and UF. Returns at least 8.
2093 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2094 APInt AlignedTC =
// NOTE(review): the initializer of AlignedTC (listing lines 2095-2096) is
// missing from this extract; presumably TC rounded up to a multiple of Align
// -- confirm against upstream.
2097 APInt MaxVal = AlignedTC - 1;
2098 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2099 };
2100 unsigned NewBitWidth =
2101 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2102
2103 LLVMContext &Ctx = Plan.getContext();
2104 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2105
2106 bool MadeChange = false;
2107
2108 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2109 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2110 // Currently only handle canonical IVs as it is trivial to replace the start
2111 // and stop values, and we currently only perform the optimization when the
2112 // IV has a single use.
// NOTE(review): the declaration of WideIV (listing line 2113) is missing from
// this extract.
2114 if (!match(&Phi, m_CanonicalWidenIV(WideIV)))
2115 continue;
// Skip IVs with more than one unique user, or that already have the new type.
2116 if (WideIV->hasMoreThanOneUniqueUser() ||
2117 NewIVTy == WideIV->getScalarType())
2118 continue;
2119
2120 // Currently only handle cases where the single user is a header-mask
2121 // comparison with the backedge-taken-count.
2122 VPUser *SingleUser = WideIV->getSingleUser();
2123 if (!SingleUser ||
2124 !match(SingleUser,
2125 m_ICmp(m_Specific(WideIV),
// NOTE(review): the second operand pattern of m_ICmp (listing line 2126) is
// missing from this extract.
2127 continue;
2128
2129 // Update IV operands and comparison bound to use new narrower type.
2130 assert(!WideIV->getTruncInst() &&
2131 "canonical IV is not expected to have a truncation");
// The narrow IV starts at zero with step one in NewIVTy and reuses the
// original PHI node, VF value, induction descriptor and debug location.
2132 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
2133 WideIV->getPHINode(), Plan.getZero(NewIVTy),
2134 Plan.getConstantInt(NewIVTy, 1), WideIV->getVFValue(),
2135 WideIV->getInductionDescriptor(), *WideIV, WideIV->getDebugLoc());
2136 NewWideIV->insertBefore(WideIV);
2137
// The backedge-taken count is truncated to the new type; the truncate is
// appended to the vector preheader.
2138 auto *NewBTC = new VPWidenCastRecipe(
2139 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2140 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2141 Plan.getVectorPreheader()->appendRecipe(NewBTC);
// Re-create the compare with the same predicate on the narrow operands and
// redirect all uses of the old compare. The old IV and compare are not erased
// in this function.
2142 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2143 Cmp->replaceAllUsesWith(
2144 VPBuilder(Cmp).createICmp(Cmp->getPredicate(), NewWideIV, NewBTC));
2145
2146 MadeChange = true;
2147 }
2148
2149 return MadeChange;
2150}
2151
2152/// Return true if \p Cond is known to be true for given \p BestVF and \p
2153/// BestUF.
// NOTE(review): several lines are missing from this extract: the first
// signature line (listing line 2154), the end of the signature together with
// the condition guarding the recursive case (2156-2157), the start of the
// pattern match on Cond (2164-2165), and the initializer of VectorTripCount
// (2175). Restore them from upstream before relying on this function.
2155 ElementCount BestVF, unsigned BestUF,
// Recursive case: true as soon as any operand of Cond's defining recipe is
// itself known to be true. The guard selecting this case is among the missing
// lines.
2158 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2159 &PSE](VPValue *C) {
2160 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2161 });
2162
// Base case: only a pattern involving (CanIV + VFxUF), in either operand order,
// and the vector trip count is handled; anything else returns false.
2163 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2166 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2167 m_Specific(&Plan.getVectorTripCount()))))
2168 return false;
2169
2170 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2171 // count is not conveniently available as SCEV so far, so we compare directly
2172 // against the original trip count. This is stricter than necessary, as we
2173 // will only return true if the trip count == vector trip count.
2174 const SCEV *VectorTripCount =
// Fall back to the SCEV of the original trip count when the first query is
// not computable.
2176 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2177 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2178 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2179 "Trip count SCEV must be computable");
// The condition is known true when the trip count provably equals
// BestVF * BestUF elements.
2180 ScalarEvolution &SE = *PSE.getSE();
2181 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2182 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2183 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2184}
2185
2186/// Try to replace multiple active lane masks used for control flow with
2187/// a single, wide active lane mask instruction followed by multiple
2188/// extract subvector intrinsics. This applies to the active lane mask
2189/// instructions both in the loop and in the preheader.
2190/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2191/// new extracts from the first active lane mask, which has its last
2192/// operand (multiplier) set to UF.
// Returns true if the plan was changed.
// NOTE(review): the first signature line (listing line 2193) is missing from
// this extract; the caller in this file invokes
// tryToReplaceALMWithWideALM(Plan, BestVF, BestUF).
2194 unsigned UF) {
// Only applies when the option is enabled, VF is a vector VF and the loop is
// unrolled.
2195 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2196 return false;
2197
2198 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2199 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2200 auto *Term = &ExitingVPBB->back();
2201
2202 using namespace llvm::VPlanPatternMatch;
// NOTE(review): the start of this terminator match (listing line 2203) is
// missing from this extract; the visible tail shows a pattern with three
// wildcard operands.
2204 m_VPValue(), m_VPValue(), m_VPValue())))))
2205 return false;
2206
2207 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2208 LLVMContext &Ctx = Plan.getContext();
2209
// Helper: for each unroll part, create a vector.extract of ALM at offset
// VF.getKnownMinValue() * Part and record it in Extracts[Part]. Each extract is
// inserted directly after ALM, so later parts end up ahead of earlier ones.
2210 auto ExtractFromALM = [&](VPInstruction *ALM,
2211 SmallVectorImpl<VPValue *> &Extracts) {
2212 DebugLoc DL = ALM->getDebugLoc();
2213 for (unsigned Part = 0; Part < UF; ++Part) {
// NOTE(review): the declaration of Ops (listing line 2214) is missing from
// this extract.
2215 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2216 auto *Ext =
2217 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2218 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2219 Extracts[Part] = Ext;
2220 Ext->insertAfter(ALM);
2221 }
2222 };
2223
2224 // Create a list of each active lane mask phi, ordered by unroll part.
// NOTE(review): the declaration of Phis (listing line 2225) and of Phi (2227)
// are missing from this extract, as are parts of the pattern matches on lines
// 2232, 2237 and 2244.
2226 for (VPRecipeBase &R : Header->phis()) {
2228 if (!Phi)
2229 continue;
2230 VPValue *Index = nullptr;
2231 match(Phi->getBackedgeValue(),
2233 assert(Index && "Expected index from ActiveLaneMask instruction");
2234
// The unroll part is the constant multiplier matched in the index expression;
// an index that does not match that form belongs to part 0.
2235 uint64_t Part;
2236 if (match(Index,
2238 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2239 Phis[Part] = Phi;
2240 else {
2241 // Anything other than a CanonicalIVIncrementForPart is part 0
2242 assert(!match(
2243 Index,
2245 Phis[0] = Phi;
2246 }
2247 }
2248
2249 assert(all_of(Phis, not_equal_to(nullptr)) &&
2250 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2251
// The part-0 phi supplies the two lane masks that are widened: its start value
// and its backedge value.
2252 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2253 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2254
2255 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2256 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2257 "Expected incoming values of Phi to be ActiveLaneMasks");
2258
2259 // When using wide lane masks, the return type of the get.active.lane.mask
2260 // intrinsic is VF x UF (last operand).
2261 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2262 EntryALM->setOperand(2, ALMMultiplier);
2263 LoopALM->setOperand(2, ALMMultiplier);
2264
2265 // Create UF x extract vectors and insert into preheader.
2266 SmallVector<VPValue *> EntryExtracts(UF);
2267 ExtractFromALM(EntryALM, EntryExtracts);
2268
2269 // Create UF x extract vectors and insert before the loop compare & branch,
2270 // updating the compare to use the first extract.
2271 SmallVector<VPValue *> LoopExtracts(UF);
2272 ExtractFromALM(LoopALM, LoopExtracts);
// Operand 0 of the terminator is the instruction named Not; it is re-pointed
// at the part-0 extract.
2273 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2274 Not->setOperand(0, LoopExtracts[0]);
2275
2276 // Update the incoming values of active lane mask phis.
2277 for (unsigned Part = 0; Part < UF; ++Part) {
2278 Phis[Part]->setStartValue(EntryExtracts[Part]);
2279 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2280 }
2281
2282 return true;
2283}
2284
2285/// Try to simplify the branch condition of \p Plan. This may restrict the
2286/// resulting plan to \p BestVF and \p BestUF.
// Returns true if the terminator of the exiting block was changed.
// NOTE(review): the first and last signature lines (listing lines 2287 and
// 2289) are missing from this extract; the caller in this file invokes
// simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE).
2288 unsigned BestUF,
2290 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2291 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2292 auto *Term = &ExitingVPBB->back();
2293 VPValue *Cond;
2294 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2295 // Check if the branch condition compares the canonical IV increment (for main
2296 // loop), or the canonical IV increment plus an offset (for epilog loop).
2297 if (match(Term, m_BranchOnCount(
2298 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2299 m_VPValue())) ||
// NOTE(review): the start of the second alternative (listing line 2300) is
// missing from this extract; per the comment below it is the
// BranchOnCond(Not(ActiveLaneMask)) form.
2301 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2302 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2303 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2304 const SCEV *VectorTripCount =
// NOTE(review): the initializer of VectorTripCount (listing line 2305) and the
// fallback expression (2308) are missing from this extract.
2306 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2307 VectorTripCount =
2309 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2310 "Trip count SCEV must be computable");
2311 ScalarEvolution &SE = *PSE.getSE();
2312 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2313 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
// Give up unless the trip count is provably at most BestVF * BestUF elements.
2314 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2315 return false;
2316 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
// NOTE(review): the second alternative of this condition (listing line 2317)
// is missing from this extract.
2318 // For BranchOnCond, check if we can prove the condition to be true using VF
2319 // and UF.
2320 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2321 return false;
2322 } else {
// Unsupported terminator form.
2323 return false;
2324 }
2325
2326 // The vector loop region only executes once. Convert terminator of the
2327 // exiting block to exit in the first iteration.
// A BranchOnTwoConds terminator is kept; only its operand 1 is set to true.
2328 if (match(Term, m_BranchOnTwoConds())) {
2329 Term->setOperand(1, Plan.getTrue());
2330 return true;
2331 }
2332
// Otherwise replace the terminator by an unconditional-true BranchOnCond that
// keeps the original debug location.
2333 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2334 {}, Term->getDebugLoc());
2335 ExitingVPBB->appendRecipe(BOC);
2336 Term->eraseFromParent();
2337
2338 return true;
2339}
2340
2341/// From the definition of llvm.experimental.get.vector.length,
2342/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
// Replaces the first EVL recipe whose AVL is provably <= VF by the AVL itself
// (converted to i32) and returns true; returns false if no recipe qualified.
// NOTE(review): the signature and the start of the block-loop header (listing
// lines 2343-2345) are missing from this extract, so the function name is not
// visible here -- restore from upstream.
2346 vp_depth_first_deep(Plan.getEntry()))) {
2347 for (VPRecipeBase &R : *VPBB) {
2348 VPValue *AVL;
2349 if (!match(&R, m_EVL(m_VPValue(AVL))))
2350 continue;
2351
// Skip when no SCEV is available for the AVL or the bound cannot be proven.
2352 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2353 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2354 continue;
2355 ScalarEvolution &SE = *PSE.getSE();
2356 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2357 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2358 continue;
2359
// NOTE(review): the start of the statement defining Trunc (listing line 2360)
// is missing from this extract; from the visible arguments it converts AVL
// from AVLSCEV's type to i32.
2361 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2362 R.getDebugLoc());
// If a new conversion recipe was created (Trunc differs from AVL), try to fold
// it over live-in operands.
2363 if (Trunc != AVL) {
2364 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2365 const DataLayout &DL = Plan.getDataLayout();
2366 if (VPValue *Folded = tryToFoldLiveIns(*TruncR, TruncR->operands(), DL))
2367 Trunc = Folded;
2368 }
2369 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
// Stops after the first replacement.
2370 return true;
2371 }
2372 }
2373 return false;
2374}
2375
// Run the VF/UF-specific simplifications in this file: wide active-lane-mask
// replacement, branch-condition simplification and induction-width narrowing.
// If any of them changed the plan, the plan is restricted to BestVF.
// NOTE(review): the first and last signature lines (listing lines 2376 and
// 2378) are missing from this extract, so the function name is not visible
// here -- restore from upstream.
2377 unsigned BestUF,
2379 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2380 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2381
// `|=` is used so that all three transforms always run.
2382 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2383 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2384 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2385
2386 if (MadeChange) {
2387 Plan.setVF(BestVF);
2388 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2389 }
2390}
2391
// Drop poison-generating flags from every recipe with IR flags that is
// reachable as a (transitive) user of selected reduction phis.
// NOTE(review): the signature (listing line 2392), the range expression of the
// loop below (2394) and the end of the recurrence-kind filter (2400) are
// missing from this extract, so the function name and the full set of handled
// recurrence kinds are not visible here -- restore from upstream.
2393 for (VPRecipeBase &R :
2395 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2396 if (!PhiR)
2397 continue;
// Only Add, Mul and Sub (plus whatever the missing line lists) are processed.
2398 RecurKind RK = PhiR->getRecurrenceKind();
2399 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2401 continue;
2402
2403 for (VPUser *U : collectUsersRecursively(PhiR))
2404 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2405 RecWithFlags->dropPoisonGeneratingFlags();
2406 }
2407 }
2408}
2409
2410namespace {
// DenseMap traits used for common-subexpression elimination: they hash and
// compare VPSingleDefRecipe pointers by the recipes' contents.
2411struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2412 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2413 /// return that source element type.
// Returns nullptr for all other recipes.
2414 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2415 // All VPInstructions that lower to GEPs must have the i8 source element
2416 // type (as they are PtrAdds), so we omit it.
// NOTE(review): the start of the return statement (listing line 2417) is
// missing from this extract; the `.Case`/`.Default` chain below suggests a
// TypeSwitch over R -- confirm against upstream.
2418 .Case([](const VPReplicateRecipe *I) -> Type * {
2419 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2420 return GEP->getSourceElementType();
2421 return nullptr;
2422 })
2423 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2424 [](auto *I) { return I->getSourceElementType(); })
2425 .Default([](auto *) { return nullptr; });
2426 }
2427
2428 /// Returns true if recipe \p Def can be safely handled by CSE.
2429 static bool canHandle(const VPSingleDefRecipe *Def) {
2430 // We can extend the list of handled recipes in the future,
2431 // provided we account for the data embedded in them while checking for
2432 // equality or hashing.
2433 auto C = getOpcodeOrIntrinsicID(Def);
2434
2435 // The issue with (Insert|Extract)Value is that the index of the
2436 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2437 // VPlan.
// Recipes without opcode/intrinsic info are rejected as well.
2438 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2439 C->second == Instruction::ExtractValue)))
2440 return false;
2441
2442 // During CSE, we can only handle recipes that don't read from memory: if
2443 // they read from memory, there could be an intervening write to memory
2444 // before the next instance is CSE'd, leading to an incorrect result.
2445 return !Def->mayReadFromMemory();
2446 }
2447
2448 /// Hash the underlying data of \p Def.
2449 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2450 hash_code Result = hash_combine(
2451 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2452 getGEPSourceElementType(Def), Def->getScalarType(),
// NOTE(review): the remaining hash_combine arguments (listing line 2453) are
// missing from this extract.
// A compare predicate or an induction opcode, when present, is mixed into the
// hash as well.
2454 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2455 if (RFlags->hasPredicate())
2456 return hash_combine(Result, RFlags->getPredicate());
2457 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2458 return hash_combine(Result, SIVSteps->getInductionOpcode());
2459 return Result;
2460 }
2461
2462 /// Check equality of underlying data of \p L and \p R.
2463 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
// NOTE(review): parts of this condition (listing lines 2465 and 2467) and the
// start of the assert below (2470) are missing from this extract. The visible
// parts compare recipe ID, GEP source element type and operands.
2464 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2466 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2468 !equal(L->operands(), R->operands()))
2469 return false;
2471 "must have valid opcode info for both recipes");
// Predicates are compared only when L carries one.
2472 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2473 if (LFlags->hasPredicate() &&
2474 LFlags->getPredicate() !=
2475 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2476 return false;
2477 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2478 if (LSIV->getInductionOpcode() !=
2479 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2480 return false;
2481 // Recipes in replicate regions implicitly depend on predicate. If either
2482 // recipe is in a replicate region, only consider them equal if both have
2483 // the same parent.
2484 const VPRegionBlock *RegionL = L->getRegion();
2485 const VPRegionBlock *RegionR = R->getRegion();
2486 if (((RegionL && RegionL->isReplicator()) ||
2487 (RegionR && RegionR->isReplicator())) &&
2488 L->getParent() != R->getParent())
2489 return false;
// Finally, the scalar result types must agree.
2490 return L->getScalarType() == R->getScalarType();
2491 }
2492};
2493} // end anonymous namespace
2494
2495/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2496/// Plan.
// A recipe is replaced by an earlier-visited equal recipe only when the block
// of that recipe dominates the current block. Replaced recipes are not erased
// here; only their uses are redirected.
// NOTE(review): the signature (listing line 2497), the declaration of CSEMap
// (2499) and the header of the block loop (2501, 2503) are missing from this
// extract, so the function name and traversal order are not fully visible --
// restore from upstream.
2498 VPDominatorTree VPDT(Plan);
2500
2502 Plan.getEntry());
2504 for (VPRecipeBase &R : *VPBB) {
2505 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2506 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2507 continue;
2508 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2509 // V must dominate Def for a valid replacement.
// On failure Def is skipped without being recorded in the map.
2510 if (!VPDT.dominates(V->getParent(), VPBB))
2511 continue;
2512 // Only keep flags present on both V and Def.
2513 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2514 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2515 Def->replaceAllUsesWith(V);
2516 continue;
2517 }
// First occurrence: remember Def as the representative of its class.
2518 CSEMap[Def] = Def;
2519 }
2520 }
2521}
2522
2523/// Return true if we do not know how to (mechanically) hoist or sink a
2524/// non-memory or memory recipe \p R out of a loop region.
// NOTE(review): the first signature line (listing line 2525) is missing from
// this extract; the caller in licm() invokes
// cannotHoistOrSinkRecipe(R, <entry block>, <exiting block>).
2526 VPBasicBlock *LastBB) {
// Everything except replicate recipes that may read from memory is handled by
// the statement on the missing listing line 2528 (presumably a return that
// defers to a generic check -- confirm against upstream).
2527 if (!isa<VPReplicateRecipe>(R) || !R.mayReadFromMemory())
2529
2530 // Check that the load doesn't alias with stores between FirstBB and LastBB.
// Without a known memory location the recipe is conservatively not movable.
2531 auto MemLoc = vputils::getMemoryLocation(R);
2532 return !MemLoc || !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB);
2533}
2534
2535/// Move loop-invariant recipes out of the vector loop region in \p Plan.
// Works in two phases: hoisting into the vector preheader, then sinking
// recipes without in-loop users into a block following the loop region.
2536static void licm(VPlan &Plan) {
2537 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2538
2539 // Hoist any loop invariant recipes from the vector loop region to the
2540 // preheader. Perform a shallow traversal of the vector loop region, to
2541 // exclude recipes in replicate regions. Since the top-level blocks in the
2542 // vector loop region are guaranteed to execute if the vector pre-header is,
2543 // we don't need to check speculation safety.
2544 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2545 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2546 "Expected vector prehader's successor to be the vector loop region");
// NOTE(review): the start of the block-loop header (listing line 2547) is
// missing from this extract.
2548 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2549 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2550 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2551 LoopRegion->getExitingBasicBlock()))
2552 continue;
// A recipe is invariant only if every operand is defined outside loop regions.
2553 if (any_of(R.operands(), [](VPValue *Op) {
2554 return !Op->isDefinedOutsideLoopRegions();
2555 }))
2556 continue;
// Hoisted recipes are appended to the end of the preheader.
2557 R.moveBefore(*Preheader, Preheader->end());
2558 }
2559 }
2560
// The dominator tree is only needed by the assert further down.
2561#ifndef NDEBUG
2562 VPDominatorTree VPDT(Plan);
2563#endif
2564 // Sink recipes with no users inside the vector loop region if all users are
2565 // in the same exit block of the region.
2566 // TODO: Extend to sink recipes from inner loops.
// NOTE(review): the header of the block loop (listing lines 2567 and 2569) is
// only partially present in this extract.
2568 LoopRegion->getEntry());
// Recipes are visited bottom-up within each block.
2570 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2571 if (vputils::cannotHoistOrSinkRecipe(R, /*Sinking=*/true))
2572 continue;
2573
2574 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2575 assert(!RepR->isPredicated() &&
2576 "Expected prior transformation of predicated replicates to "
2577 "replicate regions");
2578 // narrowToSingleScalarRecipes should have already maximally narrowed
2579 // replicates to single-scalar replicates.
2580 // TODO: When unrolling, replicateByVF doesn't handle sunk
2581 // non-single-scalar replicates correctly.
2582 if (!RepR->isSingleScalar())
2583 continue;
2584 }
2585
2586 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2587 // support recipes with multiple defined values (e.g., interleaved loads).
2588 auto *Def = cast<VPSingleDefRecipe>(&R);
2589
2590 // Cannot sink the recipe if the user is defined in a loop region or a
2591 // non-successor of the vector loop region. Cannot sink if user is a phi
2592 // either.
// The lambda also records the common user block in SinkBB as a side effect.
2593 VPBasicBlock *SinkBB = nullptr;
2594 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2595 auto *UserR = cast<VPRecipeBase>(U);
2596 VPBasicBlock *Parent = UserR->getParent();
2597 // TODO: Support sinking when users are in multiple blocks.
2598 if (SinkBB && SinkBB != Parent)
2599 return true;
2600 SinkBB = Parent;
2601 // TODO: If the user is a PHI node, we should check the block of
2602 // incoming value. Support PHI node users if needed.
2603 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2604 Parent->getSinglePredecessor() != LoopRegion;
2605 }))
2606 continue;
2607
// A recipe without users is sunk into the single successor of the loop region.
2608 if (!SinkBB)
2609 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2610
2611 // TODO: This will need to be a check instead of an assert after
2612 // conditional branches in vectorized loops are supported.
2613 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2614 "Defining block must dominate sink block");
2615 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2616 // just moving.
2617 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2618 }
2619 }
2620}
2621
// Shrink recipes to the bit widths recorded in \p MinBWs (keyed by the
// underlying instruction of a recipe's result): operands are truncated as
// needed and the narrowed result is zero-extended back to the original type.
// NOTE(review): the first signature line (listing line 2622) is missing from
// this extract, so the function name is not visible here. Also missing are the
// declaration of ProcessedTruncs (2630), the block-loop header (2632-2633),
// the conditions on lines 2635-2636, 2649 and 2672, and the starts of the
// statements on lines 2674 and 2712 -- restore from upstream.
2623 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2624 if (Plan.hasScalarVFOnly())
2625 return;
2626 // Keep track of created truncates, so they can be re-used. Note that we
2627 // cannot use RAUW after creating a new truncate, as this could make
2628 // other uses have different types for their operands, making them invalidly
2629 // typed.
2631 VPBasicBlock *PH = Plan.getVectorPreheader();
2634 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2637 continue;
2638
// Skip recipes for which MinBWs has no (non-zero) entry.
2639 VPValue *ResultVPV = R.getVPSingleValue();
2640 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2641 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2642 if (!NewResSizeInBits)
2643 continue;
2644
2645 // If the value wasn't vectorized, we must maintain the original scalar
2646 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2647 // skip casts which do not need to be handled explicitly here, as
2648 // redundant casts will be removed during recipe simplification.
2650 continue;
2651
2652 Type *OldResTy = ResultVPV->getScalarType();
2653 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2654 assert(OldResTy->isIntegerTy() && "only integer types supported");
2655 (void)OldResSizeInBits;
2656
2657 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2658
2659 // Any wrapping introduced by shrinking this operation shouldn't be
2660 // considered undefined behavior. So, we can't unconditionally copy
2661 // arithmetic wrapping flags to VPW.
2662 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2663 VPW->dropPoisonGeneratingFlags();
2664
2665 assert((OldResSizeInBits != NewResSizeInBits ||
2666 match(&R, m_ICmp(m_VPValue(), m_VPValue()))) &&
2667 "Only ICmps should not need extending the result.");
2668 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2669
2670 // For loads/intrinsics we don't recreate the recipe; just wrap the
2671 // original wide result in a ZExt to OldResTy.
2673 if (OldResSizeInBits != NewResSizeInBits) {
2675 Instruction::ZExt, ResultVPV, OldResTy);
// The RAUW below also rewrites Ext's own operand, so it is pointed back at
// ResultVPV afterwards.
2676 ResultVPV->replaceAllUsesWith(Ext);
2677 Ext->setOperand(0, ResultVPV);
2678 }
2679 continue;
2680 }
2681
2682 // Shrink operands by introducing truncates as needed.
// For selects, operand 0 is left untouched.
2683 unsigned StartIdx =
2684 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2685 SmallVector<VPValue *> NewOperands(R.operands());
2686 for (VPValue *&Op : drop_begin(NewOperands, StartIdx)) {
2687 unsigned OpSizeInBits = Op->getScalarType()->getScalarSizeInBits();
2688 if (OpSizeInBits == NewResSizeInBits)
2689 continue;
2690 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
// Reuse a truncate created earlier for the same operand, if any. New truncates
// of VPIRValue operands go to the preheader, all others right before R.
2691 auto [ProcessedIter, Inserted] = ProcessedTruncs.try_emplace(Op);
2692 if (Inserted) {
2693 VPBuilder Builder;
2694 if (isa<VPIRValue>(Op))
2695 Builder.setInsertPoint(PH);
2696 else
2697 Builder.setInsertPoint(&R);
2698 ProcessedIter->second =
2699 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2700 }
2701 Op = ProcessedIter->second;
2702 }
2703
// Re-create the recipe on the narrowed operands, directly before the original.
2704 auto *NWR = cast<VPWidenRecipe>(&R)->cloneWithOperands(NewOperands);
2705 NWR->insertBefore(&R);
2706
2707 // Wrap NWR in a ZExt to preserve the original wide type for downstream
2708 // users (unless this is an ICmp, which produces i1 regardless).
2709 VPValue *Replacement = NWR->getVPSingleValue();
2710 if (OldResSizeInBits != NewResSizeInBits)
2711 Replacement =
2713 .createWidenCast(Instruction::ZExt, Replacement, OldResTy)
2714 ->getVPSingleValue();
2715 ResultVPV->replaceAllUsesWith(Replacement);
2716 R.eraseFromParent();
2717 }
2718 }
2719}
2720
2721bool VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
2722 std::optional<VPDominatorTree> VPDT;
2723 if (OnlyLatches)
2724 VPDT.emplace(Plan);
2725
2726 // Collect all blocks before modifying the CFG so we can identify unreachable
2727 // ones after constant branch removal.
2729
2730 bool SimplifiedPhi = false;
2731 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2732 VPValue *Cond;
2733 // Skip blocks that are not terminated by BranchOnCond.
2734 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2735 continue;
2736
2737 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2738 continue;
2739
2740 assert(VPBB->getNumSuccessors() == 2 &&
2741 "Two successors expected for BranchOnCond");
2742 unsigned RemovedIdx;
2743 if (match(Cond, m_True()))
2744 RemovedIdx = 1;
2745 else if (match(Cond, m_False()))
2746 RemovedIdx = 0;
2747 else
2748 continue;
2749
2750 VPBasicBlock *RemovedSucc =
2751 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2752 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2753 "There must be a single edge between VPBB and its successor");
2754 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2755 // these recipes.
2756 auto Phis = RemovedSucc->phis();
2757 for (VPRecipeBase &R : Phis)
2758 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2759 SimplifiedPhi |= !std::empty(Phis);
2760
2761 // Disconnect blocks and remove the terminator.
2762 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2763 VPBB->back().eraseFromParent();
2764 }
2765
2766 // Compute which blocks are still reachable from the entry after constant
2767 // branch removal.
2770
2771 // Detach all unreachable blocks from their successors, removing their recipes
2772 // and incoming values from phi recipes.
2773 VPSymbolicValue Tmp(nullptr);
2774 for (VPBlockBase *B : AllBlocks) {
2775 if (Reachable.contains(B))
2776 continue;
2777 for (VPBlockBase *Succ : to_vector(B->successors())) {
2778 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2779 for (VPRecipeBase &R : SuccBB->phis())
2780 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2782 }
2783 for (VPBasicBlock *DeadBB :
2785 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2786 for (VPValue *Def : R.definedValues())
2787 Def->replaceAllUsesWith(&Tmp);
2788 R.eraseFromParent();
2789 }
2790 }
2791 }
2792 return SimplifiedPhi;
2793}
2794
2814
2815// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2816// the loop terminator with a branch-on-cond recipe with the negated
2817// active-lane-mask as operand. Note that this turns the loop into an
2818// uncountable one. Only the existing terminator is replaced, all other existing
2819// recipes/users remain unchanged, except for poison-generating flags being
2820// dropped from the canonical IV increment. Return the created
2821// VPActiveLaneMaskPHIRecipe.
2822//
2823// The function adds the following recipes:
2824//
2825// vector.ph:
2826// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2827// %EntryALM = active-lane-mask %EntryInc, TC
2828//
2829// vector.body:
2830// ...
2831// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2832// ...
2833// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2834// %ALM = active-lane-mask %InLoopInc, TC
2835// %Negated = Not %ALM
2836// branch-on-cond %Negated
2837//
2840 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2841 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2842 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2843 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2844 // TODO: Check if dropping the flags is needed.
2845 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
2846 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2847 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2848 // we have to take unrolling into account. Each part needs to start at
2849 // Part * VF
2850 auto *VecPreheader = Plan.getVectorPreheader();
2851 VPBuilder Builder(VecPreheader);
2852
2853 // Create the ActiveLaneMask instruction using the correct start values.
2854 VPValue *TC = Plan.getTripCount();
2855 VPValue *VF = &Plan.getVF();
2856
2857 auto *EntryIncrement = Builder.createOverflowingOp(
2858 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2859 DL, "index.part.next");
2860
2861 // Create the active lane mask instruction in the VPlan preheader.
2862 VPValue *ALMMultiplier =
2863 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2864 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2865 {EntryIncrement, TC, ALMMultiplier}, DL,
2866 "active.lane.mask.entry");
2867
2868 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2869 // preheader ActiveLaneMask instruction.
2870 auto *LaneMaskPhi =
2872 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
2873 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
2874
2875 // Create the active lane mask for the next iteration of the loop before the
2876 // original terminator.
2877 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2878 Builder.setInsertPoint(OriginalTerminator);
2879 auto *InLoopIncrement = Builder.createOverflowingOp(
2881 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2882 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2883 {InLoopIncrement, TC, ALMMultiplier}, DL,
2884 "active.lane.mask.next");
2885 LaneMaskPhi->addBackedgeValue(ALM);
2886
2887 // Replace the original terminator with BranchOnCond. We have to invert the
2888 // mask here because a true condition means jumping to the exit block.
2889 auto *NotMask = Builder.createNot(ALM, DL);
2890 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2891 OriginalTerminator->eraseFromParent();
2892 return LaneMaskPhi;
2893}
2894
2896 bool UseActiveLaneMaskForControlFlow) {
2897 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2898 auto *WideCanonicalIV =
2900 assert(WideCanonicalIV &&
2901 "Must have widened canonical IV when tail folding!");
2902 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2903 VPSingleDefRecipe *LaneMask;
2904 if (UseActiveLaneMaskForControlFlow) {
2905 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2906 } else {
2907 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2908 VPValue *ALMMultiplier =
2909 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2910 LaneMask =
2911 B.createNaryOp(VPInstruction::ActiveLaneMask,
2912 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2913 nullptr, "active.lane.mask");
2914 }
2915
2916 // Walk users of WideCanonicalIV and replace the header mask of the form
2917 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2918 // removing the old one to ensure there is always only a single header mask.
2919 HeaderMask->replaceAllUsesWith(LaneMask);
2920 HeaderMask->eraseFromParent();
2921}
2922
2923template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2924 Op0_t In;
2926
2927 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2928
2929 template <typename OpTy> bool match(OpTy *V) const {
2930 if (m_Specific(In).match(V)) {
2931 Out = nullptr;
2932 return true;
2933 }
2934 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2935 }
2936};
2937
2938/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2939/// Returns the remaining part \p Out if so, or nullptr otherwise.
2940template <typename Op0_t, typename Op1_t>
2941static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2942 Op1_t &Out) {
2943 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2944}
2945
2946static std::optional<Intrinsic::ID> getVPDivRemIntrinsic(Intrinsic::ID IntrID) {
2947 switch (IntrID) {
2948 case Intrinsic::masked_udiv:
2949 return Intrinsic::vp_udiv;
2950 case Intrinsic::masked_sdiv:
2951 return Intrinsic::vp_sdiv;
2952 case Intrinsic::masked_urem:
2953 return Intrinsic::vp_urem;
2954 case Intrinsic::masked_srem:
2955 return Intrinsic::vp_srem;
2956 default:
2957 return std::nullopt;
2958 }
2959}
2960
2961/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2962/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2963/// recipe could be created.
2964/// \p HeaderMask Header Mask.
2965/// \p CurRecipe Recipe to be transformed.
2966/// \p EVL The explicit vector length parameter of vector-predication
2967/// intrinsics.
2969 VPRecipeBase &CurRecipe, VPValue &EVL) {
2970 VPlan *Plan = CurRecipe.getParent()->getPlan();
2971 DebugLoc DL = CurRecipe.getDebugLoc();
2972 VPValue *Addr, *Mask, *EndPtr;
2973
2974 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
2975 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2976 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2977 EVLEndPtr->insertBefore(&CurRecipe);
2978 // Cast EVL (i32) to match the VF operand's type.
2979 VPValue *EVLAsVF = VPBuilder(EVLEndPtr).createScalarZExtOrTrunc(
2980 &EVL, EVLEndPtr->getOperand(1)->getScalarType(), EVL.getScalarType(),
2982 EVLEndPtr->setOperand(1, EVLAsVF);
2983 return EVLEndPtr;
2984 };
2985
2986 auto GetVPReverse = [&CurRecipe, &EVL, Plan,
2988 if (!V)
2989 return nullptr;
2990 auto *Reverse = new VPWidenIntrinsicRecipe(
2991 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
2992 V->getScalarType(), {}, {}, DL);
2993 Reverse->insertBefore(&CurRecipe);
2994 return Reverse;
2995 };
2996
2997 if (match(&CurRecipe,
2998 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
2999 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3000 EVL, Mask);
3001
3002 if (match(&CurRecipe,
3003 m_MaskedLoad(m_VPValue(EndPtr),
3004 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3005 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3006 Mask = GetVPReverse(Mask);
3007 Addr = AdjustEndPtr(EndPtr);
3008 auto *LoadR = new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
3009 Addr, EVL, Mask);
3010 LoadR->insertBefore(&CurRecipe);
3011 VPValue *Poison =
3012 Plan->getOrAddLiveIn(PoisonValue::get(LoadR->getScalarType()));
3013 return new VPWidenIntrinsicRecipe(Intrinsic::vector_splice_left,
3014 {Poison, LoadR, &EVL},
3015 LoadR->getScalarType(), {}, {}, DL);
3016 }
3017
3018 VPValue *Stride;
3020 m_VPValue(Addr), m_VPValue(Stride),
3021 m_RemoveMask(HeaderMask, Mask),
3022 m_TruncOrSelf(m_Specific(&Plan->getVF()))))) {
3023 if (!Mask)
3024 Mask = Plan->getTrue();
3025 auto *NewLoad = cast<VPWidenMemIntrinsicRecipe>(&CurRecipe)->clone();
3026 NewLoad->setOperand(2, Mask);
3027 NewLoad->setOperand(3, &EVL);
3028 return NewLoad;
3029 }
3030
3031 VPValue *StoredVal;
3032 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3033 m_RemoveMask(HeaderMask, Mask))))
3034 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3035 StoredVal, EVL, Mask);
3036
3037 if (match(&CurRecipe,
3038 m_MaskedStore(m_VPValue(EndPtr), m_VPValue(StoredVal),
3039 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3040 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3041 Mask = GetVPReverse(Mask);
3042 Addr = AdjustEndPtr(EndPtr);
3043 VPValue *Poison =
3044 Plan->getOrAddLiveIn(PoisonValue::get(StoredVal->getScalarType()));
3045 auto *SpliceR = new VPWidenIntrinsicRecipe(
3046 Intrinsic::vector_splice_right, {StoredVal, Poison, &EVL},
3047 StoredVal->getScalarType(), {}, {}, DL);
3048 SpliceR->insertBefore(&CurRecipe);
3049 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3050 SpliceR, EVL, Mask);
3051 }
3052
3053 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3054 if (Rdx->isConditional() &&
3055 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3056 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3057
3058 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3059 if (Interleave->getMask() &&
3060 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3061 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3062
3063 VPValue *LHS, *RHS;
3064 if (match(&CurRecipe, m_SelectLike(m_RemoveMask(HeaderMask, Mask),
3066 return new VPWidenIntrinsicRecipe(
3067 Intrinsic::vp_merge, {Mask ? Mask : Plan->getTrue(), LHS, RHS, &EVL},
3068 LHS->getScalarType(), {}, {}, DL);
3069
3070 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3071 Type *Ty = CurRecipe.getVPSingleValue()->getScalarType();
3072 VPValue *ZExt =
3073 VPBuilder(&CurRecipe)
3074 .createScalarZExtOrTrunc(&EVL, Ty, EVL.getScalarType(), DL);
3075 return new VPInstruction(
3076 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3077 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3078 }
3079
3080 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
3081 if (match(&CurRecipe,
3083 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
3084 return new VPWidenIntrinsicRecipe(Intrinsic::vp_merge,
3085 {RHS, Plan->getTrue(), LHS, &EVL},
3086 LHS->getScalarType(), {}, {}, DL);
3087
3088 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(&CurRecipe))
3089 if (auto VPID = getVPDivRemIntrinsic(IntrR->getVectorIntrinsicID()))
3090 if (match(IntrR->getOperand(2), m_RemoveMask(HeaderMask, Mask)))
3091 return new VPWidenIntrinsicRecipe(*VPID,
3092 {IntrR->getOperand(0),
3093 IntrR->getOperand(1),
3094 Mask ? Mask : Plan->getTrue(), &EVL},
3095 IntrR->getScalarType(), {}, {}, DL);
3096
3097 return nullptr;
3098}
3099
3100/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3101/// The transforms here need to preserve the original semantics.
3103 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3104 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3107 m_VPValue(EVL))) &&
3108 match(EVL, m_EVL(m_VPValue()))) {
3109 HeaderMask = R.getVPSingleValue();
3110 break;
3111 }
3112 }
3113 if (!HeaderMask)
3114 return;
3115
3116 SmallVector<VPRecipeBase *> OldRecipes;
3117 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3119 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, *EVL)) {
3120 NewR->insertBefore(R);
3121 for (auto [Old, New] :
3122 zip_equal(R->definedValues(), NewR->definedValues()))
3123 Old->replaceAllUsesWith(New);
3124 OldRecipes.push_back(R);
3125 }
3126 }
3127
3128 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3129 // False, EVL)
3130 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3131 VPValue *Mask;
3132 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3133 auto *LogicalAnd = cast<VPInstruction>(U);
3134 auto *Merge = new VPWidenIntrinsicRecipe(
3135 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3136 Mask->getScalarType(), {}, {}, LogicalAnd->getDebugLoc());
3137 Merge->insertBefore(LogicalAnd);
3138 LogicalAnd->replaceAllUsesWith(Merge);
3139 OldRecipes.push_back(LogicalAnd);
3140 }
3141 }
3142
3143 // Fold the following splice patterns into vp.reverse for reverse accesses:
3144 // vector.reverse(splice.left(poison, x, evl)) -> vp.reverse(x, true, evl)
3145 // splice.right(vector.reverse(x), poison, evl) -> vp.reverse(x, true, evl)
3146 for (VPUser *U : collectUsersRecursively(EVL)) {
3147 VPValue *X;
3148 if (!match(U,
3151 m_Poison(), m_VPValue(X), m_Specific(EVL))),
3153 m_Reverse(m_VPValue(X)), m_Poison(), m_Specific(EVL)))))
3154 continue;
3155
3156 auto *Def = cast<VPSingleDefRecipe>(U);
3157 auto *VPReverse = new VPWidenIntrinsicRecipe(
3158 Intrinsic::experimental_vp_reverse, {X, Plan.getTrue(), EVL},
3159 X->getScalarType(), {}, {}, Def->getDebugLoc());
3160 VPReverse->insertBefore(Def);
3161 Def->replaceAllUsesWith(VPReverse);
3162 OldRecipes.push_back(Def);
3163 }
3164
3165 for (VPRecipeBase *R : reverse(OldRecipes)) {
3166 SmallVector<VPValue *> PossiblyDead(R->operands());
3167 R->eraseFromParent();
3168 for (VPValue *Op : PossiblyDead)
3170 }
3171}
3172
3173/// After replacing the canonical IV with an EVL-based IV, fix up recipes that
3174/// use VF to use the EVL instead to avoid incorrect updates on the penultimate
3175/// iteration.
3176static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3177 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3178 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3179
3180 // EVL is i32 but VF/VFxUF are IdxTy. Convert as needed.
3181 VPValue *EVLAsIdx =
3185
3186 assert(all_of(Plan.getVF().users(),
3187 [&Plan](VPUser *U) {
3188 auto IsAllowedUser =
3189 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
3190 VPWidenIntOrFpInductionRecipe,
3191 VPWidenMemIntrinsicRecipe>;
3192 if (match(U, m_Trunc(m_Specific(&Plan.getVF()))))
3193 return all_of(cast<VPSingleDefRecipe>(U)->users(),
3194 IsAllowedUser);
3195 return IsAllowedUser(U);
3196 }) &&
3197 "User of VF that we can't transform to EVL.");
3198 Plan.getVF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3200 });
3201
3202 assert(all_of(Plan.getVFxUF().users(),
3204 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3205 m_Specific(&Plan.getVFxUF())),
3207 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3208 "increment of the canonical induction.");
3209 Plan.getVFxUF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3210 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3211 // canonical induction must not be updated.
3213 });
3214
3215 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3216 // contained.
3217 bool ContainsFORs =
3219 if (ContainsFORs) {
3220 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3221 VPValue *MaxEVL = &Plan.getVF();
3222 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3223 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3224 MaxEVL = Builder.createScalarZExtOrTrunc(
3225 MaxEVL, Type::getInt32Ty(Plan.getContext()), MaxEVL->getScalarType(),
3227
3228 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3229 VPValue *PrevEVL = Builder.createScalarPhi(
3230 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3231
3234 for (VPRecipeBase &R : *VPBB) {
3235 VPValue *V1, *V2;
3236 if (!match(&R,
3238 m_VPValue(V1), m_VPValue(V2))))
3239 continue;
3240 VPValue *Imm = Plan.getOrAddLiveIn(
3243 Intrinsic::experimental_vp_splice,
3244 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3245 R.getVPSingleValue()->getScalarType(), {}, {}, R.getDebugLoc());
3246 VPSplice->insertBefore(&R);
3247 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3248 }
3249 }
3250 }
3251
3252 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3253 if (!HeaderMask)
3254 return;
3255
3256 // Ensure that any reduction that uses a select to mask off tail lanes does so
3257 // in the vector loop, not the middle block, since EVL tail folding can have
3258 // tail elements in the penultimate iteration.
3259 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3260 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3261 m_VPValue(), m_VPValue()))))
3262 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3263 Plan.getVectorLoopRegion();
3264 return true;
3265 }));
3266
3267 // Replace header masks with a mask equivalent to predicating by EVL:
3268 //
3269 // icmp ule widen-canonical-iv backedge-taken-count
3270 // ->
3271 // icmp ult step-vector, EVL
3272 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3273 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3274 Type *EVLType = EVL.getScalarType();
3275 VPValue *EVLMask = Builder.createICmp(
3277 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3278 HeaderMask->replaceAllUsesWith(EVLMask);
3279}
3280
3281/// Converts a tail folded vector loop region to step by
3282/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3283/// iteration.
3284///
3285/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3286/// replaces all uses of the canonical IV except for the canonical IV
3287/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3288/// only for loop iterations counting after this transformation.
3289///
3290/// - The header mask is replaced with a header mask based on the EVL.
3291///
3292/// - Plans with FORs have a new phi added to keep track of the EVL of the
3293/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3294/// @llvm.vp.splice.
3295///
3296/// The function uses the following definitions:
3297/// %StartV is the canonical induction start value.
3298///
3299/// The function adds the following recipes:
3300///
3301/// vector.ph:
3302/// ...
3303///
3304/// vector.body:
3305/// ...
3306/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3307/// [ %NextIter, %vector.body ]
3308/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3309/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3310/// ...
3311/// %OpEVL = cast i32 %VPEVL to IVSize
3312/// %NextIter = add IVSize %OpEVL, %CurrentIter
3313/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3314/// ...
3315///
3316/// If MaxSafeElements is provided, the function adds the following recipes:
3317/// vector.ph:
3318/// ...
3319///
3320/// vector.body:
3321/// ...
3322/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3323/// [ %NextIter, %vector.body ]
3324/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3325/// %cmp = cmp ult %AVL, MaxSafeElements
3326/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3327/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3328/// ...
3329/// %OpEVL = cast i32 %VPEVL to IVSize
3330/// %NextIter = add IVSize %OpEVL, %CurrentIter
3331/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3332/// ...
3333///
3335 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3336 if (Plan.hasScalarVFOnly())
3337 return;
3338 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3339 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3340
3341 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3342 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3343 VPValue *StartV = Plan.getZero(CanIVTy);
3344 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3345
3346 // Create the CurrentIteration recipe in the vector loop.
3347 auto *CurrentIteration =
3349 CurrentIteration->insertBefore(*Header, Header->begin());
3350 VPBuilder Builder(Header, Header->getFirstNonPhi());
3351 // Create the AVL (application vector length), starting from TC -> 0 in steps
3352 // of EVL.
3353 VPPhi *AVLPhi = Builder.createScalarPhi(
3354 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3355 VPValue *AVL = AVLPhi;
3356
3357 if (MaxSafeElements) {
3358 // Support for MaxSafeDist for correct loop emission.
3359 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3360 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3361 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3362 "safe_avl");
3363 }
3364 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3365 DebugLoc::getUnknown(), "evl");
3366
3367 Builder.setInsertPoint(CanonicalIVIncrement);
3368 VPValue *OpVPEVL = VPEVL;
3369
3370 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3371 OpVPEVL = Builder.createScalarZExtOrTrunc(
3372 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3373
3374 auto *NextIter = Builder.createAdd(
3375 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3376 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3377 CurrentIteration->addBackedgeValue(NextIter);
3378
3379 VPValue *NextAVL =
3380 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3381 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3382 AVLPhi->addIncoming(NextAVL);
3383
3384 fixupVFUsersForEVL(Plan, *VPEVL);
3385 removeDeadRecipes(Plan);
3386
3387 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3388 // except for the canonical IV increment.
3389 CanonicalIV->replaceAllUsesWith(CurrentIteration);
3390 CanonicalIVIncrement->setOperand(0, CanonicalIV);
3391 // TODO: support unroll factor > 1.
3392 Plan.setUF(1);
3393}
3394
3396 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3397 // There should be only one VPCurrentIteration in the entire plan.
3398 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3399
3402 for (VPRecipeBase &R : VPBB->phis())
3403 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3404 assert(!CurrentIteration &&
3405 "Found multiple CurrentIteration. Only one expected");
3406 CurrentIteration = PhiR;
3407 }
3408
3409 // Early return if it is not variable-length stepping.
3410 if (!CurrentIteration)
3411 return;
3412
3413 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3414 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3415
3416 // Convert CurrentIteration to concrete recipe.
3417 auto *ScalarR =
3418 VPBuilder(CurrentIteration)
3420 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3421 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3422 CurrentIteration->replaceAllUsesWith(ScalarR);
3423 CurrentIteration->eraseFromParent();
3424
3425 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
3426 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3427 if (auto *CanIVInc = findUserOf(
3428 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3429 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3430 CanIVInc->eraseFromParent();
3431 }
3432}
3433
3435 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3436 if (!LoopRegion)
3437 return;
3438 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3439 if (Header->empty())
3440 return;
3441 // The EVL IV is always at the beginning.
3442 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3443 if (!EVLPhi)
3444 return;
3445
3446 // Bail if not an EVL tail folded loop.
3447 VPValue *AVL;
3448 if (!match(EVLPhi->getBackedgeValue(),
3449 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3450 return;
3451
3452 // The AVL may be capped to a safe distance.
3453 VPValue *SafeAVL, *UnsafeAVL;
3454 if (match(AVL,
3456 m_VPValue(SafeAVL)),
3457 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3458 AVL = UnsafeAVL;
3459
3460 VPValue *AVLNext;
3461 [[maybe_unused]] bool FoundAVLNext =
3463 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3464 assert(FoundAVLNext && "Didn't find AVL backedge?");
3465
3466 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3467 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3468 if (match(LatchBr, m_BranchOnCond(m_True())))
3469 return;
3470
3471 VPValue *CanIVInc;
3472 [[maybe_unused]] bool FoundIncrement = match(
3473 LatchBr,
3475 m_Specific(&Plan.getVectorTripCount()))));
3476 assert(FoundIncrement &&
3477 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3478 m_Specific(&Plan.getVFxUF()))) &&
3479 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3480 "trip count");
3481
3482 Type *AVLTy = AVLNext->getScalarType();
3483 VPBuilder Builder(LatchBr);
3484 LatchBr->setOperand(
3485 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3486}
3487
3489 VPlan &Plan, PredicatedScalarEvolution &PSE,
3490 const DenseMap<Value *, const SCEV *> &StridesMap,
3491 const VPDominatorTree &VPDT) {
3492 // Replace VPValues for known constant strides guaranteed by predicated scalar
3493 // evolution that are guaranteed to be guarded by the runtime checks; that is,
3494 // blocks dominated by the vector preheader.
3495 assert(!Plan.getVectorLoopRegion() &&
3496 "expected to run before loop regions are created");
3497 VPBlockBase *Preheader = Plan.getEntry()->getSuccessors()[1];
3498 auto CanUseVersionedStride = [&VPDT, Preheader](VPUser &U, unsigned) {
3499 auto *R = cast<VPRecipeBase>(&U);
3500 VPBlockBase *Parent = R->getParent();
3501 return VPDT.dominates(Preheader, Parent);
3502 };
3503 ValueToSCEVMapTy RewriteMap;
3504 for (const SCEV *Stride : StridesMap.values()) {
3505 using namespace SCEVPatternMatch;
3506 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3507 const APInt *StrideConst;
3508 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3509 // Only handle constant strides for now.
3510 continue;
3511
3512 auto *CI = Plan.getConstantInt(*StrideConst);
3513 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3514 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3515
3516 // The versioned value may not be used in the loop directly but through a
3517 // sext/zext. Add new live-ins in those cases.
3518 for (Value *U : StrideV->users()) {
3520 continue;
3521 VPValue *StrideVPV = Plan.getLiveIn(U);
3522 if (!StrideVPV)
3523 continue;
3524 unsigned BW = U->getType()->getScalarSizeInBits();
3525 APInt C =
3526 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3527 VPValue *CI = Plan.getConstantInt(C);
3528 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3529 }
3530 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3531 }
3532
3533 for (VPRecipeBase &R : *Plan.getEntry()) {
3534 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3535 if (!ExpSCEV)
3536 continue;
3537 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3538 auto *NewSCEV =
3539 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3540 if (NewSCEV != ScevExpr) {
3541 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3542 ExpSCEV->replaceAllUsesWith(NewExp);
3543 if (Plan.getTripCount() == ExpSCEV)
3544 Plan.resetTripCount(NewExp);
3545 }
3546 }
3547}
3548
3550 // Collect recipes in the backward slice of `Root` that may generate a poison
3551 // value that is used after vectorization.
3553 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3555 Worklist.push_back(Root);
3556
3557 // Traverse the backward slice of Root through its use-def chain.
3558 while (!Worklist.empty()) {
3559 VPRecipeBase *CurRec = Worklist.pop_back_val();
3560
3561 if (!Visited.insert(CurRec).second)
3562 continue;
3563
3564 // Prune search if we find another recipe generating a widen memory
3565 // instruction. Widen memory instructions involved in address computation
3566 // will lead to gather/scatter instructions, which don't need to be
3567 // handled.
3569 VPHeaderPHIRecipe>(CurRec))
3570 continue;
3571
3572 // This recipe contributes to the address computation of a widen
3573 // load/store. If the underlying instruction has poison-generating flags,
3574 // drop them directly.
3575 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3576 VPValue *A, *B;
3577 // Dropping disjoint from an OR may yield incorrect results, as some
3578 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3579 // for dependence analysis). Instead, replace it with an equivalent Add.
3580 // This is possible as all users of the disjoint OR only access lanes
3581 // where the operands are disjoint or poison otherwise.
3582 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3583 RecWithFlags->isDisjoint()) {
3584 VPBuilder Builder(RecWithFlags);
3585 VPInstruction *New =
3586 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3587 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3588 RecWithFlags->replaceAllUsesWith(New);
3589 RecWithFlags->eraseFromParent();
3590 CurRec = New;
3591 } else
3592 RecWithFlags->dropPoisonGeneratingFlags();
3593 } else {
3596 (void)Instr;
3597 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3598 "found instruction with poison generating flags not covered by "
3599 "VPRecipeWithIRFlags");
3600 }
3601
3602 // Add new definitions to the worklist.
3603 for (VPValue *Operand : CurRec->operands())
3604 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3605 Worklist.push_back(OpDef);
3606 }
3607 });
3608
3609 // We want to exclude the tail folding case, as we don't need to drop flags
3610 // for operations computing the first lane in this case: the first lane of the
3611 // header mask must always be true.
3612 auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
3613 return Mask && !vputils::isHeaderMask(Mask, Plan);
3614 };
3615
3616 // Traverse all the recipes in the VPlan and collect the poison-generating
3617 // recipes in the backward slice starting at the address of a
3618 // VPWidenMemoryRecipe or VPInterleaveRecipe.
3619 auto Iter =
3622 for (VPRecipeBase &Recipe : *VPBB) {
3623 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3624 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3625 if (AddrDef && WidenRec->isConsecutive() &&
3626 IsNotHeaderMask(WidenRec->getMask()))
3627 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3628 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3629 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3630 if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
3631 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3632 }
3633 }
3634 }
3635}
3636
// NOTE(review): this listing omits several original lines of this function:
// 3637 (return type and name), 3639 (start of the InterleaveGroups parameter
// type), 3645 (declaration of IRMemberToRecipe), 3647-3648 (the range iterated
// by the first loop) and 3692 (declaration of NW). Confirm them against the
// real file before relying on the comments below.
//
// For every interleave group in InterleaveGroups whose members all still have
// a widened memory recipe, creates one VPInterleaveRecipe at the group's insert
// position, redirects the users of the member recipes to it and erases the
// member recipes. Returns immediately when InterleaveGroups is empty.
3638 VPlan &Plan,
3640 &InterleaveGroups,
3641 const bool &EpilogueAllowed) {
3642 if (InterleaveGroups.empty())
3643 return;
3644
// Record, for each IR memory instruction, the widened memory recipe whose
// ingredient it is.
3646 for (VPBasicBlock *VPBB :
3649 for (VPRecipeBase &R : make_filter_range(*VPBB, [](VPRecipeBase &R) {
3650 return isa<VPWidenMemoryRecipe>(&R);
3651 })) {
3652 auto *MemR = cast<VPWidenMemoryRecipe>(&R);
3653 IRMemberToRecipe[&MemR->getIngredient()] = MemR;
3654 }
3655
3656 // Interleave memory: for each Interleave Group we marked earlier as relevant
3657 // for this VPlan, replace the Recipes widening its memory instructions with a
3658 // single VPInterleaveRecipe at its insertion point.
3659 VPDominatorTree VPDT(Plan);
3660 for (const auto *IG : InterleaveGroups) {
3661 // Skip interleave groups where members don't have recipes. This can happen
3662 // when removeDeadRecipes removes recipes that are part of interleave groups
3663 // but have no users.
3664 if (llvm::any_of(IG->members(), [&IRMemberToRecipe](Instruction *Member) {
3665 return !IRMemberToRecipe.contains(Member);
3666 }))
3667 continue;
3668
// Start is the recipe of member 0. Collect the stored values of all store
// members in member order and intersect the metadata of all member recipes.
3669 auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
3670 VPIRMetadata InterleaveMD(*Start);
3671 SmallVector<VPValue *, 4> StoredValues;
3672 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start->getAsRecipe()))
3673 StoredValues.push_back(StoreR->getStoredValue());
3674 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3675 Instruction *MemberI = IG->getMember(I);
// A null member is a gap in the group; there is nothing to collect for it.
3676 if (!MemberI)
3677 continue;
3678 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
3679 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR->getAsRecipe()))
3680 StoredValues.push_back(StoreR->getStoredValue());
3681 InterleaveMD.intersect(*MemoryR);
3682 }
3683
// A gap mask is requested either when the group requires a scalar epilogue
// but EpilogueAllowed is false, or when the group stores values and is not
// full.
3684 bool NeedsMaskForGaps =
3685 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3686 (!StoredValues.empty() && !IG->isFull());
3687
3688 Instruction *IRInsertPos = IG->getInsertPos();
3689 auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
3690 VPRecipeBase *InsertPosR = InsertPos->getAsRecipe();
3691
// If the pointer operand of the insert position (ignoring pointer casts) is a
// GEP, take its no-wrap flags with the no-unsigned-wrap flag removed.
// NOTE(review): NW is declared on a line missing from this listing.
3693 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3694 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3695 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3696
3697 // Get or create the start address for the interleave group.
3698 VPValue *Addr = Start->getAddr();
3699 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3700 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPosR)) {
3701 // We cannot re-use the address of member zero because it does not
3702 // dominate the insert position. Instead, use the address of the insert
3703 // position and create a PtrAdd adjusting it to the address of member
3704 // zero.
3705 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3706 // InsertPos or sink loads above zero members to join it.
3707 assert(IG->getIndex(IRInsertPos) != 0 &&
3708 "index of insert position shouldn't be zero");
3709 auto &DL = IRInsertPos->getDataLayout();
// The offset is the alloc size of the accessed type times the index of the
// insert position in the group, held in a 32-bit APInt and negated below.
3710 APInt Offset(32,
3711 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3712 IG->getIndex(IRInsertPos),
3713 /*IsSigned=*/true);
3714 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3715 VPBuilder B(InsertPosR);
3716 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3717 }
3718 // If the group is reverse, adjust the index to refer to the last vector
3719 // lane instead of the first. We adjust the index from the first vector
3720 // lane, rather than directly getting the pointer for lane VF - 1, because
3721 // the pointer operand of the interleaved access is supposed to be uniform.
3722 if (IG->isReverse()) {
3723 auto *ReversePtr = new VPVectorEndPointerRecipe(
3724 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3725 -(int64_t)IG->getFactor(), NW, InsertPosR->getDebugLoc());
3726 ReversePtr->insertBefore(InsertPosR);
3727 Addr = ReversePtr;
3728 }
// Create the interleave recipe right before the insert position, using the
// mask of the insert position's recipe.
3729 auto *VPIG = new VPInterleaveRecipe(
3730 IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps,
3731 InterleaveMD, InsertPosR->getDebugLoc());
3732 VPIG->insertBefore(InsertPosR);
3733
// Replace each member recipe: members whose IR type is not void have their
// value replaced by the next value of the interleave recipe (J only advances
// for those members); every member recipe is then erased.
3734 unsigned J = 0;
3735 for (unsigned i = 0; i < IG->getFactor(); ++i)
3736 if (Instruction *Member = IG->getMember(i)) {
3737 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member)->getAsRecipe();
3738 if (!Member->getType()->isVoidTy()) {
3739 VPValue *OriginalV = MemberR->getVPSingleValue();
3740 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3741 J++;
3742 }
3743 MemberR->eraseFromParent();
3744 }
3745 }
3746}
3747
3748/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3749/// value, phi and backedge value. In the following example:
3750///
3751/// vector.ph:
3752/// Successor(s): vector loop
3753///
3754/// <x1> vector loop: {
3755/// vector.body:
3756/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3757/// ...
3758/// EMIT branch-on-count ...
3759/// No successors
3760/// }
3761///
3762/// WIDEN-INDUCTION will get expanded to:
3763///
3764/// vector.ph:
3765/// ...
3766/// vp<%induction.start> = ...
3767/// vp<%induction.increment> = ...
3768///
3769/// Successor(s): vector loop
3770///
3771/// <x1> vector loop: {
3772/// vector.body:
3773/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3774/// ...
3775/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3776/// EMIT branch-on-count ...
3777/// No successors
3778/// }
///
/// The users of the induction recipe are redirected to the new widened phi;
/// the induction recipe itself is not erased by the code visible here.
// NOTE(review): this listing omits original lines 3780 (function name and
// parameter), 3792-3793 (declarations of AddOp and MulOp), 3815 (the
// initializer of IVIntTy) and 3858 (declaration of ExitingBB).
3779static void
3781 VPlan *Plan = WidenIVR->getParent()->getPlan();
3782 VPValue *Start = WidenIVR->getStartValue();
3783 VPValue *Step = WidenIVR->getStepValue();
3784 VPValue *VF = WidenIVR->getVFValue();
3785 DebugLoc DL = WidenIVR->getDebugLoc();
3786
3787 // The value from the original loop to which we are mapping the new induction
3788 // variable.
3789 Type *Ty = WidenIVR->getScalarType();
3790
3791 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
// Choose the opcodes used to build the induction: Add/Mul for integer
// inductions; otherwise the opcode from the induction descriptor together
// with FMul. The IR flags of the induction recipe are reused for the new
// arithmetic.
3794 VPIRFlags Flags = *WidenIVR;
3795 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3796 AddOp = Instruction::Add;
3797 MulOp = Instruction::Mul;
3798 } else {
3799 AddOp = ID.getInductionOpcode();
3800 MulOp = Instruction::FMul;
3801 }
3802
3803 // If the phi is truncated, truncate the start and step values.
3804 VPBuilder Builder(Plan->getVectorPreheader());
3805 Type *StepTy = Step->getScalarType();
3806 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3807 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3808 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3809 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3810 StepTy = Ty;
3811 }
3812
3813 // Construct the initial value of the vector IV in the vector loop preheader.
// The initial value is SplatStart AddOp (StepVector MulOp SplatStep); for
// floating-point steps the step vector is first converted with UIToFP.
3814 Type *IVIntTy =
3816 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3817 if (StepTy->isFloatingPointTy())
3818 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3819
3820 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3821 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3822
3823 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3824 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3825 DebugLoc::getUnknown(), "induction");
3826
3827 // Create the widened phi of the vector IV.
3828 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3829 Init, WidenIVR->getDebugLoc(), "vec.ind");
3830
3831 // Create the backedge value for the vector IV.
3832 VPValue *Inc;
3833 VPValue *Prev;
3834 // If unrolled, use the increment and prev value from the operands.
3835 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3836 Inc = SplatVF;
3837 Prev = WidenIVR->getLastUnrolledPartOperand();
3838 } else {
3839 // Move the insertion point after the VF definition when the VF is defined
3840 // inside a loop, such as for EVL tail-folding.
3841 if (VPRecipeBase *R = VF->getDefiningRecipe())
3842 if (R->getParent()->getEnclosingLoopRegion())
3843 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3844
3845 // Multiply the vectorization factor by the step using integer or
3846 // floating-point arithmetic as appropriate.
3847 if (StepTy->isFloatingPointTy())
3848 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3849 DL);
3850 else
3851 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3852
// The per-iteration increment is Step MulOp VF, broadcast to a vector; the
// value being incremented is the widened phi itself.
3853 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3854 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3855 Prev = WidePHI;
3856 }
3857
// Emit the increment right before the terminator of ExitingBB and make it the
// backedge value of the widened phi.
// NOTE(review): ExitingBB is defined on a line missing from this listing.
3859 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3860 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3861 WidenIVR->getDebugLoc(), "vec.ind.next");
3862
3863 WidePHI->addIncoming(Next);
3864
3865 WidenIVR->replaceAllUsesWith(WidePHI);
3866}
3867
3868/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3869/// initial value, phi and backedge value. In the following example:
3870///
3871/// <x1> vector loop: {
3872/// vector.body:
3873/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3874/// ...
3875/// EMIT branch-on-count ...
3876/// }
3877///
3878/// WIDEN-POINTER-INDUCTION will get expanded to:
3879///
3880/// <x1> vector loop: {
3881/// vector.body:
3882/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3883/// EMIT %mul = mul %stepvector, %step
3884/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3885/// ...
3886/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3887/// EMIT branch-on-count ...
3888/// }
///
/// The users of the recipe are redirected to the wide pointer add; the recipe
/// itself is not erased by the code visible here.
// NOTE(review): this listing omits original lines 3889 (function signature),
// 3896 (the induction kind compared against in the first assert) and 3919
// (declaration of ExitingBB).
3890 VPlan *Plan = R->getParent()->getPlan();
3891 VPValue *Start = R->getStartValue();
3892 VPValue *Step = R->getStepValue();
3893 VPValue *VF = R->getVFValue();
3894
3895 assert(R->getInductionDescriptor().getKind() ==
3897 "Not a pointer induction according to InductionDescriptor!");
3898 assert(R->getScalarType()->isPointerTy() && "Unexpected type.");
3899 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3900 "Recipe should have been replaced");
3901
3902 VPBuilder Builder(R);
3903 DebugLoc DL = R->getDebugLoc();
3904
3905 // Build a scalar pointer phi.
3906 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3907
3908 // Create actual address geps that use the pointer phi as base and a
3909 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
// These recipes are inserted at the first non-phi position of the block that
// contains R.
3910 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3911 Type *StepTy = Step->getScalarType();
3912 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3913 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3914 VPValue *PtrAdd =
3915 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3916 R->replaceAllUsesWith(PtrAdd);
3917
3918 // Create the backedge value for the scalar pointer phi.
// The increment is Step * VF, with VF first zero-extended or truncated to the
// step type; it is emitted right before the terminator of ExitingBB.
3920 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3921 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3922 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3923
3924 VPValue *InductionGEP =
3925 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3926 ScalarPtrPhi->addIncoming(InductionGEP);
3927}
3928
3929/// Expand a VPDerivedIVRecipe into executable recipes.
///
/// The users of the recipe are redirected to Start combined with Index * Step,
/// where the way the two are combined depends on the induction kind. The
/// recipe itself is not erased here.
// NOTE(review): this listing omits original lines 3930 (function signature),
// 3941 (the last argument of the SIToFP cast), 3943, 3949, 3952 and 3964 (the
// case labels of the switch). The per-case comments below describe only the
// visible bodies; confirm which induction kind each one belongs to.
3931 VPBuilder Builder(R);
3932 VPIRValue *Start = R->getStartValue();
3933 VPValue *Step = R->getStepValue();
3934 VPValue *Index = R->getIndex();
3935 Type *StepTy = Step->getScalarType();
3936 Type *IndexTy = Index->getScalarType();
// Bring the index to the step type: sign-extend or truncate for integer
// steps, convert with SIToFP otherwise.
3937 Index = StepTy->isIntegerTy()
3938 ? Builder.createScalarSExtOrTrunc(
3939 Index, StepTy, IndexTy, DebugLoc::getCompilerGenerated())
3940 : Builder.createScalarCast(Instruction::SIToFP, Index, StepTy,
3942 switch (R->getInductionKind()) {
// First visible case: Start + Index * Step using an integer add.
3944 assert(Index->getScalarType() == Start->getScalarType() &&
3945 "Index type does not match StartValue type");
3946 return R->replaceAllUsesWith(Builder.createAdd(
3947 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
3948 }
// Second visible case: pointer add of Index * Step to Start.
3950 return R->replaceAllUsesWith(Builder.createPtrAdd(
3951 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
// Third visible case: floating-point step. The original FAdd/FSub opcode and
// its fast-math flags are reused for Start op (Step * Index).
3953 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
3954 const FPMathOperator *FPBinOp = R->getFPBinOp();
3955 assert(FPBinOp &&
3956 (FPBinOp->getOpcode() == Instruction::FAdd ||
3957 FPBinOp->getOpcode() == Instruction::FSub) &&
3958 "Original BinOp should be defined for FP induction");
3959 FastMathFlags FMF = FPBinOp->getFastMathFlags();
3960 VPValue *FMul = Builder.createNaryOp(Instruction::FMul, {Step, Index}, FMF);
3961 return R->replaceAllUsesWith(
3962 Builder.createNaryOp(FPBinOp->getOpcode(), {Start, FMul}, FMF));
3963 }
// Last visible case: nothing is emitted.
3965 return;
3966 }
3967 llvm_unreachable("Unhandled induction kind");
3968}
3969
// NOTE(review): this listing omits original lines 3970 (function signature)
// and 3973 (the start of the loop header below).
3971 // Replace loop regions with explicit CFG.
// The regions are collected first and dissolved in a second loop, so that
// dissolving does not happen during the traversal that starts at the plan
// entry.
3972 SmallVector<VPRegionBlock *> LoopRegions;
3974 vp_depth_first_deep(Plan.getEntry()))) {
// Replicator regions are left untouched.
3975 if (!R->isReplicator())
3976 LoopRegions.push_back(R);
3977 }
3978 for (VPRegionBlock *R : LoopRegions)
3979 R->dissolveToCFGLoop();
3980}
3981
// NOTE(review): this listing omits original lines 3982-3983 (function
// signature and, presumably, the declaration of WorkList), 3986-3987 (the
// header of the collecting loop), 4036 (the branch created at the end of
// BrOnTwoCondsBB) and 4040 (the branch created in InterimBB).
3984 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3985 // terminated with BranchOnTwoConds are reached via a shallow traversal.
// Collect the terminators first; the CFG is only modified in the second loop.
3988 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3989 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3990 }
3991
3992 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3993 // single-condition branches:
3994 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3995 // the first condition is true, and otherwise jumps to a new interim block.
3996 // 2. A branch that ends the interim block, jumps to the second successor if
3997 // the second condition is true, and otherwise jumps to the third
3998 // successor.
3999 for (VPInstruction *Br : WorkList) {
4000 assert(Br->getNumOperands() == 2 &&
4001 "BranchOnTwoConds must have exactly 2 conditions");
4002 DebugLoc DL = Br->getDebugLoc();
4003 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
// Copy the successor list before disconnecting, since disconnecting changes
// the block's successors.
4004 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
4005 assert(Successors.size() == 3 &&
4006 "BranchOnTwoConds must have exactly 3 successors");
4007
4008 for (VPBlockBase *Succ : Successors)
4009 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
4010
4011 VPValue *Cond0 = Br->getOperand(0);
4012 VPValue *Cond1 = Br->getOperand(1);
4013 VPBlockBase *Succ0 = Successors[0];
4014 VPBlockBase *Succ1 = Successors[1];
4015 VPBlockBase *Succ2 = Successors[2];
4016
4017 // If the successor block for both conditions is the same, then combine the
4018 // two conditions and plant a single conditional branch.
4019 if (Succ0 == Succ1) {
4020 VPBuilder Builder(Br);
4021 VPValue *Combined = Builder.createOr(Cond0, Cond1, DL);
4022 Builder.createNaryOp(VPInstruction::BranchOnCond, {Combined}, DL);
4023 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4024 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ2);
4025 Br->eraseFromParent();
4026 continue;
4027 }
4028
4029 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
4030 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
4031
4032 VPBasicBlock *InterimBB =
4033 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
4034
// First branch: BrOnTwoCondsBB goes to Succ0 or to the interim block.
4035 VPBuilder(BrOnTwoCondsBB)
4037 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4038 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4039
// Second branch: the interim block goes to Succ1 or to Succ2.
4041 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4042 VPBlockUtils::connectBlocks(InterimBB, Succ2);
4043 Br->eraseFromParent();
4044 }
4045}
4046
// NOTE(review): this listing omits original lines 4047-4048 (function
// signature and the start of the outer loop header), 4053 and 4068 (presumably
// the calls that expand the two widened induction recipes before they are
// erased), 4132 (declaration of NotMasks), 4155 (the condition guarding the
// MaskedCond lowering), 4168 (the pattern of the CanonicalIVIncrementForPart
// match), 4191 (the start of the WideIVStep match) and 4199 (the start of the
// CastOp declaration). Confirm them against the real file.
//
// Visits the recipes of the blocks reached from the plan entry and replaces
// the recipe kinds handled below by simpler recipes. The inner loop uses an
// early-increment range because recipes are erased while iterating.
4049 vp_depth_first_deep(Plan.getEntry()))) {
4050 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
// New recipes are inserted right before the recipe being lowered.
4051 VPBuilder Builder(&R);
4052 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4054 WidenIVR->eraseFromParent();
4055 continue;
4056 }
4057
4058 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4059 // If the recipe only generates scalars, scalarize it instead of
4060 // expanding it.
4061 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4062 VPValue *PtrAdd =
4063 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4064 WidenIVR->replaceAllUsesWith(PtrAdd);
4065 WidenIVR->eraseFromParent();
4066 continue;
4067 }
4069 WidenIVR->eraseFromParent();
4070 continue;
4071 }
4072
4073 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {
4074 expandVPDerivedIV(DerivedIVR);
4075 DerivedIVR->eraseFromParent();
4076 continue;
4077 }
4078
// Lower the widened canonical IV to
// Broadcast(CanIV) + (Broadcast(Step) + StepVector), where a missing step is
// replaced by zero (only expected when the concrete UF is 1).
4079 if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(&R)) {
4080 VPValue *CanIV = WideCanIV->getCanonicalIV();
4081 Type *CanIVTy = CanIV->getScalarType();
4082 VPValue *Step = WideCanIV->getStepValue();
4083 if (!Step) {
4084 assert(Plan.getConcreteUF() == 1 &&
4085 "Expected unroller to have materialized step for UF != 1");
4086 Step = Plan.getZero(CanIVTy);
4087 }
4088 CanIV = Builder.createNaryOp(VPInstruction::Broadcast, CanIV);
4089 Step = Builder.createNaryOp(VPInstruction::Broadcast, Step);
4090 Step = Builder.createAdd(
4091 Step, Builder.createNaryOp(VPInstruction::StepVector, {}, CanIVTy));
4092 VPValue *CanVecIV =
4093 Builder.createAdd(CanIV, Step, WideCanIV->getDebugLoc(), "vec.iv",
4094 WideCanIV->getNoWrapFlags());
4095 WideCanIV->replaceAllUsesWith(CanVecIV);
4096 WideCanIV->eraseFromParent();
4097 continue;
4098 }
4099
4100 // Expand VPBlendRecipe into VPInstruction::Select.
// The chain starts from incoming value 0; each later incoming value I is
// selected over the result so far under mask I.
4101 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
4102 VPValue *Select = Blend->getIncomingValue(0);
4103 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4104 Select = Builder.createSelect(Blend->getMask(I),
4105 Blend->getIncomingValue(I), Select,
4106 R.getDebugLoc(), "predphi", *Blend);
4107 Blend->replaceAllUsesWith(Select);
4108 Blend->eraseFromParent();
4109 continue;
4110 }
4111
// Vector end pointer recipes are kept; only a missing offset is materialized.
4112 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4113 if (!VEPR->getOffset()) {
4114 assert(Plan.getConcreteUF() == 1 &&
4115 "Expected unroller to have materialized offset for UF != 1");
4116 VEPR->materializeOffset();
4117 }
4118 continue;
4119 }
4120
4121 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4122 Expr->decompose();
4123 Expr->eraseFromParent();
4124 continue;
4125 }
4126
4127 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4128 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4129 if (LastActiveL &&
4130 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4131 // Create Not(Mask) for all operands.
4133 for (VPValue *Op : LastActiveL->operands()) {
4134 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4135 NotMasks.push_back(NotMask);
4136 }
4137
4138 // Create FirstActiveLane on the inverted masks.
4139 VPValue *FirstInactiveLane = Builder.createFirstActiveLane(
4140 NotMasks, LastActiveL->getDebugLoc(), "first.inactive.lane");
4141
4142 // Subtract 1 to get the last active lane.
4143 VPValue *One =
4144 Plan.getConstantInt(FirstInactiveLane->getScalarType(), 1);
4145 VPValue *LastLane =
4146 Builder.createSub(FirstInactiveLane, One,
4147 LastActiveL->getDebugLoc(), "last.active.lane");
4148
4149 LastActiveL->replaceAllUsesWith(LastLane);
4150 LastActiveL->eraseFromParent();
4151 continue;
4152 }
4153
4154 // Lower MaskedCond with block mask to LogicalAnd.
4156 auto *VPI = cast<VPInstruction>(&R);
4157 assert(VPI->isMasked() &&
4158 "Unmasked MaskedCond should be simplified earlier");
4159 VPI->replaceAllUsesWith(Builder.createNaryOp(
4160 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4161 VPI->eraseFromParent();
4162 continue;
4163 }
4164
4165 // Lower CanonicalIVIncrementForPart to plain Add.
// The operands, no-wrap flags and debug location are carried over unchanged.
4166 if (match(
4167 &R,
4169 auto *VPI = cast<VPInstruction>(&R);
4170 VPValue *Add = Builder.createOverflowingOp(
4171 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4172 VPI->getDebugLoc());
4173 VPI->replaceAllUsesWith(Add);
4174 VPI->eraseFromParent();
4175 continue;
4176 }
4177
4178 // Lower BranchOnCount to ICmp + BranchOnCond.
4179 VPValue *IV, *TC;
4180 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4181 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4182 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4183 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4184 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4185 BranchOnCountInst->eraseFromParent();
4186 continue;
4187 }
4188
// Everything that is not matched by the (partially missing) WideIVStep
// pattern below is left as it is.
4189 VPValue *VectorStep;
4190 VPValue *ScalarStep;
4192 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4193 continue;
4194
4195 // Expand WideIVStep.
// Both steps are cast to the scalar type of the instruction when their types
// differ, then multiplied; the product replaces the instruction.
4196 auto *VPI = cast<VPInstruction>(&R);
4197 Type *IVTy = VPI->getScalarType();
4198 if (VectorStep->getScalarType() != IVTy) {
4200 ? Instruction::UIToFP
4201 : Instruction::Trunc;
4202 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4203 }
4204
4205 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4206 if (ScalarStep->getScalarType() != IVTy) {
4207 ScalarStep =
4208 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4209 }
4210
// Floating-point types multiply with FMul and the fast-math flags of the
// instruction (if any); other types use Mul with the default flags for Mul.
4211 VPIRFlags Flags;
4212 unsigned MulOpc;
4213 if (IVTy->isFloatingPointTy()) {
4214 MulOpc = Instruction::FMul;
4215 Flags = VPI->getFastMathFlagsOrNone();
4216 } else {
4217 MulOpc = Instruction::Mul;
4218 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4219 }
4220
4221 VPInstruction *Mul = Builder.createNaryOp(
4222 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4223 VectorStep = Mul;
4224 VPI->replaceAllUsesWith(VectorStep);
4225 VPI->eraseFromParent();
4226 }
4227 }
4228}
4229
4235
4236/// Update \p Plan to mask memory operations in the loop based on whether the
4237/// early exit is taken or not.
///
/// Returns true on success and false when one of the checks below fails.
/// Several of the failing checks run after \p Plan has already been modified,
/// so a false result leaves \p Plan partially transformed.
// NOTE(review): this listing omits original lines 4238-4239 (return type,
// function name and the leading parameters), 4262 (declaration of GEPs), 4273
// (the condition that identifies the condition load), 4288-4289 (presumably
// the declarations of Load and Predicates), 4295 (the start of the
// dereferenceability check), 4307 (the start of the match on each GEP) and
// 4356 (the start of the statement that creates Mask). Confirm them against
// the real file.
4240 VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB,
4241 Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT,
4242 AssumptionCache *AC, VPDominatorTree &VPDT) {
4243
4244 // Disconnect early exiting blocks from successors, remove branches. We
4245 // currently don't support multiple uses for recipes involved in creating
4246 // the uncountable exit condition.
4247 for (auto &Exit : Exits) {
// Exits taken from the latch itself are left connected.
4248 if (Exit.EarlyExitingVPBB == LatchVPBB)
4249 continue;
4250
4251 for (VPRecipeBase &R : Exit.EarlyExitVPBB->phis())
4252 cast<VPIRPhi>(&R)->removeIncomingValueFor(Exit.EarlyExitingVPBB);
4253 Exit.EarlyExitingVPBB->getTerminator()->eraseFromParent();
4254 VPBlockUtils::disconnectBlocks(Exit.EarlyExitingVPBB, Exit.EarlyExitVPBB);
4255 }
4256
// The CFG changed above, so the dominator tree is recomputed before it is
// queried below.
4257 VPDT.recalculate(Plan);
4258
4259 // We can abandon a VPlan entirely if we return false here, so we shouldn't
4260 // crash if some earlier assumptions on scalar IR don't hold for the vplan
4261 // version of the loop.
4263 SmallVector<VPInstruction *, 8> ConditionRecipes;
4264
4265 std::optional<VPValue *> Cond =
4266 vputils::getRecipesForUncountableExit(ConditionRecipes, GEPs, LatchVPBB);
4267 if (!Cond)
4268 return false;
4269
4270 // Find load contributing to condition.
4271 VPRecipeBase *CondLoad = nullptr;
4272 for (auto *Recipe : ConditionRecipes) {
4274 // TODO: Support more than one load. Needs legality updates too.
4275 assert(CondLoad == nullptr && "Too many condition loads");
4276 CondLoad = Recipe;
4277 }
4278 }
4279 assert(CondLoad && "Couldn't find load");
4280
4281 // Ensure that we are guaranteed to be able to dereference the memory used
4282 // for determining the uncountable exit for the maximum possible number of
4283 // scalar iterations of the loop.
4284 //
4285 // TODO: Support first-faulting loads in cases where we don't know whether
4286 // all possible addresses are dereferenceable.
4287 {
4290 VPValue *Ptr = Load->getOperand(0);
4291 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, TheLoop);
4292 const DataLayout &DL = Plan.getDataLayout();
// The element size is the store size of the loaded type, expressed in the
// index type width of the pointer.
4293 APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getScalarType()),
4294 DL.getTypeStoreSize(Load->getScalarType()).getFixedValue());
4296 PtrSCEV, cast<LoadInst>(Load->getUnderlyingInstr())->getAlign(),
4297 PSE.getSE()->getConstant(EltSize), TheLoop, *PSE.getSE(), DT, AC,
4298 &Predicates))
4299 return false;
4300 }
4301
4302 // Check GEPs to see if we can link them to a widen IV recipe with a step of
4303 // 1; we're only interested in contiguous accesses for the condition load
4304 // right now.
4305 for (auto *GEP : GEPs) {
4306 VPValue *MaybeIV = nullptr;
4308 m_LiveIn(), m_VPValue(MaybeIV))))
4309 return false;
4310
4311 auto *WIV = dyn_cast<VPWidenInductionRecipe>(MaybeIV);
4312 if (!WIV)
4313 return false;
4314
// Only inductions that start at 0 and step by 1 are accepted.
4315 if (!match(WIV->getStartValue(), m_SpecificInt(0)) ||
4316 !match(WIV->getStepValue(), m_SpecificInt(1)))
4317 return false;
4318 }
4319
4320 // Find an insertion point. Default to the end of the header but override
4321 // if we find a memory op that needs masking before the condition load.
4322 auto InsertIt = HeaderVPBB->end();
4323 VPRecipeBase *CondR = (*Cond)->getDefiningRecipe();
4324 bool CondMoveNeeded = CondR->getParent() != HeaderVPBB;
// Only the first memory-accessing recipe of the header other than the
// condition load is examined; the loop stops there.
4325 for (VPRecipeBase &R : *HeaderVPBB) {
4326 if (&R == CondLoad)
4327 continue;
4328
4329 if (R.mayReadOrWriteMemory()) {
4330 if (!VPDT.properlyDominates(CondR, &R)) {
4331 CondMoveNeeded = true;
4332 InsertIt = R.getIterator();
4333 }
4334 break;
4335 }
4336 }
4337
4338 // If another memory operation would take place before the comparison to
4339 // determine whether to exit early or the comparison doesn't take place in
4340 // the header, move the comparison (and supporting recipes).
4341 if (CondMoveNeeded)
4342 for (auto *Recipe : reverse(ConditionRecipes))
4343 Recipe->moveBefore(*HeaderVPBB, InsertIt);
4344
4345 // Create a mask to represent all lanes that fully execute in the vector loop,
4346 // stopping short of any early exit.
4347 VPBuilder MaskBuilder(HeaderVPBB, InsertIt);
4348 VPValue *FirstActive = MaskBuilder.createFirstActiveLane(*Cond);
// NOTE(review): this takes the first recipe of the header as the IV --
// confirm that the header always starts with it.
4349 VPValue *IV = cast<VPSingleDefRecipe>(&HeaderVPBB->front());
4350 Type *IVScalarTy = IV->getScalarType();
4351 Type *FirstActiveTy = FirstActive->getScalarType();
4352 VPValue *ALMMultiplier = Plan.getConstantInt(IVScalarTy, 1);
4353 VPValue *Zero = Plan.getZero(IVScalarTy);
4354 FirstActive = MaskBuilder.createScalarZExtOrTrunc(FirstActive, IVScalarTy,
4355 FirstActiveTy, DebugLoc());
4357 {Zero, FirstActive, ALMMultiplier},
4358 DebugLoc(), "uncountable.exit.mask");
4359
4360 // Convert all other memory operations to use the mask.
4361 for (VPBasicBlock *VPBB : vp_rpo_plain_cfg_loop_body(HeaderVPBB))
4362 for (VPRecipeBase &R : *VPBB)
4363 if (R.mayReadOrWriteMemory() && &R != CondLoad) {
4364 // TODO: Handle conditional memory operations in the loop.
4365 if (!VPDT.dominates(R.getParent(), LatchVPBB))
4366 return false;
// NOTE(review): the cast requires every such memory recipe to be a
// VPInstruction at this point.
4367 cast<VPInstruction>(&R)->addMask(Mask);
4368 }
4369
4370 // Update middle block branch to compare (IV + however many lanes were active)
4371 // against the full trip count, since we may be exiting the vector loop early.
4372 // If we didn't take an early exit, we should get the equivalent of VF from
4373 // the FirstActiveLane.
4374 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->end());
4375 VPValue *ScalarIV = MiddleBuilder.createNaryOp(VPInstruction::ExtractLane,
4376 {Zero, IV}, DebugLoc());
4377 VPValue *ExitIV = MiddleBuilder.createAdd(ScalarIV, FirstActive);
4378 VPValue *FullTC =
4379 MiddleBuilder.createICmp(CmpInst::ICMP_EQ, ExitIV, Plan.getTripCount());
4380 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {FullTC});
4381
4382 // Update resume phi in scalar.ph.
4383 VPBasicBlock *ScalarPH = Plan.getScalarPreheader();
4384 auto Phis = ScalarPH->phis();
4385 // TODO: Handle more than one Phi; re-derive from IV.
4386 // TODO: Handle reductions.
4387 if (range_size(Phis) != 1)
4388 return false;
// The single resume phi gets the exit IV as its first operand.
4389 VPPhi *ContinueIV = cast<VPPhi>(Phis.begin());
4390 ContinueIV->setOperand(0, ExitIV);
4391 return true;
4392}
4393
4395 VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB,
4396 VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE,
4398 VPDominatorTree VPDT(Plan);
4399 VPBuilder LatchBuilder(LatchVPBB->getTerminator());
4401 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4402 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4403 if (Pred == MiddleVPBB)
4404 continue;
4405 // Collect condition for this early exit.
4406 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4407 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4408 VPValue *CondOfEarlyExitingVPBB;
4409 [[maybe_unused]] bool Matched =
4410 match(EarlyExitingVPBB->getTerminator(),
4411 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4412 assert(Matched && "Terminator must be BranchOnCond");
4413
4414 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4415 // the correct block mask.
4416 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4417 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4419 TrueSucc == ExitBlock
4420 ? CondOfEarlyExitingVPBB
4421 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4422 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4423 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4424 VPDT.properlyDominates(
4425 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4426 LatchVPBB)) &&
4427 "exit condition must dominate the latch");
4428 Exits.push_back({
4429 EarlyExitingVPBB,
4430 ExitBlock,
4431 CondToEarlyExit,
4432 });
4433 }
4434 }
4435
4436 assert(!Exits.empty() && "must have at least one early exit");
4437 // Sort exits by RPO order to get correct program order. RPO gives a
4438 // topological ordering of the CFG, ensuring upstream exits are checked
4439 // before downstream exits in the dispatch chain.
4441 HeaderVPBB);
4443 for (const auto &[Num, VPB] : enumerate(RPOT))
4444 RPOIdx[VPB] = Num;
4445 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4446 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4447 });
4448#ifndef NDEBUG
4449 // After RPO sorting, verify that for any pair where one exit dominates
4450 // another, the dominating exit comes first. This is guaranteed by RPO
4451 // (topological order) and is required for the dispatch chain correctness.
4452 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4453 for (unsigned J = I + 1; J < Exits.size(); ++J)
4454 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4455 Exits[I].EarlyExitingVPBB) &&
4456 "RPO sort must place dominating exits before dominated ones");
4457#endif
4458
4459 // Build the AnyOf condition for the latch terminator using logical OR
4460 // to avoid poison propagation from later exit conditions when an earlier
4461 // exit is taken.
4462 VPValue *Combined = Exits[0].CondToExit;
4463 for (const EarlyExitInfo &Info : drop_begin(Exits))
4464 Combined = LatchBuilder.createLogicalOr(Combined, Info.CondToExit);
4465
4466 VPValue *IsAnyExitTaken =
4467 LatchBuilder.createNaryOp(VPInstruction::AnyOf, {Combined});
4468
4469 // Create a comparison for the latch exit condition and replace the
4470 // BranchOnCond with a BranchOnTwoConds. The original BranchOnCond's condition
4471 // is used as the latch-exit condition; canonical IV recipes have not been
4472 // introduced yet, so there is no BranchOnCount to derive the condition from.
4473 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4474 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCond &&
4475 "Unexpected terminator");
4476 VPValue *IsLatchExitTaken = LatchExitingBranch->getOperand(0);
4477 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4478 LatchExitingBranch->eraseFromParent();
4479 LatchBuilder.setInsertPoint(LatchVPBB);
4481 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4482 LatchVPBB->clearSuccessors();
4483
4485 // If handling the exiting lane in the scalar loop, combine the exit
4486 // conditions into a single BranchOnCond.
4487 LatchVPBB->setSuccessors({MiddleVPBB, MiddleVPBB, HeaderVPBB});
4488 MiddleVPBB->clearPredecessors();
4489 MiddleVPBB->setPredecessors({LatchVPBB, LatchVPBB});
4490 return handleUncountableExitsWithSideEffects(Plan, Exits, HeaderVPBB,
4491 LatchVPBB, MiddleVPBB, TheLoop,
4492 PSE, DT, AC, VPDT);
4493 }
4494
4495 // Create the vector.early.exit blocks.
4496 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4497 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4498 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4499 VPBasicBlock *VectorEarlyExitVPBB =
4500 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4501 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4502 }
4503
4504 // Create the dispatch block (or reuse the single exit block if only one
4505 // exit). The dispatch block computes the first active lane of the combined
4506 // condition and, for multiple exits, chains through conditions to determine
4507 // which exit to take.
4508 VPBasicBlock *DispatchVPBB =
4509 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4510 : Plan.createVPBasicBlock("vector.early.exit.check");
4511 DispatchVPBB->setPredecessors({LatchVPBB});
4512 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4513 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4514 VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(
4515 {Combined}, DebugLoc::getUnknown(), "first.active.lane");
4516
4517 // For each early exit, disconnect the original exiting block
4518 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4519 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4520 // values at the first active lane:
4521 //
4522 // Input:
4523 // early.exiting.I:
4524 // ...
4525 // EMIT branch-on-cond vp<%cond.I>
4526 // Successor(s): in.loop.succ, ir-bb<exit.I>
4527 //
4528 // ir-bb<exit.I>:
4529 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4530 //
4531 // Output:
4532 // early.exiting.I:
4533 // ...
4534 // Successor(s): in.loop.succ
4535 //
4536 // vector.early.exit.I:
4537 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4538 // Successor(s): ir-bb<exit.I>
4539 //
4540 // ir-bb<exit.I>:
4541 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4542 // vector.early.exit.I)
4543 //
4544 for (auto [Exit, VectorEarlyExitVPBB] :
4545 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4546 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4547 // Adjust the phi nodes in EarlyExitVPBB.
4548 // 1. remove incoming values from EarlyExitingVPBB,
4549 // 2. extract the incoming value at FirstActiveLane
4550 // 3. add back the extracts as last operands for the phis
4551 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4552 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4553 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4554 // values from VectorEarlyExitVPBB.
4555 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4556 auto *ExitIRI = cast<VPIRPhi>(&R);
4557 VPValue *IncomingVal =
4558 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4559 VPValue *NewIncoming = IncomingVal;
4560 if (!isa<VPIRValue>(IncomingVal)) {
4561 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4562 NewIncoming = EarlyExitBuilder.createNaryOp(
4563 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4564 DebugLoc::getUnknown(), "early.exit.value");
4565 }
4566 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4567 ExitIRI->addIncoming(NewIncoming);
4568 }
4569
4570 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4571 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4572 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4573 }
4574
4575 // Chain through exits: for each exit, check if its condition is true at
4576 // the first active lane. If so, take that exit; otherwise, try the next.
4577 // The last exit needs no check since it must be taken if all others fail.
4578 //
4579 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4580 //
4581 // latch:
4582 // ...
4583 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4584 // ...
4585 //
4586 // vector.early.exit.check:
4587 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4588 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4589 // EMIT branch-on-cond vp<%at.cond.0>
4590 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4591 //
4592 // vector.early.exit.check.0:
4593 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4594 // EMIT branch-on-cond vp<%at.cond.1>
4595 // Successor(s): vector.early.exit.1, vector.early.exit.2
4596 VPBasicBlock *CurrentBB = DispatchVPBB;
4597 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4598 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4599 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4600 DebugLoc::getUnknown(), "exit.cond.at.lane");
4601
4602 // For the last dispatch, branch directly to the last exit on false;
4603 // otherwise, create a new check block.
4604 bool IsLastDispatch = (I + 2 == Exits.size());
4605 VPBasicBlock *FalseBB =
4606 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4607 : Plan.createVPBasicBlock(
4608 Twine("vector.early.exit.check.") + Twine(I));
4609
4610 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4611 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4612 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4613 FalseBB->setPredecessors({CurrentBB});
4614
4615 CurrentBB = FalseBB;
4616 DispatchBuilder.setInsertPoint(CurrentBB);
4617 }
4618
4619 return true;
4620}
4621
// NOTE(review): this listing skips original lines 4627, 4638, 4641, 4643 and
// 4660 (the embedded numbering jumps over them). They must be restored from the
// upstream file before this block can compile.
4622/// This function tries to convert extended in-loop reductions to
4623/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4624/// valid. The created recipe must be decomposed to its constituent
4625/// recipes before execution.
/// \returns a newly allocated VPExpressionRecipe bundling the widened cast
/// that feeds the reduction together with \p Red, or nullptr if no bundle is
/// created.
4626static VPExpressionRecipe *
4628 VFRange &Range) {
4629 Type *RedTy = Red->getScalarType();
4630 VPValue *VecOp = Red->getVecOp();
4631
4632 assert(!Red->isPartialReduction() &&
4633 "This path does not support partial reductions");
4634
4635 // Clamp the range if using extended-reduction is profitable.
// The inner lambda is evaluated per VF; \p Range is passed to the call whose
// first line is elided (4638).
4636 auto IsExtendedRedValidAndClampRange =
4637 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4639 [&](ElementCount VF) {
4640 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4642
// Cost of the separate recipes: the extend (VecOp is cast to
// VPWidenCastRecipe unconditionally here) plus the reduction itself.
4644 InstructionCost ExtCost =
4645 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4646 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4647
4648 assert(!RedTy->isFloatingPointTy() &&
4649 "getExtendedReductionCost only supports integer types");
// The second argument tells TTI whether the extend is a zero-extend.
4650 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4651 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4652 Red->getFastMathFlagsOrNone(), CostKind);
// Profitable only if the combined cost is valid and strictly cheaper than
// the extend and the reduction costed separately.
4653 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4654 },
4655 Range);
4656 };
4657
4658 VPValue *A;
4659 // Match reduce(ext(A)).
// NOTE(review): the start of this condition is on the elided line 4660;
// presumably it matches VecOp as an extend of A - confirm against upstream.
4661 IsExtendedRedValidAndClampRange(
4662 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4663 cast<VPWidenCastRecipe>(VecOp)->getOpcode(), A->getScalarType()))
4664 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4665
4666 return nullptr;
4667}
4668
// NOTE(review): this listing skips original lines 4679, 4692, 4694, 4696 and
// 4757 (the embedded numbering jumps over them). They must be restored from the
// upstream file before this block can compile.
4669/// This function tries to convert extended in-loop reductions to
4670/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4671/// and valid. The created VPExpressionRecipe must be decomposed to its
4672/// constituent recipes before execution. Patterns of the
4673/// VPExpressionRecipe:
4674/// reduce.add(mul(...)),
4675/// reduce.add(mul(ext(A), ext(B))),
4676/// reduce.add(ext(mul(ext(A), ext(B)))).
4677/// reduce.fadd(fmul(ext(A), ext(B)))
/// \returns a newly allocated VPExpressionRecipe for the matched pattern, or
/// nullptr if nothing matched or the bundle was not considered profitable.
/// Matching may rewrite the plan: a constant multiplicand can be replaced by
/// a trunc+extend pair, and for the ext(mul(ext, ext)) form the outer extend
/// and the original multiply are erased and replaced by wider recipes.
4678static VPExpressionRecipe *
4680 VPCostContext &Ctx, VFRange &Range) {
// Only add, sub and fadd recurrences are considered.
4681 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4682 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4683 Opcode != Instruction::FAdd)
4684 return nullptr;
4685
4686 assert(!Red->isPartialReduction() &&
4687 "This path does not support partial reductions");
4688 Type *RedTy = Red->getScalarType();
4689
4690 // Clamp the range if using multiply-accumulate-reduction is profitable.
// Arguments (per the call sites below): the multiply, its two optional operand
// extends, and an optional extend of the multiply result.
4691 auto IsMulAccValidAndClampRange =
4693 VPWidenCastRecipe *OuterExt) -> bool {
4695 [&](ElementCount VF) {
// Without an operand extend, the source type is the reduction type itself.
4697 Type *SrcTy = Ext0 ? Ext0->getOperand(0)->getScalarType() : RedTy;
4698 InstructionCost MulAccCost;
4699
4700 // getMulAccReductionCost for in-loop reductions does not support
4701 // mixed or floating-point extends.
4702 if (Ext0 && Ext1 &&
4703 (Ext0->getOpcode() != Ext1->getOpcode() ||
4704 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4705 return false;
4706
// When there are no extends the query is made as if zero-extended.
4707 bool IsZExt =
4708 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4709 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4710 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4711 SrcVecTy, CostKind);
4712
// Sum the cost of every recipe the bundle would replace.
4713 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4714 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4715 InstructionCost ExtCost = 0;
4716 if (Ext0)
4717 ExtCost += Ext0->computeCost(VF, Ctx);
4718 if (Ext1)
4719 ExtCost += Ext1->computeCost(VF, Ctx);
4720 if (OuterExt)
4721 ExtCost += OuterExt->computeCost(VF, Ctx);
4722
// Profitable only if the fused cost is valid and strictly cheaper.
4723 return MulAccCost.isValid() &&
4724 MulAccCost < ExtCost + MulCost + RedCost;
4725 },
4726 Range);
4727 };
4728
4729 VPValue *VecOp = Red->getVecOp();
4730 VPRecipeBase *Sub = nullptr;
4731 VPValue *A, *B;
4732 VPValue *Tmp = nullptr;
4733
// NOTE(review): floating-point reductions bail out here, before any pattern
// is tried, so the reduce.fadd(fmul(...)) form listed in the header comment
// does not appear to be produced by the code visible in this function -
// confirm against upstream.
4734 if (RedTy->isFloatingPointTy())
4735 return nullptr;
4736
4737 // Sub reductions could have a sub between the add reduction and vec op.
// If VecOp is (0 - Tmp), remember the negation recipe and continue matching
// on Tmp.
4738 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4739 Sub = VecOp->getDefiningRecipe();
4740 VecOp = Tmp;
4741 }
4742
4743 // If ValB is a constant and can be safely extended, truncate it to the same
4744 // type as ExtA's operand, then extend it to the same type as ExtA. This
4745 // creates two uniform extends that can more easily be matched by the rest of
4746 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4747 // replaced with the new extend of the constant.
4748 auto ExtendAndReplaceConstantOp = [](VPWidenCastRecipe *ExtA,
4749 VPWidenCastRecipe *&ExtB, VPValue *&ValB,
4750 VPWidenRecipe *Mul) {
// Applies only when operand A is an extend, operand B is not, and B is an IR
// value.
4751 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4752 return;
4753 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
4754 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4755 const APInt *Const;
// NOTE(review): the callee name of this check is on the elided line 4757.
4756 if (!match(ValB, m_APInt(Const)) ||
4758 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4759 return;
4760 // The truncate ensures that the type of each extended operand is the
4761 // same, and it's been proven that the constant can be extended from
4762 // NarrowTy safely. Necessary since ExtA's extended operand would be
4763 // e.g. an i8, while the const will likely be an i32. This will be
4764 // elided by later optimisations.
// New recipes are inserted at the builder position derived from Mul.
4765 VPBuilder Builder(Mul);
4766 auto *Trunc =
4767 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4768 Type *WideTy = ExtA->getScalarType();
4769 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4770 Mul->setOperand(1, ExtB);
4771 };
4772
4773 // Try to match reduce.add(mul(...)).
4774 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4775 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4776 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4777 auto *Mul = cast<VPWidenRecipe>(VecOp);
4778
4779 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4780 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4781
4782 // Match reduce.add/sub(mul(ext, ext)).
4783 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4784 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4785 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
// The negation recipe, if one was peeled off above, is bundled as well.
4786 if (Sub)
4787 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4788 cast<VPWidenRecipe>(Sub), Red);
4789 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4790 }
4791 // TODO: Add an expression type for this variant with a negated mul
4792 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4793 return new VPExpressionRecipe(Mul, Red);
4794 }
4795 // TODO: Add an expression type for negated versions of other expression
4796 // variants.
4797 if (Sub)
4798 return nullptr;
4799
4800 // Match reduce.add(ext(mul(A, B))).
4801 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4802 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4803 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4804 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
4805 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
4806
4807 // reduce.add(ext(mul(ext, const)))
4808 // -> reduce.add(ext(mul(ext, ext(const))))
4809 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4810
4811 // reduce.add(ext(mul(ext(A), ext(B))))
4812 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4813 // The inner extends must either have the same opcode as the outer extend or
4814 // be the same, in which case the multiply can never result in a negative
4815 // value and the outer extend can be folded away by doing wider
4816 // extends for the operands of the mul.
// NOTE(review): IsMulAccValidAndClampRange is evaluated before
// Mul->hasOneUse(), so \p Range may presumably already be clamped when the
// one-use check rejects the transform - confirm this ordering is intended.
4817 if (Ext0 && Ext1 &&
4818 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4819 Ext0->getOpcode() == Ext1->getOpcode() &&
4820 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
// Re-create the operand extends so they extend directly to the outer
// extend's type; each new extend is placed right before the one it mirrors.
4821 auto *NewExt0 = new VPWidenCastRecipe(
4822 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getScalarType(), nullptr,
4823 *Ext0, *Ext0, Ext0->getDebugLoc());
4824 NewExt0->insertBefore(Ext0);
4825
// When both operands are the same extend, reuse the single new extend.
4826 VPWidenCastRecipe *NewExt1 = NewExt0;
4827 if (Ext0 != Ext1) {
4828 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4829 Ext->getScalarType(), nullptr, *Ext1,
4830 *Ext1, Ext1->getDebugLoc());
4831 NewExt1->insertBefore(Ext1);
4832 }
// Replace ext(mul) with the new wide multiply and delete the old recipes.
// The original Ext0/Ext1 recipes are left in place.
4833 auto *NewMul = Mul->cloneWithOperands({NewExt0, NewExt1});
4834 NewMul->insertBefore(Mul);
4835 Ext->replaceAllUsesWith(NewMul);
4836 Ext->eraseFromParent();
4837 Mul->eraseFromParent();
4838 return new VPExpressionRecipe(NewExt0, NewExt1, NewMul, Red);
4839 }
4840 }
4841 return nullptr;
4842}
4843
// NOTE(review): this listing skips original line 4846 (the embedded numbering
// jumps from 4845 to 4847); it presumably carries the return type, function
// name and first parameter and must be restored from the upstream file.
4844/// This function tries to create abstract recipes from the reduction recipe for
4845/// following optimizations and cost estimation.
/// A multiply-accumulate bundle is attempted first and an extended-reduction
/// bundle second. If either succeeds, the resulting VPExpressionRecipe is
/// inserted right after \p Red and takes over all of its uses; otherwise the
/// function returns without inserting anything.
4847 VPCostContext &Ctx,
4848 VFRange &Range) {
4849 // Creation of VPExpressions for partial reductions is entirely handled in
4850 // transformToPartialReduction.
4851 assert(!Red->isPartialReduction() &&
4852 "This path does not support partial reductions");
4853
4854 VPExpressionRecipe *AbstractR = nullptr;
// Record the position immediately after Red, and its block, before running
// the matchers below.
4855 auto IP = std::next(Red->getIterator());
4856 auto *VPBB = Red->getParent();
4857 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4858 AbstractR = MulAcc;
4859 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4860 AbstractR = ExtRed;
4861 // Cannot create abstract inloop reduction recipes.
4862 if (!AbstractR)
4863 return;
4864
4865 AbstractR->insertBefore(*VPBB, IP);
4866 Red->replaceAllUsesWith(AbstractR);
4867}
4868
4879
// Inserts an explicit VPInstruction::Broadcast for each candidate value (the
// backedge-taken count if present, the plan's live-ins, and the values defined
// by recipes in the plan's entry block) and redirects to it every user that
// does not use only the scalar value.
// NOTE(review): this listing skips original lines 4880 (presumably the function
// signature) and 4897 (the first half of the skip condition below); restore
// them from the upstream file before compiling.
4881 if (Plan.hasScalarVFOnly())
4882 return;
4883
// The dominator tree is only needed by the assertion further down.
4884#ifndef NDEBUG
4885 VPDominatorTree VPDT(Plan);
4886#endif
4887
4888 SmallVector<VPValue *> VPValues;
4889 if (VPValue *BTC = Plan.getBackedgeTakenCount())
4890 VPValues.push_back(BTC);
4891 append_range(VPValues, Plan.getLiveIns());
4892 for (VPRecipeBase &R : *Plan.getEntry())
4893 append_range(VPValues, R.definedValues());
4894
4895 auto *VectorPreheader = Plan.getVectorPreheader();
4896 for (VPValue *VPV : VPValues) {
// Constant IR live-ins are skipped; the other skip condition is on the
// elided line 4897.
4898 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4899 continue;
4900
4901 // Add explicit broadcast at the insert point that dominates all users.
// Default to the end of the vector preheader; if a non-scalar user lives in
// the preheader itself, insert at the beginning of the preheader instead.
4902 VPBasicBlock *HoistBlock = VectorPreheader;
4903 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4904 for (VPUser *User : VPV->users()) {
4905 if (User->usesScalars(VPV))
4906 continue;
4907 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4908 HoistPoint = HoistBlock->begin();
4909 else
4910 assert(VPDT.dominates(VectorPreheader,
4911 cast<VPRecipeBase>(User)->getParent()) &&
4912 "All users must be in the vector preheader or dominated by it");
4913 }
4914
4915 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4916 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
// Rewrite only users that are not scalar-only, and never the new Broadcast
// itself (it must keep VPV as its operand).
4917 VPV->replaceUsesWithIf(Broadcast,
4918 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4919 return Broadcast != &U && !U.usesScalars(VPV);
4920 });
4921 }
4922}
4923
4924// Collect common metadata from a group of replicate recipes by intersecting
4925// metadata from all recipes in the group.
// Starts from a copy of the first recipe's metadata and intersects it with
// each remaining recipe; Recipes.front() is dereferenced unconditionally, so
// the group must not be empty.
// NOTE(review): this listing skips original line 4926, which presumably holds
// the function signature; restore it from the upstream file.
4927 VPIRMetadata CommonMetadata = *Recipes.front();
4928 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4929 CommonMetadata.intersect(*Recipe);
4930 return CommonMetadata;
4931}
4932
// Collects groups of predicated replicate recipes (loads or stores, selected
// by the Opcode template argument) in which at least one pair of masks is
// complementary, and returns those groups.
// NOTE(review): this listing skips original lines 4934-4936 (most of the
// signature), 4943-4944, 4951 and 4961; the declarations of Groups, AllGroups
// and Group are on those lines. Restore them from the upstream file.
4933template <unsigned Opcode>
4937 const Loop *L) {
4938 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4939 "Only Load and Store opcodes supported");
4940 [[maybe_unused]] constexpr bool IsLoad = (Opcode == Instruction::Load);
4941
4942 // For each address, collect operations with the same or complementary masks.
// Only predicated replicate recipes pass the filter given to the (elided)
// grouping call.
4945 Plan, PSE, L,
4946 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
// Recipes is taken by value; entries are nulled out below as they are
// consumed.
4947 for (auto Recipes : Groups) {
4948 if (Recipes.size() < 2)
4949 continue;
4950
4952 map_range(Recipes, bind_back<getLoadStoreValueType>(IsLoad))) &&
4953 "Expected all recipes in group to have the same load-store type");
4954
4955 // Collect groups with the same or complementary masks.
4956 for (VPReplicateRecipe *&RecipeI : Recipes) {
4957 if (!RecipeI)
4958 continue;
4959
4960 VPValue *MaskI = RecipeI->getMask();
// Seed the new group with RecipeI and mark it consumed.
4962 Group.push_back(RecipeI);
4963 RecipeI = nullptr;
4964
4965 // Find all operations with the same or complementary masks.
// NOTE(review): as visible here, every remaining recipe is appended to the
// group without filtering on its mask; the mask is only used to record
// whether a complement of MaskI was seen - confirm this is intended.
4966 bool HasComplementaryMask = false;
4967 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4968 if (!RecipeJ)
4969 continue;
4970
4971 VPValue *MaskJ = RecipeJ->getMask();
4972 // Check if any operation in the group has a complementary mask with
4973 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4974 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4975 match(MaskJ, m_Not(m_Specific(MaskI)));
4976 Group.push_back(RecipeJ);
4977 RecipeJ = nullptr;
4978 }
4979
// Groups without any complementary pair are dropped.
4980 if (HasComplementaryMask) {
4981 assert(Group.size() >= 2 && "must have at least 2 entries");
4982 AllGroups.push_back(std::move(Group));
4983 }
4984 }
4985 }
4986
4987 return AllGroups;
4988}
4989
4990// Find the recipe with minimum alignment in the group.
// InstType is the IR instruction class that each recipe's underlying
// instruction is cast to in order to read its alignment (the callers in this
// file use LoadInst and StoreInst). The result of min_element is dereferenced
// unconditionally, so Group must not be empty.
// NOTE(review): this listing skips original line 4993, which presumably holds
// the function name and parameter list; restore it from the upstream file.
4991template <typename InstType>
4992static VPReplicateRecipe *
4994 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4995 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4996 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4997 });
4998}
4999
// For each collected group of predicated loads, replaces all loads of the
// group with a single unpredicated load inserted before the group's first
// member, provided the alias check between the first and last block succeeds.
// NOTE(review): this listing skips original lines 5000-5001 (start of the
// signature) and 5004 (the call that produces Groups); restore them from the
// upstream file before compiling.
5002 const Loop *L) {
5003 auto Groups =
5005 if (Groups.empty())
5006 return;
5007
5008 // Process each group of loads.
5009 for (auto &Group : Groups) {
5010 // Try to use the earliest (most dominating) load to replace all others.
// NOTE(review): assumes the group is ordered so that Group[0] is the earliest
// member and Group.back() the latest - confirm against the group collector.
5011 VPReplicateRecipe *EarliestLoad = Group[0];
5012 VPBasicBlock *FirstBB = EarliestLoad->getParent();
5013 VPBasicBlock *LastBB = Group.back()->getParent();
5014
5015 // Check that the load doesn't alias with stores between first and last.
// Groups whose memory location is unavailable are skipped as well.
5016 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
5017 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
5018 continue;
5019
5020 // Collect common metadata from all loads in the group.
5021 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5022
5023 // Find the load with minimum alignment to use.
// Its underlying instruction is the one attached to the merged load below.
5024 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
5025
5026 bool IsSingleScalar = EarliestLoad->isSingleScalar();
5027 assert(all_of(Group,
5028 [IsSingleScalar](VPReplicateRecipe *R) {
5029 return R->isSingleScalar() == IsSingleScalar;
5030 }) &&
5031 "all members in group must agree on IsSingleScalar");
5032
5033 // Create an unpredicated version of the earliest load with common
5034 // metadata.
// The address operand is taken from the earliest load (operand 0).
5035 auto *UnpredicatedLoad = new VPReplicateRecipe(
5036 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
5037 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
5038
5039 UnpredicatedLoad->insertBefore(EarliestLoad);
5040
5041 // Replace all loads in the group with the unpredicated load.
5042 for (VPReplicateRecipe *Load : Group) {
5043 Load->replaceAllUsesWith(UnpredicatedLoad);
5044 Load->eraseFromParent();
5045 }
5046 }
5047}
5048
// Returns true if the stores in StoresToSink can be sunk according to
// canHoistOrSinkWithNoAliasCheck, evaluated from the block of the first store
// to the block of the last one. Returns false when the first store's memory
// location is unavailable or carries no AATags.Scope.
// StoresToSink.front() is dereferenced unconditionally, so the list must not
// be empty.
// NOTE(review): this listing skips original line 5050, which presumably holds
// the function name and first parameter; restore it from the upstream file.
5049static bool
5051 PredicatedScalarEvolution &PSE, const Loop &L) {
5052 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
5053 if (!StoreLoc || !StoreLoc->AATags.Scope)
5054 return false;
5055
5056 // When sinking a group of stores, all members of the group alias each other.
5057 // Skip them during the alias checks.
5058 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
5059 StoresToSink.end());
5060
5061 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
5062 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
5063 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L);
5064 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
5065}
5066
// For each collected group of predicated stores that passes
// canSinkStoreWithNoAliasCheck, replaces the group with one unpredicated store
// at the position of the group's last store; the stored value is chosen by a
// chain of selects over the members' masks.
// NOTE(review): this listing skips original lines 5067-5068 (start of the
// signature) and 5071 (the call that produces Groups); restore them from the
// upstream file before compiling.
5069 const Loop *L) {
5070 auto Groups =
5072 if (Groups.empty())
5073 return;
5074
5075 for (auto &Group : Groups) {
5076 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L))
5077 continue;
5078
5079 // Use the last (most dominated) store's location for the unconditional
5080 // store.
5081 VPReplicateRecipe *LastStore = Group.back();
5082 VPBasicBlock *InsertBB = LastStore->getParent();
5083
5084 // Collect common alias metadata from all stores in the group.
5085 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5086
5087 // Build select chain for stored values.
// Start from the first store's value; each later member overrides it where
// that member's mask is true.
// NOTE(review): Group[0]'s own mask is never consulted; presumably this is
// valid because the group contains complementary masks - confirm.
5088 VPValue *SelectedValue = Group[0]->getOperand(0);
5089 VPBuilder Builder(InsertBB, LastStore->getIterator());
5090
5091 bool IsSingleScalar = Group[0]->isSingleScalar();
5092 for (unsigned I = 1; I < Group.size(); ++I) {
5093 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
5094 "all members in group must agree on IsSingleScalar");
5095 VPValue *Mask = Group[I]->getMask();
5096 VPValue *Value = Group[I]->getOperand(0);
5097 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
5098 Group[I]->getDebugLoc());
5099 }
5100
5101 // Find the store with minimum alignment to use.
5102 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
5103
5104 // Create unconditional store with selected value and common metadata.
// The address operand is taken from the last store (operand 1).
5105 auto *UnpredicatedStore = new VPReplicateRecipe(
5106 StoreWithMinAlign->getUnderlyingInstr(),
5107 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
5108 /*Mask=*/nullptr, *LastStore, CommonMetadata);
5109 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
5110
5111 // Remove all predicated stores from the group.
5112 for (VPReplicateRecipe *Store : Group)
5113 Store->eraseFromParent();
5114 }
5115}
5116
// If the plan's trip count is an IR value that SCEV sees as a constant, and the
// plan is in the simple shape checked below, computes the vector trip count as
// (TC udiv (BestVF * BestUF)) * (BestVF * BestUF) and, when that folds to a
// constant, stores it as the underlying value of the plan's vector trip count.
// NOTE(review): this listing skips original lines 5117 and 5119 (parts of the
// signature, including the declaration of PSE) and 5131 (one operand of the
// skip condition below); restore them from the upstream file.
5118 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
5120 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
5121 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
5122
// Nothing to do when the trip count is unused.
5123 VPValue *TC = Plan.getTripCount();
5124 if (TC->getNumUsers() == 0)
5125 return;
5126
5127 // Skip cases for which the trip count may be non-trivial to materialize.
5128 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
5129 // tail is required.
5130 if (!Plan.hasScalarTail() ||
5132 Plan.getScalarPreheader() ||
5133 !isa<VPIRValue>(TC))
5134 return;
5135
5136 // Materialize vector trip counts for constants early if it can simply
5137 // be computed as (Original TC / VF * UF) * VF * UF.
5138 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
5139 // tail-folded loops.
5140 ScalarEvolution &SE = *PSE.getSE();
5141 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
5142 if (!isa<SCEVConstant>(TCScev))
5143 return;
5144 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
5145 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
// Only record the result if SCEV folded the expression to a constant.
5146 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
5147 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5148}
5149
// Replaces all uses of BTC with an explicit "trip count - 1" subtraction
// emitted at the beginning of \p VectorPH. Does nothing when BTC has no users.
// NOTE(review): this listing skips original lines 5150 (start of the signature)
// and 5152 (the definition of BTC, presumably the plan's backedge-taken count);
// restore them from the upstream file before compiling.
5151 VPBasicBlock *VectorPH) {
5153 if (BTC->getNumUsers() == 0)
5154 return;
5155
5156 VPBuilder Builder(VectorPH, VectorPH->begin());
// The constant 1 is created in the trip count's scalar type.
5157 auto *TCTy = Plan.getTripCount()->getScalarType();
5158 auto *TCMO =
5159 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5160 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5161 BTC->replaceAllUsesWith(TCMO);
5162}
5163
// Makes scalar<->vector conversions explicit in two phases: first, a build
// vector VPInstruction is inserted after per-lane scalar definitions that have
// vector users (or users in another region); second, inside the loop region, a
// VPInstruction::Unpack is inserted for definitions that have scalar users
// outside replicate regions.
// NOTE(review): this listing skips original lines 5164 (presumably the function
// signature), 5170, 5181, 5191, 5198-5199, 5217 and 5226; several conditions
// below are therefore only partially visible. Restore them from the upstream
// file before compiling.
5165 if (Plan.hasScalarVFOnly())
5166 return;
5167
5168 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5169 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5171 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5172 vp_depth_first_shallow(LoopRegion->getEntry()));
5173 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5174 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5175 // regions. Those are not materialized explicitly yet.
5176 // TODO: materialize build vectors for replicating recipes in replicating
5177 // regions.
5178 for (VPBasicBlock *VPBB :
5179 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
// Early-increment iteration: new recipes are inserted while walking the block.
5180 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5182 continue;
5183 auto *DefR = cast<VPSingleDefRecipe>(&R);
// True for users that need the vector value, or that sit in a region other
// than the loop region.
5184 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5185 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5186 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5187 };
// Skip single-scalar replicate recipes, the VPInstructions excluded by the
// partially elided condition, and definitions without any qualifying user.
5188 if ((isa<VPReplicateRecipe>(DefR) &&
5189 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5190 (isa<VPInstruction>(DefR) &&
5192 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5193 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5194 continue;
5195
// The opcode depends on whether the scalar type is a struct; both
// alternatives are on the elided lines 5198-5199.
5196 Type *ScalarTy = DefR->getScalarType();
5197 unsigned Opcode = ScalarTy->isStructTy()
5200 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5201 BuildVector->insertAfter(DefR);
5202
// Redirect the qualifying users, but never the build vector itself.
5203 DefR->replaceUsesWithIf(
5204 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5205 VPUser &U, unsigned) {
5206 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5207 });
5208 }
5209 }
5210
5211 // Create explicit VPInstructions to convert vectors to scalars. The current
5212 // implementation is conservative - it may miss some cases that may or may not
5213 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5214 // if they are known to operate on scalar values.
5215 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5216 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5218 VPDerivedIVRecipe>(&R))
5219 continue;
5220 for (VPValue *Def : R.definedValues()) {
5221 // Skip recipes that are single-scalar or only have their first lane
5222 // used.
5223 // TODO: The Defs skipped here may or may not be vector values.
5224 // Introduce Unpacks, and remove them later, if they are guaranteed to
5225 // produce scalar values.
5227 continue;
5228
5229 // At the moment, we create unpacks only for scalar users outside
5230 // replicate regions. Recipes inside replicate regions still extract the
5231 // required lanes implicitly.
5232 // TODO: Remove once replicate regions are unrolled completely.
5233 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5234 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5235 return U->usesScalars(Def) &&
5236 (!ParentRegion || !ParentRegion->isReplicator());
5237 };
5238 if (none_of(Def->users(), IsCandidateUnpackUser))
5239 continue;
5240
// For a phi, the unpack is placed at the block's first non-phi position;
// otherwise it goes directly after the defining recipe.
5241 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
5242 if (R.isPhi())
5243 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5244 else
5245 Unpack->insertAfter(&R);
5246 Def->replaceUsesWithIf(Unpack,
5247 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5248 return IsCandidateUnpackUser(&U);
5249 });
5250 }
5251 }
5252 }
5253}
5254
// Emits the computation of the vector trip count into \p VectorPHVPBB and
// replaces all uses of the plan's symbolic vector trip count with it.
//  - TailByMasking: round the trip count up to a multiple of \p Step first.
//  - RequiresScalarEpilogue: make the remainder equal to \p Step when
//    \p Step evenly divides the trip count, so a remainder is always left.
//  - MaxRuntimeStep: if set, and the trip count is a constant divisible by it
//    (and no scalar epilogue is required), the trip count is used directly.
// NOTE(review): this listing skips original line 5255, which presumably holds
// the return type and function name; restore it from the upstream file.
5256 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
5257 bool RequiresScalarEpilogue, VPValue *Step,
5258 std::optional<uint64_t> MaxRuntimeStep) {
5259 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5260 // There's nothing to do if there are no users of the vector trip count or its
5261 // IR value has already been set.
5262 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5263 return;
5264
5265 VPValue *TC = Plan.getTripCount();
5266 Type *TCTy = TC->getScalarType();
5267 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5268 if (auto *StepR = Step->getDefiningRecipe()) {
// The dominator tree is built only inside the assertion.
5269 assert(VPDominatorTree(Plan).dominates(StepR->getParent(), VectorPHVPBB) &&
5270 "Step VPBB must dominate VectorPHVPBB");
5271 // Insert after Step's definition to maintain valid def-use ordering.
5272 InsertPt = std::next(StepR->getIterator());
5273 }
5274 VPBuilder Builder(VectorPHVPBB, InsertPt);
5275
5276 // For scalable steps, if TC is a constant and is divisible by the maximum
5277 // possible runtime step, then TC % Step == 0 for all valid vscale values
5278 // and the vector trip count equals TC directly.
5279 const APInt *TCVal;
5280 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
5281 TCVal->urem(*MaxRuntimeStep) == 0) {
5282 VectorTC.replaceAllUsesWith(TC);
5283 return;
5284 }
5285
5286 // If the tail is to be folded by masking, round the number of iterations N
5287 // up to a multiple of Step instead of rounding down. This is done by first
5288 // adding Step-1 and then rounding down. Note that it's ok if this addition
5289 // overflows: the vector induction variable will eventually wrap to zero given
5290 // that it starts at zero and its Step is a power of two; the loop will then
5291 // exit, with the last early-exit vector comparison also producing all-true.
5292 if (TailByMasking) {
5293 TC = Builder.createAdd(
5294 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5295 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5296 }
5297
5298 // Now we need to generate the expression for the part of the loop that the
5299 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5300 // iterations are not required for correctness, or N - Step, otherwise. Step
5301 // is equal to the vectorization factor (number of SIMD elements) times the
5302 // unroll factor (number of SIMD instructions).
5303 VPValue *R =
5304 Builder.createNaryOp(Instruction::URem, {TC, Step},
5305 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5306
5307 // There are cases where we *must* run at least one iteration in the remainder
5308 // loop. See the cost model for when this can happen. If the step evenly
5309 // divides the trip count, we set the remainder to be equal to the step. If
5310 // the step does not evenly divide the trip count, no adjustment is necessary
5311 // since there will already be scalar iterations. Note that the minimum
5312 // iterations check ensures that N >= Step.
5313 if (RequiresScalarEpilogue) {
// NOTE(review): "fail folding" in the message below is presumably a typo for
// "tail folding"; it is left untouched here because it is a runtime string.
5314 assert(!TailByMasking &&
5315 "requiring scalar epilogue is not supported with fail folding");
5316 VPValue *IsZero =
5317 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5318 R = Builder.createSelect(IsZero, Step, R);
5319 }
5320
// n.vec = (possibly rounded-up) trip count minus the remainder.
5321 VPValue *Res =
5322 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5323 VectorTC.replaceAllUsesWith(Res);
5324}
5325
5327 ElementCount VFEC) {
5328 // If VF and VFxUF have already been materialized (no remaining users),
5329 // there's nothing more to do.
5330 if (Plan.getVF().isMaterialized()) {
5331 assert(Plan.getVFxUF().isMaterialized() &&
5332 "VF and VFxUF must be materialized together");
5333 return;
5334 }
5335
5336 VPBuilder Builder(VectorPH, VectorPH->begin());
5337 Type *TCTy = Plan.getTripCount()->getScalarType();
5338 VPValue &VF = Plan.getVF();
5339 VPValue &VFxUF = Plan.getVFxUF();
5340 // If there are no users of the runtime VF, compute VFxUF by constant folding
5341 // the multiplication of VF and UF.
5342 if (VF.getNumUsers() == 0) {
5343 VPValue *RuntimeVFxUF =
5344 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5345 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5346 return;
5347 }
5348
5349 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5350 // vscale) * UF.
5351 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
5353 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5355 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5356 }
5357 VF.replaceAllUsesWith(RuntimeVF);
5358
5359 VPValue *MulByUF = Builder.createOverflowingOp(
5360 Instruction::Mul,
5361 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5362 {true, false});
5363 VFxUF.replaceAllUsesWith(MulByUF);
5364}
5365
5367 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
5368 auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
5369 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5370
5371 VPBuilder Builder(Plan.getVectorPreheader());
5372 auto *AliasMask = Builder.createNaryOp(
5373 VPInstruction::IncomingAliasMask, {}, nullptr, {}, {},
5374 DebugLoc::getUnknown(), "incoming.alias.mask", I1Ty);
5375
5376 if (HeaderMaskDef->isPhi())
5377 Builder = VPBuilder(&*HeaderMaskDef->getParent()->getFirstNonPhi());
5378 else
5379 Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);
5380
5381 // Update all existing users of the header mask to "HeaderMask & AliasMask".
5382 auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
5383 HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
5384 return &U != ClampedHeaderMask;
5385 });
5386}
5387
5388VPValue *
5390 ArrayRef<PointerDiffInfo> DiffChecks) {
5391 VPBuilder Builder(AliasCheckVPBB);
5392 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5393
5394 VPValue *IncomingAliasMask = vputils::findIncomingAliasMask(Plan);
5395 assert(IncomingAliasMask && "Expected an alias mask!");
5396
5397 VPValue *AliasMask = nullptr;
5398 for (const PointerDiffInfo &Check : DiffChecks) {
5400 VPValue *Sink =
5402 Type *AddrType = Src->getScalarType();
5403
5404 // TODO: Only freeze the required pointer (not both src and sink).
5405 if (Check.NeedsFreeze) {
5406 Src = Builder.createScalarFreeze(Src, AddrType, DebugLoc::getUnknown());
5407 Sink = Builder.createScalarFreeze(Sink, AddrType, DebugLoc::getUnknown());
5408 }
5409
5410 // TODO: Generate loop_dependence_raw_mask when there's a read-after-write
5411 // dependency between the source and the sink. This is not necessary for
5412 // correctness of the mask, but using the "raw" variant prevents loads
5413 // depending on the completion of stores.
5414 VPWidenIntrinsicRecipe *WARMask = Builder.insert(new VPWidenIntrinsicRecipe(
5415 Intrinsic::loop_dependence_war_mask,
5416 {Src, Sink, Plan.getConstantInt(AddrType, Check.AccessSize)}, I1Ty));
5417
5418 if (AliasMask)
5419 AliasMask = Builder.createAnd(AliasMask, WARMask);
5420 else
5421 AliasMask = WARMask;
5422 }
5423
5425 Type *IndexTy = Plan.getDataLayout().getIndexType(Plan.getContext(), 0);
5426 VPValue *NumActive = Builder.createNaryOp(
5427 VPInstruction::NumActiveLanes, {AliasMask}, nullptr, {}, {},
5428 DebugLoc::getUnknown(), "num.active.lanes", IndexTy);
5429 VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
5430 NumActive, IVTy, IndexTy, DebugLoc::getCompilerGenerated());
5431
5432 IncomingAliasMask->replaceAllUsesWith(AliasMask);
5433
5434 return ClampedVF;
5435}
5436
5438 VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
5439 VPBasicBlock *ClampedVFCheck =
5440 Plan.createVPBasicBlock("vector.clamped.vf.check");
5441
5442 VPValue *ClampedVF = materializeAliasMask(Plan, ClampedVFCheck, DiffChecks);
5443 VPBuilder Builder(ClampedVFCheck);
5445 Type *TCTy = Plan.getTripCount()->getScalarType();
5446
5447 // Check the "ClampedVF" from the alias mask is larger than one.
5448 VPValue *IsScalar =
5449 Builder.createICmp(CmpInst::ICMP_ULE, ClampedVF,
5450 Plan.getConstantInt(TCTy, 1), DL, "vf.is.scalar");
5451
5452 VPValue *TripCount = Plan.getTripCount();
5453 VPValue *MaxUIntTripCount =
5455 VPValue *DistanceToMax = Builder.createSub(MaxUIntTripCount, TripCount);
5456
5457 // For tail-folding: Don't execute the vector loop if (UMax - n) < ClampedVF.
5458 // Note: The ClampedVF may not be a power-of-two. This means the loop exit
5459 // condition (index.next == n.vec) may not be correct in the case of an
5460 // overflow. The issue is `n.vec` could be zero due to an overflow, but
5461 // index.next is not guaranteed to overflow to zero as the ClampedVF is not a
5462 // power-of-two.
5463 VPValue *TripCountCheck = Builder.createICmp(
5464 ICmpInst::ICMP_ULT, DistanceToMax, ClampedVF, DL, "vf.step.overflow");
5465
5466 VPValue *Cond = Builder.createOr(IsScalar, TripCountCheck, DL);
5467 attachVPCheckBlock(Plan, Cond, ClampedVFCheck, HasBranchWeights);
5468
5469 // Materialize the trip count early as this will add a use of (VFxUF) that
5470 // needs to be replaced with the ClampedVF.
5472 /*TailByMasking=*/true,
5473 /*RequiresScalarEpilogue=*/false,
5474 &Plan.getVFxUF());
5475
5476 assert(Plan.getConcreteUF() == 1 &&
5477 "Clamped VF not supported with interleaving");
5478 Plan.getVF().replaceAllUsesWith(ClampedVF);
5479 Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
5480}
5481
5483 ScalarEvolution &SE) {
5484 auto *Entry = Plan.getEntry();
5485 VPBuilder Builder(Entry, Entry->begin());
5487 ->getIRBasicBlock()
5488 ->getTerminator()
5489 ->getDebugLoc();
5490 VPSCEVExpander Expander(Builder, SE, DL);
5491
5492 // Expand VPExpandSCEVRecipes to VPInstructions using VPSCEVExpander. During
5493 // the transition, unsupported VPExpandSCEVRecipes are skipped and left for
5494 // late expansion.
5495 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5496 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5497 if (!ExpSCEV || ExpSCEV->getNumUsers() == 0)
5498 continue;
5499 Builder.setInsertPoint(ExpSCEV);
5500 VPValue *Expanded = Expander.tryToExpand(ExpSCEV->getSCEV());
5501 if (!Expanded)
5502 continue;
5503 ExpSCEV->replaceAllUsesWith(Expanded);
5504 // TripCount should not be used after expansion to VPInstructions. Reset to
5505 // poison to avoid dangling references.
5506 if (Plan.getTripCount() == ExpSCEV)
5507 Plan.resetTripCount(
5508 Plan.getOrAddLiveIn(PoisonValue::get(ExpSCEV->getScalarType())));
5509 ExpSCEV->eraseFromParent();
5510 }
5511}
5512
5515 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5516
5517 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5518 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5519 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5520 // Expand remaining VPExpandSCEVRecipes to IR instructions using SCEVExpander.
5521 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5522 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5523 if (!ExpSCEV)
5524 continue;
5525 const SCEV *Expr = ExpSCEV->getSCEV();
5526 Value *Res =
5527 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5528 ExpandedSCEVs[Expr] = Res;
5529 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5530 ExpSCEV->replaceAllUsesWith(Exp);
5531 if (Plan.getTripCount() == ExpSCEV)
5532 Plan.resetTripCount(Exp);
5533 ExpSCEV->eraseFromParent();
5534 }
5536 "all VPExpandSCEVRecipes must have been expanded");
5537 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5538 // to the VPIRBasicBlock.
5539 auto EI = Entry->begin();
5540 for (Instruction &I : drop_end(*EntryBB)) {
5541 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5542 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5543 EI++;
5544 continue;
5545 }
5547 }
5548
5549 return ExpandedSCEVs;
5550}
5551
5552/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5553/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5554/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5555/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5556/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5557/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5558/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5559/// is defined at \p Idx of a load interleave group.
5560static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5561 VPValue *OpV, unsigned Idx, bool IsScalable) {
5562 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5563 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5564 if (!Member0OpR)
5565 return Member0Op == OpV;
5566 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5567 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5568 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5569 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5570 Member0Op == OpV;
5571 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5572 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5573 return false;
5574}
5575
/// Returns true if the values in \p Ops (one per interleave-group member) can
/// be narrowed together. Every value must be a VPSingleDefRecipe reporting the
/// same getOpcodeOrIntrinsicID as the first one, and for every operand
/// position of the first recipe the per-member operands must either pass
/// canNarrowOps recursively or each pass canNarrowLoad. \p IsScalable is
/// forwarded unchanged to both checks. \p Ops must be non-empty, since Ops[0]
/// is read unconditionally.
5576static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
// The first member serves as the reference all other members are compared to.
5578 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5579 if (!WideMember0)
5580 return false;
5581 for (VPValue *V : Ops) {
// NOTE(review): the condition guarding this early exit sits on a line that is
// missing from this listing; confirm against the original file.
5583 return false;
5584 auto *R = cast<VPSingleDefRecipe>(V);
// All members must perform the same operation as the reference member.
5585 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5586 return false;
5587 }
5588
// Check each operand position across all members.
5589 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
// Gather operand Idx of every member's defining recipe into OpsI.
// NOTE(review): the declaration of OpsI sits on a line that is missing from
// this listing. This also relies on every member having an operand at Idx --
// presumably implied by the matching-operation check above; TODO confirm.
5591 for (VPValue *Op : Ops)
5592 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5593
// The operands at this position are themselves a narrowable group.
5594 if (canNarrowOps(OpsI, IsScalable))
5595 continue;
5596
// Otherwise every operand must be a narrowable load. Mind the argument order:
// the operand position (Idx here) is canNarrowLoad's OpIdx parameter, and the
// member position produced by enumerate (OpIdx here) is its Idx parameter.
5597 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5598 const auto &[OpIdx, OpV] = P;
5599 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5600 }))
5601 return false;
5602 }
5603
5604 return true;
5605}
5606
5607/// Returns VF from \p VFs if \p InterleaveR is a full interleave group with
5608/// factor and number of members both equal to VF. The interleave group must
5609/// also access the full vector width.
5610static std::optional<ElementCount>
5613 const TargetTransformInfo &TTI) {
5614 if (!InterleaveR || InterleaveR->getMask())
5615 return std::nullopt;
5616
5617 Type *GroupElementTy = nullptr;
5618 if (InterleaveR->getStoredValues().empty()) {
5619 GroupElementTy = InterleaveR->getVPValue(0)->getScalarType();
5620 if (!all_of(InterleaveR->definedValues(), [GroupElementTy](VPValue *Op) {
5621 return Op->getScalarType() == GroupElementTy;
5622 }))
5623 return std::nullopt;
5624 } else {
5625 GroupElementTy = InterleaveR->getStoredValues()[0]->getScalarType();
5626 if (!all_of(InterleaveR->getStoredValues(), [GroupElementTy](VPValue *Op) {
5627 return Op->getScalarType() == GroupElementTy;
5628 }))
5629 return std::nullopt;
5630 }
5631
5632 auto IG = InterleaveR->getInterleaveGroup();
5633 if (IG->getFactor() != IG->getNumMembers())
5634 return std::nullopt;
5635
5636 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5637 TypeSize Size = TTI.getRegisterBitWidth(
5640 assert(Size.isScalable() == VF.isScalable() &&
5641 "if Size is scalable, VF must be scalable and vice versa");
5642 return Size.getKnownMinValue();
5643 };
5644
5645 for (ElementCount VF : VFs) {
5646 unsigned MinVal = VF.getKnownMinValue();
5647 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5648 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5649 return {VF};
5650 }
5651 return std::nullopt;
5652}
5653
5654/// Returns true if \p VPValue is a narrow VPValue.
5655static bool isAlreadyNarrow(VPValue *VPV) {
5656 if (isa<VPIRValue>(VPV))
5657 return true;
5658 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5659 return RepR && RepR->isSingleScalar();
5660}
5661
5662// Convert the wide recipes defining the VPValues in \p Members feeding an
5663// interleave group to a single narrow variant. The first member is reused as
5664// the narrowed recipe.
5665static VPValue *
5667 SmallPtrSetImpl<VPValue *> &NarrowedOps) {
5668 VPValue *V = Members.front();
5669 auto *R = V->getDefiningRecipe();
5670 if (!R || NarrowedOps.contains(V))
5671 return V;
5672
5673 if (isAlreadyNarrow(V))
5674 return V;
5675
5677 auto *WideMember0 = cast<VPRecipeWithIRFlags>(R);
5678 for (VPValue *Member : Members.drop_front())
5679 WideMember0->intersectFlags(*cast<VPRecipeWithIRFlags>(Member));
5680 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) {
5682 for (VPValue *Member : Members)
5683 OpsI.push_back(Member->getDefiningRecipe()->getOperand(Idx));
5684 WideMember0->setOperand(Idx, narrowInterleaveGroupOp(OpsI, NarrowedOps));
5685 }
5686 return V;
5687 }
5688
5689 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5690 // Narrow interleave group to wide load, as transformed VPlan will only
5691 // process one original iteration.
5692 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5693 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5694 LoadGroup->getMask(), /*Consecutive=*/true,
5695 *LoadGroup, LoadGroup->getDebugLoc());
5696 L->insertBefore(LoadGroup);
5697 NarrowedOps.insert(L);
5698 return L;
5699 }
5700
5701 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5702 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5703 "must be a single scalar load");
5704 NarrowedOps.insert(RepR);
5705 return RepR;
5706 }
5707
5708 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5709 VPValue *PtrOp = WideLoad->getAddr();
5710 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5711 PtrOp = VecPtr->getOperand(0);
5712 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5713 // process one original iteration.
5714 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5715 /*IsUniform*/ true,
5716 /*Mask*/ nullptr, {}, *WideLoad);
5717 N->insertBefore(WideLoad);
5718 NarrowedOps.insert(N);
5719 return N;
5720}
5721
5722std::unique_ptr<VPlan>
5724 const TargetTransformInfo &TTI) {
5725 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5726
5727 if (!VectorLoop)
5728 return nullptr;
5729
5730 // Only handle single-block loops for now.
5731 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5732 return nullptr;
5733
5734 // Skip plans when we may not be able to properly narrow.
5735 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5736 if (!match(&Exiting->back(), m_BranchOnCount()))
5737 return nullptr;
5738
5739 assert(match(&Exiting->back(),
5741 m_Specific(&Plan.getVectorTripCount()))) &&
5742 "unexpected branch-on-count");
5743
5745 std::optional<ElementCount> VFToOptimize;
5746 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5749 continue;
5750
5751 // Bail out on recipes not supported at the moment:
5752 // * phi recipes other than the canonical induction
5753 // * recipes writing to memory except interleave groups
5754 // Only support plans with a canonical induction phi.
5755 if (R.isPhi())
5756 return nullptr;
5757
5758 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5759 if (R.mayWriteToMemory() && !InterleaveR)
5760 return nullptr;
5761
5762 // Bail out if any recipe defines a vector value used outside the
5763 // vector loop region.
5764 if (any_of(R.definedValues(), [&](VPValue *V) {
5765 return any_of(V->users(), [&](VPUser *U) {
5766 auto *UR = cast<VPRecipeBase>(U);
5767 return UR->getParent()->getParent() != VectorLoop;
5768 });
5769 }))
5770 return nullptr;
5771
5772 // All other ops are allowed, but we reject uses that cannot be converted
5773 // when checking all allowed consumers (store interleave groups) below.
5774 if (!InterleaveR)
5775 continue;
5776
5777 // Try to find a single VF, where all interleave groups are consecutive and
5778 // saturate the full vector width. If we already have a candidate VF, check
5779 // if it is applicable for the current InterleaveR, otherwise look for a
5780 // suitable VF across the Plan's VFs.
5782 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5783 : to_vector(Plan.vectorFactors());
5784 std::optional<ElementCount> NarrowedVF =
5785 isConsecutiveInterleaveGroup(InterleaveR, VFs, TTI);
5786 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5787 return nullptr;
5788 VFToOptimize = NarrowedVF;
5789
5790 // Skip read interleave groups.
5791 if (InterleaveR->getStoredValues().empty())
5792 continue;
5793
5794 // Narrow interleave groups, if all operands are already matching narrow
5795 // ops.
5796 auto *Member0 = InterleaveR->getStoredValues()[0];
5797 if (isAlreadyNarrow(Member0) &&
5798 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5799 StoreGroups.push_back(InterleaveR);
5800 continue;
5801 }
5802
5803 // For now, we only support full interleave groups storing load interleave
5804 // groups.
5805 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5806 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5807 if (!DefR)
5808 return false;
5809 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5810 return IR && IR->getInterleaveGroup()->isFull() &&
5811 IR->getVPValue(Op.index()) == Op.value();
5812 })) {
5813 StoreGroups.push_back(InterleaveR);
5814 continue;
5815 }
5816
5817 // Check if all values feeding InterleaveR are matching wide recipes whose
5818 // operands can be narrowed.
5819 if (!canNarrowOps(InterleaveR->getStoredValues(),
5820 VFToOptimize->isScalable()))
5821 return nullptr;
5822 StoreGroups.push_back(InterleaveR);
5823 }
5824
5825 if (StoreGroups.empty())
5826 return nullptr;
5827
5828 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5829 bool RequiresScalarEpilogue =
5830 MiddleVPBB->getNumSuccessors() == 1 &&
5831 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5832 // Bail out for tail-folding (middle block with a single successor to exit).
5833 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5834 return nullptr;
5835
5836 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5837 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5838 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5839 // TODO: Handle cases where only some interleave groups can be narrowed.
5840 std::unique_ptr<VPlan> NewPlan;
5841 if (size(Plan.vectorFactors()) != 1) {
5842 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5843 Plan.setVF(*VFToOptimize);
5844 NewPlan->removeVF(*VFToOptimize);
5845 }
5846
5847 // Values narrowed so far; lets shared operands be narrowed only once.
5848 SmallPtrSet<VPValue *, 4> NarrowedOps;
5849 // Narrow operation tree rooted at store groups.
5850 for (auto *StoreGroup : StoreGroups) {
5851 VPValue *Res =
5852 narrowInterleaveGroupOp(StoreGroup->getStoredValues(), NarrowedOps);
5853 auto *SI =
5854 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5855 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
5856 /*Consecutive=*/true, *StoreGroup,
5857 StoreGroup->getDebugLoc());
5858 S->insertBefore(StoreGroup);
5859 StoreGroup->eraseFromParent();
5860 }
5861
5862 // Adjust induction to reflect that the transformed plan only processes one
5863 // original iteration.
5865 Type *CanIVTy = VectorLoop->getCanonicalIVType();
5866 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5867 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5868
5869 VPValue *UF = &Plan.getUF();
5870 VPValue *Step;
5871 if (VFToOptimize->isScalable()) {
5872 VPValue *VScale =
5873 PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
5874 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5875 {true, false});
5876 Plan.getVF().replaceAllUsesWith(VScale);
5877 } else {
5878 Step = UF;
5879 Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
5880 }
5881 // Materialize vector trip count with the narrowed step.
5882 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5883 RequiresScalarEpilogue, Step);
5884
5885 CanIVInc->setOperand(1, Step);
5886 Plan.getVFxUF().replaceAllUsesWith(Step);
5887
5888 removeDeadRecipes(Plan);
5889 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5891 "All VPVectorPointerRecipes should have been removed");
5892 return NewPlan;
5893}
5894
5895/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5896/// BranchOnCond recipe.
5898 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5899 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5900 auto *MiddleTerm =
5902 // Only add branch metadata if there is a (conditional) terminator.
5903 if (!MiddleTerm)
5904 return;
5905
5906 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5907 "must have a BranchOnCond");
5908 // Assume that `TripCount % VectorStep` is equally distributed.
5909 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5910 if (VF.isScalable() && VScaleForTuning.has_value())
5911 VectorStep *= *VScaleForTuning;
5912 assert(VectorStep > 0 && "trip count should not be zero");
5913 MDBuilder MDB(Plan.getContext());
5914 MDNode *BranchWeights =
5915 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5916 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5917}
5918
5920 VFRange &Range) {
5921 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5922 auto *MiddleVPBB = Plan.getMiddleBlock();
5923 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5924
5925 auto IsScalableOne = [](ElementCount VF) -> bool {
5926 return VF == ElementCount::getScalable(1);
5927 };
5928
5929 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5930 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5931 if (!FOR)
5932 continue;
5933
5934 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5935 "Cannot handle loops with uncountable early exits");
5936
5937 // Find the existing splice for this FOR, created in
5938 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
5939 // RecurSplice there; only RecurSplice itself still references FOR.
5940 auto *RecurSplice =
5942 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
5943
5944 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5945 // penultimate value of the recurrence. Instead we rely on the existing
5946 // extract of the last element from the result of
5947 // VPInstruction::FirstOrderRecurrenceSplice.
5948 // TODO: Consider vscale_range info and UF.
5949 if (any_of(RecurSplice->users(),
5950 [](VPUser *U) { return !cast<VPRecipeBase>(U)->getRegion(); }) &&
5952 Range))
5953 return;
5954
5955 // This is the second phase of vectorizing first-order recurrences, creating
5956 // extracts for users outside the loop. An overview of the transformation is
5957 // described below. Suppose we have the following loop with some use after
5958 // the loop of the last a[i-1],
5959 //
5960 // for (int i = 0; i < n; ++i) {
5961 // t = a[i - 1];
5962 // b[i] = a[i] - t;
5963 // }
5964 // use t;
5965 //
5966 // There is a first-order recurrence on "a". For this loop, the shorthand
5967 // scalar IR looks like:
5968 //
5969 // scalar.ph:
5970 // s.init = a[-1]
5971 // br scalar.body
5972 //
5973 // scalar.body:
5974 // i = phi [0, scalar.ph], [i+1, scalar.body]
5975 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5976 // s2 = a[i]
5977 // b[i] = s2 - s1
5978 // br cond, scalar.body, exit.block
5979 //
5980 // exit.block:
5981 // use = lcssa.phi [s1, scalar.body]
5982 //
5983 // In this example, s1 is a recurrence because its value depends on the
5984 // previous iteration. In the first phase of vectorization, we created a
5985 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5986 // for users in the scalar preheader and exit block.
5987 //
5988 // vector.ph:
5989 // v_init = vector(..., ..., ..., a[-1])
5990 // br vector.body
5991 //
5992 // vector.body
5993 // i = phi [0, vector.ph], [i+4, vector.body]
5994 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5995 // v2 = a[i, i+1, i+2, i+3]
5996 // v1' = splice(v1(3), v2(0, 1, 2))
5997 // b[i, i+1, i+2, i+3] = v2 - v1'
5998 // br cond, vector.body, middle.block
5999 //
6000 // middle.block:
6001 // vector.recur.extract.for.phi = v2(2)
6002 // vector.recur.extract = v2(3)
6003 // br cond, scalar.ph, exit.block
6004 //
6005 // scalar.ph:
6006 // scalar.recur.init = phi [vector.recur.extract, middle.block],
6007 // [s.init, otherwise]
6008 // br scalar.body
6009 //
6010 // scalar.body:
6011 // i = phi [0, scalar.ph], [i+1, scalar.body]
6012 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
6013 // s2 = a[i]
6014 // b[i] = s2 - s1
6015 // br cond, scalar.body, exit.block
6016 //
6017 // exit.block:
6018 // lo = lcssa.phi [s1, scalar.body],
6019 // [vector.recur.extract.for.phi, middle.block]
6020 //
6021 // Update extracts of the splice in the middle block: they extract the
6022 // penultimate element of the recurrence.
6024 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
6025 if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(RecurSplice))))
6026 continue;
6027
6028 auto *ExtractR = cast<VPInstruction>(&R);
6029 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
6030 VPInstruction::ExtractPenultimateElement, RecurSplice->getOperand(1),
6031 {}, "vector.recur.extract.for.phi");
6032 for (VPUser *ExitU : to_vector(ExtractR->users())) {
6033 if (auto *ExitPhi = dyn_cast<VPIRPhi>(ExitU))
6034 ExitPhi->replaceUsesOfWith(ExtractR, PenultimateElement);
6035 }
6036 }
6037 }
6038}
6039
6040/// Check if \p V is a binary expression of a widened IV and a loop-invariant
6041/// value. Returns the widened IV if found, nullptr otherwise.
6043 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
6044 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
6045 Instruction::isIntDivRem(BinOp->getOpcode()))
6046 return nullptr;
6047
6048 VPValue *WidenIVCandidate = BinOp->getOperand(0);
6049 VPValue *InvariantCandidate = BinOp->getOperand(1);
6050 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
6051 std::swap(WidenIVCandidate, InvariantCandidate);
6052
6053 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
6054 return nullptr;
6055
6056 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
6057}
6058
6059/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
6060/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
6064 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
6065 auto *ClonedOp = BinOp->clone();
6066 if (ClonedOp->getOperand(0) == WidenIV) {
6067 ClonedOp->setOperand(0, ScalarIV);
6068 } else {
6069 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
6070 ClonedOp->setOperand(1, ScalarIV);
6071 }
6072 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
6073 return ClonedOp;
6074}
6075
6078 Loop &L) {
6079 ScalarEvolution &SE = *PSE.getSE();
6080 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
6081
6082 // Helper lambda to check if the IV range excludes the sentinel value. Try
6083 // signed first, then unsigned. Return an excluded sentinel if found,
6084 // otherwise return std::nullopt.
6085 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
6086 bool UseMax) -> std::optional<APSInt> {
6087 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
6088 for (bool Signed : {true, false}) {
6089 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
6090 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
6091
6092 ConstantRange IVRange =
6093 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
6094 if (!IVRange.contains(Sentinel))
6095 return Sentinel;
6096 }
6097 return std::nullopt;
6098 };
6099
6100 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
6101 for (VPRecipeBase &Phi :
6102 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
6103 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
6105 PhiR->getRecurrenceKind()))
6106 continue;
6107
6108 Type *PhiTy = PhiR->getScalarType();
6109 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
6110 continue;
6111
6112 // If there's a header mask, the backedge select will not be the find-last
6113 // select.
6114 VPValue *BackedgeVal = PhiR->getBackedgeValue();
6115 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
6116 if (HeaderMask &&
6117 !match(BackedgeVal,
6118 m_Select(m_Specific(HeaderMask),
6119 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
6120 continue;
6121
6122 // Get the find-last expression from the find-last select of the reduction
6123 // phi. The find-last select should be a select between the phi and the
6124 // find-last expression.
6125 VPValue *Cond, *FindLastExpression;
6126 if (!match(FindLastSelect, m_SelectLike(m_VPValue(Cond), m_Specific(PhiR),
6127 m_VPValue(FindLastExpression))) &&
6128 !match(FindLastSelect,
6129 m_SelectLike(m_VPValue(Cond), m_VPValue(FindLastExpression),
6130 m_Specific(PhiR))))
6131 continue;
6132
6133 // Check if FindLastExpression is a simple expression of a widened IV. If
6134 // so, we can track the underlying IV instead and sink the expression.
6135 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
6136 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
6137 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
6138 &L);
6139 const SCEV *Step;
6140 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6141 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
6143 "IVOfExpressionToSink not being an AddRec must imply "
6144 "FindLastExpression not being an AddRec.");
6145 continue;
6146 }
6147
6148 // Determine direction from SCEV step.
6149 if (!SE.isKnownNonZero(Step))
6150 continue;
6151
6152 // Positive step means we need UMax/SMax to find the last IV value, and
6153 // UMin/SMin otherwise.
6154 bool UseMax = SE.isKnownPositive(Step);
6155 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
6156 bool UseSigned = SentinelVal && SentinelVal->isSigned();
6157
6158 // Sinking an expression will disable epilogue vectorization. Only use it,
6159 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
6160 // also prevent vectorizing using a sentinel (e.g., if the expression is a
6161 // multiply or divide by large constant, respectively), which also makes
6162 // sinking undesirable.
6163 if (IVOfExpressionToSink) {
6164 const SCEV *FindLastExpressionSCEV =
6165 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
6166 if (match(FindLastExpressionSCEV,
6167 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6168 bool NewUseMax = SE.isKnownPositive(Step);
6169 if (auto NewSentinel =
6170 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
6171 // The original expression already has a sentinel, so prefer not
6172 // sinking to keep epilogue vectorization possible.
6173 SentinelVal = *NewSentinel;
6174 UseSigned = NewSentinel->isSigned();
6175 UseMax = NewUseMax;
6176 IVSCEV = FindLastExpressionSCEV;
6177 IVOfExpressionToSink = nullptr;
6178 }
6179 }
6180 }
6181
6182 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
6183 // if the condition was ever true. Requires the IV to not wrap, otherwise we
6184 // cannot use min/max.
6185 if (!SentinelVal) {
6186 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
6187 if (AR->hasNoSignedWrap())
6188 UseSigned = true;
6189 else if (AR->hasNoUnsignedWrap())
6190 UseSigned = false;
6191 else
6192 continue;
6193 }
6194
6196 BackedgeVal,
6198
6199 VPValue *NewFindLastSelect = BackedgeVal;
6200 VPValue *SelectCond = Cond;
6201 if (!SentinelVal || IVOfExpressionToSink) {
6202 // When we need to create a new select, normalize the condition so that
6203 // PhiR is the last operand and include the header mask if needed.
6204 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
6205 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
6206 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
6207 SelectCond = LoopBuilder.createNot(SelectCond);
6208
6209 // When tail folding, mask the condition with the header mask to prevent
6210 // propagating poison from inactive lanes in the last vector iteration.
6211 if (HeaderMask)
6212 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
6213
6214 if (SelectCond != Cond || IVOfExpressionToSink) {
6215 NewFindLastSelect = LoopBuilder.createSelect(
6216 SelectCond,
6217 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
6218 PhiR, DL);
6219 }
6220 }
6221
6222 // Create the reduction result in the middle block using sentinel directly.
6223 RecurKind MinMaxKind =
6224 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
6225 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
6226 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
6227 FastMathFlags());
6228 DebugLoc ExitDL = RdxResult->getDebugLoc();
6229 VPBuilder MiddleBuilder(RdxResult);
6230 VPValue *ReducedIV =
6232 NewFindLastSelect, Flags, ExitDL);
6233
6234 // If IVOfExpressionToSink is an expression to sink, sink it now.
6235 VPValue *VectorRegionExitingVal = ReducedIV;
6236 if (IVOfExpressionToSink)
6237 VectorRegionExitingVal =
6238 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
6239 ReducedIV, IVOfExpressionToSink);
6240
6241 VPValue *NewRdxResult;
6242 VPValue *StartVPV = PhiR->getStartValue();
6243 if (SentinelVal) {
6244 // Sentinel-based approach: reduce IVs with min/max, compare against
6245 // sentinel to detect if condition was ever true, select accordingly.
6246 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
6247 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
6248 Sentinel, ExitDL);
6249 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
6250 StartVPV, ExitDL);
6251 StartVPV = Sentinel;
6252 } else {
6253 // Introduce a boolean AnyOf reduction to track if the condition was ever
6254 // true in the loop. Use it to select the initial start value, if it was
6255 // never true.
6256 auto *AnyOfPhi = new VPReductionPHIRecipe(
6257 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
6258 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
6259 AnyOfPhi->insertAfter(PhiR);
6260
6261 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
6262 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
6263 AnyOfPhi->setOperand(1, OrVal);
6264
6265 NewRdxResult = MiddleBuilder.createAnyOfReduction(
6266 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
6267
6268 // Initialize the IV reduction phi with the neutral element, not the
6269 // original start value, to ensure correct min/max reduction results.
6270 StartVPV = Plan.getOrAddLiveIn(
6271 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
6272 }
6273 RdxResult->replaceAllUsesWith(NewRdxResult);
6274 RdxResult->eraseFromParent();
6275
6276 auto *NewPhiR = new VPReductionPHIRecipe(
6277 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
6278 *NewFindLastSelect, RdxUnordered{1}, {},
6279 PhiR->hasUsesOutsideReductionChain());
6280 NewPhiR->insertBefore(PhiR);
6281 PhiR->replaceAllUsesWith(NewPhiR);
6282 PhiR->eraseFromParent();
6283 }
6284}
6285
6286namespace {
6287
6288using ExtendKind = TTI::PartialReductionExtendKind;
6289struct ReductionExtend {
6290 Type *SrcType = nullptr;
6291 ExtendKind Kind = ExtendKind::PR_None;
6292};
6293
6294/// Describes the extends used to compute the extended reduction operand.
6295/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
6296/// operation.
6297struct ExtendedReductionOperand {
6298 /// The recipe that consumes the extends.
6299 VPWidenRecipe *ExtendsUser = nullptr;
6300 /// Extend descriptions (inputs to getPartialReductionCost).
6301 ReductionExtend ExtendA, ExtendB;
6302};
6303
6304/// A chain of recipes that form a partial reduction. Matches either
6305/// reduction_bin_op (extended op, accumulator), or
6306/// reduction_bin_op (accumulator, extended op).
6307/// The possible forms of the "extended op" are listed in
6308/// matchExtendedReductionOperand.
6309struct VPPartialReductionChain {
6310 /// The top-level binary operation that forms the reduction to a scalar
6311 /// after the loop body.
6312 VPWidenRecipe *ReductionBinOp = nullptr;
6313 /// The user of the extends that is then reduced.
6314 ExtendedReductionOperand ExtendedOp;
6315 /// The recurrence kind for the entire partial reduction chain.
6316 /// This allows distinguishing between Sub and AddWithSub recurrences,
6317 /// when the ReductionBinOp is a Instruction::Sub.
6318 RecurKind RK;
6319 /// The index of the accumulator operand of ReductionBinOp. The extended op
6320 /// is `1 - AccumulatorOpIdx`.
6321 unsigned AccumulatorOpIdx;
6322 unsigned ScaleFactor;
6323};
6324
/// Rewrites the extended operand \p Op of a partial reduction so that its
/// extends are in a form that can be folded into the reduction. Three patterns
/// are handled (see the comments in the body); if none applies, \p Op is
/// returned untouched.
///
/// \returns the recipe to use as the extended operand: \p Op itself (possibly
/// with its second operand replaced), or a newly created recipe that replaces
/// it.
6325static VPSingleDefRecipe *
6326optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op) {
6327 // reduce.add(mul(ext(A), C))
6328 // -> reduce.add(mul(ext(A), ext(trunc(C))))
6329 const APInt *Const;
6330 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
6331 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
6332 Instruction::CastOps ExtOpc = ExtA->getOpcode();
6333 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
// NOTE(review): listing line 6335 is absent from this chunk, so the second
// half of the bail-out condition below is incomplete as shown.
6334 if (!Op->hasOneUse() ||
6336 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
6337 return Op;
6338
// Replace the constant operand by ext(trunc(C)), using the same extend opcode
// and the same narrow/wide types as the extend of A.
6339 VPBuilder Builder(Op);
6340 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
6341 Op->getOperand(1), NarrowTy);
6342 Type *WideTy = ExtA->getScalarType();
6343 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
6344 return Op;
6345 }
6346
6347 // reduce.add(abs(sub(ext(A), ext(B))))
6348 // -> reduce.add(ext(absolute-difference(A, B)))
// NOTE(review): listing lines 6350-6351 (the match that binds X and Y and
// opens this block) are absent from this chunk.
6349 VPValue *X, *Y;
6352 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
6353 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6354 assert(Ext->getOpcode() ==
6355 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
6356 "Expected both the LHS and RHS extends to be the same");
6357 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
6358 VPBuilder Builder(Op);
6359 Type *SrcTy = X->getScalarType();
// The absolute difference is computed in the narrow source type as
// max(X, Y) - min(X, Y) on frozen copies of X and Y, then zero-extended to the
// type of the original operand.
6360 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
6361 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
6362 auto *Max = Builder.insert(
6363 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
6364 {FreezeX, FreezeY}, SrcTy));
6365 auto *Min = Builder.insert(
6366 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
6367 {FreezeX, FreezeY}, SrcTy));
6368 auto *AbsDiff =
6369 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
6370 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
6371 Op->getScalarType());
6372 }
6373
6374 // reduce.add(ext(mul(ext(A), ext(B))))
6375 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
6376 // TODO: Support this optimization for float types.
// NOTE(review): listing line 6377 (the start of the match expression) is
// absent from this chunk.
6378 m_ZExtOrSExt(m_VPValue()))))) {
6379 auto *Ext = cast<VPWidenCastRecipe>(Op);
6380 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
6381 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6382 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
// Bail out unless the mul has a single use and both inner extends have the
// same opcode. The outer extend may only differ from the inner ones when both
// mul operands are the very same extend recipe.
6383 if (!Mul->hasOneUse() ||
6384 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
6385 MulLHS->getOpcode() != MulRHS->getOpcode())
6386 return Op;
6387 VPBuilder Builder(Mul);
6388 auto *NewLHS = Builder.createWidenCast(
6389 MulLHS->getOpcode(), MulLHS->getOperand(0), Ext->getScalarType());
// Reuse the widened LHS when both mul operands were the same extend.
6390 auto *NewRHS = MulLHS == MulRHS
6391 ? NewLHS
6392 : Builder.createWidenCast(MulRHS->getOpcode(),
6393 MulRHS->getOperand(0),
6394 Ext->getScalarType());
6395 auto *NewMul = Mul->cloneWithOperands({NewLHS, NewRHS});
6396 Builder.insert(NewMul);
// The outer extend and the old mul are replaced by the new, wider mul.
6397 Op->replaceAllUsesWith(NewMul);
6398 Op->eraseFromParent();
6399 Mul->eraseFromParent();
6400 return NewMul;
6401 }
6402
6403 return Op;
6404}
6405
/// Bundles the partial reduction \p Red together with the recipes that compute
/// its vector operand into a newly allocated VPExpressionRecipe. The vector
/// operand must have one of the shapes matched below; any other shape hits
/// llvm_unreachable.
///
/// \returns the new VPExpressionRecipe; it is not inserted into a block here.
6406static VPExpressionRecipe *
6407createPartialReductionExpression(VPReductionRecipe *Red) {
6408 VPValue *VecOp = Red->getVecOp();
6409
6410 // reduce.[f]add(ext(op))
6411 // -> VPExpressionRecipe(op, red)
6412 if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
6413 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
6414
6415 // reduce.[f]add(neg(ext(op)))
6416 // -> VPExpressionRecipe(op, sub/neg, red)
6417 if (match(VecOp, m_AnyNeg(m_WidenAnyExtend(m_VPValue())))) {
6418 auto *Neg = cast<VPWidenRecipe>(VecOp);
// The negated value is the last operand of the negation recipe, so the extend
// is fetched from there rather than from a fixed index.
6419 auto *Ext =
6420 cast<VPWidenCastRecipe>(Neg->getOperand(Neg->getNumOperands() - 1));
6421 return new VPExpressionRecipe(Ext, Neg, Red);
6422 }
6423
6424 // reduce.[f]add([f]mul(ext(a), ext(b)))
6425 // -> VPExpressionRecipe(a, b, mul, red)
// NOTE(review): listing line 6428 (the integer-mul pattern that completes this
// condition) is absent from this chunk.
6426 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
6427 match(VecOp,
6429 auto *Mul = cast<VPWidenRecipe>(VecOp);
6430 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6431 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6432 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
6433 }
6434
6435 // reduce.fadd(fneg(fmul(fpext(a), fpext(b))))
6436 // -> VPExpressionRecipe(a, b, fmul, fsub, red)
// NOTE(review): listing line 6438 (the pattern passed to match) is absent from
// this chunk.
6437 if (match(VecOp,
6439 auto *FNeg = cast<VPWidenRecipe>(VecOp);
6440 auto *FMul = cast<VPWidenRecipe>(FNeg->getOperand(0));
6441 auto *ExtA = cast<VPWidenCastRecipe>(FMul->getOperand(0));
6442 auto *ExtB = cast<VPWidenCastRecipe>(FMul->getOperand(1));
6443 return new VPExpressionRecipe(ExtA, ExtB, FMul, FNeg, Red);
6444 }
6445
6446 // reduce.add(neg(mul(ext(a), ext(b))))
6447 // -> VPExpressionRecipe(a, b, mul, sub, red)
// NOTE(review): listing line 6448 (the start of the match expression) is
// absent from this chunk.
6449 m_ZExtOrSExt(m_VPValue()))))) {
6450 auto *Sub = cast<VPWidenRecipe>(VecOp);
// The mul is read from operand 1 of the sub.
6451 auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
6452 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6453 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6454 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
6455 }
6456
6457 llvm_unreachable("Unsupported expression");
6458}
6459
6460// Helper to transform a partial reduction chain into a partial reduction
6461// recipe. Assumes profitability has been checked.
//
// \p Chain describes the link to transform, \p Plan is the plan being
// modified and \p RdxPhi is the reduction phi the link belongs to. The phi
// (scale factor, start value) is only updated when the link is the last one in
// the chain, i.e. when it feeds the phi's backedge value.
6462static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6463 VPlan &Plan,
6464 VPReductionPHIRecipe *RdxPhi) {
6465 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6466 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6467
6468 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6469 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6470 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6471
6472 // FIXME: Do these transforms before invoking the cost-model.
6473 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp);
6474
6475 // Sub-reductions can be implemented in two ways:
6476 // (1) negate the operand in the vector loop (the default way).
6477 // (2) subtract the reduced value from the init value in the middle block.
6478 // Both ways keep the reduction itself as an 'add' reduction.
6479 //
6480 // The ISD nodes for partial reductions don't support folding the
6481 // sub/negation into its operands because the following is not a valid
6482 // transformation:
6483 // sub(0, mul(ext(a), ext(b)))
6484 // -> mul(ext(a), ext(sub(0, b)))
6485 //
6486 // It's therefore better to choose option (2) such that the partial
6487 // reduction is always positive (starting at '0') and to do a final
6488 // subtract in the middle block.
// The block below implements option (1): it only runs when the link is a
// (f)sub but the recurrence kind of the chain is not Sub/FSub.
6489 if ((WidenRecipe->getOpcode() == Instruction::Sub &&
6490 Chain.RK != RecurKind::Sub) ||
6491 (WidenRecipe->getOpcode() == Instruction::FSub &&
6492 Chain.RK != RecurKind::FSub)) {
6493 VPBuilder Builder(WidenRecipe);
6494 Type *ElemTy = ExtendedOp->getScalarType();
6495 VPWidenRecipe *NegRecipe;
// NOTE(review): listing lines 6499 and 6504 (the trailing constructor
// arguments of both VPWidenRecipe creations) are absent from this chunk.
6496 if (WidenRecipe->getOpcode() == Instruction::FSub) {
6497 NegRecipe =
6498 new VPWidenRecipe(Instruction::FNeg, {ExtendedOp}, VPIRFlags(),
6500 } else {
6501 auto *Zero = Plan.getZero(ElemTy);
6502 NegRecipe =
6503 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
6505 }
6506 Builder.insert(NegRecipe);
6507 ExtendedOp = NegRecipe;
6508 }
6509
6510 // Check if WidenRecipe is the final result of the reduction. If so look
6511 // through selects for predicated reductions.
// NOTE(review): listing line 6513 (the declaration of ExitValue and the start
// of its initializer) is absent from this chunk.
6512 VPValue *Cond = nullptr;
6514 findUserOf(WidenRecipe, m_Select(m_VPValue(Cond), m_Specific(WidenRecipe),
6515 m_Specific(RdxPhi))));
6516 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6517 RdxPhi->getBackedgeValue() == ExitValue;
6518 assert((!ExitValue || IsLastInChain) &&
6519 "if we found ExitValue, it must match RdxPhi's backedge value");
6520
// NOTE(review): listing line 6523 (the initializer of RdxKind) is absent from
// this chunk; PhiType is not referenced in the visible lines.
6521 Type *PhiType = RdxPhi->getScalarType();
6522 RecurKind RdxKind =
// Fast-math flags are only taken over from the original recipe for FAdd
// reductions; all other kinds get default-constructed flags.
6524 auto *PartialRed = new VPReductionRecipe(
6525 RdxKind,
6526 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlagsOrNone()
6527 : FastMathFlags(),
6528 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6529 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6530 PartialRed->insertBefore(WidenRecipe);
6531
// Cond is only set when the select above was matched, in which case the
// select's users are redirected to the partial reduction as well.
6532 if (Cond)
6533 ExitValue->replaceAllUsesWith(PartialRed);
6534 WidenRecipe->replaceAllUsesWith(PartialRed);
6535
6536 // For cost-model purposes, fold this into a VPExpression.
6537 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6538 E->insertBefore(WidenRecipe);
6539 PartialRed->replaceAllUsesWith(E);
6540
6541 // We only need to update the PHI node once, which is when we find the
6542 // last reduction in the chain.
6543 if (!IsLastInChain)
6544 return;
6545
6546 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6547 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6548 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6549
6550 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6551 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
// Operand 2 of the start-vector instruction is overwritten with the scale
// factor, materialized as a 32-bit constant.
6552 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6553 StartInst->setOperand(2, NewScaleFactor);
6554
6555 // If this is the last value in a sub-reduction chain, then update the PHI
6556 // node to start at `0` and update the reduction-result to subtract from
6557 // the PHI's start value.
6558 if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
6559 return;
6560
// Remember the original start value, then make the start-vector instruction
// use its operand 1 as the new start value (operand 0).
6561 VPValue *OldStartValue = StartInst->getOperand(0);
6562 StartInst->setOperand(0, StartInst->getOperand(1));
6563
6564 // Replace reduction_result by 'sub (startval, reductionresult)'.
// NOTE(review): listing line 6565 (the definition of RdxResult) is absent from
// this chunk.
6566 assert(RdxResult && "Could not find reduction result");
6567
6568 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6569 unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
6570 : Instruction::BinaryOps::Sub;
6571 VPInstruction *NewResult = Builder.createNaryOp(
6572 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6573 RdxPhi->getDebugLoc());
// Redirect every user of the reduction result to the new subtract, except the
// subtract itself (which must keep using the original result).
6574 RdxResult->replaceUsesWithIf(
6575 NewResult,
6576 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6577}
6578
6579/// Returns the cost of a link in a partial-reduction chain for a given VF.
6580static InstructionCost
6581getPartialReductionLinkCost(VPCostContext &CostCtx,
6582 const VPPartialReductionChain &Link,
6583 ElementCount VF) {
6584 Type *RdxType = Link.ReductionBinOp->getScalarType();
6585 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6586 std::optional<unsigned> BinOpc = std::nullopt;
6587 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6588 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6589 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6590
6591 std::optional<llvm::FastMathFlags> Flags;
6592 if (RdxType->isFloatingPointTy())
6593 Flags = Link.ReductionBinOp->getFastMathFlagsOrNone();
6594
6595 auto GetLinkOpcode = [&Link]() -> unsigned {
6596 switch (Link.RK) {
6597 case RecurKind::Sub:
6598 return Instruction::Add;
6599 case RecurKind::FSub:
6600 return Instruction::FAdd;
6601 default:
6602 return Link.ReductionBinOp->getOpcode();
6603 }
6604 };
6605
6606 return CostCtx.TTI.getPartialReductionCost(
6607 GetLinkOpcode(), ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType,
6608 RdxType, VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6609 CostCtx.CostKind, Flags);
6610}
6611
/// Returns the partial-reduction extend kind for the widen-cast recipe
/// \p Cast.
// NOTE(review): the body (listing line 6613) is absent from this chunk;
// presumably it forwards the cast's opcode to
// TTI::getPartialReductionExtendKind - confirm against the full source.
6612static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6614}
6615
6616/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6617/// operand. This is an operand where the source of the value (e.g. a load) has
6618/// been extended (sext, zext, or fpext) before it is used in the reduction.
6619///
6620/// Possible forms matched by this function:
6621/// - UpdateR(PrevValue, ext(...))
6622/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6623/// - UpdateR(PrevValue, mul(ext(...), Constant))
6624/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6625/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6626/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...)))
6627///
6628/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
///
/// \returns a description of the matched operand (the user of the extends plus
/// the source type and kind of each extend), or std::nullopt if \p Op has none
/// of the forms above.
6629static std::optional<ExtendedReductionOperand>
6630matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
6631 assert(is_contained(UpdateR->operands(), Op) &&
6632 "Op should be operand of UpdateR");
6633
6634 // Try matching an absolute difference operand of the form
6635 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6636 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6637 // difference on a wider type and get the extend for "free" from the partial
6638 // reduction.
// NOTE(review): listing lines 6641-6643 (the pattern that binds X and Y and
// completes this condition) are absent from this chunk.
6639 VPValue *X, *Y;
6640 if (Op->hasOneUse() &&
6644 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6645 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6646 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6647 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6648 Type *LHSInputType = X->getScalarType();
6649 Type *RHSInputType = Y->getScalarType();
// Both extends must have the same opcode and the same source type.
6650 if (LHSInputType != RHSInputType ||
6651 LHSExt->getOpcode() != RHSExt->getOpcode())
6652 return std::nullopt;
6653 // Note: This is essentially the same as matching ext(...) as we will
6654 // rewrite this operand to ext(absolute-difference(A, B)).
6655 return ExtendedReductionOperand{
6656 Sub,
6657 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6658 /*ExtendB=*/{}};
6659 }
6660
// NOTE(review): listing line 6662 (the condition that opens the block below,
// where Op is treated as a widen-cast recipe) is absent from this chunk.
6661 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6663 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6664 VPValue *CastSource = CastRecipe->getOperand(0);
6665 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6666 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6667 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6668 // Match: ext(mul(...))
6669 // Record the outer extend kind and set `Op` to the mul. We can then match
6670 // this as a binary operation. Note: We can optimize out the outer extend
6671 // by widening the inner extends to match it. See
6672 // optimizeExtendsForPartialReduction.
6673 Op = CastSource;
6674 } else {
// Plain ext(...): the reduction update itself is the user of the extend and
// there is no second extend.
6675 return ExtendedReductionOperand{
6676 UpdateR,
6677 /*ExtendA=*/{CastSource->getScalarType(), *OuterExtKind},
6678 /*ExtendB=*/{}};
6679 }
6680 }
6681
6682 if (!Op->hasOneUse())
6683 return std::nullopt;
6684
// NOTE(review): listing line 6685 (the definition of MulOp) is absent from
// this chunk.
6686 if (!MulOp ||
6687 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6688 return std::nullopt;
6689
6690 // The rest of the matching assumes `Op` is a (possibly extended) mul
6691 // operation.
6692
6693 VPValue *LHS = MulOp->getOperand(0);
6694 VPValue *RHS = MulOp->getOperand(1);
6695
6696 // The LHS of the operation must always be an extend.
// NOTE(review): listing line 6697 (the check guarding this early return) is
// absent from this chunk.
6698 return std::nullopt;
6699
6700 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6701 Type *LHSInputType = LHSCast->getOperand(0)->getScalarType();
6702 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6703
6704 // The RHS of the operation can be an extend or a constant integer.
// NOTE(review): listing line 6707 (the `if` that this assignment and the
// following `else if` belong to) is absent from this chunk.
6705 const APInt *RHSConst = nullptr;
6706 VPWidenCastRecipe *RHSCast = nullptr;
6708 RHSCast = cast<VPWidenCastRecipe>(RHS);
6709 else if (!match(RHS, m_APInt(RHSConst)) ||
6710 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6711 return std::nullopt;
6712
6713 // The outer extend kind must match the inner extends for folding.
// RHSCast is null when the RHS is a constant; null entries are skipped.
6714 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6715 if (Cast && OuterExtKind &&
6716 getPartialReductionExtendKind(Cast) != OuterExtKind)
6717 return std::nullopt;
6718
// For a constant RHS the second extend is described with the LHS extend's
// source type and kind.
6719 Type *RHSInputType = LHSInputType;
6720 ExtendKind RHSExtendKind = LHSExtendKind;
6721 if (RHSCast) {
6722 RHSInputType = RHSCast->getOperand(0)->getScalarType();
6723 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6724 }
6725
6726 return ExtendedReductionOperand{
6727 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6728}
6729
6730/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6731/// and determines if the target can use a cheaper operation with a wider
6732/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6733/// of operations in the reduction.
///
/// \returns the chain links in program order, or std::nullopt if no
/// compute-reduction-result is found for \p RedPhiR or if any operation on the
/// way from the exit value back to the phi cannot be matched as a link.
// NOTE(review): \p CostCtx and \p Range are not referenced in the lines of
// this function that are visible in this chunk.
6734static std::optional<SmallVector<VPPartialReductionChain>>
6735getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6736 VFRange &Range) {
6737 // Get the backedge value from the reduction PHI and find the
6738 // ComputeReductionResult that uses it (directly or through a select for
6739 // predicated reductions).
6740 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6741 if (!RdxResult)
6742 return std::nullopt;
6743 VPValue *ExitValue = RdxResult->getOperand(0);
// If the exit value is a select, continue with the select's second operand;
// the result of the match itself is intentionally ignored, so ExitValue is
// left as is when there is no select.
6744 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6745
// NOTE(review): listing line 6746 is absent from this chunk; presumably it
// declares the `Chain` container used below - confirm.
6747 RecurKind RK = RedPhiR->getRecurrenceKind();
6748 Type *PhiType = RedPhiR->getScalarType();
6749 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6750
6751 // Work backwards from the ExitValue examining each reduction operation.
6752 VPValue *CurrentValue = ExitValue;
6753 while (CurrentValue != RedPhiR) {
6754 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6755 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6756 return std::nullopt;
6757
6758 VPValue *Op = UpdateR->getOperand(1);
6759 VPValue *PrevValue = UpdateR->getOperand(0);
6760
6761 // Find the extended operand. The other operand (PrevValue) is the next link
6762 // in the reduction chain.
// Operand 1 is tried first; if it does not match, operand 0 is tried and the
// roles of Op and PrevValue are swapped.
6763 std::optional<ExtendedReductionOperand> ExtendedOp =
6764 matchExtendedReductionOperand(UpdateR, Op);
6765 if (!ExtendedOp) {
6766 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue);
6767 if (!ExtendedOp)
6768 return std::nullopt;
6769 std::swap(Op, PrevValue);
6770 }
6771
// The phi's bit width must be a known scalar multiple of the extend's source
// bit width; that multiple becomes the link's scale factor below.
6772 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6773 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6774 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6775 return std::nullopt;
6776
6777 // Check if a partial reduction chain is supported by the target (i.e. does
6778 // not have an invalid cost) for the given VF range. Clamps the range and
6779 // returns true if feasible for any VF.
// NOTE(review): the visible code below only builds the link and appends it to
// the chain; no cost query or range clamping happens here, so the comment
// above looks stale - confirm.
6780 VPPartialReductionChain Link(
6781 {UpdateR, *ExtendedOp, RK,
6782 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
6783 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
6784 Chain.push_back(Link);
6785 CurrentValue = PrevValue;
6786 }
6787
6788 // The chain links were collected by traversing backwards from the exit value.
6789 // Reverse the chains so they are in program order.
6790 std::reverse(Chain.begin(), Chain.end());
6791 return Chain;
6792}
6793} // namespace
6794
6796 VPCostContext &CostCtx,
6797 VFRange &Range) {
6798 // Find all possible valid partial reductions, grouping chains by their PHI.
6799 // This grouping allows invalidating the whole chain, if any link is not a
6800 // valid partial reduction.
6802 ChainsByPhi;
6803 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6804 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6805 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6806 if (!RedPhiR)
6807 continue;
6808
6809 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6810 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6811 }
6812
6813 if (ChainsByPhi.empty())
6814 return;
6815
6816 // Build set of partial reduction operations for extend user validation and
6817 // a map of reduction bin ops to their scale factors for scale validation.
6818 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6819 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6820 for (const auto &[_, Chains] : ChainsByPhi)
6821 for (const VPPartialReductionChain &Chain : Chains) {
6822 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
6823 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6824 }
6825
6826 // A partial reduction is invalid if any of its extends are used by
6827 // something that isn't another partial reduction. This is because the
6828 // extends are intended to be lowered along with the reduction itself.
6829 auto ExtendUsersValid = [&](VPValue *Ext) {
6830 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
6831 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6832 });
6833 };
6834
6835 auto IsProfitablePartialReductionChainForVF =
6836 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
6837 InstructionCost PartialCost = 0, RegularCost = 0;
6838
6839 // The chain is a profitable partial reduction chain if the cost of handling
6840 // the entire chain is cheaper when using partial reductions than when
6841 // handling the entire chain using regular reductions.
6842 for (const VPPartialReductionChain &Link : Chain) {
6843 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6844 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
6845 if (!LinkCost.isValid())
6846 return false;
6847
6848 PartialCost += LinkCost;
6849 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
6850 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6851 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6852 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
6853 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
6854 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
6855 RegularCost += Extend->computeCost(VF, CostCtx);
6856 }
6857 return PartialCost.isValid() && PartialCost < RegularCost;
6858 };
6859
6860 // Validate chains: check that extends are only used by partial reductions,
6861 // and that reduction bin ops are only used by other partial reductions with
6862 // matching scale factors, are outside the loop region or the select
6863 // introduced by tail-folding. Otherwise we would create users of scaled
6864 // reductions where the types of the other operands don't match.
6865 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6866 for (const VPPartialReductionChain &Chain : Chains) {
6867 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
6868 Chains.clear();
6869 break;
6870 }
6871 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6872 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6873 return PhiR == RedPhiR;
6874 auto *R = cast<VPSingleDefRecipe>(U);
6875 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6877 m_Specific(Chain.ReductionBinOp))) ||
6878 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6879 m_Specific(RedPhiR)));
6880 };
6881 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6882 Chains.clear();
6883 break;
6884 }
6885
6886 // Check if the compute-reduction-result is used by a sunk store.
6887 // TODO: Also form partial reductions in those cases.
6888 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6889 if (any_of(RdxResult->users(), [](VPUser *U) {
6890 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6891 return RepR && RepR->getOpcode() == Instruction::Store;
6892 })) {
6893 Chains.clear();
6894 break;
6895 }
6896 }
6897 }
6898
6899 // Clear the chain if it is not profitable.
6901 [&, &Chains = Chains](ElementCount VF) {
6902 return IsProfitablePartialReductionChainForVF(Chains, VF);
6903 },
6904 Range))
6905 Chains.clear();
6906 }
6907
6908 for (auto &[Phi, Chains] : ChainsByPhi)
6909 for (const VPPartialReductionChain &Chain : Chains)
6910 transformToPartialReduction(Chain, Plan, Phi);
6911}
6912
6914 VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
6915 // Collect all loads/stores first. We will start with ones having simpler
6916 // decisions followed by more complex ones that are potentially
6917 // guided/dependent on the simpler ones.
6919 for (VPBasicBlock *VPBB :
6922 for (VPRecipeBase &R : *VPBB) {
6923 auto *VPI = dyn_cast<VPInstruction>(&R);
6924 if (VPI && VPI->getUnderlyingValue() &&
6925 is_contained({Instruction::Load, Instruction::Store},
6926 VPI->getOpcode()))
6927 MemOps.push_back(VPI);
6928 }
6929 }
6930
6931 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6932 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6933
6934 for (VPInstruction *VPI : MemOps) {
6935 auto ReplaceWith = [&](VPRecipeBase *New) {
6936 New->insertBefore(VPI);
6937 if (VPI->getOpcode() == Instruction::Load)
6938 VPI->replaceAllUsesWith(New->getVPSingleValue());
6939 VPI->eraseFromParent();
6940 };
6941
6942 // Note: we must do that for scalar VPlan as well.
6943 if (RecipeBuilder.replaceWithFinalIfReductionStore(VPI,
6944 FinalRedStoresBuilder))
6945 continue;
6946
6947 // Filter out scalar VPlan for the remaining memory operations.
6949 [](ElementCount VF) { return VF.isScalar(); }, Range))
6950 continue;
6951
6952 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
6953 ReplaceWith(Histogram);
6954 continue;
6955 }
6956
6957 VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
6958 if (!Recipe)
6959 Recipe = RecipeBuilder.handleReplication(VPI, Range);
6960
6961 ReplaceWith(Recipe);
6962 }
6963}
6964
// NOTE(review): the signature of this transform and the opening of the
// condition below are missing from this listing. The visible part returns
// early based on a predicate that tests VF.isScalar() over Range.
6967 [&](ElementCount VF) { return VF.isScalar(); }, Range))
6968 return;
6969
6971 Plan.getEntry());
// Recipes of each block are visited in reverse order; the early-inc range is
// used because VPI is erased inside the loop body.
6973 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
6974 auto *VPI = dyn_cast<VPInstruction>(&R);
6975 if (!VPI)
6976 continue;
6977
6978 auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
6979 // Wouldn't be able to create a `VPReplicateRecipe` anyway.
6980 if (!I)
6981 continue;
6982
6983 // If executing other lanes produces side-effects we can't avoid them.
6984 if (VPI->mayHaveSideEffects())
6985 continue;
6986
6987 // We want to drop the mask operand, verify we can safely do that.
6988 if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
6989 continue;
6990
6991 // Avoid rewriting IV increment as that interferes with
6992 // `removeRedundantCanonicalIVs`.
6993 if (VPI->getOpcode() == Instruction::Add &&
6995 continue;
6996
6997 // Other lanes are needed - can't drop them.
6999 continue;
7000
// Build an unmasked single-scalar op from VPI's opcode and non-mask operands
// (VPI is also passed for the two arguments after the null mask), place it
// before VPI, move all users over and delete VPI.
7001 auto *Recipe = VPBuilder::createSingleScalarOp(
7002 VPI->getOpcode(), VPI->operandsWithoutMask(), /*Mask=*/nullptr, *VPI,
7003 *VPI, VPI->getDebugLoc(), I);
7004 Recipe->insertBefore(VPI);
7005 VPI->replaceAllUsesWith(Recipe);
7006 VPI->eraseFromParent();
7007 }
7008 }
7009}
7010
7011/// Returns true if \p Info's parameter kinds are compatible with \p Args.
7012static bool areVFParamsOk(const VFInfo &Info, ArrayRef<VPValue *> Args,
7013 PredicatedScalarEvolution &PSE, const Loop *L) {
7014 ScalarEvolution *SE = PSE.getSE();
7015 return all_of(Info.Shape.Parameters, [&](VFParameter Param) {
7016 switch (Param.ParamKind) {
7017 case VFParamKind::Vector:
7018 case VFParamKind::GlobalPredicate:
7019 return true;
7020 case VFParamKind::OMP_Uniform:
7021 return SE->isSCEVable(Args[Param.ParamPos]->getScalarType()) &&
7022 SE->isLoopInvariant(
7023 vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
7024 L);
7025 case VFParamKind::OMP_Linear:
7026 return match(vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
7027 m_scev_AffineAddRec(
7028 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
7029 m_SpecificLoop(L)));
7030 default:
7031 return false;
7032 }
7033 });
7034}
7035
7036/// Find a vector variant of \p CI for \p VF, respecting \p MaskRequired.
7037/// Returns the variant function, or nullptr. Masked variants are assumed to
7038/// take the mask as a trailing parameter.
// NOTE(review): two lines of this signature (the function name/first
// parameters and the parameter between MaskRequired and L) are missing from
// this listing.
7040 ElementCount VF, bool MaskRequired,
7042 const Loop *L) {
// Calls marked no-builtin never get a vector variant.
7043 if (CI->isNoBuiltin())
7044 return nullptr;
7045 auto Mappings = VFDatabase::getMappings(*CI);
// Take the first mapping whose VF matches, that is masked whenever a mask is
// required (unmasked mappings are only acceptable when no mask is required),
// and whose parameter kinds fit the call arguments.
7046 const auto *It = find_if(Mappings, [&](const VFInfo &Info) {
7047 return Info.Shape.VF == VF && (!MaskRequired || Info.isMasked()) &&
7048 areVFParamsOk(Info, Args, PSE, L);
7049 });
7050 if (It == Mappings.end())
7051 return nullptr;
// Resolve the chosen mapping by name in the module that contains CI; the
// lookup result is returned unchanged.
7052 return CI->getModule()->getFunction(It->VectorName);
7053}
7054
7055namespace {
7056/// The outcome of choosing how to widen a call at a given VF.
7057struct CallWideningDecision {
// The three possible strategies for a call.
7058 enum class KindTy { Scalarize, Intrinsic, VectorVariant };
// Intentionally non-explicit: a bare KindTy converts to a decision with a
// null Variant.
7059 CallWideningDecision(KindTy Kind, Function *Variant = nullptr)
7060 : Kind(Kind), Variant(Variant) {}
7061 KindTy Kind;
7062
7063 /// Set when Kind == VectorVariant.
// NOTE(review): the declaration of the Variant member is missing from this
// listing at this point.
7065
// Two decisions are equal only if both the kind and the variant function
// pointer match.
7066 bool operator==(const CallWideningDecision &Other) const {
7067 return Kind == Other.Kind && Variant == Other.Variant;
7068 }
7069};
7070} // namespace
7071
7072/// Pick the cheapest widening for the call \p VPI at \p VF among scalarization,
7073/// vector intrinsic, and vector library variant.
// NOTE(review): several lines are missing from this listing: the Ops
// parameter, the remainder of the CalledFn initializer, and the declarations
// of ID, VecCallCost and IntrinsicCost. Confirm against the real source.
7074static CallWideningDecision decideCallWidening(VPInstruction &VPI,
7076 ElementCount VF,
7077 VPCostContext &CostCtx) {
7078 auto *CI = cast<CallInst>(VPI.getUnderlyingInstr());
7079
7080 // Scalar VFs and calls forced or known to scalarize always replicate.
7081 if (VF.isScalar() || CostCtx.willBeScalarized(CI, VF))
7082 return CallWideningDecision::KindTy::Scalarize;
7083
7084 auto *CalledFn = cast<Function>(
7086 Type *ResultTy = VPI.getScalarType();
7088 bool MaskRequired = CostCtx.isMaskRequired(CI);
7089
7090 // Pseudo intrinsics (assume, lifetime, ...) are always scalarized.
7092 return CallWideningDecision::KindTy::Scalarize;
7093
// Cost of replicating the call per lane (IsSingleScalar is false).
7094 InstructionCost ScalarCost =
7095 VPReplicateRecipe::computeCallCost(CalledFn, ResultTy, Ops,
7096 /*IsSingleScalar=*/false, VF, CostCtx);
7097
// VecCallCost is only assigned here when a vector variant was found.
7098 Function *VecFunc =
7099 findVectorVariant(CI, Ops, VF, MaskRequired, CostCtx.PSE, CostCtx.L);
7101 if (VecFunc)
7102 VecCallCost = VPWidenCallRecipe::computeCallCost(VecFunc, CostCtx);
7103
7104 // Prefer the intrinsic if it is at least as cheap as scalarizing and any
7105 // available vector variant.
7106 if (ID) {
7109 if (IntrinsicCost.isValid() && ScalarCost >= IntrinsicCost &&
7110 (!VecFunc || VecCallCost >= IntrinsicCost))
7111 return CallWideningDecision::KindTy::Intrinsic;
7112 }
7113
7114 // Otherwise, use a vector library variant when it beats scalarizing.
// The comparison is >=, so a cost tie also selects the vector variant.
7115 if (VecFunc && ScalarCost >= VecCallCost)
7116 return {CallWideningDecision::KindTy::VectorVariant, VecFunc};
7117
7118 return CallWideningDecision::KindTy::Scalarize;
7119}
7120
// Replaces every call VPInstruction that has an underlying value with the
// recipe chosen by decideCallWidening: a widened intrinsic, a widened call to
// a vector variant, or a replicated call.
// NOTE(review): the first signature line, the block-traversal loop header and
// the declaration of ID in the Intrinsic case are missing from this listing.
7122 VPRecipeBuilder &RecipeBuilder,
7123 VPCostContext &CostCtx) {
// Early-inc iteration because VPI is erased at the end of each iteration.
7126 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7127 auto *VPI = dyn_cast<VPInstruction>(&R);
7128 if (!VPI || !VPI->getUnderlyingValue() ||
7129 VPI->getOpcode() != Instruction::Call)
7130 continue;
7131
// Ops holds the first CI->arg_size() operands of VPI, i.e. one operand per
// argument of the underlying call.
7132 auto *CI = cast<CallInst>(VPI->getUnderlyingInstr());
7133 SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
7134 VPI->op_begin() + CI->arg_size());
7135
// Decide for the first VF of the range; the predicate below compares every
// other VF's decision against it (presumably to clamp Range; the call that
// receives the predicate is missing from this listing).
7136 CallWideningDecision Decision =
7137 decideCallWidening(*VPI, Ops, Range.Start, CostCtx);
7139 [&](ElementCount VF) {
7140 return Decision == decideCallWidening(*VPI, Ops, VF, CostCtx);
7141 },
7142 Range);
7143
7144 VPSingleDefRecipe *Replacement = nullptr;
7145 switch (Decision.Kind) {
7146 case CallWideningDecision::KindTy::Intrinsic: {
7148 Type *ResultTy = VPI->getScalarType();
7149 Replacement = new VPWidenIntrinsicRecipe(*CI, ID, Ops, ResultTy, *VPI,
7150 *VPI, VPI->getDebugLoc());
7151 break;
7152 }
7153 case CallWideningDecision::KindTy::VectorVariant: {
7154 // Masked variants take the mask as a trailing parameter, so they have
7155 // one more parameter than the original call's arguments.
// An unmasked call to a masked variant gets an all-true mask.
7156 if (Decision.Variant->arg_size() > Ops.size()) {
7157 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
7158 Ops.push_back(Mask);
7159 }
// Append VPI's last non-mask operand (presumably the called function operand;
// verify against VPWidenCallRecipe's expected operand layout).
7160 Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
7161 Replacement = new VPWidenCallRecipe(CI, Decision.Variant, Ops, *VPI,
7162 *VPI, VPI->getDebugLoc());
7163 break;
7164 }
7165 case CallWideningDecision::KindTy::Scalarize:
7166 Replacement = RecipeBuilder.handleReplication(VPI, Range);
7167 break;
7168 }
7169
// Splice the replacement in front of VPI, transfer users and drop VPI.
7170 Replacement->insertBefore(VPI);
7171 VPI->replaceAllUsesWith(Replacement);
7172 VPI->eraseFromParent();
7173 }
7174 }
7175}
7176
// Rewrites non-consecutive widened loads whose address is an affine
// add-recurrence with a constant step into experimental_vp_strided_load
// intrinsics, when the target reports that form as legal and cheaper.
// NOTE(review): the first signature lines, the block-traversal loop header and
// a few interior lines (part of the SCEV match, the cost call, the range
// clamp, and tails of two builder calls) are missing from this listing.
7179 Loop &L, VPCostContext &Ctx,
7180 VFRange &Range) {
7181 if (Plan.hasScalarVFOnly())
7182 return;
7183
7184 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
// Created lazily the first time a load is converted and then reused.
7185 VPValue *I32VF = nullptr;
7187 vp_depth_first_shallow(VectorLoop->getEntry()))) {
7188 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7189 auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
7190 // TODO: Support strided store.
7191 // TODO: Transform reverse access into strided access with -1 stride.
7192 // TODO: Transform gather/scatter with uniform address into strided access
7193 // with 0 stride.
7194 // TODO: Transform interleave access into multiple strided accesses.
7195 if (!LoadR || LoadR->isConsecutive())
7196 continue;
7197
// Only addresses produced by a widened GEP are considered.
7198 auto *Ptr = dyn_cast<VPWidenGEPRecipe>(LoadR->getAddr());
7199 if (!Ptr)
7200 continue;
7201
7202 // Check if this is a strided access by analyzing the address SCEV for an
7203 // affine addRec.
7204 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, &L);
7205 const SCEV *Start;
7206 const SCEVConstant *Step;
7207 // TODO: Support non-constant loop invariant stride.
7208 if (!match(PtrSCEV,
7210 m_SpecificLoop(&L))))
7211 continue;
7212
7213 Type *LoadTy = LoadR->getScalarType();
7214 Align Alignment = LoadR->getAlign();
// A VF is profitable when the strided form is legal for the vector type and
// strictly cheaper than the current cost of LoadR.
7215 auto IsProfitable = [&](ElementCount VF) {
7216 Type *DataTy = toVectorTy(LoadTy, VF);
7217 if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
7218 return false;
7219 const InstructionCost CurrentCost = LoadR->computeCost(VF, Ctx);
7220 const InstructionCost StridedLoadStoreCost =
7222 Intrinsic::experimental_vp_strided_load, DataTy,
7223 LoadR->isMasked(), Alignment, Ctx);
7224 return StridedLoadStoreCost < CurrentCost;
7225 };
7226
7228 Range))
7229 continue;
7230
7231 // Invalidate the legacy widening decision so the cost of replaced load is
7232 // not counted during precomputeCosts.
7233 // TODO: Remove once the legacy exit cost computation is retired.
7234 for (ElementCount VF : Range)
7235 Ctx.invalidateWideningDecision(&LoadR->getIngredient(), VF);
7236
7237 // Get VF as i32 for the vector length operand.
7238 if (!I32VF) {
7239 VPBuilder Builder(Plan.getVectorPreheader());
7240 I32VF = Builder.createScalarZExtOrTrunc(
7241 &Plan.getVF(), Type::getInt32Ty(Plan.getContext()),
7243 }
7244
// New recipes are emitted through a builder constructed from LoadR.
7245 VPBuilder Builder(LoadR);
7246 // Create the base pointer of strided access.
7247 // TODO: reuse VPDerivedIVRecipe for base pointer computation when it
7248 // supports a general VPValue as the start value.
7249 VPValue *StartVPV = vputils::getOrCreateVPValueForSCEVExpr(Plan, Start);
7250 VPValue *StrideInBytes = Plan.getOrAddLiveIn(Step->getValue());
7251 Type *IndexTy = Plan.getDataLayout().getIndexType(Ptr->getScalarType());
7252 assert(IndexTy == StrideInBytes->getScalarType() &&
7253 "Stride type from SCEV must match the index type");
// Base = Start + CanonicalIV * Stride, with the canonical IV first converted
// to the pointer index type. Wrap flags are taken from the add-recurrence.
7254 VPValue *CanIV = Builder.createScalarSExtOrTrunc(
7255 VectorLoop->getCanonicalIV(), IndexTy,
7256 VectorLoop->getCanonicalIVType(), DebugLoc::getUnknown());
7257 auto *AddRecPtr = cast<SCEVAddRecExpr>(PtrSCEV);
7258 auto *Offset = Builder.createOverflowingOp(
7259 Instruction::Mul, {CanIV, StrideInBytes},
7260 {AddRecPtr->hasNoUnsignedWrap(), AddRecPtr->hasNoSignedWrap()});
7261 auto *BasePtr = Builder.createNoWrapPtrAdd(
7262 StartVPV, Offset,
7263 AddRecPtr->hasNoUnsignedWrap() ? GEPNoWrapFlags::noUnsignedWrap()
7265
7266 // Create a new vector pointer for strided access.
// The vector pointer uses an i8 element type together with StrideInBytes.
7267 VPValue *NewPtr = Builder.createVectorPointer(
7268 BasePtr, Type::getInt8Ty(Plan.getContext()), StrideInBytes,
7269 Ptr->getGEPNoWrapFlags(), Ptr->getDebugLoc());
7270
// An unmasked load is given an all-true mask for the intrinsic.
7271 VPValue *Mask = LoadR->getMask();
7272 if (!Mask)
7273 Mask = Plan.getTrue();
// Intrinsic operands are passed as: pointer, stride, mask, i32 VF.
7274 auto *StridedLoad = Builder.createWidenMemIntrinsic(
7275 Intrinsic::experimental_vp_strided_load,
7276 {NewPtr, StrideInBytes, Mask, I32VF}, LoadTy, Alignment, *LoadR,
7277 LoadR->getDebugLoc());
// Only the uses are redirected here; LoadR itself is not erased in this
// function.
7278 LoadR->replaceAllUsesWith(StridedLoad);
7279 }
7280 }
7281}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static bool dominates(InstrPosIndexes &PosIndexes, const MachineInstr &A, const MachineInstr &B)
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static CallWideningDecision decideCallWidening(VPInstruction &VPI, ArrayRef< VPValue * > Ops, ElementCount VF, VPCostContext &CostCtx)
Pick the cheapest widening for the call VPI at VF among scalarization, vector intrinsic,...
static bool areVFParamsOk(const VFInfo &Info, ArrayRef< VPValue * > Args, PredicatedScalarEvolution &PSE, const Loop *L)
Returns true if Info's parameter kinds are compatible with Args.
static bool simplifyLogicalRecipe(VPSingleDefRecipe *Def, VPBuilder &Builder, bool CanCreateNewRecipe)
Try to simplify logical and bitwise recipes in Def.
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static Type * getLoadStoreValueType(VPReplicateRecipe *R, bool IsLoad)
Get the value type of the replicate load or store.
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static Function * findVectorVariant(CallInst *CI, ArrayRef< VPValue * > Args, ElementCount VF, bool MaskRequired, PredicatedScalarEvolution &PSE, const Loop *L)
Find a vector variant of CI for VF, respecting MaskRequired.
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool handleUncountableExitsWithSideEffects(VPlan &Plan, SmallVectorImpl< EarlyExitInfo > &Exits, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC, VPDominatorTree &VPDT)
Update Plan to mask memory operations in the loop based on whether the early exit is taken or not.
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void simplifyRecipe(VPSingleDefRecipe *Def)
Try to simplify VPSingleDefRecipe Def.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV and their load-store type,...
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static std::optional< Intrinsic::ID > getVPDivRemIntrinsic(Intrinsic::ID IntrID)
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static VPValue * narrowInterleaveGroupOp(ArrayRef< VPValue * > Members, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL)
Try to fold R using InstSimplifyFolder.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static void expandVPDerivedIV(VPDerivedIVRecipe *R)
Expand a VPDerivedIVRecipe into executable recipes.
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static std::optional< Instruction::BinaryOps > getUnmaskedDivRemOpcode(Intrinsic::ID ID)
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Return true if we do not know how to (mechanically) hoist or sink a non-memory or memory recipe R out...
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L)
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1692
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
Get the last element.
Definition ArrayRef.h:150
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:194
const T & front() const
Get the first element.
Definition ArrayRef.h:144
iterator end() const
Definition ArrayRef.h:130
iterator begin() const
Definition ArrayRef.h:129
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:152
static DebugLoc getUnknown()
Definition DebugLoc.h:151
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:252
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:262
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
size_t arg_size() const
Definition Function.h:901
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags noUnsignedWrap()
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
The group of interleaved loads/stores sharing the same stride and close to each other.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1666
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1069
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
bool empty() const
Definition MapVector.h:79
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describe the aliasing of the location (each member is null if that kind of ...
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class represents a constant integer value.
ConstantInt * getValue() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
@ SK_Broadcast
Broadcast element 0 to all other elements.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:4049
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4404
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4479
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4431
iterator end()
Definition VPlan.h:4441
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4439
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4492
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:266
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:560
const VPRecipeBase & front() const
Definition VPlan.h:4451
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639
const VPRecipeBase & back() const
Definition VPlan.h:4453
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2957
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:3007
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2997
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:3013
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2993
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:94
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:315
VPRegionBlock * getParent()
Definition VPlan.h:186
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:236
size_t getNumSuccessors() const
Definition VPlan.h:237
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:306
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:222
VPlan * getPlan()
Definition VPlan.cpp:211
const std::string & getName() const
Definition VPlan.h:177
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:325
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:233
void clearPredecessors()
Remove all the predecessors of this block.
Definition VPlan.h:322
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:279
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:227
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:211
static auto blocksAs(T &&Range)
Return an iterator range over Range with each block cast to BlockTy.
Definition VPlanUtils.h:343
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:362
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:252
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:270
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:288
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:324
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:308
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3496
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createFirstActiveLane(ArrayRef< VPValue * > Masks, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAdd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false})
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPInstruction * createLogicalOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1653
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={}, Type *ResultTy=nullptr)
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
static VPSingleDefRecipe * createSingleScalarOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPValue *Mask, const VPIRFlags &Flags, const VPIRMetadata &Metadata, DebugLoc DL, Instruction *UV)
Create a single-scalar recipe with Opcode and Operands without inserting it.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:4081
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:561
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:534
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:546
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:556
A recipe for converting the input IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4182
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B) const
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3541
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2436
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2483
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2472
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2163
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4557
Class to record and manage LLVM IR flags.
Definition VPlan.h:695
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlagsOrNone() const
void dropPoisonGeneratingFlags()
Drop all poison-generating flags.
Definition VPlan.h:892
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1171
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1226
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1473
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1319
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1315
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1261
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1272
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3109
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3101
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3130
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3182
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:3140
void addIncoming(VPValue *IncomingV)
Append IncomingV as an incoming value to the phi-like recipe.
Definition VPlan.h:1665
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3707
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:402
VPBasicBlock * getParent()
Definition VPlan.h:477
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should be widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPSingleDefRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a replicating or single-scalar recipe for VPI.
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:337
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3353
A recipe for handling reduction phis.
Definition VPlan.h:2864
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2915
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2908
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2921
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3233
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4614
const VPBlockBase * getEntry() const
Definition VPlan.h:4658
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4690
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:857
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4675
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4734
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4742
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4726
const VPBlockBase * getExiting() const
Definition VPlan.h:4670
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4683
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3398
bool isSingleScalar() const
Definition VPlan.h:3456
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
operand_range operandsWithoutMask()
Return the recipe's operands, excluding the mask of a predicated recipe.
Definition VPlan.h:3481
bool isPredicated() const
Definition VPlan.h:3458
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3475
Lightweight SCEV-to-VPlan expander.
Definition VPlanUtils.h:190
VPValue * tryToExpand(const SCEV *S)
Try to expand S into recipes and live-ins using the builder.
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4249
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:609
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:680
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384
operand_range operands()
Definition VPlanValue.h:457
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:430
unsigned getNumOperands() const
Definition VPlanValue.h:424
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:425
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1478
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:163
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:208
VPUser * getSingleUser()
Return the single user of this value, or nullptr if there is not exactly one user.
Definition VPlanValue.h:178
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1481
unsigned getNumUsers() const
Definition VPlanValue.h:115
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1487
user_range users()
Definition VPlanValue.h:157
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2266
A recipe for widening Call instructions using library calls.
Definition VPlan.h:2097
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1878
Instruction::CastOps getOpcode() const
Definition VPlan.h:1914
A recipe for handling GEP instructions.
Definition VPlan.h:2206
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2516
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2564
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2582
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2567
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2587
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2623
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2670
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2674
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2685
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2696
A recipe for widening vector intrinsics.
Definition VPlan.h:1925
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
A common mixin class for widening memory operations.
Definition VPlan.h:3743
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
A recipe for widened phis.
Definition VPlan.h:2754
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1817
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1838
unsigned getOpcode() const
Definition VPlan.h:1857
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4762
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:5087
bool hasVF(ElementCount VF) const
Definition VPlan.h:4985
const DataLayout & getDataLayout() const
Definition VPlan.h:4967
LLVMContext & getContext() const
Definition VPlan.h:4963
VPBasicBlock * getEntry()
Definition VPlan.h:4858
bool hasScalableVF() const
Definition VPlan.h:4986
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4921
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4942
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4992
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:5058
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4961
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:5064
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:5136
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:5090
bool hasUF(unsigned UF) const
Definition VPlan.h:5010
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4911
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4951
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4948
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:5035
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:5061
void setVF(ElementCount VF)
Definition VPlan.h:4973
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:5026
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1068
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:5013
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4935
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4887
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:5113
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:5055
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4863
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4958
bool hasScalarVFOnly() const
Definition VPlan.h:5003
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4901
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4954
void setUF(unsigned UF)
Definition VPlan.h:5018
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:5168
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1224
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:5069
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2798
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:190
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignores it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
SelectLike_match< CondTy, LTy, RTy > m_SelectLike(const CondTy &C, const LTy &TrueC, const RTy &FalseC)
Matches a value that behaves like a boolean-controlled select.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
canonical_widen_iv_match m_CanonicalWidenIV()
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
auto m_AnyNeg(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPValue * findIncomingAliasMask(const VPlan &Plan)
Finds the incoming alias-mask within the vector preheader.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:140
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) Note: If ...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
LLVM_ABI_FOR_TEST std::optional< VPValue * > getRecipesForUncountableExit(SmallVectorImpl< VPInstruction * > &Recipes, SmallVectorImpl< VPInstruction * > &GEPs, VPBasicBlock *LatchVPBB)
Returns the VPValue representing the uncountable exit comparison used by AnyOf if the recipes it depe...
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
SmallVector< VPBasicBlock * > vp_rpo_plain_cfg_loop_body(VPBasicBlock *Header)
Returns the VPBasicBlocks forming the loop body of a plain (pre-region) VPlan in reverse post-order s...
Definition VPlanCFG.h:265
@ Offset
Definition DWP.cpp:558
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179
void stable_sort(R &&Range)
Definition STLExtras.h:2115
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2077
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
constexpr auto bind_back(FnT &&Fn, BindArgsT &&...BindArgs)
C++23 bind_back.
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:288
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
constexpr size_t range_size(R &&Range)
Returns the size of the Range, i.e., the number of elements.
Definition STLExtras.h:1693
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:79
@ MaskedHandleExitInScalarLoop
All memory operations other than the load(s) required to determine whether an uncountable exit occurr...
Definition VPlan.h:89
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:551
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1850
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2087
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1408
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2165
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:305
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2145
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT, AssumptionCache *AC=nullptr, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Return true if we can prove that the given load (which is assumed to be within the specified loop) wo...
Definition Loads.cpp:300
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:285
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
VPBasicBlock * EarlyExitingVPBB
VPIRBasicBlock * EarlyExitVPBB
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2846
Holds the VFShape for a specific scalar to vector function mapping.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1946
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
PredicatedScalarEvolution & PSE
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
TargetTransformInfo::TargetCostKind CostKind
const TargetLibraryInfo & TLI
const TargetTransformInfo & TTI
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:246
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:147
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:286
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:297
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3857
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3807
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3960
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3906
static VPValue * materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheckVPBB, ArrayRef< PointerDiffInfo > DiffChecks)
Materializes the alias mask within the AliasCheckVPBB block.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void expandSCEVsToVPInstructions(VPlan &Plan, ScalarEvolution &SE)
Try to expand VPExpandSCEVRecipes in Plan's entry block to VPInstructions.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static void materializeAliasMaskCheckBlock(VPlan &Plan, ArrayRef< PointerDiffInfo > DiffChecks, bool HasBranchWeights)
Materializes the alias mask within a check block before the loop.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand remaining VPExpandSCEVRecipes in Plan's entry block using SCEVExpander.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void attachAliasMaskToHeaderMask(VPlan &Plan)
Attaches the alias-mask to the existing header-mask.
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decision prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses of the canonical ...
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap, const VPDominatorTree &VPDT)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void convertToStridedAccesses(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L, VPCostContext &Ctx, VFRange &Range)
Transform widen memory recipes into strided access recipes when legal and profitable.
static bool handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void attachVPCheckBlock(VPlan &Plan, VPValue *Cond, VPBasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...