LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
// Convert VPInstructions that wrap underlying IR instructions into the
// corresponding widened recipes (widen-phi, widen-load/store, widen-GEP,
// widen-intrinsic, widen-cast, or generic widen). Returns false if a call
// cannot be widened as a vector intrinsic, true otherwise.
// NOTE(review): the leading line(s) of this definition (return type and
// function name) were lost in extraction — presumably
// VPlanTransforms::tryToConvertVPInstructionsToVPRecipes; confirm against
// the original source.
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
// Only convert up to (not including) the block terminator, if any.
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
// Only recipes created from an underlying IR value are converted.
66 if (!VPV->getUnderlyingValue())
67 continue;
68
// NOTE(review): a line defining `Inst` from VPV's underlying value
// appears to be missing from this extraction (`Inst` is used below).
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
75 for (VPValue *Op : PhiR->operands())
76 NewRecipe->addOperand(Op);
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, false /*Reverse*/, *VPI,
84 Ingredient.getDebugLoc());
85 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
86 NewRecipe = new VPWidenStoreRecipe(
87 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
88 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
89 Ingredient.getDebugLoc());
// NOTE(review): the GEP dyn_cast branch line is missing from this
// extraction (`GEP` is used below).
91 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc());
93 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
// Abort the whole conversion if this call has no vector intrinsic.
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
97 NewRecipe = new VPWidenIntrinsicRecipe(
98 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
99 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
100 *VPI, CI->getDebugLoc());
101 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
102 NewRecipe = new VPWidenCastRecipe(
103 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
104 VPIRFlags(*CI), VPIRMetadata(*CI));
105 } else {
106 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
107 *VPI, Ingredient.getDebugLoc());
108 }
109 } else {
// NOTE(review): the assertion's condition line is missing from this
// extraction; only its message string survives.
111 "inductions must be created earlier");
112 continue;
113 }
114
// Splice the new recipe in, rewire users, and drop the old ingredient.
115 NewRecipe->insertBefore(&Ingredient);
116 if (NewRecipe->getNumDefinedValues() == 1)
117 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
118 else
119 assert(NewRecipe->getNumDefinedValues() == 0 &&
120 "Only recpies with zero or one defined values expected");
121 Ingredient.eraseFromParent();
122 }
123 }
124 return true;
125}
126
127/// Helper for extra no-alias checks via known-safe recipe and SCEV.
/// NOTE(review): the struct/class declaration line and the PSE member
/// declaration were lost in extraction — confirm the member list against
/// the original source.
// Recipes already known safe, skipped during alias checking.
129 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
// Leader of the store group being sunk; candidates are compared to it.
130 VPReplicateRecipe &GroupLeader;
132 const Loop &L;
133 VPTypeAnalysis &TypeInfo;
134
135 // Return true if \p A and \p B are known to not alias for all VFs in the
136 // plan, checked via the distance between the accesses
137 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
// Only store/store pairs are handled; anything else is conservatively
// treated as possibly aliasing.
138 if (A->getOpcode() != Instruction::Store ||
139 B->getOpcode() != Instruction::Store)
140 return false;
141
// For stores, operand 1 is the address.
142 VPValue *AddrA = A->getOperand(1);
143 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
144 VPValue *AddrB = B->getOperand(1);
145 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
// NOTE(review): the condition guarding this bail-out (likely a
// SCEVCouldNotCompute check on SCEVA/SCEVB) is missing from this
// extraction.
147 return false;
148
// The distance between the two addresses must be a compile-time constant.
149 const APInt *Distance;
150 ScalarEvolution &SE = *PSE.getSE();
151 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
152 return false;
153
154 const DataLayout &DL = SE.getDataLayout();
155 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
156 uint64_t SizeA = DL.getTypeStoreSize(TyA);
157 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
158 uint64_t SizeB = DL.getTypeStoreSize(TyB);
159
160 // Use the maximum store size to ensure no overlap from either direction.
161 // Currently only handles fixed sizes, as it is only used for
162 // replicating VPReplicateRecipes.
163 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
164
165 auto VFs = B->getParent()->getPlan()->vectorFactors();
// NOTE(review): the line computing `MaxVF` from VFs is missing from this
// extraction. Scalable VFs are rejected below because no fixed distance
// bound can be computed for them.
167 if (MaxVF.isScalable())
168 return false;
// No alias if the accesses are at least MaxVF * MaxStoreSize bytes apart.
169 return Distance->abs().uge(
170 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
171 }
172
173public:
// NOTE(review): the constructor name line and its first parameters are
// missing from this extraction.
176 const Loop &L, VPTypeAnalysis &TypeInfo)
177 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
178 L(L), TypeInfo(TypeInfo) {}
179
180 /// Return true if \p R should be skipped during alias checking, either
181 /// because it's in the exclude set or because no-alias can be proven via
182 /// SCEV.
183 bool shouldSkip(VPRecipeBase &R) const {
184 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
185 return ExcludeRecipes.contains(&R) ||
186 (Store && isNoAliasViaDistance(Store, &GroupLeader));
187 }
188};
189
190/// Check if a memory operation doesn't alias with memory operations in blocks
191/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
192/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
193/// checked (for load hoisting). Otherwise recipes that both read and write
194/// memory are checked, and SCEV is used to prove no-alias between the group
195/// leader and other replicate recipes (for store sinking).
196static bool
// NOTE(review): the line carrying the function name and leading
// parameters (including the MemoryLocation `MemLoc` used below) is
// missing from this extraction.
198 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
199 std::optional<SinkStoreInfo> SinkInfo = {}) {
200 bool CheckReads = SinkInfo.has_value();
// Without a noalias scope there is nothing to compare against.
201 if (!MemLoc.AATags.Scope)
202 return false;
203
// Walk the single-successor chain of blocks from FirstBB to LastBB.
204 for (VPBlockBase *Block = FirstBB; Block;
205 Block = Block->getSingleSuccessor()) {
206 assert(Block->getNumSuccessors() <= 1 &&
207 "Expected at most one successor in block chain");
208 auto *VPBB = cast<VPBasicBlock>(Block);
209 for (VPRecipeBase &R : *VPBB) {
210 if (SinkInfo && SinkInfo->shouldSkip(R))
211 continue;
212
213 // Skip recipes that don't need checking.
214 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
215 continue;
216
// NOTE(review): the line computing `Loc` (this recipe's memory location)
// is missing from this extraction.
218 if (!Loc)
219 // Conservatively assume aliasing for memory operations without
220 // location.
221 return false;
222
// NOTE(review): the aliasing condition preceding this bail-out is
// missing from this extraction.
224 return false;
225 }
226
227 if (Block == LastBB)
228 break;
229 }
230 return true;
231}
232
233/// Collect either replicated Loads or Stores grouped by their address SCEV.
234template <unsigned Opcode>
// NOTE(review): the return type and function name lines are missing from
// this extraction.
237 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
238 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
239 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
240 "Only Load and Store opcodes supported");
241 constexpr bool IsLoad = (Opcode == Instruction::Load);
// NOTE(review): the declaration line of `RecipesByAddress` (a map keyed
// by the address SCEV, per its uses below) is truncated in this
// extraction.
243 RecipesByAddress;
244 for (VPBlockBase *Block :
// NOTE(review): the iteration range of this loop is missing from this
// extraction.
246 auto *VPBB = cast<VPBasicBlock>(Block);
247 for (VPRecipeBase &R : *VPBB) {
// Only replicate recipes of the requested opcode that pass the caller's
// filter are collected.
248 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
249 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
250 continue;
251
252 // For loads, operand 0 is address; for stores, operand 1 is address.
253 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
254 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
255 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
256 RecipesByAddress[AddrSCEV].push_back(RepR);
257 }
258 }
259 auto Groups = to_vector(RecipesByAddress.values());
260 VPDominatorTree VPDT(Plan);
261 for (auto &Group : Groups) {
262 // Sort mem ops by dominance order, with earliest (most dominating) first.
// NOTE(review): the sort call line is missing from this extraction; only
// the comparator body survives.
264 return VPDT.properlyDominates(A, B);
265 });
266 }
267 return Groups;
268}
269
270/// Return true if we do not know how to (mechanically) hoist or sink \p R out
271/// of a loop region.
// NOTE(review): the function signature line (presumably
// cannotHoistOrSinkRecipe(const VPRecipeBase &R)) is missing from this
// extraction, as is the condition guarding the first early return —
// per the comment below, likely a check for assume intrinsics.
273 // Assumes don't alias anything or throw; as long as they're guaranteed to
274 // execute, they're safe to hoist.
276 return false;
277
278 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
279 // memory location is not modified in the vector loop.
280 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
281 return true;
282
283 // Allocas cannot be hoisted.
284 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
285 return RepR && RepR->getOpcode() == Instruction::Alloca;
286}
287
/// Sink scalar recipes that are only used inside replicate regions into
/// those regions, duplicating a recipe when it still has first-lane-only
/// users outside the target block. Returns true if anything was moved.
288static bool sinkScalarOperands(VPlan &Plan) {
289 auto Iter = vp_depth_first_deep(Plan.getEntry());
290 bool ScalarVFOnly = Plan.hasScalarVFOnly();
291 bool Changed = false;
292
// NOTE(review): the WorkList declaration (a set-vector of
// {VPBasicBlock *, VPSingleDefRecipe *} pairs, per its uses below) is
// missing from this extraction.
294 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
295 VPBasicBlock *SinkTo, VPValue *Op) {
296 auto *Candidate =
297 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
298 if (!Candidate)
299 return;
300
301 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
302 // for now.
// NOTE(review): the isa<> check implementing the restriction above is
// missing from this extraction.
304 return;
305
306 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
307 return;
308
// Single-scalar replicate recipes are only sunk when the plan is
// scalar-VF-only.
309 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
310 if (!ScalarVFOnly && RepR->isSingleScalar())
311 return;
312
313 WorkList.insert({SinkTo, Candidate});
314 };
315
316 // First, collect the operands of all recipes in replicate blocks as seeds for
317 // sinking.
// NOTE(review): the loop header iterating over the plan's region blocks
// (defining `VPR`) is missing from this extraction.
319 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
320 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
321 continue;
322 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
323 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
324 continue;
325 for (auto &Recipe : *VPBB)
326 for (VPValue *Op : Recipe.operands())
327 InsertIfValidSinkCandidate(VPBB, Op);
328 }
329
330 // Try to sink each replicate or scalar IV steps recipe in the worklist.
331 for (unsigned I = 0; I != WorkList.size(); ++I) {
332 VPBasicBlock *SinkTo;
333 VPSingleDefRecipe *SinkCandidate;
334 std::tie(SinkTo, SinkCandidate) = WorkList[I];
335
336 // All recipe users of SinkCandidate must be in the same block SinkTo or all
337 // users outside of SinkTo must only use the first lane of SinkCandidate. In
338 // the latter case, we need to duplicate SinkCandidate.
339 auto UsersOutsideSinkTo =
340 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
341 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
342 });
343 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
344 return !U->usesFirstLaneOnly(SinkCandidate);
345 }))
346 continue;
347 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
348
349 if (NeedsDuplicating) {
350 if (ScalarVFOnly)
351 continue;
352 VPSingleDefRecipe *Clone;
353 if (auto *SinkCandidateRepR =
354 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
355 // TODO: Handle converting to uniform recipes as separate transform,
356 // then cloning should be sufficient here.
357 Instruction *I = SinkCandidate->getUnderlyingInstr();
358 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
359 nullptr /*Mask*/, *SinkCandidateRepR,
360 *SinkCandidateRepR);
361 // TODO: add ".cloned" suffix to name of Clone's VPValue.
362 } else {
363 Clone = SinkCandidate->clone();
364 }
365
// The clone stays outside SinkTo and serves the first-lane-only users.
366 Clone->insertBefore(SinkCandidate);
367 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
368 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
369 });
370 }
371 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
// Operands of the sunk recipe become new sink candidates.
372 for (VPValue *Op : SinkCandidate->operands())
373 InsertIfValidSinkCandidate(SinkTo, Op);
374 Changed = true;
375 }
376 return Changed;
377}
378
379/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
380/// the mask.
// NOTE(review): the signature line (presumably
// static VPValue *getPredicatedMask(VPRegionBlock *R)) is missing from
// this extraction.
382 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
// The entry block must contain exactly the branch-on-mask recipe.
383 if (!EntryBB || EntryBB->size() != 1 ||
384 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
385 return nullptr;
386
387 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
388}
389
390/// If \p R is a triangle region, return the 'then' block of the triangle.
// NOTE(review): the signature line (presumably
// static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R)) is
// missing from this extraction.
392 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
393 if (EntryBB->getNumSuccessors() != 2)
394 return nullptr;
395
396 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
397 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
398 if (!Succ0 || !Succ1)
399 return nullptr;
400
// Exactly one of the successors must fall through to the other, forming
// a triangle; that fall-through block is the 'then' block.
401 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
402 return nullptr;
403 if (Succ0->getSingleSuccessor() == Succ1)
404 return Succ0;
405 if (Succ1->getSingleSuccessor() == Succ0)
406 return Succ1;
407 return nullptr;
408}
409
410// Merge replicate regions in their successor region, if a replicate region
411// is connected to a successor replicate region with the same predicate by a
412// single, empty VPBasicBlock.
// NOTE(review): the function signature line (returning bool per the
// final statement, taking VPlan &Plan) and the WorkList declaration plus
// the loop header defining `Region1` are missing from this extraction.
414 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
415
416 // Collect replicate regions followed by an empty block, followed by another
417 // replicate region with matching masks to process front. This is to avoid
418 // iterator invalidation issues while merging regions.
421 vp_depth_first_deep(Plan.getEntry()))) {
422 if (!Region1->isReplicator())
423 continue;
424 auto *MiddleBasicBlock =
425 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
426 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
427 continue;
428
429 auto *Region2 =
430 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
431 if (!Region2 || !Region2->isReplicator())
432 continue;
433
// Both regions must be guarded by the same mask to be mergeable.
434 VPValue *Mask1 = getPredicatedMask(Region1);
435 VPValue *Mask2 = getPredicatedMask(Region2);
436 if (!Mask1 || Mask1 != Mask2)
437 continue;
438
439 assert(Mask1 && Mask2 && "both region must have conditions");
440 WorkList.push_back(Region1);
441 }
442
443 // Move recipes from Region1 to its successor region, if both are triangles.
444 for (VPRegionBlock *Region1 : WorkList) {
445 if (TransformedRegions.contains(Region1))
446 continue;
447 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
448 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
449
450 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
451 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
452 if (!Then1 || !Then2)
453 continue;
454
455 // Note: No fusion-preventing memory dependencies are expected in either
456 // region. Such dependencies should be rejected during earlier dependence
457 // checks, which guarantee accesses can be re-ordered for vectorization.
458 //
459 // Move recipes to the successor region.
460 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
461 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
462
463 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
464 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
465
466 // Move VPPredInstPHIRecipes from the merge block to the successor region's
467 // merge block. Update all users inside the successor region to use the
468 // original values.
469 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
470 VPValue *PredInst1 =
471 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
472 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
473 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
474 return cast<VPRecipeBase>(&U)->getParent() == Then2;
475 });
476
477 // Remove phi recipes that are unused after merging the regions.
478 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
479 Phi1ToMove.eraseFromParent();
480 continue;
481 }
482 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
483 }
484
485 // Remove the dead recipes in Region1's entry block.
486 for (VPRecipeBase &R :
487 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
488 R.eraseFromParent();
489
490 // Finally, remove the first region.
// Region1's predecessors are rewired directly to the middle block,
// splicing the now-empty region out of the CFG.
491 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
492 VPBlockUtils::disconnectBlocks(Pred, Region1);
493 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
494 }
495 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
496 TransformedRegions.insert(Region1);
497 }
498
499 return !TransformedRegions.empty();
500}
501
// Build a predicated if-then replicate region around \p PredRecipe:
// entry (branch-on-mask) -> if (unmasked replicate) -> continue (phi).
// NOTE(review): the first signature line (return type, name and the
// PredRecipe parameter) is missing from this extraction.
503 VPlan &Plan) {
504 Instruction *Instr = PredRecipe->getUnderlyingInstr();
505 // Build the triangular if-then region.
506 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
507 assert(Instr->getParent() && "Predicated instruction not in any basic block");
508 auto *BlockInMask = PredRecipe->getMask();
509 auto *MaskDef = BlockInMask->getDefiningRecipe();
510 auto *BOMRecipe = new VPBranchOnMaskRecipe(
511 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
512 auto *Entry =
513 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
514
515 // Replace predicated replicate recipe with a replicate recipe without a
516 // mask but in the replicate region.
517 auto *RecipeWithoutMask = new VPReplicateRecipe(
518 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
519 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
520 PredRecipe->getDebugLoc());
521 auto *Pred =
522 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
523
// Only create the merge phi when the predicated value has users.
524 VPPredInstPHIRecipe *PHIRecipe = nullptr;
525 if (PredRecipe->getNumUsers() != 0) {
526 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
527 RecipeWithoutMask->getDebugLoc());
528 PredRecipe->replaceAllUsesWith(PHIRecipe);
529 PHIRecipe->setOperand(0, RecipeWithoutMask);
530 }
531 PredRecipe->eraseFromParent();
532 auto *Exiting =
533 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
// NOTE(review): the line binding `Region` (returned below) to the
// created replicate region is truncated in this extraction.
535 Plan.createReplicateRegion(Entry, Exiting, RegionName);
536
537 // Note: first set Entry as region entry and then connect successors starting
538 // from it in order, to propagate the "parent" of each VPBasicBlock.
539 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
540 VPBlockUtils::connectBlocks(Pred, Exiting);
541
542 return Region;
543}
544
/// Wrap each predicated VPReplicateRecipe in its own if-then replicate
/// region, splitting the containing block around the recipe.
545static void addReplicateRegions(VPlan &Plan) {
// NOTE(review): the WorkList declaration and the loop header iterating
// over the plan's blocks (defining `VPBB`) are missing from this
// extraction.
548 vp_depth_first_deep(Plan.getEntry()))) {
549 for (VPRecipeBase &R : *VPBB)
550 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
551 if (RepR->isPredicated())
552 WorkList.push_back(RepR);
553 }
554 }
555
556 unsigned BBNum = 0;
557 for (VPReplicateRecipe *RepR : WorkList) {
558 VPBasicBlock *CurrentBlock = RepR->getParent();
// Split so that RepR begins the tail block; the region is inserted at
// the split point.
559 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
560
// The tail block inherits a name derived from the original IR block.
561 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
562 SplitBlock->setName(
563 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
564 // Record predicated instructions for above packing optimizations.
// NOTE(review): the lines creating `Region` (via createReplicateRegion)
// and inserting it between CurrentBlock and SplitBlock are partially
// missing from this extraction.
566 Region->setParent(CurrentBlock->getParent());
568
// Keep the enclosing region's exiting block up to date after the split.
569 VPRegionBlock *ParentRegion = Region->getParent();
570 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
571 ParentRegion->setExiting(SplitBlock);
572 }
573}
574
575/// Remove redundant VPBasicBlocks by merging them into their predecessor if
576/// the predecessor has a single successor.
// NOTE(review): the function signature line (returning bool per the
// final statement, taking VPlan &Plan), the WorkList declaration and the
// loop header defining `VPBB` are missing from this extraction.
580 vp_depth_first_deep(Plan.getEntry()))) {
581 // Don't fold the blocks in the skeleton of the Plan into their single
582 // predecessors for now.
583 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
584 if (!VPBB->getParent())
585 continue;
586 auto *PredVPBB =
587 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
588 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
589 isa<VPIRBasicBlock>(PredVPBB))
590 continue;
591 WorkList.push_back(VPBB);
592 }
593
594 for (VPBasicBlock *VPBB : WorkList) {
595 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
// Move all recipes into the predecessor, then splice VPBB out of the CFG.
596 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
597 R.moveBefore(*PredVPBB, PredVPBB->end());
598 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
599 auto *ParentRegion = VPBB->getParent();
600 if (ParentRegion && ParentRegion->getExiting() == VPBB)
601 ParentRegion->setExiting(PredVPBB);
602 for (auto *Succ : to_vector(VPBB->successors())) {
// NOTE(review): the disconnect of VPBB from Succ is missing from this
// extraction.
604 VPBlockUtils::connectBlocks(PredVPBB, Succ);
605 }
606 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
607 }
608 return !WorkList.empty();
609}
610
// NOTE(review): the enclosing function's signature line and the call
// converting masked replicates to regions (per the comment, likely
// addReplicateRegions(Plan)) are missing from this extraction.
612 // Convert masked VPReplicateRecipes to if-then region blocks.
614
// Iterate sinking, region merging and block merging to a fixed point.
615 bool ShouldSimplify = true;
616 while (ShouldSimplify) {
617 ShouldSimplify = sinkScalarOperands(Plan);
618 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
619 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
620 }
621}
622
623/// Remove redundant casts of inductions.
624///
625/// Such redundant casts are casts of induction variables that can be ignored,
626/// because we already proved that the casted phi is equal to the uncasted phi
627/// in the vectorized loop. There is no need to vectorize the cast - the same
628/// value can be used for both the phi and casts in the vector loop.
// NOTE(review): the function signature line and the dyn_cast defining
// `IV` from Phi (used immediately below) are missing from this
// extraction.
630 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
632 if (!IV || IV->getTruncInst())
633 continue;
634
635 // A sequence of IR Casts has potentially been recorded for IV, which
636 // *must be bypassed* when the IV is vectorized, because the vectorized IV
637 // will produce the desired casted value. This sequence forms a def-use
638 // chain and is provided in reverse order, ending with the cast that uses
639 // the IV phi. Search for the recipe of the last cast in the chain and
640 // replace it with the original IV. Note that only the final cast is
641 // expected to have users outside the cast-chain and the dead casts left
642 // over will be cleaned up later.
643 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
644 VPValue *FindMyCast = IV;
// Walk the cast chain outwards from the phi to locate the final cast.
645 for (Instruction *IRCast : reverse(Casts)) {
646 VPSingleDefRecipe *FoundUserCast = nullptr;
647 for (auto *U : FindMyCast->users()) {
648 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
649 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
650 FoundUserCast = UserCast;
651 break;
652 }
653 }
654 FindMyCast = FoundUserCast;
655 }
656 FindMyCast->replaceAllUsesWith(IV);
657 }
658}
659
660/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
661/// recipe, if it exists.
// NOTE(review): the function signature line and the dyn_cast inside the
// loop below that assigns `WidenNewIV` are missing from this extraction.
663 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
664 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
// Find a widened canonical IV among the canonical IV's users, if any.
665 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
666 for (VPUser *U : CanonicalIV->users()) {
668 if (WidenNewIV)
669 break;
670 }
671
672 if (!WidenNewIV)
673 return;
674
675 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
676 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
677 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
678
679 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
680 continue;
681
682 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
683 // everything WidenNewIV's users need. That is, WidenOriginalIV will
684 // generate a vector phi or all users of WidenNewIV demand the first lane
685 // only.
686 if (Plan.hasScalarVFOnly() ||
687 !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
688 vputils::onlyFirstLaneUsed(WidenNewIV)) {
689 // We are replacing a wide canonical iv with a suitable wide induction.
690 // This is used to compute header mask, hence all lanes will be used and
691 // we need to drop wrap flags only applying to lanes guranteed to execute
692 // in the original scalar loop.
693 WidenOriginalIV->dropPoisonGeneratingFlags();
694 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
695 WidenNewIV->eraseFromParent();
696 return;
697 }
698 }
699}
700
701/// Returns true if \p R is dead and can be removed.
702static bool isDeadRecipe(VPRecipeBase &R) {
703 // Do remove conditional assume instructions as their conditions may be
704 // flattened.
705 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
// NOTE(review): the second half of this condition (per the comment
// above, likely a check that RepR's underlying instruction is an assume
// intrinsic) is missing from this extraction.
706 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
708 if (IsConditionalAssume)
709 return true;
710
711 if (R.mayHaveSideEffects())
712 return false;
713
714 // Recipe is dead if no user keeps the recipe alive.
715 return all_of(R.definedValues(),
716 [](VPValue *V) { return V->getNumUsers() == 0; });
717}
718
// Erase dead recipes plan-wide, plus dead VPPhi<->update cycles.
// NOTE(review): the function signature line and the loop header over the
// plan's blocks (defining `VPBB`) are missing from this extraction.
721 vp_post_order_deep(Plan.getEntry()))) {
722 // The recipes in the block are processed in reverse order, to catch chains
723 // of dead recipes.
724 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
725 if (isDeadRecipe(R)) {
726 R.eraseFromParent();
727 continue;
728 }
729
730 // Check if R is a dead VPPhi <-> update cycle and remove it.
731 auto *PhiR = dyn_cast<VPPhi>(&R);
732 if (!PhiR || PhiR->getNumOperands() != 2)
733 continue;
734 VPUser *PhiUser = PhiR->getSingleUser();
735 if (!PhiUser)
736 continue;
// The phi's only user must be the recipe defining its second incoming
// value, and that value must have no other users — i.e. the pair forms
// a closed cycle with no external consumers.
737 VPValue *Incoming = PhiR->getOperand(1);
738 if (PhiUser != Incoming->getDefiningRecipe() ||
739 Incoming->getNumUsers() != 1)
740 continue;
741 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
742 PhiR->eraseFromParent();
743 Incoming->getDefiningRecipe()->eraseFromParent();
744 }
745 }
746}
747
// Create scalar IV steps derived from the canonical IV ("offset.idx"),
// truncating the base IV and/or step when the requested result type is
// narrower than what type inference produces.
// NOTE(review): the leading signature lines (return type, name and the
// first parameters, including `Kind` used below) are missing from this
// extraction.
750 Instruction::BinaryOps InductionOpcode,
751 FPMathOperator *FPBinOp, Instruction *TruncI,
752 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
753 VPBuilder &Builder) {
754 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
755 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
756 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
757 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
758 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
759
760 // Truncate base induction if needed.
761 VPTypeAnalysis TypeInfo(Plan);
762 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
763 if (TruncI) {
764 Type *TruncTy = TruncI->getType();
765 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
766 "Not truncating.");
767 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
768 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
769 ResultTy = TruncTy;
770 }
771
772 // Truncate step if needed.
773 Type *StepTy = TypeInfo.inferScalarType(Step);
774 if (ResultTy != StepTy) {
775 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
776 "Not truncating.");
777 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
// The step truncation is emitted in the vector preheader (see insert
// point below) rather than inside the loop.
778 auto *VecPreheader =
// NOTE(review): the right-hand side fetching the vector preheader block
// is missing from this extraction.
780 VPBuilder::InsertPointGuard Guard(Builder);
781 Builder.setInsertPoint(VecPreheader);
782 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
783 }
784 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
785 &Plan.getVF(), DL);
786}
787
// Collect the transitive users of a value's defined values via a
// worklist, stopping the expansion at header phi recipes.
// NOTE(review): the signature line and the initialization of the
// set-vector `Users`, plus the line defining `Cur` from Users[I], are
// missing from this extraction.
790 for (unsigned I = 0; I != Users.size(); ++I) {
792 if (isa<VPHeaderPHIRecipe>(Cur))
793 continue;
794 for (VPValue *V : Cur->definedValues())
795 Users.insert_range(V->users());
796 }
797 return Users.takeVector();
798}
799
800/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
801/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
802/// generates scalar values.
803static VPValue *
// NOTE(review): the function-name line (taking the `PtrIV` recipe), the
// line defining `ID` (its induction descriptor, per the getStep() use
// below), and the call binding `Steps` are missing from this extraction.
805 VPlan &Plan, VPBuilder &Builder) {
807 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
808 VPValue *StepV = PtrIV->getOperand(1);
810 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
811 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
812
// The scalar pointers are the start value advanced by the scalar steps.
813 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
814 PtrIV->getDebugLoc(), "next.gep");
815}
816
817/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
818/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
819/// VPWidenPointerInductionRecipe will generate vectors only. If some users
820/// require vectors while other require scalars, the scalar uses need to extract
821/// the scalars from the generated vectors (Note that this is different to how
822/// int/fp inductions are handled). Legalize extract-from-ends using uniform
823/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
824/// the correct end value is available. Also optimize
825/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
826/// providing them scalar steps built on the canonical scalar IV and update the
827/// original IV's users. This is an optional optimization to reduce the needs of
828/// vector extracts.
// NOTE(review): the function signature line and the initialization of
// `HeaderVPBB` are missing from this extraction.
831 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
832 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
833 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
834 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
835 if (!PhiR)
836 continue;
837
838 // Try to narrow wide and replicating recipes to uniform recipes, based on
839 // VPlan analysis.
840 // TODO: Apply to all recipes in the future, to replace legacy uniformity
841 // analysis.
842 auto Users = collectUsersRecursively(PhiR);
843 for (VPUser *U : reverse(Users)) {
844 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
845 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
846 // Skip recipes that shouldn't be narrowed.
847 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
848 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
849 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
850 continue;
851
852 // Skip recipes that may have other lanes than their first used.
// NOTE(review): the condition implementing the check described above is
// missing from this extraction.
854 continue;
855
// Replace the wide/replicating recipe with a uniform clone.
856 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
857 Def->operands(), /*IsUniform*/ true,
858 /*Mask*/ nullptr, /*Flags*/ *Def);
859 Clone->insertAfter(Def);
860 Def->replaceAllUsesWith(Clone);
861 }
862
863 // Replace wide pointer inductions which have only their scalars used by
864 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
865 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
866 if (!Plan.hasScalarVFOnly() &&
867 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
868 continue;
869
870 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
871 PtrIV->replaceAllUsesWith(PtrAdd);
872 continue;
873 }
874
875 // Replace widened induction with scalar steps for users that only use
876 // scalars.
877 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
878 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
879 return U->usesScalars(WideIV);
880 }))
881 continue;
882
883 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
// NOTE(review): the line binding `Steps` (used below; a scalar-IV-steps
// creation call) is truncated in this extraction.
885 Plan, ID.getKind(), ID.getInductionOpcode(),
886 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
887 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
888 WideIV->getDebugLoc(), Builder);
889
890 // Update scalar users of IV to use Step instead.
891 if (!HasOnlyVectorVFs) {
892 assert(!Plan.hasScalableVF() &&
893 "plans containing a scalar VF cannot also include scalable VFs");
894 WideIV->replaceAllUsesWith(Steps);
895 } else {
896 bool HasScalableVF = Plan.hasScalableVF();
// With scalable VFs only first-lane users may take the scalar steps;
// otherwise all scalar users are rewritten.
897 WideIV->replaceUsesWithIf(Steps,
898 [WideIV, HasScalableVF](VPUser &U, unsigned) {
899 if (HasScalableVF)
900 return U.usesFirstLaneOnly(WideIV);
901 return U.usesScalars(WideIV);
902 });
903 }
904 }
905}
906
907/// Check if \p VPV is an untruncated wide induction, either before or after the
908/// increment. If so return the header IV (before the increment), otherwise
909/// return null.
// NOTE(review): the function signature (original lines 910-911) is elided in
// this listing; from the uses below it takes \p VPV and a
// PredicatedScalarEvolution \p PSE — confirm against the full source.
912 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
913 if (WideIV) {
914 // VPV itself is a wide induction, separately compute the end value for exit
915 // users if it is not a truncated IV.
916 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
917 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
918 }
919
920 // Check if VPV is an optimizable induction increment.
// A candidate increment is a two-operand recipe with a wide induction as
// either operand.
921 VPRecipeBase *Def = VPV->getDefiningRecipe();
922 if (!Def || Def->getNumOperands() != 2)
923 return nullptr;
924 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
925 if (!WideIV)
926 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
927 if (!WideIV)
928 return nullptr;
929
// Lambda: returns true if VPV is exactly "WideIV op Step" for the
// induction's opcode.
930 auto IsWideIVInc = [&]() {
931 auto &ID = WideIV->getInductionDescriptor();
932
933 // Check if VPV increments the induction by the induction step.
934 VPValue *IVStep = WideIV->getStepValue();
935 switch (ID.getInductionOpcode()) {
936 case Instruction::Add:
937 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
938 case Instruction::FAdd:
939 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
940 case Instruction::FSub:
941 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
942 m_Specific(IVStep)));
943 case Instruction::Sub: {
944 // IVStep will be the negated step of the subtraction. Check if Step == -1
945 // * IVStep.
946 VPValue *Step;
947 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
948 return false;
// Use SCEV to prove the subtracted step is the negation of the
// induction step; bail out if either expression is not computable.
949 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
950 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
951 ScalarEvolution &SE = *PSE.getSE();
952 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
953 !isa<SCEVCouldNotCompute>(StepSCEV) &&
954 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
955 }
956 default:
// Pointer inductions are incremented via GEP of the step.
957 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
958 match(VPV, m_GetElementPtr(m_Specific(WideIV),
959 m_Specific(WideIV->getStepValue())));
960 }
961 llvm_unreachable("should have been covered by switch above");
962 };
963 return IsWideIVInc() ? WideIV : nullptr;
964}
965
966/// Attempts to optimize the induction variable exit values for users in the
967/// early exit block.
// NOTE(review): the first signature line (original 968) and the operand match
// (974-975, presumably matching Op as an extract of \p Incoming guarded by
// \p Mask) are elided in this listing — confirm against the full source.
969 VPTypeAnalysis &TypeInfo,
970 VPBlockBase *PredVPBB,
971 VPValue *Op,
973 VPValue *Incoming, *Mask;
976 return nullptr;
977
// Only optimizable (untruncated) wide inductions are handled.
978 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
979 if (!WideIV)
980 return nullptr;
981
982 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
983 if (WideIntOrFp && WideIntOrFp->getTruncInst())
984 return nullptr;
985
986 // Calculate the final index.
987 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
988 auto *CanonicalIV = LoopRegion->getCanonicalIV();
989 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
990 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
991
// The exiting lane is the first active lane of the exit mask; add it to the
// canonical IV to get the iteration at which the early exit was taken.
992 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
993 VPValue *FirstActiveLane =
994 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
995 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
996 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
997 FirstActiveLaneType, DL);
998 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
999
1000 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1001 // changed it means the exit is using the incremented value, so we need to
1002 // add the step.
1003 if (Incoming != WideIV) {
1004 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1005 EndValue = B.createAdd(EndValue, One, DL);
1006 }
1007
// Non-canonical inductions need the index transformed via start/step into
// the induction's own sequence.
1008 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1009 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1010 VPIRValue *Start = WideIV->getStartValue();
1011 VPValue *Step = WideIV->getStepValue();
1012 EndValue = B.createDerivedIV(
1013 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1014 Start, EndValue, Step);
1015 }
1016
1017 return EndValue;
1018}
1019
1020/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1021/// VPDerivedIVRecipe for non-canonical inductions.
// NOTE(review): the first signature line (original 1022, carrying the function
// name and the WideIV parameter) and line 1034 (presumably declaring the
// InductionDescriptor `ID` used below) are elided in this listing — confirm
// against the full source.
1023 VPBuilder &VectorPHBuilder,
1024 VPTypeAnalysis &TypeInfo,
1025 VPValue *VectorTC) {
1026 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1027 // Truncated wide inductions resume from the last lane of their vector value
1028 // in the last vector iteration which is handled elsewhere.
1029 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1030 return nullptr;
1031
1032 VPIRValue *Start = WideIV->getStartValue();
1033 VPValue *Step = WideIV->getStepValue();
// For the canonical IV the end value is the vector trip count itself;
// otherwise derive it from start/step.
1035 VPValue *EndValue = VectorTC;
1036 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1037 EndValue = VectorPHBuilder.createDerivedIV(
1038 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1039 Start, VectorTC, Step);
1040 }
1041
1042 // EndValue is derived from the vector trip count (which has the same type as
1043 // the widest induction) and thus may be wider than the induction here.
1044 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1045 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1046 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1047 ScalarTypeOfWideIV,
1048 WideIV->getDebugLoc());
1049 }
1050
1051 return EndValue;
1052}
1053
1054/// Attempts to optimize the induction variable exit values for users in the
1055/// exit block coming from the latch in the original scalar loop.
// NOTE(review): the signature head (original 1056), the EndValues/PSE
// parameter lines (1058-1059) and the match of \p Op extracting `Incoming`
// (1061) are elided in this listing — confirm against the full source.
1057 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1060 VPWidenInductionRecipe *WideIV = nullptr;
1062 WideIV = getOptimizableIVOf(Incoming, PSE);
1063
1064 if (!WideIV)
1065 return nullptr;
1066
// End values for all wide inductions were precomputed by the caller.
1067 VPValue *EndValue = EndValues.lookup(WideIV);
1068 assert(EndValue && "Must have computed the end value up front");
1069
1070 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1071 // changed it means the exit is using the incremented value, so we don't
1072 // need to subtract the step.
1073 if (Incoming != WideIV)
1074 return EndValue;
1075
1076 // Otherwise, subtract the step from the EndValue.
// The subtraction form depends on the induction's scalar type: integer sub,
// negative pointer-add, or the inverse FP operation with the original
// fast-math flags.
1077 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1078 VPValue *Step = WideIV->getStepValue();
1079 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1080 if (ScalarTy->isIntegerTy())
1081 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1082 if (ScalarTy->isPointerTy()) {
1083 Type *StepTy = TypeInfo.inferScalarType(Step);
1084 auto *Zero = Plan.getZero(StepTy);
1085 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1086 DebugLoc::getUnknown(), "ind.escape");
1087 }
1088 if (ScalarTy->isFloatingPointTy()) {
1089 const auto &ID = WideIV->getInductionDescriptor();
1090 return B.createNaryOp(
1091 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1092 ? Instruction::FSub
1093 : Instruction::FAdd,
1094 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1095 }
1096 llvm_unreachable("all possible induction types must be handled");
1097 return nullptr;
1098}
1099
// Optimize induction exit values for all exit blocks of \p Plan.
// NOTE(review): the first signature line (original 1100, carrying the function
// name), the EndValues map declaration (1107), and two call/condition lines
// (1114, 1143 — the end-value computation call and the early-exit-user call)
// are elided in this listing — confirm against the full source.
1101 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1102 // Compute end values for all inductions.
1103 VPTypeAnalysis TypeInfo(Plan);
1104 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1105 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1106 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
// When tail folding, all iterations execute, so resume from the full trip
// count rather than the vector trip count.
1108 VPValue *ResumeTC =
1109 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1110 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1111 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1112 if (!WideIV)
1113 continue;
1115 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1116 EndValues[WideIV] = EndValue;
1117 }
1118
// Replace middle-block recipes exposing IV exit values with the
// precomputed end values.
1119 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1120 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1121 VPValue *Op;
1122 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1123 continue;
1124 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1125 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1126 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1127 R.eraseFromParent();
1128 }
1129 }
1130
1131 // Then, optimize exit block users.
// Each exit phi operand is optimized per predecessor: latch exits (via the
// middle block) and early exits are handled by separate helpers.
1132 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1133 for (VPRecipeBase &R : ExitVPBB->phis()) {
1134 auto *ExitIRI = cast<VPIRPhi>(&R);
1135
1136 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1137 VPValue *Escape = nullptr;
1138 if (PredVPBB == MiddleVPBB)
1139 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1140 ExitIRI->getOperand(Idx),
1141 EndValues, PSE);
1142 else
1144 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1145 if (Escape)
1146 ExitIRI->setOperand(Idx, Escape);
1147 }
1148 }
1149 }
1150}
1151
1152/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1153/// them with already existing recipes expanding the same SCEV expression.
// NOTE(review): the signature and the SCEV-to-VPValue map declaration
// (original lines 1154-1155, declaring `SCEV2VPV`) and the iterated range
// (1158, presumably the entry block's recipes) are elided in this listing —
// confirm against the full source.
1156
1157 for (VPRecipeBase &R :
1159 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1160 if (!ExpR)
1161 continue;
1162
// First expansion of each SCEV is kept; later duplicates are RAUW'd with
// it and erased.
1163 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1164 if (Inserted)
1165 continue;
1166 ExpR->replaceAllUsesWith(V->second);
1167 ExpR->eraseFromParent();
1168 }
1169}
1170
// Iteratively erase \p V's defining recipe and, transitively, any operands
// whose recipes become dead as a result.
// NOTE(review): the signature (original 1171) and the visited-set declaration
// (1173, declaring `Seen` used below) are elided in this listing — confirm
// against the full source.
1172 SmallVector<VPValue *> WorkList;
1174 WorkList.push_back(V);
1175
1176 while (!WorkList.empty()) {
1177 VPValue *Cur = WorkList.pop_back_val();
// Skip values already visited to avoid reprocessing shared operands.
1178 if (!Seen.insert(Cur).second)
1179 continue;
1180 VPRecipeBase *R = Cur->getDefiningRecipe();
1181 if (!R)
1182 continue;
1183 if (!isDeadRecipe(*R))
1184 continue;
// Queue the operands before erasing; they may become dead once R is gone.
1185 append_range(WorkList, R->operands());
1186 R->eraseFromParent();
1187 }
1188}
1189
1190/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1191/// Returns an optional pair, where the first element indicates whether it is
1192/// an intrinsic ID.
// NOTE(review): the signature line with the function name (original 1194) and
// the first .Case<...> type list (1197-1198) are elided in this listing —
// confirm against the full source.
1193static std::optional<std::pair<bool, unsigned>>
1195 return TypeSwitch<const VPSingleDefRecipe *,
1196 std::optional<std::pair<bool, unsigned>>>(R)
1199 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1200 .Case([](const VPWidenIntrinsicRecipe *I) {
1201 return std::make_pair(true, I->getVectorIntrinsicID());
1202 })
1203 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1204 // For recipes that do not directly map to LLVM IR instructions,
1205 // assign opcodes after the last VPInstruction opcode (which is also
1206 // after the last IR Instruction opcode), based on the VPRecipeID.
1207 return std::make_pair(false,
1208 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1209 })
1210 .Default([](auto *) { return std::nullopt; });
1211}
1212
1213/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1214/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1215/// Operands are foldable live-ins.
// NOTE(review): several original lines are elided in this listing: the
// signature head (1216), the Ops vector declaration (1224), and a few switch
// case labels / argument lines (1251, 1253, 1256, 1269-1270) — confirm
// against the full source.
1217 ArrayRef<VPValue *> Operands,
1218 const DataLayout &DL,
1219 VPTypeAnalysis &TypeInfo) {
1220 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1221 if (!OpcodeOrIID)
1222 return nullptr;
1223
// Collect the underlying IR values; all operands must be live-ins with an
// underlying value, otherwise nothing can be folded.
1225 for (VPValue *Op : Operands) {
1226 if (!match(Op, m_LiveIn()))
1227 return nullptr;
1228 Value *V = Op->getUnderlyingValue();
1229 if (!V)
1230 return nullptr;
1231 Ops.push_back(V);
1232 }
1233
// Dispatch to the matching InstSimplifyFolder entry point; returns nullptr
// for anything it cannot constant-fold.
1234 auto FoldToIRValue = [&]() -> Value * {
1235 InstSimplifyFolder Folder(DL);
1236 if (OpcodeOrIID->first) {
// Only binary intrinsics are supported.
1237 if (R.getNumOperands() != 2)
1238 return nullptr;
1239 unsigned ID = OpcodeOrIID->second;
1240 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1241 TypeInfo.inferScalarType(&R));
1242 }
1243 unsigned Opcode = OpcodeOrIID->second;
1244 if (Instruction::isBinaryOp(Opcode))
1245 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1246 Ops[0], Ops[1]);
1247 if (Instruction::isCast(Opcode))
1248 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1249 TypeInfo.inferScalarType(R.getVPSingleValue()));
1250 switch (Opcode) {
1252 return Folder.FoldSelect(Ops[0], Ops[1],
1254 case VPInstruction::Not:
1255 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1257 case Instruction::Select:
1258 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1259 case Instruction::ICmp:
1260 case Instruction::FCmp:
1261 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1262 Ops[1]);
1263 case Instruction::GetElementPtr: {
1264 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1265 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1266 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1267 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1268 }
// NOTE(review): the case label for this i8-GEP fold (original 1269-1270,
// presumably a VPInstruction pointer-add opcode) is elided here.
1271 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1272 Ops[0], Ops[1],
1273 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1274 // An extract of a live-in is an extract of a broadcast, so return the
1275 // broadcasted element.
1276 case Instruction::ExtractElement:
1277 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1278 return Ops[0];
1279 }
1280 return nullptr;
1281 };
1282
// A successful fold is registered as a live-in of the plan.
1283 if (Value *V = FoldToIRValue())
1284 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1285 return nullptr;
1286}
1287
1288/// Try to simplify VPSingleDefRecipe \p Def.
// NOTE(review): the function signature (original 1289) and a number of
// interior matcher lines are elided in this listing (the embedded line
// numbers skip, e.g. 1363-1365, 1381, 1404-1405, 1424, 1509, 1554, 1581,
// 1598, 1606, 1621-1622, 1628, 1636, 1649, 1686, 1695, 1701-1702); the
// match() expressions at those points are incomplete here — consult the full
// VPlanTransforms.cpp for the exact patterns.
1290 VPlan *Plan = Def->getParent()->getPlan();
1291
1292 // Simplification of live-in IR values for SingleDef recipes using
1293 // InstSimplifyFolder.
1294 const DataLayout &DL = Plan->getDataLayout();
1295 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1296 return Def->replaceAllUsesWith(V);
1297
1298 // Fold PredPHI LiveIn -> LiveIn.
1299 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1300 VPValue *Op = PredPHI->getOperand(0);
1301 if (isa<VPIRValue>(Op))
1302 PredPHI->replaceAllUsesWith(Op);
1303 }
1304
1305 VPBuilder Builder(Def);
1306
1307 // Avoid replacing VPInstructions with underlying values with new
1308 // VPInstructions, as we would fail to create widen/replicate recipes from the
1309 // new VPInstructions without an underlying value, and miss out on some
1310 // transformations that only apply to widened/replicated recipes later, by
1311 // doing so.
1312 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1313 // VPInstructions without underlying values, as those will get skipped during
1314 // cost computation.
1315 bool CanCreateNewRecipe =
1316 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1317
// trunc(ext(A)) folds: drop both casts when types match, otherwise replace
// with a single narrower/wider cast.
1318 VPValue *A;
1319 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1320 Type *TruncTy = TypeInfo.inferScalarType(Def);
1321 Type *ATy = TypeInfo.inferScalarType(A);
1322 if (TruncTy == ATy) {
1323 Def->replaceAllUsesWith(A);
1324 } else {
1325 // Don't replace a non-widened cast recipe with a widened cast.
1326 if (!isa<VPWidenCastRecipe>(Def))
1327 return;
1328 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1329
// Preserve the signedness of the original extension.
1330 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1331 ? Instruction::SExt
1332 : Instruction::ZExt;
1333 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1334 TruncTy);
1335 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1336 // UnderlyingExt has distinct return type, used to retain legacy cost.
1337 Ext->setUnderlyingValue(UnderlyingExt);
1338 }
1339 Def->replaceAllUsesWith(Ext);
1340 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1341 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1342 Def->replaceAllUsesWith(Trunc);
1343 }
1344 }
1345#ifndef NDEBUG
1346 // Verify that the cached type info for both A and its users is still
1347 // accurate by comparing it to freshly computed types.
1348 VPTypeAnalysis TypeInfo2(*Plan);
1349 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1350 for (VPUser *U : A->users()) {
1351 auto *R = cast<VPRecipeBase>(U);
1352 for (VPValue *VPV : R->definedValues())
1353 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1354 }
1355#endif
1356 }
1357
1358 // Simplify (X && Y) | (X && !Y) -> X.
1359 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1360 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1361 // recipes to be visited during simplification.
1362 VPValue *X, *Y, *Z;
1363 if (match(Def,
1366 Def->replaceAllUsesWith(X);
1367 Def->eraseFromParent();
1368 return;
1369 }
1370
1371 // x | AllOnes -> AllOnes
1372 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1373 return Def->replaceAllUsesWith(
1374 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1375
1376 // x | 0 -> x
1377 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1378 return Def->replaceAllUsesWith(X);
1379
1380 // x | !x -> AllOnes
1382 return Def->replaceAllUsesWith(
1383 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1384
1385 // x & 0 -> 0
1386 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1387 return Def->replaceAllUsesWith(
1388 Plan->getZero(TypeInfo.inferScalarType(Def)));
1389
1390 // x & AllOnes -> x
1391 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1392 return Def->replaceAllUsesWith(X);
1393
1394 // x && false -> false
1395 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1396 return Def->replaceAllUsesWith(Plan->getFalse());
1397
1398 // x && true -> x
1399 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1400 return Def->replaceAllUsesWith(X);
1401
1402 // (x && y) | (x && z) -> x && (y | z)
1403 if (CanCreateNewRecipe &&
1406 // Simplify only if one of the operands has one use to avoid creating an
1407 // extra recipe.
1408 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1409 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1410 return Def->replaceAllUsesWith(
1411 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1412
1413 // x && (x && y) -> x && y
1414 if (match(Def, m_LogicalAnd(m_VPValue(X),
1416 return Def->replaceAllUsesWith(Def->getOperand(1));
1417
1418 // x && (y && x) -> x && y
1419 if (match(Def, m_LogicalAnd(m_VPValue(X),
1421 return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1422
1423 // x && !x -> 0
1425 return Def->replaceAllUsesWith(Plan->getFalse());
1426
// select c, x, x -> x
1427 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1428 return Def->replaceAllUsesWith(X);
1429
1430 // select c, false, true -> not c
1431 VPValue *C;
1432 if (CanCreateNewRecipe &&
1433 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1434 return Def->replaceAllUsesWith(Builder.createNot(C));
1435
1436 // select !c, x, y -> select c, y, x
1437 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1438 Def->setOperand(0, C);
1439 Def->setOperand(1, Y);
1440 Def->setOperand(2, X);
1441 return;
1442 }
1443
// a + 0 -> a
1444 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1445 return Def->replaceAllUsesWith(A);
1446
// a * 1 -> a
1447 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1448 return Def->replaceAllUsesWith(A);
1449
// a * 0 -> 0
1450 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1451 return Def->replaceAllUsesWith(
1452 Plan->getZero(TypeInfo.inferScalarType(Def)));
1453
// a * pow2 -> a << log2(pow2)
1454 const APInt *APC;
1455 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1456 APC->isPowerOf2())
1457 return Def->replaceAllUsesWith(Builder.createNaryOp(
1458 Instruction::Shl,
1459 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1460 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1461
1462 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1463 // not allowed in them.
1464 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1465 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1466 if (CanCreateNewRecipe && !IsInReplicateRegion &&
1467 match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1468 return Def->replaceAllUsesWith(Builder.createNaryOp(
1469 Instruction::LShr,
1470 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1471 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1472
1473 if (match(Def, m_Not(m_VPValue(A)))) {
// not (not a) -> a
1474 if (match(A, m_Not(m_VPValue(A))))
1475 return Def->replaceAllUsesWith(A);
1476
1477 // Try to fold Not into compares by adjusting the predicate in-place.
1478 CmpPredicate Pred;
1479 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1480 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
// Only invert in-place when every user of the compare is either a
// negation of it or a select on it, so all users can be adjusted.
1481 if (all_of(Cmp->users(),
1483 m_Not(m_Specific(Cmp)),
1484 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1485 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1486 for (VPUser *U : to_vector(Cmp->users())) {
1487 auto *R = cast<VPSingleDefRecipe>(U);
1488 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1489 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1490 R->setOperand(1, Y);
1491 R->setOperand(2, X);
1492 } else {
1493 // not (cmp pred) -> cmp inv_pred
1494 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1495 R->replaceAllUsesWith(Cmp);
1496 }
1497 }
1498 // If Cmp doesn't have a debug location, use the one from the negation,
1499 // to preserve the location.
1500 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1501 Cmp->setDebugLoc(Def->getDebugLoc());
1502 }
1503 }
1504 }
1505
1506 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1507 // any-of (fcmp uno %A, %B), ...
1508 if (match(Def, m_AnyOf())) {
1510 VPRecipeBase *UnpairedCmp = nullptr;
// Pair up single-use self-unordered compares two at a time into a single
// fcmp uno of both values; anything else is kept as-is.
1511 for (VPValue *Op : Def->operands()) {
1512 VPValue *X;
1513 if (Op->getNumUsers() > 1 ||
1515 m_Deferred(X)))) {
1516 NewOps.push_back(Op);
1517 } else if (!UnpairedCmp) {
1518 UnpairedCmp = Op->getDefiningRecipe();
1519 } else {
1520 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1521 UnpairedCmp->getOperand(0), X));
1522 UnpairedCmp = nullptr;
1523 }
1524 }
1525
// Odd compare left over: keep it unpaired.
1526 if (UnpairedCmp)
1527 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1528
1529 if (NewOps.size() < Def->getNumOperands()) {
1530 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1531 return Def->replaceAllUsesWith(NewAnyOf);
1532 }
1533 }
1534
1535 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1536 // This is useful for fmax/fmin without fast-math flags, where we need to
1537 // check if any operand is NaN.
1538 if (CanCreateNewRecipe &&
1540 m_Deferred(X)),
1542 m_Deferred(Y))))) {
1543 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1544 return Def->replaceAllUsesWith(NewCmp);
1545 }
1546
1547 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1548 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1549 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1550 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1551 TypeInfo.inferScalarType(Def))
1552 return Def->replaceAllUsesWith(Def->getOperand(1));
1553
// NOTE(review): the head of this match (original 1554) is elided; the
// handled pattern presumably involves a widened step of one.
1555 m_One()))) {
1556 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1557 if (TypeInfo.inferScalarType(X) != WideStepTy)
1558 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1559 Def->replaceAllUsesWith(X);
1560 return;
1561 }
1562
1563 // For i1 vp.merges produced by AnyOf reductions:
1564 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1566 m_VPValue(X), m_VPValue())) &&
1568 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1569 Def->setOperand(1, Def->getOperand(0));
1570 Def->setOperand(0, Y);
1571 return;
1572 }
1573
// First-order recurrence phi with identical incoming values is redundant.
1574 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1575 if (Phi->getOperand(0) == Phi->getOperand(1))
1576 Phi->replaceAllUsesWith(Phi->getOperand(0));
1577 return;
1578 }
1579
1580 // Simplify MaskedCond with no block mask to its single operand.
1582 !cast<VPInstruction>(Def)->isMasked())
1583 return Def->replaceAllUsesWith(Def->getOperand(0));
1584
1585 // Look through ExtractLastLane.
1586 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
// From a BuildVector, the last lane is its last operand.
1587 if (match(A, m_BuildVector())) {
1588 auto *BuildVector = cast<VPInstruction>(A);
1589 Def->replaceAllUsesWith(
1590 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1591 return;
1592 }
// With only scalar VFs, the value already is the last (only) lane.
1593 if (Plan->hasScalarVFOnly())
1594 return Def->replaceAllUsesWith(A);
1595 }
1596
1597 // Look through ExtractPenultimateElement (BuildVector ....).
1599 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1600 Def->replaceAllUsesWith(
1601 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1602 return;
1603 }
1604
// Extract of a known lane from a BuildVector -> that operand directly.
1605 uint64_t Idx;
1607 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1608 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1609 return;
1610 }
1611
// BuildVector of identical elements is just a broadcast of one of them.
1612 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1613 Def->replaceAllUsesWith(
1614 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1615 return;
1616 }
1617
1618 // Look through broadcast of single-scalar when used as select conditions; in
1619 // that case the scalar condition can be used directly.
1620 if (match(Def,
1623 "broadcast operand must be single-scalar");
1624 Def->setOperand(0, C);
1625 return;
1626 }
1627
// NOTE(review): the guarding match (original 1628) is elided; only the
// single-operand case is visibly forwarded here.
1629 if (Def->getNumOperands() == 1)
1630 Def->replaceAllUsesWith(Def->getOperand(0));
1631 return;
1632 }
1633
1634 VPIRValue *IRV;
1635 if (Def->getNumOperands() == 1 &&
1637 return Def->replaceAllUsesWith(IRV);
1638
1639 // Some simplifications can only be applied after unrolling. Perform them
1640 // below.
1641 if (!Plan->isUnrolled())
1642 return;
1643
1644 // After unrolling, extract-lane may be used to extract values from multiple
1645 // scalar sources. Only simplify when extracting from a single scalar source.
1646 VPValue *LaneToExtract;
1647 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1648 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1650 return Def->replaceAllUsesWith(A);
1651
1652 // Simplify extract-lane with single source to extract-element.
1653 Def->replaceAllUsesWith(Builder.createNaryOp(
1654 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1655 return;
1656 }
1657
1658 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1659 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && isa<VPIRValue>(Y) &&
1660 isa<VPPhi>(X)) {
1661 auto *Phi = cast<VPPhi>(X);
// Only safe when the phi starts at zero and Def is its sole user.
1662 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1663 Phi->getSingleUser() == Def) {
1664 Phi->setOperand(0, Y);
1665 Def->replaceAllUsesWith(Phi);
1666 return;
1667 }
1668 }
1669
1670 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1671 // just the pointer operand.
1672 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1673 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1674 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1675
1676 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1677 // the start index is zero and only the first lane 0 is demanded.
1678 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1679 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1680 Steps->replaceAllUsesWith(Steps->getOperand(0));
1681 return;
1682 }
1683 }
1684 // Simplify redundant ReductionStartVector recipes after unrolling.
1685 VPValue *StartV;
1687 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
// In-loop reduction phis can use the scalar start value directly.
1688 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1689 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1690 return PhiR && PhiR->isInLoop();
1691 });
1692 return;
1693 }
1694
// NOTE(review): the guarding match (original 1695) is elided; A is forwarded
// directly here.
1696 Def->replaceAllUsesWith(A);
1697 return;
1698 }
1699
// ExtractLastLane of a single-scalar source is the source itself, provided
// all its users only use scalars (original condition lines 1701-1702 are
// partially elided here).
1700 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1703 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1704 all_of(A->users(),
1705 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1706 return Def->replaceAllUsesWith(A);
1707 }
1708
// With a concrete UF of 1, the last part is the only part.
1709 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1710 return Def->replaceAllUsesWith(A);
1711}
1712
// Driver: run simplifyRecipe over every single-def recipe in the plan.
// NOTE(review): the signature and traversal setup (original 1713-1714,
// presumably an RPO traversal over all basic blocks starting at the entry)
// and the per-block loop header (1717) are elided in this listing — confirm
// against the full source.
1715 Plan.getEntry());
1716 VPTypeAnalysis TypeInfo(Plan);
1718 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1719 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1720 simplifyRecipe(Def, TypeInfo);
1721 }
1722}
1723
1724/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1725/// header mask to be simplified further when tail folding, e.g. in
1726/// optimizeEVLMasks.
1727static void reassociateHeaderMask(VPlan &Plan) {
1728 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1729 if (!HeaderMask)
1730 return;
1731
// Seed the worklist with direct logical-and users of the header mask.
// NOTE(review): the push into the worklist (original line 1735) is elided in
// this listing — confirm against the full source.
1732 SmallVector<VPUser *> Worklist;
1733 for (VPUser *U : HeaderMask->users())
1734 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1736
1737 while (!Worklist.empty()) {
1738 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1739 VPValue *X, *Y;
// Only rewrite the nested form ((headermask && x) && y).
1740 if (!R || !match(R, m_LogicalAnd(
1741 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1742 m_VPValue(Y))))
1743 continue;
// The rewritten recipe may itself feed further nested ands; revisit users.
1744 append_range(Worklist, R->users());
1745 VPBuilder Builder(R);
1746 R->replaceAllUsesWith(
1747 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1748 }
1749}
1750
// Narrow wide/replicating recipes to single-scalar recipes where VPlan
// analysis proves only a single scalar is needed.
// NOTE(review): the function signature (original 1751), the block-traversal
// loop header (1759-1760), the recipe-kind filter (1762), the extract-recipe
// creation (1786), and part of an opcode allow-list (1828-1830) are elided in
// this listing — confirm against the full source.
1752 if (Plan.hasScalarVFOnly())
1753 return;
1754
1755 // Try to narrow wide and replicating recipes to single scalar recipes,
1756 // based on VPlan analysis. Only process blocks in the loop region for now,
1757 // without traversing into nested regions, as recipes in replicate regions
1758 // cannot be converted yet.
1761 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1763 VPWidenStoreRecipe>(&R))
1764 continue;
1765 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1766 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1767 continue;
1768
1769 // Convert an unmasked scatter with a uniform address into
1770 // extract-last-lane + scalar store.
1771 // TODO: Add a profitability check comparing the cost of a scatter vs.
1772 // extract + scalar store.
1773 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1774 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1775 !WidenStoreR->isConsecutive()) {
1776 assert(!WidenStoreR->isReverse() &&
1777 "Not consecutive memory recipes shouldn't be reversed");
1778 VPValue *Mask = WidenStoreR->getMask();
1779
1780 // Only convert the scatter to a scalar store if it is unmasked.
1781 // TODO: Support converting scatter masked by the header mask to scalar
1782 // store.
1783 if (Mask)
1784 continue;
1785
1787 {WidenStoreR->getOperand(1)});
1788 Extract->insertBefore(WidenStoreR);
1789
1790 // TODO: Sink the scalar store recipe to middle block if possible.
1791 auto *ScalarStore = new VPReplicateRecipe(
1792 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1793 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1794 *WidenStoreR /*Metadata*/);
1795 ScalarStore->insertBefore(WidenStoreR);
1796 WidenStoreR->eraseFromParent();
1797 continue;
1798 }
1799
// Replicated store to a single-scalar address: store only the last
// lane's value via a single-scalar clone.
1800 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1801 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1802 vputils::isSingleScalar(RepR->getOperand(1))) {
1803 auto *Clone = new VPReplicateRecipe(
1804 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1805 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1806 *RepR /*Metadata*/, RepR->getDebugLoc());
1807 Clone->insertBefore(RepOrWidenR);
1808 VPBuilder Builder(Clone);
1809 VPValue *ExtractOp = Clone->getOperand(0);
1810 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1811 ExtractOp =
1812 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1813 ExtractOp =
1814 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1815 Clone->setOperand(0, ExtractOp);
1816 RepR->eraseFromParent();
1817 continue;
1818 }
1819
1820 // Skip recipes that aren't single scalars.
1821 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1822 continue;
1823
1824 // Predicate to check if a user of Op introduces extra broadcasts.
1825 auto IntroducesBCastOf = [](const VPValue *Op) {
1826 return [Op](const VPUser *U) {
1827 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1831 VPI->getOpcode()))
1832 return false;
1833 }
1834 return !U->usesScalars(Op);
1835 };
1836 };
1837
// Only narrow when no user would need a new broadcast of this recipe,
// and no operand would need one after narrowing either.
1838 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1839 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1840 if (any_of(
1841 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1842 IntroducesBCastOf(Op)))
1843 return false;
1844 // Non-constant live-ins require broadcasts, while constants do not
1845 // need explicit broadcasts.
1846 auto *IRV = dyn_cast<VPIRValue>(Op);
1847 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1848 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1849 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1850 }))
1851 continue;
1852
// Replace the wide/replicated recipe with a single-scalar clone and drop
// the original if it became dead.
1853 auto *Clone = new VPReplicateRecipe(
1854 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1855 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1856 Clone->insertBefore(RepOrWidenR);
1857 RepOrWidenR->replaceAllUsesWith(Clone);
1858 if (isDeadRecipe(*RepOrWidenR))
1859 RepOrWidenR->eraseFromParent();
1860 }
1861 }
1862}
1863
1864/// Try to see if all of \p Blend's masks share a common value logically and'ed
1865/// and remove it from the masks.
// NOTE(review): the definition line (1866) is elided in this rendering;
// presumably it declares something like
// `static void removeCommonBlendMask(VPBlendRecipe *Blend) {` -- confirm
// against the checked-in source.
// Normalized blends carry no mask on the first incoming value, so the
// common-factor rewrite below does not apply to them.
1867 if (Blend->isNormalized())
1868 return;
1869 VPValue *CommonEdgeMask;
// Take the LHS of the first mask's logical-and as the candidate common factor.
1870 if (!match(Blend->getMask(0),
1871 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1872 return;
// Require every mask to be (CommonEdgeMask && X); bail out on any mismatch.
1873 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1874 if (!match(Blend->getMask(I),
1875 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1876 return;
// All masks share the factor: strip it by replacing each mask with the
// logical-and's second operand.
1877 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1878 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1879}
1880
1881/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1882/// to make sure the masks are simplified.
1883static void simplifyBlends(VPlan &Plan) {
// NOTE(review): lines 1884-1885 are elided in this rendering; they presumably
// open an outer loop over the plan's basic blocks defining VPBB -- confirm
// against the checked-in source.
1886 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1887 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1888 if (!Blend)
1889 continue;
1890
1891 removeCommonBlendMask(Blend);
1892
1893 // Try to remove redundant blend recipes.
1894 SmallPtrSet<VPValue *, 4> UniqueValues;
// Incoming value 0 of a normalized blend is unmasked and always live;
// otherwise it only contributes when its mask is not known-false.
1895 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1896 UniqueValues.insert(Blend->getIncomingValue(0));
1897 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1898 if (!match(Blend->getMask(I), m_False()))
1899 UniqueValues.insert(Blend->getIncomingValue(I));
1900
// A blend over a single distinct live value is a no-op; forward that value.
1901 if (UniqueValues.size() == 1) {
1902 Blend->replaceAllUsesWith(*UniqueValues.begin());
1903 Blend->eraseFromParent();
1904 continue;
1905 }
1906
1907 if (Blend->isNormalized())
1908 continue;
1909
1910 // Normalize the blend so its first incoming value is used as the initial
1911 // value with the others blended into it.
1912
1913 unsigned StartIndex = 0;
1914 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1915 // If a value's mask is used only by the blend then is can be deadcoded.
1916 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1917 // that's used by multiple blends where it can be removed from them all.
1918 VPValue *Mask = Blend->getMask(I);
1919 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1920 StartIndex = I;
1921 break;
1922 }
1923 }
1924
// Build the normalized operand list: unmasked initial value first, then
// (value, mask) pairs for the remaining incoming values.
1925 SmallVector<VPValue *, 4> OperandsWithMask;
1926 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1927
1928 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1929 if (I == StartIndex)
1930 continue;
1931 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1932 OperandsWithMask.push_back(Blend->getMask(I));
1933 }
1934
1935 auto *NewBlend =
1936 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1937 OperandsWithMask, *Blend, Blend->getDebugLoc());
1938 NewBlend->insertBefore(&R);
1939
1940 VPValue *DeadMask = Blend->getMask(StartIndex);
1941 Blend->replaceAllUsesWith(NewBlend);
1942 Blend->eraseFromParent();
// NOTE(review): line 1943 is elided here; DeadMask is otherwise unused below,
// so presumably it deletes the now-dead StartIndex mask -- confirm.
1944
1945 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1946 VPValue *NewMask;
1947 if (NewBlend->getNumOperands() == 3 &&
1948 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1949 VPValue *Inc0 = NewBlend->getOperand(0);
1950 VPValue *Inc1 = NewBlend->getOperand(1);
1951 VPValue *OldMask = NewBlend->getOperand(2);
1952 NewBlend->setOperand(0, Inc1);
1953 NewBlend->setOperand(1, Inc0);
1954 NewBlend->setOperand(2, NewMask);
// Erase the inverted mask if swapping the operands left it unused.
1955 if (OldMask->getNumUsers() == 0)
1956 cast<VPInstruction>(OldMask)->eraseFromParent();
1957 }
1958 }
1959 }
1960}
1961
1962/// Optimize the width of vector induction variables in \p Plan based on a known
1963/// constant Trip Count, \p BestVF and \p BestUF.
// NOTE(review): the definition line (1964) is elided in this rendering;
// presumably it opens
// `static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,` --
// confirm against the checked-in source.
1965 ElementCount BestVF,
1966 unsigned BestUF) {
1967 // Only proceed if we have not completely removed the vector region.
1968 if (!Plan.getVectorLoopRegion())
1969 return false;
1970
// The narrowing below needs both a fixed VF and a compile-time-constant trip
// count; otherwise no narrower IV width can be proven safe.
1971 const APInt *TC;
1972 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1973 return false;
1974
1975 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1976 // and UF. Returns at least 8.
1977 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1978 APInt AlignedTC =
// NOTE(review): lines 1979-1980 are elided; presumably they round TC up to a
// multiple of Align before computing MaxVal -- confirm.
1981 APInt MaxVal = AlignedTC - 1;
1982 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1983 };
1984 unsigned NewBitWidth =
1985 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1986
1987 LLVMContext &Ctx = Plan.getContext();
1988 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1989
1990 bool MadeChange = false;
1991
1992 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1993 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1994 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1995
1996 // Currently only handle canonical IVs as it is trivial to replace the start
1997 // and stop values, and we currently only perform the optimization when the
1998 // IV has a single use.
1999 if (!WideIV || !WideIV->isCanonical() ||
2000 WideIV->hasMoreThanOneUniqueUser() ||
2001 NewIVTy == WideIV->getScalarType())
2002 continue;
2003
2004 // Currently only handle cases where the single user is a header-mask
2005 // comparison with the backedge-taken-count.
2006 VPUser *SingleUser = WideIV->getSingleUser();
2007 if (!SingleUser ||
2008 !match(SingleUser, m_ICmp(m_Specific(WideIV),
// NOTE(review): lines 2009-2010 are elided; per the comment above, they
// presumably match the backedge-taken-count as the other compare operand.
2011 continue;
2012
2013 // Update IV operands and comparison bound to use new narrower type.
2014 auto *NewStart = Plan.getZero(NewIVTy);
2015 WideIV->setStartValue(NewStart);
2016 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
2017 WideIV->setStepValue(NewStep);
2018
// Truncate the backedge-taken-count to the narrow type in the vector
// preheader and point the single compare at the truncated bound.
2019 auto *NewBTC = new VPWidenCastRecipe(
2020 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2021 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2022 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2023 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2024 Cmp->setOperand(1, NewBTC);
2025
2026 MadeChange = true;
2027 }
2028
2029 return MadeChange;
2030}
2031
2032/// Return true if \p Cond is known to be true for given \p BestVF and \p
2033/// BestUF.
// NOTE(review): the definition line (2034) and lines 2036-2037 are elided in
// this rendering; presumably they declare
// `static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,` with a
// PSE parameter, and match a disjunction before the recursive any_of below --
// confirm against the checked-in source.
2035 ElementCount BestVF, unsigned BestUF,
// A disjunction is known-true if any of its operand conditions is known-true.
2038 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2039 &PSE](VPValue *C) {
2040 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2041 });
2042
2043 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
// NOTE(review): line 2044 is elided; presumably it begins a match of Cond
// against a compare of the canonical IV's backedge value -- confirm.
2045 m_Specific(CanIV->getBackedgeValue()),
2046 m_Specific(&Plan.getVectorTripCount()))))
2047 return false;
2048
2049 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2050 // count is not conveniently available as SCEV so far, so we compare directly
2051 // against the original trip count. This is stricter than necessary, as we
2052 // will only return true if the trip count == vector trip count.
2053 const SCEV *VectorTripCount =
// NOTE(review): line 2054 is elided; presumably it fetches a SCEV for the
// vector trip count (the could-not-compute fallback below suggests so).
2055 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2056 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2057 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2058 "Trip count SCEV must be computable");
2059 ScalarEvolution &SE = *PSE.getSE();
2060 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2061 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
// Known-true iff SCEV proves trip count == VF * UF elements.
2062 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2063}
2064
2065/// Try to replace multiple active lane masks used for control flow with
2066/// a single, wide active lane mask instruction followed by multiple
2067/// extract subvector intrinsics. This applies to the active lane mask
2068/// instructions both in the loop and in the preheader.
2069/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2070/// new extracts from the first active lane mask, which has it's last
2071/// operand (multiplier) set to UF.
// NOTE(review): several identifier-bearing lines (2072, 2082, 2093, 2104,
// 2106, 2111, 2116, 2123) are elided in this rendering, including the
// definition line and the declarations of Ops and Phis; comments below only
// describe the visible code -- reconcile with the checked-in source.
2073 unsigned UF) {
// The transform is opt-in and only pays off for vector VFs with interleaving.
2074 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2075 return false;
2076
2077 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2078 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2079 auto *Term = &ExitingVPBB->back();
2080
2081 using namespace llvm::VPlanPatternMatch;
// NOTE(review): the elided line 2082 presumably matches Term against a
// branch-on-not-active-lane-mask pattern; bail out when it doesn't match.
2083 m_VPValue(), m_VPValue(), m_VPValue())))))
2084 return false;
2085
2086 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2087 LLVMContext &Ctx = Plan.getContext();
2088
// Emit UF llvm.vector.extract calls right after ALM, one per unroll part,
// each starting at lane VF * Part; results are stored into Extracts by part.
2089 auto ExtractFromALM = [&](VPInstruction *ALM,
2090 SmallVectorImpl<VPValue *> &Extracts) {
2091 DebugLoc DL = ALM->getDebugLoc();
2092 for (unsigned Part = 0; Part < UF; ++Part) {
2094 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2095 auto *Ext =
2096 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2097 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2098 Extracts[Part] = Ext;
2099 Ext->insertAfter(ALM);
2100 }
2101 };
2102
2103 // Create a list of each active lane mask phi, ordered by unroll part.
2105 for (VPRecipeBase &R : Header->phis()) {
2107 if (!Phi)
2108 continue;
2109 VPValue *Index = nullptr;
2110 match(Phi->getBackedgeValue(),
2112 assert(Index && "Expected index from ActiveLaneMask instruction");
2113
// Recover the unroll part from the index expression feeding the lane mask.
2114 uint64_t Part;
2115 if (match(Index,
2117 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2118 Phis[Part] = Phi;
2119 else {
2120 // Anything other than a CanonicalIVIncrementForPart is part 0
2121 assert(!match(
2122 Index,
2124 Phis[0] = Phi;
2125 }
2126 }
2127
2128 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
2129 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2130
// Part 0's phi supplies both lane-mask instructions: the preheader (entry)
// one and the in-loop (backedge) one.
2131 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2132 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2133
2134 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2135 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2136 "Expected incoming values of Phi to be ActiveLaneMasks");
2137
2138 // When using wide lane masks, the return type of the get.active.lane.mask
2139 // intrinsic is VF x UF (last operand).
2140 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2141 EntryALM->setOperand(2, ALMMultiplier);
2142 LoopALM->setOperand(2, ALMMultiplier);
2143
2144 // Create UF x extract vectors and insert into preheader.
2145 SmallVector<VPValue *> EntryExtracts(UF);
2146 ExtractFromALM(EntryALM, EntryExtracts);
2147
2148 // Create UF x extract vectors and insert before the loop compare & branch,
2149 // updating the compare to use the first extract.
2150 SmallVector<VPValue *> LoopExtracts(UF);
2151 ExtractFromALM(LoopALM, LoopExtracts);
2152 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2153 Not->setOperand(0, LoopExtracts[0]);
2154
2155 // Update the incoming values of active lane mask phis.
2156 for (unsigned Part = 0; Part < UF; ++Part) {
2157 Phis[Part]->setStartValue(EntryExtracts[Part]);
2158 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2159 }
2160
2161 return true;
2162}
2163
2164/// Try to simplify the branch condition of \p Plan. This may restrict the
2165/// resulting plan to \p BestVF and \p BestUF.
// NOTE(review): the definition line (2166) and several pattern-match lines
// (2168, 2174, 2176, 2181, 2184, 2193, 2249, 2252) are elided in this
// rendering; comments below describe only the visible code -- reconcile with
// the checked-in source.
2167 unsigned BestUF,
2169 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2170 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2171 auto *Term = &ExitingVPBB->back();
2172 VPValue *Cond;
2173 if (match(Term,
2175 m_VPValue())) ||
2177 m_VPValue(), m_VPValue(), m_VPValue())))) {
2178 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2179 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2180 const SCEV *VectorTripCount =
2182 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2183 VectorTripCount =
2185 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2186 "Trip count SCEV must be computable");
2187 ScalarEvolution &SE = *PSE.getSE();
2188 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2189 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
// The loop body runs exactly once only if SCEV proves VectorTC <= VF * UF.
2190 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2191 return false;
2192 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2194 // For BranchOnCond, check if we can prove the condition to be true using VF
2195 // and UF.
2196 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2197 return false;
2198 } else {
2199 return false;
2200 }
2201
2202 // The vector loop region only executes once. If possible, completely remove
2203 // the region, otherwise replace the terminator controlling the latch with
2204 // (BranchOnCond true).
2205 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2206 // support for other non-canonical widen induction recipes (e.g.,
2207 // VPWidenPointerInductionRecipe).
2208 // TODO: fold branch-on-constant after dissolving region.
2209 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
// Region removal is only safe when every header phi is of a kind we know how
// to replace with its single-iteration value.
2210 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2211 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2212 return R->isCanonical();
2213 return isa<VPCanonicalIVPHIRecipe, VPCurrentIterationPHIRecipe,
2214 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2215 })) {
2216 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
// A canonical widened IV in a single-iteration loop is just a step vector.
2217 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2218 VPBuilder Builder(Plan.getVectorPreheader());
2219 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2220 R->getScalarType());
2221 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2222 HeaderR.eraseFromParent();
2223 continue;
2224 }
// All other supported phis take their entry incoming value (index 0).
2225 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2226 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2227 HeaderR.eraseFromParent();
2228 }
2229
// Splice the region's blocks directly between preheader and exits, clearing
// their parent so they no longer belong to the dissolved region.
2230 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2231 SmallVector<VPBlockBase *> Exits = to_vector(VectorRegion->getSuccessors());
2232 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2233 for (VPBlockBase *Exit : Exits)
2234 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2235
2236 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2237 B->setParent(nullptr);
2238
2239 VPBlockUtils::connectBlocks(Preheader, Header);
2240
2241 for (VPBlockBase *Exit : Exits)
2242 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2243
2244 // Replace terminating branch-on-two-conds with branch-on-cond to early
2245 // exit.
2246 if (Exits.size() != 1) {
2247 assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
2248 "BranchOnTwoConds needs 2 remaining exits");
2250 Term->getOperand(0));
2251 }
2253 } else {
2254 // The vector region contains header phis for which we cannot remove the
2255 // loop region yet.
2256
2257 // For BranchOnTwoConds, set the latch exit condition to true directly.
2258 if (match(Term, m_BranchOnTwoConds())) {
2259 Term->setOperand(1, Plan.getTrue());
2260 return true;
2261 }
2262
// Otherwise replace the terminator with an unconditional-exit branch.
2263 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2264 {}, {}, Term->getDebugLoc());
2265 ExitingVPBB->appendRecipe(BOC);
2266 }
2267
2268 Term->eraseFromParent();
2269
2270 return true;
2271}
2272
2273/// From the definition of llvm.experimental.get.vector.length,
2274/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
// NOTE(review): the definition line and lines 2275-2277/2292 are elided in
// this rendering; the signature presumably takes (VPlan &Plan, ElementCount
// VF, PredicatedScalarEvolution &PSE) given the uses below -- confirm
// against the checked-in source.
2278 vp_depth_first_deep(Plan.getEntry()))) {
2279 for (VPRecipeBase &R : *VPBB) {
2280 VPValue *AVL;
2281 if (!match(&R, m_EVL(m_VPValue(AVL))))
2282 continue;
2283
// Use SCEV to prove AVL <= VF; only then is EVL(AVL) just AVL.
2284 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2285 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2286 continue;
2287 ScalarEvolution &SE = *PSE.getSE();
2288 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2289 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2290 continue;
2291
// NOTE(review): line 2292 is elided; presumably it assigns the (possibly
// truncated to i32) AVL into Trunc -- confirm.
2293 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2294 R.getDebugLoc());
// If a truncate recipe was created, try constant-folding it away.
2295 if (Trunc != AVL) {
2296 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2297 const DataLayout &DL = Plan.getDataLayout();
2298 VPTypeAnalysis TypeInfo(Plan);
2299 if (VPValue *Folded =
2300 tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2301 Trunc = Folded;
2302 }
2303 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2304 return true;
2305 }
2306 }
2307 return false;
2308}
2309
// NOTE(review): the definition line (2310) and line 2312 are elided in this
// rendering; from the body this is the VFAndUF optimization entry point that
// runs the wide-ALM, branch-condition, IV-width and known-EVL transforms for
// the chosen VF/UF -- confirm the exact signature against the checked-in
// source.
2311 unsigned BestUF,
2313 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2314 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2315
// Each transform reports whether it changed the plan; any change commits the
// plan to BestVF below.
2316 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2317 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2318 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2319 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2320
2321 if (MadeChange) {
2322 Plan.setVF(BestVF);
2323 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2324 }
2325}
2326
2327/// Sink users of \p FOR after the recipe defining the previous value \p
2328/// Previous of the recurrence. \returns true if all users of \p FOR could be
2329/// re-arranged as needed or false if it is not possible.
2330static bool
// NOTE(review): the name line (2331) and lines 2340-2341 (presumably the
// WorkList and Seen declarations) are elided in this rendering -- confirm
// against the checked-in source.
2332 VPRecipeBase *Previous,
2333 VPDominatorTree &VPDT) {
2334 // If Previous is a live-in (no defining recipe), it naturally dominates all
2335 // recipes in the loop, so no sinking is needed.
2336 if (!Previous)
2337 return true;
2338
2339 // Collect recipes that need sinking.
2342 Seen.insert(Previous);
// Returns false when sinking is impossible; otherwise records SinkCandidate
// in the worklist unless it is already placed correctly or already seen.
2343 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2344 // The previous value must not depend on the users of the recurrence phi. In
2345 // that case, FOR is not a fixed order recurrence.
2346 if (SinkCandidate == Previous)
2347 return false;
2348
// Header phis stay put, and anything already dominated by Previous or
// already visited needs no sinking.
2349 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2350 !Seen.insert(SinkCandidate).second ||
2351 VPDT.properlyDominates(Previous, SinkCandidate))
2352 return true;
2353
2354 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2355 return false;
2356
2357 WorkList.push_back(SinkCandidate);
2358 return true;
2359 };
2360
2361 // Recursively sink users of FOR after Previous.
2362 WorkList.push_back(FOR);
// Worklist grows while iterating: users of each sunk recipe are candidates
// too (transitive closure over the use chains).
2363 for (unsigned I = 0; I != WorkList.size(); ++I) {
2364 VPRecipeBase *Current = WorkList[I];
2365 assert(Current->getNumDefinedValues() == 1 &&
2366 "only recipes with a single defined value expected");
2367
2368 for (VPUser *User : Current->getVPSingleValue()->users()) {
2369 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2370 return false;
2371 }
2372 }
2373
2374 // Keep recipes to sink ordered by dominance so earlier instructions are
2375 // processed first.
2376 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2377 return VPDT.properlyDominates(A, B);
2378 });
2379
// Move each candidate right after the previously placed one, preserving the
// dominance order established above.
2380 for (VPRecipeBase *SinkCandidate : WorkList) {
2381 if (SinkCandidate == FOR)
2382 continue;
2383
2384 SinkCandidate->moveAfter(Previous);
2385 Previous = SinkCandidate;
2386 }
2387 return true;
2388}
2389
2390/// Try to hoist \p Previous and its operands before all users of \p FOR.
// NOTE(review): the name line (2391) and line 2399 (presumably the Visited
// set declaration) are elided in this rendering -- confirm against the
// checked-in source.
2392 VPRecipeBase *Previous,
2393 VPDominatorTree &VPDT) {
2394 if (cannotHoistOrSinkRecipe(*Previous))
2395 return false;
2396
2397 // Collect recipes that need hoisting.
2398 SmallVector<VPRecipeBase *> HoistCandidates;
2400 VPRecipeBase *HoistPoint = nullptr;
2401 // Find the closest hoist point by looking at all users of FOR and selecting
2402 // the recipe dominating all other users.
2403 for (VPUser *U : FOR->users()) {
2404 auto *R = cast<VPRecipeBase>(U);
2405 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2406 HoistPoint = R;
2407 }
2408 assert(all_of(FOR->users(),
2409 [&VPDT, HoistPoint](VPUser *U) {
2410 auto *R = cast<VPRecipeBase>(U);
2411 return HoistPoint == R ||
2412 VPDT.properlyDominates(HoistPoint, R);
2413 }) &&
2414 "HoistPoint must dominate all users of FOR");
2415
// Returns the defining recipe if it must be hoisted above HoistPoint, or
// nullptr when it is a live-in, already visited, outside the loop region, a
// header phi, or already dominates HoistPoint.
2416 auto NeedsHoisting = [HoistPoint, &VPDT,
2417 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2418 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2419 if (!HoistCandidate)
2420 return nullptr;
2421 VPRegionBlock *EnclosingLoopRegion =
2422 HoistCandidate->getParent()->getEnclosingLoopRegion();
2423 assert((!HoistCandidate->getRegion() ||
2424 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2425 "CFG in VPlan should still be flat, without replicate regions");
2426 // Hoist candidate was already visited, no need to hoist.
2427 if (!Visited.insert(HoistCandidate).second)
2428 return nullptr;
2429
2430 // Candidate is outside loop region or a header phi, dominates FOR users w/o
2431 // hoisting.
2432 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2433 return nullptr;
2434
2435 // If we reached a recipe that dominates HoistPoint, we don't need to
2436 // hoist the recipe.
2437 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2438 return nullptr;
2439 return HoistCandidate;
2440 };
2441
2442 if (!NeedsHoisting(Previous->getVPSingleValue()))
2443 return true;
2444
2445 // Recursively try to hoist Previous and its operands before all users of FOR.
2446 HoistCandidates.push_back(Previous);
2447
// Worklist grows while iterating: operands of each hoisted recipe may need
// hoisting too (transitive closure over the operand chains).
2448 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2449 VPRecipeBase *Current = HoistCandidates[I];
2450 assert(Current->getNumDefinedValues() == 1 &&
2451 "only recipes with a single defined value expected");
2452 if (cannotHoistOrSinkRecipe(*Current))
2453 return false;
2454
2455 for (VPValue *Op : Current->operands()) {
2456 // If we reach FOR, it means the original Previous depends on some other
2457 // recurrence that in turn depends on FOR. If that is the case, we would
2458 // also need to hoist recipes involving the other FOR, which may break
2459 // dependencies.
2460 if (Op == FOR)
2461 return false;
2462
2463 if (auto *R = NeedsHoisting(Op)) {
2464 // Bail out if the recipe defines multiple values.
2465 // TODO: Hoisting such recipes requires additional handling.
2466 if (R->getNumDefinedValues() != 1)
2467 return false;
2468 HoistCandidates.push_back(R);
2469 }
2470 }
2471 }
2472
2473 // Order recipes to hoist by dominance so earlier instructions are processed
2474 // first.
2475 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2476 return VPDT.properlyDominates(A, B);
2477 });
2478
// Insert all candidates immediately before HoistPoint, in dominance order.
2479 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2480 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2481 HoistPoint->getIterator());
2482 }
2483
2484 return true;
2485}
2486
// NOTE(review): the definition line (2487) and several declaration lines
// (2492, 2494-2495, 2499, 2504, 2525, 2538, 2542) are elided in this
// rendering; from the body this is the fixed-order-recurrence adjustment
// transform taking (VPlan &Plan, VPBuilder &LoopBuilder) -- confirm the
// exact signature and elided declarations against the checked-in source.
2488 VPBuilder &LoopBuilder) {
2489 VPDominatorTree VPDT(Plan);
2490 VPTypeAnalysis TypeInfo(Plan);
2491
// Collect all fixed-order recurrence phis first, since the loop below
// mutates the plan while processing them.
2493 for (VPRecipeBase &R :
2496 RecurrencePhis.push_back(FOR);
2497
2498 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2500 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2501 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2502 // to terminate.
// Walk through chained recurrence phis to the recipe that actually defines
// the previous value.
2503 while (auto *PrevPhi =
2505 assert(PrevPhi->getParent() == FOR->getParent());
2506 assert(SeenPhis.insert(PrevPhi).second);
2507 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2508 }
2509
// Either sink FOR's users after Previous or hoist Previous before them;
// if neither re-arrangement is possible, the transform fails.
2510 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2511 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2512 return false;
2513
2514 // Introduce a recipe to combine the incoming and previous values of a
2515 // fixed-order recurrence.
2516 VPBasicBlock *InsertBlock =
2517 Previous ? Previous->getParent() : FOR->getParent();
2518 if (!Previous || isa<VPHeaderPHIRecipe>(Previous))
2519 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2520 else
2521 LoopBuilder.setInsertPoint(InsertBlock,
2522 std::next(Previous->getIterator()));
2523
2524 auto *RecurSplice =
2526 {FOR, FOR->getBackedgeValue()});
2527
2528 FOR->replaceAllUsesWith(RecurSplice);
2529 // Set the first operand of RecurSplice to FOR again, after replacing
2530 // all users.
2531 RecurSplice->setOperand(0, FOR);
2532
2533 // Check for users extracting at the penultimate active lane of the FOR.
2534 // If only a single lane is active in the current iteration, we need to
2535 // select the last element from the previous iteration (from the FOR phi
2536 // directly).
2537 for (VPUser *U : RecurSplice->users()) {
2539 m_Specific(RecurSplice))))
2540 continue;
2541
2543 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2544 Type *Ty = TypeInfo.inferScalarType(LastActiveLane);
2545 VPValue *Zero = Plan.getConstantInt(Ty, 0);
2546 VPValue *One = Plan.getConstantInt(Ty, 1);
// Lane (LastActiveLane - 1) of the current iteration's value...
2547 VPValue *PenultimateIndex = B.createSub(LastActiveLane, One);
2548 VPValue *PenultimateLastIter =
2549 B.createNaryOp(VPInstruction::ExtractLane,
2550 {PenultimateIndex, FOR->getBackedgeValue()});
// ...or the last lane of the previous iteration when only one lane is
// active (LastActiveLane == 0).
2551 VPValue *LastPrevIter =
2552 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2553
2554 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2555 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2556 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2557 }
2558 }
2559 return true;
2560}
2561
// NOTE(review): the definition line (2562) and lines 2564/2570 are elided in
// this rendering; from the body this function drops poison-generating flags
// from recipes feeding certain reductions -- confirm the exact signature and
// the full RecurKind condition against the checked-in source.
2563 for (VPRecipeBase &R :
2565 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2566 if (!PhiR)
2567 continue;
// Only specific arithmetic recurrence kinds are handled (the condition
// continues on the elided line 2570).
2568 RecurKind RK = PhiR->getRecurrenceKind();
2569 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2571 continue;
2572
// Strip poison-generating flags (e.g. nuw/nsw-style IR flags) from every
// flag-carrying recipe reachable from the reduction phi.
2573 for (VPUser *U : collectUsersRecursively(PhiR))
2574 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2575 RecWithFlags->dropPoisonGeneratingFlags();
2576 }
2577 }
2578}
2579
2580namespace {
// DenseMap traits used by the VPlan CSE pass: two recipes hash/compare equal
// when they would produce the same value. Several identifier-bearing lines
// (2591, 2629, 2641, 2643, 2646) are elided in this rendering -- reconcile
// with the checked-in source.
2581struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
// True for the DenseMap empty/tombstone sentinels, which must never be
// dereferenced.
2582 static bool isSentinel(const VPSingleDefRecipe *Def) {
2583 return Def == getEmptyKey() || Def == getTombstoneKey();
2584 }
2585
2586 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2587 /// return that source element type.
2588 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2589 // All VPInstructions that lower to GEPs must have the i8 source element
2590 // type (as they are PtrAdds), so we omit it.
// NOTE(review): the elided line 2591 presumably opens a TypeSwitch over R
// that the .Case/.Default chain below continues -- confirm.
2592 .Case([](const VPReplicateRecipe *I) -> Type * {
2593 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2594 return GEP->getSourceElementType();
2595 return nullptr;
2596 })
2597 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2598 [](auto *I) { return I->getSourceElementType(); })
2599 .Default([](auto *) { return nullptr; });
2600 }
2601
2602 /// Returns true if recipe \p Def can be safely handed for CSE.
2603 static bool canHandle(const VPSingleDefRecipe *Def) {
2604 // We can extend the list of handled recipes in the future,
2605 // provided we account for the data embedded in them while checking for
2606 // equality or hashing.
2607 auto C = getOpcodeOrIntrinsicID(Def);
2608
2609 // The issue with (Insert|Extract)Value is that the index of the
2610 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2611 // VPlan.
2612 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2613 C->second == Instruction::ExtractValue)))
2614 return false;
2615
2616 // During CSE, we can only handle recipes that don't read from memory: if
2617 // they read from memory, there could be an intervening write to memory
2618 // before the next instance is CSE'd, leading to an incorrect result.
2619 return !Def->mayReadFromMemory();
2620 }
2621
2622 /// Hash the underlying data of \p Def.
2623 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2624 const VPlan *Plan = Def->getParent()->getPlan();
2625 VPTypeAnalysis TypeInfo(*Plan);
// Combine recipe ID, opcode/intrinsic, GEP source type and inferred scalar
// type; the elided line 2629 presumably hashes the operand range -- confirm.
2626 hash_code Result = hash_combine(
2627 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2628 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
// Predicated recipes fold the compare predicate into the hash so that e.g.
// icmp eq and icmp ne never collide into the same bucket deliberately.
2630 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2631 if (RFlags->hasPredicate())
2632 return hash_combine(Result, RFlags->getPredicate());
2633 return Result;
2634 }
2635
2636 /// Check equality of underlying data of \p L and \p R.
2637 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2638 if (isSentinel(L) || isSentinel(R))
2639 return L == R;
// Structural comparison; the elided lines 2641/2643 presumably compare the
// opcode/intrinsic IDs, and 2646 opens the assert completed below -- confirm.
2640 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2642 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2644 !equal(L->operands(), R->operands()))
2645 return false;
2647 "must have valid opcode info for both recipes");
2648 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2649 if (LFlags->hasPredicate() &&
2650 LFlags->getPredicate() !=
2651 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2652 return false;
2653 // Recipes in replicate regions implicitly depend on predicate. If either
2654 // recipe is in a replicate region, only consider them equal if both have
2655 // the same parent.
2656 const VPRegionBlock *RegionL = L->getRegion();
2657 const VPRegionBlock *RegionR = R->getRegion();
2658 if (((RegionL && RegionL->isReplicator()) ||
2659 (RegionR && RegionR->isReplicator())) &&
2660 L->getParent() != R->getParent())
2661 return false;
// Finally require identical inferred scalar types.
2662 const VPlan *Plan = L->getParent()->getPlan();
2663 VPTypeAnalysis TypeInfo(*Plan);
2664 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2665 }
2666};
2667} // end anonymous namespace
2668
2669/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2670/// Plan.
// NOTE(review): the definition line (2671) and lines 2673/2675 are elided in
// this rendering; presumably they declare the function, a DenseMap keyed by
// VPCSEDenseMapInfo (CSEMap), and the loop over the plan's basic blocks
// (VPBB) -- confirm against the checked-in source.
2672 VPDominatorTree VPDT(Plan);
2674
2676 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2677 for (VPRecipeBase &R : *VPBB) {
2678 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2679 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2680 continue;
// Found an equivalent earlier recipe: reuse it if legal.
2681 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2682 // V must dominate Def for a valid replacement.
2683 if (!VPDT.dominates(V->getParent(), VPBB))
2684 continue;
2685 // Only keep flags present on both V and Def.
2686 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2687 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2688 Def->replaceAllUsesWith(V);
2689 continue;
2690 }
// First occurrence of this expression: record it as the CSE candidate.
2691 CSEMap[Def] = Def;
2692 }
2693 }
2694}
2695
2696/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2697static void licm(VPlan &Plan) {
2698 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2699
2700 // Hoist any loop invariant recipes from the vector loop region to the
2701 // preheader. Perform a shallow traversal of the vector loop region, to
2702 // exclude recipes in replicate regions. Since the top-level blocks in the
2703 // vector loop region are guaranteed to execute if the vector pre-header is,
2704 // we don't need to check speculation safety.
2705 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2706 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2707 "Expected vector prehader's successor to be the vector loop region");
// NOTE(review): the for-statement opening this traversal (original line 2708)
// was lost in extraction.
2709 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2710 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
// NOTE(review): the guard condition on original line 2711 (which recipes are
// skipped for hoisting) was lost in extraction -- confirm upstream.
2712 continue;
// A recipe is hoistable only if every operand is defined outside all loop
// regions (i.e. it is loop-invariant).
2713 if (any_of(R.operands(), [](VPValue *Op) {
2714 return !Op->isDefinedOutsideLoopRegions();
2715 }))
2716 continue;
2717 R.moveBefore(*Preheader, Preheader->end());
2718 }
2719 }
2720
2721#ifndef NDEBUG
2722 VPDominatorTree VPDT(Plan);
2723#endif
2724 // Sink recipes with no users inside the vector loop region if all users are
2725 // in the same exit block of the region.
2726 // TODO: Extend to sink recipes from inner loops.
// NOTE(review): the for-statement opening this traversal (original line 2727)
// was lost in extraction.
2728 vp_post_order_shallow(LoopRegion->getEntry()))) {
2729 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
// NOTE(review): the guard condition on original line 2730 was lost in
// extraction -- confirm upstream which recipes are excluded from sinking.
2731 continue;
2732
2733 // TODO: Support sinking VPReplicateRecipe after ensuring replicateByVF
2734 // handles sunk recipes correctly.
2735 if (isa<VPReplicateRecipe>(&R))
2736 continue;
2737
2738 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2739 // support recipes with multiple defined values (e.g., interleaved loads).
2740 auto *Def = cast<VPSingleDefRecipe>(&R);
2741 // Skip recipes without users as we cannot determine a sink block.
2742 // TODO: Clone sinkable recipes without users to all exit blocks to reduce
2743 // their execution frequency.
2744 if (Def->getNumUsers() == 0)
2745 continue;
2746
2747 VPBasicBlock *SinkBB = nullptr;
2748 // Cannot sink the recipe if any user
2749 // * is defined in any loop region, or
2750 // * is a phi, or
2751 // * multiple users in different blocks.
2752 if (any_of(Def->users(), [&SinkBB](VPUser *U) {
2753 auto *UserR = cast<VPRecipeBase>(U);
2754 VPBasicBlock *Parent = UserR->getParent();
2755 // TODO: If the user is a PHI node, we should check the block of
2756 // incoming value. Support PHI node users if needed.
2757 if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
2758 return true;
2759 // TODO: Support sinking when users are in multiple blocks.
2760 if (SinkBB && SinkBB != Parent)
2761 return true;
2762 SinkBB = Parent;
2763 return false;
2764 }))
2765 continue;
2766
2767 // Only sink to dedicated exit blocks of the loop region.
2768 if (SinkBB->getSinglePredecessor() != LoopRegion)
2769 continue;
2770
2771 // TODO: This will need to be a check instead of an assert after
2772 // conditional branches in vectorized loops are supported.
2773 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2774 "Defining block must dominate sink block");
2775 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2776 // just moving.
2777 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2778 }
2779 }
2780}
2781
// NOTE(review): the opening of this definition (original line 2782, presumably
// `void VPlanTransforms::truncateToMinimalBitwidths(`) was lost in extraction.
// Shrinks recipes listed in MinBWs to the computed minimal bit width, then
// zero-extends results back and truncates operands as needed.
2783 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2784 if (Plan.hasScalarVFOnly())
2785 return;
2786 // Keep track of created truncates, so they can be re-used. Note that we
2787 // cannot use RAUW after creating a new truncate, as this could make
2788 // other uses have different types for their operands, making them invalidly
2789 // typed.
// NOTE(review): the declaration of ProcessedTruncs (original line 2790) was
// lost in extraction; it is used via try_emplace below.
2791 VPTypeAnalysis TypeInfo(Plan);
2792 VPBasicBlock *PH = Plan.getVectorPreheader();
// NOTE(review): the block-traversal for-statement (original lines 2793-2794)
// was lost in extraction.
2795 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
// NOTE(review): the recipe-kind filter (original lines 2796-2797) was lost in
// extraction -- confirm upstream which recipe kinds are processed.
2798 continue;
2799
2800 VPValue *ResultVPV = R.getVPSingleValue();
2801 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
// A zero lookup result means this instruction has no minimal-bitwidth entry.
2802 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2803 if (!NewResSizeInBits)
2804 continue;
2805
2806 // If the value wasn't vectorized, we must maintain the original scalar
2807 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2808 // skip casts which do not need to be handled explicitly here, as
2809 // redundant casts will be removed during recipe simplification.
// NOTE(review): the condition guarding this continue (original line 2810) was
// lost in extraction.
2811 continue;
2812
2813 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2814 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2815 assert(OldResTy->isIntegerTy() && "only integer types supported");
2816 (void)OldResSizeInBits;
2817
2818 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2819
2820 // Any wrapping introduced by shrinking this operation shouldn't be
2821 // considered undefined behavior. So, we can't unconditionally copy
2822 // arithmetic wrapping flags to VPW.
2823 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2824 VPW->dropPoisonGeneratingFlags();
2825
2826 if (OldResSizeInBits != NewResSizeInBits &&
2827 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2828 // Extend result to original width.
2829 auto *Ext = new VPWidenCastRecipe(
2830 Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2831 VPIRFlags::getDefaultFlags(Instruction::ZExt));
2832 Ext->insertAfter(&R);
2833 ResultVPV->replaceAllUsesWith(Ext);
// Re-point the extend at the (now shrunk) result after the RAUW above, so the
// extend itself is not replaced by its own result.
2834 Ext->setOperand(0, ResultVPV);
2835 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2836 } else {
2837 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2838 "Only ICmps should not need extending the result.");
2839 }
2840
2841 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
// NOTE(review): the condition guarding this continue (original line 2842) was
// lost in extraction.
2843 continue;
2844
2845 // Shrink operands by introducing truncates as needed.
// For selects, operand 0 is the condition and keeps its type; start at 1.
2846 unsigned StartIdx =
2847 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2848 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2849 auto *Op = R.getOperand(Idx);
2850 unsigned OpSizeInBits =
// NOTE(review): the initializer expression (original line 2851) was lost in
// extraction; presumably it infers the operand's scalar bit width.
2852 if (OpSizeInBits == NewResSizeInBits)
2853 continue;
2854 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
// Reuse an existing truncate for this operand if one was created earlier.
2855 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2856 if (!IterIsEmpty) {
2857 R.setOperand(Idx, ProcessedIter->second);
2858 continue;
2859 }
2860
// Live-in IR values are truncated once in the preheader; plan-defined values
// are truncated right before their use.
2861 VPBuilder Builder;
2862 if (isa<VPIRValue>(Op))
2863 Builder.setInsertPoint(PH);
2864 else
2865 Builder.setInsertPoint(&R);
2866 VPWidenCastRecipe *NewOp =
2867 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2868 ProcessedIter->second = NewOp;
2869 R.setOperand(Idx, NewOp);
2870 }
2871
2872 }
2873 }
2874}
2875
// NOTE(review): the enclosing function's signature and block-traversal loop
// (original lines up to 2878) were lost in extraction. The visible body folds
// BranchOnCond terminators with constant conditions by disconnecting the
// never-taken successor.
2879 VPValue *Cond;
2880 // Skip blocks that are not terminated by BranchOnCond.
2881 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2882 continue;
2883
2884 assert(VPBB->getNumSuccessors() == 2 &&
2885 "Two successors expected for BranchOnCond");
// Successor 0 is taken on true, successor 1 on false; remove the dead one.
2886 unsigned RemovedIdx;
2887 if (match(Cond, m_True()))
2888 RemovedIdx = 1;
2889 else if (match(Cond, m_False()))
2890 RemovedIdx = 0;
2891 else
2892 continue;
2893
2894 VPBasicBlock *RemovedSucc =
2895 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2896 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2897 "There must be a single edge between VPBB and its successor");
2898 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2899 // these recipes.
2900 for (VPRecipeBase &R : RemovedSucc->phis())
2901 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2902
2903 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2904 // automatically on VPlan destruction if it becomes unreachable.
2905 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2906 VPBB->back().eraseFromParent();
2907 }
2908}
2909
2931
2932// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2933// the loop terminator with a branch-on-cond recipe with the negated
2934// active-lane-mask as operand. Note that this turns the loop into an
2935// uncountable one. Only the existing terminator is replaced, all other existing
2936// recipes/users remain unchanged, except for poison-generating flags being
2937// dropped from the canonical IV increment. Return the created
2938// VPActiveLaneMaskPHIRecipe.
2939//
2940// The function adds the following recipes:
2941//
2942// vector.ph:
2943// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2944// %EntryALM = active-lane-mask %EntryInc, TC
2945//
2946// vector.body:
2947// ...
2948// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2949// ...
2950// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2951// %ALM = active-lane-mask %InLoopInc, TC
2952// %Negated = Not %ALM
2953// branch-on-cond %Negated
2954//
// NOTE(review): the function signature (original lines 2955-2956, presumably
// `static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
// VPlan &Plan)`) was lost in extraction; see the block comment above for the
// full contract and the recipes this adds.
2957 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2958 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2959 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2960 VPValue *StartV = CanonicalIVPHI->getStartValue();
2961
2962 auto *CanonicalIVIncrement =
2963 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2964 // TODO: Check if dropping the flags is needed.
2965 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2966 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2967 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2968 // we have to take unrolling into account. Each part needs to start at
2969 // Part * VF
2970 auto *VecPreheader = Plan.getVectorPreheader();
2971 VPBuilder Builder(VecPreheader);
2972
2973 // Create the ActiveLaneMask instruction using the correct start values.
2974 VPValue *TC = Plan.getTripCount();
2975 VPValue *VF = &Plan.getVF();
2976
2977 auto *EntryIncrement = Builder.createOverflowingOp(
2978 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2979 DL, "index.part.next");
2980
2981 // Create the active lane mask instruction in the VPlan preheader.
2982 VPValue *ALMMultiplier =
2983 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2984 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2985 {EntryIncrement, TC, ALMMultiplier}, DL,
2986 "active.lane.mask.entry");
2987
2988 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2989 // preheader ActiveLaneMask instruction.
2990 auto *LaneMaskPhi =
// NOTE(review): the initializer (original line 2991, presumably
// `new VPActiveLaneMaskPHIRecipe(EntryALM, DL);`) was lost in extraction.
2992 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2993
2994 // Create the active lane mask for the next iteration of the loop before the
2995 // original terminator.
2996 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2997 Builder.setInsertPoint(OriginalTerminator);
2998 auto *InLoopIncrement = Builder.createOverflowingOp(
// NOTE(review): the opcode argument (original line 2999, presumably
// VPInstruction::CanonicalIVIncrementForPart) was lost in extraction.
3000 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
3001 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
3002 {InLoopIncrement, TC, ALMMultiplier}, DL,
3003 "active.lane.mask.next");
// Complete the phi: backedge value is the next-iteration mask.
3004 LaneMaskPhi->addOperand(ALM);
3005
3006 // Replace the original terminator with BranchOnCond. We have to invert the
3007 // mask here because a true condition means jumping to the exit block.
3008 auto *NotMask = Builder.createNot(ALM, DL);
3009 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
3010 OriginalTerminator->eraseFromParent();
3011 return LaneMaskPhi;
3012}
3013
// NOTE(review): the first line of this signature (original line 3014,
// presumably `void VPlanTransforms::addActiveLaneMask(VPlan &Plan,`) was lost
// in extraction. Replaces the existing header mask with an active-lane-mask,
// optionally also rewriting the loop's control flow to use it.
3015 bool UseActiveLaneMaskForControlFlow) {
3016 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3017 auto *FoundWidenCanonicalIVUser = find_if(
// NOTE(review): the search range/predicate (original line 3018) was lost in
// extraction -- presumably it scans the canonical IV's users for a
// VPWidenCanonicalIVRecipe.
3019 assert(FoundWidenCanonicalIVUser &&
3020 "Must have widened canonical IV when tail folding!");
3021 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
3022 auto *WideCanonicalIV =
3023 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
3024 VPSingleDefRecipe *LaneMask;
3025 if (UseActiveLaneMaskForControlFlow) {
// Turns the loop into an uncountable one driven by the lane-mask phi.
3026 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
3027 } else {
3028 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
3029 VPValue *ALMMultiplier =
3030 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
3031 LaneMask =
3032 B.createNaryOp(VPInstruction::ActiveLaneMask,
3033 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
3034 nullptr, "active.lane.mask");
3035 }
3036
3037 // Walk users of WideCanonicalIV and replace the header mask of the form
3038 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
3039 // removing the old one to ensure there is always only a single header mask.
3040 HeaderMask->replaceAllUsesWith(LaneMask);
3041 HeaderMask->eraseFromParent();
3042}
3043
// Pattern that matches either the mask \p In itself (setting Out to nullptr)
// or `logical-and(In, X)` (binding Out to X). Used by the EVL transforms to
// strip a header mask off a combined mask.
3044template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
3045 Op0_t In;
// NOTE(review): the second member declaration (original line 3046, presumably
// `Op1_t &Out;`) was lost in extraction; the constructor below initializes it.
3047
3048 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
3049
3050 template <typename OpTy> bool match(OpTy *V) const {
// Exact match: the whole value is the mask, no remainder.
3051 if (m_Specific(In).match(V)) {
3052 Out = nullptr;
3053 return true;
3054 }
// Otherwise require logical-and(In, Out) and capture the remainder.
3055 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
3056 }
3057};
3058
3059/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
3060/// Returns the remaining part \p Out if so, or nullptr otherwise.
3061template <typename Op0_t, typename Op1_t>
3062static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3063 Op1_t &Out) {
3064 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3065}
3066
3067/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3068/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3069/// recipe could be created.
3070/// \p HeaderMask Header Mask.
3071/// \p CurRecipe Recipe to be transform.
3072/// \p TypeInfo VPlan-based type analysis.
3073/// \p EVL The explicit vector length parameter of vector-predication
3074/// intrinsics.
// NOTE(review): the first line of this signature (original line 3075,
// presumably `static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,`)
// was lost in extraction; see the doc comment above for the contract.
3076 VPRecipeBase &CurRecipe,
3077 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
3078 VPlan *Plan = CurRecipe.getParent()->getPlan();
3079 DebugLoc DL = CurRecipe.getDebugLoc();
3080 VPValue *Addr, *Mask, *EndPtr;
3081
3082 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3083 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3084 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3085 EVLEndPtr->insertBefore(&CurRecipe);
// Operand 1 of a vector-end-pointer is the lane count; use EVL instead of VF.
3086 EVLEndPtr->setOperand(1, &EVL);
3087 return EVLEndPtr;
3088 };
3089
// Case 1: forward masked load -> vp load with EVL.
3090 if (match(&CurRecipe,
3091 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
3092 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
3093 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3094 EVL, Mask);
3095
// Case 2: reverse(masked reverse-load) -> vp load at an EVL-adjusted end
// pointer, followed by experimental_vp_reverse.
3096 VPValue *ReversedVal;
3097 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
3098 match(ReversedVal,
3099 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
3100 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3101 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
3102 auto *LoadR = new VPWidenLoadEVLRecipe(
3103 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
3104 LoadR->insertBefore(&CurRecipe);
3105 return new VPWidenIntrinsicRecipe(
3106 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
3107 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
3108 }
3109
// Case 3: forward masked store -> vp store with EVL.
3110 VPValue *StoredVal;
3111 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3112 m_RemoveMask(HeaderMask, Mask))) &&
3113 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
3114 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3115 StoredVal, EVL, Mask);
3116
// Case 4: masked reverse-store of a reversed value -> vp-reverse the value,
// then vp store at an EVL-adjusted end pointer.
3117 if (match(&CurRecipe,
3118 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3119 m_RemoveMask(HeaderMask, Mask))) &&
3120 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3121 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
3122 auto *NewReverse = new VPWidenIntrinsicRecipe(
3123 Intrinsic::experimental_vp_reverse,
3124 {ReversedVal, Plan->getTrue(), &EVL},
3125 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
3126 NewReverse->insertBefore(&CurRecipe);
3127 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
3128 AdjustEndPtr(EndPtr), NewReverse, EVL,
3129 Mask);
3130 }
3131
// Case 5: conditional reduction masked by the header mask -> EVL reduction.
3132 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3133 if (Rdx->isConditional() &&
3134 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3135 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3136
// Case 6: masked interleave group -> EVL interleave.
3137 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3138 if (Interleave->getMask() &&
3139 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3140 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3141
// Case 7: select on the plain header mask -> vp.merge with an all-true mask.
3142 VPValue *LHS, *RHS;
3143 if (match(&CurRecipe,
3144 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3145 return new VPWidenIntrinsicRecipe(
3146 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3147 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3148
// Case 8: select on (header mask AND rest) -> vp.merge on the remaining mask.
3149 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3150 m_VPValue(RHS))))
3151 return new VPWidenIntrinsicRecipe(
3152 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3153 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3154
// Case 9: last-active-lane of the header mask is simply EVL - 1.
3155 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3156 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3157 VPValue *ZExt = VPBuilder(&CurRecipe)
// NOTE(review): the builder call on original line 3158 (presumably
// `.createScalarZExtOrTrunc(`) was lost in extraction.
3159 &EVL, Ty, TypeInfo.inferScalarType(&EVL), DL);
3160 return new VPInstruction(
3161 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3162 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3163 }
3164
// No EVL-based equivalent for this recipe.
3165 return nullptr;
3166}
3167
3168/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3169/// The transforms here need to preserve the original semantics.
// NOTE(review): the function signature (original line 3170) and the recipe
// traversal opening the loop below (original lines 3173-3174) were lost in
// extraction.
3171 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3172 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3175 m_VPValue(EVL))) &&
3176 match(EVL, m_EVL(m_VPValue()))) {
3177 HeaderMask = R.getVPSingleValue();
3178 break;
3179 }
3180 }
3181 if (!HeaderMask)
3182 return;
3183
3184 VPTypeAnalysis TypeInfo(Plan);
3185 SmallVector<VPRecipeBase *> OldRecipes;
// Try to rewrite every (transitive) user of the header mask into an EVL-based
// recipe; successfully replaced recipes are collected for later removal.
3186 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
// NOTE(review): the cast binding R (original line 3187, presumably
// `auto *R = cast<VPRecipeBase>(U);`) was lost in extraction.
3188 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3189 NewR->insertBefore(R);
3190 for (auto [Old, New] :
3191 zip_equal(R->definedValues(), NewR->definedValues()))
3192 Old->replaceAllUsesWith(New);
3193 OldRecipes.push_back(R);
3194 }
3195 }
3196 // Erase old recipes at the end so we don't invalidate TypeInfo.
3197 for (VPRecipeBase *R : reverse(OldRecipes)) {
// Snapshot operands before erasing so now-dead producers can be cleaned up.
3198 SmallVector<VPValue *> PossiblyDead(R->operands());
3199 R->eraseFromParent();
3200 for (VPValue *Op : PossiblyDead)
// NOTE(review): the cleanup call (original line 3201, presumably
// `recursivelyDeleteDeadRecipes(Op);`) was lost in extraction.
3202 }
3203}
3204
3205/// After replacing the canonical IV with an EVL-based IV, fixup recipes that use
3206/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3207/// iteration.
3208static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3209 VPTypeAnalysis TypeInfo(Plan);
3210 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3211 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3212
// Sanity-check that every VF user is of a kind we know how to retarget.
3213 assert(all_of(Plan.getVF().users(),
// NOTE(review): the assert's predicate (original lines 3214-3215) was lost in
// extraction.
3216 "User of VF that we can't transform to EVL.");
3217 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
// NOTE(review): the replacement predicate body (original line 3218) was lost
// in extraction -- confirm upstream which VF uses are rewritten.
3219 });
3220
3221 assert(all_of(Plan.getVFxUF().users(),
3222 [&LoopRegion, &Plan](VPUser *U) {
3223 return match(U,
3224 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3225 m_Specific(&Plan.getVFxUF()))) ||
3226 isa<VPWidenPointerInductionRecipe>(U);
3227 }) &&
3228 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3229 "increment of the canonical induction.");
3230 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3231 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3232 // canonical induction must not be updated.
// NOTE(review): the predicate's return expression (original line 3233) was
// lost in extraction.
3234 });
3235
3236 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3237 // contained.
3238 bool ContainsFORs =
// NOTE(review): the initializer (original line 3239, presumably an any_of over
// header phis checking for VPFirstOrderRecurrencePHIRecipe) was lost in
// extraction.
3240 if (ContainsFORs) {
3241 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3242 VPValue *MaxEVL = &Plan.getVF();
3243 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3244 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3245 MaxEVL = Builder.createScalarZExtOrTrunc(
3246 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3247 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3248
// prev.evl: MaxEVL on the first iteration, then last iteration's EVL.
3249 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3250 VPValue *PrevEVL = Builder.createScalarPhi(
3251 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3252
// NOTE(review): the block-traversal for-statement (original lines 3253-3254)
// was lost in extraction.
3255 for (VPRecipeBase &R : *VPBB) {
3256 VPValue *V1, *V2;
// Rewrite first-order-recurrence splices into @llvm.experimental.vp.splice
// driven by the previous and current EVL.
3257 if (!match(&R,
// NOTE(review): the matched opcode (original line 3258, presumably
// VPInstruction::FirstOrderRecurrenceSplice) was lost in extraction.
3259 m_VPValue(V1), m_VPValue(V2))))
3260 continue;
3261 VPValue *Imm = Plan.getOrAddLiveIn(
// NOTE(review): the immediate's construction and the start of the
// VPWidenIntrinsicRecipe creation (original lines 3262-3263) were lost in
// extraction.
3264 Intrinsic::experimental_vp_splice,
3265 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3266 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3267 R.getDebugLoc());
3268 VPSplice->insertBefore(&R);
3269 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3270 }
3271 }
3272 }
3273
3274 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3275 if (!HeaderMask)
3276 return;
3277
3278 // Replace header masks with a mask equivalent to predicating by EVL:
3279 //
3280 // icmp ule widen-canonical-iv backedge-taken-count
3281 // ->
3282 // icmp ult step-vector, EVL
3283 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3284 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3285 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3286 VPValue *EVLMask = Builder.createICmp(
// NOTE(review): the predicate argument (original line 3287, presumably
// CmpInst::ICMP_ULT) was lost in extraction.
3288 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3289 HeaderMask->replaceAllUsesWith(EVLMask);
3290}
3291
3292/// Converts a tail folded vector loop region to step by
3293/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3294/// iteration.
3295///
3296/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3297/// replaces all uses except the canonical IV increment of
3298/// VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
3299/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
3300/// this transformation.
3301///
3302/// - The header mask is replaced with a header mask based on the EVL.
3303///
3304/// - Plans with FORs have a new phi added to keep track of the EVL of the
3305/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3306/// @llvm.vp.splice.
3307///
3308/// The function uses the following definitions:
3309/// %StartV is the canonical induction start value.
3310///
3311/// The function adds the following recipes:
3312///
3313/// vector.ph:
3314/// ...
3315///
3316/// vector.body:
3317/// ...
3318/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3319/// [ %NextIter, %vector.body ]
3320/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3321/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3322/// ...
3323/// %OpEVL = cast i32 %VPEVL to IVSize
3324/// %NextIter = add IVSize %OpEVL, %CurrentIter
3325/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3326/// ...
3327///
3328/// If MaxSafeElements is provided, the function adds the following recipes:
3329/// vector.ph:
3330/// ...
3331///
3332/// vector.body:
3333/// ...
3334/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3335/// [ %NextIter, %vector.body ]
3336/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3337/// %cmp = cmp ult %AVL, MaxSafeElements
3338/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3339/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3340/// ...
3341/// %OpEVL = cast i32 %VPEVL to IVSize
3342/// %NextIter = add IVSize %OpEVL, %CurrentIter
3343/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3344/// ...
3345///
// NOTE(review): the first line of this signature (original line 3346,
// presumably `void VPlanTransforms::addExplicitVectorLength(`) was lost in
// extraction; see the block comment above for the full recipe layout added.
3347 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3348 if (Plan.hasScalarVFOnly())
3349 return;
3350 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3351 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3352
3353 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3354 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3355 VPValue *StartV = CanonicalIVPHI->getStartValue();
3356
3357 // Create the CurrentIteration recipe in the vector loop.
3358 auto *CurrentIteration =
// NOTE(review): the initializer (original line 3359, presumably
// `new VPCurrentIterationPHIRecipe(StartV, ...)`) was lost in extraction;
// StartV is otherwise unused in the visible code.
3360 CurrentIteration->insertAfter(CanonicalIVPHI);
3361 VPBuilder Builder(Header, Header->getFirstNonPhi());
3362 // Create the AVL (application vector length), starting from TC -> 0 in steps
3363 // of EVL.
3364 VPPhi *AVLPhi = Builder.createScalarPhi(
3365 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3366 VPValue *AVL = AVLPhi;
3367
3368 if (MaxSafeElements) {
3369 // Support for MaxSafeDist for correct loop emission.
// Cap the AVL at the maximum safe element count: min(AVL, MaxSafeElements).
3370 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3371 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3372 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3373 "safe_avl");
3374 }
3375 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3376 DebugLoc::getUnknown(), "evl");
3377
3378 auto *CanonicalIVIncrement =
3379 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3380 Builder.setInsertPoint(CanonicalIVIncrement);
3381 VPValue *OpVPEVL = VPEVL;
3382
// EVL is i32; widen/narrow it to the canonical IV type before arithmetic.
3383 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3384 OpVPEVL = Builder.createScalarZExtOrTrunc(
3385 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3386
3387 auto *NextIter = Builder.createAdd(OpVPEVL, CurrentIteration,
3388 CanonicalIVIncrement->getDebugLoc(),
3389 "current.iteration.next",
3390 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3391 CanonicalIVIncrement->hasNoSignedWrap()});
3392 CurrentIteration->addOperand(NextIter);
3393
// AVL counts down from the trip count; NUW holds since EVL <= AVL.
3394 VPValue *NextAVL =
3395 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3396 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3397 AVLPhi->addOperand(NextAVL);
3398
3399 fixupVFUsersForEVL(Plan, *VPEVL);
3400 removeDeadRecipes(Plan);
3401
3402 // Replace all uses of VPCanonicalIVPHIRecipe by
3403 // VPCurrentIterationPHIRecipe except for the canonical IV increment.
3404 CanonicalIVPHI->replaceAllUsesWith(CurrentIteration);
3405 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3406 // TODO: support unroll factor > 1.
3407 Plan.setUF(1);
3408}
3409
// NOTE(review): the function signature (original line 3410) was lost in
// extraction. The visible body lowers VPCurrentIterationPHIRecipe to a plain
// scalar phi and retires the canonical IV in EVL loops.
3411 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3412 // There should be only one VPCurrentIteration in the entire plan.
3413 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3414
// NOTE(review): the block-traversal for-statement (original lines 3415-3416)
// was lost in extraction.
3417 for (VPRecipeBase &R : VPBB->phis())
3418 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3419 assert(!CurrentIteration &&
3420 "Found multiple CurrentIteration. Only one expected");
3421 CurrentIteration = PhiR;
3422 }
3423
3424 // Early return if it is not variable-length stepping.
3425 if (!CurrentIteration)
3426 return;
3427
3428 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3429 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3430
3431 // Convert CurrentIteration to concrete recipe.
3432 auto *ScalarR =
3433 VPBuilder(CurrentIteration)
// NOTE(review): the builder call (original line 3434, presumably
// `.createScalarPhi(`) was lost in extraction.
3435 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3436 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3437 CurrentIteration->replaceAllUsesWith(ScalarR);
3438 CurrentIteration->eraseFromParent();
3439
3440 // Replace CanonicalIVInc with CurrentIteration increment.
3441 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3442 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3443 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3444 m_Specific(&Plan.getVFxUF()))) &&
3445 "Unexpected canonical iv");
3446 Backedge->replaceAllUsesWith(CurrentIterationIncr);
3447
3448 // Remove unused phi and increment.
3449 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3450 CanonicalIVIncrement->eraseFromParent();
3451 CanonicalIV->eraseFromParent();
3452}
3453
// NOTE(review): the function signature (original line 3454) was lost in
// extraction. The visible body rewrites the latch exit condition of an EVL
// tail-folded loop from comparing the canonical IV increment against the
// vector trip count to comparing the next AVL against zero.
3455 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3456 // The canonical IV may not exist at this stage.
3457 if (!LoopRegion ||
// NOTE(review): the second half of this condition (original line 3458) was
// lost in extraction.
3459 return;
3460 VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
3461 if (std::next(CanIV->getIterator()) == CanIV->getParent()->end())
3462 return;
3463 // The EVL IV is always immediately after the canonical IV.
// NOTE(review): the dyn_cast binding EVLPhi (original line 3464) was lost in
// extraction.
3465 std::next(CanIV->getIterator()));
3466 if (!EVLPhi)
3467 return;
3468
3469 // Bail if not an EVL tail folded loop.
3470 VPValue *AVL;
3471 if (!match(EVLPhi->getBackedgeValue(),
3472 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3473 return;
3474
3475 // The AVL may be capped to a safe distance.
3476 VPValue *SafeAVL;
3477 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3478 AVL = SafeAVL;
3479
3480 VPValue *AVLNext;
3481 [[maybe_unused]] bool FoundAVLNext =
// NOTE(review): the match expression (original line 3482) was lost in
// extraction; it matches the AVL phi fed from the trip count and AVLNext.
3483 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3484 assert(FoundAVLNext && "Didn't find AVL backedge?");
3485
3486 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3487 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
// Nothing to do if the branch is already a constant-true exit.
3488 if (match(LatchBr, m_BranchOnCond(m_True())))
3489 return;
3490
3491 assert(
3492 match(LatchBr,
// NOTE(review): the inner pattern of this assert (original lines 3493-3494)
// was lost in extraction.
3495 m_Specific(&Plan.getVectorTripCount())))) &&
3496 "Expected BranchOnCond with ICmp comparing CanIV increment with vector "
3497 "trip count");
3498
// Exit when the remaining application vector length reaches zero.
3499 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3500 VPBuilder Builder(LatchBr);
3501 LatchBr->setOperand(
3502 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3503}
3504
// NOTE(review): the first line of this signature (original line 3505,
// presumably `void VPlanTransforms::replaceSymbolicStrides(`) was lost in
// extraction. Replaces symbolic stride live-ins with the constants guaranteed
// by SCEV predicates, and rewrites SCEV expansions accordingly.
3506 VPlan &Plan, PredicatedScalarEvolution &PSE,
3507 const DenseMap<Value *, const SCEV *> &StridesMap) {
3508 // Replace VPValues for known constant strides guaranteed by predicated scalar
3509 // evolution.
// Versioned strides may only be used inside regions or in the block directly
// preceding the vector loop region, where the stride predicate holds.
3510 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3511 auto *R = cast<VPRecipeBase>(&U);
3512 return R->getRegion() ||
3513 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3514 };
3515 ValueToSCEVMapTy RewriteMap;
3516 for (const SCEV *Stride : StridesMap.values()) {
3517 using namespace SCEVPatternMatch;
3518 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3519 const APInt *StrideConst;
3520 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3521 // Only handle constant strides for now.
3522 continue;
3523
3524 auto *CI = Plan.getConstantInt(*StrideConst);
3525 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3526 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3527
3528 // The versioned value may not be used in the loop directly but through a
3529 // sext/zext. Add new live-ins in those cases.
3530 for (Value *U : StrideV->users()) {
// NOTE(review): the filter condition (original line 3531, presumably skipping
// users that are not SExtInst/ZExtInst) was lost in extraction.
3532 continue;
3533 VPValue *StrideVPV = Plan.getLiveIn(U);
3534 if (!StrideVPV)
3535 continue;
// Extend the constant to the cast's width, matching sext vs zext semantics.
3536 unsigned BW = U->getType()->getScalarSizeInBits();
3537 APInt C =
3538 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3539 VPValue *CI = Plan.getConstantInt(C);
3540 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3541 }
3542 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3543 }
3544
// Re-expand any SCEV expansion recipes whose expression mentions a versioned
// stride, so they use the rewritten (constant-folded) SCEV.
3545 for (VPRecipeBase &R : *Plan.getEntry()) {
3546 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3547 if (!ExpSCEV)
3548 continue;
3549 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3550 auto *NewSCEV =
3551 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3552 if (NewSCEV != ScevExpr) {
3553 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3554 ExpSCEV->replaceAllUsesWith(NewExp);
// Keep the plan's trip count pointing at the (possibly new) expansion.
3555 if (Plan.getTripCount() == ExpSCEV)
3556 Plan.resetTripCount(NewExp);
3557 }
3558 }
3559}
3560
// Drops poison-generating flags from recipes that feed the address computation
// of consecutive, predicated widen memory accesses (and of predicated
// interleave groups), so that speculated address computation cannot introduce
// poison. NOTE(review): this is a doc-page extraction; the function's opening
// line (original line 3561) is missing -- presumably
// VPlanTransforms::dropPoisonGeneratingRecipes. Verify against upstream.
3562 VPlan &Plan,
3563 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3564 // Collect recipes in the backward slice of `Root` that may generate a poison
3565 // value that is used after vectorization.
// NOTE(review): the declarations of Worklist/Visited (original lines
// 3566/3568) are missing from this extraction.
3567 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3569 Worklist.push_back(Root);
3570
3571 // Traverse the backward slice of Root through its use-def chain.
3572 while (!Worklist.empty()) {
3573 VPRecipeBase *CurRec = Worklist.pop_back_val();
3574
3575 if (!Visited.insert(CurRec).second)
3576 continue;
3577
3578 // Prune search if we find another recipe generating a widen memory
3579 // instruction. Widen memory instructions involved in address computation
3580 // will lead to gather/scatter instructions, which don't need to be
3581 // handled.
// NOTE(review): the start of this isa<> check (original line 3582) is
// missing from this extraction.
3583 VPHeaderPHIRecipe>(CurRec))
3584 continue;
3585
3586 // This recipe contributes to the address computation of a widen
3587 // load/store. If the underlying instruction has poison-generating flags,
3588 // drop them directly.
3589 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3590 VPValue *A, *B;
3591 // Dropping disjoint from an OR may yield incorrect results, as some
3592 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3593 // for dependence analysis). Instead, replace it with an equivalent Add.
3594 // This is possible as all users of the disjoint OR only access lanes
3595 // where the operands are disjoint or poison otherwise.
3596 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3597 RecWithFlags->isDisjoint()) {
3598 VPBuilder Builder(RecWithFlags);
3599 VPInstruction *New =
3600 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
// Keep the link to the underlying IR instruction so later transforms and
// cost modeling still see it.
3601 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3602 RecWithFlags->replaceAllUsesWith(New);
3603 RecWithFlags->eraseFromParent();
// Continue the backward walk from the replacement recipe's operands.
3604 CurRec = New;
3605 } else
3606 RecWithFlags->dropPoisonGeneratingFlags();
3607 } else {
// NOTE(review): the declaration of Instr (original lines 3608-3609) is
// missing from this extraction; the assert below checks that recipes
// without IR flags cannot carry poison-generating flags.
3610 (void)Instr;
3611 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3612 "found instruction with poison generating flags not covered by "
3613 "VPRecipeWithIRFlags");
3614 }
3615
3616 // Add new definitions to the worklist.
3617 for (VPValue *Operand : CurRec->operands())
3618 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3619 Worklist.push_back(OpDef);
3620 }
3621 });
3622
3623 // Traverse all the recipes in the VPlan and collect the poison-generating
3624 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3625 // VPInterleaveRecipe.
3626 auto Iter = vp_depth_first_deep(Plan.getEntry());
// NOTE(review): the outer loop header over VPBasicBlocks (original line
// 3627) is missing from this extraction.
3628 for (VPRecipeBase &Recipe : *VPBB) {
3629 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3630 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3631 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3632 if (AddrDef && WidenRec->isConsecutive() &&
3633 BlockNeedsPredication(UnderlyingInstr.getParent()))
3634 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3635 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3636 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3637 if (AddrDef) {
3638 // Check if any member of the interleave group needs predication.
3639 const InterleaveGroup<Instruction> *InterGroup =
3640 InterleaveRec->getInterleaveGroup();
3641 bool NeedPredication = false;
// Gaps are allowed in interleave groups, so members may be null.
3642 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3643 I < NumMembers; ++I) {
3644 Instruction *Member = InterGroup->getMember(I);
3645 if (Member)
3646 NeedPredication |= BlockNeedsPredication(Member->getParent());
3647 }
3648
3649 if (NeedPredication)
3650 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3651 }
3652 }
3653 }
3654 }
3655}
3656
// Replaces the per-member widened memory recipes of each interleave group with
// a single VPInterleaveRecipe at the group's insert position, collecting
// stored values and intersecting metadata across all members.
// NOTE(review): doc-page extraction; the opening line (original line 3657,
// presumably VPlanTransforms::createInterleaveGroups) and the declaration of
// NW (original line 3695) are missing. Verify against upstream.
3658 VPlan &Plan,
3660 &InterleaveGroups,
3661 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3662 if (InterleaveGroups.empty())
3663 return;
3664
3665 // Interleave memory: for each Interleave Group we marked earlier as relevant
3666 // for this VPlan, replace the Recipes widening its memory instructions with a
3667 // single VPInterleaveRecipe at its insertion point.
3668 VPDominatorTree VPDT(Plan);
3669 for (const auto *IG : InterleaveGroups) {
3670 auto *Start =
3671 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
// Metadata for the combined recipe starts from member zero and is narrowed
// below to what all members agree on.
3672 VPIRMetadata InterleaveMD(*Start);
3673 SmallVector<VPValue *, 4> StoredValues;
3674 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3675 StoredValues.push_back(StoreR->getStoredValue());
3676 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3677 Instruction *MemberI = IG->getMember(I);
// Interleave groups may have gaps; skip absent members.
3678 if (!MemberI)
3679 continue;
3680 VPWidenMemoryRecipe *MemoryR =
3681 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3682 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3683 StoredValues.push_back(StoreR->getStoredValue());
3684 InterleaveMD.intersect(*MemoryR);
3685 }
3686
// A mask for gaps is needed when a scalar epilogue cannot cover the
// trailing elements, or when storing with an incomplete group.
3687 bool NeedsMaskForGaps =
3688 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3689 (!StoredValues.empty() && !IG->isFull());
3690
3691 Instruction *IRInsertPos = IG->getInsertPos();
3692 auto *InsertPos =
3693 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3694
// NOTE(review): NW's declaration (original line 3695) is missing here; it
// is seeded from the insert position's GEP flags below.
3696 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3697 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3698 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3699
3700 // Get or create the start address for the interleave group.
3701 VPValue *Addr = Start->getAddr();
3702 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3703 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3704 // We cannot re-use the address of member zero because it does not
3705 // dominate the insert position. Instead, use the address of the insert
3706 // position and create a PtrAdd adjusting it to the address of member
3707 // zero.
3708 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3709 // InsertPos or sink loads above zero members to join it.
3710 assert(IG->getIndex(IRInsertPos) != 0 &&
3711 "index of insert position shouldn't be zero");
3712 auto &DL = IRInsertPos->getDataLayout();
// Byte distance from member zero to the insert position; negated below to
// step back to member zero.
3713 APInt Offset(32,
3714 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3715 IG->getIndex(IRInsertPos),
3716 /*IsSigned=*/true);
3717 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3718 VPBuilder B(InsertPos);
3719 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3720 }
3721 // If the group is reverse, adjust the index to refer to the last vector
3722 // lane instead of the first. We adjust the index from the first vector
3723 // lane, rather than directly getting the pointer for lane VF - 1, because
3724 // the pointer operand of the interleaved access is supposed to be uniform.
3725 if (IG->isReverse()) {
3726 auto *ReversePtr = new VPVectorEndPointerRecipe(
3727 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3728 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3729 ReversePtr->insertBefore(InsertPos);
3730 Addr = ReversePtr;
3731 }
3732 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3733 InsertPos->getMask(), NeedsMaskForGaps,
3734 InterleaveMD, InsertPos->getDebugLoc());
3735 VPIG->insertBefore(InsertPos);
3736
// Rewire uses of each (non-void, i.e. load) member to the corresponding
// result value of the interleave recipe, then erase the member recipes.
3737 unsigned J = 0;
3738 for (unsigned i = 0; i < IG->getFactor(); ++i)
3739 if (Instruction *Member = IG->getMember(i)) {
3740 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3741 if (!Member->getType()->isVoidTy()) {
3742 VPValue *OriginalV = MemberR->getVPSingleValue();
3743 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3744 J++;
3745 }
3746 MemberR->eraseFromParent();
3747 }
3748 }
3749}
3750
3751/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3752/// value, phi and backedge value. In the following example:
3753///
3754/// vector.ph:
3755/// Successor(s): vector loop
3756///
3757/// <x1> vector loop: {
3758/// vector.body:
3759/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3760/// ...
3761/// EMIT branch-on-count ...
3762/// No successors
3763/// }
3764///
3765/// WIDEN-INDUCTION will get expanded to:
3766///
3767/// vector.ph:
3768/// ...
3769/// vp<%induction.start> = ...
3770/// vp<%induction.increment> = ...
3771///
3772/// Successor(s): vector loop
3773///
3774/// <x1> vector loop: {
3775/// vector.body:
3776/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3777/// ...
3778/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3779/// EMIT branch-on-count ...
3780/// No successors
3781/// }
// Expands a VPWidenIntOrFpInductionRecipe into explicit recipes: initial
// vector value in the preheader, a widened phi, and the backedge increment
// (see the /// example above). NOTE(review): doc-page extraction; original
// lines 3783 (function name), 3796-3797 (AddOp/MulOp declarations), 3821
// (integer step-vector type expression) and 3862 (ExitingBB declaration) are
// missing. Verify against upstream.
3782static void
3784 VPTypeAnalysis &TypeInfo) {
3785 VPlan *Plan = WidenIVR->getParent()->getPlan();
3786 VPValue *Start = WidenIVR->getStartValue();
3787 VPValue *Step = WidenIVR->getStepValue();
3788 VPValue *VF = WidenIVR->getVFValue();
3789 DebugLoc DL = WidenIVR->getDebugLoc();
3790
3791 // The value from the original loop to which we are mapping the new induction
3792 // variable.
3793 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3794
3795 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
// Select integer vs. floating-point add/mul opcodes for the expansion.
3798 VPIRFlags Flags = *WidenIVR;
3799 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3800 AddOp = Instruction::Add;
3801 MulOp = Instruction::Mul;
3802 } else {
3803 AddOp = ID.getInductionOpcode();
3804 MulOp = Instruction::FMul;
3805 }
3806
3807 // If the phi is truncated, truncate the start and step values.
3808 VPBuilder Builder(Plan->getVectorPreheader());
3809 Type *StepTy = TypeInfo.inferScalarType(Step);
3810 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3811 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3812 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3813 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3814 // Truncation doesn't preserve WrapFlags.
3815 Flags.dropPoisonGeneratingFlags();
3816 StepTy = Ty;
3817 }
3818
3819 // Construct the initial value of the vector IV in the vector loop preheader.
// Start from an integer step vector <0, 1, 2, ...>; converted to FP below
// when the step type is floating point.
3820 Type *IVIntTy =
3822 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3823 if (StepTy->isFloatingPointTy())
3824 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3825
3826 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3827 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3828
// Init = Start + <0, 1, 2, ...> * Step, computed lane-wise.
3829 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3830 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3831 DebugLoc::getUnknown(), "induction");
3832
3833 // Create the widened phi of the vector IV.
3834 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3835 WidenIVR->getDebugLoc(), "vec.ind");
3836 WidePHI->insertBefore(WidenIVR);
3837
3838 // Create the backedge value for the vector IV.
3839 VPValue *Inc;
3840 VPValue *Prev;
3841 // If unrolled, use the increment and prev value from the operands.
3842 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3843 Inc = SplatVF;
3844 Prev = WidenIVR->getLastUnrolledPartOperand();
3845 } else {
// Place the increment computation right after VF's definition, if any.
3846 if (VPRecipeBase *R = VF->getDefiningRecipe())
3847 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3848 // Multiply the vectorization factor by the step using integer or
3849 // floating-point arithmetic as appropriate.
3850 if (StepTy->isFloatingPointTy())
3851 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3852 DL);
3853 else
3854 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3855 TypeInfo.inferScalarType(VF), DL);
3856
3857 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3858 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3859 Prev = WidePHI;
3860 }
3861
// The backedge increment is emitted just before the exiting block's
// terminator. NOTE(review): ExitingBB's declaration (original line 3862)
// is missing from this extraction.
3863 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3864 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3865 WidenIVR->getDebugLoc(), "vec.ind.next");
3866
3867 WidePHI->addOperand(Next);
3868
3869 WidenIVR->replaceAllUsesWith(WidePHI);
3870}
3871
3872/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3873/// initial value, phi and backedge value. In the following example:
3874///
3875/// <x1> vector loop: {
3876/// vector.body:
3877/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3878/// ...
3879/// EMIT branch-on-count ...
3880/// }
3881///
3882/// WIDEN-POINTER-INDUCTION will get expanded to:
3883///
3884/// <x1> vector loop: {
3885/// vector.body:
3886/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3887/// EMIT %mul = mul %stepvector, %step
3888/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3889/// ...
3890/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3891/// EMIT branch-on-count ...
3892/// }
// Expands a VPWidenPointerInductionRecipe into a scalar pointer phi, a wide
// ptradd producing the per-lane addresses, and the backedge ptradd (see the
// /// example above). NOTE(review): doc-page extraction; original lines 3893
// (function name), 3901 (IK_PtrInduction enumerator) and 3924 (ExitingBB
// declaration) are missing. Verify against upstream.
3894 VPTypeAnalysis &TypeInfo) {
3895 VPlan *Plan = R->getParent()->getPlan();
3896 VPValue *Start = R->getStartValue();
3897 VPValue *Step = R->getStepValue();
3898 VPValue *VF = R->getVFValue();
3899
3900 assert(R->getInductionDescriptor().getKind() ==
3902 "Not a pointer induction according to InductionDescriptor!");
3903 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3904 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3905 "Recipe should have been replaced");
3906
3907 VPBuilder Builder(R);
3908 DebugLoc DL = R->getDebugLoc();
3909
3910 // Build a scalar pointer phi.
3911 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3912
3913 // Create actual address geps that use the pointer phi as base and a
3914 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3915 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3916 Type *StepTy = TypeInfo.inferScalarType(Step);
3917 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3918 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3919 VPValue *PtrAdd =
3920 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3921 R->replaceAllUsesWith(PtrAdd);
3922
3923 // Create the backedge value for the scalar pointer phi.
// NOTE(review): ExitingBB's declaration (original line 3924) is missing
// from this extraction.
3925 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
// Scale the per-iteration stride: Inc = Step * VF (VF converted to StepTy).
3926 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3927 DL);
3928 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3929
3930 VPValue *InductionGEP =
3931 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3932 ScalarPtrPhi->addOperand(InductionGEP);
3933}
3934
// Dissolves all non-replicator (loop) regions into plain CFG blocks.
// NOTE(review): doc-page extraction; the function signature (original line
// 3935) and the loop header over regions (original line 3938) are missing.
3936 // Replace loop regions with explicitly represented CFG.
// Collect first, then dissolve, to avoid mutating the region structure
// while traversing it.
3937 SmallVector<VPRegionBlock *> LoopRegions;
3939 vp_depth_first_deep(Plan.getEntry()))) {
3940 if (!R->isReplicator())
3941 LoopRegions.push_back(R);
3942 }
3943 for (VPRegionBlock *R : LoopRegions)
3944 R->dissolveToCFGLoop();
3945}
3946
// Expands each BranchOnTwoConds terminator (2 conditions, 3 successors) into
// two single-condition branches connected through a new interim block.
// NOTE(review): doc-page extraction; the function signature (original lines
// 3947-3948), the WorkList declaration / block-loop header (3951-3952), and
// the two createNaryOp(BranchOnCond, ...) calls (3988, 3992) are missing.
3949 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3950 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3953 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3954 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3955 }
3956
3957 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3958 // single-condition branches:
3959 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3960 // the first condition is true, and otherwise jumps to a new interim block.
3961 // 2. A branch that ends the interim block, jumps to the second successor if
3962 // the second condition is true, and otherwise jumps to the third
3963 // successor.
3964 for (VPInstruction *Br : WorkList) {
3965 assert(Br->getNumOperands() == 2 &&
3966 "BranchOnTwoConds must have exactly 2 conditions");
3967 DebugLoc DL = Br->getDebugLoc();
3968 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
// Snapshot successors before disconnecting, since disconnectBlocks mutates
// the successor list.
3969 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3970 assert(Successors.size() == 3 &&
3971 "BranchOnTwoConds must have exactly 3 successors");
3972
3973 for (VPBlockBase *Succ : Successors)
3974 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3975
3976 VPValue *Cond0 = Br->getOperand(0);
3977 VPValue *Cond1 = Br->getOperand(1);
3978 VPBlockBase *Succ0 = Successors[0];
3979 VPBlockBase *Succ1 = Successors[1];
3980 VPBlockBase *Succ2 = Successors[2];
3981 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3982 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3983
3984 VPBasicBlock *InterimBB =
3985 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
3986
// First branch: Cond0 ? Succ0 : InterimBB. NOTE(review): the builder call
// emitting it (original line 3988) is missing from this extraction.
3987 VPBuilder(BrOnTwoCondsBB)
3989 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
3990 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
3991
// Second branch in InterimBB: Cond1 ? Succ1 : Succ2. NOTE(review): the
// builder call emitting it (original line 3992) is missing here.
3993 VPBlockUtils::connectBlocks(InterimBB, Succ1);
3994 VPBlockUtils::connectBlocks(InterimBB, Succ2);
3995 Br->eraseFromParent();
3996 }
3997}
3998
// Lowers abstract recipes to concrete, executable ones: expands widened
// int/FP and pointer inductions, VPBlendRecipe to selects, VPExpressionRecipe
// to its constituent recipes, LastActiveLane / MaskedCond / BranchOnCount /
// WideIVStep VPInstructions to simpler operations.
// NOTE(review): doc-page extraction; the function signature (original line
// 3999), the ToRemove declaration / block-loop header (4001-4002), and a few
// interior lines (4057, 4065, 4081, 4104, 4112) are missing.
4000 VPTypeAnalysis TypeInfo(Plan);
4003 vp_depth_first_deep(Plan.getEntry()))) {
4004 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4005 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4006 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
4007 ToRemove.push_back(WidenIVR);
4008 continue;
4009 }
4010
4011 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4012 // If the recipe only generates scalars, scalarize it instead of
4013 // expanding it.
4014 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4015 VPBuilder Builder(WidenIVR);
4016 VPValue *PtrAdd =
4017 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4018 WidenIVR->replaceAllUsesWith(PtrAdd);
4019 ToRemove.push_back(WidenIVR);
4020 continue;
4021 }
4022 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
4023 ToRemove.push_back(WidenIVR);
4024 continue;
4025 }
4026
4027 // Expand VPBlendRecipe into VPInstruction::Select.
4028 VPBuilder Builder(&R);
4029 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
// Fold incoming values into a select chain; later incomings win when
// their mask is true.
4030 VPValue *Select = Blend->getIncomingValue(0);
4031 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4032 Select = Builder.createSelect(Blend->getMask(I),
4033 Blend->getIncomingValue(I), Select,
4034 R.getDebugLoc(), "predphi", *Blend);
4035 Blend->replaceAllUsesWith(Select);
4036 ToRemove.push_back(Blend);
4037 }
4038
4039 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4040 if (!VEPR->getOffset()) {
4041 assert(Plan.getConcreteUF() == 1 &&
4042 "Expected unroller to have materialized offset for UF != 1");
4043 VEPR->materializeOffset();
4044 }
4045 }
4046
4047 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4048 Expr->decompose();
4049 ToRemove.push_back(Expr);
4050 }
4051
4052 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4053 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4054 if (LastActiveL &&
4055 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4056 // Create Not(Mask) for all operands.
// NOTE(review): the NotMasks declaration (original line 4057) is
// missing from this extraction.
4058 for (VPValue *Op : LastActiveL->operands()) {
4059 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4060 NotMasks.push_back(NotMask);
4061 }
4062
4063 // Create FirstActiveLane on the inverted masks.
4064 VPValue *FirstInactiveLane = Builder.createNaryOp(
4066 LastActiveL->getDebugLoc(), "first.inactive.lane");
4067
4068 // Subtract 1 to get the last active lane.
4069 VPValue *One =
4070 Plan.getConstantInt(TypeInfo.inferScalarType(FirstInactiveLane), 1);
4071 VPValue *LastLane =
4072 Builder.createSub(FirstInactiveLane, One,
4073 LastActiveL->getDebugLoc(), "last.active.lane");
4074
4075 LastActiveL->replaceAllUsesWith(LastLane);
4076 ToRemove.push_back(LastActiveL);
4077 continue;
4078 }
4079
4080 // Lower MaskedCond with block mask to LogicalAnd.
// NOTE(review): the match() guard for MaskedCond (original line 4081)
// is missing from this extraction.
4082 auto *VPI = cast<VPInstruction>(&R);
4083 assert(VPI->isMasked() &&
4084 "Unmasked MaskedCond should be simplified earlier");
4085 VPI->replaceAllUsesWith(Builder.createNaryOp(
4086 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4087 ToRemove.push_back(VPI);
4088 continue;
4089 }
4090
4091 // Lower BranchOnCount to ICmp + BranchOnCond.
4092 VPValue *IV, *TC;
4093 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4094 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4095 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4096 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4097 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4098 ToRemove.push_back(BranchOnCountInst);
4099 continue;
4100 }
4101
4102 VPValue *VectorStep;
4103 VPValue *ScalarStep;
// NOTE(review): the WideIVStep match() guard (original line 4104) is
// missing from this extraction.
4105 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4106 continue;
4107
4108 // Expand WideIVStep.
4109 auto *VPI = cast<VPInstruction>(&R);
4110 Type *IVTy = TypeInfo.inferScalarType(VPI);
4111 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
// Cast the vector step to the IV type: UIToFP for FP IVs, Trunc
// otherwise (CastOp selection on the missing original line 4112).
4113 ? Instruction::UIToFP
4114 : Instruction::Trunc;
4115 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4116 }
4117
4118 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4119 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4120 ScalarStep =
4121 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4122 }
4123
4124 VPIRFlags Flags;
4125 unsigned MulOpc;
4126 if (IVTy->isFloatingPointTy()) {
4127 MulOpc = Instruction::FMul;
4128 Flags = VPI->getFastMathFlags();
4129 } else {
4130 MulOpc = Instruction::Mul;
4131 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4132 }
4133
4134 VPInstruction *Mul = Builder.createNaryOp(
4135 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4136 VectorStep = Mul;
4137 VPI->replaceAllUsesWith(VectorStep);
4138 ToRemove.push_back(VPI);
4139 }
4140 }
4141
// Erase replaced recipes only after traversal to keep iteration valid.
4142 for (VPRecipeBase *R : ToRemove)
4143 R->eraseFromParent();
4144}
4145
// Rewrites uncountable early exits: collects each early-exiting block's
// condition, combines them into an AnyOf latch condition, and routes taken
// exits through per-exit vector.early.exit blocks via a dispatch chain that
// extracts values at the first active lane.
// NOTE(review): doc-page extraction; the function's opening line (original
// line 4146) and several declarations (Exits at 4158, the RPO traversal at
// 4198, RPOIdx at 4200, MaskedCond opcode at 4176) are missing. Verify
// against upstream VPlanTransforms.cpp.
4147 VPBasicBlock *HeaderVPBB,
4148 VPBasicBlock *LatchVPBB,
4149 VPBasicBlock *MiddleVPBB) {
// Per-exit bookkeeping: the in-loop exiting block, the IR exit block it
// targets, and the (possibly negated) condition under which it exits.
4150 struct EarlyExitInfo {
4151 VPBasicBlock *EarlyExitingVPBB;
4152 VPIRBasicBlock *EarlyExitVPBB;
4153 VPValue *CondToExit;
4154 };
4155
4156 VPDominatorTree VPDT(Plan);
4157 VPBuilder Builder(LatchVPBB->getTerminator());
4159 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4160 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
// The middle block is the normal (countable) exit path; skip it.
4161 if (Pred == MiddleVPBB)
4162 continue;
4163 // Collect condition for this early exit.
4164 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4165 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4166 VPValue *CondOfEarlyExitingVPBB;
4167 [[maybe_unused]] bool Matched =
4168 match(EarlyExitingVPBB->getTerminator(),
4169 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4170 assert(Matched && "Terminator must be BranchOnCond");
4171
4172 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4173 // the correct block mask.
4174 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4175 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
// Normalize the condition so "true" always means "take this exit";
// negate when the exit block is on the false edge.
4177 TrueSucc == ExitBlock
4178 ? CondOfEarlyExitingVPBB
4179 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4180 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4181 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4182 VPDT.properlyDominates(
4183 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4184 LatchVPBB)) &&
4185 "exit condition must dominate the latch");
4186 Exits.push_back({
4187 EarlyExitingVPBB,
4188 ExitBlock,
4189 CondToEarlyExit,
4190 });
4191 }
4192 }
4193
4194 assert(!Exits.empty() && "must have at least one early exit");
4195 // Sort exits by RPO order to get correct program order. RPO gives a
4196 // topological ordering of the CFG, ensuring upstream exits are checked
4197 // before downstream exits in the dispatch chain.
4199 HeaderVPBB);
4201 for (const auto &[Num, VPB] : enumerate(RPOT))
4202 RPOIdx[VPB] = Num;
4203 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4204 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4205 });
4206#ifndef NDEBUG
4207 // After RPO sorting, verify that for any pair where one exit dominates
4208 // another, the dominating exit comes first. This is guaranteed by RPO
4209 // (topological order) and is required for the dispatch chain correctness.
4210 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4211 for (unsigned J = I + 1; J < Exits.size(); ++J)
4212 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4213 Exits[I].EarlyExitingVPBB) &&
4214 "RPO sort must place dominating exits before dominated ones");
4215#endif
4216
4217 // Build the AnyOf condition for the latch terminator using logical OR
4218 // to avoid poison propagation from later exit conditions when an earlier
4219 // exit is taken.
4220 VPValue *Combined = Exits[0].CondToExit;
4221 for (const EarlyExitInfo &Info : drop_begin(Exits))
4222 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4223
4224 VPValue *IsAnyExitTaken =
4225 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4226
4227 // Create the vector.early.exit blocks.
4228 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4229 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4230 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4231 VPBasicBlock *VectorEarlyExitVPBB =
4232 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4233 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4234 }
4235
4236 // Create the dispatch block (or reuse the single exit block if only one
4237 // exit). The dispatch block computes the first active lane of the combined
4238 // condition and, for multiple exits, chains through conditions to determine
4239 // which exit to take.
4240 VPBasicBlock *DispatchVPBB =
4241 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4242 : Plan.createVPBasicBlock("vector.early.exit.check");
4243 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4244 VPValue *FirstActiveLane =
4245 DispatchBuilder.createNaryOp(VPInstruction::FirstActiveLane, {Combined},
4246 DebugLoc::getUnknown(), "first.active.lane");
4247
4248 // For each early exit, disconnect the original exiting block
4249 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4250 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4251 // values at the first active lane:
4252 //
4253 // Input:
4254 // early.exiting.I:
4255 // ...
4256 // EMIT branch-on-cond vp<%cond.I>
4257 // Successor(s): in.loop.succ, ir-bb<exit.I>
4258 //
4259 // ir-bb<exit.I>:
4260 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4261 //
4262 // Output:
4263 // early.exiting.I:
4264 // ...
4265 // Successor(s): in.loop.succ
4266 //
4267 // vector.early.exit.I:
4268 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4269 // Successor(s): ir-bb<exit.I>
4270 //
4271 // ir-bb<exit.I>:
4272 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4273 // vector.early.exit.I)
4274 //
4275 for (auto [Exit, VectorEarlyExitVPBB] :
4276 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4277 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4278 // Adjust the phi nodes in EarlyExitVPBB.
4279 // 1. remove incoming values from EarlyExitingVPBB,
4280 // 2. extract the incoming value at FirstActiveLane
4281 // 3. add back the extracts as last operands for the phis
4282 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4283 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4284 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4285 // values from VectorEarlyExitVPBB.
4286 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4287 auto *ExitIRI = cast<VPIRPhi>(&R);
4288 VPValue *IncomingVal =
4289 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4290 VPValue *NewIncoming = IncomingVal;
// IR values are loop-invariant here; only recipe-defined values need a
// lane extract.
4291 if (!isa<VPIRValue>(IncomingVal)) {
4292 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4293 NewIncoming = EarlyExitBuilder.createNaryOp(
4294 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4295 DebugLoc::getUnknown(), "early.exit.value");
4296 }
4297 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4298 ExitIRI->addOperand(NewIncoming);
4299 }
4300
4301 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4302 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4303 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4304 }
4305
4306 // Chain through exits: for each exit, check if its condition is true at
4307 // the first active lane. If so, take that exit; otherwise, try the next.
4308 // The last exit needs no check since it must be taken if all others fail.
4309 //
4310 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4311 //
4312 // latch:
4313 // ...
4314 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4315 // ...
4316 //
4317 // vector.early.exit.check:
4318 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4319 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4320 // EMIT branch-on-cond vp<%at.cond.0>
4321 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4322 //
4323 // vector.early.exit.check.0:
4324 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4325 // EMIT branch-on-cond vp<%at.cond.1>
4326 // Successor(s): vector.early.exit.1, vector.early.exit.2
4327 VPBasicBlock *CurrentBB = DispatchVPBB;
4328 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4329 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4330 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4331 DebugLoc::getUnknown(), "exit.cond.at.lane");
4332
4333 // For the last dispatch, branch directly to the last exit on false;
4334 // otherwise, create a new check block.
4335 bool IsLastDispatch = (I + 2 == Exits.size());
4336 VPBasicBlock *FalseBB =
4337 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4338 : Plan.createVPBasicBlock(
4339 Twine("vector.early.exit.check.") + Twine(I));
4340
4341 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4342 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4343 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4344 FalseBB->setPredecessors({CurrentBB});
4345
4346 CurrentBB = FalseBB;
4347 DispatchBuilder.setInsertPoint(CurrentBB);
4348 }
4349
4350 // Replace the latch terminator with the new branching logic.
4351 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4352 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4353 "Unexpected terminator");
4354 auto *IsLatchExitTaken =
4355 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4356 LatchExitingBranch->getOperand(1));
4357
4358 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4359 LatchExitingBranch->eraseFromParent();
4360 Builder.setInsertPoint(LatchVPBB);
// Three-way latch: any early exit -> dispatch, trip count reached ->
// middle block, otherwise continue with the header.
4361 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4362 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4363 LatchVPBB->clearSuccessors();
4364 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4365 DispatchVPBB->setPredecessors({LatchVPBB});
4366}
4367
4368/// This function tries to convert extended in-loop reductions to
4369/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4370/// valid. The created recipe must be decomposed to its constituent
4371/// recipes before execution.
// NOTE(review): doc-page extraction; original lines 4373 (function name and
// parameters), 4381 (clamp-range helper call), and the ExtKind/ExtRedCost
// declarations (4384/4386) are missing. Verify against upstream.
4372static VPExpressionRecipe *
4374 VFRange &Range) {
4375 Type *RedTy = Ctx.Types.inferScalarType(Red);
4376 VPValue *VecOp = Red->getVecOp();
4377
4378 // Clamp the range if using extended-reduction is profitable.
// Returns true (and clamps Range) when the fused ext+reduce is cheaper than
// costing the extend and the reduction separately.
4379 auto IsExtendedRedValidAndClampRange =
4380 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4382 [&](ElementCount VF) {
4383 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4385
4387 InstructionCost ExtCost =
4388 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4389 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4390
4391 if (Red->isPartialReduction()) {
4394 // FIXME: Move partial reduction creation, costing and clamping
4395 // here from LoopVectorize.cpp.
4396 ExtRedCost = Ctx.TTI.getPartialReductionCost(
4397 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
4398 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
4399 RedTy->isFloatingPointTy()
4400 ? std::optional{Red->getFastMathFlags()}
4401 : std::nullopt);
4402 } else if (!RedTy->isFloatingPointTy()) {
4403 // TTI::getExtendedReductionCost only supports integer types.
4404 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4405 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4406 Red->getFastMathFlags(), CostKind);
4407 }
4408 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4409 },
4410 Range);
4411 };
4412
4413 VPValue *A;
4414 // Match reduce(ext(A)).
4415 if (isa<VPWidenCastRecipe>(VecOp) &&
4416 (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) ||
4417 match(VecOp, m_FPExt(m_VPValue(A)))) &&
4418 IsExtendedRedValidAndClampRange(
4419 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4420 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4421 Ctx.Types.inferScalarType(A)))
4422 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4423
4424 return nullptr;
4425}
4426
4427/// This function tries to convert extended in-loop reductions to
4428/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4429/// and valid. The created VPExpressionRecipe must be decomposed to its
4430/// constituent recipes before execution. Patterns of the
4431/// VPExpressionRecipe:
4432/// reduce.add(mul(...)),
4433/// reduce.add(mul(ext(A), ext(B))),
4434/// reduce.add(ext(mul(ext(A), ext(B)))).
4435/// reduce.fadd(fmul(ext(A), ext(B)))
4436static VPExpressionRecipe *
4438 VPCostContext &Ctx, VFRange &Range) {
4439 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4440 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4441 Opcode != Instruction::FAdd)
4442 return nullptr;
4443
4444 Type *RedTy = Ctx.Types.inferScalarType(Red);
4445
4446 // Clamp the range if using multiply-accumulate-reduction is profitable.
4447 auto IsMulAccValidAndClampRange =
4449 VPWidenCastRecipe *OuterExt) -> bool {
4451 [&](ElementCount VF) {
4453 Type *SrcTy =
4454 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4455 InstructionCost MulAccCost;
4456
4457 if (Red->isPartialReduction()) {
4458 Type *SrcTy2 =
4459 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
4460 // FIXME: Move partial reduction creation, costing and clamping
4461 // here from LoopVectorize.cpp.
4462 MulAccCost = Ctx.TTI.getPartialReductionCost(
4463 Opcode, SrcTy, SrcTy2, RedTy, VF,
4465 Ext0->getOpcode())
4468 Ext1->getOpcode())
4470 Mul->getOpcode(), CostKind,
4471 RedTy->isFloatingPointTy()
4472 ? std::optional{Red->getFastMathFlags()}
4473 : std::nullopt);
4474 } else {
4475 // Only partial reductions support mixed or floating-point extends
4476 // at the moment.
4477 if (Ext0 && Ext1 &&
4478 (Ext0->getOpcode() != Ext1->getOpcode() ||
4479 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4480 return false;
4481
4482 bool IsZExt =
4483 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4484 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4485 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4486 SrcVecTy, CostKind);
4487 }
4488
4489 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4490 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4491 InstructionCost ExtCost = 0;
4492 if (Ext0)
4493 ExtCost += Ext0->computeCost(VF, Ctx);
4494 if (Ext1)
4495 ExtCost += Ext1->computeCost(VF, Ctx);
4496 if (OuterExt)
4497 ExtCost += OuterExt->computeCost(VF, Ctx);
4498
4499 return MulAccCost.isValid() &&
4500 MulAccCost < ExtCost + MulCost + RedCost;
4501 },
4502 Range);
4503 };
4504
4505 VPValue *VecOp = Red->getVecOp();
4506 VPRecipeBase *Sub = nullptr;
4507 VPValue *A, *B;
4508 VPValue *Tmp = nullptr;
4509
4510 // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
4511 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue())))) {
4512 assert(Opcode == Instruction::FAdd &&
4513 "MulAccumulateReduction from an FMul must accumulate into an FAdd "
4514 "instruction");
4515 auto *FMul = dyn_cast<VPWidenRecipe>(VecOp);
4516 if (!FMul)
4517 return nullptr;
4518
4519 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(0));
4520 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(1));
4521
4522 if (RecipeA && RecipeB &&
4523 IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
4524 return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
4525 }
4526 }
4527 if (RedTy->isFloatingPointTy())
4528 return nullptr;
4529
4530 // Sub reductions could have a sub between the add reduction and vec op.
4531 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4532 Sub = VecOp->getDefiningRecipe();
4533 VecOp = Tmp;
4534 }
4535
4536 // If ValB is a constant and can be safely extended, truncate it to the same
4537 // type as ExtA's operand, then extend it to the same type as ExtA. This
4538 // creates two uniform extends that can more easily be matched by the rest of
4539 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4540 // replaced with the new extend of the constant.
4541 auto ExtendAndReplaceConstantOp = [&Ctx, &Red](VPWidenCastRecipe *ExtA,
4542 VPWidenCastRecipe *&ExtB,
4543 VPValue *&ValB,
4544 VPWidenRecipe *Mul) {
4545 if (!ExtA || ExtB || !isa<VPIRValue>(ValB) || Red->isPartialReduction())
4546 return;
4547 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4548 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4549 const APInt *Const;
4550 if (!match(ValB, m_APInt(Const)) ||
4552 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4553 return;
4554 // The truncate ensures that the type of each extended operand is the
4555 // same, and it's been proven that the constant can be extended from
4556 // NarrowTy safely. Necessary since ExtA's extended operand would be
4557 // e.g. an i8, while the const will likely be an i32. This will be
4558 // elided by later optimisations.
4559 VPBuilder Builder(Mul);
4560 auto *Trunc =
4561 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4562 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4563 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4564 Mul->setOperand(1, ExtB);
4565 };
4566
4567 // Try to match reduce.add(mul(...)).
4568 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4571 auto *Mul = cast<VPWidenRecipe>(VecOp);
4572
4573 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4574 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4575
4576 // Match reduce.add/sub(mul(ext, ext)).
4577 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4578 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4579 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4580 if (Sub)
4581 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4582 cast<VPWidenRecipe>(Sub), Red);
4583 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4584 }
4585 // TODO: Add an expression type for this variant with a negated mul
4586 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4587 return new VPExpressionRecipe(Mul, Red);
4588 }
4589 // TODO: Add an expression type for negated versions of other expression
4590 // variants.
4591 if (Sub)
4592 return nullptr;
4593
4594 // Match reduce.add(ext(mul(A, B))).
4595 if (!Red->isPartialReduction() &&
4596 match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4597 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4598 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4601
4602 // reduce.add(ext(mul(ext, const)))
4603 // -> reduce.add(ext(mul(ext, ext(const))))
4604 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4605
4606 // reduce.add(ext(mul(ext(A), ext(B))))
4607 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4608 // The inner extends must either have the same opcode as the outer extend or
4609 // be the same, in which case the multiply can never result in a negative
4610 // value and the outer extend can be folded away by doing wider
4611 // extends for the operands of the mul.
4612 if (Ext0 && Ext1 &&
4613 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4614 Ext0->getOpcode() == Ext1->getOpcode() &&
4615 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4616 auto *NewExt0 = new VPWidenCastRecipe(
4617 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4618 *Ext0, *Ext0, Ext0->getDebugLoc());
4619 NewExt0->insertBefore(Ext0);
4620
4621 VPWidenCastRecipe *NewExt1 = NewExt0;
4622 if (Ext0 != Ext1) {
4623 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4624 Ext->getResultType(), nullptr, *Ext1,
4625 *Ext1, Ext1->getDebugLoc());
4626 NewExt1->insertBefore(Ext1);
4627 }
4628 Mul->setOperand(0, NewExt0);
4629 Mul->setOperand(1, NewExt1);
4630 Red->setOperand(1, Mul);
4631 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4632 }
4633 }
4634 return nullptr;
4635}
4636
4637/// This function tries to create abstract recipes from the reduction recipe for
4638/// the following optimizations and cost estimation.
4640 VPCostContext &Ctx,
4641 VFRange &Range) {
4642 VPExpressionRecipe *AbstractR = nullptr;
     // Remember the insertion point/block before any matcher mutates users.
4643 auto IP = std::next(Red->getIterator());
4644 auto *VPBB = Red->getParent();
     // Prefer the multiply-accumulate form; fall back to a plain extended
     // reduction if mul-acc does not match or is unprofitable.
4645 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4646 AbstractR = MulAcc;
4647 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4648 AbstractR = ExtRed;
4649 // Cannot create abstract inloop reduction recipes.
4650 if (!AbstractR)
4651 return;
4652
4653 AbstractR->insertBefore(*VPBB, IP);
4654 Red->replaceAllUsesWith(AbstractR);
4655}
4656
4667
     // NOTE(review): the enclosing function's signature line is not visible in
     // this chunk; the body below inserts explicit Broadcast VPInstructions for
     // values with vector users — presumably VPlanTransforms'
     // broadcast-materialization transform. Confirm against the full file.
4669 if (Plan.hasScalarVFOnly())
4670 return;
4671
4672#ifndef NDEBUG
4673 VPDominatorTree VPDT(Plan);
4674#endif
4675
     // Candidates: live-ins plus everything defined in the entry block.
4676 SmallVector<VPValue *> VPValues;
4679 append_range(VPValues, Plan.getLiveIns());
4680 for (VPRecipeBase &R : *Plan.getEntry())
4681 append_range(VPValues, R.definedValues());
4682
4683 auto *VectorPreheader = Plan.getVectorPreheader();
4684 for (VPValue *VPV : VPValues) {
4686 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4687 continue;
4688
4689 // Add explicit broadcast at the insert point that dominates all users.
4690 VPBasicBlock *HoistBlock = VectorPreheader;
4691 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4692 for (VPUser *User : VPV->users()) {
4693 if (User->usesScalars(VPV))
4694 continue;
     // A user inside the preheader itself forces the broadcast to the
     // start of the block so it precedes that user.
4695 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4696 HoistPoint = HoistBlock->begin();
4697 else
4698 assert(VPDT.dominates(VectorPreheader,
4699 cast<VPRecipeBase>(User)->getParent()) &&
4700 "All users must be in the vector preheader or dominated by it");
4701 }
4702
4703 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4704 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
     // Only vector users are redirected; scalar users keep the original value.
4705 VPV->replaceUsesWithIf(Broadcast,
4706 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4707 return Broadcast != &U && !U.usesScalars(VPV);
4708 });
4709 }
4710}
4711
     // NOTE(review): the signature line is not visible in this chunk; the body
     // hoists single-scalar loads of loop-invariant addresses into the vector
     // preheader when noalias-scope metadata proves independence from all
     // stores in the loop region. Confirm the name against the full file.
4713 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4714
4715 // Collect candidate loads with invariant addresses and noalias scopes
4716 // metadata and memory-writing recipes with noalias metadata.
4720 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4721 for (VPRecipeBase &R : *VPBB) {
4722 // Only handle single-scalar replicated loads with invariant addresses.
4723 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4724 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4725 RepR->getOpcode() != Instruction::Load)
4726 continue;
4727
4728 VPValue *Addr = RepR->getOperand(0);
4729 if (Addr->isDefinedOutsideLoopRegions()) {
4731 if (!Loc.AATags.Scope)
4732 continue;
4733 CandidateLoads.push_back({RepR, Loc});
4734 }
4735 }
     // Any writer without complete noalias metadata makes the whole
     // transform unsound, so give up entirely (return, not continue).
4736 if (R.mayWriteToMemory()) {
4738 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4739 return;
4740 Stores.push_back(*Loc);
4741 }
4742 }
4743 }
4744
4745 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4746 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4747 // Hoist the load to the preheader if it doesn't alias with any stores
4748 // according to the noalias metadata. Other loads should have been hoisted
4749 // by other passes.
4750 const AAMDNodes &LoadAA = LoadLoc.AATags;
4751 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4753 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4754 })) {
4755 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4756 }
4757 }
4758}
4759
4760// Collect common metadata from a group of replicate recipes by intersecting
4761// metadata from all recipes in the group.
     // Precondition: Recipes is non-empty (front() is dereferenced
     // unconditionally below).
4763 VPIRMetadata CommonMetadata = *Recipes.front();
4764 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4765 CommonMetadata.intersect(*Recipe);
4766 return CommonMetadata;
4767}
4768
4769template <unsigned Opcode>
     // NOTE(review): the function-name line is not visible in this chunk. This
     // templated helper partitions predicated load/store replicate recipes into
     // groups whose members share a value type and contain at least one pair of
     // complementary masks (M1 == NOT(M2)).
4773 const Loop *L) {
4774 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4775 "Only Load and Store opcodes supported");
4776 constexpr bool IsLoad = (Opcode == Instruction::Load);
4777 VPTypeAnalysis TypeInfo(Plan);
4778
4779 // For each address, collect operations with the same or complementary masks.
     // For loads, the value type is the type of the load result; for stores,
     // it is the type of the stored operand (operand 0).
4781 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4782 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4783 };
4785 Plan, PSE, L,
4786 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4787 for (auto Recipes : Groups) {
4788 if (Recipes.size() < 2)
4789 continue;
4790
4791 // Collect groups with the same or complementary masks.
4792 for (VPReplicateRecipe *&RecipeI : Recipes) {
4793 if (!RecipeI)
4794 continue;
4795
4796 VPValue *MaskI = RecipeI->getMask();
4797 Type *TypeI = GetLoadStoreValueType(RecipeI);
4799 Group.push_back(RecipeI);
     // Null out consumed entries so each recipe joins at most one group.
4800 RecipeI = nullptr;
4801
4802 // Find all operations with the same or complementary masks.
4803 bool HasComplementaryMask = false;
4804 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4805 if (!RecipeJ)
4806 continue;
4807
4808 VPValue *MaskJ = RecipeJ->getMask();
4809 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4810 if (TypeI == TypeJ) {
4811 // Check if any operation in the group has a complementary mask with
4812 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4813 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4814 match(MaskJ, m_Not(m_Specific(MaskI)));
4815 Group.push_back(RecipeJ);
4816 RecipeJ = nullptr;
4817 }
4818 }
4819
4820 if (HasComplementaryMask) {
4821 assert(Group.size() >= 2 && "must have at least 2 entries");
4822 AllGroups.push_back(std::move(Group));
4823 }
4824 }
4825 }
4826
4827 return AllGroups;
4828}
4829
4830// Find the recipe with minimum alignment in the group.
     // InstType selects LoadInst or StoreInst so getAlign() resolves correctly;
     // requires a non-empty Group (min_element result is dereferenced).
4831template <typename InstType>
4832static VPReplicateRecipe *
4834 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4835 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4836 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4837 });
4838}
4839
     // NOTE(review): the function-name line is not visible in this chunk. This
     // body replaces groups of complementarily-masked predicated loads from the
     // same address with one unpredicated load.
4842 const Loop *L) {
4843 auto Groups =
4845 if (Groups.empty())
4846 return;
4847
4848 // Process each group of loads.
4849 for (auto &Group : Groups) {
4850 // Try to use the earliest (most dominating) load to replace all others.
4851 VPReplicateRecipe *EarliestLoad = Group[0];
4852 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4853 VPBasicBlock *LastBB = Group.back()->getParent();
4854
4855 // Check that the load doesn't alias with stores between first and last.
4856 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4857 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4858 continue;
4859
4860 // Collect common metadata from all loads in the group.
4861 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4862
4863 // Find the load with minimum alignment to use.
     // The weakest alignment in the group is the only one that is valid for
     // the merged, unconditional load.
4864 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4865
4866 bool IsSingleScalar = EarliestLoad->isSingleScalar();
4867 assert(all_of(Group,
4868 [IsSingleScalar](VPReplicateRecipe *R) {
4869 return R->isSingleScalar() == IsSingleScalar;
4870 }) &&
4871 "all members in group must agree on IsSingleScalar");
4872
4873 // Create an unpredicated version of the earliest load with common
4874 // metadata.
4875 auto *UnpredicatedLoad = new VPReplicateRecipe(
4876 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4877 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
4878
4879 UnpredicatedLoad->insertBefore(EarliestLoad);
4880
4881 // Replace all loads in the group with the unpredicated load.
4882 for (VPReplicateRecipe *Load : Group) {
4883 Load->replaceAllUsesWith(UnpredicatedLoad);
4884 Load->eraseFromParent();
4885 }
4886 }
4887}
4888
     // Returns true if the group of predicated stores in \p StoresToSink can be
     // merged into a single unconditional store at the last member's position,
     // based on noalias-scope metadata.
4889static bool
4891 PredicatedScalarEvolution &PSE, const Loop &L,
4892 VPTypeAnalysis &TypeInfo) {
4893 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4894 if (!StoreLoc || !StoreLoc->AATags.Scope)
4895 return false;
4896
4897 // When sinking a group of stores, all members of the group alias each other.
4898 // Skip them during the alias checks.
4899 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4900 StoresToSink.end());
4901
4902 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4903 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4904 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4905 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4907
     // NOTE(review): the function-name line is not visible in this chunk. This
     // body merges groups of complementarily-masked predicated stores into one
     // unconditional store of a select-chain over the stored values.
4910 const Loop *L) {
4911 auto Groups =
4913 if (Groups.empty())
4914 return;
4915
4916 VPTypeAnalysis TypeInfo(Plan);
4917
4918 for (auto &Group : Groups) {
4919 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4920 continue;
4921
4922 // Use the last (most dominated) store's location for the unconditional
4923 // store.
4924 VPReplicateRecipe *LastStore = Group.back();
4925 VPBasicBlock *InsertBB = LastStore->getParent();
4926
4927 // Collect common alias metadata from all stores in the group.
4928 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4929
4930 // Build select chain for stored values.
     // Later (more dominated) stores win: each select prefers Group[I]'s
     // value when its mask is true, mirroring original store order.
4931 VPValue *SelectedValue = Group[0]->getOperand(0);
4932 VPBuilder Builder(InsertBB, LastStore->getIterator());
4933
4934 bool IsSingleScalar = Group[0]->isSingleScalar();
4935 for (unsigned I = 1; I < Group.size(); ++I) {
4936 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4937 "all members in group must agree on IsSingleScalar");
4938 VPValue *Mask = Group[I]->getMask();
4939 VPValue *Value = Group[I]->getOperand(0);
4940 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4941 Group[I]->getDebugLoc());
4942 }
4943
4944 // Find the store with minimum alignment to use.
4945 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4946
4947 // Create unconditional store with selected value and common metadata.
4948 auto *UnpredicatedStore = new VPReplicateRecipe(
4949 StoreWithMinAlign->getUnderlyingInstr(),
4950 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
4951 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4952 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4953
4954 // Remove all predicated stores from the group.
4955 for (VPReplicateRecipe *Store : Group)
4956 Store->eraseFromParent();
4957 }
4958}
4959
     // Materializes a constant vector trip count early when the scalar trip
     // count is a SCEV constant: VecTC = (TC udiv (VF*UF)) * (VF*UF).
4961 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4963 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4964 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4965
4966 VPValue *TC = Plan.getTripCount();
4967 if (TC->getNumUsers() == 0)
4968 return;
4969
4970 // Skip cases for which the trip count may be non-trivial to materialize.
4971 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4972 // tail is required.
4973 if (!Plan.hasScalarTail() ||
4975 Plan.getScalarPreheader() ||
4976 !isa<VPIRValue>(TC))
4977 return;
4978
4979 // Materialize vector trip counts for constants early if it can simply
4980 // be computed as (Original TC / VF * UF) * VF * UF.
4981 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4982 // tail-folded loops.
4983 ScalarEvolution &SE = *PSE.getSE();
4984 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4985 if (!isa<SCEVConstant>(TCScev))
4986 return;
4987 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4988 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4989 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4990 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4991}
4992
     // Materializes the backedge-taken count as TC - 1 in the vector
     // preheader and redirects all of its users.
4994 VPBasicBlock *VectorPH) {
4996 if (BTC->getNumUsers() == 0)
4997 return;
4998
4999 VPBuilder Builder(VectorPH, VectorPH->begin());
5000 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5001 auto *TCMO =
5002 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5003 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5004 BTC->replaceAllUsesWith(TCMO);
5005}
5006
     // NOTE(review): the signature line is not visible in this chunk. This body
     // materializes explicit BuildVector/BuildStructVector (pack) and Unpack
     // VPInstructions between scalar-producing and vector-consuming recipes.
5008 if (Plan.hasScalarVFOnly())
5009 return;
5010
5011 VPTypeAnalysis TypeInfo(Plan);
5012 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5013 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5015 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5016 vp_depth_first_shallow(LoopRegion->getEntry()));
5017 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5018 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5019 // regions. Those are not materialized explicitly yet. Those vector users are
5020 // still handled in VPReplicateRegion::execute(), via shouldPack().
5021 // TODO: materialize build vectors for replicating recipes in replicating
5022 // regions.
5023 for (VPBasicBlock *VPBB :
5024 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5025 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5027 continue;
5028 auto *DefR = cast<VPSingleDefRecipe>(&R);
5029 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5030 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5031 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5032 };
5033 if ((isa<VPReplicateRecipe>(DefR) &&
5034 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5035 (isa<VPInstruction>(DefR) &&
5037 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5038 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5039 continue;
5040
     // Struct-typed results need BuildStructVector; plain scalars use
     // BuildVector.
5041 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
5042 unsigned Opcode = ScalarTy->isStructTy()
5045 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5046 BuildVector->insertAfter(DefR);
5047
5048 DefR->replaceUsesWithIf(
5049 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5050 VPUser &U, unsigned) {
5051 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5052 });
5053 }
5054 }
5055
5056 // Create explicit VPInstructions to convert vectors to scalars. The current
5057 // implementation is conservative - it may miss some cases that may or may not
5058 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5059 // if they are known to operate on scalar values.
5060 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5061 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5064 continue;
5065 for (VPValue *Def : R.definedValues()) {
5066 // Skip recipes that are single-scalar or only have their first lane
5067 // used.
5068 // TODO: The Defs skipped here may or may not be vector values.
5069 // Introduce Unpacks, and remove them later, if they are guaranteed to
5070 // produce scalar values.
5072 continue;
5073
5074 // At the moment, we create unpacks only for scalar users outside
5075 // replicate regions. Recipes inside replicate regions still extract the
5076 // required lanes implicitly.
5077 // TODO: Remove once replicate regions are unrolled completely.
5078 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5079 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5080 return U->usesScalars(Def) &&
5081 (!ParentRegion || !ParentRegion->isReplicator());
5082 };
5083 if (none_of(Def->users(), IsCandidateUnpackUser))
5084 continue;
5085
5086 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
     // Phis must stay grouped at the block start, so the unpack of a phi
     // result is inserted at the first non-phi position instead.
5087 if (R.isPhi())
5088 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5089 else
5090 Unpack->insertAfter(&R);
5091 Def->replaceUsesWithIf(Unpack,
5092 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5093 return IsCandidateUnpackUser(&U);
5094 });
5095 }
5096 }
5097 }
5098}
5099
5101 VPBasicBlock *VectorPHVPBB,
5102 bool TailByMasking,
5103 bool RequiresScalarEpilogue,
5104 VPValue *Step) {
5105 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5106 // There's nothing to do if there are no users of the vector trip count or its
5107 // IR value has already been set.
5108 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5109 return;
5110
5111 VPValue *TC = Plan.getTripCount();
5112 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
5113 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5114 if (auto *StepR = Step->getDefiningRecipe()) {
5115 assert(StepR->getParent() == VectorPHVPBB &&
5116 "Step must be defined in VectorPHVPBB");
5117 // Insert after Step's definition to maintain valid def-use ordering.
5118 InsertPt = std::next(StepR->getIterator());
5119 }
5120 VPBuilder Builder(VectorPHVPBB, InsertPt);
5121
5122 // If the tail is to be folded by masking, round the number of iterations N
5123 // up to a multiple of Step instead of rounding down. This is done by first
5124 // adding Step-1 and then rounding down. Note that it's ok if this addition
5125 // overflows: the vector induction variable will eventually wrap to zero given
5126 // that it starts at zero and its Step is a power of two; the loop will then
5127 // exit, with the last early-exit vector comparison also producing all-true.
5128 if (TailByMasking) {
5129 TC = Builder.createAdd(
5130 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5131 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5132 }
5133
5134 // Now we need to generate the expression for the part of the loop that the
5135 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5136 // iterations are not required for correctness, or N - Step, otherwise. Step
5137 // is equal to the vectorization factor (number of SIMD elements) times the
5138 // unroll factor (number of SIMD instructions).
5139 VPValue *R =
5140 Builder.createNaryOp(Instruction::URem, {TC, Step},
5141 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5142
5143 // There are cases where we *must* run at least one iteration in the remainder
5144 // loop. See the cost model for when this can happen. If the step evenly
5145 // divides the trip count, we set the remainder to be equal to the step. If
5146 // the step does not evenly divide the trip count, no adjustment is necessary
5147 // since there will already be scalar iterations. Note that the minimum
5148 // iterations check ensures that N >= Step.
5149 if (RequiresScalarEpilogue) {
5150 assert(!TailByMasking &&
5151 "requiring scalar epilogue is not supported with fail folding");
5152 VPValue *IsZero =
5153 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5154 R = Builder.createSelect(IsZero, Step, R);
5155 }
5156
5157 VPValue *Res =
5158 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5159 VectorTC.replaceAllUsesWith(Res);
5160}
5161
     // Materializes the symbolic VF and VFxUF values in the vector preheader,
     // constant-folding VFxUF when the runtime VF itself has no users.
5163 ElementCount VFEC) {
5164 // If VF and VFxUF have already been materialized (no remaining users),
5165 // there's nothing more to do.
5166 if (Plan.getVF().isMaterialized()) {
5167 assert(Plan.getVFxUF().isMaterialized() &&
5168 "VF and VFxUF must be materialized together");
5169 return;
5170 }
5171
5172 VPBuilder Builder(VectorPH, VectorPH->begin());
5173 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5174 VPValue &VF = Plan.getVF();
5175 VPValue &VFxUF = Plan.getVFxUF();
5176 // If there are no users of the runtime VF, compute VFxUF by constant folding
5177 // the multiplication of VF and UF.
5178 if (VF.getNumUsers() == 0) {
5179 VPValue *RuntimeVFxUF =
5180 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5181 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5182 return;
5183 }
5184
5185 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5186 // vscale) * UF.
5187 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
5189 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5191 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5192 }
5193 VF.replaceAllUsesWith(RuntimeVF);
5194
     // VFxUF = RuntimeVF * UF, with nuw set (first flag) — cannot wrap.
5195 VPValue *MulByUF = Builder.createOverflowingOp(
5196 Instruction::Mul,
5197 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5198 {true, false});
5199 VFxUF.replaceAllUsesWith(MulByUF);
5200}
5201
     // NOTE(review): the signature line is not visible in this chunk. This body
     // expands all VPExpandSCEVRecipes in the entry block to IR, replaces their
     // uses with live-ins, and returns the SCEV -> expanded-Value map.
5204 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5205
5206 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5207 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5208 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5209 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5211 continue;
5212 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5213 if (!ExpSCEV)
5214 break;
5215 const SCEV *Expr = ExpSCEV->getSCEV();
5216 Value *Res =
5217 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5218 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
5219 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5220 ExpSCEV->replaceAllUsesWith(Exp);
     // Keep the plan's trip count pointing at a live value if it was the
     // expanded recipe itself.
5221 if (Plan.getTripCount() == ExpSCEV)
5222 Plan.resetTripCount(Exp);
5223 ExpSCEV->eraseFromParent();
5224 }
5226 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5227 "before any VPIRInstructions");
5228 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5229 // to the VPIRBasicBlock.
5230 auto EI = Entry->begin();
5231 for (Instruction &I : drop_end(*EntryBB)) {
5232 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5233 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5234 EI++;
5235 continue;
5236 }
5238 }
5239
5240 return ExpandedSCEVs;
5241}
5242
5243/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5244/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5245/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5246/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5247/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5248/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5249/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5250/// is defined at \p Idx of a load interleave group.
5251static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5252 VPValue *OpV, unsigned Idx, bool IsScalable) {
5253 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5254 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5255 if (!Member0OpR)
5256 return Member0Op == OpV;
5257 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5258 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5259 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5260 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5261 Member0Op == OpV;
5262 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5263 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5264 return false;
5265}
5266
     // Recursively checks whether all values in \p Ops (one per interleave
     // member) perform the same operation and have operands that are either
     // themselves narrowable or narrowable loads (see canNarrowLoad).
5267 static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5269 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5270 if (!WideMember0)
5271 return false;
     // All members must perform the same opcode/intrinsic as member 0.
5272 for (VPValue *V : Ops) {
5274 return false;
5275 auto *R = cast<VPSingleDefRecipe>(V);
5276 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5277 return false;
5278 }
5279
     // Check each operand position across all members, recursing first and
     // falling back to the per-operand load check.
5280 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5282 for (VPValue *Op : Ops)
5283 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5284
5285 if (canNarrowOps(OpsI, IsScalable))
5286 continue;
5287
5288 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5289 const auto &[OpIdx, OpV] = P;
5290 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5291 }))
5292 return false;
5293 }
5294
5295 return true;
5296}
5297
5298/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5299/// number of members both equal to VF. The interleave group must also access
5300/// the full vector width.
5301static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5303 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5304 if (!InterleaveR || InterleaveR->getMask())
5305 return std::nullopt;
5306
     // All loaded (or stored) members must share a single element type.
5307 Type *GroupElementTy = nullptr;
5308 if (InterleaveR->getStoredValues().empty()) {
5309 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
5310 if (!all_of(InterleaveR->definedValues(),
5311 [&TypeInfo, GroupElementTy](VPValue *Op) {
5312 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5313 }))
5314 return std::nullopt;
5315 } else {
5316 GroupElementTy =
5317 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
5318 if (!all_of(InterleaveR->getStoredValues(),
5319 [&TypeInfo, GroupElementTy](VPValue *Op) {
5320 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5321 }))
5322 return std::nullopt;
5323 }
5324
5325 auto IG = InterleaveR->getInterleaveGroup();
5326 if (IG->getFactor() != IG->getNumMembers())
5327 return std::nullopt;
5328
5329 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5330 TypeSize Size = TTI.getRegisterBitWidth(
5333 assert(Size.isScalable() == VF.isScalable() &&
5334 "if Size is scalable, VF must be scalable and vice versa");
5335 return Size.getKnownMinValue();
5336 };
5337
     // Accept the first VF whose factor matches and whose group exactly fills
     // a vector register.
5338 for (ElementCount VF : VFs) {
5339 unsigned MinVal = VF.getKnownMinValue();
5340 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5341 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5342 return {VF};
5343 }
5344 return std::nullopt;
5345}
5346
5347/// Returns true if \p VPValue is a narrow VPValue.
5348static bool isAlreadyNarrow(VPValue *VPV) {
5349 if (isa<VPIRValue>(VPV))
5350 return true;
5351 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5352 return RepR && RepR->isSingleScalar();
5353}
5354
// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
// a narrow variant. Values without a defining recipe, already-narrow values,
// and values previously recorded in \p NarrowedOps are returned unchanged;
// newly created narrow recipes are added to NarrowedOps.
static VPValue *
  auto *R = V->getDefiningRecipe();
  // Nothing to do for values without a defining recipe (live-ins) or values
  // that have already been narrowed.
  if (!R || NarrowedOps.contains(V))
    return V;

  if (isAlreadyNarrow(V))
    return V;

    // Recursively narrow all operands of the wide recipe in place.
    auto *WideMember0 = cast<VPSingleDefRecipe>(R);
    for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
      WideMember0->setOperand(
          Idx,
          narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
    return V;
  }

  if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
    // Narrow interleave group to wide load, as transformed VPlan will only
    // process one original iteration.
    auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
    auto *L = new VPWidenLoadRecipe(
        *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
        /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
    L->insertBefore(LoadGroup);
    NarrowedOps.insert(L);
    return L;
  }

  if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
    // Single-scalar loads are already narrow; just record them.
    assert(RepR->isSingleScalar() &&
           isa<LoadInst>(RepR->getUnderlyingInstr()) &&
           "must be a single scalar load");
    NarrowedOps.insert(RepR);
    return RepR;
  }

  auto *WideLoad = cast<VPWidenLoadRecipe>(R);
  // Use the scalar pointer feeding the vector pointer, if present.
  VPValue *PtrOp = WideLoad->getAddr();
  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
    PtrOp = VecPtr->getOperand(0);
  // Narrow wide load to uniform scalar load, as transformed VPlan will only
  // process one original iteration.
  auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
                                  /*IsUniform*/ true,
                                  /*Mask*/ nullptr, {}, *WideLoad);
  N->insertBefore(WideLoad);
  NarrowedOps.insert(N);
  return N;
}
5408
std::unique_ptr<VPlan>
    const TargetTransformInfo &TTI) {
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();

  if (!VectorLoop)
    return nullptr;

  // Only handle single-block loops for now.
  if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
    return nullptr;

  // Skip plans when we may not be able to properly narrow.
  VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
  if (!match(&Exiting->back(), m_BranchOnCount()))
    return nullptr;

  assert(match(&Exiting->back(),
               m_Specific(&Plan.getVectorTripCount()))) &&
         "unexpected branch-on-count");

  VPTypeAnalysis TypeInfo(Plan);
  std::optional<ElementCount> VFToOptimize;
  // Walk the loop body: reject unsupported recipes, determine a single VF all
  // interleave groups are consecutive for, and collect narrowable store
  // groups.
  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
      continue;

      continue;

    // Bail out on recipes not supported at the moment:
    // * phi recipes other than the canonical induction
    // * recipes writing to memory except interleave groups
    // Only support plans with a canonical induction phi.
    if (R.isPhi())
      return nullptr;

    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
    if (R.mayWriteToMemory() && !InterleaveR)
      return nullptr;

    // All other ops are allowed, but we reject uses that cannot be converted
    // when checking all allowed consumers (store interleave groups) below.
    if (!InterleaveR)
      continue;

    // Try to find a single VF, where all interleave groups are consecutive and
    // saturate the full vector width. If we already have a candidate VF, check
    // if it is applicable for the current InterleaveR, otherwise look for a
    // suitable VF across the Plan's VFs.
        VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
                     : to_vector(Plan.vectorFactors());
    std::optional<ElementCount> NarrowedVF =
        isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
    if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
      return nullptr;
    VFToOptimize = NarrowedVF;

    // Skip read interleave groups.
    if (InterleaveR->getStoredValues().empty())
      continue;

    // Narrow interleave groups, if all operands are already matching narrow
    // ops.
    auto *Member0 = InterleaveR->getStoredValues()[0];
    if (isAlreadyNarrow(Member0) &&
        all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
      StoreGroups.push_back(InterleaveR);
      continue;
    }

    // For now, we only support full interleave groups storing load interleave
    // groups.
    if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
          if (!DefR)
            return false;
          auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
          return IR && IR->getInterleaveGroup()->isFull() &&
                 IR->getVPValue(Op.index()) == Op.value();
        })) {
      StoreGroups.push_back(InterleaveR);
      continue;
    }

    // Check if all values feeding InterleaveR are matching wide recipes whose
    // operands can be narrowed.
    if (!canNarrowOps(InterleaveR->getStoredValues(),
                      VFToOptimize->isScalable()))
      return nullptr;
    StoreGroups.push_back(InterleaveR);
  }

  if (StoreGroups.empty())
    return nullptr;

  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  bool RequiresScalarEpilogue =
      MiddleVPBB->getNumSuccessors() == 1 &&
      MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
  // Bail out for tail-folding (middle block with a single successor to exit).
  if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
    return nullptr;

  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
  // original Plan into 2: a) a new clone which contains all VFs of Plan, except
  // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
  // TODO: Handle cases where only some interleave groups can be narrowed.
  std::unique_ptr<VPlan> NewPlan;
  if (size(Plan.vectorFactors()) != 1) {
    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
    Plan.setVF(*VFToOptimize);
    NewPlan->removeVF(*VFToOptimize);
  }

  // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
  SmallPtrSet<VPValue *, 4> NarrowedOps;
  // Narrow operation tree rooted at store groups.
  for (auto *StoreGroup : StoreGroups) {
    VPValue *Res =
        narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
    auto *SI =
        cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
    auto *S = new VPWidenStoreRecipe(
        *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
        /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
    S->insertBefore(StoreGroup);
    StoreGroup->eraseFromParent();
  }

  // Adjust induction to reflect that the transformed plan only processes one
  // original iteration.
  auto *CanIV = VectorLoop->getCanonicalIV();
  auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
  VPBuilder PHBuilder(VectorPH, VectorPH->begin());

  // The new per-iteration step is UF (times vscale for scalable VFs).
  VPValue *UF = &Plan.getUF();
  VPValue *Step;
  if (VFToOptimize->isScalable()) {
    VPValue *VScale = PHBuilder.createElementCount(
    Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
                                         {true, false});
    Plan.getVF().replaceAllUsesWith(VScale);
  } else {
    Step = UF;
        Plan.getConstantInt(CanIV->getScalarType(), 1));
  }
  // Materialize vector trip count with the narrowed step.
  materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
                             RequiresScalarEpilogue, Step);

  Inc->setOperand(1, Step);
  Plan.getVFxUF().replaceAllUsesWith(Step);

  removeDeadRecipes(Plan);
  assert(none_of(*VectorLoop->getEntryBasicBlock(),
         "All VPVectorPointerRecipes should have been removed");
  return NewPlan;
}
5576
/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
/// BranchOnCond recipe. The weights model the probability of taking the
/// epilogue (1) versus skipping it (VectorStep - 1).
    VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  auto *MiddleTerm =
  // Only add branch metadata if there is a (conditional) terminator.
  if (!MiddleTerm)
    return;

  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
         "must have a BranchOnCond");
  // Assume that `TripCount % VectorStep ` is equally distributed.
  unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
  // For scalable VFs, scale the step by the tuning vscale, if known.
  if (VF.isScalable() && VScaleForTuning.has_value())
    VectorStep *= *VScaleForTuning;
  assert(VectorStep > 0 && "trip count should not be zero");
  MDBuilder MDB(Plan.getContext());
  MDNode *BranchWeights =
      MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
  MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
}
5600
    VFRange &Range) {
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  // Predicate for VF == vscale x 1; see the comment below on why this case
  // cannot use ExtractPenultimateElement.
  auto IsScalableOne = [](ElementCount VF) -> bool {
    return VF == ElementCount::getScalable(1);
  };

  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
    if (!FOR)
      continue;

    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");

    // This is the second phase of vectorizing first-order recurrences, creating
    // extract for users outside the loop. An overview of the transformation is
    // described below. Suppose we have the following loop with some use after
    // the loop of the last a[i-1],
    //
    //   for (int i = 0; i < n; ++i) {
    //     t = a[i - 1];
    //     b[i] = a[i] - t;
    //   }
    //   use t;
    //
    // There is a first-order recurrence on "a". For this loop, the shorthand
    // scalar IR looks like:
    //
    //   scalar.ph:
    //     s.init = a[-1]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     use = lcssa.phi [s1, scalar.body]
    //
    // In this example, s1 is a recurrence because its value depends on the
    // previous iteration. In the first phase of vectorization, we created a
    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
    // for users in the scalar preheader and exit block.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3]
    //     b[i] = v2 - v1
    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
    //     b[i, i+1, i+2, i+3] = v2 - v1
    //     br cond, vector.body, middle.block
    //
    //   middle.block:
    //     vector.recur.extract.for.phi = v2(2)
    //     vector.recur.extract = v2(3)
    //     br cond, scalar.ph, exit.block
    //
    //   scalar.ph:
    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
    //                             [s.init, otherwise]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     lo = lcssa.phi [s1, scalar.body],
    //                    [vector.recur.extract.for.phi, middle.block]
    //
    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
    // Extract the penultimate value of the recurrence and use it as operand for
    // the VPIRInstruction modeling the phi.
         make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
        continue;

      // For VF vscale x 1, if vscale = 1, we are unable to extract the
      // penultimate value of the recurrence. Instead we rely on the existing
      // extract of the last element from the result of
      // VPInstruction::FirstOrderRecurrenceSplice.
      // TODO: Consider vscale_range info and UF.
              Range))
        return;
      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
          VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
          "vector.recur.extract.for.phi");
      for (VPUser *U : to_vector(cast<VPInstruction>(&R)->users())) {
        auto *ExitPhi = dyn_cast<VPIRPhi>(U);
        if (!ExitPhi)
          continue;
        ExitPhi->replaceUsesOfWith(cast<VPInstruction>(&R), PenultimateElement);
      }
    }
  }
}
5714
    Loop &L) {
  ScalarEvolution &SE = *PSE.getSE();
  VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();

  // Helper lambda to check if the IV range excludes the sentinel value.
  // Returns the sentinel if it is provably not in the IV's (signed/unsigned)
  // range, std::nullopt otherwise.
  auto CheckSentinel = [&SE](const SCEV *IVSCEV, bool UseMax,
                             bool Signed) -> std::optional<APInt> {
    unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
    APInt Sentinel =
        UseMax

    ConstantRange IVRange =
        Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
    if (!IVRange.contains(Sentinel))
      return Sentinel;
    return std::nullopt;
  };

  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
  for (VPRecipeBase &Phi :
       make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
            PhiR->getRecurrenceKind()))
      continue;

    // Only integer IV reductions are handled below.
    Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
    if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
      continue;

    // If there's a header mask, the backedge select will not be the find-last
    // select.
    VPValue *BackedgeVal = PhiR->getBackedgeValue();
    VPValue *CondSelect = BackedgeVal;
    if (HeaderMask &&
        !match(BackedgeVal, m_Select(m_Specific(HeaderMask),
                                     m_VPValue(CondSelect), m_Specific(PhiR))))
      llvm_unreachable("expected header mask select");

    // Get the IV from the conditional select of the reduction phi.
    // The conditional select should be a select between the phi and the IV.
    VPValue *Cond, *TrueVal, *FalseVal;
    if (!match(CondSelect, m_Select(m_VPValue(Cond), m_VPValue(TrueVal),
                                    m_VPValue(FalseVal))))
      continue;

    // The non-phi operand of the select is the IV.
    assert(is_contained(CondSelect->getDefiningRecipe()->operands(), PhiR));
    VPValue *IV = TrueVal == PhiR ? FalseVal : TrueVal;

    // The IV must be an affine add-recurrence with a known step.
    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(IV, PSE, &L);
    const SCEV *Step;
    if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step))))
      continue;

    // Determine direction from SCEV step.
    if (!SE.isKnownNonZero(Step))
      continue;

    // Positive step means we need UMax/SMax to find the last IV value, and
    // UMin/SMin otherwise.
    bool UseMax = SE.isKnownPositive(Step);
    bool UseSigned = true;
    // Prefer a signed sentinel; fall back to an unsigned one.
    std::optional<APInt> SentinelVal =
        CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/true);
    if (!SentinelVal) {
      SentinelVal = CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/false);
      UseSigned = false;
    }

    // If no sentinel was found, fall back to a boolean AnyOf reduction to track
    // if the condition was ever true. Requires the IV to not wrap, otherwise we
    // cannot use min/max.
    if (!SentinelVal) {
      auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
      if (AR->hasNoSignedWrap())
        UseSigned = true;
      else if (AR->hasNoUnsignedWrap())
        UseSigned = false;
      else
        continue;
    }

        BackedgeVal,

    RecurKind MinMaxKind =
        UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
               : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
    VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
                    FastMathFlags());
    DebugLoc ExitDL = RdxResult->getDebugLoc();
    VPBuilder MiddleBuilder(RdxResult);
    VPValue *ReducedIV =
            RdxResult->getOperand(0), Flags, ExitDL);

    VPValue *NewRdxResult;
    VPValue *StartVPV = PhiR->getStartValue();
    if (SentinelVal) {
      // Sentinel-based approach: reduce IVs with min/max, compare against
      // sentinel to detect if condition was ever true, select accordingly.
      VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
      auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
                                           Sentinel, ExitDL);
      NewRdxResult =
          MiddleBuilder.createSelect(Cmp, ReducedIV, StartVPV, ExitDL);
      StartVPV = Sentinel;
    } else {
      // Introduce a boolean AnyOf reduction to track if the condition was ever
      // true in the loop. Use it to select the initial start value, if it was
      // never true.
      auto *AnyOfPhi = new VPReductionPHIRecipe(
          /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
          RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
      AnyOfPhi->insertAfter(PhiR);

      VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
      VPValue *AnyOfCond = Cond;
      // Invert the condition if the select picks the IV on false.
      if (TrueVal == PhiR)
        AnyOfCond = LoopBuilder.createNot(Cond);
      VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, AnyOfCond);
      AnyOfPhi->setOperand(1, OrVal);

      NewRdxResult =
          {StartVPV, ReducedIV, OrVal}, {}, ExitDL);

      // Initialize the IV reduction phi with the neutral element, not the
      // original start value, to ensure correct min/max reduction results.
      StartVPV = Plan.getOrAddLiveIn(
          getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
    }
    RdxResult->replaceAllUsesWith(NewRdxResult);
    RdxResult->eraseFromParent();

    // Replace the original reduction phi with a FindIV reduction phi.
    auto *NewPhiR = new VPReductionPHIRecipe(
        cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
        *CondSelect, RdxUnordered{1}, {}, PhiR->hasUsesOutsideReductionChain());
    NewPhiR->insertBefore(PhiR);
    PhiR->replaceAllUsesWith(NewPhiR);
    PhiR->eraseFromParent();
  }
}
5864
5865namespace {
5866
/// Holds the binary operation used to compute the extended operand and the
/// casts that feed into it.
struct ExtendedReductionOperand {
  /// The binary operation combining the (possibly extended) operands; this may
  /// be the reduction bin-op itself when the operand is a plain extend.
  VPWidenRecipe *BinOp = nullptr;
  // Note: The second cast recipe may be null.
  std::array<VPWidenCastRecipe *, 2> CastRecipes = {};
};
5874
/// A chain of recipes that form a partial reduction. Matches either
/// reduction_bin_op (extend (A), accumulator), or
/// reduction_bin_op (bin_op (extend (A), (extend (B))), accumulator).
struct VPPartialReductionChain {
  /// The top-level binary operation that forms the reduction to a scalar
  /// after the loop body.
  VPWidenRecipe *ReductionBinOp;
  /// The user of the extends that is then reduced.
  ExtendedReductionOperand ExtendedOp;
  /// The scale factor applied as VFScaleFactor to the resulting reduction
  /// recipe and its PHI.
  unsigned ScaleFactor;
  /// The recurrence kind for the entire partial reduction chain.
  /// This allows distinguishing between Sub and AddWithSub recurrences,
  /// when the ReductionBinOp is a Instruction::Sub.
  RecurKind RK;
};
5890
/// Rewrite the extends feeding \p BinOp into forms the partial-reduction
/// lowering can fold, returning the recipe to use as the reduction's input
/// (either \p BinOp itself or an inner multiply it was replaced by).
static VPSingleDefRecipe *
optimizeExtendsForPartialReduction(VPSingleDefRecipe *BinOp,
                                   VPTypeAnalysis &TypeInfo) {
  // reduce.add(mul(ext(A), C))
  // -> reduce.add(mul(ext(A), ext(trunc(C))))
  const APInt *Const;
  if (match(BinOp, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
    auto *ExtA = cast<VPWidenCastRecipe>(BinOp->getOperand(0));
    Instruction::CastOps ExtOpc = ExtA->getOpcode();
    Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
    if (!BinOp->hasOneUse() ||
            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
      return BinOp;

    // Rewrite the constant operand as ext(trunc(C)) using the same extend
    // kind as the first operand.
    VPBuilder Builder(BinOp);
    auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
                                          BinOp->getOperand(1), NarrowTy);
    Type *WideTy = TypeInfo.inferScalarType(ExtA);
    BinOp->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
    return BinOp;
  }

  // reduce.add(ext(mul(ext(A), ext(B))))
  // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
                          m_ZExtOrSExt(m_VPValue()))))) {
    auto *Ext = cast<VPWidenCastRecipe>(BinOp);
    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
    auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
    auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
    // The outer and inner extend kinds must agree for the fold to be valid.
    if (!Mul->hasOneUse() ||
        (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
        MulLHS->getOpcode() != MulRHS->getOpcode())
      return BinOp;
    VPBuilder Builder(Mul);
    Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
                                               MulLHS->getOperand(0),
                                               Ext->getResultType()));
    Mul->setOperand(1, MulLHS == MulRHS
                           ? Mul->getOperand(0)
                           : Builder.createWidenCast(MulRHS->getOpcode(),
                                                     MulRHS->getOperand(0),
                                                     Ext->getResultType()));
    return Mul;
  }

  return BinOp;
}
5940
// Helper to transform a partial reduction chain into a partial reduction
// recipe. Assumes profitability has been checked.
static void transformToPartialReduction(const VPPartialReductionChain &Chain,
                                        VPTypeAnalysis &TypeInfo, VPlan &Plan,
                                        VPReductionPHIRecipe *RdxPhi) {
  VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
  assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");

  VPValue *BinOpVal = WidenRecipe->getOperand(0);
  VPValue *Accumulator = WidenRecipe->getOperand(1);

  // Swap if needed to ensure Accumulator is the PHI or partial reduction.
      isa<VPExpressionRecipe>(BinOpVal))
    std::swap(BinOpVal, Accumulator);
  auto *BinOp = cast<VPSingleDefRecipe>(BinOpVal->getDefiningRecipe());

  // Sub-reductions can be implemented in two ways:
  // (1) negate the operand in the vector loop (the default way).
  // (2) subtract the reduced value from the init value in the middle block.
  // Both ways keep the reduction itself as an 'add' reduction.
  //
  // The ISD nodes for partial reductions don't support folding the
  // sub/negation into its operands because the following is not a valid
  // transformation:
  //   sub(0, mul(ext(a), ext(b)))
  //   -> mul(ext(a), ext(sub(0, b)))
  //
  // It's therefore better to choose option (2) such that the partial
  // reduction is always positive (starting at '0') and to do a final
  // subtract in the middle block.
  if (WidenRecipe->getOpcode() == Instruction::Sub &&
      Chain.RK != RecurKind::Sub) {
    // Option (1): negate the reduction input in the loop.
    VPBuilder Builder(WidenRecipe);
    Type *ElemTy = TypeInfo.inferScalarType(BinOp);
    auto *Zero = Plan.getZero(ElemTy);
    VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
                          ? VPIRFlags(*WidenRecipe->getUnderlyingInstr())
                          : VPIRFlags();
    auto *NegRecipe = new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, Flags,
    Builder.insert(NegRecipe);
    BinOp = NegRecipe;
  }

  // FIXME: Do these transforms before invoking the cost-model.
  BinOp = optimizeExtendsForPartialReduction(BinOp, TypeInfo);

  // Check if WidenRecipe is the final result of the reduction. If so look
  // through selects for predicated reductions.
  VPValue *Cond = nullptr;
      WidenRecipe,
      m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
  bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
                       RdxPhi->getBackedgeValue() == ExitValue;
  assert((!ExitValue || IsLastInChain) &&
         "if we found ExitValue, it must match RdxPhi's backedge value");

  Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
  RecurKind RdxKind =
  auto *PartialRed = new VPReductionRecipe(
      RdxKind,
      RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
                                 : FastMathFlags(),
      WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
      RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
  PartialRed->insertBefore(WidenRecipe);

  // For predicated reductions, also replace uses of the matched select.
  if (Cond)
    ExitValue->replaceAllUsesWith(PartialRed);
  WidenRecipe->replaceAllUsesWith(PartialRed);

  // We only need to update the PHI node once, which is when we find the
  // last reduction in the chain.
  if (!IsLastInChain)
    return;

  // Scale the PHI and ReductionStartVector by the VFScaleFactor
  assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
  RdxPhi->setVFScaleFactor(Chain.ScaleFactor);

  auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
  assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
  auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
  StartInst->setOperand(2, NewScaleFactor);

  // If this is the last value in a sub-reduction chain, then update the PHI
  // node to start at `0` and update the reduction-result to subtract from
  // the PHI's start value.
  if (Chain.RK != RecurKind::Sub)
    return;

  VPValue *OldStartValue = StartInst->getOperand(0);
  StartInst->setOperand(0, StartInst->getOperand(1));

  // Replace reduction_result by 'sub (startval, reductionresult)'.
  assert(RdxResult && "Could not find reduction result");

  VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
  constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
  VPInstruction *NewResult = Builder.createNaryOp(
      SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
      RdxPhi->getDebugLoc());
  RdxResult->replaceUsesWithIf(
      NewResult,
      [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
}
6051
/// Check if a partial reduction chain is supported by the target (i.e. does
/// not have an invalid cost) for the given VF range. Clamps the range and
/// returns true if profitable for any VF.
static bool isValidPartialReduction(const VPPartialReductionChain &Chain,
                                    Type *PhiType, VPCostContext &CostCtx,
                                    VFRange &Range) {
  // Returns the pre-extend operand type and extend kind of \p Ext, or
  // {nullptr, PR_None} when there is no cast.
  auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext)
      -> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> {
    if (!Ext)
      return {nullptr, TargetTransformInfo::PR_None};
    Type *ExtOpType = CostCtx.Types.inferScalarType(Ext->getOperand(0));
        static_cast<Instruction::CastOps>(Ext->getOpcode()));
    return {ExtOpType, ExtKind};
  };
  ExtendedReductionOperand ExtendedOp = Chain.ExtendedOp;
  VPWidenCastRecipe *ExtendA = ExtendedOp.CastRecipes[0];
  VPWidenCastRecipe *ExtendB = ExtendedOp.CastRecipes[1];

  Type *ExtOpTypeA, *ExtOpTypeB;
  std::tie(ExtOpTypeA, ExtKindA) = GetExtInfo(ExtendA);
  std::tie(ExtOpTypeB, ExtKindB) = GetExtInfo(ExtendB);

  // If ExtendB is nullptr but there's a separate BinOp, the second operand
  // was a constant that can use the same extend kind as the first.
  if (!ExtendB && ExtendedOp.BinOp &&
      ExtendedOp.BinOp != Chain.ReductionBinOp) {
    const APInt *Const = nullptr;
    for (VPValue *Op : ExtendedOp.BinOp->operands()) {
      if (match(Op, m_APInt(Const)))
        break;
    }
    if (!Const || !canConstantBeExtended(Const, ExtOpTypeA, ExtKindA))
      return false;
    ExtOpTypeB = ExtOpTypeA;
    ExtKindB = ExtKindA;
  }

  // Pass the inner bin-op opcode to the target only when it is distinct from
  // the reduction bin-op itself.
  std::optional<unsigned> BinOpc;
  if (ExtendedOp.BinOp && ExtendedOp.BinOp != Chain.ReductionBinOp)
    BinOpc = ExtendedOp.BinOp->getOpcode();

  VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
      [&](ElementCount VF) {
        return CostCtx.TTI
                WidenRecipe->getOpcode(), ExtOpTypeA, ExtOpTypeB, PhiType, VF,
                ExtKindA, ExtKindB, BinOpc, CostCtx.CostKind,
                PhiType->isFloatingPointTy()
                    ? std::optional{WidenRecipe->getFastMathFlags()}
                    : std::nullopt)
            .isValid();
      },
      Range);
}
6109
/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
/// operand. This is an operand where the source of the value (e.g. a load) has
/// been extended (sext, zext, or fpext) before it is used in the reduction.
///
/// Possible forms matched by this function:
///  - UpdateR(PrevValue, ext(...))
///  - UpdateR(PrevValue, BinOp(ext(...), ext(...)))
///  - UpdateR(PrevValue, BinOp(ext(...), Constant))
///  - UpdateR(PrevValue, neg(BinOp(ext(...), ext(...))))
///  - UpdateR(PrevValue, neg(BinOp(ext(...), Constant)))
///  - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
///  - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
///
/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
static std::optional<ExtendedReductionOperand>
matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
  assert(is_contained(UpdateR->operands(), Op) &&
         "Op should be operand of UpdateR");

  // If Op is an extend, then it's still a valid partial reduction if the
  // extended mul fulfills the other requirements.
  // For example, reduce.add(ext(mul(ext(A), ext(B)))) is still a valid partial
  // reduction since the inner extends will be widened. We already have oneUse
  // checks on the inner extends so widening them is safe.
  std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
    auto *CastRecipe = dyn_cast<VPWidenCastRecipe>(Op);
    if (!CastRecipe)
      return std::nullopt;
    auto CastOp = static_cast<Instruction::CastOps>(CastRecipe->getOpcode());
    OuterExtKind = TTI::getPartialReductionExtendKind(CastOp);
    // Continue matching against the extend's input.
    Op = CastRecipe->getOperand(0);
  }

  // If the update is a binary op, check both of its operands to see if
  // they are extends. Otherwise, see if the update comes directly from an
  // extend.
  std::array<VPWidenCastRecipe *, 2> CastRecipes = {};

  // Match extends and populate CastRecipes. Returns false if matching fails.
  auto MatchExtends = [OuterExtKind,
                       &CastRecipes](ArrayRef<VPValue *> Operands) {
    assert(Operands.size() <= 2 && "expected at most 2 operands");

    for (const auto &[I, OpVal] : enumerate(Operands)) {
      // Allow constant as second operand - validation happens in
      // isValidPartialReduction.
      const APInt *Unused;
      if (I > 0 && CastRecipes[0] && match(OpVal, m_APInt(Unused)))
        continue;

      VPValue *ExtInput;
      if (!match(OpVal, m_ZExtOrSExt(m_VPValue(ExtInput))) &&
          !match(OpVal, m_FPExt(m_VPValue(ExtInput))))
        return false;

      CastRecipes[I] = dyn_cast<VPWidenCastRecipe>(OpVal);
      if (!CastRecipes[I])
        return false;

      // The outer extend kind must match the inner extends for folding.
      if (OuterExtKind) {
        auto CastOp =
            static_cast<Instruction::CastOps>(CastRecipes[I]->getOpcode());
        if (*OuterExtKind != TTI::getPartialReductionExtendKind(CastOp))
          return false;
      }
    }
    return CastRecipes[0] != nullptr;
  };

  // If Op is a binary operator, check both of its operands to see if they are
  // extends. Otherwise, see if the update comes directly from an extend.
  auto *BinOp = dyn_cast<VPWidenRecipe>(Op);
  if (BinOp && Instruction::isBinaryOp(BinOp->getOpcode())) {
    if (!BinOp->hasOneUse())
      return std::nullopt;

    // Handle neg(binop(ext, ext)) pattern.
    VPValue *OtherOp = nullptr;
    if (match(BinOp, m_Sub(m_ZeroInt(), m_VPValue(OtherOp))))
      BinOp = dyn_cast<VPWidenRecipe>(OtherOp);

    if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
        !MatchExtends(BinOp->operands()))
      return std::nullopt;
  } else if (match(UpdateR, m_Add(m_VPValue(), m_VPValue())) ||
             match(UpdateR, m_FAdd(m_VPValue(), m_VPValue()))) {
    // We already know Op is an operand of UpdateR.
    if (!MatchExtends({Op}))
      return std::nullopt;
    BinOp = UpdateR;
  } else {
    return std::nullopt;
  }

  return ExtendedReductionOperand{BinOp, CastRecipes};
}
6209
6210/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6211/// and determines if the target can use a cheaper operation with a wider
6212/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6213/// of operations in the reduction.
6214static std::optional<SmallVector<VPPartialReductionChain>>
6215getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6216 VFRange &Range) {
6217 // Get the backedge value from the reduction PHI and find the
6218 // ComputeReductionResult that uses it (directly or through a select for
6219 // predicated reductions).
6220 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6221 if (!RdxResult)
6222 return std::nullopt;
6223 VPValue *ExitValue = RdxResult->getOperand(0);
// Look through a tail-folding select: if ExitValue is select(mask, X, Y),
// rebind ExitValue to the selected update X.
6224 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
// NOTE(review): the extraction dropped original lines 6225-6226 here —
// they must declare `Chains` (a SmallVector<VPPartialReductionChain>, given
// the push_back/reverse/return uses below). Recover from upstream.
6227 RecurKind RK = RedPhiR->getRecurrenceKind();
6228 Type *PhiType = CostCtx.Types.inferScalarType(RedPhiR);
6229 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6230
6231 // Work backwards from the ExitValue examining each reduction operation.
6232 VPValue *CurrentValue = ExitValue;
6233 while (CurrentValue != RedPhiR) {
// Every link in the chain must be a widened binary op.
6234 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6235 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6236 return std::nullopt;
6237
6238 VPValue *Op = UpdateR->getOperand(1);
6239 VPValue *PrevValue = UpdateR->getOperand(0);
6240
6241 // Find the extended operand. The other operand (PrevValue) is the next link
6242 // in the reduction chain.
6243 std::optional<ExtendedReductionOperand> ExtendedOp =
6244 matchExtendedReductionOperand(UpdateR, Op);
6245 if (!ExtendedOp) {
// Try the operands the other way round before giving up.
6246 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue);
6247 if (!ExtendedOp)
6248 return std::nullopt;
6249 std::swap(Op, PrevValue);
6250 }
6251
// The PHI width must be an exact known multiple of the extend-source width;
// that multiple becomes the chain's scale factor.
6252 Type *ExtSrcType = CostCtx.Types.inferScalarType(
6253 ExtendedOp->CastRecipes[0]->getOperand(0));
6254 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6255 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6256 return std::nullopt;
6257
6258 VPPartialReductionChain Chain(
6259 {UpdateR, *ExtendedOp,
6260 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize)), RK});
6261 if (!isValidPartialReduction(Chain, PhiType, CostCtx, Range))
6262 return std::nullopt;
6263
6264 Chains.push_back(Chain);
6265 CurrentValue = PrevValue;
6266 }
6267
6268 // The chains were collected by traversing backwards from the exit value.
6269 // Reverse the chains so they are in program order.
6270 std::reverse(Chains.begin(), Chains.end());
6271 return Chains;
6272}
6273} // namespace
6274
6276 VPCostContext &CostCtx,
6277 VFRange &Range) {
6278 // Find all possible valid partial reductions, grouping chains by their PHI.
6279 // This grouping allows invalidating the whole chain, if any link is not a
6280 // valid partial reduction.
6282 ChainsByPhi;
6283 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6284 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6285 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6286 if (!RedPhiR)
6287 continue;
6288
6289 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6290 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6291 }
6292
6293 if (ChainsByPhi.empty())
6294 return;
6295
6296 // Build set of partial reduction operations for extend user validation and
6297 // a map of reduction bin ops to their scale factors for scale validation.
6298 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6299 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6300 for (const auto &[_, Chains] : ChainsByPhi)
6301 for (const VPPartialReductionChain &Chain : Chains) {
6302 PartialReductionOps.insert(Chain.ExtendedOp.BinOp);
6303 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6304 }
6305
6306 // A partial reduction is invalid if any of its extends are used by
6307 // something that isn't another partial reduction. This is because the
6308 // extends are intended to be lowered along with the reduction itself.
6309 auto ExtendUsersValid = [&](VPWidenCastRecipe *Ext) {
6310 return !Ext || all_of(Ext->users(), [&](VPUser *U) {
6311 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6312 });
6313 };
6314
6315 // Validate chains: check that extends are only used by partial reductions,
6316 // and that reduction bin ops are only used by other partial reductions with
6317 // matching scale factors, are outside the loop region or the select
6318 // introduced by tail-folding. Otherwise we would create users of scaled
6319 // reductions where the types of the other operands don't match.
6320 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6321 for (const VPPartialReductionChain &Chain : Chains) {
6322 if (!all_of(Chain.ExtendedOp.CastRecipes, ExtendUsersValid)) {
6323 Chains.clear();
6324 break;
6325 }
6326 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6327 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6328 return PhiR == RedPhiR;
6329 auto *R = cast<VPSingleDefRecipe>(U);
6330 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6332 m_Specific(Chain.ReductionBinOp))) ||
6333 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6334 m_Specific(RedPhiR)));
6335 };
6336 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6337 Chains.clear();
6338 break;
6339 }
6340
6341 // Check if the compute-reduction-result is used by a sunk store.
6342 // TODO: Also form partial reductions in those cases.
6343 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6344 if (any_of(RdxResult->users(), [](VPUser *U) {
6345 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6346 return RepR && isa<StoreInst>(RepR->getUnderlyingInstr());
6347 })) {
6348 Chains.clear();
6349 break;
6350 }
6351 }
6352 }
6353 }
6354
6355 for (auto &[Phi, Chains] : ChainsByPhi)
6356 for (const VPPartialReductionChain &Chain : Chains)
6357 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6358}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV.
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
APInt abs() const
Get the absolute value.
Definition APInt.h:1810
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:217
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:996
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
@ NoAlias
The two locations do not alias at all.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1603
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3859
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4239
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4314
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4266
iterator end()
Definition VPlan.h:4276
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4274
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4327
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:598
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
const VPRecipeBase & front() const
Definition VPlan.h:4286
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:644
const VPRecipeBase & back() const
Definition VPlan.h:4288
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2764
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2800
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2790
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2806
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2786
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:82
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:303
VPRegionBlock * getParent()
Definition VPlan.h:174
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
size_t getNumSuccessors() const
Definition VPlan.h:225
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:294
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:210
VPlan * getPlan()
Definition VPlan.cpp:177
const std::string & getName() const
Definition VPlan.h:165
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:313
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:221
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:267
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:215
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:199
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:266
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:287
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:200
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:218
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:236
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3268
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3801
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3891
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:466
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:439
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:451
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:461
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3971
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3313
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2276
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2318
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2307
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4392
Class to record and manage LLVM IR flags.
Definition VPlan.h:674
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1140
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1195
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1297
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1242
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1239
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1291
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1234
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1231
@ CanonicalIVIncrementForPart
Definition VPlan.h:1215
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2909
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2901
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2930
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2982
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2940
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1563
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3455
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:390
VPRegionBlock * getRegion()
Definition VPlan.h:4544
VPBasicBlock * getParent()
Definition VPlan.h:465
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:539
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3142
A recipe for handling reduction phis.
Definition VPlan.h:2670
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2717
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2710
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2728
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3033
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4427
const VPBlockBase * getEntry() const
Definition VPlan.h:4463
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4538
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4495
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4480
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4525
const VPBlockBase * getExiting() const
Definition VPlan.h:4475
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4488
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3187
bool isSingleScalar() const
Definition VPlan.h:3228
bool isPredicated() const
Definition VPlan.h:3230
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3252
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4043
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:591
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:659
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:297
operand_range operands()
Definition VPlanValue.h:365
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:341
unsigned getNumOperands() const
Definition VPlanValue.h:335
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:336
void addOperand(VPValue *Operand)
Definition VPlanValue.h:330
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1431
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:71
bool hasOneUse() const
Definition VPlanValue.h:167
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:197
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1434
unsigned getNumUsers() const
Definition VPlanValue.h:108
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1440
user_range users()
Definition VPlanValue.h:150
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2124
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3934
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1810
A recipe for handling GEP instructions.
Definition VPlan.h:2060
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2342
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2370
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2388
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2373
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2393
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2424
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2471
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2475
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2502
A recipe for widening vector intrinsics.
Definition VPlan.h:1862
A common base class for widening memory operations.
Definition VPlan.h:3498
A recipe for widened phis.
Definition VPlan.h:2560
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1754
unsigned getOpcode() const
Definition VPlan.h:1791
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4557
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4865
bool hasVF(ElementCount VF) const
Definition VPlan.h:4770
const DataLayout & getDataLayout() const
Definition VPlan.h:4752
LLVMContext & getContext() const
Definition VPlan.h:4748
VPBasicBlock * getEntry()
Definition VPlan.h:4649
bool hasScalableVF() const
Definition VPlan.h:4771
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4707
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4728
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4777
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4836
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4746
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4842
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4912
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4868
bool hasUF(unsigned UF) const
Definition VPlan.h:4788
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4697
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4736
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4813
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4839
void setVF(ElementCount VF)
Definition VPlan.h:4758
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4804
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1064
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4791
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4721
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4674
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4891
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4833
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4743
bool hasScalarVFOnly() const
Definition VPlan.h:4781
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4688
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4654
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4739
void setUF(unsigned UF)
Definition VPlan.h:4796
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4944
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1212
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4847
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:427
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2774
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bool match(const SCEV *S, const Pattern &P)
class_match< const SCEV > m_SCEV()
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
bind_ty< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:132
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:280
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:273
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:552
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1796
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
iterator_range< po_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_post_order_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order.
Definition VPlanCFG.h:266
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2652
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2608
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:208
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:141
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:248
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:256
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3631
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3589
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3716
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3672
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses except the canoni...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step)
Materialize vector trip count computations to a set of VPInstructions.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...